Merge branch 'refs/heads/trunk' into multiple_render_targets_example_update_test

2024-11-25 08:13:27 +00:00 · 2024-05-27 20:38:41 +03:00 · 2024-05-27 20:38:41 +03:00 · a56355a584
commit a56355a584
parent 9c3cf48c3a d9c054c645
380 changed files with 41454 additions and 7405 deletions
--- a/.config/nextest.toml
+++ b/.config/nextest.toml
@ -3,7 +3,17 @@
 [profile.default]
 slow-timeout = { period = "45s", terminate-after = 2 }

-# Use two threads for tests with "2_threads" in their name
+# Use two threads for tests with "2 threads" in their name
 [[profile.default.overrides]]
-filter = 'test(~2_threads)'
+filter = 'test(~2_threads) | test(~2 threads)'
 threads-required = 2
+
+# Use four threads for tests with "4 threads" in their name
+[[profile.default.overrides]]
+filter = 'test(~4_threads) | test(~4 threads)'
+threads-required = 4
+
+# Use eight threads for tests with "8 threads" in their name
+[[profile.default.overrides]]
+filter = 'test(~8_threads) | test(~8 threads)'
+threads-required = 8
--- a/.deny.toml
+++ b/.deny.toml
@ -1,13 +1,18 @@
 [bans]
 multiple-versions = "deny"
 skip-tree = [
+	# We never enable loom in any of our dependencies but it causes dupes
+	{ name = "loom", version = "0.7.2" },
 	{ name = "windows-sys", version = "0.45" },
-	{ name = "winit", version = "0.27.5" },
+	{ name = "winit", version = "0.27" },
+	{ name = "winit", version = "0.29" },
 	{ name = "rustc_version", version = "0.2.3" },
 	{ name = "sourcemap", version = "7.1.1" },
 ]
 skip = [
 	{ name = "hlsl-snapshots", version = "0.1.0" },
+	# Strum uses an old version
+	{ name = "heck", version = "0.4.0" },
 ]
 wildcards = "deny"
 allow-wildcard-paths = true
@ -20,6 +25,7 @@ allow = [
 	"BSD-3-Clause",
 	"CC0-1.0",
 	"ISC",
+	"MPL-2.0",
 	"MIT",
 	"MIT-0",
 	"Unicode-DFS-2016",
--- a/.envrc
+++ b/.envrc
@ -0,0 +1 @@
+use nix
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -78,6 +78,7 @@ jobs:
    # runtime is normally 2-8 minutes
    #
    # currently high due to documentation time problems on mac.
+    # https://github.com/rust-lang/rust/issues/114891
    timeout-minutes: 30

    strategy:
@ -225,10 +226,22 @@ jobs:
          cargo clippy --target ${{ matrix.target }} --no-default-features

          # Check with all features.
-          cargo clippy --target ${{ matrix.target }} --tests --all-features
+          cargo clippy --target ${{ matrix.target }} --tests --benches --all-features

          # build docs
          cargo +${{ env.DOCS_RUST_VERSION }} doc --target ${{ matrix.target }} --all-features --no-deps
+      - name: check private item docs
+        if: matrix.kind == 'native'
+        shell: bash
+        run: |
+          set -e
+
+          # wgpu-core, wgpu-hal and naga packages, including private items
+          cargo +${{ env.DOCS_RUST_VERSION }} doc --target ${{ matrix.target }} \
+                --package wgpu-core \
+                --package wgpu-hal \
+                --package naga \
+                --all-features --no-deps --document-private-items

  # We run minimal checks on the MSRV of the core crates, ensuring that
  # its dependency tree does not cause issues for firefox.
@ -614,7 +627,7 @@ jobs:
          cargo fmt --manifest-path xtask/Cargo.toml -- --check

      - name: Check for typos
-        uses: crate-ci/typos@v1.19.0
+        uses: crate-ci/typos@v1.21.0

  check-cts-runner:
    # runtime is normally 2 minutes
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@ -41,7 +41,7 @@ jobs:
        if: ${{ failure() }}

      - name: Deploy the docs
-        uses: JamesIves/github-pages-deploy-action@v4.5.0
+        uses: JamesIves/github-pages-deploy-action@v4.6.1
        if: github.ref == 'refs/heads/trunk'
        with:
          token: ${{ secrets.WEB_DEPLOY }}
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -41,7 +41,7 @@ jobs:
        run: cargo xtask run-wasm --no-serve

      - name: Deploy WebGPU examples
-        uses: JamesIves/github-pages-deploy-action@v4.5.0
+        uses: JamesIves/github-pages-deploy-action@v4.6.1
        if: github.ref == 'refs/heads/trunk'
        with:
          token: ${{ secrets.WEB_DEPLOY }}
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -41,6 +41,273 @@ Bottom level categories:

 ### Major Changes

+#### Remove lifetime bounds on `wgpu::ComputePass`
+
+TODO(wumpf): This is still work in progress. Should write a bit more about it. Also will very likely extend to `wgpu::RenderPass` before release.
+
+`wgpu::ComputePass` recording methods (e.g. `wgpu::ComputePass::set_pipeline`) no longer impose a lifetime constraint on passed-in resources.
+
+By @wumpf in [#5569](https://github.com/gfx-rs/wgpu/pull/5569), [#5575](https://github.com/gfx-rs/wgpu/pull/5575).
+
+#### Querying shader compilation errors
+
+Wgpu now supports querying [shader compilation info](https://www.w3.org/TR/webgpu/#dom-gpushadermodule-getcompilationinfo).
+
+This allows you to get more structured information about compilation errors, warnings and info:
+```rust
+...
+let lighting_shader = ctx.device.create_shader_module(include_wgsl!("lighting.wgsl"));
+let compilation_info = lighting_shader.get_compilation_info().await;
+for message in compilation_info
+    .messages
+    .iter()
+    .filter(|m| m.message_type == wgpu::CompilationMessageType::Error)
+{
+    let line = message.location.map(|l| l.line_number).unwrap_or(1);
+    println!("Compile error at line {line}");
+}
+```
+
+By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410)
+
+### New features
+
+#### Vulkan
+
+- Added a `PipelineCache` resource to allow using Vulkan pipeline caches. By @DJMcNab in [#5319](https://github.com/gfx-rs/wgpu/pull/5319)
+
+#### General
+
+- Added `as_hal` for `Buffer` to access wgpu created buffers from wgpu-hal. By @JasondeWolff in [#5724](https://github.com/gfx-rs/wgpu/pull/5724)
+
+#### Naga
+
+- Implement `WGSL`'s `unpack4xI8`,`unpack4xU8`,`pack4xI8` and `pack4xU8`. By @VlaDexa in [#5424](https://github.com/gfx-rs/wgpu/pull/5424)
+
+### Changes
+
+#### General
+
+- Avoid introducing spurious features for optional dependencies. By @bjorn3 in [#5691](https://github.com/gfx-rs/wgpu/pull/5691)
+
+### Bug Fixes
+
+#### General
+
+- Ensure render pipelines have at least 1 target. By @ErichDonGubler in [#5715](https://github.com/gfx-rs/wgpu/pull/5715)
+
+#### Vulkan
+
+- Fix enablement of subgroup ops extension on Vulkan devices that don't support Vulkan 1.3. By @cwfitzgerald in [#5624](https://github.com/gfx-rs/wgpu/pull/5624).
+
+#### GLES / OpenGL
+
+-  Fix regression on OpenGL (EGL) where non-sRGB still used sRGB [#5642](https://github.com/gfx-rs/wgpu/pull/5642)
+-  Fix `ClearColorF`, `ClearColorU` and `ClearColorI` commands being issued before `SetDrawColorBuffers` [#5666](https://github.com/gfx-rs/wgpu/pull/5666)
+-  Replace `glClear` with `glClearBufferF` because `glDrawBuffers` requires that the ith buffer must be `COLOR_ATTACHMENTi` or `NONE` [#5666](https://github.com/gfx-rs/wgpu/pull/5666)
+
+## v0.20.0 (2024-04-28)
+
+### Major Changes
+
+#### Pipeline overridable constants
+
+Wgpu now supports [pipeline-overridable constants](https://www.w3.org/TR/webgpu/#dom-gpuprogrammablestage-constants).
+
+This allows you to define constants in wgsl like this:
+```rust
+override some_factor: f32 = 42.1337; // Specifies a default of 42.1337 if it's not set.
+```
+And then set them at runtime like so on your pipeline consuming this shader:
+```rust
+// ...
+fragment: Some(wgpu::FragmentState {
+    compilation_options: wgpu::PipelineCompilationOptions {
+        constants: &[("some_factor".to_owned(), 0.1234)].into(), // Sets `some_factor` to 0.1234.
+        ..Default::default()
+    },
+    // ...
+}),
+// ...
+```
+
+By @teoxoy & @jimblandy in [#5500](https://github.com/gfx-rs/wgpu/pull/5500)
+
+#### Changed feature requirements for timestamps
+
+Due to a specification change `write_timestamp` is no longer supported on WebGPU.
+`wgpu::CommandEncoder::write_timestamp` now requires the new `wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS` feature, which is available on all native backends but not on WebGPU.
+
+By @wumpf in [#5188](https://github.com/gfx-rs/wgpu/pull/5188)
+
+
+#### Wgsl const evaluation for many more built-ins
+
+Many numeric built-ins have had a constant evaluation implementation added for them, which allows them to be used in a `const` context:
+
+`abs`, `acos`, `acosh`, `asin`, `asinh`, `atan`, `atanh`, `cos`, `cosh`, `round`, `saturate`, `sin`, `sinh`, `sqrt`, `step`, `tan`, `tanh`, `ceil`, `countLeadingZeros`, `countOneBits`, `countTrailingZeros`, `degrees`, `exp`, `exp2`, `floor`, `fract`, `fma`, `inverseSqrt`, `log`, `log2`, `max`, `min`, `radians`, `reverseBits`, `sign`, `trunc`
+
+By @ErichDonGubler in [#4879](https://github.com/gfx-rs/wgpu/pull/4879), [#5098](https://github.com/gfx-rs/wgpu/pull/5098)
+
+#### New **native-only** wgsl features
+
+##### Subgroup operations
+
+The following subgroup operations are available in wgsl now:
+
+`subgroupBallot`, `subgroupAll`, `subgroupAny`, `subgroupAdd`, `subgroupMul`, `subgroupMin`, `subgroupMax`, `subgroupAnd`, `subgroupOr`, `subgroupXor`, `subgroupExclusiveAdd`, `subgroupExclusiveMul`, `subgroupInclusiveAdd`, `subgroupInclusiveMul`, `subgroupBroadcastFirst`, `subgroupBroadcast`, `subgroupShuffle`, `subgroupShuffleDown`, `subgroupShuffleUp`, `subgroupShuffleXor`
+
+
+Availability is governed by the following feature flags:
+* `wgpu::Features::SUBGROUP` for all operations except `subgroupBarrier` in fragment & compute, supported on Vulkan, DX12 and Metal.
+* `wgpu::Features::SUBGROUP_VERTEX`, for all operations except `subgroupBarrier` in vertex shaders, supported on Vulkan
+* `wgpu::Features::SUBGROUP_BARRIER`, for support of the `subgroupBarrier` operation, supported on Vulkan & Metal
+
+Note that there are currently [some differences](https://github.com/gfx-rs/wgpu/issues/5555) between wgpu's native-only implementation and the [open WebGPU proposal](https://github.com/gpuweb/gpuweb/blob/main/proposals/subgroups.md).
+
+By @exrook and @lichtso in [#5301](https://github.com/gfx-rs/wgpu/pull/5301)
+
+##### Signed and unsigned 64 bit integer support in shaders.
+
+`wgpu::Features::SHADER_INT64` enables 64-bit signed and unsigned integer variables in wgsl (`i64` and `u64` respectively).
+Supported on Vulkan, DX12 (requires DXC) and Metal (with MSL 2.3+ support).
+
+By @atlv24 and @cwfitzgerald in [#5154](https://github.com/gfx-rs/wgpu/pull/5154)
+
+### New features
+
+#### General
+
+- Implemented the `Unorm10_10_10_2` VertexFormat by @McMackety in [#5477](https://github.com/gfx-rs/wgpu/pull/5477)
+- `wgpu-types`'s `trace` and `replay` features have been replaced by the `serde` feature. By @KirmesBude in [#5149](https://github.com/gfx-rs/wgpu/pull/5149)
+- `wgpu-core`'s `serial-pass` feature has been removed. Use `serde` instead. By @KirmesBude in [#5149](https://github.com/gfx-rs/wgpu/pull/5149)
+- Added `InstanceFlags::GPU_BASED_VALIDATION`, which enables GPU-based validation for shaders. This is currently only supported on the DX12 and Vulkan backends; other platforms ignore this flag, for now. By @ErichDonGubler in [#5146](https://github.com/gfx-rs/wgpu/pull/5146), [#5046](https://github.com/gfx-rs/wgpu/pull/5046).
+  - When set, this flag implies `InstanceFlags::VALIDATION`.
+  - This has been added to the set of flags set by `InstanceFlags::advanced_debugging`. Since the overhead is potentially very large, the flag is not enabled by default in debug builds when using `InstanceFlags::from_build_config`.
+  - As with other instance flags, this flag can be changed in calls to `InstanceFlags::with_env` with the new `WGPU_GPU_BASED_VALIDATION` environment variable.
+- `wgpu::Instance` can now report which `wgpu::Backends` are available based on the build configuration. By @wumpf [#5167](https://github.com/gfx-rs/wgpu/pull/5167)
+  ```diff
+  -wgpu::Instance::any_backend_feature_enabled()
+  +!wgpu::Instance::enabled_backend_features().is_empty()
+  ```
+- Breaking change: [`wgpu_core::pipeline::ProgrammableStageDescriptor`](https://docs.rs/wgpu-core/latest/wgpu_core/pipeline/struct.ProgrammableStageDescriptor.html#structfield.entry_point) is now optional. By @ErichDonGubler in [#5305](https://github.com/gfx-rs/wgpu/pull/5305).
+- `Features::downlevel{_webgl2,}_features` was made const by @MultisampledNight in [#5343](https://github.com/gfx-rs/wgpu/pull/5343)
+- Breaking change: [`wgpu_core::pipeline::ShaderError`](https://docs.rs/wgpu-core/latest/wgpu_core/pipeline/struct.ShaderError.html) has been moved to `naga`. By @stefnotch in [#5410](https://github.com/gfx-rs/wgpu/pull/5410)
+- More as_hal methods and improvements by @JMS55 in [#5452](https://github.com/gfx-rs/wgpu/pull/5452)
+  - Added `wgpu::CommandEncoder::as_hal_mut`
+  - Added `wgpu::TextureView::as_hal`
+  - `wgpu::Texture::as_hal` now returns a user-defined type to match the other as_hal functions
+
+#### Naga
+
+- Allow user to select which MSL version to use via `--metal-version` with Naga CLI. By @pcleavelin in [#5392](https://github.com/gfx-rs/wgpu/pull/5392)
+- Support `arrayLength` for runtime-sized arrays inside binding arrays (for WGSL input and SPIR-V output). By @kvark in [#5428](https://github.com/gfx-rs/wgpu/pull/5428)
+- Added `--shader-stage` and `--input-kind` options to naga-cli for specifying vertex/fragment/compute shaders, and frontend. by @ratmice in [#5411](https://github.com/gfx-rs/wgpu/pull/5411)
+- Added a `create_validator` function to wgpu_core `Device` to create naga `Validator`s. By @atlv24 [#5606](https://github.com/gfx-rs/wgpu/pull/5606)
+
+#### WebGPU
+
+- Implement the `device_set_device_lost_callback` method for `ContextWebGpu`. By @suti in [#5438](https://github.com/gfx-rs/wgpu/pull/5438)
+- Add support for storage texture access modes `ReadOnly` and `ReadWrite`. By @JolifantoBambla in [#5434](https://github.com/gfx-rs/wgpu/pull/5434)
+
+#### GLES / OpenGL
+
+- Log an error when GLES texture format heuristics fail. By @PolyMeilex in [#5266](https://github.com/gfx-rs/wgpu/issues/5266)
+- Cache the sample count to keep `get_texture_format_features` cheap. By @Dinnerbone in [#5346](https://github.com/gfx-rs/wgpu/pull/5346)
+- Mark `DEPTH32FLOAT_STENCIL8` as supported in GLES. By @Dinnerbone in [#5370](https://github.com/gfx-rs/wgpu/pull/5370)
+- Desktop GL now also supports `TEXTURE_COMPRESSION_ETC2`. By @Valaphee in [#5568](https://github.com/gfx-rs/wgpu/pull/5568)
+- Don't create a program for shader-clearing if that workaround isn't required. By @Dinnerbone in [#5348](https://github.com/gfx-rs/wgpu/pull/5348).
+- OpenGL will now be preferred over OpenGL ES on EGL, making it consistent with WGL. By @valaphee in [#5482](https://github.com/gfx-rs/wgpu/pull/5482)
+- Fill out `driver` and `driver_info`, with the OpenGL flavor and version, similar to Vulkan. By @valaphee in [#5482](https://github.com/gfx-rs/wgpu/pull/5482)
+
+#### Metal
+
+- Metal 3.0 and 3.1 detection. By @atlv24 in [#5497](https://github.com/gfx-rs/wgpu/pull/5497)
+
+#### DX12
+
+- Shader Model 6.1-6.7 detection. By @atlv24 in [#5498](https://github.com/gfx-rs/wgpu/pull/5498)
+
+### Other performance improvements
+
+- Simplify and speed up the allocation of internal IDs. By @nical in [#5229](https://github.com/gfx-rs/wgpu/pull/5229)
+- Use memory pooling for UsageScopes to avoid frequent large allocations. by @robtfm in [#5414](https://github.com/gfx-rs/wgpu/pull/5414)
+- Eager release of GPU resources comes from device.trackers. By @bradwerth in [#5075](https://github.com/gfx-rs/wgpu/pull/5075)
+- Support disabling zero-initialization of workgroup local memory in compute shaders. By @DJMcNab in [#5508](https://github.com/gfx-rs/wgpu/pull/5508)
+
+### Documentation
+
+- Improved `wgpu_hal` documentation. By @jimblandy in [#5516](https://github.com/gfx-rs/wgpu/pull/5516), [#5524](https://github.com/gfx-rs/wgpu/pull/5524), [#5562](https://github.com/gfx-rs/wgpu/pull/5562), [#5563](https://github.com/gfx-rs/wgpu/pull/5563), [#5566](https://github.com/gfx-rs/wgpu/pull/5566), [#5617](https://github.com/gfx-rs/wgpu/pull/5617), [#5618](https://github.com/gfx-rs/wgpu/pull/5618)
+- Add mention of primitive restart in the description of `PrimitiveState::strip_index_format`. By @cpsdqs in [#5350](https://github.com/gfx-rs/wgpu/pull/5350)
+- Document and tweak precise behaviour of `SourceLocation`. By @stefnotch in [#5386](https://github.com/gfx-rs/wgpu/pull/5386) and [#5410](https://github.com/gfx-rs/wgpu/pull/5410)
+- Give short example of WGSL `push_constant` syntax. By @waywardmonkeys in [#5393](https://github.com/gfx-rs/wgpu/pull/5393)
+- Fix incorrect documentation of `Limits::max_compute_workgroup_storage_size` default value. By @atlv24 in [#5601](https://github.com/gfx-rs/wgpu/pull/5601)
+
+### Bug Fixes
+
+#### General
+- Fix `serde` feature not compiling for `wgpu-types`. By @KirmesBude in [#5149](https://github.com/gfx-rs/wgpu/pull/5149)
+- Fix the validation of vertex and index ranges. By @nical in [#5144](https://github.com/gfx-rs/wgpu/pull/5144) and [#5156](https://github.com/gfx-rs/wgpu/pull/5156)
+- Fix panic when creating a surface while no backend is available. By @wumpf [#5166](https://github.com/gfx-rs/wgpu/pull/5166)
+- Correctly compute minimum buffer size for array-typed `storage` and `uniform` vars. By @jimblandy [#5222](https://github.com/gfx-rs/wgpu/pull/5222)
+- Fix timeout when presenting a surface where no work has been done. By @waywardmonkeys in [#5200](https://github.com/gfx-rs/wgpu/pull/5200)
+- Fix registry leaks with de-duplicated resources. By @nical in [#5244](https://github.com/gfx-rs/wgpu/pull/5244)
+- Fix linking when targeting android. By @ashdnazg in [#5326](https://github.com/gfx-rs/wgpu/pull/5326).
+- Failing to set the device lost closure will call the closure before returning. By @bradwerth in [#5358](https://github.com/gfx-rs/wgpu/pull/5358).
+- Fix deadlocks caused by recursive read-write lock acquisitions [#5426](https://github.com/gfx-rs/wgpu/pull/5426).
+- Remove exposed C symbols (`extern "C"` + [no_mangle]) from RenderPass & ComputePass recording. By @wumpf in [#5409](https://github.com/gfx-rs/wgpu/pull/5409).
+- Fix surfaces being only compatible with first backend enabled on an instance, causing failures when manually specifying an adapter. By @Wumpf in [#5535](https://github.com/gfx-rs/wgpu/pull/5535).
+- Clean up weak references to texture views and bind groups. By @xiaopengli89 [#5595](https://github.com/gfx-rs/wgpu/pull/5595).
+
+#### Naga
+
+- In spv-in, remove unnecessary "gl_PerVertex" name check so unused builtins will always be skipped. Prevents validation errors caused by capability requirements of these builtins [#4915](https://github.com/gfx-rs/wgpu/issues/4915). By @Imberflur in [#5227](https://github.com/gfx-rs/wgpu/pull/5227).
+- In spv-out, check for acceleration and ray-query types when enabling ray-query extension to prevent validation error. By @Vecvec in [#5463](https://github.com/gfx-rs/wgpu/pull/5463)
+- Add a limit for curly brace nesting in WGSL parsing, plus a note about stack size requirements. By @ErichDonGubler in [#5447](https://github.com/gfx-rs/wgpu/pull/5447).
+- In hlsl-out, fix accesses on zero value expressions by generating helper functions for `Expression::ZeroValue`. By @Imberflur in [#5587](https://github.com/gfx-rs/wgpu/pull/5587).
+- Fix behavior of `extractBits` and `insertBits` when `offset + count` overflows the bit width. By @cwfitzgerald in [#5305](https://github.com/gfx-rs/wgpu/pull/5305)
+- Fix behavior of integer `clamp` when `min` argument > `max` argument. By @cwfitzgerald in [#5300](https://github.com/gfx-rs/wgpu/pull/5300).
+- Fix `TypeInner::scalar_width` to be consistent with the rest of the codebase and return values in bytes not bits. By @atlv24 in [#5532](https://github.com/gfx-rs/wgpu/pull/5532).
+
+#### GLES / OpenGL
+
+- GLSL 410 does not support layout(binding = ...), enable only for GLSL 420. By @bes in [#5357](https://github.com/gfx-rs/wgpu/pull/5357)
+- Fixes for being able to use an OpenGL 4.1 core context provided by macOS with wgpu. By @bes in [#5331](https://github.com/gfx-rs/wgpu/pull/5331).
+- Fix crash when holding multiple devices on wayland/surfaceless. By @ashdnazg in [#5351](https://github.com/gfx-rs/wgpu/pull/5351).
+- Fix `first_instance` getting ignored in draw indexed when `ARB_shader_draw_parameters` feature is present and `base_vertex` is 0. By @valaphee in [#5482](https://github.com/gfx-rs/wgpu/pull/5482)
+
+#### Vulkan
+
+- Set object labels when the DEBUG flag is set, even if the VALIDATION flag is disabled. By @DJMcNab in [#5345](https://github.com/gfx-rs/wgpu/pull/5345).
+- Add safety check to `wgpu_hal::vulkan::CommandEncoder` to make sure `discard_encoding` is not called in the closed state. By @villuna in [#5557](https://github.com/gfx-rs/wgpu/pull/5557)
+- Fix SPIR-V type capability requests to not depend on `LocalType` caching. By @atlv24 in [#5590](https://github.com/gfx-rs/wgpu/pull/5590)
+- Upgrade `ash` to `0.38`. By @MarijnS95 in [#5504](https://github.com/gfx-rs/wgpu/pull/5504).
+
+#### Tests
+
+- Fix intermittent crashes on Linux in the `multithreaded_compute` test. By @jimblandy in [#5129](https://github.com/gfx-rs/wgpu/pull/5129).
+- Refactor tests to read feature flags by name instead of a hardcoded hexadecimal u64. By @atlv24 in [#5155](https://github.com/gfx-rs/wgpu/pull/5155).
+- Add test that verifies that we can drop the queue before using the device to create a command encoder. By @Davidster in [#5211](https://github.com/gfx-rs/wgpu/pull/5211)
+
+## v0.19.4 (2024-04-17)
+
+### Bug Fixes
+
+#### General
+
+- Don't depend on bind group and bind group layout entry order in backends. This caused severely incorrect command execution and, in some cases, crashes. By @ErichDonGubler in [#5421](https://github.com/gfx-rs/wgpu/pull/5421).
+- Properly clean up all write_buffer/texture temporary resources. By @robtfm in [#5413](https://github.com/gfx-rs/wgpu/pull/5413).
+- Fix deadlock in certain situations when mapping buffers using `wgpu-profiler`. By @cwfitzgerald in [#5517](https://github.com/gfx-rs/wgpu/pull/5517)
+
+#### WebGPU
+- Correctly pass through timestamp queries to WebGPU. By @cwfitzgerald in [#5527](https://github.com/gfx-rs/wgpu/pull/5527).
+
+## v0.19.3 (2024-03-01)
+
+This release includes `wgpu`, `wgpu-core`, and `wgpu-hal`. All other crates are unchanged.
+
+### Major Changes
+
 #### Vendored WebGPU Bindings from `web_sys`

 **`--cfg=web_sys_unstable_apis` is no longer needed in your `RUSTFLAGS` to compile for WebGPU!!!**
@ -51,85 +318,36 @@ To combat this problem we have decided to vendor the `web_sys` bindings for WebG

 By @cwfitzgerald in [#5325](https://github.com/gfx-rs/wgpu/pull/5325).

-### Documentation
-
- Document Wayland specific behavior related to `SurfaceTexture::present`. By @i509VCB in [#5092](https://github.com/gfx-rs/wgpu/pull/5092).
- Add mention of primitive restart in the description of `PrimitiveState::strip_index_format`. By @cpsdqs in [#5350](https://github.com/gfx-rs/wgpu/pull/5350)
- Document precise behaviour of `SourceLocation`. By @stefnotch in [#5386](https://github.com/gfx-rs/wgpu/pull/5386)
- Give short example of WGSL `push_constant` syntax. By @waywardmonkeys in [#5393](https://github.com/gfx-rs/wgpu/pull/5393)
-
-### New features
+### Bug Fixes

 #### General

- Many numeric built-ins have had a constant evaluation implementation added for them, which allows them to be used in a `const` context:
-    - [#4879](https://github.com/gfx-rs/wgpu/pull/4879) by @ErichDonGubler:
-        - `abs`
-        - `acos`
-        - `acosh`
-        - `asin`
-        - `asinh`
-        - `atan`
-        - `atanh`
-        - `cos`
-        - `cosh`
-        - `round`
-        - `saturate`
-        - `sin`
-        - `sinh`
-        - `sqrt`
-        - `step`
-        - `tan`
-        - `tanh`
-    - [#5098](https://github.com/gfx-rs/wgpu/pull/5098) by @ErichDonGubler:
-        - `ceil`
-        - `countLeadingZeros`
-        - `countOneBits`
-        - `countTrailingZeros`
-        - `degrees`
-        - `exp`
-        - `exp2`
-        - `floor`
-        - `fract`
-        - `fma`
-        - `inverseSqrt`
-        - `log`
-        - `log2`
-        - `max`
-        - `min`
-        - `radians`
-        - `reverseBits`
-        - `sign`
-        - `trunc`
- Eager release of GPU resources comes from device.trackers. By @bradwerth in [#5075](https://github.com/gfx-rs/wgpu/pull/5075)
- `wgpu-types`'s `trace` and `replay` features have been replaced by the `serde` feature. By @KirmesBude in [#5149](https://github.com/gfx-rs/wgpu/pull/5149)
- `wgpu-core`'s `serial-pass` feature has been removed. Use `serde` instead. By @KirmesBude in [#5149](https://github.com/gfx-rs/wgpu/pull/5149)
- Added `InstanceFlags::GPU_BASED_VALIDATION`, which enables GPU-based validation for shaders. This is currently only supported on the DX12 and Vulkan backends; other platforms ignore this flag, for now.
-  - When set, this flag implies `InstanceFlags::VALIDATION`.
-  - This has been added to the set of flags set by `InstanceFlags::advanced_debugging`. Since the overhead is potentially very large, the flag is not enabled by default in debug builds when using `InstanceFlags::from_build_config`.
-  - As with other instance flags, this flag can be changed in calls to `InstanceFlags::with_env` with the new `WGPU_GPU_BASED_VALIDATION` environment variable.
+- Fix an issue where command encoders weren't properly freed if an error occurred during command encoding. By @ErichDonGubler in [#5251](https://github.com/gfx-rs/wgpu/pull/5251).
+- Fix incorrect validation causing all indexed draws on render bundles to fail. By @wumpf in [#5430](https://github.com/gfx-rs/wgpu/pull/5340).

-  By @ErichDonGubler in [#5146](https://github.com/gfx-rs/wgpu/pull/5146), [#5046](https://github.com/gfx-rs/wgpu/pull/5046).
- Signed and unsigned 64 bit integer support in shaders. By @rodolphito and @cwfitzgerald in [#5154](https://github.com/gfx-rs/wgpu/pull/5154)
- `wgpu::Instance` can now report which `wgpu::Backends` are available based on the build configuration. By @wumpf [#5167](https://github.com/gfx-rs/wgpu/pull/5167)
-```diff
-wgpu::Instance::any_backend_feature_enabled()
-+!wgpu::Instance::enabled_backend_features().is_empty()
-```
+#### Android
+- Fix linking error when targeting android without `winit`. By @ashdnazg in [#5326](https://github.com/gfx-rs/wgpu/pull/5326).
+
+
+## v0.19.2 (2024-02-29)
+
+This release includes `wgpu`, `wgpu-core`, `wgpu-hal`, `wgpu-types`, and `naga`. All other crates are unchanged.
+
+### Added/New Features
+
+#### General
 - `wgpu::Id` now implements `PartialOrd`/`Ord` allowing it to be put in `BTreeMap`s. By @cwfitzgerald and @9291Sam in [#5176](https://github.com/gfx-rs/wgpu/pull/5176)
- `wgpu::CommandEncoder::write_timestamp` requires now the new `wgpu::Features::TIMESTAMP_QUERY_INSIDE_ENCODERS` feature which is available on all native backends but not on WebGPU (due to a spec change `write_timestamp` is no longer supported on WebGPU). By @wumpf in [#5188](https://github.com/gfx-rs/wgpu/pull/5188)
- Breaking change: [`wgpu_core::pipeline::ProgrammableStageDescriptor`](https://docs.rs/wgpu-core/latest/wgpu_core/pipeline/struct.ProgrammableStageDescriptor.html#structfield.entry_point) is now optional. By @ErichDonGubler in [#5305](https://github.com/gfx-rs/wgpu/pull/5305).
- `Features::downlevel{_webgl2,}_features` was made const by @MultisampledNight in [#5343](https://github.com/gfx-rs/wgpu/pull/5343)

-#### GLES
+#### OpenGL
+- Log an error when OpenGL texture format heuristics fail. By @PolyMeilex in [#5266](https://github.com/gfx-rs/wgpu/issues/5266)

- Log an error when GLES texture format heuristics fail. By @PolyMeilex in [#5266](https://github.com/gfx-rs/wgpu/issues/5266)
- Cache the sample count to keep `get_texture_format_features` cheap. By @Dinnerbone in [#5346](https://github.com/gfx-rs/wgpu/pull/5346)
- Mark `DEPTH32FLOAT_STENCIL8` as supported in GLES. By @Dinnerbone in [#5370](https://github.com/gfx-rs/wgpu/pull/5370)
+#### `wgsl-out`
+- Learned to generate acceleration structure types. By @JMS55 in [#5261](https://github.com/gfx-rs/wgpu/pull/5261)

-#### Naga
-
- Allow user to select which MSL version to use via `--metal-version` with Naga CLI. By @pcleavelin in [#5392](https://github.com/gfx-rs/wgpu/pull/5392)
+### Documentation
+- Fix link in `wgpu::Instance::create_surface` documentation. By @HexoKnight in [#5280](https://github.com/gfx-rs/wgpu/pull/5280).
+- Fix typo in `wgpu::CommandEncoder::clear_buffer` documentation. By @PWhiddy in [#5281](https://github.com/gfx-rs/wgpu/pull/5281).
+- `Surface` configuration incorrectly claimed that `wgpu::Instance::create_surface` was unsafe. By @hackaugusto in [#5265](https://github.com/gfx-rs/wgpu/pull/5265).

 ### Examples

@ -138,49 +356,50 @@ By @cwfitzgerald in [#5325](https://github.com/gfx-rs/wgpu/pull/5325).
 ### Bug Fixes

 #### General
- Fix `panic!` when dropping `Instance` without `InstanceFlags::VALIDATION`. By @hakolao in [#5134](https://github.com/gfx-rs/wgpu/pull/5134)
- Fix `serde` feature not compiling for `wgpu-types`. By @KirmesBude in [#5149](https://github.com/gfx-rs/wgpu/pull/5149)
- Fix the validation of vertex and index ranges. By @nical in [#5144](https://github.com/gfx-rs/wgpu/pull/5144) and [#5156](https://github.com/gfx-rs/wgpu/pull/5156)
 - Device lost callbacks are invoked when replaced and when global is dropped. By @bradwerth in [#5168](https://github.com/gfx-rs/wgpu/pull/5168)
- Fix panic when creating a surface while no backend is available. By @wumpf [#5166](https://github.com/gfx-rs/wgpu/pull/5166)
- Correctly compute minimum buffer size for array-typed `storage` and `uniform` vars. By @jimblandy [#5222](https://github.com/gfx-rs/wgpu/pull/5222)
- Fix timeout when presenting a surface where no work has been done. By @waywardmonkeys in [#5200](https://github.com/gfx-rs/wgpu/pull/5200)
- Simplify and speed up the allocation of internal IDs. By @nical in [#5229](https://github.com/gfx-rs/wgpu/pull/5229)
- Fix an issue where command encoders weren't properly freed if an error occurred during command encoding. By @ErichDonGubler in [#5251](https://github.com/gfx-rs/wgpu/pull/5251).
- Fix behavior of `extractBits` and `insertBits` when `offset + count` overflows the bit width. By @cwfitzgerald in [#5305](https://github.com/gfx-rs/wgpu/pull/5305)
- Fix registry leaks with de-duplicated resources. By @nical in [#5244](https://github.com/gfx-rs/wgpu/pull/5244)
- Fix behavior of integer `clamp` when `min` argument > `max` argument. By @cwfitzgerald in [#5300](https://github.com/gfx-rs/wgpu/pull/5300).
- Fix missing validation for `Device::clear_buffer` where `offset + size buffer.size` was not checked when `size` was omitted. By @ErichDonGubler in [#5282](https://github.com/gfx-rs/wgpu/pull/5282).
- Fix linking when targeting android. By @ashdnazg in [#5326](https://github.com/gfx-rs/wgpu/pull/5326).
+- Fix performance regression when allocating a large amount of resources of the same type. By @nical in [#5229](https://github.com/gfx-rs/wgpu/pull/5229)
+- Fix docs.rs wasm32 builds. By @cwfitzgerald in [#5310](https://github.com/gfx-rs/wgpu/pull/5310)
+- Improve error message when binding count limit hit. By @hackaugusto in [#5298](https://github.com/gfx-rs/wgpu/pull/5298)
+- Remove an unnecessary `clone` during GLSL shader ingestion. By @a1phyr in [#5118](https://github.com/gfx-rs/wgpu/pull/5118).
+- Fix missing validation for `Device::clear_buffer` where `offset + size > buffer.size` was not checked when `size` was omitted. By @ErichDonGubler in [#5282](https://github.com/gfx-rs/wgpu/pull/5282).

-#### glsl-in
+#### DX12
+- Fix `panic!` when dropping `Instance` without `InstanceFlags::VALIDATION`. By @hakolao in [#5134](https://github.com/gfx-rs/wgpu/pull/5134)
+
+#### OpenGL
+- Fix internal format for the `Etc2Rgba8Unorm` format. By @andristarr in [#5178](https://github.com/gfx-rs/wgpu/pull/5178)
+- Try to load `libX11.so.6` in addition to `libX11.so` on Linux. [#5307](https://github.com/gfx-rs/wgpu/pull/5307)
+- Make use of `GL_EXT_texture_shadow_lod` to support sampling a cube depth texture with an explicit LOD. By @cmrschwarz in [#5171](https://github.com/gfx-rs/wgpu/pull/5171).
+
+#### `glsl-in`

 - Fix code generation from nested loops. By @cwfitzgerald and @teoxoy in [#5311](https://github.com/gfx-rs/wgpu/pull/5311)

-#### WGL

- In Surface::configure and Surface::present, fix the current GL context not being unset when releasing the lock that guards access to making the context current. This was causing other threads to panic when trying to make the context current. By @Imberflur in [#5087](https://github.com/gfx-rs/wgpu/pull/5087).
+## v0.19.1 (2024-01-22)

-#### Naga
- Make use of `GL_EXT_texture_shadow_lod` to support sampling a cube depth texture with an explicit LOD. By @cmrschwarz in #[5171](https://github.com/gfx-rs/wgpu/pull/5171).
- In spv-in, remove unnecessary "gl_PerVertex" name check so unused builtins will always be skipped. By @Imberflur in [#5227](https://github.com/gfx-rs/wgpu/pull/5227).
- GLSL 410 does not support layout(binding = ...), enable only for GLSL 420. By @bes in [#5357](https://github.com/gfx-rs/wgpu/pull/5357)
+This release includes `wgpu` and `wgpu-hal`. The rest of the crates are unchanged since 0.19.0.

-#### Tests
+### Bug Fixes

- Fix intermittent crashes on Linux in the `multithreaded_compute` test. By @jimblandy in [#5129](https://github.com/gfx-rs/wgpu/pull/5129).
- Refactor tests to read feature flags by name instead of a hardcoded hexadecimal u64. By @rodolphito in [#5155](https://github.com/gfx-rs/wgpu/pull/5155).
- Add test that verifies that we can drop the queue before using the device to create a command encoder. By @Davidster in [#5211](https://github.com/gfx-rs/wgpu/pull/5211)
+#### DX12

-#### GLES
+- Properly register all swapchain buffers to prevent error on surface present. By @dtzxporter in [#5091](https://github.com/gfx-rs/wgpu/pull/5091)
+- Check for extra null states when creating resources. By @nical in [#5096](https://github.com/gfx-rs/wgpu/pull/5096)
+- Fix depth-only and stencil-only views causing crashes. By @teoxoy in [#5100](https://github.com/gfx-rs/wgpu/pull/5100)

- Fixes for being able to use an OpenGL 4.1 core context provided by macOS with wgpu. By @bes in [#5331](https://github.com/gfx-rs/wgpu/pull/5331).
- Don't create a program for shader-clearing if that workaround isn't required. By @Dinnerbone in [#5348](https://github.com/gfx-rs/wgpu/pull/5348).
- Fix crash when holding multiple devices on wayland/surfaceless. By @ashdnazg in [#5351](https://github.com/gfx-rs/wgpu/pull/5351).
+#### OpenGL

-#### Vulkan
+- In Surface::configure and Surface::present on Windows, fix the current GL context not being unset when releasing the lock that guards access to making the context current. This was causing other threads to panic when trying to make the context current. By @Imberflur in [#5087](https://github.com/gfx-rs/wgpu/pull/5087).
+
+#### WebGPU
+
+- Improve error message when compiling WebGPU backend on wasm without the `web_sys_unstable_apis` set. By @rukai in [#5104](https://github.com/gfx-rs/wgpu/pull/5104)
+
+### Documentation
+
+- Document Wayland specific behavior related to `SurfaceTexture::present`. By @i509VCB in [#5093](https://github.com/gfx-rs/wgpu/pull/5093).

- Set object labels when the DEBUG flag is set, even if the VALIDATION flag is disabled. By @DJMcNab in [#5345](https://github.com/gfx-rs/wgpu/pull/5345).

 ## v0.19.0 (2024-01-17)

--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@ -5,8 +5,9 @@ members = [
    "deno_webgpu",

    # default members
+    "benches",
    "d3d12",
-    "examples/",
+    "examples",
    "naga-cli",
    "naga",
    "naga/fuzz",
@ -22,8 +23,9 @@ members = [
 ]
 exclude = []
 default-members = [
+    "benches",
    "d3d12",
-    "examples/",
+    "examples",
    "naga-cli",
    "naga",
    "naga/fuzz",
@ -45,47 +47,49 @@ keywords = ["graphics"]
 license = "MIT OR Apache-2.0"
 homepage = "https://wgpu.rs/"
 repository = "https://github.com/gfx-rs/wgpu"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["gfx-rs developers"]

 [workspace.dependencies.wgc]
 package = "wgpu-core"
 path = "./wgpu-core"
-version = "0.19.0"
+version = "0.20.0"

 [workspace.dependencies.wgt]
 package = "wgpu-types"
 path = "./wgpu-types"
-version = "0.19.0"
+version = "0.20.0"

 [workspace.dependencies.hal]
 package = "wgpu-hal"
 path = "./wgpu-hal"
-version = "0.19.0"
+version = "0.20.0"

 [workspace.dependencies.naga]
 path = "./naga"
-version = "0.19.0"
+version = "0.20.0"

 [workspace.dependencies]
-anyhow = "1.0"
+anyhow = "1.0.86"
 arrayvec = "0.7"
+bincode = "1"
 bit-vec = "0.6"
 bitflags = "2"
-bytemuck = { version = "1.14", features = ["derive"] }
+bytemuck = { version = "1.16", features = ["derive"] }
 cfg_aliases = "0.1"
 cfg-if = "1"
+criterion = "0.5"
 codespan-reporting = "0.11"
 ctor = "0.2"
 document-features = "0.2.8"
-encase = "0.7"
+encase = "0.8"
 env_logger = "0.11"
 fern = "0.6"
 flume = "0.11"
 futures-lite = "2"
 getrandom = "0.2"
-glam = "0.25"
-heck = "0.4.0"
+glam = "0.27"
+heck = "0.5.0"
 image = { version = "0.24", default-features = false, features = ["png"] }
 ktx2 = "0.3"
 libc = "0.2"
@ -96,7 +100,6 @@ log = "0.4"
 nanorand = { version = "0.7", default-features = false, features = ["wyrand"] }
 # https://github.com/Razaekel/noise-rs/issues/335 (Updated dependencies)
 noise = { version = "0.8", git = "https://github.com/Razaekel/noise-rs.git", rev = "c6942d4fb70af26db4441edcf41f90fa115333f2" }
-num-traits = { version = "0.2" }
 nv-flip = "0.1"
 obj = "0.10"
 once_cell = "1"
@ -110,41 +113,43 @@ png = "0.17.11"
 pollster = "0.3"
 profiling = { version = "1", default-features = false }
 raw-window-handle = "0.6"
+rayon = "1"
 renderdoc-sys = "1.1.0"
 ron = "0.8"
 rustc-hash = "1.1.0"
 serde = "1"
-serde_json = "1.0.113"
+serde_json = "1.0.116"
 smallvec = "1"
 static_assertions = "1.1.0"
+tracy-client = "0.17"
 thiserror = "1"
-wgpu = { version = "0.19.0", path = "./wgpu" }
-wgpu-core = { version = "0.19.0", path = "./wgpu-core" }
-wgpu-example = { version = "0.19.0", path = "./examples/common" }
-wgpu-macros = { version = "0.19.0", path = "./wgpu-macros" }
-wgpu-test = { version = "0.19.0", path = "./tests" }
-wgpu-types = { version = "0.19.0", path = "./wgpu-types" }
+wgpu = { version = "0.20.0", path = "./wgpu" }
+wgpu-core = { version = "0.20.0", path = "./wgpu-core" }
+wgpu-example = { version = "0.20.0", path = "./examples/common" }
+wgpu-macros = { version = "0.20.0", path = "./wgpu-macros" }
+wgpu-test = { version = "0.20.0", path = "./tests" }
+wgpu-types = { version = "0.20.0", path = "./wgpu-types" }
 winit = { version = "0.29", features = ["android-native-activity"] }

 # Metal dependencies
 block = "0.1"
 core-graphics-types = "0.1"
-metal = "0.27.0"
+metal = { version = "0.28.0" }
 objc = "0.2.5"

 # Vulkan dependencies
 android_system_properties = "0.1.1"
-ash = "0.37.3"
+ash = "0.38.0"
 gpu-alloc = "0.6"
-gpu-descriptor = "0.2"
+gpu-descriptor = "0.3"

 # DX dependencies
 bit-set = "0.5"
-gpu-allocator = { version = "0.25", default_features = false, features = [
+gpu-allocator = { version = "0.26", default-features = false, features = [
    "d3d12",
    "public-winapi",
 ] }
-d3d12 = { version = "0.7.0", path = "./d3d12/" }
+d3d12 = { version = "0.20.0", path = "./d3d12/" }
 range-alloc = "0.1"
 winapi = "0.3"
 hassle-rs = "0.11.0"
@ -165,13 +170,13 @@ web-sys = "0.3.69"
 web-time = "0.2.4"

 # deno dependencies
-deno_console = "0.125.0"
-deno_core = "0.232.0"
-deno_url = "0.125.0"
-deno_web = "0.156.0"
-deno_webidl = "0.125.0"
-deno_webgpu = { version = "0.85.0", path = "./deno_webgpu" }
-tokio = "1.36.0"
+deno_console = "0.143.0"
+deno_core = "0.272.0"
+deno_url = "0.143.0"
+deno_web = "0.174.0"
+deno_webidl = "0.143.0"
+deno_webgpu = { version = "0.118.0", path = "./deno_webgpu" }
+tokio = "1.37.0"
 termcolor = "1.4.1"

 [patch."https://github.com/gfx-rs/naga"]
@ -188,6 +193,10 @@ termcolor = "1.4.1"
 #js-sys = { path = "../wasm-bindgen/crates/js-sys" }
 #wasm-bindgen = { path = "../wasm-bindgen" }

+[profile.release]
+lto = "thin"
+debug = true
+
 # Speed up image comparison even in debug builds
 [profile.dev.package."nv-flip-sys"]
 opt-level = 3
--- a/README.md
+++ b/README.md
@ -199,7 +199,7 @@ To run a given set of tests:

 ```
 # Must be inside the `cts` folder we just checked out, else this will fail
-cargo run --manifest-path ../Cargo.toml --bin cts_runner -- ./tools/run_deno --verbose "<test string>"
+cargo run --manifest-path ../Cargo.toml -p cts_runner --bin cts_runner -- ./tools/run_deno --verbose "<test string>"
 ```

 To find the full list of tests, go to the [online cts viewer](https://gpuweb.github.io/cts/standalone/?runnow=0&worker=0&debug=0&q=webgpu:*).
--- a/benches/Cargo.toml
+++ b/benches/Cargo.toml
@ -0,0 +1,46 @@
+[package]
+name = "wgpu-benchmark"
+version.workspace = true
+authors.workspace = true
+edition.workspace = true
+description = "wgpu benchmarking suite"
+homepage.workspace = true
+repository.workspace = true
+keywords.workspace = true
+license.workspace = true
+autobenches = false
+publish = false
+
+[[bench]]
+name = "root"
+harness = false
+path = "benches/root.rs"
+
+[features]
+# Uncomment these features to enable tracy and superluminal profiling.
+# tracy = ["dep:tracy-client", "profiling/profile-with-tracy"]
+# superluminal = ["profiling/profile-with-superluminal"]
+
+[dependencies]
+bincode.workspace = true
+bytemuck.workspace = true
+criterion.workspace = true
+naga = { workspace = true, features = [
+    "deserialize",
+    "serialize",
+    "wgsl-in",
+    "spv-in",
+    "glsl-in",
+    "spv-out",
+    "msl-out",
+    "hlsl-out",
+    "glsl-out",
+    "wgsl-out",
+] }
+nanorand.workspace = true
+once_cell.workspace = true
+pollster.workspace = true
+profiling.workspace = true
+rayon.workspace = true
+tracy-client = { workspace = true, optional = true }
+wgpu.workspace = true
--- a/benches/README.md
+++ b/benches/README.md
@ -0,0 +1,95 @@
+Collection of CPU benchmarks for `wgpu`.
+
+These benchmarks are designed as a first line of defence against performance regressions and generally approximate the performance for users.
+They all do very little GPU work and are testing the CPU performance of the API.
+
+Criterion will give you the end-to-end performance of the benchmark, but you can also use a profiler to get more detailed information about where time is being spent.
+
+## Usage
+
+```sh
+# Run all benchmarks
+cargo bench -p wgpu-benchmark
+# Run only the benchmarks that contain "filter" in their name
+cargo bench -p wgpu-benchmark -- "filter"
+```
+
+## Benchmarks
+
+#### `Renderpass`
+
+This benchmark measures the performance of recording and submitting a render pass with a large
+number of draw calls and resources, emulating an intense, more traditional graphics application. 
+By default it measures 10k draw calls, with 90k total resources.
+
+Within this benchmark, both single threaded and multi-threaded recording are tested, as well as splitting
+the render pass into multiple passes over multiple command buffers.
+
+#### `Resource Creation`
+
+This benchmark measures the performance of creating large resources. By default it makes buffers that are 256MB. It tests this over a range of thread counts.
+
+#### `Shader Compilation`
+
+This benchmark measures the performance of naga parsing, validating, and generating shaders. 
+
+## Comparing Against a Baseline
+
+To compare the current benchmarks against a baseline, you can use the `--save-baseline` and `--baseline` flags.
+
+For example, to compare v0.20 against trunk, you could run the following:
+
+```sh
+git checkout v0.20
+
+# Run the baseline benchmarks
+cargo bench -p wgpu-benchmark -- --save-baseline "v0.20"
+
+git checkout trunk
+
+# Run the current benchmarks
+cargo bench -p wgpu-benchmark -- --baseline "v0.20"
+```
+
+You can use this for any bits of code you want to compare.
+
+## Integration with Profilers
+
+The benchmarks can be run with a profiler to get more detailed information about where time is being spent.
+Integrations are available for `tracy` and `superluminal`. Due to some implementation details,
+you need to uncomment the corresponding features in `benches/Cargo.toml` before they can be enabled.
+
+#### Tracy
+
+Tracy is available prebuilt for Windows on [GitHub](https://github.com/wolfpld/tracy/releases/latest/).
+
+```sh
+# Once this is running, you can connect to it with the Tracy Profiler
+cargo bench -p wgpu-benchmark --features tracy
+```
+
+#### Superluminal
+
+Superluminal is a paid product for Windows available [here](https://superluminal.eu/).
+
+```sh
+# This command will build the benchmarks, and display the path to the executable
+cargo bench -p wgpu-benchmark --features superluminal -- -h
+
+# Have Superluminal run the following command (replacing with the path to the executable)
+./target/release/deps/root-2c45d61b38a65438.exe --bench "filter"
+```
+
+#### `perf` and others
+
+You can follow the same pattern as above to run the benchmarks with other profilers.
+For example, the command line tool `perf` can be used to profile the benchmarks.
+
+```sh
+# This command will build the benchmarks, and display the path to the executable
+cargo bench -p wgpu-benchmark -- -h
+
+# Run the benchmarks with perf
+perf record ./target/release/deps/root-2c45d61b38a65438 --bench "filter"
+```
+
--- a/benches/benches/renderpass-bindless.wgsl
+++ b/benches/benches/renderpass-bindless.wgsl
@ -0,0 +1,26 @@
+@group(0) @binding(0)
+var tex: binding_array<texture_2d<f32>>;
+
+struct VertexOutput {
+    @builtin(position) position: vec4f,
+    @location(0) @interpolate(flat) instance_index: u32,
+}
+
+@vertex
+fn vs_main(@builtin(instance_index) instance_index: u32) -> VertexOutput { // emits a constant position and forwards the instance index
+    return VertexOutput(
+        vec4f(0.0, 0.0, 0.0, 1.0), // every vertex gets the same clip-space position
+        instance_index // passed on to the fragment stage, which uses it to pick textures
+    );
+}
+
+@fragment
+fn fs_main(vs_in: VertexOutput) -> @location(0) vec4f { // sums texel (0, 0) at mip 0 of seven textures from the binding array
+    return textureLoad(tex[7 * vs_in.instance_index + 0], vec2u(0), 0) + // instance i reads array slots 7*i ..= 7*i + 6
+           textureLoad(tex[7 * vs_in.instance_index + 1], vec2u(0), 0) +
+           textureLoad(tex[7 * vs_in.instance_index + 2], vec2u(0), 0) +
+           textureLoad(tex[7 * vs_in.instance_index + 3], vec2u(0), 0) +
+           textureLoad(tex[7 * vs_in.instance_index + 4], vec2u(0), 0) +
+           textureLoad(tex[7 * vs_in.instance_index + 5], vec2u(0), 0) +
+           textureLoad(tex[7 * vs_in.instance_index + 6], vec2u(0), 0); 
+}
--- a/benches/benches/renderpass.rs
+++ b/benches/benches/renderpass.rs
@ -0,0 +1,575 @@
+use std::{
+    num::NonZeroU32,
+    time::{Duration, Instant},
+};
+
+use criterion::{criterion_group, Criterion, Throughput};
+use nanorand::{Rng, WyRand};
+use once_cell::sync::Lazy;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+
+use crate::DeviceState;
+
+const DRAW_COUNT: usize = 10_000;
+// Must match the number of textures per draw in the renderpass.wgsl and renderpass-bindless.wgsl shaders
+const TEXTURES_PER_DRAW: usize = 7;
+const VERTEX_BUFFERS_PER_DRAW: usize = 2;
+const VERTEX_BUFFER_COUNT: usize = DRAW_COUNT * VERTEX_BUFFERS_PER_DRAW;
+
+const TEXTURE_COUNT: usize = DRAW_COUNT * TEXTURES_PER_DRAW;
+
+struct RenderpassState { // everything `new` creates up front so the pass-recording methods only record commands
+    device_state: DeviceState,
+    pipeline: wgpu::RenderPipeline, // non-bindless pipeline built from renderpass.wgsl
+    bind_groups: Vec<wgpu::BindGroup>, // DRAW_COUNT groups of TEXTURES_PER_DRAW texture views, shuffled
+    vertex_buffers: Vec<wgpu::Buffer>, // VERTEX_BUFFER_COUNT buffers, shuffled
+    index_buffers: Vec<wgpu::Buffer>, // DRAW_COUNT buffers, shuffled
+    render_target: wgpu::TextureView, // view of a 1x1 Rgba8UnormSrgb render attachment
+
+    // Bindless resources; both stay `None` when the device lacks the required features or limits
+    bindless_bind_group: Option<wgpu::BindGroup>,
+    bindless_pipeline: Option<wgpu::RenderPipeline>,
+}
+
+impl RenderpassState {
+    /// Create and prepare all the resources needed for the renderpass benchmark.
+    fn new() -> Self {
+        let device_state = DeviceState::new();
+
+        let supports_bindless = device_state.device.features().contains( // bindless path needs both features below...
+            wgpu::Features::TEXTURE_BINDING_ARRAY
+                | wgpu::Features::SAMPLED_TEXTURE_AND_STORAGE_BUFFER_ARRAY_NON_UNIFORM_INDEXING,
+        ) && device_state
+            .device
+            .limits()
+            .max_sampled_textures_per_shader_stage
+            >= TEXTURE_COUNT as _; // ...and a limit large enough to bind every texture at once
+
+        // Performance gets considerably worse if the resources are shuffled.
+        //
+        // This more closely matches the real-world use case where resources have no
+        // well defined usage order.
+        let mut random = WyRand::new_seed(0x8BADF00D); // fixed seed; presumably keeps the shuffle order reproducible between runs
+
+        let mut bind_group_layout_entries = Vec::with_capacity(TEXTURES_PER_DRAW); // one 2D float texture binding per texture of a draw
+        for i in 0..TEXTURES_PER_DRAW {
+            bind_group_layout_entries.push(wgpu::BindGroupLayoutEntry {
+                binding: i as u32,
+                visibility: wgpu::ShaderStages::FRAGMENT,
+                ty: wgpu::BindingType::Texture {
+                    sample_type: wgpu::TextureSampleType::Float { filterable: true },
+                    view_dimension: wgpu::TextureViewDimension::D2,
+                    multisampled: false,
+                },
+                count: None,
+            });
+        }
+
+        let bind_group_layout =
+            device_state
+                .device
+                .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                    label: None,
+                    entries: &bind_group_layout_entries,
+                });
+
+        let mut texture_views = Vec::with_capacity(TEXTURE_COUNT); // TEXTURE_COUNT 1x1 textures; only their views are stored
+        for i in 0..TEXTURE_COUNT {
+            let texture = device_state
+                .device
+                .create_texture(&wgpu::TextureDescriptor {
+                    label: Some(&format!("Texture {i}")),
+                    size: wgpu::Extent3d {
+                        width: 1,
+                        height: 1,
+                        depth_or_array_layers: 1,
+                    },
+                    mip_level_count: 1,
+                    sample_count: 1,
+                    dimension: wgpu::TextureDimension::D2,
+                    format: wgpu::TextureFormat::Rgba8UnormSrgb,
+                    usage: wgpu::TextureUsages::TEXTURE_BINDING,
+                    view_formats: &[],
+                });
+            texture_views.push(texture.create_view(&wgpu::TextureViewDescriptor {
+                label: Some(&format!("Texture View {i}")),
+                ..Default::default()
+            }));
+        }
+        random.shuffle(&mut texture_views);
+
+        let texture_view_refs: Vec<_> = texture_views.iter().collect(); // borrowed views in shuffled order, consumed by the bindless bind group
+
+        let mut bind_groups = Vec::with_capacity(DRAW_COUNT);
+        for draw_idx in 0..DRAW_COUNT {
+            let mut entries = Vec::with_capacity(TEXTURES_PER_DRAW);
+            for tex_idx in 0..TEXTURES_PER_DRAW {
+                entries.push(wgpu::BindGroupEntry {
+                    binding: tex_idx as u32,
+                    resource: wgpu::BindingResource::TextureView(
+                        &texture_views[draw_idx * TEXTURES_PER_DRAW + tex_idx], // each draw takes a consecutive run of the shuffled views
+                    ),
+                });
+            }
+
+            bind_groups.push(
+                device_state
+                    .device
+                    .create_bind_group(&wgpu::BindGroupDescriptor {
+                        label: None,
+                        layout: &bind_group_layout,
+                        entries: &entries,
+                    }),
+            );
+        }
+        random.shuffle(&mut bind_groups);
+
+        let sm = device_state
+            .device
+            .create_shader_module(wgpu::include_wgsl!("renderpass.wgsl"));
+
+        let pipeline_layout =
+            device_state
+                .device
+                .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                    label: None,
+                    bind_group_layouts: &[&bind_group_layout],
+                    push_constant_ranges: &[],
+                });
+
+        let mut vertex_buffers = Vec::with_capacity(VERTEX_BUFFER_COUNT);
+        for _ in 0..VERTEX_BUFFER_COUNT {
+            vertex_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
+                label: None,
+                size: 3 * 16, // three elements at the 16-byte array stride declared below
+                usage: wgpu::BufferUsages::VERTEX,
+                mapped_at_creation: false,
+            }));
+        }
+        random.shuffle(&mut vertex_buffers);
+
+        let mut index_buffers = Vec::with_capacity(DRAW_COUNT);
+        for _ in 0..DRAW_COUNT {
+            index_buffers.push(device_state.device.create_buffer(&wgpu::BufferDescriptor {
+                label: None,
+                size: 3 * 4, // three 4-byte indices
+                usage: wgpu::BufferUsages::INDEX,
+                mapped_at_creation: false,
+            }));
+        }
+        random.shuffle(&mut index_buffers);
+
+        let mut vertex_buffer_attributes = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
+        for i in 0..VERTEX_BUFFERS_PER_DRAW {
+            vertex_buffer_attributes.push(wgpu::vertex_attr_array![i as u32 => Float32x4]); // buffer slot i feeds shader location i
+        }
+
+        let mut vertex_buffer_layouts = Vec::with_capacity(VERTEX_BUFFERS_PER_DRAW);
+        for attributes in &vertex_buffer_attributes {
+            vertex_buffer_layouts.push(wgpu::VertexBufferLayout {
+                array_stride: 16,
+                step_mode: wgpu::VertexStepMode::Vertex,
+                attributes,
+            });
+        }
+
+        let pipeline =
+            device_state
+                .device
+                .create_render_pipeline(&wgpu::RenderPipelineDescriptor {
+                    label: None,
+                    layout: Some(&pipeline_layout),
+                    vertex: wgpu::VertexState {
+                        module: &sm,
+                        entry_point: "vs_main",
+                        buffers: &vertex_buffer_layouts,
+                        compilation_options: wgpu::PipelineCompilationOptions::default(),
+                    },
+                    primitive: wgpu::PrimitiveState {
+                        topology: wgpu::PrimitiveTopology::TriangleList,
+                        strip_index_format: None,
+                        front_face: wgpu::FrontFace::Cw,
+                        cull_mode: Some(wgpu::Face::Back),
+                        polygon_mode: wgpu::PolygonMode::Fill,
+                        unclipped_depth: false,
+                        conservative: false,
+                    },
+                    depth_stencil: None,
+                    multisample: wgpu::MultisampleState::default(),
+                    fragment: Some(wgpu::FragmentState {
+                        module: &sm,
+                        entry_point: "fs_main",
+                        targets: &[Some(wgpu::ColorTargetState {
+                            format: wgpu::TextureFormat::Rgba8UnormSrgb, // same format as the render target created below
+                            blend: None,
+                            write_mask: wgpu::ColorWrites::ALL,
+                        })],
+                        compilation_options: wgpu::PipelineCompilationOptions::default(),
+                    }),
+                    multiview: None,
+                    cache: None,
+                });
+
+        let render_target = device_state
+            .device
+            .create_texture(&wgpu::TextureDescriptor {
+                label: Some("Render Target"),
+                size: wgpu::Extent3d {
+                    width: 1,
+                    height: 1,
+                    depth_or_array_layers: 1,
+                },
+                mip_level_count: 1,
+                sample_count: 1,
+                dimension: wgpu::TextureDimension::D2,
+                format: wgpu::TextureFormat::Rgba8UnormSrgb,
+                usage: wgpu::TextureUsages::RENDER_ATTACHMENT,
+                view_formats: &[],
+            })
+            .create_view(&wgpu::TextureViewDescriptor::default());
+
+        let mut bindless_bind_group = None; // both remain None unless supports_bindless is true
+        let mut bindless_pipeline = None;
+
+        if supports_bindless {
+            let bindless_bind_group_layout =
+                device_state
+                    .device
+                    .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                        label: None,
+                        entries: &[wgpu::BindGroupLayoutEntry {
+                            binding: 0,
+                            visibility: wgpu::ShaderStages::FRAGMENT,
+                            ty: wgpu::BindingType::Texture {
+                                sample_type: wgpu::TextureSampleType::Float { filterable: true },
+                                view_dimension: wgpu::TextureViewDimension::D2,
+                                multisampled: false,
+                            },
+                            count: Some(NonZeroU32::new(TEXTURE_COUNT as u32).unwrap()), // a single binding holding an array of TEXTURE_COUNT textures
+                        }],
+                    });
+
+            bindless_bind_group = Some(device_state.device.create_bind_group(
+                &wgpu::BindGroupDescriptor {
+                    label: None,
+                    layout: &bindless_bind_group_layout,
+                    entries: &[wgpu::BindGroupEntry {
+                        binding: 0,
+                        resource: wgpu::BindingResource::TextureViewArray(&texture_view_refs), // all shuffled views bound as one array
+                    }],
+                },
+            ));
+
+            let bindless_shader_module = device_state
+                .device
+                .create_shader_module(wgpu::include_wgsl!("renderpass-bindless.wgsl"));
+
+            let bindless_pipeline_layout =
+                device_state
+                    .device
+                    .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+                        label: None,
+                        bind_group_layouts: &[&bindless_bind_group_layout],
+                        push_constant_ranges: &[],
+                    });
+
+            bindless_pipeline = Some(device_state.device.create_render_pipeline(
+                &wgpu::RenderPipelineDescriptor {
+                    label: None,
+                    layout: Some(&bindless_pipeline_layout),
+                    vertex: wgpu::VertexState {
+                        module: &bindless_shader_module,
+                        entry_point: "vs_main",
+                        buffers: &vertex_buffer_layouts, // same vertex buffer layouts as the non-bindless pipeline
+                        compilation_options: wgpu::PipelineCompilationOptions::default(),
+                    },
+                    primitive: wgpu::PrimitiveState {
+                        topology: wgpu::PrimitiveTopology::TriangleList,
+                        strip_index_format: None,
+                        front_face: wgpu::FrontFace::Cw,
+                        cull_mode: Some(wgpu::Face::Back),
+                        polygon_mode: wgpu::PolygonMode::Fill,
+                        unclipped_depth: false,
+                        conservative: false,
+                    },
+                    depth_stencil: None,
+                    multisample: wgpu::MultisampleState::default(),
+                    fragment: Some(wgpu::FragmentState {
+                        module: &bindless_shader_module,
+                        entry_point: "fs_main",
+                        targets: &[Some(wgpu::ColorTargetState {
+                            format: wgpu::TextureFormat::Rgba8UnormSrgb,
+                            blend: None,
+                            write_mask: wgpu::ColorWrites::ALL,
+                        })],
+                        compilation_options: wgpu::PipelineCompilationOptions::default(),
+                    }),
+                    multiview: None,
+                    cache: None,
+                },
+            ));
+        }
+
+        Self {
+            device_state,
+            pipeline,
+            bind_groups,
+            vertex_buffers,
+            index_buffers,
+            render_target,
+
+            bindless_bind_group,
+            bindless_pipeline,
+        }
+    }
+
+    fn run_subpass(&self, pass_number: usize, total_passes: usize) -> wgpu::CommandBuffer { // records slice `pass_number` of the draws into its own command buffer
+        profiling::scope!("Renderpass", &format!("Pass {pass_number}/{total_passes}"));
+
+        let draws_per_pass = DRAW_COUNT / total_passes; // integer division: a remainder of DRAW_COUNT is never drawn; panics if total_passes is 0
+
+        let mut encoder = self
+            .device_state
+            .device
+            .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
+
+        let mut render_pass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
+            label: None,
+            color_attachments: &[Some(wgpu::RenderPassColorAttachment {
+                view: &self.render_target,
+                resolve_target: None,
+                ops: wgpu::Operations {
+                    load: wgpu::LoadOp::Clear(wgpu::Color::BLACK), // every sub-pass clears the shared render target
+                    store: wgpu::StoreOp::Store,
+                },
+            })],
+            occlusion_query_set: None,
+            timestamp_writes: None,
+            depth_stencil_attachment: None,
+        });
+
+        let start_idx = pass_number * draws_per_pass; // this pass covers draws start_idx..end_idx
+        let end_idx = start_idx + draws_per_pass;
+        for draw_idx in start_idx..end_idx {
+            render_pass.set_pipeline(&self.pipeline); // NOTE(review): pipeline is re-set on every draw; presumably deliberate so per-draw state setting is measured - confirm
+            render_pass.set_bind_group(0, &self.bind_groups[draw_idx], &[]);
+            for i in 0..VERTEX_BUFFERS_PER_DRAW {
+                render_pass.set_vertex_buffer(
+                    i as u32,
+                    self.vertex_buffers[draw_idx * VERTEX_BUFFERS_PER_DRAW + i].slice(..), // each draw owns a consecutive run of vertex buffers
+                );
+            }
+            render_pass.set_index_buffer(
+                self.index_buffers[draw_idx].slice(..),
+                wgpu::IndexFormat::Uint32,
+            );
+            render_pass.draw_indexed(0..3, 0, 0..1); // three indices, base vertex 0, one instance
+        }
+
+        drop(render_pass); // the pass is dropped before the encoder is finished
+
+        encoder.finish()
+    }
+
+    /// Records all `DRAW_COUNT` draws into a single render pass using the
+    /// bindless pipeline and bind group.
+    ///
+    /// State is bound once up front (the first vertex buffer is bound to every
+    /// vertex slot, and the first index buffer is used); each draw then only
+    /// differs by its instance index.
+    ///
+    /// Panics if the bindless pipeline or bind group is `None`.
+    fn run_bindless_pass(&self) -> wgpu::CommandBuffer {
+        profiling::scope!("Bindless Renderpass");
+
+        let device = &self.device_state.device;
+        let mut encoder =
+            device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
+
+        let color_attachment = wgpu::RenderPassColorAttachment {
+            view: &self.render_target,
+            resolve_target: None,
+            ops: wgpu::Operations {
+                load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
+                store: wgpu::StoreOp::Store,
+            },
+        };
+
+        // Scoped so the pass ends before the encoder is finished.
+        {
+            let mut rpass = encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
+                label: None,
+                color_attachments: &[Some(color_attachment)],
+                depth_stencil_attachment: None,
+                timestamp_writes: None,
+                occlusion_query_set: None,
+            });
+
+            rpass.set_pipeline(self.bindless_pipeline.as_ref().unwrap());
+            rpass.set_bind_group(0, self.bindless_bind_group.as_ref().unwrap(), &[]);
+
+            for slot in 0..VERTEX_BUFFERS_PER_DRAW {
+                rpass.set_vertex_buffer(slot as u32, self.vertex_buffers[0].slice(..));
+            }
+            rpass.set_index_buffer(self.index_buffers[0].slice(..), wgpu::IndexFormat::Uint32);
+
+            // One single-instance draw per logical draw call; the instance
+            // index is the draw index.
+            for instance in (0..DRAW_COUNT).map(|draw_idx| draw_idx as u32) {
+                rpass.draw_indexed(0..3, 0, instance..instance + 1);
+            }
+        }
+
+        encoder.finish()
+    }
+}
+
+/// Registers the renderpass benchmarks with criterion.
+///
+/// Four sets of benchmarks are defined, all sharing one lazily created
+/// `RenderpassState`:
+///
+/// * "Renderpass: Single Threaded" - encodes the draws on the calling thread,
+///   split over 1, 2, 4 or 8 renderpasses, timing either encoding or submission.
+/// * "Renderpass: Multi Threaded" - encodes the draws on 2, 4 or 8 rayon tasks.
+/// * "Renderpass: Bindless" - encodes all draws in one bindless renderpass.
+/// * "Renderpass: Empty Submit ..." - times an empty queue submission.
+fn run_bench(ctx: &mut Criterion) {
+    // Created on first use so that device setup only happens if a benchmark
+    // in this function actually runs.
+    let state = Lazy::new(RenderpassState::new);
+
+    // Test 10k draw calls split up into 1, 2, 4, and 8 renderpasses
+    let mut group = ctx.benchmark_group("Renderpass: Single Threaded");
+    group.throughput(Throughput::Elements(DRAW_COUNT as _));
+
+    // `time_submit == false` measures command encoding; `true` measures only
+    // the `queue.submit` call.
+    for time_submit in [false, true] {
+        for rpasses in [1, 2, 4, 8] {
+            let draws_per_pass = DRAW_COUNT / rpasses;
+
+            let label = if time_submit {
+                "Submit Time"
+            } else {
+                "Renderpass Time"
+            };
+
+            group.bench_function(
+                &format!("{rpasses} renderpasses x {draws_per_pass} draws ({label})"),
+                |b| {
+                    Lazy::force(&state);
+
+                    b.iter_custom(|iters| {
+                        profiling::scope!("benchmark invocation");
+
+                        // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
+                        // A fixed placeholder duration is reported instead of running it.
+                        if state.device_state.adapter_info.name.contains("Paravirtual") {
+                            return Duration::from_secs_f32(1.0);
+                        }
+
+                        let mut duration = Duration::ZERO;
+
+                        for _ in 0..iters {
+                            profiling::scope!("benchmark iteration");
+
+                            let mut start = Instant::now();
+
+                            let mut buffers: Vec<wgpu::CommandBuffer> = Vec::with_capacity(rpasses);
+                            for i in 0..rpasses {
+                                buffers.push(state.run_subpass(i, rpasses));
+                            }
+
+                            // Either restart the clock so only submission is timed,
+                            // or bank the encoding time measured so far.
+                            if time_submit {
+                                start = Instant::now();
+                            } else {
+                                duration += start.elapsed();
+                            }
+
+                            state.device_state.queue.submit(buffers);
+
+                            if time_submit {
+                                duration += start.elapsed();
+                            }
+
+                            // Waiting for the device happens outside of the timed region.
+                            state.device_state.device.poll(wgpu::Maintain::Wait);
+                        }
+
+                        duration
+                    })
+                },
+            );
+        }
+    }
+    group.finish();
+
+    // Test 10k draw calls split up over 2, 4, and 8 threads.
+    let mut group = ctx.benchmark_group("Renderpass: Multi Threaded");
+    group.throughput(Throughput::Elements(DRAW_COUNT as _));
+
+    for threads in [2, 4, 8] {
+        let draws_per_pass = DRAW_COUNT / threads;
+        group.bench_function(
+            &format!("{threads} threads x {draws_per_pass} draws"),
+            |b| {
+                Lazy::force(&state);
+
+                b.iter_custom(|iters| {
+                    profiling::scope!("benchmark invocation");
+
+                    // This benchmark hangs on Apple Paravirtualized GPUs. No idea why.
+                    // A fixed placeholder duration is reported instead of running it.
+                    if state.device_state.adapter_info.name.contains("Paravirtual") {
+                        return Duration::from_secs_f32(1.0);
+                    }
+
+                    let mut duration = Duration::ZERO;
+
+                    for _ in 0..iters {
+                        profiling::scope!("benchmark iteration");
+
+                        let start = Instant::now();
+
+                        // One renderpass per parallel task; only encoding is timed.
+                        let buffers = (0..threads)
+                            .into_par_iter()
+                            .map(|i| state.run_subpass(i, threads))
+                            .collect::<Vec<_>>();
+
+                        duration += start.elapsed();
+
+                        // Submission and the device wait are not part of the measurement.
+                        state.device_state.queue.submit(buffers);
+                        state.device_state.device.poll(wgpu::Maintain::Wait);
+                    }
+
+                    duration
+                })
+            },
+        );
+    }
+    group.finish();
+
+    // Test all DRAW_COUNT draw calls recorded into a single renderpass using the
+    // bindless pipeline.
+    let mut group = ctx.benchmark_group("Renderpass: Bindless");
+    group.throughput(Throughput::Elements(DRAW_COUNT as _));
+
+    group.bench_function(&format!("{DRAW_COUNT} draws"), |b| {
+        Lazy::force(&state);
+
+        b.iter_custom(|iters| {
+            profiling::scope!("benchmark invocation");
+
+            // Need bindless to run this benchmark
+            // (a fixed placeholder duration is reported when it is unavailable).
+            if state.bindless_bind_group.is_none() {
+                return Duration::from_secs_f32(1.0);
+            }
+
+            let mut duration = Duration::ZERO;
+
+            for _ in 0..iters {
+                profiling::scope!("benchmark iteration");
+
+                let start = Instant::now();
+
+                let buffer = state.run_bindless_pass();
+
+                duration += start.elapsed();
+
+                // Submission and the device wait are not part of the measurement.
+                state.device_state.queue.submit([buffer]);
+                state.device_state.device.poll(wgpu::Maintain::Wait);
+            }
+
+            duration
+        })
+    });
+    group.finish();
+
+    // Time an empty submission while all of the benchmark's textures and
+    // vertex buffers are alive.
+    ctx.bench_function(
+        &format!(
+            "Renderpass: Empty Submit with {} Resources",
+            TEXTURE_COUNT + VERTEX_BUFFER_COUNT
+        ),
+        |b| {
+            Lazy::force(&state);
+
+            b.iter(|| state.device_state.queue.submit([]));
+        },
+    );
+}
+
+// Exposes `run_bench` as the `renderpass` criterion group, with a 10 second
+// measurement time per benchmark.
+criterion_group! {
+    name = renderpass;
+    config = Criterion::default().measurement_time(Duration::from_secs(10));
+    targets = run_bench,
+}
--- a/benches/benches/renderpass.wgsl
+++ b/benches/benches/renderpass.wgsl
@ -0,0 +1,36 @@
+// Seven 2D float textures, all in bind group 0 at bindings 0 through 6.
+// Every one of them is read by the fragment shader below.
+@group(0) @binding(0)
+var tex_1: texture_2d<f32>;
+
+@group(0) @binding(1)
+var tex_2: texture_2d<f32>;
+
+@group(0) @binding(2)
+var tex_3: texture_2d<f32>;
+
+@group(0) @binding(3)
+var tex_4: texture_2d<f32>;
+
+@group(0) @binding(4)
+var tex_5: texture_2d<f32>;
+
+@group(0) @binding(5)
+var tex_6: texture_2d<f32>;
+
+@group(0) @binding(6)
+var tex_7: texture_2d<f32>;
+
+// Vertex stage: takes no inputs and emits the same position,
+// (0, 0, 0, 1), for every vertex.
+@vertex
+fn vs_main() -> @builtin(position) vec4f {
+    return vec4f(0.0, 0.0, 0.0, 1.0);
+}
+
+// Fragment stage: loads texel (0, 0) of mip level 0 from each of the seven
+// bound textures and outputs their sum to color target 0.
+@fragment
+fn fs_main() -> @location(0) vec4f {
+    let origin = vec2u(0);
+
+    // Accumulate in binding order so the additions happen in the same
+    // left-to-right order as a single chained expression.
+    var color = textureLoad(tex_1, origin, 0);
+    color += textureLoad(tex_2, origin, 0);
+    color += textureLoad(tex_3, origin, 0);
+    color += textureLoad(tex_4, origin, 0);
+    color += textureLoad(tex_5, origin, 0);
+    color += textureLoad(tex_6, origin, 0);
+    color += textureLoad(tex_7, origin, 0);
+
+    return color;
+}
--- a/benches/benches/resource_creation.rs
+++ b/benches/benches/resource_creation.rs
@ -0,0 +1,71 @@
+use std::time::{Duration, Instant};
+
+use criterion::{criterion_group, Criterion, Throughput};
+use once_cell::sync::Lazy;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+
+use crate::DeviceState;
+
+/// Benchmarks the creation of large (256 MiB) `COPY_DST` buffers.
+///
+/// Eight buffers are created per iteration, with the work divided evenly over
+/// 1, 2, 4 or 8 rayon tasks. Only buffer creation is timed; dropping the
+/// buffers and letting the device clean up happen outside the measurement.
+fn run_bench(ctx: &mut Criterion) {
+    const RESOURCES_TO_CREATE: usize = 8;
+
+    // Device setup is deferred until a benchmark actually runs.
+    let state = Lazy::new(DeviceState::new);
+
+    let mut group = ctx.benchmark_group("Resource Creation: Large Buffer");
+    group.throughput(Throughput::Elements(RESOURCES_TO_CREATE as _));
+
+    for threads in [1, 2, 4, 8] {
+        let resources_per_thread = RESOURCES_TO_CREATE / threads;
+        let bench_name = format!("{threads} threads x {resources_per_thread} resource");
+
+        group.bench_function(&bench_name, |b| {
+            Lazy::force(&state);
+
+            b.iter_custom(|iters| {
+                profiling::scope!("benchmark invocation");
+
+                let mut elapsed = Duration::ZERO;
+
+                for _ in 0..iters {
+                    profiling::scope!("benchmark iteration");
+
+                    // We can't create too many resources at once, so we do it 8 resources at a time.
+                    let timer = Instant::now();
+
+                    let created = (0..threads)
+                        .into_par_iter()
+                        .map(|_| {
+                            std::iter::repeat_with(|| {
+                                state.device.create_buffer(&wgpu::BufferDescriptor {
+                                    label: None,
+                                    size: 256 * 1024 * 1024,
+                                    usage: wgpu::BufferUsages::COPY_DST,
+                                    mapped_at_creation: false,
+                                })
+                            })
+                            .take(resources_per_thread)
+                            .collect::<Vec<_>>()
+                        })
+                        .collect::<Vec<_>>();
+
+                    elapsed += timer.elapsed();
+
+                    // Release the buffers, then submit and wait, all outside
+                    // the timed region.
+                    drop(created);
+
+                    state.queue.submit([]);
+                    state.device.poll(wgpu::Maintain::Wait);
+                }
+
+                elapsed
+            })
+        });
+    }
+    group.finish();
+}
+
+// Exposes `run_bench` as the `resource_creation` criterion group, with a
+// 10 second measurement time per benchmark.
+criterion_group! {
+    name = resource_creation;
+    config = Criterion::default().measurement_time(Duration::from_secs(10));
+    targets = run_bench,
+}
--- a/benches/benches/root.rs
+++ b/benches/benches/root.rs
@ -0,0 +1,65 @@
+use criterion::criterion_main;
+use pollster::block_on;
+
+mod renderpass;
+mod resource_creation;
+mod shader;
+
+/// The wgpu device and queue shared by the benchmarks, together with
+/// information about the adapter they were created from.
+struct DeviceState {
+    /// Information reported by the adapter the device was requested from.
+    adapter_info: wgpu::AdapterInfo,
+    /// Device used to create resources and command encoders.
+    device: wgpu::Device,
+    /// Queue paired with `device`.
+    queue: wgpu::Queue,
+}
+
+impl DeviceState {
+    /// Creates the instance, adapter, device and queue used by the benchmarks.
+    ///
+    /// Backends and the DX12 shader compiler can be overridden through the
+    /// environment; otherwise Metal is used on macOS, every backend elsewhere,
+    /// and FXC for DX12. The device is requested with every feature and limit
+    /// the adapter reports. The adapter info is printed to stderr.
+    ///
+    /// Panics if no adapter is found or the device request fails.
+    fn new() -> Self {
+        #[cfg(feature = "tracy")]
+        tracy_client::Client::start();
+
+        // We don't want to use Molten-VK on Mac.
+        let default_backends = if cfg!(target_os = "macos") {
+            wgpu::Backends::METAL
+        } else {
+            wgpu::Backends::all()
+        };
+        let backends = wgpu::util::backend_bits_from_env().unwrap_or(default_backends);
+        let dx12_shader_compiler =
+            wgpu::util::dx12_shader_compiler_from_env().unwrap_or(wgpu::Dx12Compiler::Fxc);
+
+        let instance = wgpu::Instance::new(wgpu::InstanceDescriptor {
+            backends,
+            flags: wgpu::InstanceFlags::empty(),
+            dx12_shader_compiler,
+            gles_minor_version: wgpu::Gles3MinorVersion::Automatic,
+        });
+
+        let adapter_request = wgpu::util::initialize_adapter_from_env_or_default(&instance, None);
+        let adapter = block_on(adapter_request).unwrap();
+
+        let adapter_info = adapter.get_info();
+        eprintln!("{adapter_info:?}");
+
+        // Ask for everything the adapter supports.
+        let device_descriptor = wgpu::DeviceDescriptor {
+            required_features: adapter.features(),
+            required_limits: adapter.limits(),
+            label: Some("RenderPass Device"),
+        };
+        let (device, queue) = block_on(adapter.request_device(&device_descriptor, None)).unwrap();
+
+        Self {
+            adapter_info,
+            device,
+            queue,
+        }
+    }
+}
+
+// Entry point of the benchmark binary: runs the `renderpass`,
+// `resource_creation` and `shader` criterion groups.
+criterion_main!(
+    renderpass::renderpass,
+    resource_creation::resource_creation,
+    shader::shader
+);
--- a/benches/benches/shader.rs
+++ b/benches/benches/shader.rs
@ -0,0 +1,355 @@
+use criterion::*;
+use std::{fs, path::PathBuf};
+
+/// One shader file, together with the results of each processing stage that
+/// has been run on it so far.
+struct Input {
+    /// Path of the file, lossily converted to a `String`.
+    filename: String,
+    /// File size in bytes, taken from the directory entry's metadata.
+    size: u64,
+    /// Raw file contents; empty until `Inputs::load` has read the file.
+    data: Vec<u8>,
+    /// File contents as UTF-8 text; `None` until `Inputs::load_utf8` has run.
+    string: Option<String>,
+    /// Parsed WGSL module; `None` until `Inputs::parse` has run.
+    module: Option<naga::Module>,
+    /// Validation result for `module`; `None` until `Inputs::validate` has run.
+    module_info: Option<naga::valid::ModuleInfo>,
+}
+
+/// A set of shader files that are loaded and processed together.
+struct Inputs {
+    /// The individual files, in directory iteration order.
+    inner: Vec<Input>,
+}
+
+impl Inputs {
+    /// Collects every file in `folder` (relative to the crate's manifest
+    /// directory) whose extension equals `extension`.
+    ///
+    /// Only the path and size are recorded; file contents are loaded later.
+    /// Directory entries that cannot be read are reported on stderr and skipped.
+    ///
+    /// Panics if the directory cannot be read or an entry's metadata is unavailable.
+    fn from_dir(folder: &str, extension: &str) -> Self {
+        let dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join(folder);
+
+        let inner = dir
+            .read_dir()
+            .unwrap()
+            .filter_map(|file_entry| {
+                let entry = match file_entry {
+                    Ok(entry) => entry,
+                    Err(e) => {
+                        eprintln!("Skipping file: {:?}", e);
+                        return None;
+                    }
+                };
+
+                // Files without an extension, or with a different one, are ignored.
+                let path = entry.path();
+                if path.extension()? != extension {
+                    return None;
+                }
+
+                Some(Input {
+                    filename: path.to_string_lossy().into_owned(),
+                    size: entry.metadata().unwrap().len(),
+                    data: vec![],
+                    string: None,
+                    module: None,
+                    module_info: None,
+                })
+            })
+            .collect();
+
+        Self { inner }
+    }
+
+    /// Total size in bytes of all inputs, as recorded when they were discovered.
+    fn bytes(&self) -> u64 {
+        self.inner.iter().fold(0, |total, input| total + input.size)
+    }
+
+    /// Reads the contents of every input whose data is still empty.
+    ///
+    /// A file that fails to read is left with empty data.
+    fn load(&mut self) {
+        let unloaded = self.inner.iter_mut().filter(|input| input.data.is_empty());
+        for input in unloaded {
+            input.data = fs::read(&input.filename).unwrap_or_default();
+        }
+    }
+
+    /// Loads every input and decodes it as UTF-8 text.
+    ///
+    /// Panics if an input is not valid UTF-8.
+    fn load_utf8(&mut self) {
+        self.load();
+
+        let undecoded = self.inner.iter_mut().filter(|input| input.string.is_none());
+        for input in undecoded {
+            let text = std::str::from_utf8(&input.data).unwrap();
+            input.string = Some(text.to_string());
+        }
+    }
+
+    /// Loads every input and parses it with the WGSL frontend.
+    ///
+    /// Panics if an input fails to parse.
+    fn parse(&mut self) {
+        self.load_utf8();
+
+        let mut parser = naga::front::wgsl::Frontend::new();
+        let unparsed = self.inner.iter_mut().filter(|input| input.module.is_none());
+        for input in unparsed {
+            let source = input.string.as_ref().unwrap();
+            input.module = Some(parser.parse(source).unwrap());
+        }
+    }
+
+    /// Parses and validates every input, then discards the inputs that failed
+    /// validation.
+    fn validate(&mut self) {
+        self.parse();
+
+        let mut validator = naga::valid::Validator::new(
+            naga::valid::ValidationFlags::all(),
+            // Note, this is empty, to let all backends work.
+            naga::valid::Capabilities::empty(),
+        );
+
+        let unvalidated = self
+            .inner
+            .iter_mut()
+            .filter(|input| input.module_info.is_none());
+        for input in unvalidated {
+            let module = input.module.as_ref().unwrap();
+            input.module_info = validator.validate(module).ok();
+        }
+
+        self.inner.retain(|input| input.module_info.is_some());
+    }
+}
+
+/// Parses every input as GLSL for the given shader `stage`, with no defines.
+///
+/// The inputs must already have been decoded with `load_utf8`.
+/// Panics if an input has no text or fails to parse.
+fn parse_glsl(stage: naga::ShaderStage, inputs: &Inputs) {
+    let options = naga::front::glsl::Options {
+        stage,
+        defines: Default::default(),
+    };
+    let mut frontend = naga::front::glsl::Frontend::default();
+
+    let sources = inputs
+        .inner
+        .iter()
+        .map(|input| input.string.as_deref().unwrap());
+    for source in sources {
+        frontend.parse(&options, source).unwrap();
+    }
+}
+
+/// Benchmarks the naga shader frontends.
+///
+/// Measures, in order: bincode decoding of already parsed modules, WGSL
+/// parsing, SPIR-V parsing and GLSL parsing. Throughput is reported in bytes of
+/// the source files each benchmark consumes. Input loading happens inside the
+/// `bench_function` callbacks (but outside `b.iter`) so that it only runs for
+/// benchmarks that are actually selected.
+fn frontends(c: &mut Criterion) {
+    let mut group = c.benchmark_group("front");
+
+    let mut inputs_wgsl = Inputs::from_dir("../naga/tests/in", "wgsl");
+    group.throughput(Throughput::Bytes(inputs_wgsl.bytes()));
+    group.bench_function("shader: naga module bincode decode", |b| {
+        inputs_wgsl.parse();
+
+        // Serialize up front; only deserialization is measured.
+        let inputs_bin = inputs_wgsl
+            .inner
+            .iter()
+            .map(|input| bincode::serialize(&input.module.as_ref().unwrap()).unwrap())
+            .collect::<Vec<_>>();
+
+        b.iter(move || {
+            for input in inputs_bin.iter() {
+                bincode::deserialize::<naga::Module>(input).unwrap();
+            }
+        });
+    });
+
+    group.bench_function("shader: wgsl-in", |b| {
+        inputs_wgsl.load_utf8();
+
+        let mut frontend = naga::front::wgsl::Frontend::new();
+        b.iter(|| {
+            for input in &inputs_wgsl.inner {
+                frontend.parse(input.string.as_ref().unwrap()).unwrap();
+            }
+        });
+    });
+
+    let mut inputs_spirv = Inputs::from_dir("../naga/tests/in/spv", "spv");
+    group.throughput(Throughput::Bytes(inputs_spirv.bytes()));
+    group.bench_function("shader: spv-in", |b| {
+        inputs_spirv.load();
+
+        b.iter(|| {
+            let options = naga::front::spv::Options::default();
+            for input in &inputs_spirv.inner {
+                let spv = bytemuck::cast_slice(&input.data);
+                let parser = naga::front::spv::Frontend::new(spv.iter().cloned(), &options);
+                parser.parse().unwrap();
+            }
+        });
+    });
+
+    let mut inputs_vertex = Inputs::from_dir("../naga/tests/in/glsl", "vert");
+    let mut inputs_fragment = Inputs::from_dir("../naga/tests/in/glsl", "frag");
+    // let mut inputs_compute = Inputs::from_dir("../naga/tests/in/glsl", "comp");
+    group.throughput(Throughput::Bytes(
+        inputs_vertex.bytes() + inputs_fragment.bytes(), // + inputs_compute.bytes()
+    ));
+    group.bench_function("shader: glsl-in", |b| {
+        // `load_utf8` reads the files itself, so no separate `load` is needed.
+        inputs_vertex.load_utf8();
+        inputs_fragment.load_utf8();
+        // inputs_compute.load_utf8();
+
+        // Both stages are parsed inside a single measured closure so that the
+        // timing covers exactly the bytes declared in the throughput above,
+        // and each set of files is parsed with its own shader stage.
+        b.iter(|| {
+            parse_glsl(naga::ShaderStage::Vertex, &inputs_vertex);
+            parse_glsl(naga::ShaderStage::Fragment, &inputs_fragment);
+            // TODO: This one hangs for some reason
+            // parse_glsl(naga::ShaderStage::Compute, &inputs_compute);
+        });
+    });
+    group.finish();
+}
+
+/// Benchmarks naga's validator over the WGSL test inputs.
+///
+/// The validator is created with every validation flag and capability, and
+/// with all subgroup stages and operations allowed. Throughput is reported in
+/// bytes of WGSL source. Panics if any module fails validation.
+fn validation(c: &mut Criterion) {
+    let mut inputs = Inputs::from_dir("../naga/tests/in", "wgsl");
+
+    let mut group = c.benchmark_group("validate");
+    group.throughput(Throughput::Bytes(inputs.bytes()));
+    group.bench_function("shader: validation", |b| {
+        // Parsing is setup work and is not part of the measurement.
+        inputs.load();
+        inputs.load_utf8();
+        inputs.parse();
+
+        let flags = naga::valid::ValidationFlags::all();
+        let capabilities = naga::valid::Capabilities::all();
+        let mut validator = naga::valid::Validator::new(flags, capabilities);
+        validator
+            .subgroup_stages(naga::valid::ShaderStages::all())
+            .subgroup_operations(naga::valid::SubgroupOperationSet::all());
+
+        b.iter(|| {
+            inputs
+                .inner
+                .iter()
+                .map(|input| input.module.as_ref().unwrap())
+                .for_each(|module| {
+                    validator.validate(module).unwrap();
+                });
+        });
+    });
+    group.finish();
+}
+
+/// Benchmarks the naga shader backends (WGSL, SPIR-V, MSL, HLSL and GLSL
+/// writers) over every WGSL test input that passes validation.
+///
+/// Writer errors are ignored throughout: the benchmarks measure the time spent
+/// in the writers whether or not a particular module is supported.
+fn backends(c: &mut Criterion) {
+    let mut inputs = Inputs::from_dir("../naga/tests/in", "wgsl");
+
+    let mut group = c.benchmark_group("back");
+    // Validation would normally live inside the bench_function callback, but it
+    // has to run up front here: inputs that fail validation are removed, and the
+    // throughput below must only count the inputs that remain.
+    inputs.validate();
+
+    group.throughput(Throughput::Bytes(inputs.bytes()));
+    group.bench_function("shader: wgsl-out", |b| {
+        b.iter(|| {
+            let flags = naga::back::wgsl::WriterFlags::empty();
+            let mut output = String::new();
+            for input in &inputs.inner {
+                let module = input.module.as_ref().unwrap();
+                let info = input.module_info.as_ref().unwrap();
+
+                let mut writer = naga::back::wgsl::Writer::new(&mut output, flags);
+                let _ = writer.write(module, info);
+                output.clear();
+            }
+        });
+    });
+
+    group.bench_function("shader: spv-out", |b| {
+        b.iter(|| {
+            let options = naga::back::spv::Options::default();
+            let mut words = Vec::new();
+            for input in &inputs.inner {
+                let module = input.module.as_ref().unwrap();
+                let info = input.module_info.as_ref().unwrap();
+
+                let mut writer = naga::back::spv::Writer::new(&options).unwrap();
+                let _ = writer.write(module, info, None, &None, &mut words);
+                words.clear();
+            }
+        });
+    });
+    group.bench_function("shader: spv-out multiple entrypoints", |b| {
+        b.iter(|| {
+            let options = naga::back::spv::Options::default();
+            let mut words = Vec::new();
+            for input in &inputs.inner {
+                let module = input.module.as_ref().unwrap();
+                let info = input.module_info.as_ref().unwrap();
+
+                // One writer per module, reused for each of its entry points.
+                let mut writer = naga::back::spv::Writer::new(&options).unwrap();
+                for ep in &module.entry_points {
+                    let pipeline_options = naga::back::spv::PipelineOptions {
+                        shader_stage: ep.stage,
+                        entry_point: ep.name.clone(),
+                    };
+                    let _ = writer.write(module, info, Some(&pipeline_options), &None, &mut words);
+                    words.clear();
+                }
+            }
+        });
+    });
+
+    group.bench_function("shader: msl-out", |b| {
+        b.iter(|| {
+            let options = naga::back::msl::Options::default();
+            let mut output = String::new();
+            for input in &inputs.inner {
+                let module = input.module.as_ref().unwrap();
+                let info = input.module_info.as_ref().unwrap();
+
+                let pipeline_options = naga::back::msl::PipelineOptions::default();
+                let mut writer = naga::back::msl::Writer::new(&mut output);
+                let _ = writer.write(module, info, &options, &pipeline_options);
+                output.clear();
+            }
+        });
+    });
+
+    group.bench_function("shader: hlsl-out", |b| {
+        b.iter(|| {
+            let options = naga::back::hlsl::Options::default();
+            let mut output = String::new();
+            for input in &inputs.inner {
+                let module = input.module.as_ref().unwrap();
+                let info = input.module_info.as_ref().unwrap();
+
+                let mut writer = naga::back::hlsl::Writer::new(&mut output, &options);
+                // may fail on unimplemented things
+                let _ = writer.write(module, info);
+                output.clear();
+            }
+        });
+    });
+
+    group.bench_function("shader: glsl-out multiple entrypoints", |b| {
+        b.iter(|| {
+            let options = naga::back::glsl::Options {
+                version: naga::back::glsl::Version::new_gles(320),
+                writer_flags: naga::back::glsl::WriterFlags::empty(),
+                binding_map: Default::default(),
+                zero_initialize_workgroup_memory: true,
+            };
+            let mut output = String::new();
+            for input in &inputs.inner {
+                let module = input.module.as_ref().unwrap();
+                let info = input.module_info.as_ref().unwrap();
+
+                for ep in &module.entry_points {
+                    let pipeline_options = naga::back::glsl::PipelineOptions {
+                        shader_stage: ep.stage,
+                        entry_point: ep.name.clone(),
+                        multiview: None,
+                    };
+
+                    // might be `Err` if missing features
+                    let writer = naga::back::glsl::Writer::new(
+                        &mut output,
+                        module,
+                        info,
+                        &options,
+                        &pipeline_options,
+                        naga::proc::BoundsCheckPolicies::default(),
+                    );
+                    if let Ok(mut writer) = writer {
+                        let _ = writer.write(); // might be `Err` if unsupported
+                    }
+
+                    output.clear();
+                }
+            }
+        });
+    });
+}
+
+// Exposes the frontend, validation and backend benchmarks as the `shader`
+// criterion group.
+criterion_group!(shader, frontends, validation, backends);
--- a/cts_runner/src/bootstrap.js
+++ b/cts_runner/src/bootstrap.js
@ -5,8 +5,7 @@
 //
 delete Object.prototype.__proto__;

-const core = Deno.core;
-const primordials = globalThis.__bootstrap.primordials;
+import { core, primordials } from "ext:core/mod.js";
 const {
  Error,
  ObjectDefineProperty,
@ -25,9 +24,11 @@ import * as base64 from "ext:deno_web/05_base64.js";
 import * as encoding from "ext:deno_web/08_text_encoding.js";
 import { Console } from "ext:deno_console/01_console.js";
 import * as url from "ext:deno_url/00_url.js";
-import DOMException from "ext:deno_web/01_dom_exception.js";
+import { DOMException } from "ext:deno_web/01_dom_exception.js";
 import * as performance from "ext:deno_web/15_performance.js";
-import * as webgpu from "ext:deno_webgpu/01_webgpu.js";
+import { loadWebGPU } from "ext:deno_webgpu/00_init.js";
+import * as imageData from "ext:deno_web/16_image_data.js";
+const webgpu = loadWebGPU();

 // imports needed to pass module evaluation
 import "ext:deno_url/01_urlpattern.js";
@ -39,11 +40,10 @@ import "ext:deno_web/10_filereader.js";
 import "ext:deno_web/12_location.js";
 import "ext:deno_web/13_message_port.js";
 import "ext:deno_web/14_compression.js";
+import "ext:deno_webgpu/02_surface.js";

 let globalThis_;

-const { BadResource, Interrupted } = core;
-
 class NotFound extends Error {
  constructor(msg) {
    super(msg);
@ -183,6 +183,7 @@ const windowOrWorkerGlobalScope = {
  clearInterval: util.writable(timers.clearInterval),
  clearTimeout: util.writable(timers.clearTimeout),
  performance: util.writable(performance.performance),
+  ImageData: core.propNonEnumerable(imageData.ImageData),

  GPU: util.nonEnumerable(webgpu.GPU),
  GPUAdapter: util.nonEnumerable(webgpu.GPUAdapter),
@ -248,10 +249,8 @@ core.registerErrorClass("NotFound", NotFound);
 core.registerErrorClass("AlreadyExists", AlreadyExists);
 core.registerErrorClass("InvalidData", InvalidData);
 core.registerErrorClass("TimedOut", TimedOut);
-core.registerErrorClass("Interrupted", Interrupted);
 core.registerErrorClass("WriteZero", WriteZero);
 core.registerErrorClass("UnexpectedEof", UnexpectedEof);
-core.registerErrorClass("BadResource", BadResource);
 core.registerErrorClass("NotSupported", NotSupported);
 core.registerErrorBuilder(
  "DOMExceptionOperationError",
--- a/cts_runner/src/main.rs
+++ b/cts_runner/src/main.rs
@ -29,6 +29,9 @@ mod native {
            .ok_or_else(|| anyhow!("missing specifier in first command line argument"))?;
        let specifier = resolve_url_or_path(&url, &env::current_dir()?)?;

+        let mut feature_checker = deno_core::FeatureChecker::default();
+        feature_checker.enable_feature(deno_webgpu::UNSTABLE_FEATURE_NAME);
+
        let options = RuntimeOptions {
            module_loader: Some(Rc::new(deno_core::FsModuleLoader)),
            get_error_class_fn: Some(&get_error_class_name),
@ -40,18 +43,19 @@ mod native {
                    Arc::new(BlobStore::default()),
                    None,
                ),
-                deno_webgpu::deno_webgpu::init_ops_and_esm(true),
+                deno_webgpu::deno_webgpu::init_ops_and_esm(),
                cts_runner::init_ops_and_esm(),
            ],
+            feature_checker: Some(Arc::new(feature_checker)),
            ..Default::default()
        };
-        let mut isolate = JsRuntime::new(options);
+        let mut js_runtime = JsRuntime::new(options);
        let args = args_iter.collect::<Vec<String>>();
        let cfg = json!({"args": args, "cwd": env::current_dir().unwrap().to_string_lossy() });

        {
-            let context = isolate.main_context();
-            let scope = &mut isolate.handle_scope();
+            let context = js_runtime.main_context();
+            let scope = &mut js_runtime.handle_scope();
            let context_local = v8::Local::new(scope, context);
            let global_obj = context_local.global(scope);
            let bootstrap_str = v8::String::new(scope, "bootstrap").unwrap();
@ -65,20 +69,12 @@ mod native {
                .unwrap();
        }

-        isolate.op_state().borrow_mut().put(Permissions {});
+        js_runtime.op_state().borrow_mut().put(Permissions {});

-        let mod_id = isolate.load_main_module(&specifier, None).await?;
-        let mod_rx = isolate.mod_evaluate(mod_id);
-
-        let rx = tokio::spawn(async move {
-            match mod_rx.await {
-                Ok(err @ Err(_)) => err,
-                _ => Ok(()),
-            }
-        });
-
-        isolate.run_event_loop(false).await?;
-        rx.await.unwrap()?;
+        let mod_id = js_runtime.load_main_es_module(&specifier).await?;
+        let result = js_runtime.mod_evaluate(mod_id);
+        js_runtime.run_event_loop(Default::default()).await?;
+        result.await?;

        Ok(())
    }
@ -87,7 +83,7 @@ mod native {
        cts_runner,
        deps = [deno_webidl, deno_web],
        ops = [op_exit, op_read_file_sync, op_write_file_sync],
-        esm_entry_point = "ext:cts_runner/bootstrap.js",
+        esm_entry_point = "ext:cts_runner/src/bootstrap.js",
        esm = ["src/bootstrap.js"],
    );

--- a/d3d12/Cargo.toml
+++ b/d3d12/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "d3d12"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["gfx-rs developers"]
 description = "Low level D3D12 API wrapper"
 repository = "https://github.com/gfx-rs/wgpu/tree/trunk/d3d12"
--- a/d3d12/src/resource.rs
+++ b/d3d12/src/resource.rs
@ -14,7 +14,6 @@ pub struct DiscardRegion<'a> {
 pub type Resource = ComPtr<d3d12::ID3D12Resource>;

 impl Resource {
-    ///
    pub fn map(
        &self,
        subresource: Subresource,
--- a/deno_webgpu/00_init.js
+++ b/deno_webgpu/00_init.js
@ -0,0 +1,7 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+import { core } from "ext:core/mod.js";
+
+// Loader for the WebGPU implementation module. Callers invoke `loadWebGPU()`
+// to obtain the exports of "ext:deno_webgpu/01_webgpu.js".
+// NOTE(review): presumably the module is not evaluated until the first call -
+// confirm against deno_core's `createLazyLoader`.
+const loadWebGPU = core.createLazyLoader("ext:deno_webgpu/01_webgpu.js");
+
+export { loadWebGPU };
--- a/deno_webgpu/01_webgpu.js
+++ b/deno_webgpu/01_webgpu.js
--- a/deno_webgpu/02_surface.js
+++ b/deno_webgpu/02_surface.js
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 // @ts-check
 /// <reference path="../../core/lib.deno_core.d.ts" />
@ -6,27 +6,32 @@
 /// <reference path="../web/lib.deno_web.d.ts" />
 /// <reference path="./lib.deno_webgpu.d.ts" />

-const core = globalThis.Deno.core;
-const ops = core.ops;
-import * as webidl from "ext:deno_webidl/00_webidl.js";
-const primordials = globalThis.__bootstrap.primordials;
-const { Symbol } = primordials;
+import { primordials } from "ext:core/mod.js";
 import {
-  _device,
-  assertDevice,
-  createGPUTexture,
-  GPUTextureUsage,
-} from "ext:deno_webgpu/01_webgpu.js";
+  op_webgpu_surface_configure,
+  op_webgpu_surface_create,
+  op_webgpu_surface_get_current_texture,
+  op_webgpu_surface_present,
+} from "ext:core/ops";
+const {
+  ObjectPrototypeIsPrototypeOf,
+  Symbol,
+  SymbolFor,
+  TypeError,
+} = primordials;
+
+import * as webidl from "ext:deno_webidl/00_webidl.js";
+import { createFilteredInspectProxy } from "ext:deno_console/01_console.js";
+import { loadWebGPU } from "ext:deno_webgpu/00_init.js";

 const _surfaceRid = Symbol("[[surfaceRid]]");
 const _configuration = Symbol("[[configuration]]");
 const _canvas = Symbol("[[canvas]]");
 const _currentTexture = Symbol("[[currentTexture]]");
+const _present = Symbol("[[present]]");
 class GPUCanvasContext {
  /** @type {number} */
  [_surfaceRid];
-  /** @type {InnerGPUDevice} */
-  [_device];
  [_configuration];
  [_canvas];
  /** @type {GPUTexture | undefined} */
@ -50,11 +55,15 @@ class GPUCanvasContext {
      context: "Argument 1",
    });

+    const { _device, assertDevice } = loadWebGPU();
    this[_device] = configuration.device[_device];
    this[_configuration] = configuration;
-    const device = assertDevice(this, { prefix, context: "configuration.device" });
+    const device = assertDevice(this, {
+      prefix,
+      context: "configuration.device",
+    });

-    const { err } = ops.op_webgpu_surface_configure({
+    const { err } = op_webgpu_surface_configure({
      surfaceRid: this[_surfaceRid],
      deviceRid: device.rid,
      format: configuration.format,
@ -69,6 +78,8 @@ class GPUCanvasContext {
  }

  unconfigure() {
+    const { _device } = loadWebGPU();
+
    webidl.assertBranded(this, GPUCanvasContextPrototype);

    this[_configuration] = null;
@ -77,11 +88,13 @@ class GPUCanvasContext {

  getCurrentTexture() {
    webidl.assertBranded(this, GPUCanvasContextPrototype);
-    const prefix = "Failed to execute 'getCurrentTexture' on 'GPUCanvasContext'";
+    const prefix =
+      "Failed to execute 'getCurrentTexture' on 'GPUCanvasContext'";

    if (this[_configuration] === null) {
      throw new DOMException("context is not configured.", "InvalidStateError");
    }
+    const { createGPUTexture, assertDevice } = loadWebGPU();

    const device = assertDevice(this, { prefix, context: "this" });

@ -89,7 +102,10 @@ class GPUCanvasContext {
      return this[_currentTexture];
    }

-    const { rid } = ops.op_webgpu_surface_get_current_texture(device.rid, this[_surfaceRid]);
+    const { rid } = op_webgpu_surface_get_current_texture(
+      device.rid,
+      this[_surfaceRid],
+    );

    const texture = createGPUTexture(
      {
@ -112,102 +128,66 @@ class GPUCanvasContext {
    return texture;
  }

-  // Extended from spec. Required to present the texture; browser don't need this.
-  present() {
+  // Required to present the texture; browsers don't need this.
+  [_present]() {
+    const { assertDevice } = loadWebGPU();
+
    webidl.assertBranded(this, GPUCanvasContextPrototype);
    const prefix = "Failed to execute 'present' on 'GPUCanvasContext'";
-    const device = assertDevice(this[_currentTexture], { prefix, context: "this" });
-    ops.op_webgpu_surface_present(device.rid, this[_surfaceRid]);
+    const device = assertDevice(this[_currentTexture], {
+      prefix,
+      context: "this",
+    });
+    op_webgpu_surface_present(device.rid, this[_surfaceRid]);
    this[_currentTexture].destroy();
    this[_currentTexture] = undefined;
  }
+
+  [SymbolFor("Deno.privateCustomInspect")](inspect, inspectOptions) {
+    return inspect(
+      createFilteredInspectProxy({
+        object: this,
+        evaluate: ObjectPrototypeIsPrototypeOf(GPUCanvasContextPrototype, this),
+        keys: [
+          "canvas",
+        ],
+      }),
+      inspectOptions,
+    );
+  }
 }
 const GPUCanvasContextPrototype = GPUCanvasContext.prototype;

 function createCanvasContext(options) {
+  // lazy load webgpu if needed
  const canvasContext = webidl.createBranded(GPUCanvasContext);
  canvasContext[_surfaceRid] = options.surfaceRid;
  canvasContext[_canvas] = options.canvas;
  return canvasContext;
 }

-// Converters
+// External webgpu surfaces

-// ENUM: GPUCanvasAlphaMode
-webidl.converters["GPUCanvasAlphaMode"] = webidl.createEnumConverter(
-  "GPUCanvasAlphaMode",
-  [
-    "opaque",
-    "premultiplied",
-  ],
-);
+// TODO(@littledivy): This will extend `OffscreenCanvas` when we add it.
+class UnsafeWindowSurface {
+  #ctx;
+  #surfaceRid;

-// NON-SPEC: ENUM: GPUPresentMode
-webidl.converters["GPUPresentMode"] = webidl.createEnumConverter(
-  "GPUPresentMode",
-  [
-    "autoVsync",
-    "autoNoVsync",
-    "fifo",
-    "fifoRelaxed",
-    "immediate",
-    "mailbox",
-  ],
-);
+  constructor(system, win, display) {
+    this.#surfaceRid = op_webgpu_surface_create(system, win, display);
+  }

-// DICT: GPUCanvasConfiguration
-const dictMembersGPUCanvasConfiguration = [
-  { key: "device", converter: webidl.converters.GPUDevice, required: true },
-  {
-    key: "format",
-    converter: webidl.converters.GPUTextureFormat,
-    required: true,
-  },
-  {
-    key: "usage",
-    converter: webidl.converters["GPUTextureUsageFlags"],
-    defaultValue: GPUTextureUsage.RENDER_ATTACHMENT,
-  },
-  {
-    key: "alphaMode",
-    converter: webidl.converters["GPUCanvasAlphaMode"],
-    defaultValue: "opaque",
-  },
+  getContext(context) {
+    if (context !== "webgpu") {
+      throw new TypeError("Only 'webgpu' context is supported.");
+    }
+    this.#ctx = createCanvasContext({ surfaceRid: this.#surfaceRid });
+    return this.#ctx;
+  }

-  // Extended from spec
-  {
-    key: "presentMode",
-    converter: webidl.converters["GPUPresentMode"],
-  },
-  {
-    key: "width",
-    converter: webidl.converters["long"],
-    required: true,
-  },
-  {
-    key: "height",
-    converter: webidl.converters["long"],
-    required: true,
-  },
-  {
-    key: "viewFormats",
-    converter: webidl.createSequenceConverter(
-      webidl.converters["GPUTextureFormat"],
-    ),
-    get defaultValue() {
-      return [];
-    },
-  },
-];
-webidl.converters["GPUCanvasConfiguration"] = webidl
-  .createDictionaryConverter(
-    "GPUCanvasConfiguration",
-    dictMembersGPUCanvasConfiguration,
-  );
+  present() {
+    this.#ctx[_present]();
+  }
+}

-
-window.__bootstrap.webgpu = {
-  ...window.__bootstrap.webgpu,
-  GPUCanvasContext,
-  createCanvasContext,
-};
+export { GPUCanvasContext, UnsafeWindowSurface };
--- a/deno_webgpu/Cargo.toml
+++ b/deno_webgpu/Cargo.toml
@ -1,8 +1,8 @@
-# Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+# Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 [package]
 name = "deno_webgpu"
-version = "0.85.0"
+version = "0.118.0"
 authors = ["the Deno authors"]
 edition.workspace = true
 license = "MIT"
@ -13,9 +13,6 @@ description = "WebGPU implementation for Deno"
 [lib]
 path = "lib.rs"

-[features]
-surface = ["wgpu-core/raw-window-handle", "dep:raw-window-handle"]
-
 # We make all dependencies conditional on not being wasm,
 # so the whole workspace can built as wasm.
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
@ -23,11 +20,19 @@ deno_core.workspace = true
 serde = { workspace = true, features = ["derive"] }
 tokio = { workspace = true, features = ["full"] }
 wgpu-types = { workspace = true, features = ["serde"] }
-raw-window-handle = { workspace = true, optional = true }
+raw-window-handle = { workspace = true }

 [target.'cfg(not(target_arch = "wasm32"))'.dependencies.wgpu-core]
 workspace = true
-features = ["trace", "replay", "serde", "strict_asserts", "wgsl", "gles"]
+features = [
+    "raw-window-handle",
+    "trace",
+    "replay",
+    "serde",
+    "strict_asserts",
+    "wgsl",
+    "gles",
+]

 # We want the wgpu-core Metal backend on macOS and iOS.
 [target.'cfg(any(target_os = "macos", target_os = "ios"))'.dependencies.wgpu-core]
@ -39,11 +44,6 @@ features = ["metal"]
 workspace = true
 features = ["dx12"]

-[target.'cfg(windows)'.dependencies.wgpu-hal]
-version = "0.19.0"
-path = "../wgpu-hal"
-features = ["windows_rs"]
-
 # We want the wgpu-core Vulkan backend on Unix (but not Emscripten) and Windows.
 [target.'cfg(any(windows, all(unix, not(target_os = "emscripten"))))'.dependencies.wgpu-core]
 workspace = true
--- a/deno_webgpu/LICENSE.md
+++ b/deno_webgpu/LICENSE.md
@ -1,6 +1,6 @@
 MIT License

-Copyright 2018-2023 the Deno authors
+Copyright 2018-2024 the Deno authors

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
--- a/deno_webgpu/README.md
+++ b/deno_webgpu/README.md
@ -2,8 +2,8 @@

 This op crate implements the WebGPU API as defined in
 https://gpuweb.github.io/gpuweb/ in Deno. The implementation targets the spec
-draft as of February 22, 2021. The spec is still very much in flux. This op
-crate tries to stay up to date with the spec, but is constrained by the features
+draft as of March 31, 2024. The spec is still very much in flux. This extension
+tries to stay up to date with the spec, but is constrained by the features
 implemented in our GPU backend library [wgpu](https://github.com/gfx-rs/wgpu).

 The spec is still very bare bones, and is still missing many details. As the
--- a/deno_webgpu/binding.rs
+++ b/deno_webgpu/binding.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use deno_core::error::AnyError;
 use deno_core::op2;
@ -112,25 +112,11 @@ impl From<GpuTextureSampleType> for wgpu_types::TextureSampleType {
 #[derive(Deserialize)]
 #[serde(rename_all = "camelCase")]
 struct GpuStorageTextureBindingLayout {
-    access: GpuStorageTextureAccess,
+    access: wgpu_types::StorageTextureAccess,
    format: wgpu_types::TextureFormat,
    view_dimension: wgpu_types::TextureViewDimension,
 }

-#[derive(Deserialize)]
-#[serde(rename_all = "kebab-case")]
-enum GpuStorageTextureAccess {
-    WriteOnly,
-}
-
-impl From<GpuStorageTextureAccess> for wgpu_types::StorageTextureAccess {
-    fn from(access: GpuStorageTextureAccess) -> Self {
-        match access {
-            GpuStorageTextureAccess::WriteOnly => wgpu_types::StorageTextureAccess::WriteOnly,
-        }
-    }
-}
-
 #[derive(Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct GpuBindGroupLayoutEntry {
@ -165,7 +151,7 @@ impl From<GpuBindingType> for wgpu_types::BindingType {
            },
            GpuBindingType::StorageTexture(storage_texture) => {
                wgpu_types::BindingType::StorageTexture {
-                    access: storage_texture.access.into(),
+                    access: storage_texture.access,
                    format: storage_texture.format,
                    view_dimension: storage_texture.view_dimension,
                }
--- a/deno_webgpu/buffer.rs
+++ b/deno_webgpu/buffer.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use deno_core::error::type_error;
 use deno_core::error::AnyError;
@ -163,6 +163,7 @@ pub fn op_webgpu_buffer_get_mapped_range(
    ))
    .map_err(|e| DomExceptionOperationError::new(&e.to_string()))?;

+    // SAFETY: guaranteed to be safe by wgpu
    let slice = unsafe { std::slice::from_raw_parts_mut(slice_pointer, range_size as usize) };
    buf.copy_from_slice(slice);

@ -189,6 +190,7 @@ pub fn op_webgpu_buffer_unmap(
    let buffer = buffer_resource.1;

    if let Some(buf) = buf {
+        // SAFETY: guaranteed to be safe by wgpu
        let slice = unsafe { std::slice::from_raw_parts_mut(mapped_resource.0, mapped_resource.1) };
        slice.copy_from_slice(buf);
    }
--- a/deno_webgpu/bundle.rs
+++ b/deno_webgpu/bundle.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use deno_core::error::type_error;
 use deno_core::error::AnyError;
--- a/deno_webgpu/byow.rs
+++ b/deno_webgpu/byow.rs
@ -0,0 +1,131 @@
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
+use deno_core::error::type_error;
+use deno_core::error::AnyError;
+use deno_core::op2;
+use deno_core::OpState;
+use deno_core::ResourceId;
+use std::ffi::c_void;
+#[cfg(any(target_os = "linux", target_os = "macos"))]
+use std::ptr::NonNull;
+
+use crate::surface::WebGpuSurface;
+
+#[op2(fast)]
+#[smi]
+pub fn op_webgpu_surface_create(
+    state: &mut OpState,
+    #[string] system: &str,
+    p1: *const c_void,
+    p2: *const c_void,
+) -> Result<ResourceId, AnyError> {
+    let instance = state.borrow::<super::Instance>();
+    // Security note:
+    //
+    // The `p1` and `p2` parameters are pointers to platform-specific window
+    // handles.
+    //
+    // The code below works under the assumption that:
+    //
+    // - handles can only be created by the FFI interface which
+    // enforces --allow-ffi.
+    //
+    // - `*const c_void` deserializes null and v8::External.
+    //
+    // - Only FFI can export v8::External to user code.
+    if p1.is_null() {
+        return Err(type_error("Invalid parameters"));
+    }
+
+    let (win_handle, display_handle) = raw_window(system, p1, p2)?;
+    // SAFETY: see above comment
+    let surface = unsafe { instance.instance_create_surface(display_handle, win_handle, None)? };
+
+    let rid = state
+        .resource_table
+        .add(WebGpuSurface(instance.clone(), surface));
+    Ok(rid)
+}
+
+type RawHandles = (
+    raw_window_handle::RawWindowHandle,
+    raw_window_handle::RawDisplayHandle,
+);
+
+#[cfg(target_os = "macos")]
+fn raw_window(
+    system: &str,
+    _ns_window: *const c_void,
+    ns_view: *const c_void,
+) -> Result<RawHandles, AnyError> {
+    if system != "cocoa" {
+        return Err(type_error("Invalid system on macOS"));
+    }
+
+    let win_handle =
+        raw_window_handle::RawWindowHandle::AppKit(raw_window_handle::AppKitWindowHandle::new(
+            NonNull::new(ns_view as *mut c_void).ok_or(type_error("ns_view is null"))?,
+        ));
+
+    let display_handle =
+        raw_window_handle::RawDisplayHandle::AppKit(raw_window_handle::AppKitDisplayHandle::new());
+    Ok((win_handle, display_handle))
+}
+
+#[cfg(target_os = "windows")]
+fn raw_window(
+    system: &str,
+    window: *const c_void,
+    hinstance: *const c_void,
+) -> Result<RawHandles, AnyError> {
+    use raw_window_handle::WindowsDisplayHandle;
+    if system != "win32" {
+        return Err(type_error("Invalid system on Windows"));
+    }
+
+    let win_handle = {
+        let mut handle = raw_window_handle::Win32WindowHandle::new(
+            std::num::NonZeroIsize::new(window as isize).ok_or(type_error("window is null"))?,
+        );
+        handle.hinstance = std::num::NonZeroIsize::new(hinstance as isize);
+
+        raw_window_handle::RawWindowHandle::Win32(handle)
+    };
+
+    let display_handle = raw_window_handle::RawDisplayHandle::Windows(WindowsDisplayHandle::new());
+    Ok((win_handle, display_handle))
+}
+
+#[cfg(target_os = "linux")]
+fn raw_window(
+    system: &str,
+    window: *const c_void,
+    display: *const c_void,
+) -> Result<RawHandles, AnyError> {
+    let (win_handle, display_handle);
+    if system == "x11" {
+        win_handle = raw_window_handle::RawWindowHandle::Xlib(
+            raw_window_handle::XlibWindowHandle::new(window as *mut c_void as _),
+        );
+
+        display_handle = raw_window_handle::RawDisplayHandle::Xlib(
+            raw_window_handle::XlibDisplayHandle::new(NonNull::new(display as *mut c_void), 0),
+        );
+    } else if system == "wayland" {
+        win_handle = raw_window_handle::RawWindowHandle::Wayland(
+            raw_window_handle::WaylandWindowHandle::new(
+                NonNull::new(window as *mut c_void).ok_or(type_error("window is null"))?,
+            ),
+        );
+
+        display_handle = raw_window_handle::RawDisplayHandle::Wayland(
+            raw_window_handle::WaylandDisplayHandle::new(
+                NonNull::new(display as *mut c_void).ok_or(type_error("display is null"))?,
+            ),
+        );
+    } else {
+        return Err(type_error("Invalid system on Linux"));
+    }
+
+    Ok((win_handle, display_handle))
+}
--- a/deno_webgpu/command_encoder.rs
+++ b/deno_webgpu/command_encoder.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use crate::WebGpuQuerySet;
 use deno_core::error::AnyError;
@ -254,13 +254,14 @@ pub fn op_webgpu_command_encoder_begin_compute_pass(
        None
    };

+    let instance = state.borrow::<super::Instance>();
+    let command_encoder = &command_encoder_resource.1;
    let descriptor = wgpu_core::command::ComputePassDescriptor {
        label: Some(label),
        timestamp_writes: timestamp_writes.as_ref(),
    };

-    let compute_pass =
-        wgpu_core::command::ComputePass::new(command_encoder_resource.1, &descriptor);
+    let compute_pass = gfx_select!(command_encoder => instance.command_encoder_create_compute_pass_dyn(*command_encoder, &descriptor));

    let rid = state
        .resource_table
--- a/deno_webgpu/compute_pass.rs
+++ b/deno_webgpu/compute_pass.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use deno_core::error::AnyError;
 use deno_core::op2;
@ -10,7 +10,9 @@ use std::cell::RefCell;

 use super::error::WebGpuResult;

-pub(crate) struct WebGpuComputePass(pub(crate) RefCell<wgpu_core::command::ComputePass>);
+pub(crate) struct WebGpuComputePass(
+    pub(crate) RefCell<Box<dyn wgpu_core::command::DynComputePass>>,
+);
 impl Resource for WebGpuComputePass {
    fn name(&self) -> Cow<str> {
        "webGPUComputePass".into()
@ -31,10 +33,10 @@ pub fn op_webgpu_compute_pass_set_pipeline(
        .resource_table
        .get::<WebGpuComputePass>(compute_pass_rid)?;

-    wgpu_core::command::compute_ffi::wgpu_compute_pass_set_pipeline(
-        &mut compute_pass_resource.0.borrow_mut(),
-        compute_pipeline_resource.1,
-    );
+    compute_pass_resource
+        .0
+        .borrow_mut()
+        .set_pipeline(state.borrow(), compute_pipeline_resource.1)?;

    Ok(WebGpuResult::empty())
 }
@ -52,12 +54,10 @@ pub fn op_webgpu_compute_pass_dispatch_workgroups(
        .resource_table
        .get::<WebGpuComputePass>(compute_pass_rid)?;

-    wgpu_core::command::compute_ffi::wgpu_compute_pass_dispatch_workgroups(
-        &mut compute_pass_resource.0.borrow_mut(),
-        x,
-        y,
-        z,
-    );
+    compute_pass_resource
+        .0
+        .borrow_mut()
+        .dispatch_workgroups(state.borrow(), x, y, z)?;

    Ok(WebGpuResult::empty())
 }
@ -77,11 +77,10 @@ pub fn op_webgpu_compute_pass_dispatch_workgroups_indirect(
        .resource_table
        .get::<WebGpuComputePass>(compute_pass_rid)?;

-    wgpu_core::command::compute_ffi::wgpu_compute_pass_dispatch_workgroups_indirect(
-        &mut compute_pass_resource.0.borrow_mut(),
-        buffer_resource.1,
-        indirect_offset,
-    );
+    compute_pass_resource
+        .0
+        .borrow_mut()
+        .dispatch_workgroups_indirect(state.borrow(), buffer_resource.1, indirect_offset)?;

    Ok(WebGpuResult::empty())
 }
@ -90,24 +89,15 @@ pub fn op_webgpu_compute_pass_dispatch_workgroups_indirect(
 #[serde]
 pub fn op_webgpu_compute_pass_end(
    state: &mut OpState,
-    #[smi] command_encoder_rid: ResourceId,
    #[smi] compute_pass_rid: ResourceId,
 ) -> Result<WebGpuResult, AnyError> {
-    let command_encoder_resource =
-        state
-            .resource_table
-            .get::<super::command_encoder::WebGpuCommandEncoder>(command_encoder_rid)?;
-    let command_encoder = command_encoder_resource.1;
    let compute_pass_resource = state
        .resource_table
        .take::<WebGpuComputePass>(compute_pass_rid)?;
-    let compute_pass = &compute_pass_resource.0.borrow();
-    let instance = state.borrow::<super::Instance>();

-    gfx_ok!(command_encoder => instance.command_encoder_run_compute_pass(
-      command_encoder,
-      compute_pass
-    ))
+    compute_pass_resource.0.borrow_mut().end(state.borrow())?;
+
+    Ok(WebGpuResult::empty())
 }

 #[op2]
@ -137,17 +127,12 @@ pub fn op_webgpu_compute_pass_set_bind_group(

    let dynamic_offsets_data: &[u32] = &dynamic_offsets_data[start..start + len];

-    // SAFETY: the raw pointer and length are of the same slice, and that slice
-    // lives longer than the below function invocation.
-    unsafe {
-        wgpu_core::command::compute_ffi::wgpu_compute_pass_set_bind_group(
-            &mut compute_pass_resource.0.borrow_mut(),
-            index,
-            bind_group_resource.1,
-            dynamic_offsets_data.as_ptr(),
-            dynamic_offsets_data.len(),
-        );
-    }
+    compute_pass_resource.0.borrow_mut().set_bind_group(
+        state.borrow(),
+        index,
+        bind_group_resource.1,
+        dynamic_offsets_data,
+    )?;

    Ok(WebGpuResult::empty())
 }
@ -163,16 +148,11 @@ pub fn op_webgpu_compute_pass_push_debug_group(
        .resource_table
        .get::<WebGpuComputePass>(compute_pass_rid)?;

-    let label = std::ffi::CString::new(group_label).unwrap();
-    // SAFETY: the string the raw pointer points to lives longer than the below
-    // function invocation.
-    unsafe {
-        wgpu_core::command::compute_ffi::wgpu_compute_pass_push_debug_group(
-            &mut compute_pass_resource.0.borrow_mut(),
-            label.as_ptr(),
-            0, // wgpu#975
-        );
-    }
+    compute_pass_resource.0.borrow_mut().push_debug_group(
+        state.borrow(),
+        group_label,
+        0, // wgpu#975
+    )?;

    Ok(WebGpuResult::empty())
 }
@ -187,9 +167,10 @@ pub fn op_webgpu_compute_pass_pop_debug_group(
        .resource_table
        .get::<WebGpuComputePass>(compute_pass_rid)?;

-    wgpu_core::command::compute_ffi::wgpu_compute_pass_pop_debug_group(
-        &mut compute_pass_resource.0.borrow_mut(),
-    );
+    compute_pass_resource
+        .0
+        .borrow_mut()
+        .pop_debug_group(state.borrow())?;

    Ok(WebGpuResult::empty())
 }
@ -205,16 +186,11 @@ pub fn op_webgpu_compute_pass_insert_debug_marker(
        .resource_table
        .get::<WebGpuComputePass>(compute_pass_rid)?;

-    let label = std::ffi::CString::new(marker_label).unwrap();
-    // SAFETY: the string the raw pointer points to lives longer than the below
-    // function invocation.
-    unsafe {
-        wgpu_core::command::compute_ffi::wgpu_compute_pass_insert_debug_marker(
-            &mut compute_pass_resource.0.borrow_mut(),
-            label.as_ptr(),
-            0, // wgpu#975
-        );
-    }
+    compute_pass_resource.0.borrow_mut().insert_debug_marker(
+        state.borrow(),
+        marker_label,
+        0, // wgpu#975
+    )?;

    Ok(WebGpuResult::empty())
 }
--- a/deno_webgpu/error.rs
+++ b/deno_webgpu/error.rs
@ -1,4 +1,5 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
+
 use deno_core::error::AnyError;
 use deno_core::ResourceId;
 use serde::Serialize;
@ -23,7 +24,6 @@ use wgpu_core::device::DeviceError;
 use wgpu_core::pipeline::CreateComputePipelineError;
 use wgpu_core::pipeline::CreateRenderPipelineError;
 use wgpu_core::pipeline::CreateShaderModuleError;
-#[cfg(feature = "surface")]
 use wgpu_core::present::ConfigureSurfaceError;
 use wgpu_core::resource::BufferAccessError;
 use wgpu_core::resource::CreateBufferError;
@ -87,6 +87,7 @@ pub enum WebGpuError {
    Lost,
    OutOfMemory,
    Validation(String),
+    Internal,
 }

 impl From<CreateBufferError> for WebGpuError {
@ -277,7 +278,6 @@ impl From<ClearError> for WebGpuError {
    }
 }

-#[cfg(feature = "surface")]
 impl From<ConfigureSurfaceError> for WebGpuError {
    fn from(err: ConfigureSurfaceError) -> Self {
        WebGpuError::Validation(fmt_err(&err))
--- a/deno_webgpu/lib.rs
+++ b/deno_webgpu/lib.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
 #![cfg(not(target_arch = "wasm32"))]
 #![warn(unsafe_op_in_unsafe_fn)]

@ -19,6 +19,8 @@ pub use wgpu_types;
 use error::DomExceptionOperationError;
 use error::WebGpuResult;

+pub const UNSTABLE_FEATURE_NAME: &str = "webgpu";
+
 #[macro_use]
 mod macros {
    macro_rules! gfx_select {
@ -71,6 +73,7 @@ mod macros {
 pub mod binding;
 pub mod buffer;
 pub mod bundle;
+pub mod byow;
 pub mod command_encoder;
 pub mod compute_pass;
 pub mod error;
@ -79,23 +82,9 @@ pub mod queue;
 pub mod render_pass;
 pub mod sampler;
 pub mod shader;
-#[cfg(feature = "surface")]
 pub mod surface;
 pub mod texture;

-pub struct Unstable(pub bool);
-
-fn check_unstable(state: &OpState, api_name: &str) {
-    let unstable = state.borrow::<Unstable>();
-    if !unstable.0 {
-        eprintln!(
-            "Unstable API '{}'. The --unstable flag must be provided.",
-            api_name
-        );
-        std::process::exit(70);
-    }
-}
-
 pub type Instance = std::sync::Arc<wgpu_core::global::Global>;

 struct WebGpuAdapter(Instance, wgpu_core::id::AdapterId);
@ -224,12 +213,15 @@ deno_core::extension!(
        queue::op_webgpu_write_texture,
        // shader
        shader::op_webgpu_create_shader_module,
+        // surface
+        surface::op_webgpu_surface_configure,
+        surface::op_webgpu_surface_get_current_texture,
+        surface::op_webgpu_surface_present,
+        // byow
+        byow::op_webgpu_surface_create,
    ],
-    esm = ["01_webgpu.js"],
-    options = { unstable: bool },
-    state = |state, options| {
-        state.put(Unstable(options.unstable));
-    },
+    esm = ["00_init.js", "02_surface.js"],
+    lazy_loaded_esm = ["01_webgpu.js"],
 );

 fn deserialize_features(features: &wgpu_types::Features) -> Vec<&'static str> {
@ -377,29 +369,45 @@ fn deserialize_features(features: &wgpu_types::Features) -> Vec<&'static str> {

 #[derive(Serialize)]
 #[serde(untagged)]
-pub enum GpuAdapterDeviceOrErr {
+pub enum GpuAdapterResOrErr {
    Error { err: String },
-    Features(GpuAdapterDevice),
+    Features(GpuAdapterRes),
 }

 #[derive(Serialize)]
 #[serde(rename_all = "camelCase")]
-pub struct GpuAdapterDevice {
+pub struct GpuAdapterRes {
    rid: ResourceId,
    limits: wgpu_types::Limits,
    features: Vec<&'static str>,
    is_software: bool,
 }

-#[op2(async)]
+#[derive(Serialize)]
+#[serde(rename_all = "camelCase")]
+pub struct GpuDeviceRes {
+    rid: ResourceId,
+    queue_rid: ResourceId,
+    limits: wgpu_types::Limits,
+    features: Vec<&'static str>,
+    is_software: bool,
+}
+
+#[op2]
 #[serde]
-pub async fn op_webgpu_request_adapter(
+pub fn op_webgpu_request_adapter(
    state: Rc<RefCell<OpState>>,
    #[serde] power_preference: Option<wgpu_types::PowerPreference>,
    force_fallback_adapter: bool,
-) -> Result<GpuAdapterDeviceOrErr, AnyError> {
+) -> Result<GpuAdapterResOrErr, AnyError> {
    let mut state = state.borrow_mut();
-    check_unstable(&state, "navigator.gpu.requestAdapter");
+
+    // TODO(bartlomieju): replace with `state.feature_checker.check_or_exit`
+    // once we phase out `check_or_exit_with_legacy_fallback`
+    state
+        .feature_checker
+        .check_or_exit_with_legacy_fallback(UNSTABLE_FEATURE_NAME, "navigator.gpu.requestAdapter");
+
    let backends = std::env::var("DENO_WEBGPU_BACKEND").map_or_else(
        |_| wgpu_types::Backends::all(),
        |s| wgpu_core::instance::parse_backends_from_comma_list(&s),
@ -432,7 +440,7 @@ pub async fn op_webgpu_request_adapter(
    let adapter = match res {
        Ok(adapter) => adapter,
        Err(err) => {
-            return Ok(GpuAdapterDeviceOrErr::Error {
+            return Ok(GpuAdapterResOrErr::Error {
                err: err.to_string(),
            })
        }
@ -445,7 +453,7 @@ pub async fn op_webgpu_request_adapter(

    let rid = state.resource_table.add(WebGpuAdapter(instance, adapter));

-    Ok(GpuAdapterDeviceOrErr::Features(GpuAdapterDevice {
+    Ok(GpuAdapterResOrErr::Features(GpuAdapterRes {
        rid,
        features,
        limits: adapter_limits,
@ -649,17 +657,17 @@ impl From<GpuRequiredFeatures> for wgpu_types::Features {
    }
 }

-#[op2(async)]
+#[op2]
 #[serde]
-pub async fn op_webgpu_request_device(
+pub fn op_webgpu_request_device(
    state: Rc<RefCell<OpState>>,
    #[smi] adapter_rid: ResourceId,
    #[string] label: String,
    #[serde] required_features: GpuRequiredFeatures,
    #[serde] required_limits: Option<wgpu_types::Limits>,
-) -> Result<GpuAdapterDevice, AnyError> {
+) -> Result<GpuDeviceRes, AnyError> {
    let mut state = state.borrow_mut();
-    let adapter_resource = state.resource_table.get::<WebGpuAdapter>(adapter_rid)?;
+    let adapter_resource = state.resource_table.take::<WebGpuAdapter>(adapter_rid)?;
    let adapter = adapter_resource.1;
    let instance = state.borrow::<Instance>();

@ -669,13 +677,14 @@ pub async fn op_webgpu_request_device(
        required_limits: required_limits.unwrap_or_default(),
    };

-    let (device, _queue, maybe_err) = gfx_select!(adapter => instance.adapter_request_device(
+    let (device, queue, maybe_err) = gfx_select!(adapter => instance.adapter_request_device(
      adapter,
      &descriptor,
      std::env::var("DENO_WEBGPU_TRACE").ok().as_ref().map(std::path::Path::new),
      None,
      None
    ));
+    adapter_resource.close();
    if let Some(err) = maybe_err {
        return Err(DomExceptionOperationError::new(&err.to_string()).into());
    }
@ -685,10 +694,15 @@ pub async fn op_webgpu_request_device(
    let limits = gfx_select!(device => instance.device_limits(device))?;

    let instance = instance.clone();
+    let instance2 = instance.clone();
    let rid = state.resource_table.add(WebGpuDevice(instance, device));
+    let queue_rid = state
+        .resource_table
+        .add(queue::WebGpuQueue(instance2, queue));

-    Ok(GpuAdapterDevice {
+    Ok(GpuDeviceRes {
        rid,
+        queue_rid,
        features,
        limits,
        // TODO(lucacasonato): report correctly from wgpu
@ -705,18 +719,19 @@ pub struct GPUAdapterInfo {
    description: String,
 }

-#[op2(async)]
+#[op2]
 #[serde]
-pub async fn op_webgpu_request_adapter_info(
+pub fn op_webgpu_request_adapter_info(
    state: Rc<RefCell<OpState>>,
    #[smi] adapter_rid: ResourceId,
 ) -> Result<GPUAdapterInfo, AnyError> {
-    let state = state.borrow_mut();
-    let adapter_resource = state.resource_table.get::<WebGpuAdapter>(adapter_rid)?;
+    let mut state = state.borrow_mut();
+    let adapter_resource = state.resource_table.take::<WebGpuAdapter>(adapter_rid)?;
    let adapter = adapter_resource.1;
    let instance = state.borrow::<Instance>();

    let info = gfx_select!(adapter => instance.adapter_get_info(adapter))?;
+    adapter_resource.close();

    Ok(GPUAdapterInfo {
        vendor: info.vendor.to_string(),
--- a/deno_webgpu/pipeline.rs
+++ b/deno_webgpu/pipeline.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use deno_core::error::AnyError;
 use deno_core::op2;
@ -8,6 +8,7 @@ use deno_core::ResourceId;
 use serde::Deserialize;
 use serde::Serialize;
 use std::borrow::Cow;
+use std::collections::HashMap;
 use std::rc::Rc;

 use super::error::WebGpuError;
@ -74,8 +75,8 @@ pub enum GPUPipelineLayoutOrGPUAutoLayoutMode {
 #[serde(rename_all = "camelCase")]
 pub struct GpuProgrammableStage {
    module: ResourceId,
-    entry_point: String,
-    // constants: HashMap<String, GPUPipelineConstantValue>
+    entry_point: Option<String>,
+    constants: Option<HashMap<String, f64>>,
 }

 #[op2]
@ -110,9 +111,11 @@ pub fn op_webgpu_create_compute_pipeline(
        layout: pipeline_layout,
        stage: wgpu_core::pipeline::ProgrammableStageDescriptor {
            module: compute_shader_module_resource.1,
-            entry_point: Some(Cow::from(compute.entry_point)),
-            // TODO(lucacasonato): support args.compute.constants
+            entry_point: compute.entry_point.map(Cow::from),
+            constants: Cow::Owned(compute.constants.unwrap_or_default()),
+            zero_initialize_workgroup_memory: true,
        },
+        cache: None,
    };
    let implicit_pipelines = match layout {
        GPUPipelineLayoutOrGPUAutoLayoutMode::Layout(_) => None,
@ -278,7 +281,8 @@ impl<'a> From<GpuVertexBufferLayout> for wgpu_core::pipeline::VertexBufferLayout
 #[serde(rename_all = "camelCase")]
 struct GpuVertexState {
    module: ResourceId,
-    entry_point: String,
+    entry_point: Option<String>,
+    constants: Option<HashMap<String, f64>>,
    buffers: Vec<Option<GpuVertexBufferLayout>>,
 }

@ -305,8 +309,8 @@ impl From<GpuMultisampleState> for wgpu_types::MultisampleState {
 struct GpuFragmentState {
    targets: Vec<Option<wgpu_types::ColorTargetState>>,
    module: u32,
-    entry_point: String,
-    // TODO(lucacasonato): constants
+    entry_point: Option<String>,
+    constants: Option<HashMap<String, f64>>,
 }

 #[derive(Deserialize)]
@ -355,9 +359,12 @@ pub fn op_webgpu_create_render_pipeline(
        Some(wgpu_core::pipeline::FragmentState {
            stage: wgpu_core::pipeline::ProgrammableStageDescriptor {
                module: fragment_shader_module_resource.1,
-                entry_point: Some(Cow::from(fragment.entry_point)),
+                entry_point: fragment.entry_point.map(Cow::from),
+                constants: Cow::Owned(fragment.constants.unwrap_or_default()),
+                // Required to be true for WebGPU
+                zero_initialize_workgroup_memory: true,
            },
-            targets: Cow::from(fragment.targets),
+            targets: Cow::Owned(fragment.targets),
        })
    } else {
        None
@ -377,7 +384,10 @@ pub fn op_webgpu_create_render_pipeline(
        vertex: wgpu_core::pipeline::VertexState {
            stage: wgpu_core::pipeline::ProgrammableStageDescriptor {
                module: vertex_shader_module_resource.1,
-                entry_point: Some(Cow::Owned(args.vertex.entry_point)),
+                entry_point: args.vertex.entry_point.map(Cow::Owned),
+                constants: Cow::Owned(args.vertex.constants.unwrap_or_default()),
+                // Required to be true for WebGPU
+                zero_initialize_workgroup_memory: true,
            },
            buffers: Cow::Owned(vertex_buffers),
        },
@ -386,6 +396,7 @@ pub fn op_webgpu_create_render_pipeline(
        multisample: args.multisample,
        fragment,
        multiview: None,
+        cache: None,
    };

    let implicit_pipelines = match args.layout {
--- a/deno_webgpu/queue.rs
+++ b/deno_webgpu/queue.rs
@ -1,16 +1,28 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use crate::command_encoder::WebGpuCommandBuffer;
+use crate::Instance;
 use deno_core::error::AnyError;
 use deno_core::op2;
 use deno_core::OpState;
 use deno_core::Resource;
 use deno_core::ResourceId;
 use serde::Deserialize;
+use std::borrow::Cow;
+use std::rc::Rc;

 use super::error::WebGpuResult;

-type WebGpuQueue = super::WebGpuDevice;
+pub struct WebGpuQueue(pub Instance, pub wgpu_core::id::QueueId);
+impl Resource for WebGpuQueue {
+    fn name(&self) -> Cow<str> {
+        "webGPUQueue".into()
+    }
+
+    fn close(self: Rc<Self>) {
+        gfx_select!(self.1 => self.0.queue_drop(self.1));
+    }
+}

 #[op2]
 #[serde]
@ -19,7 +31,7 @@ pub fn op_webgpu_queue_submit(
    #[smi] queue_rid: ResourceId,
    #[serde] command_buffers: Vec<ResourceId>,
 ) -> Result<WebGpuResult, AnyError> {
-    let instance = state.borrow::<super::Instance>();
+    let instance = state.borrow::<Instance>();
    let queue_resource = state.resource_table.get::<WebGpuQueue>(queue_rid)?;
    let queue = queue_resource.1;

@ -32,7 +44,7 @@ pub fn op_webgpu_queue_submit(
        })
        .collect::<Result<Vec<_>, AnyError>>()?;

-    let maybe_err = gfx_select!(queue => instance.queue_submit(queue.transmute(), &ids)).err();
+    let maybe_err = gfx_select!(queue => instance.queue_submit(queue, &ids)).err();

    for rid in command_buffers {
        let resource = state.resource_table.take::<WebGpuCommandBuffer>(rid)?;
@ -71,7 +83,7 @@ pub fn op_webgpu_write_buffer(
    #[number] size: Option<usize>,
    #[buffer] buf: &[u8],
 ) -> Result<WebGpuResult, AnyError> {
-    let instance = state.borrow::<super::Instance>();
+    let instance = state.borrow::<Instance>();
    let buffer_resource = state
        .resource_table
        .get::<super::buffer::WebGpuBuffer>(buffer)?;
@ -84,7 +96,7 @@ pub fn op_webgpu_write_buffer(
        None => &buf[data_offset..],
    };
    let maybe_err = gfx_select!(queue => instance.queue_write_buffer(
-      queue.transmute(),
+      queue,
      buffer,
      buffer_offset,
      data
@ -104,7 +116,7 @@ pub fn op_webgpu_write_texture(
    #[serde] size: wgpu_types::Extent3d,
    #[buffer] buf: &[u8],
 ) -> Result<WebGpuResult, AnyError> {
-    let instance = state.borrow::<super::Instance>();
+    let instance = state.borrow::<Instance>();
    let texture_resource = state
        .resource_table
        .get::<super::texture::WebGpuTexture>(destination.texture)?;
@ -120,7 +132,7 @@ pub fn op_webgpu_write_texture(
    let data_layout = data_layout.into();

    gfx_ok!(queue => instance.queue_write_texture(
-      queue.transmute(),
+      queue,
      &destination,
      buf,
      &data_layout,
--- a/deno_webgpu/render_pass.rs
+++ b/deno_webgpu/render_pass.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use deno_core::error::type_error;
 use deno_core::error::AnyError;
@ -41,7 +41,7 @@ pub fn op_webgpu_render_pass_set_viewport(
        .resource_table
        .get::<WebGpuRenderPass>(args.render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_set_viewport(
+    wgpu_core::command::render_commands::wgpu_render_pass_set_viewport(
        &mut render_pass_resource.0.borrow_mut(),
        args.x,
        args.y,
@ -68,7 +68,7 @@ pub fn op_webgpu_render_pass_set_scissor_rect(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_set_scissor_rect(
+    wgpu_core::command::render_commands::wgpu_render_pass_set_scissor_rect(
        &mut render_pass_resource.0.borrow_mut(),
        x,
        y,
@ -90,7 +90,7 @@ pub fn op_webgpu_render_pass_set_blend_constant(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_set_blend_constant(
+    wgpu_core::command::render_commands::wgpu_render_pass_set_blend_constant(
        &mut render_pass_resource.0.borrow_mut(),
        &color,
    );
@ -109,7 +109,7 @@ pub fn op_webgpu_render_pass_set_stencil_reference(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_set_stencil_reference(
+    wgpu_core::command::render_commands::wgpu_render_pass_set_stencil_reference(
        &mut render_pass_resource.0.borrow_mut(),
        reference,
    );
@ -128,7 +128,7 @@ pub fn op_webgpu_render_pass_begin_occlusion_query(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_begin_occlusion_query(
+    wgpu_core::command::render_commands::wgpu_render_pass_begin_occlusion_query(
        &mut render_pass_resource.0.borrow_mut(),
        query_index,
    );
@ -146,7 +146,7 @@ pub fn op_webgpu_render_pass_end_occlusion_query(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_end_occlusion_query(
+    wgpu_core::command::render_commands::wgpu_render_pass_end_occlusion_query(
        &mut render_pass_resource.0.borrow_mut(),
    );

@ -174,15 +174,10 @@ pub fn op_webgpu_render_pass_execute_bundles(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    // SAFETY: the raw pointer and length are of the same slice, and that slice
-    // lives longer than the below function invocation.
-    unsafe {
-        wgpu_core::command::render_ffi::wgpu_render_pass_execute_bundles(
-            &mut render_pass_resource.0.borrow_mut(),
-            bundles.as_ptr(),
-            bundles.len(),
-        );
-    }
+    wgpu_core::command::render_commands::wgpu_render_pass_execute_bundles(
+        &mut render_pass_resource.0.borrow_mut(),
+        &bundles,
+    );

    Ok(WebGpuResult::empty())
 }
@ -191,21 +186,16 @@ pub fn op_webgpu_render_pass_execute_bundles(
 #[serde]
 pub fn op_webgpu_render_pass_end(
    state: &mut OpState,
-    #[smi] command_encoder_rid: ResourceId,
    #[smi] render_pass_rid: ResourceId,
 ) -> Result<WebGpuResult, AnyError> {
-    let command_encoder_resource =
-        state
-            .resource_table
-            .get::<super::command_encoder::WebGpuCommandEncoder>(command_encoder_rid)?;
-    let command_encoder = command_encoder_resource.1;
    let render_pass_resource = state
        .resource_table
        .take::<WebGpuRenderPass>(render_pass_rid)?;
    let render_pass = &render_pass_resource.0.borrow();
+    let command_encoder = render_pass.parent_id();
    let instance = state.borrow::<super::Instance>();

-    gfx_ok!(command_encoder => instance.command_encoder_run_render_pass(command_encoder, render_pass))
+    gfx_ok!(command_encoder => instance.render_pass_end(render_pass))
 }

 #[op2]
@ -235,17 +225,12 @@ pub fn op_webgpu_render_pass_set_bind_group(

    let dynamic_offsets_data: &[u32] = &dynamic_offsets_data[start..start + len];

-    // SAFETY: the raw pointer and length are of the same slice, and that slice
-    // lives longer than the below function invocation.
-    unsafe {
-        wgpu_core::command::render_ffi::wgpu_render_pass_set_bind_group(
-            &mut render_pass_resource.0.borrow_mut(),
-            index,
-            bind_group_resource.1,
-            dynamic_offsets_data.as_ptr(),
-            dynamic_offsets_data.len(),
-        );
-    }
+    wgpu_core::command::render_commands::wgpu_render_pass_set_bind_group(
+        &mut render_pass_resource.0.borrow_mut(),
+        index,
+        bind_group_resource.1,
+        dynamic_offsets_data,
+    );

    Ok(WebGpuResult::empty())
 }
@ -261,16 +246,11 @@ pub fn op_webgpu_render_pass_push_debug_group(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    let label = std::ffi::CString::new(group_label).unwrap();
-    // SAFETY: the string the raw pointer points to lives longer than the below
-    // function invocation.
-    unsafe {
-        wgpu_core::command::render_ffi::wgpu_render_pass_push_debug_group(
-            &mut render_pass_resource.0.borrow_mut(),
-            label.as_ptr(),
-            0, // wgpu#975
-        );
-    }
+    wgpu_core::command::render_commands::wgpu_render_pass_push_debug_group(
+        &mut render_pass_resource.0.borrow_mut(),
+        group_label,
+        0, // wgpu#975
+    );

    Ok(WebGpuResult::empty())
 }
@ -285,7 +265,7 @@ pub fn op_webgpu_render_pass_pop_debug_group(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_pop_debug_group(
+    wgpu_core::command::render_commands::wgpu_render_pass_pop_debug_group(
        &mut render_pass_resource.0.borrow_mut(),
    );

@ -303,16 +283,11 @@ pub fn op_webgpu_render_pass_insert_debug_marker(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    let label = std::ffi::CString::new(marker_label).unwrap();
-    // SAFETY: the string the raw pointer points to lives longer than the below
-    // function invocation.
-    unsafe {
-        wgpu_core::command::render_ffi::wgpu_render_pass_insert_debug_marker(
-            &mut render_pass_resource.0.borrow_mut(),
-            label.as_ptr(),
-            0, // wgpu#975
-        );
-    }
+    wgpu_core::command::render_commands::wgpu_render_pass_insert_debug_marker(
+        &mut render_pass_resource.0.borrow_mut(),
+        marker_label,
+        0, // wgpu#975
+    );

    Ok(WebGpuResult::empty())
 }
@ -331,7 +306,7 @@ pub fn op_webgpu_render_pass_set_pipeline(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_set_pipeline(
+    wgpu_core::command::render_commands::wgpu_render_pass_set_pipeline(
        &mut render_pass_resource.0.borrow_mut(),
        render_pipeline_resource.1,
    );
@ -401,7 +376,7 @@ pub fn op_webgpu_render_pass_set_vertex_buffer(
        None
    };

-    wgpu_core::command::render_ffi::wgpu_render_pass_set_vertex_buffer(
+    wgpu_core::command::render_commands::wgpu_render_pass_set_vertex_buffer(
        &mut render_pass_resource.0.borrow_mut(),
        slot,
        buffer_resource.1,
@ -426,7 +401,7 @@ pub fn op_webgpu_render_pass_draw(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_draw(
+    wgpu_core::command::render_commands::wgpu_render_pass_draw(
        &mut render_pass_resource.0.borrow_mut(),
        vertex_count,
        instance_count,
@ -452,7 +427,7 @@ pub fn op_webgpu_render_pass_draw_indexed(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_draw_indexed(
+    wgpu_core::command::render_commands::wgpu_render_pass_draw_indexed(
        &mut render_pass_resource.0.borrow_mut(),
        index_count,
        instance_count,
@ -479,7 +454,7 @@ pub fn op_webgpu_render_pass_draw_indirect(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_draw_indirect(
+    wgpu_core::command::render_commands::wgpu_render_pass_draw_indirect(
        &mut render_pass_resource.0.borrow_mut(),
        buffer_resource.1,
        indirect_offset,
@ -503,7 +478,7 @@ pub fn op_webgpu_render_pass_draw_indexed_indirect(
        .resource_table
        .get::<WebGpuRenderPass>(render_pass_rid)?;

-    wgpu_core::command::render_ffi::wgpu_render_pass_draw_indexed_indirect(
+    wgpu_core::command::render_commands::wgpu_render_pass_draw_indexed_indirect(
        &mut render_pass_resource.0.borrow_mut(),
        buffer_resource.1,
        indirect_offset,
--- a/deno_webgpu/sampler.rs
+++ b/deno_webgpu/sampler.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use deno_core::error::AnyError;
 use deno_core::op2;
--- a/deno_webgpu/shader.rs
+++ b/deno_webgpu/shader.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use deno_core::error::AnyError;
 use deno_core::op2;
--- a/deno_webgpu/surface.rs
+++ b/deno_webgpu/surface.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use super::WebGpuResult;
 use deno_core::error::AnyError;
@ -11,21 +11,6 @@ use std::borrow::Cow;
 use std::rc::Rc;
 use wgpu_types::SurfaceStatus;

-deno_core::extension!(
-    deno_webgpu_surface,
-    deps = [deno_webidl, deno_web, deno_webgpu],
-    ops = [
-        op_webgpu_surface_configure,
-        op_webgpu_surface_get_current_texture,
-        op_webgpu_surface_present,
-    ],
-    esm = ["02_surface.js"],
-    options = { unstable: bool },
-    state = |state, options| {
-        state.put(super::Unstable(options.unstable));
-    },
-);
-
 pub struct WebGpuSurface(pub crate::Instance, pub wgpu_core::id::SurfaceId);
 impl Resource for WebGpuSurface {
    fn name(&self) -> Cow<str> {
--- a/deno_webgpu/texture.rs
+++ b/deno_webgpu/texture.rs
@ -1,4 +1,4 @@
-// Copyright 2018-2023 the Deno authors. All rights reserved. MIT license.
+// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

 use deno_core::error::AnyError;
 use deno_core::op2;
--- a/deno_webgpu/webgpu.idl
+++ b/deno_webgpu/webgpu.idl
@ -6,7 +6,7 @@ dictionary GPUObjectDescriptorBase {
    USVString label = "";
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUSupportedLimits {
    readonly attribute unsigned long maxTextureDimension1D;
    readonly attribute unsigned long maxTextureDimension2D;
@ -30,6 +30,8 @@ interface GPUSupportedLimits {
    readonly attribute unsigned long maxVertexAttributes;
    readonly attribute unsigned long maxVertexBufferArrayStride;
    readonly attribute unsigned long maxInterStageShaderComponents;
+    readonly attribute unsigned long maxColorAttachments;
+    readonly attribute unsigned long maxColorAttachmentBytesPerSample;
    readonly attribute unsigned long maxComputeWorkgroupStorageSize;
    readonly attribute unsigned long maxComputeInvocationsPerWorkgroup;
    readonly attribute unsigned long maxComputeWorkgroupSizeX;
@ -38,12 +40,12 @@ interface GPUSupportedLimits {
    readonly attribute unsigned long maxComputeWorkgroupsPerDimension;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUSupportedFeatures {
    readonly setlike<DOMString>;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUAdapterInfo {
    readonly attribute DOMString vendor;
    readonly attribute DOMString architecture;
@ -57,9 +59,10 @@ interface mixin NavigatorGPU {
 Navigator includes NavigatorGPU;
 WorkerNavigator includes NavigatorGPU;

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPU {
    Promise<GPUAdapter?> requestAdapter(optional GPURequestAdapterOptions options = {});
+    GPUTextureFormat getPreferredCanvasFormat();
 };

 dictionary GPURequestAdapterOptions {
@ -72,14 +75,14 @@ enum GPUPowerPreference {
    "high-performance",
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUAdapter {
    [SameObject] readonly attribute GPUSupportedFeatures features;
    [SameObject] readonly attribute GPUSupportedLimits limits;
    readonly attribute boolean isFallbackAdapter;

    Promise<GPUDevice> requestDevice(optional GPUDeviceDescriptor descriptor = {});
-    Promise<GPUAdapterInfo> requestAdapterInfo(optional sequence<DOMString> unmaskHints = []);
+    Promise<GPUAdapterInfo> requestAdapterInfo();
 };

 dictionary GPUDeviceDescriptor
@ -141,7 +144,7 @@ enum GPUFeatureName {
    "shader-early-depth-test",
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUDevice : EventTarget {
    [SameObject] readonly attribute GPUSupportedFeatures features;
    [SameObject] readonly attribute GPUSupportedLimits limits;
@ -171,7 +174,7 @@ interface GPUDevice : EventTarget {
 };
 GPUDevice includes GPUObjectBase;

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUBuffer {
    readonly attribute GPUSize64Out size;
    readonly attribute GPUFlagsConstant usage;
@ -200,7 +203,7 @@ dictionary GPUBufferDescriptor
 };

 typedef [EnforceRange] unsigned long GPUBufferUsageFlags;
-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 namespace GPUBufferUsage {
    const GPUFlagsConstant MAP_READ      = 0x0001;
    const GPUFlagsConstant MAP_WRITE     = 0x0002;
@ -215,13 +218,13 @@ namespace GPUBufferUsage {
 };

 typedef [EnforceRange] unsigned long GPUMapModeFlags;
-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 namespace GPUMapMode {
    const GPUFlagsConstant READ  = 0x0001;
    const GPUFlagsConstant WRITE = 0x0002;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUTexture {
    GPUTextureView createView(optional GPUTextureViewDescriptor descriptor = {});

@ -256,7 +259,7 @@ enum GPUTextureDimension {
 };

 typedef [EnforceRange] unsigned long GPUTextureUsageFlags;
-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 namespace GPUTextureUsage {
    const GPUFlagsConstant COPY_SRC          = 0x01;
    const GPUFlagsConstant COPY_DST          = 0x02;
@ -265,7 +268,7 @@ namespace GPUTextureUsage {
    const GPUFlagsConstant RENDER_ATTACHMENT = 0x10;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUTextureView {
 };
 GPUTextureView includes GPUObjectBase;
@ -328,6 +331,7 @@ enum GPUTextureFormat {
    "bgra8unorm-srgb",
    // Packed 32-bit formats
    "rgb9e5ufloat",
+    "rgb10a2uint",
    "rgb10a2unorm",
    "rg11b10ufloat",

@ -416,7 +420,7 @@ enum GPUTextureFormat {
    "astc-12x12-unorm-srgb",
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUSampler {
 };
 GPUSampler includes GPUObjectBase;
@ -462,7 +466,7 @@ enum GPUCompareFunction {
    "always",
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUBindGroupLayout {
 };
 GPUBindGroupLayout includes GPUObjectBase;
@ -483,7 +487,7 @@ dictionary GPUBindGroupLayoutEntry {
 };

 typedef [EnforceRange] unsigned long GPUShaderStageFlags;
-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 namespace GPUShaderStage {
    const GPUFlagsConstant VERTEX   = 0x1;
    const GPUFlagsConstant FRAGMENT = 0x2;
@ -528,6 +532,8 @@ dictionary GPUTextureBindingLayout {

 enum GPUStorageTextureAccess {
    "write-only",
+    "read-only",
+    "read-write",
 };

 dictionary GPUStorageTextureBindingLayout {
@ -536,7 +542,7 @@ dictionary GPUStorageTextureBindingLayout {
    GPUTextureViewDimension viewDimension = "2d";
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUBindGroup {
 };
 GPUBindGroup includes GPUObjectBase;
@ -560,7 +566,7 @@ dictionary GPUBufferBinding {
    GPUSize64 size;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUPipelineLayout {
 };
 GPUPipelineLayout includes GPUObjectBase;
@ -570,7 +576,7 @@ dictionary GPUPipelineLayoutDescriptor
    required sequence<GPUBindGroupLayout> bindGroupLayouts;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUShaderModule {
 };
 GPUShaderModule includes GPUObjectBase;
@ -586,7 +592,7 @@ enum GPUCompilationMessageType {
    "info",
 };

-[Exposed=(Window, DedicatedWorker), Serializable, SecureContext]
+[Exposed=(Window, Worker), Serializable, SecureContext]
 interface GPUCompilationMessage {
    readonly attribute DOMString message;
    readonly attribute GPUCompilationMessageType type;
@ -596,11 +602,26 @@ interface GPUCompilationMessage {
    readonly attribute unsigned long long length;
 };

-[Exposed=(Window, DedicatedWorker), Serializable, SecureContext]
+[Exposed=(Window, Worker), Serializable, SecureContext]
 interface GPUCompilationInfo {
    readonly attribute FrozenArray<GPUCompilationMessage> messages;
 };

+[Exposed=(Window, Worker), SecureContext, Serializable]
+interface GPUPipelineError : DOMException {
+    constructor(optional DOMString message = "", GPUPipelineErrorInit options);
+    readonly attribute GPUPipelineErrorReason reason;
+};
+
+dictionary GPUPipelineErrorInit {
+    required GPUPipelineErrorReason reason;
+};
+
+enum GPUPipelineErrorReason {
+    "validation",
+    "internal",
+};
+
 enum GPUAutoLayoutMode {
    "auto",
 };
@ -616,13 +637,13 @@ interface mixin GPUPipelineBase {

 dictionary GPUProgrammableStage {
    required GPUShaderModule module;
-    required USVString entryPoint;
+    USVString entryPoint;
    record<USVString, GPUPipelineConstantValue> constants;
 };

-typedef double GPUPipelineConstantValue; // May represent WGSL’s bool, f32, i32, u32, and f16 if enabled.
+typedef double GPUPipelineConstantValue; // May represent WGSL's bool, f32, i32, u32, and f16 if enabled.

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUComputePipeline {
 };
 GPUComputePipeline includes GPUObjectBase;
@ -633,7 +654,7 @@ dictionary GPUComputePipelineDescriptor
    required GPUProgrammableStage compute;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPURenderPipeline {
 };
 GPURenderPipeline includes GPUObjectBase;
@ -701,7 +722,7 @@ dictionary GPUBlendState {
 };

 typedef [EnforceRange] unsigned long GPUColorWriteFlags;
-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 namespace GPUColorWrite {
    const GPUFlagsConstant RED   = 0x1;
    const GPUFlagsConstant GREEN = 0x2;
@ -811,6 +832,7 @@ enum GPUVertexFormat {
    "sint32x2",
    "sint32x3",
    "sint32x4",
+    "unorm10-10-10-2",
 };

 enum GPUVertexStepMode {
@ -854,7 +876,7 @@ dictionary GPUImageCopyTexture {
    GPUTextureAspect aspect = "all";
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUCommandBuffer {
 };
 GPUCommandBuffer includes GPUObjectBase;
@ -866,7 +888,7 @@ dictionary GPUCommandBufferDescriptor
 interface mixin GPUCommandsMixin {
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUCommandEncoder {
    GPURenderPassEncoder beginRenderPass(GPURenderPassDescriptor descriptor);
    GPUComputePassEncoder beginComputePass(optional GPUComputePassDescriptor descriptor = {});
@ -933,7 +955,7 @@ interface mixin GPUDebugCommandsMixin {
    undefined insertDebugMarker(USVString markerLabel);
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUComputePassEncoder {
    undefined setPipeline(GPUComputePipeline pipeline);
    undefined dispatchWorkgroups(GPUSize32 workgroupCountX, optional GPUSize32 workgroupCountY = 1, optional GPUSize32 workgroupCountZ = 1);
@ -957,7 +979,7 @@ dictionary GPUComputePassDescriptor
    GPUComputePassTimestampWrites timestampWrites;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPURenderPassEncoder {
    undefined setViewport(float x, float y,
        float width, float height,
@ -1052,7 +1074,7 @@ interface mixin GPURenderCommandsMixin {
    undefined drawIndexedIndirect(GPUBuffer indirectBuffer, GPUSize64 indirectOffset);
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPURenderBundle {
 };
 GPURenderBundle includes GPUObjectBase;
@ -1061,7 +1083,7 @@ dictionary GPURenderBundleDescriptor
         : GPUObjectDescriptorBase {
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPURenderBundleEncoder {
    GPURenderBundle finish(optional GPURenderBundleDescriptor descriptor = {});
 };
@ -1077,7 +1099,7 @@ dictionary GPURenderBundleEncoderDescriptor
    boolean stencilReadOnly = false;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUQueue {
    undefined submit(sequence<GPUCommandBuffer> commandBuffers);

@ -1098,7 +1120,7 @@ interface GPUQueue {
 };
 GPUQueue includes GPUObjectBase;

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUQuerySet {
    undefined destroy();

@ -1118,7 +1140,7 @@ enum GPUQueryType {
    "timestamp",
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUCanvasContext {
    readonly attribute (HTMLCanvasElement or OffscreenCanvas) canvas;

@ -1146,7 +1168,7 @@ enum GPUDeviceLostReason {
    "destroyed",
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUDeviceLostInfo {
    readonly attribute GPUDeviceLostReason reason;
    readonly attribute DOMString message;
@ -1156,27 +1178,33 @@ partial interface GPUDevice {
    readonly attribute Promise<GPUDeviceLostInfo> lost;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUError {
    readonly attribute DOMString message;
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUValidationError
        : GPUError {
    constructor(DOMString message);
 };

-[Exposed=(Window, DedicatedWorker), SecureContext]
+[Exposed=(Window, Worker), SecureContext]
 interface GPUOutOfMemoryError
        : GPUError {
    constructor(DOMString message);
 };

+[Exposed=(Window, Worker), SecureContext]
+interface GPUInternalError
+        : GPUError {
+    constructor(DOMString message);
+};
+
 enum GPUErrorFilter {
    "validation",
    "out-of-memory",
-    "internal"
+    "internal",
 };

 partial interface GPUDevice {
@ -1184,8 +1212,21 @@ partial interface GPUDevice {
    Promise<GPUError?> popErrorScope();
 };

+[Exposed=(Window, Worker), SecureContext]
+interface GPUUncapturedErrorEvent : Event {
+    constructor(
+        DOMString type,
+        GPUUncapturedErrorEventInit gpuUncapturedErrorEventInitDict
+    );
+    [SameObject] readonly attribute GPUError error;
+};
+
+dictionary GPUUncapturedErrorEventInit : EventInit {
+    required GPUError error;
+};
+
 partial interface GPUDevice {
-    [Exposed=(Window, DedicatedWorker)]
+    [Exposed=(Window, Worker)]
    attribute EventHandler onuncapturederror;
 };

--- a/examples/Cargo.toml
+++ b/examples/Cargo.toml
@ -10,6 +10,10 @@ keywords.workspace = true
 license.workspace = true
 publish = false

+[package.metadata.cargo-machete]
+# Cargo machete struggles with this dev dependency:
+ignored = ["wasm_bindgen_test"]
+
 [lib]
 path = "src/lib.rs"
 harness = false
@ -47,10 +51,8 @@ env_logger.workspace = true
 console_error_panic_hook.workspace = true
 console_log.workspace = true
 fern.workspace = true
-js-sys.workspace = true
 wasm-bindgen.workspace = true
 wasm-bindgen-futures.workspace = true
-hal = { workspace = true, optional = true }
 # We need these features in the framework examples and tests
 web-sys = { workspace = true, features = [
    "Location",
--- a/examples/src/boids/mod.rs
+++ b/examples/src/boids/mod.rs
@ -132,6 +132,7 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &draw_shader,
                entry_point: "main_vs",
+                compilation_options: Default::default(),
                buffers: &[
                    wgpu::VertexBufferLayout {
                        array_stride: 4 * 4,
@ -148,12 +149,14 @@ impl crate::framework::Example for Example {
            fragment: Some(wgpu::FragmentState {
                module: &draw_shader,
                entry_point: "main_fs",
+                compilation_options: Default::default(),
                targets: &[Some(config.view_formats[0].into())],
            }),
            primitive: wgpu::PrimitiveState::default(),
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });

        // create compute pipeline
@ -163,6 +166,8 @@ impl crate::framework::Example for Example {
            layout: Some(&compute_pipeline_layout),
            module: &compute_shader,
            entry_point: "main",
+            compilation_options: Default::default(),
+            cache: None,
        });

        // buffer for the three 2d triangle vertices of each instance
--- a/examples/src/bunnymark/mod.rs
+++ b/examples/src/bunnymark/mod.rs
@ -203,11 +203,13 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &[],
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(wgpu::ColorTargetState {
                    format: config.view_formats[0],
                    blend: Some(wgpu::BlendState::ALPHA_BLENDING),
@ -222,6 +224,7 @@ impl crate::framework::Example for Example {
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });

        let texture = {
--- a/examples/src/conservative_raster/mod.rs
+++ b/examples/src/conservative_raster/mod.rs
@ -97,11 +97,13 @@ impl crate::framework::Example for Example {
                vertex: wgpu::VertexState {
                    module: &shader_triangle_and_lines,
                    entry_point: "vs_main",
+                    compilation_options: Default::default(),
                    buffers: &[],
                },
                fragment: Some(wgpu::FragmentState {
                    module: &shader_triangle_and_lines,
                    entry_point: "fs_main_red",
+                    compilation_options: Default::default(),
                    targets: &[Some(RENDER_TARGET_FORMAT.into())],
                }),
                primitive: wgpu::PrimitiveState {
@ -111,6 +113,7 @@ impl crate::framework::Example for Example {
                depth_stencil: None,
                multisample: wgpu::MultisampleState::default(),
                multiview: None,
+                cache: None,
            });

        let pipeline_triangle_regular =
@ -120,17 +123,20 @@ impl crate::framework::Example for Example {
                vertex: wgpu::VertexState {
                    module: &shader_triangle_and_lines,
                    entry_point: "vs_main",
+                    compilation_options: Default::default(),
                    buffers: &[],
                },
                fragment: Some(wgpu::FragmentState {
                    module: &shader_triangle_and_lines,
                    entry_point: "fs_main_blue",
+                    compilation_options: Default::default(),
                    targets: &[Some(RENDER_TARGET_FORMAT.into())],
                }),
                primitive: wgpu::PrimitiveState::default(),
                depth_stencil: None,
                multisample: wgpu::MultisampleState::default(),
                multiview: None,
+                cache: None,
            });

        let pipeline_lines = if device
@ -144,11 +150,13 @@ impl crate::framework::Example for Example {
                    vertex: wgpu::VertexState {
                        module: &shader_triangle_and_lines,
                        entry_point: "vs_main",
+                        compilation_options: Default::default(),
                        buffers: &[],
                    },
                    fragment: Some(wgpu::FragmentState {
                        module: &shader_triangle_and_lines,
                        entry_point: "fs_main_white",
+                        compilation_options: Default::default(),
                        targets: &[Some(config.view_formats[0].into())],
                    }),
                    primitive: wgpu::PrimitiveState {
@ -159,6 +167,7 @@ impl crate::framework::Example for Example {
                    depth_stencil: None,
                    multisample: wgpu::MultisampleState::default(),
                    multiview: None,
+                    cache: None,
                }),
            )
        } else {
@ -205,17 +214,20 @@ impl crate::framework::Example for Example {
                    vertex: wgpu::VertexState {
                        module: &shader,
                        entry_point: "vs_main",
+                        compilation_options: Default::default(),
                        buffers: &[],
                    },
                    fragment: Some(wgpu::FragmentState {
                        module: &shader,
                        entry_point: "fs_main",
+                        compilation_options: Default::default(),
                        targets: &[Some(config.view_formats[0].into())],
                    }),
                    primitive: wgpu::PrimitiveState::default(),
                    depth_stencil: None,
                    multisample: wgpu::MultisampleState::default(),
                    multiview: None,
+                    cache: None,
                }),
                bind_group_layout,
            )
--- a/examples/src/cube/mod.rs
+++ b/examples/src/cube/mod.rs
@ -244,11 +244,13 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &vertex_buffers,
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(config.view_formats[0].into())],
            }),
            primitive: wgpu::PrimitiveState {
@ -258,6 +260,7 @@ impl crate::framework::Example for Example {
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });

        let pipeline_wire = if device
@ -270,11 +273,13 @@ impl crate::framework::Example for Example {
                vertex: wgpu::VertexState {
                    module: &shader,
                    entry_point: "vs_main",
+                    compilation_options: Default::default(),
                    buffers: &vertex_buffers,
                },
                fragment: Some(wgpu::FragmentState {
                    module: &shader,
                    entry_point: "fs_wire",
+                    compilation_options: Default::default(),
                    targets: &[Some(wgpu::ColorTargetState {
                        format: config.view_formats[0],
                        blend: Some(wgpu::BlendState {
@ -297,6 +302,7 @@ impl crate::framework::Example for Example {
                depth_stencil: None,
                multisample: wgpu::MultisampleState::default(),
                multiview: None,
+                cache: None,
            });
            Some(pipeline_wire)
        } else {
--- a/examples/src/hello_compute/mod.rs
+++ b/examples/src/hello_compute/mod.rs
@ -109,6 +109,8 @@ async fn execute_gpu_inner(
        layout: None,
        module: &cs_module,
        entry_point: "main",
+        compilation_options: Default::default(),
+        cache: None,
    });

    // Instantiates the bind group, once again specifying the binding of buffers.
--- a/examples/src/hello_synchronization/mod.rs
+++ b/examples/src/hello_synchronization/mod.rs
@ -103,12 +103,16 @@ async fn execute(
        layout: Some(&pipeline_layout),
        module: &shaders_module,
        entry_point: "patient_main",
+        compilation_options: Default::default(),
+        cache: None,
    });
    let hasty_pipeline = device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
        label: None,
        layout: Some(&pipeline_layout),
        module: &shaders_module,
        entry_point: "hasty_main",
+        compilation_options: Default::default(),
+        cache: None,
    });

    //----------------------------------------------------------
--- a/examples/src/hello_triangle/mod.rs
+++ b/examples/src/hello_triangle/mod.rs
@ -60,16 +60,19 @@ async fn run(event_loop: EventLoop<()>, window: Window) {
            module: &shader,
            entry_point: "vs_main",
            buffers: &[],
+            compilation_options: Default::default(),
        },
        fragment: Some(wgpu::FragmentState {
            module: &shader,
            entry_point: "fs_main",
+            compilation_options: Default::default(),
            targets: &[Some(swapchain_format.into())],
        }),
        primitive: wgpu::PrimitiveState::default(),
        depth_stencil: None,
        multisample: wgpu::MultisampleState::default(),
        multiview: None,
+        cache: None,
    });

    let mut config = surface
--- a/examples/src/hello_workgroups/mod.rs
+++ b/examples/src/hello_workgroups/mod.rs
@ -110,6 +110,8 @@ async fn run() {
        layout: Some(&pipeline_layout),
        module: &shader,
        entry_point: "main",
+        compilation_options: Default::default(),
+        cache: None,
    });

    //----------------------------------------------------------
--- a/examples/src/mipmap/mod.rs
+++ b/examples/src/mipmap/mod.rs
@ -30,7 +30,7 @@ fn create_texels(size: usize, cx: f32, cy: f32) -> Vec<u8> {
            iter::once(0xFF - (count * 2) as u8)
                .chain(iter::once(0xFF - (count * 5) as u8))
                .chain(iter::once(0xFF - (count * 13) as u8))
-                .chain(iter::once(std::u8::MAX))
+                .chain(iter::once(u8::MAX))
        })
        .collect()
 }
@ -93,11 +93,13 @@ impl Example {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &[],
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(TEXTURE_FORMAT.into())],
            }),
            primitive: wgpu::PrimitiveState {
@ -107,6 +109,7 @@ impl Example {
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });

        let bind_group_layout = pipeline.get_bind_group_layout(0);
@ -290,11 +293,13 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &[],
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(config.view_formats[0].into())],
            }),
            primitive: wgpu::PrimitiveState {
@ -306,6 +311,7 @@ impl crate::framework::Example for Example {
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });

        // Create bind group
--- a/examples/src/msaa_line/mod.rs
+++ b/examples/src/msaa_line/mod.rs
@ -54,6 +54,7 @@ impl Example {
            vertex: wgpu::VertexState {
                module: shader,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &[wgpu::VertexBufferLayout {
                    array_stride: std::mem::size_of::<Vertex>() as wgpu::BufferAddress,
                    step_mode: wgpu::VertexStepMode::Vertex,
@ -63,6 +64,7 @@ impl Example {
            fragment: Some(wgpu::FragmentState {
                module: shader,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(config.view_formats[0].into())],
            }),
            primitive: wgpu::PrimitiveState {
@ -76,6 +78,7 @@ impl Example {
                ..Default::default()
            },
            multiview: None,
+            cache: None,
        });
        let mut encoder =
            device.create_render_bundle_encoder(&wgpu::RenderBundleEncoderDescriptor {
--- a/examples/src/render_to_texture/mod.rs
+++ b/examples/src/render_to_texture/mod.rs
@ -59,17 +59,20 @@ async fn run(_path: Option<String>) {
        vertex: wgpu::VertexState {
            module: &shader,
            entry_point: "vs_main",
+            compilation_options: Default::default(),
            buffers: &[],
        },
        fragment: Some(wgpu::FragmentState {
            module: &shader,
            entry_point: "fs_main",
+            compilation_options: Default::default(),
            targets: &[Some(wgpu::TextureFormat::Rgba8UnormSrgb.into())],
        }),
        primitive: wgpu::PrimitiveState::default(),
        depth_stencil: None,
        multisample: wgpu::MultisampleState::default(),
        multiview: None,
+        cache: None,
    });

    log::info!("Wgpu context set up.");
--- a/examples/src/repeated_compute/mod.rs
+++ b/examples/src/repeated_compute/mod.rs
@ -245,6 +245,8 @@ impl WgpuContext {
            layout: Some(&pipeline_layout),
            module: &shader,
            entry_point: "main",
+            compilation_options: Default::default(),
+            cache: None,
        });

        WgpuContext {
--- a/examples/src/shadow/mod.rs
+++ b/examples/src/shadow/mod.rs
@ -500,6 +500,7 @@ impl crate::framework::Example for Example {
                vertex: wgpu::VertexState {
                    module: &shader,
                    entry_point: "vs_bake",
+                    compilation_options: Default::default(),
                    buffers: &[vb_desc.clone()],
                },
                fragment: None,
@ -525,6 +526,7 @@ impl crate::framework::Example for Example {
                }),
                multisample: wgpu::MultisampleState::default(),
                multiview: None,
+                cache: None,
            });

            Pass {
@ -632,6 +634,7 @@ impl crate::framework::Example for Example {
                vertex: wgpu::VertexState {
                    module: &shader,
                    entry_point: "vs_main",
+                    compilation_options: Default::default(),
                    buffers: &[vb_desc],
                },
                fragment: Some(wgpu::FragmentState {
@ -641,6 +644,7 @@ impl crate::framework::Example for Example {
                    } else {
                        "fs_main_without_storage"
                    },
+                    compilation_options: Default::default(),
                    targets: &[Some(config.view_formats[0].into())],
                }),
                primitive: wgpu::PrimitiveState {
@ -657,6 +661,7 @@ impl crate::framework::Example for Example {
                }),
                multisample: wgpu::MultisampleState::default(),
                multiview: None,
+                cache: None,
            });

            Pass {
--- a/examples/src/skybox/mod.rs
+++ b/examples/src/skybox/mod.rs
@ -199,11 +199,13 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_sky",
+                compilation_options: Default::default(),
                buffers: &[],
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_sky",
+                compilation_options: Default::default(),
                targets: &[Some(config.view_formats[0].into())],
            }),
            primitive: wgpu::PrimitiveState {
@ -219,6 +221,7 @@ impl crate::framework::Example for Example {
            }),
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });
        let entity_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
            label: Some("Entity"),
@ -226,6 +229,7 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_entity",
+                compilation_options: Default::default(),
                buffers: &[wgpu::VertexBufferLayout {
                    array_stride: std::mem::size_of::<Vertex>() as wgpu::BufferAddress,
                    step_mode: wgpu::VertexStepMode::Vertex,
@ -235,6 +239,7 @@ impl crate::framework::Example for Example {
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_entity",
+                compilation_options: Default::default(),
                targets: &[Some(config.view_formats[0].into())],
            }),
            primitive: wgpu::PrimitiveState {
@ -250,6 +255,7 @@ impl crate::framework::Example for Example {
            }),
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });

        let sampler = device.create_sampler(&wgpu::SamplerDescriptor {
--- a/examples/src/srgb_blend/mod.rs
+++ b/examples/src/srgb_blend/mod.rs
@ -131,11 +131,13 @@ impl<const SRGB: bool> crate::framework::Example for Example<SRGB> {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &vertex_buffers,
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(wgpu::ColorTargetState {
                    format: config.view_formats[0],
                    blend: Some(wgpu::BlendState::ALPHA_BLENDING),
@ -149,6 +151,7 @@ impl<const SRGB: bool> crate::framework::Example for Example<SRGB> {
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });

        // Done
--- a/examples/src/stencil_triangles/mod.rs
+++ b/examples/src/stencil_triangles/mod.rs
@ -74,11 +74,13 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &vertex_buffers,
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(wgpu::ColorTargetState {
                    format: config.view_formats[0],
                    blend: None,
@ -104,6 +106,7 @@ impl crate::framework::Example for Example {
            }),
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });

        let outer_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
@ -112,11 +115,13 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &vertex_buffers,
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(config.view_formats[0].into())],
            }),
            primitive: Default::default(),
@ -137,6 +142,7 @@ impl crate::framework::Example for Example {
            }),
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });

        let stencil_buffer = device.create_texture(&wgpu::TextureDescriptor {
--- a/examples/src/storage_texture/mod.rs
+++ b/examples/src/storage_texture/mod.rs
@ -100,6 +100,8 @@ async fn run(_path: Option<String>) {
        layout: Some(&pipeline_layout),
        module: &shader,
        entry_point: "main",
+        compilation_options: Default::default(),
+        cache: None,
    });

    log::info!("Wgpu context set up.");
--- a/examples/src/texture_arrays/mod.rs
+++ b/examples/src/texture_arrays/mod.rs
@ -321,6 +321,7 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &base_shader_module,
                entry_point: "vert_main",
+                compilation_options: Default::default(),
                buffers: &[wgpu::VertexBufferLayout {
                    array_stride: vertex_size as wgpu::BufferAddress,
                    step_mode: wgpu::VertexStepMode::Vertex,
@ -330,6 +331,7 @@ impl crate::framework::Example for Example {
            fragment: Some(wgpu::FragmentState {
                module: fragment_shader_module,
                entry_point: fragment_entry_point,
+                compilation_options: Default::default(),
                targets: &[Some(config.view_formats[0].into())],
            }),
            primitive: wgpu::PrimitiveState {
@ -339,6 +341,7 @@ impl crate::framework::Example for Example {
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None
        });

        Self {
--- a/examples/src/timestamp_queries/mod.rs
+++ b/examples/src/timestamp_queries/mod.rs
@ -298,6 +298,8 @@ fn compute_pass(
        layout: None,
        module,
        entry_point: "main_cs",
+        compilation_options: Default::default(),
+        cache: None,
    });
    let bind_group_layout = compute_pipeline.get_bind_group_layout(0);
    let bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
@ -352,19 +354,21 @@ fn render_pass(
        vertex: wgpu::VertexState {
            module,
            entry_point: "vs_main",
+            compilation_options: Default::default(),
            buffers: &[],
        },
        fragment: Some(wgpu::FragmentState {
            module,
            entry_point: "fs_main",
+            compilation_options: Default::default(),
            targets: &[Some(format.into())],
        }),
        primitive: wgpu::PrimitiveState::default(),
        depth_stencil: None,
        multisample: wgpu::MultisampleState::default(),
        multiview: None,
+        cache: None,
    });
-
    let render_target = device.create_texture(&wgpu::TextureDescriptor {
        label: Some("rendertarget"),
        size: wgpu::Extent3d {
--- a/examples/src/uniform_values/mod.rs
+++ b/examples/src/uniform_values/mod.rs
@ -179,19 +179,21 @@ impl WgpuContext {
            vertex: wgpu::VertexState {
                module: &shader,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &[],
            },
            fragment: Some(wgpu::FragmentState {
                module: &shader,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(swapchain_format.into())],
            }),
            primitive: wgpu::PrimitiveState::default(),
            depth_stencil: None,
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None,
        });
-
        let surface_config = surface
            .get_default_config(&adapter, size.width, size.height)
            .unwrap();
--- a/examples/src/water/mod.rs
+++ b/examples/src/water/mod.rs
@ -512,6 +512,7 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &water_module,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                // Layout of our vertices. This should match the structs
                // which are uploaded to the GPU. This should also be
                // ensured by tagging on either a `#[repr(C)]` onto a
@ -527,6 +528,7 @@ impl crate::framework::Example for Example {
            fragment: Some(wgpu::FragmentState {
                module: &water_module,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                // Describes how the colour will be interpolated
                // and assigned to the output attachment.
                targets: &[Some(wgpu::ColorTargetState {
@ -572,6 +574,8 @@ impl crate::framework::Example for Example {
            // No multisampling is used.
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            // No pipeline caching is used.
+            cache: None,
        });

        // Same idea as the water pipeline.
@ -581,6 +585,7 @@ impl crate::framework::Example for Example {
            vertex: wgpu::VertexState {
                module: &terrain_module,
                entry_point: "vs_main",
+                compilation_options: Default::default(),
                buffers: &[wgpu::VertexBufferLayout {
                    array_stride: terrain_vertex_size as wgpu::BufferAddress,
                    step_mode: wgpu::VertexStepMode::Vertex,
@ -590,6 +595,7 @@ impl crate::framework::Example for Example {
            fragment: Some(wgpu::FragmentState {
                module: &terrain_module,
                entry_point: "fs_main",
+                compilation_options: Default::default(),
                targets: &[Some(config.view_formats[0].into())],
            }),
            primitive: wgpu::PrimitiveState {
@ -606,6 +612,7 @@ impl crate::framework::Example for Example {
            }),
            multisample: wgpu::MultisampleState::default(),
            multiview: None,
+            cache: None
        });

        // A render bundle to draw the terrain.
@ -830,18 +837,8 @@ static TEST: crate::framework::ExampleTestParams = crate::framework::ExampleTest
        // To be fixed in <https://github.com/gfx-rs/wgpu/issues/5231>.
        .expect_fail(wgpu_test::FailureCase {
            backends: Some(wgpu::Backends::VULKAN),
-            reasons: vec![
-                wgpu_test::FailureReason::validation_error().with_message(concat!(
-                    "vkCmdEndRenderPass: ",
-                    "Hazard WRITE_AFTER_READ in subpass 0 for attachment 1 depth aspect ",
-                    "during store with storeOp VK_ATTACHMENT_STORE_OP_STORE. ",
-                    "Access info (",
-                    "usage: SYNC_LATE_FRAGMENT_TESTS_DEPTH_STENCIL_ATTACHMENT_WRITE, ",
-                    "prior_usage: SYNC_FRAGMENT_SHADER_SHADER_SAMPLED_READ, ",
-                    "read_barriers: VkPipelineStageFlags2(0), ",
-                    "command: vkCmdDraw"
-                )),
-            ],
+            reasons: vec![wgpu_test::FailureReason::validation_error()
+                .with_message(concat!("Hazard WRITE_AFTER_"))],
            behavior: wgpu_test::FailureBehavior::AssertFailure,
            ..Default::default()
        }),
--- a/examples/src/water/point_gen.rs
+++ b/examples/src/water/point_gen.rs
@ -124,7 +124,7 @@ impl HexTerrainMesh {
        let width = q_given_r(radius);
        let half_width = (width / 2) as isize;
        let mut map = HashMap::new();
-        let mut max = std::f32::NEG_INFINITY;
+        let mut max = f32::NEG_INFINITY;
        for i in -half_width..=half_width {
            let x_o = i as f32;
            for j in -half_width..=half_width {
--- a/naga-cli/Cargo.toml
+++ b/naga-cli/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "naga-cli"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["gfx-rs developers"]
 edition = "2021"
 description = "Shader translation command line tool"
@ -19,13 +19,13 @@ test = false

 [dependencies]
 bincode = "1"
-log = "0.4"
 codespan-reporting = "0.11"
 env_logger = "0.11"
 argh = "0.1.5"
+anyhow.workspace = true

 [dependencies.naga]
-version = "0.19"
+version = "0.20.0"
 path = "../naga"
 features = [
    "compact",
--- a/naga-cli/src/bin/naga.rs
+++ b/naga-cli/src/bin/naga.rs
@ -1,4 +1,5 @@
 #![allow(clippy::manual_strip)]
+use anyhow::{anyhow, Context as _};
 #[allow(unused_imports)]
 use std::fs;
 use std::{error::Error, fmt, io::Read, path::Path, str::FromStr};
@ -62,6 +63,16 @@ struct Args {
    #[argh(option)]
    shader_model: Option<ShaderModelArg>,

+    /// the shader stage, for example 'frag', 'vert', or 'compute'.
+    /// if the shader stage is unspecified it will be derived from
+    /// the file extension.
+    #[argh(option)]
+    shader_stage: Option<ShaderStage>,
+
+    /// the kind of input, e.g. 'glsl', 'wgsl', 'spv', or 'bin'.
+    #[argh(option)]
+    input_kind: Option<InputKind>,
+
    /// the metal version to use, for example, 1.0, 1.1, 1.2, etc.
    #[argh(option)]
    metal_version: Option<MslVersionArg>,
@ -105,6 +116,10 @@ struct Args {
    #[argh(switch)]
    version: bool,

+    /// override value, of the form "foo=N,bar=M", repeatable
+    #[argh(option, long = "override")]
+    overrides: Vec<Overrides>,
+
    /// the input and output files.
    ///
    /// First positional argument is the input file. If not specified, the
@ -154,11 +169,58 @@ impl FromStr for ShaderModelArg {
            "50" => ShaderModel::V5_0,
            "51" => ShaderModel::V5_1,
            "60" => ShaderModel::V6_0,
+            "61" => ShaderModel::V6_1,
+            "62" => ShaderModel::V6_2,
+            "63" => ShaderModel::V6_3,
+            "64" => ShaderModel::V6_4,
+            "65" => ShaderModel::V6_5,
+            "66" => ShaderModel::V6_6,
+            "67" => ShaderModel::V6_7,
            _ => return Err(format!("Invalid value for --shader-model: {s}")),
        }))
    }
 }

+/// Newtype so we can implement [`FromStr`] for [`naga::ShaderStage`].
+#[derive(Debug, Clone, Copy)]
+struct ShaderStage(naga::ShaderStage);
+
+impl FromStr for ShaderStage {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        use naga::ShaderStage;
+        Ok(Self(match s.to_lowercase().as_str() {
+            "frag" | "fragment" => ShaderStage::Fragment,
+            "comp" | "compute" => ShaderStage::Compute,
+            "vert" | "vertex" => ShaderStage::Vertex,
+            _ => return Err(anyhow!("Invalid shader stage: {s}")),
+        }))
+    }
+}
+
+/// Input kind/file extension mapping
+#[derive(Debug, Clone, Copy)]
+enum InputKind {
+    Bincode,
+    Glsl,
+    SpirV,
+    Wgsl,
+}
+impl FromStr for InputKind {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(match s.to_lowercase().as_str() {
+            "bin" => InputKind::Bincode,
+            "glsl" => InputKind::Glsl,
+            "spv" => InputKind::SpirV,
+            "wgsl" => InputKind::Wgsl,
+            _ => return Err(anyhow!("Invalid value for --input-kind: {s}")),
+        })
+    }
+}
+
 /// Newtype so we can implement [`FromStr`] for [`naga::back::glsl::Version`].
 #[derive(Clone, Debug)]
 struct GlslProfileArg(naga::back::glsl::Version);
@ -202,18 +264,42 @@ impl FromStr for MslVersionArg {
    }
 }

+#[derive(Clone, Debug)]
+struct Overrides {
+    pairs: Vec<(String, f64)>,
+}
+
+impl FromStr for Overrides {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let mut pairs = vec![];
+        for pair in s.split(',') {
+            let Some((name, value)) = pair.split_once('=') else {
+                return Err(format!("value needs a `=`: {pair:?}"));
+            };
+            let value = f64::from_str(value.trim()).map_err(|err| format!("{err}: {value:?}"))?;
+            pairs.push((name.trim().to_string(), value));
+        }
+        Ok(Overrides { pairs })
+    }
+}
+
 #[derive(Default)]
 struct Parameters<'a> {
    validation_flags: naga::valid::ValidationFlags,
    bounds_check_policies: naga::proc::BoundsCheckPolicies,
    entry_point: Option<String>,
    keep_coordinate_space: bool,
+    overrides: naga::back::PipelineConstants,
    spv_in: naga::front::spv::Options,
    spv_out: naga::back::spv::Options<'a>,
    dot: naga::back::dot::Options,
    msl: naga::back::msl::Options,
    glsl: naga::back::glsl::Options,
    hlsl: naga::back::hlsl::Options,
+    input_kind: Option<InputKind>,
+    shader_stage: Option<ShaderStage>,
 }

 trait PrettyResult {
@ -267,7 +353,7 @@ impl fmt::Display for CliError {
 }
 impl std::error::Error for CliError {}

-fn run() -> Result<(), Box<dyn std::error::Error>> {
+fn run() -> anyhow::Result<()> {
    env_logger::init();

    // Parse commandline arguments
@ -301,14 +387,19 @@ fn run() -> Result<(), Box<dyn std::error::Error>> {
        Some(arg) => arg.0,
        None => params.bounds_check_policies.index,
    };
-
+    params.overrides = args
+        .overrides
+        .iter()
+        .flat_map(|o| &o.pairs)
+        .cloned()
+        .collect();
    params.spv_in = naga::front::spv::Options {
        adjust_coordinate_space: !args.keep_coordinate_space,
        strict_capabilities: false,
        block_ctx_dump_prefix: args.block_ctx_dir.clone().map(std::path::PathBuf::from),
    };

-    params.entry_point = args.entry_point.clone();
+    params.entry_point.clone_from(&args.entry_point);
    if let Some(ref version) = args.profile {
        params.glsl.version = version.0;
    }
@ -343,6 +434,9 @@ fn run() -> Result<(), Box<dyn std::error::Error>> {
        return Err(CliError("Input file path is not specified").into());
    };

+    params.input_kind = args.input_kind;
+    params.shader_stage = args.shader_stage;
+
    let Parsed {
        mut module,
        input_text,
@ -386,6 +480,8 @@ fn run() -> Result<(), Box<dyn std::error::Error>> {

    // Validate the IR before compaction.
    let info = match naga::valid::Validator::new(params.validation_flags, validation_caps)
+        .subgroup_stages(naga::valid::ShaderStages::all())
+        .subgroup_operations(naga::valid::SubgroupOperationSet::all())
        .validate(&module)
    {
        Ok(info) => Some(info),
@ -460,67 +556,70 @@ struct Parsed {
    input_text: Option<String>,
 }

-fn parse_input(
-    input_path: &Path,
-    input: Vec<u8>,
-    params: &Parameters,
-) -> Result<Parsed, Box<dyn std::error::Error>> {
-    let (module, input_text) = match Path::new(&input_path)
-        .extension()
-        .ok_or(CliError("Input filename has no extension"))?
-        .to_str()
-        .ok_or(CliError("Input filename not valid unicode"))?
-    {
-        "bin" => (bincode::deserialize(&input)?, None),
-        "spv" => naga::front::spv::parse_u8_slice(&input, &params.spv_in).map(|m| (m, None))?,
-        "wgsl" => {
+fn parse_input(input_path: &Path, input: Vec<u8>, params: &Parameters) -> anyhow::Result<Parsed> {
+    let input_kind = match params.input_kind {
+        Some(kind) => kind,
+        None => input_path
+            .extension()
+            .context("Input filename has no extension")?
+            .to_str()
+            .context("Input filename not valid unicode")?
+            .parse()
+            .context("Unable to determine --input-kind from filename")?,
+    };
+
+    let (module, input_text) = match input_kind {
+        InputKind::Bincode => (bincode::deserialize(&input)?, None),
+        InputKind::SpirV => {
+            naga::front::spv::parse_u8_slice(&input, &params.spv_in).map(|m| (m, None))?
+        }
+        InputKind::Wgsl => {
            let input = String::from_utf8(input)?;
            let result = naga::front::wgsl::parse_str(&input);
            match result {
                Ok(v) => (v, Some(input)),
                Err(ref e) => {
-                    let message = format!(
+                    let message = anyhow!(
                        "Could not parse WGSL:\n{}",
                        e.emit_to_string_with_path(&input, input_path)
                    );
-                    return Err(message.into());
+                    return Err(message);
                }
            }
        }
-        ext @ ("vert" | "frag" | "comp" | "glsl") => {
+        InputKind::Glsl => {
+            let shader_stage = match params.shader_stage {
+                Some(shader_stage) => shader_stage,
+                None => {
+                    // filename.shader_stage.glsl -> filename.shader_stage
+                    let file_stem = input_path
+                        .file_stem()
+                        .context("Unable to determine file stem from input filename.")?;
+                    // filename.shader_stage -> shader_stage
+                    let inner_ext = Path::new(file_stem)
+                        .extension()
+                        .context("Unable to determine inner extension from input filename.")?
+                        .to_str()
+                        .context("Input filename not valid unicode")?;
+                    inner_ext.parse().context("from input filename")?
+                }
+            };
            let input = String::from_utf8(input)?;
            let mut parser = naga::front::glsl::Frontend::default();
-
            (
                parser
                    .parse(
                        &naga::front::glsl::Options {
-                            stage: match ext {
-                                "vert" => naga::ShaderStage::Vertex,
-                                "frag" => naga::ShaderStage::Fragment,
-                                "comp" => naga::ShaderStage::Compute,
-                                "glsl" => {
-                                    let internal_name = input_path.to_string_lossy();
-                                    match Path::new(&internal_name[..internal_name.len()-5])
-                                        .extension()
-                                        .ok_or(CliError("Input filename ending with .glsl has no internal extension"))?
-                                        .to_str()
-                                        .ok_or(CliError("Input filename not valid unicode"))?
-                                    {
-                                        "vert" => naga::ShaderStage::Vertex,
-                                        "frag" => naga::ShaderStage::Fragment,
-                                        "comp" => naga::ShaderStage::Compute,
-                                        _ => unreachable!(),
-                                    }
-                                },
-                                _ => unreachable!(),
-                            },
+                            stage: shader_stage.0,
                            defines: Default::default(),
                        },
                        &input,
                    )
                    .unwrap_or_else(|error| {
-                        let filename = input_path.file_name().and_then(std::ffi::OsStr::to_str).unwrap_or("glsl");
+                        let filename = input_path
+                            .file_name()
+                            .and_then(std::ffi::OsStr::to_str)
+                            .unwrap_or("glsl");
                        let mut writer = StandardStream::stderr(ColorChoice::Auto);
                        error.emit_to_writer_with_path(&mut writer, &input, filename);
                        std::process::exit(1);
@ -528,7 +627,6 @@ fn parse_input(
                Some(input),
            )
        }
-        _ => return Err(CliError("Unknown input file extension").into()),
    };

    Ok(Parsed { module, input_text })
@ -539,7 +637,7 @@ fn write_output(
    info: &Option<naga::valid::ModuleInfo>,
    params: &Parameters,
    output_path: &str,
-) -> Result<(), Box<dyn std::error::Error>> {
+) -> anyhow::Result<()> {
    match Path::new(&output_path)
        .extension()
        .ok_or(CliError("Output filename has no extension"))?
@ -566,17 +664,18 @@ fn write_output(
            let mut options = params.msl.clone();
            options.bounds_check_policies = params.bounds_check_policies;

+            let info = info.as_ref().ok_or(CliError(
+                "Generating metal output requires validation to \
+                 succeed, and it failed in a previous step",
+            ))?;
+
+            let (module, info) =
+                naga::back::pipeline_constants::process_overrides(module, info, &params.overrides)
+                    .unwrap_pretty();
+
            let pipeline_options = msl::PipelineOptions::default();
-            let (msl, _) = msl::write_string(
-                module,
-                info.as_ref().ok_or(CliError(
-                    "Generating metal output requires validation to \
-                     succeed, and it failed in a previous step",
-                ))?,
-                &options,
-                &pipeline_options,
-            )
-            .unwrap_pretty();
+            let (msl, _) =
+                msl::write_string(&module, &info, &options, &pipeline_options).unwrap_pretty();
            fs::write(output_path, msl)?;
        }
        "spv" => {
@ -599,16 +698,17 @@ fn write_output(
                None => None,
            };

-            let spv = spv::write_vec(
-                module,
-                info.as_ref().ok_or(CliError(
-                    "Generating SPIR-V output requires validation to \
-                     succeed, and it failed in a previous step",
-                ))?,
-                &params.spv_out,
-                pipeline_options,
-            )
-            .unwrap_pretty();
+            let info = info.as_ref().ok_or(CliError(
+                "Generating SPIR-V output requires validation to \
+                 succeed, and it failed in a previous step",
+            ))?;
+
+            let (module, info) =
+                naga::back::pipeline_constants::process_overrides(module, info, &params.overrides)
+                    .unwrap_pretty();
+
+            let spv =
+                spv::write_vec(&module, &info, &params.spv_out, pipeline_options).unwrap_pretty();
            let bytes = spv
                .iter()
                .fold(Vec::with_capacity(spv.len() * 4), |mut v, w| {
@ -635,14 +735,20 @@ fn write_output(
                multiview: None,
            };

+            let info = info.as_ref().ok_or(CliError(
+                "Generating glsl output requires validation to \
+                 succeed, and it failed in a previous step",
+            ))?;
+
+            let (module, info) =
+                naga::back::pipeline_constants::process_overrides(module, info, &params.overrides)
+                    .unwrap_pretty();
+
            let mut buffer = String::new();
            let mut writer = glsl::Writer::new(
                &mut buffer,
-                module,
-                info.as_ref().ok_or(CliError(
-                    "Generating glsl output requires validation to \
-                     succeed, and it failed in a previous step",
-                ))?,
+                &module,
+                &info,
                &params.glsl,
                &pipeline_options,
                params.bounds_check_policies,
@ -659,17 +765,19 @@ fn write_output(
        }
        "hlsl" => {
            use naga::back::hlsl;
+
+            let info = info.as_ref().ok_or(CliError(
+                "Generating hlsl output requires validation to \
+                 succeed, and it failed in a previous step",
+            ))?;
+
+            let (module, info) =
+                naga::back::pipeline_constants::process_overrides(module, info, &params.overrides)
+                    .unwrap_pretty();
+
            let mut buffer = String::new();
            let mut writer = hlsl::Writer::new(&mut buffer, &params.hlsl);
-            writer
-                .write(
-                    module,
-                    info.as_ref().ok_or(CliError(
-                        "Generating hlsl output requires validation to \
-                         succeed, and it failed in a previous step",
-                    ))?,
-                )
-                .unwrap_pretty();
+            writer.write(&module, &info).unwrap_pretty();
            fs::write(output_path, buffer)?;
        }
        "wgsl" => {
@ -694,7 +802,7 @@ fn write_output(
    Ok(())
 }

-fn bulk_validate(args: Args, params: &Parameters) -> Result<(), Box<dyn std::error::Error>> {
+fn bulk_validate(args: Args, params: &Parameters) -> anyhow::Result<()> {
    let mut invalid = vec![];
    for input_path in args.files {
        let path = Path::new(&input_path);
@ -712,6 +820,8 @@ fn bulk_validate(args: Args, params: &Parameters) -> Result<(), Box<dyn std::err

        let mut validator =
            naga::valid::Validator::new(params.validation_flags, naga::valid::Capabilities::all());
+        validator.subgroup_stages(naga::valid::ShaderStages::all());
+        validator.subgroup_operations(naga::valid::SubgroupOperationSet::all());

        if let Err(error) = validator.validate(&module) {
            invalid.push(input_path.clone());
@ -735,7 +845,7 @@ fn bulk_validate(args: Args, params: &Parameters) -> Result<(), Box<dyn std::err
        for path in invalid {
            writeln!(&mut formatted, "  {path}").unwrap();
        }
-        return Err(formatted.into());
+        return Err(anyhow!(formatted));
    }

    Ok(())
--- a/naga/Cargo.toml
+++ b/naga/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "naga"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["gfx-rs developers"]
 edition = "2021"
 description = "Shader translation infrastructure"
@ -21,28 +21,23 @@ all-features = true

 [features]
 default = []
-clone = []
 dot-out = []
-glsl-in = ["pp-rs"]
+glsl-in = ["dep:pp-rs"]
 glsl-out = []
 msl-out = []
-serialize = ["serde", "bitflags/serde", "indexmap/serde"]
-deserialize = ["serde", "bitflags/serde", "indexmap/serde"]
+serialize = ["dep:serde", "bitflags/serde", "indexmap/serde"]
+deserialize = ["dep:serde", "bitflags/serde", "indexmap/serde"]
 arbitrary = ["dep:arbitrary", "bitflags/arbitrary", "indexmap/arbitrary"]
-spv-in = ["petgraph", "spirv"]
-spv-out = ["spirv"]
-wgsl-in = ["hexf-parse", "unicode-xid", "compact"]
+spv-in = ["dep:petgraph", "dep:spirv"]
+spv-out = ["dep:spirv"]
+wgsl-in = ["dep:hexf-parse", "dep:unicode-xid", "compact"]
 wgsl-out = []
 hlsl-out = []
 compact = []

-[[bench]]
-name = "criterion"
-harness = false
-
 [dependencies]
 arbitrary = { version = "1.3", features = ["derive"], optional = true }
-bitflags = "2.4"
+bitflags = "2.5"
 bit-set = "0.5"
 termcolor = { version = "1.4.1" }
 # remove termcolor dep when updating to the next version of codespan-reporting
@ -52,21 +47,16 @@ codespan-reporting = { version = "0.11.0" }
 rustc-hash = "1.1.0"
 indexmap = { version = "2", features = ["std"] }
 log = "0.4"
-num-traits = "0.2"
 spirv = { version = "0.3", optional = true }
-thiserror = "1.0.57"
-serde = { version = "1.0.196", features = ["derive"], optional = true }
+thiserror = "1.0.61"
+serde = { version = "1.0.202", features = ["derive"], optional = true }
 petgraph = { version = "0.6", optional = true }
 pp-rs = { version = "0.2.1", optional = true }
 hexf-parse = { version = "0.2.1", optional = true }
 unicode-xid = { version = "0.2.3", optional = true }
 arrayvec.workspace = true

-[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies]
-criterion = { version = "0.5", features = [] }
-
 [dev-dependencies]
-bincode = "1"
 diff = "0.1"
 env_logger = "0.11"
 # This _cannot_ have a version specified. If it does, crates.io will look
--- a/naga/README.md
+++ b/naga/README.md
@ -42,7 +42,7 @@ First, install `naga-cli` from crates.io or directly from GitHub.
 cargo install naga-cli

 # development version
-cargo install naga-cli --git https://github.com/gfx-rs/naga.git
+cargo install naga-cli --git https://github.com/gfx-rs/wgpu.git
 ```

 Then, you can run `naga` command.
--- a/naga/benches/criterion.rs
+++ b/naga/benches/criterion.rs
@ -1,273 +0,0 @@
-#![cfg(not(target_arch = "wasm32"))]
-#![allow(clippy::needless_borrowed_reference)]
-
-use criterion::*;
-use std::{fs, path::PathBuf, slice};
-
-fn gather_inputs(folder: &str, extension: &str) -> Vec<Box<[u8]>> {
-    let mut list = Vec::new();
-    let read_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
-        .join(folder)
-        .read_dir()
-        .unwrap();
-    for file_entry in read_dir {
-        match file_entry {
-            Ok(entry) => match entry.path().extension() {
-                Some(ostr) if ostr == extension => {
-                    let input = fs::read(entry.path()).unwrap_or_default();
-                    list.push(input.into_boxed_slice());
-                }
-                _ => continue,
-            },
-            Err(e) => {
-                log::warn!("Skipping file: {:?}", e);
-                continue;
-            }
-        }
-    }
-    list
-}
-
-fn parse_glsl(stage: naga::ShaderStage, inputs: &[Box<[u8]>]) {
-    let mut parser = naga::front::glsl::Frontend::default();
-    let options = naga::front::glsl::Options {
-        stage,
-        defines: Default::default(),
-    };
-    for input in inputs.iter() {
-        let string = std::str::from_utf8(input).unwrap();
-        parser.parse(&options, string).unwrap();
-    }
-}
-
-fn frontends(c: &mut Criterion) {
-    let mut group = c.benchmark_group("front");
-    #[cfg(all(feature = "wgsl-in", feature = "serialize", feature = "deserialize"))]
-    group.bench_function("bin", |b| {
-        let inputs_wgsl = gather_inputs("tests/in", "wgsl");
-        let mut frontend = naga::front::wgsl::Frontend::new();
-        let inputs_bin = inputs_wgsl
-            .iter()
-            .map(|input| {
-                let string = std::str::from_utf8(input).unwrap();
-                let module = frontend.parse(string).unwrap();
-                bincode::serialize(&module).unwrap()
-            })
-            .collect::<Vec<_>>();
-        b.iter(move || {
-            for input in inputs_bin.iter() {
-                bincode::deserialize::<naga::Module>(input).unwrap();
-            }
-        });
-    });
-    #[cfg(feature = "wgsl-in")]
-    group.bench_function("wgsl", |b| {
-        let inputs_wgsl = gather_inputs("tests/in", "wgsl");
-        let inputs = inputs_wgsl
-            .iter()
-            .map(|input| std::str::from_utf8(input).unwrap())
-            .collect::<Vec<_>>();
-        let mut frontend = naga::front::wgsl::Frontend::new();
-        b.iter(move || {
-            for &input in inputs.iter() {
-                frontend.parse(input).unwrap();
-            }
-        });
-    });
-    #[cfg(feature = "spv-in")]
-    group.bench_function("spv", |b| {
-        let inputs = gather_inputs("tests/in/spv", "spv");
-        b.iter(move || {
-            let options = naga::front::spv::Options::default();
-            for input in inputs.iter() {
-                let spv =
-                    unsafe { slice::from_raw_parts(input.as_ptr() as *const u32, input.len() / 4) };
-                let parser = naga::front::spv::Frontend::new(spv.iter().cloned(), &options);
-                parser.parse().unwrap();
-            }
-        });
-    });
-    #[cfg(feature = "glsl-in")]
-    group.bench_function("glsl", |b| {
-        let vert = gather_inputs("tests/in/glsl", "vert");
-        b.iter(move || parse_glsl(naga::ShaderStage::Vertex, &vert));
-        let frag = gather_inputs("tests/in/glsl", "frag");
-        b.iter(move || parse_glsl(naga::ShaderStage::Vertex, &frag));
-        //TODO: hangs for some reason!
-        //let comp = gather_inputs("tests/in/glsl", "comp");
-        //b.iter(move || parse_glsl(naga::ShaderStage::Compute, &comp));
-    });
-}
-
-#[cfg(feature = "wgsl-in")]
-fn gather_modules() -> Vec<naga::Module> {
-    let inputs = gather_inputs("tests/in", "wgsl");
-    let mut frontend = naga::front::wgsl::Frontend::new();
-    inputs
-        .iter()
-        .map(|input| {
-            let string = std::str::from_utf8(input).unwrap();
-            frontend.parse(string).unwrap()
-        })
-        .collect()
-}
-#[cfg(not(feature = "wgsl-in"))]
-fn gather_modules() -> Vec<naga::Module> {
-    Vec::new()
-}
-
-fn validation(c: &mut Criterion) {
-    let inputs = gather_modules();
-    let mut group = c.benchmark_group("valid");
-    group.bench_function("safe", |b| {
-        let mut validator = naga::valid::Validator::new(
-            naga::valid::ValidationFlags::all(),
-            naga::valid::Capabilities::all(),
-        );
-        b.iter(|| {
-            for input in inputs.iter() {
-                validator.validate(input).unwrap();
-            }
-        });
-    });
-    group.bench_function("unsafe", |b| {
-        let mut validator = naga::valid::Validator::new(
-            naga::valid::ValidationFlags::empty(),
-            naga::valid::Capabilities::all(),
-        );
-        b.iter(|| {
-            for input in inputs.iter() {
-                validator.validate(input).unwrap();
-            }
-        });
-    });
-}
-
-fn backends(c: &mut Criterion) {
-    let inputs = {
-        let mut validator = naga::valid::Validator::new(
-            naga::valid::ValidationFlags::empty(),
-            naga::valid::Capabilities::default(),
-        );
-        let input_modules = gather_modules();
-        input_modules
-            .into_iter()
-            .flat_map(|module| validator.validate(&module).ok().map(|info| (module, info)))
-            .collect::<Vec<_>>()
-    };
-
-    let mut group = c.benchmark_group("back");
-    #[cfg(feature = "wgsl-out")]
-    group.bench_function("wgsl", |b| {
-        b.iter(|| {
-            let mut string = String::new();
-            let flags = naga::back::wgsl::WriterFlags::empty();
-            for &(ref module, ref info) in inputs.iter() {
-                let mut writer = naga::back::wgsl::Writer::new(&mut string, flags);
-                writer.write(module, info).unwrap();
-                string.clear();
-            }
-        });
-    });
-
-    #[cfg(feature = "spv-out")]
-    group.bench_function("spv", |b| {
-        b.iter(|| {
-            let mut data = Vec::new();
-            let options = naga::back::spv::Options::default();
-            for &(ref module, ref info) in inputs.iter() {
-                let mut writer = naga::back::spv::Writer::new(&options).unwrap();
-                writer.write(module, info, None, &None, &mut data).unwrap();
-                data.clear();
-            }
-        });
-    });
-    #[cfg(feature = "spv-out")]
-    group.bench_function("spv-separate", |b| {
-        b.iter(|| {
-            let mut data = Vec::new();
-            let options = naga::back::spv::Options::default();
-            for &(ref module, ref info) in inputs.iter() {
-                let mut writer = naga::back::spv::Writer::new(&options).unwrap();
-                for ep in module.entry_points.iter() {
-                    let pipeline_options = naga::back::spv::PipelineOptions {
-                        shader_stage: ep.stage,
-                        entry_point: ep.name.clone(),
-                    };
-                    writer
-                        .write(module, info, Some(&pipeline_options), &None, &mut data)
-                        .unwrap();
-                    data.clear();
-                }
-            }
-        });
-    });
-
-    #[cfg(feature = "msl-out")]
-    group.bench_function("msl", |b| {
-        b.iter(|| {
-            let mut string = String::new();
-            let options = naga::back::msl::Options::default();
-            for &(ref module, ref info) in inputs.iter() {
-                let pipeline_options = naga::back::msl::PipelineOptions::default();
-                let mut writer = naga::back::msl::Writer::new(&mut string);
-                writer
-                    .write(module, info, &options, &pipeline_options)
-                    .unwrap();
-                string.clear();
-            }
-        });
-    });
-
-    #[cfg(feature = "hlsl-out")]
-    group.bench_function("hlsl", |b| {
-        b.iter(|| {
-            let options = naga::back::hlsl::Options::default();
-            let mut string = String::new();
-            for &(ref module, ref info) in inputs.iter() {
-                let mut writer = naga::back::hlsl::Writer::new(&mut string, &options);
-                let _ = writer.write(module, info); // may fail on unimplemented things
-                string.clear();
-            }
-        });
-    });
-
-    #[cfg(feature = "glsl-out")]
-    group.bench_function("glsl-separate", |b| {
-        b.iter(|| {
-            let mut string = String::new();
-            let options = naga::back::glsl::Options {
-                version: naga::back::glsl::Version::new_gles(320),
-                writer_flags: naga::back::glsl::WriterFlags::empty(),
-                binding_map: Default::default(),
-                zero_initialize_workgroup_memory: true,
-            };
-            for &(ref module, ref info) in inputs.iter() {
-                for ep in module.entry_points.iter() {
-                    let pipeline_options = naga::back::glsl::PipelineOptions {
-                        shader_stage: ep.stage,
-                        entry_point: ep.name.clone(),
-                        multiview: None,
-                    };
-
-                    // might be `Err` if missing features
-                    if let Ok(mut writer) = naga::back::glsl::Writer::new(
-                        &mut string,
-                        module,
-                        info,
-                        &options,
-                        &pipeline_options,
-                        naga::proc::BoundsCheckPolicies::default(),
-                    ) {
-                        let _ = writer.write(); // might be `Err` if unsupported
-                    }
-
-                    string.clear();
-                }
-            }
-        });
-    });
-}
-
-criterion_group!(criterion, frontends, validation, backends,);
-criterion_main!(criterion);
--- a/naga/fuzz/Cargo.toml
+++ b/naga/fuzz/Cargo.toml
@ -15,29 +15,33 @@ libfuzzer-sys = "0.4"

 [target.'cfg(not(any(target_arch = "wasm32", target_os = "ios")))'.dependencies.naga]
 path = ".."
-version = "0.19.0"
+version = "0.20.0"
 features = ["arbitrary", "spv-in", "wgsl-in", "glsl-in"]

 [[bin]]
 name = "spv_parser"
 path = "fuzz_targets/spv_parser.rs"
+bench = false
 test = false
 doc = false

 [[bin]]
 name = "wgsl_parser"
 path = "fuzz_targets/wgsl_parser.rs"
+bench = false
 test = false
 doc = false

 [[bin]]
 name = "glsl_parser"
 path = "fuzz_targets/glsl_parser.rs"
+bench = false
 test = false
 doc = false

 [[bin]]
 name = "ir"
 path = "fuzz_targets/ir.rs"
+bench = false
 test = false
 doc = false
--- a/naga/src/arena.rs
+++ b/naga/src/arena.rs
@ -122,6 +122,7 @@ impl<T> Handle<T> {
    serde(transparent)
 )]
 #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
+#[cfg_attr(test, derive(PartialEq))]
 pub struct Range<T> {
    inner: ops::Range<u32>,
    #[cfg_attr(any(feature = "serialize", feature = "deserialize"), serde(skip))]
@ -140,6 +141,7 @@ impl<T> Range<T> {

 // NOTE: Keep this diagnostic in sync with that of [`BadHandle`].
 #[derive(Clone, Debug, thiserror::Error)]
+#[cfg_attr(test, derive(PartialEq))]
 #[error("Handle range {range:?} of {kind} is either not present, or inaccessible yet")]
 pub struct BadRangeError {
    // This error is used for many `Handle` types, but there's no point in making this generic, so
@ -239,7 +241,7 @@ impl<T> Range<T> {
 /// Adding new items to the arena produces a strongly-typed [`Handle`].
 /// The arena can be indexed using the given handle to obtain
 /// a reference to the stored item.
-#[cfg_attr(feature = "clone", derive(Clone))]
+#[derive(Clone)]
 #[cfg_attr(feature = "serialize", derive(serde::Serialize))]
 #[cfg_attr(feature = "serialize", serde(transparent))]
 #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
@ -297,6 +299,17 @@ impl<T> Arena<T> {
            .map(|(i, v)| unsafe { (Handle::from_usize_unchecked(i), v) })
    }

+    /// Drains the arena, returning an iterator over the items stored.
+    pub fn drain(&mut self) -> impl DoubleEndedIterator<Item = (Handle<T>, T, Span)> {
+        let arena = std::mem::take(self);
+        arena
+            .data
+            .into_iter()
+            .zip(arena.span_info)
+            .enumerate()
+            .map(|(i, (v, span))| unsafe { (Handle::from_usize_unchecked(i), v, span) })
+    }
+
    /// Returns a iterator over the items stored in this arena,
    /// returning both the item's handle and a mutable reference to it.
    pub fn iter_mut(&mut self) -> impl DoubleEndedIterator<Item = (Handle<T>, &mut T)> {
@ -531,7 +544,7 @@ mod tests {
 ///
 /// `UniqueArena` is similar to [`Arena`]: If `Arena` is vector-like,
 /// `UniqueArena` is `HashSet`-like.
-#[cfg_attr(feature = "clone", derive(Clone))]
+#[derive(Clone)]
 pub struct UniqueArena<T> {
    set: FastIndexSet<T>,

--- a/naga/src/back/dot/mod.rs
+++ b/naga/src/back/dot/mod.rs
@ -279,6 +279,94 @@ impl StatementGraph {
                        crate::RayQueryFunction::Terminate => "RayQueryTerminate",
                    }
                }
+                S::SubgroupBallot { result, predicate } => {
+                    if let Some(predicate) = predicate {
+                        self.dependencies.push((id, predicate, "predicate"));
+                    }
+                    self.emits.push((id, result));
+                    "SubgroupBallot"
+                }
+                S::SubgroupCollectiveOperation {
+                    op,
+                    collective_op,
+                    argument,
+                    result,
+                } => {
+                    self.dependencies.push((id, argument, "arg"));
+                    self.emits.push((id, result));
+                    match (collective_op, op) {
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => {
+                            "SubgroupAll"
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => {
+                            "SubgroupAny"
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => {
+                            "SubgroupAdd"
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => {
+                            "SubgroupMul"
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => {
+                            "SubgroupMax"
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => {
+                            "SubgroupMin"
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => {
+                            "SubgroupAnd"
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => {
+                            "SubgroupOr"
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => {
+                            "SubgroupXor"
+                        }
+                        (
+                            crate::CollectiveOperation::ExclusiveScan,
+                            crate::SubgroupOperation::Add,
+                        ) => "SubgroupExclusiveAdd",
+                        (
+                            crate::CollectiveOperation::ExclusiveScan,
+                            crate::SubgroupOperation::Mul,
+                        ) => "SubgroupExclusiveMul",
+                        (
+                            crate::CollectiveOperation::InclusiveScan,
+                            crate::SubgroupOperation::Add,
+                        ) => "SubgroupInclusiveAdd",
+                        (
+                            crate::CollectiveOperation::InclusiveScan,
+                            crate::SubgroupOperation::Mul,
+                        ) => "SubgroupInclusiveMul",
+                        _ => unimplemented!(),
+                    }
+                }
+                S::SubgroupGather {
+                    mode,
+                    argument,
+                    result,
+                } => {
+                    match mode {
+                        crate::GatherMode::BroadcastFirst => {}
+                        crate::GatherMode::Broadcast(index)
+                        | crate::GatherMode::Shuffle(index)
+                        | crate::GatherMode::ShuffleDown(index)
+                        | crate::GatherMode::ShuffleUp(index)
+                        | crate::GatherMode::ShuffleXor(index) => {
+                            self.dependencies.push((id, index, "index"))
+                        }
+                    }
+                    self.dependencies.push((id, argument, "arg"));
+                    self.emits.push((id, result));
+                    match mode {
+                        crate::GatherMode::BroadcastFirst => "SubgroupBroadcastFirst",
+                        crate::GatherMode::Broadcast(_) => "SubgroupBroadcast",
+                        crate::GatherMode::Shuffle(_) => "SubgroupShuffle",
+                        crate::GatherMode::ShuffleDown(_) => "SubgroupShuffleDown",
+                        crate::GatherMode::ShuffleUp(_) => "SubgroupShuffleUp",
+                        crate::GatherMode::ShuffleXor(_) => "SubgroupShuffleXor",
+                    }
+                }
            };
            // Set the last node to the merge node
            last_node = merge_id;
@ -404,6 +492,7 @@ fn write_function_expressions(
        let (label, color_id) = match *expression {
            E::Literal(_) => ("Literal".into(), 2),
            E::Constant(_) => ("Constant".into(), 2),
+            E::Override(_) => ("Override".into(), 2),
            E::ZeroValue(_) => ("ZeroValue".into(), 2),
            E::Compose { ref components, .. } => {
                payload = Some(Payload::Arguments(components));
@ -586,6 +675,8 @@ fn write_function_expressions(
                let ty = if committed { "Committed" } else { "Candidate" };
                (format!("rayQueryGet{}Intersection", ty).into(), 4)
            }
+            E::SubgroupBallotResult => ("SubgroupBallotResult".into(), 4),
+            E::SubgroupOperationResult { .. } => ("SubgroupOperationResult".into(), 4),
        };

        // give uniform expressions an outline
--- a/naga/src/back/glsl/features.rs
+++ b/naga/src/back/glsl/features.rs
@ -50,6 +50,8 @@ bitflags::bitflags! {
        const INSTANCE_INDEX = 1 << 22;
        /// Sample specific LODs of cube / array shadow textures
        const TEXTURE_SHADOW_LOD = 1 << 23;
+        /// Subgroup operations
+        const SUBGROUP_OPERATIONS = 1 << 24;
    }
 }

@ -117,6 +119,7 @@ impl FeaturesManager {
        check_feature!(SAMPLE_VARIABLES, 400, 300);
        check_feature!(DYNAMIC_ARRAY_SIZE, 430, 310);
        check_feature!(DUAL_SOURCE_BLENDING, 330, 300 /* with extension */);
+        check_feature!(SUBGROUP_OPERATIONS, 430, 310);
        match version {
            Version::Embedded { is_webgl: true, .. } => check_feature!(MULTI_VIEW, 140, 300),
            _ => check_feature!(MULTI_VIEW, 140, 310),
@ -259,6 +262,22 @@ impl FeaturesManager {
            writeln!(out, "#extension GL_EXT_texture_shadow_lod : require")?;
        }

+        if self.0.contains(Features::SUBGROUP_OPERATIONS) {
+            // https://registry.khronos.org/OpenGL/extensions/KHR/KHR_shader_subgroup.txt
+            writeln!(out, "#extension GL_KHR_shader_subgroup_basic : require")?;
+            writeln!(out, "#extension GL_KHR_shader_subgroup_vote : require")?;
+            writeln!(
+                out,
+                "#extension GL_KHR_shader_subgroup_arithmetic : require"
+            )?;
+            writeln!(out, "#extension GL_KHR_shader_subgroup_ballot : require")?;
+            writeln!(out, "#extension GL_KHR_shader_subgroup_shuffle : require")?;
+            writeln!(
+                out,
+                "#extension GL_KHR_shader_subgroup_shuffle_relative : require"
+            )?;
+        }
+
        Ok(())
    }
 }
@ -326,7 +345,7 @@ impl<'a, W> Writer<'a, W> {
                            }

                            // If the type of this global is a struct
-                            if let crate::TypeInner::Struct { ref members, .. } =
+                            if let TypeInner::Struct { ref members, .. } =
                                self.module.types[global.ty].inner
                            {
                                // Check the last element of the struct to see if it's type uses
@ -453,7 +472,7 @@ impl<'a, W> Writer<'a, W> {
                    // layers queries are also implemented as size queries
                    crate::ImageQuery::Size { .. } | crate::ImageQuery::NumLayers => {
                        if let TypeInner::Image {
-                            class: crate::ImageClass::Storage { .. }, ..
+                            class: ImageClass::Storage { .. }, ..
                        } = *info[image].ty.inner_with(&module.types) {
                            features.request(Features::IMAGE_SIZE)
                        }
@ -518,6 +537,10 @@ impl<'a, W> Writer<'a, W> {
                        }
                    }
                }
+                Expression::SubgroupBallotResult |
+                Expression::SubgroupOperationResult { .. } => {
+                    features.request(Features::SUBGROUP_OPERATIONS)
+                }
                _ => {}
            }
            }
@ -535,7 +558,7 @@ impl<'a, W> Writer<'a, W> {

    fn varying_required_features(&mut self, binding: Option<&Binding>, ty: Handle<Type>) {
        match self.module.types[ty].inner {
-            crate::TypeInner::Struct { ref members, .. } => {
+            TypeInner::Struct { ref members, .. } => {
                for member in members {
                    self.varying_required_features(member.binding.as_ref(), member.ty);
                }
--- a/naga/src/back/glsl/mod.rs
+++ b/naga/src/back/glsl/mod.rs
@ -53,8 +53,7 @@ use crate::{
 use features::FeaturesManager;
 use std::{
    cmp::Ordering,
-    fmt,
-    fmt::{Error as FmtError, Write},
+    fmt::{self, Error as FmtError, Write},
    mem,
 };
 use thiserror::Error;
@ -282,7 +281,7 @@ impl Default for Options {
 }

 /// A subset of options meant to be changed per pipeline.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone)]
 #[cfg_attr(feature = "serialize", derive(serde::Serialize))]
 #[cfg_attr(feature = "deserialize", derive(serde::Deserialize))]
 pub struct PipelineOptions {
@ -497,6 +496,8 @@ pub enum Error {
    ImageMultipleSamplers,
    #[error("{0}")]
    Custom(String),
+    #[error("overrides should not be present at this stage")]
+    Override,
 }

 /// Binary operation with a different logic on the GLSL side.
@ -565,6 +566,10 @@ impl<'a, W: Write> Writer<'a, W> {
        pipeline_options: &'a PipelineOptions,
        policies: proc::BoundsCheckPolicies,
    ) -> Result<Self, Error> {
+        if !module.overrides.is_empty() {
+            return Err(Error::Override);
+        }
+
        // Check if the requested version is supported
        if !options.version.is_supported() {
            log::error!("Version {}", options.version);
@ -1242,7 +1247,7 @@ impl<'a, W: Write> Writer<'a, W> {
        self.reflection_names_globals.insert(handle, block_name);

        match self.module.types[global.ty].inner {
-            crate::TypeInner::Struct { ref members, .. }
+            TypeInner::Struct { ref members, .. }
                if self.module.types[members.last().unwrap().ty]
                    .inner
                    .is_dynamically_sized(&self.module.types) =>
@ -1312,6 +1317,12 @@ impl<'a, W: Write> Writer<'a, W> {
                            }
                        }
                    }
+                    crate::MathFunction::Pack4xI8
+                    | crate::MathFunction::Pack4xU8
+                    | crate::MathFunction::Unpack4xI8
+                    | crate::MathFunction::Unpack4xU8 => {
+                        self.need_bake_expressions.insert(arg);
+                    }
                    crate::MathFunction::ExtractBits => {
                        // Only argument 1 is re-used.
                        self.need_bake_expressions.insert(arg1.unwrap());
@ -1423,7 +1434,7 @@ impl<'a, W: Write> Writer<'a, W> {
        output: bool,
    ) -> Result<(), Error> {
        // For a struct, emit a separate global for each member with a binding.
-        if let crate::TypeInner::Struct { ref members, .. } = self.module.types[ty].inner {
+        if let TypeInner::Struct { ref members, .. } = self.module.types[ty].inner {
            for member in members {
                self.write_varying(member.binding.as_ref(), member.ty, output)?;
            }
@ -1695,7 +1706,7 @@ impl<'a, W: Write> Writer<'a, W> {
                write!(self.out, " {name}")?;
                write!(self.out, " = ")?;
                match self.module.types[arg.ty].inner {
-                    crate::TypeInner::Struct { ref members, .. } => {
+                    TypeInner::Struct { ref members, .. } => {
                        self.write_type(arg.ty)?;
                        write!(self.out, "(")?;
                        for (index, member) in members.iter().enumerate() {
@ -2180,7 +2191,7 @@ impl<'a, W: Write> Writer<'a, W> {
                        if let Some(ref result) = ep.function.result {
                            let value = value.unwrap();
                            match self.module.types[result.ty].inner {
-                                crate::TypeInner::Struct { ref members, .. } => {
+                                TypeInner::Struct { ref members, .. } => {
                                    let temp_struct_name = match ctx.expressions[value] {
                                        crate::Expression::Compose { .. } => {
                                            let return_struct = "_tmp_return";
@ -2384,6 +2395,125 @@ impl<'a, W: Write> Writer<'a, W> {
                writeln!(self.out, ");")?;
            }
            Statement::RayQuery { .. } => unreachable!(),
+            Statement::SubgroupBallot { result, predicate } => {
+                write!(self.out, "{level}")?;
+                let res_name = format!("{}{}", back::BAKE_PREFIX, result.index());
+                let res_ty = ctx.info[result].ty.inner_with(&self.module.types);
+                self.write_value_type(res_ty)?;
+                write!(self.out, " {res_name} = ")?;
+                self.named_expressions.insert(result, res_name);
+
+                write!(self.out, "subgroupBallot(")?;
+                match predicate {
+                    Some(predicate) => self.write_expr(predicate, ctx)?,
+                    None => write!(self.out, "true")?,
+                }
+                writeln!(self.out, ");")?;
+            }
+            Statement::SubgroupCollectiveOperation {
+                op,
+                collective_op,
+                argument,
+                result,
+            } => {
+                write!(self.out, "{level}")?;
+                let res_name = format!("{}{}", back::BAKE_PREFIX, result.index());
+                let res_ty = ctx.info[result].ty.inner_with(&self.module.types);
+                self.write_value_type(res_ty)?;
+                write!(self.out, " {res_name} = ")?;
+                self.named_expressions.insert(result, res_name);
+
+                match (collective_op, op) {
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => {
+                        write!(self.out, "subgroupAll(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => {
+                        write!(self.out, "subgroupAny(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => {
+                        write!(self.out, "subgroupAdd(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => {
+                        write!(self.out, "subgroupMul(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => {
+                        write!(self.out, "subgroupMax(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => {
+                        write!(self.out, "subgroupMin(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => {
+                        write!(self.out, "subgroupAnd(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => {
+                        write!(self.out, "subgroupOr(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => {
+                        write!(self.out, "subgroupXor(")?
+                    }
+                    (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Add) => {
+                        write!(self.out, "subgroupExclusiveAdd(")?
+                    }
+                    (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Mul) => {
+                        write!(self.out, "subgroupExclusiveMul(")?
+                    }
+                    (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Add) => {
+                        write!(self.out, "subgroupInclusiveAdd(")?
+                    }
+                    (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Mul) => {
+                        write!(self.out, "subgroupInclusiveMul(")?
+                    }
+                    _ => unimplemented!(),
+                }
+                self.write_expr(argument, ctx)?;
+                writeln!(self.out, ");")?;
+            }
+            Statement::SubgroupGather {
+                mode,
+                argument,
+                result,
+            } => {
+                write!(self.out, "{level}")?;
+                let res_name = format!("{}{}", back::BAKE_PREFIX, result.index());
+                let res_ty = ctx.info[result].ty.inner_with(&self.module.types);
+                self.write_value_type(res_ty)?;
+                write!(self.out, " {res_name} = ")?;
+                self.named_expressions.insert(result, res_name);
+
+                match mode {
+                    crate::GatherMode::BroadcastFirst => {
+                        write!(self.out, "subgroupBroadcastFirst(")?;
+                    }
+                    crate::GatherMode::Broadcast(_) => {
+                        write!(self.out, "subgroupBroadcast(")?;
+                    }
+                    crate::GatherMode::Shuffle(_) => {
+                        write!(self.out, "subgroupShuffle(")?;
+                    }
+                    crate::GatherMode::ShuffleDown(_) => {
+                        write!(self.out, "subgroupShuffleDown(")?;
+                    }
+                    crate::GatherMode::ShuffleUp(_) => {
+                        write!(self.out, "subgroupShuffleUp(")?;
+                    }
+                    crate::GatherMode::ShuffleXor(_) => {
+                        write!(self.out, "subgroupShuffleXor(")?;
+                    }
+                }
+                self.write_expr(argument, ctx)?;
+                match mode {
+                    crate::GatherMode::BroadcastFirst => {}
+                    crate::GatherMode::Broadcast(index)
+                    | crate::GatherMode::Shuffle(index)
+                    | crate::GatherMode::ShuffleDown(index)
+                    | crate::GatherMode::ShuffleUp(index)
+                    | crate::GatherMode::ShuffleXor(index) => {
+                        write!(self.out, ", ")?;
+                        self.write_expr(index, ctx)?;
+                    }
+                }
+                writeln!(self.out, ");")?;
+            }
        }

        Ok(())
@ -2402,7 +2532,7 @@ impl<'a, W: Write> Writer<'a, W> {
    fn write_const_expr(&mut self, expr: Handle<crate::Expression>) -> BackendResult {
        self.write_possibly_const_expr(
            expr,
-            &self.module.const_expressions,
+            &self.module.global_expressions,
            |expr| &self.info[expr],
            |writer, expr| writer.write_const_expr(expr),
        )
@ -2536,6 +2666,7 @@ impl<'a, W: Write> Writer<'a, W> {
                    |writer, expr| writer.write_expr(expr, ctx),
                )?;
            }
+            Expression::Override(_) => return Err(Error::Override),
            // `Access` is applied to arrays, vectors and matrices and is written as indexing
            Expression::Access { base, index } => {
                self.write_expr(base, ctx)?;
@ -2842,7 +2973,7 @@ impl<'a, W: Write> Writer<'a, W> {
                                if let Some(expr) = level {
                                    let cast_to_int = matches!(
                                        *ctx.resolve_type(expr, &self.module.types),
-                                        crate::TypeInner::Scalar(crate::Scalar {
+                                        TypeInner::Scalar(crate::Scalar {
                                            kind: crate::ScalarKind::Uint,
                                            ..
                                        })
@ -3185,7 +3316,7 @@ impl<'a, W: Write> Writer<'a, W> {
                        self.write_expr(arg, ctx)?;

                        match *ctx.resolve_type(arg, &self.module.types) {
-                            crate::TypeInner::Vector { size, .. } => write!(
+                            TypeInner::Vector { size, .. } => write!(
                                self.out,
                                ", vec{}(0.0), vec{0}(1.0)",
                                back::vector_size_str(size)
@ -3232,7 +3363,7 @@ impl<'a, W: Write> Writer<'a, W> {
                    Mf::Pow => "pow",
                    // geometry
                    Mf::Dot => match *ctx.resolve_type(arg, &self.module.types) {
-                        crate::TypeInner::Vector {
+                        TypeInner::Vector {
                            scalar:
                                crate::Scalar {
                                    kind: crate::ScalarKind::Float,
@ -3240,7 +3371,7 @@ impl<'a, W: Write> Writer<'a, W> {
                                },
                            ..
                        } => "dot",
-                        crate::TypeInner::Vector { size, .. } => {
+                        TypeInner::Vector { size, .. } => {
                            return self.write_dot_product(arg, arg1.unwrap(), size as usize, ctx)
                        }
                        _ => unreachable!(
@ -3292,7 +3423,7 @@ impl<'a, W: Write> Writer<'a, W> {
                    // bits
                    Mf::CountTrailingZeros => {
                        match *ctx.resolve_type(arg, &self.module.types) {
-                            crate::TypeInner::Vector { size, scalar, .. } => {
+                            TypeInner::Vector { size, scalar, .. } => {
                                let s = back::vector_size_str(size);
                                if let crate::ScalarKind::Uint = scalar.kind {
                                    write!(self.out, "min(uvec{s}(findLSB(")?;
@ -3304,7 +3435,7 @@ impl<'a, W: Write> Writer<'a, W> {
                                    write!(self.out, ")), uvec{s}(32u)))")?;
                                }
                            }
-                            crate::TypeInner::Scalar(scalar) => {
+                            TypeInner::Scalar(scalar) => {
                                if let crate::ScalarKind::Uint = scalar.kind {
                                    write!(self.out, "min(uint(findLSB(")?;
                                    self.write_expr(arg, ctx)?;
@ -3322,7 +3453,7 @@ impl<'a, W: Write> Writer<'a, W> {
                    Mf::CountLeadingZeros => {
                        if self.options.version.supports_integer_functions() {
                            match *ctx.resolve_type(arg, &self.module.types) {
-                                crate::TypeInner::Vector { size, scalar } => {
+                                TypeInner::Vector { size, scalar } => {
                                    let s = back::vector_size_str(size);

                                    if let crate::ScalarKind::Uint = scalar.kind {
@ -3337,7 +3468,7 @@ impl<'a, W: Write> Writer<'a, W> {
                                        write!(self.out, ", ivec{s}(0)))")?;
                                    }
                                }
-                                crate::TypeInner::Scalar(scalar) => {
+                                TypeInner::Scalar(scalar) => {
                                    if let crate::ScalarKind::Uint = scalar.kind {
                                        write!(self.out, "uint(31 - findMSB(")?;
                                    } else {
@ -3353,7 +3484,7 @@ impl<'a, W: Write> Writer<'a, W> {
                            };
                        } else {
                            match *ctx.resolve_type(arg, &self.module.types) {
-                                crate::TypeInner::Vector { size, scalar } => {
+                                TypeInner::Vector { size, scalar } => {
                                    let s = back::vector_size_str(size);

                                    if let crate::ScalarKind::Uint = scalar.kind {
@ -3371,7 +3502,7 @@ impl<'a, W: Write> Writer<'a, W> {
                                        write!(self.out, ", ivec{s}(0u))))")?;
                                    }
                                }
-                                crate::TypeInner::Scalar(scalar) => {
+                                TypeInner::Scalar(scalar) => {
                                    if let crate::ScalarKind::Uint = scalar.kind {
                                        write!(self.out, "uint(31.0 - floor(log2(float(")?;
                                        self.write_expr(arg, ctx)?;
@ -3411,7 +3542,8 @@ impl<'a, W: Write> Writer<'a, W> {
                        let scalar_bits = ctx
                            .resolve_type(arg, &self.module.types)
                            .scalar_width()
-                            .unwrap();
+                            .unwrap()
+                            * 8;

                        write!(self.out, "bitfieldExtract(")?;
                        self.write_expr(arg, ctx)?;
@ -3430,7 +3562,8 @@ impl<'a, W: Write> Writer<'a, W> {
                        let scalar_bits = ctx
                            .resolve_type(arg, &self.module.types)
                            .scalar_width()
-                            .unwrap();
+                            .unwrap()
+                            * 8;

                        write!(self.out, "bitfieldInsert(")?;
                        self.write_expr(arg, ctx)?;
@ -3454,12 +3587,66 @@ impl<'a, W: Write> Writer<'a, W> {
                    Mf::Pack2x16snorm => "packSnorm2x16",
                    Mf::Pack2x16unorm => "packUnorm2x16",
                    Mf::Pack2x16float => "packHalf2x16",
+                    fun @ (Mf::Pack4xI8 | Mf::Pack4xU8) => {
+                        let was_signed = match fun {
+                            Mf::Pack4xI8 => true,
+                            Mf::Pack4xU8 => false,
+                            _ => unreachable!(),
+                        };
+                        let const_suffix = if was_signed { "" } else { "u" };
+                        if was_signed {
+                            write!(self.out, "uint(")?;
+                        }
+                        write!(self.out, "(")?;
+                        self.write_expr(arg, ctx)?;
+                        write!(self.out, "[0] & 0xFF{const_suffix}) | ((")?;
+                        self.write_expr(arg, ctx)?;
+                        write!(self.out, "[1] & 0xFF{const_suffix}) << 8) | ((")?;
+                        self.write_expr(arg, ctx)?;
+                        write!(self.out, "[2] & 0xFF{const_suffix}) << 16) | ((")?;
+                        self.write_expr(arg, ctx)?;
+                        write!(self.out, "[3] & 0xFF{const_suffix}) << 24)")?;
+                        if was_signed {
+                            write!(self.out, ")")?;
+                        }
+
+                        return Ok(());
+                    }
                    // data unpacking
                    Mf::Unpack4x8snorm => "unpackSnorm4x8",
                    Mf::Unpack4x8unorm => "unpackUnorm4x8",
                    Mf::Unpack2x16snorm => "unpackSnorm2x16",
                    Mf::Unpack2x16unorm => "unpackUnorm2x16",
                    Mf::Unpack2x16float => "unpackHalf2x16",
+                    fun @ (Mf::Unpack4xI8 | Mf::Unpack4xU8) => {
+                        let sign_prefix = match fun {
+                            Mf::Unpack4xI8 => 'i',
+                            Mf::Unpack4xU8 => 'u',
+                            _ => unreachable!(),
+                        };
+                        write!(self.out, "{sign_prefix}vec4(")?;
+                        for i in 0..4 {
+                            write!(self.out, "bitfieldExtract(")?;
+                            // bitfieldExtract only sign-extends a signed operand, so the I8 case
+                            // casts the argument to int first
+                            match fun {
+                                Mf::Unpack4xI8 => {
+                                    write!(self.out, "int(")?;
+                                    self.write_expr(arg, ctx)?;
+                                    write!(self.out, ")")?;
+                                }
+                                Mf::Unpack4xU8 => self.write_expr(arg, ctx)?,
+                                _ => unreachable!(),
+                            };
+                            write!(self.out, ", {}, 8)", i * 8)?;
+                            if i != 3 {
+                                write!(self.out, ", ")?;
+                            }
+                        }
+                        write!(self.out, ")")?;
+
+                        return Ok(());
+                    }
                };

                let extract_bits = fun == Mf::ExtractBits;
@ -3477,11 +3664,11 @@ impl<'a, W: Write> Writer<'a, W> {
                // Check if the argument is an unsigned integer and return the vector size
                // in case it's a vector
                let maybe_uint_size = match *ctx.resolve_type(arg, &self.module.types) {
-                    crate::TypeInner::Scalar(crate::Scalar {
+                    TypeInner::Scalar(crate::Scalar {
                        kind: crate::ScalarKind::Uint,
                        ..
                    }) => Some(None),
-                    crate::TypeInner::Vector {
+                    TypeInner::Vector {
                        scalar:
                            crate::Scalar {
                                kind: crate::ScalarKind::Uint,
@ -3649,7 +3836,9 @@ impl<'a, W: Write> Writer<'a, W> {
            Expression::CallResult(_)
            | Expression::AtomicResult { .. }
            | Expression::RayQueryProceedResult
-            | Expression::WorkGroupUniformLoadResult { .. } => unreachable!(),
+            | Expression::WorkGroupUniformLoadResult { .. }
+            | Expression::SubgroupOperationResult { .. }
+            | Expression::SubgroupBallotResult => unreachable!(),
            // `ArrayLength` is written as `expr.length()` and we convert it to a uint
            Expression::ArrayLength(expr) => {
                write!(self.out, "uint(")?;
@ -4218,6 +4407,9 @@ impl<'a, W: Write> Writer<'a, W> {
        if flags.contains(crate::Barrier::WORK_GROUP) {
            writeln!(self.out, "{level}memoryBarrierShared();")?;
        }
+        if flags.contains(crate::Barrier::SUB_GROUP) {
+            writeln!(self.out, "{level}subgroupMemoryBarrier();")?;
+        }
        writeln!(self.out, "{level}barrier();")?;
        Ok(())
    }
@ -4269,7 +4461,7 @@ impl<'a, W: Write> Writer<'a, W> {
                continue;
            }
            match self.module.types[var.ty].inner {
-                crate::TypeInner::Image { .. } => {
+                TypeInner::Image { .. } => {
                    let tex_name = self.reflection_names_globals[&handle].clone();
                    match texture_mapping.entry(tex_name) {
                        Entry::Vacant(v) => {
@ -4305,7 +4497,7 @@ impl<'a, W: Write> Writer<'a, W> {
            //
            // This is potentially a bit wasteful, but the set of types in the program
            // shouldn't be too large.
-            let mut layouter = crate::proc::Layouter::default();
+            let mut layouter = proc::Layouter::default();
            layouter.update(self.module.to_ctx()).unwrap();

            // We start with the name of the binding itself.
@ -4333,7 +4525,7 @@ impl<'a, W: Write> Writer<'a, W> {
        &mut self,
        ty: Handle<crate::Type>,
        segments: &mut Vec<String>,
-        layouter: &crate::proc::Layouter,
+        layouter: &proc::Layouter,
        offset: &mut u32,
        items: &mut Vec<PushConstantItem>,
    ) {
@ -4487,6 +4679,11 @@ const fn glsl_built_in(built_in: crate::BuiltIn, options: VaryingOptions) -> &'s
        Bi::WorkGroupId => "gl_WorkGroupID",
        Bi::WorkGroupSize => "gl_WorkGroupSize",
        Bi::NumWorkGroups => "gl_NumWorkGroups",
+        // subgroup
+        Bi::NumSubgroups => "gl_NumSubgroups",
+        Bi::SubgroupId => "gl_SubgroupID",
+        Bi::SubgroupSize => "gl_SubgroupSize",
+        Bi::SubgroupInvocationId => "gl_SubgroupInvocationID",
    }
 }

--- a/naga/src/back/hlsl/conv.rs
+++ b/naga/src/back/hlsl/conv.rs
@ -179,6 +179,11 @@ impl crate::BuiltIn {
            // to this field will get replaced with references to `SPECIAL_CBUF_VAR`
            // in `Writer::write_expr`.
            Self::NumWorkGroups => "SV_GroupID",
+            // These builtins map to functions
+            Self::SubgroupSize
+            | Self::SubgroupInvocationId
+            | Self::NumSubgroups
+            | Self::SubgroupId => unreachable!(),
            Self::BaseInstance | Self::BaseVertex | Self::WorkGroupSize => {
                return Err(Error::Unimplemented(format!("builtin {self:?}")))
            }
--- a/naga/src/back/hlsl/help.rs
+++ b/naga/src/back/hlsl/help.rs
@ -70,6 +70,11 @@ pub(super) struct WrappedMath {
    pub(super) components: Option<u32>,
 }

+/// Key identifying the wrapped helper function emitted for an `Expression::ZeroValue`.
+///
+/// Values of this struct are stored in a hash set so that the helper for a given
+/// type is only written once.
+#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
+pub(super) struct WrappedZeroValue {
+    // The type carried by the `Expression::ZeroValue` this helper stands in for.
+    pub(super) ty: Handle<crate::Type>,
+}
+
 /// HLSL backend requires its own `ImageQuery` enum.
 ///
 /// It is used inside `WrappedImageQuery` and should be unique per ImageQuery function.
@ -359,7 +364,7 @@ impl<'a, W: Write> super::Writer<'a, W> {
    }

    /// Helper function that write wrapped function for `Expression::Compose` for structures.
-    pub(super) fn write_wrapped_constructor_function(
+    fn write_wrapped_constructor_function(
        &mut self,
        module: &crate::Module,
        constructor: WrappedConstructor,
@ -862,6 +867,25 @@ impl<'a, W: Write> super::Writer<'a, W> {
        Ok(())
    }

+    // TODO: we could merge this with iteration in write_wrapped_compose_functions...
+    //
+    /// Emits a wrapped zero-value helper for every `Expression::ZeroValue` found in
+    /// `expressions`.
+    ///
+    /// Each distinct type gets exactly one helper: types already recorded in
+    /// `self.wrapped.zero_values` are skipped.
+    pub(super) fn write_wrapped_zero_value_functions(
+        &mut self,
+        module: &crate::Module,
+        expressions: &crate::Arena<crate::Expression>,
+    ) -> BackendResult {
+        for (handle, _) in expressions.iter() {
+            // Only zero-value expressions need a helper.
+            let crate::Expression::ZeroValue(ty) = expressions[handle] else {
+                continue;
+            };
+
+            let wrapped = WrappedZeroValue { ty };
+            // `insert` reports `false` when this type's helper was already written.
+            if !self.wrapped.zero_values.insert(wrapped) {
+                continue;
+            }
+            self.write_wrapped_zero_value_function(module, wrapped)?;
+        }
+        Ok(())
+    }
+
    pub(super) fn write_wrapped_math_functions(
        &mut self,
        module: &crate::Module,
@ -1006,6 +1030,7 @@ impl<'a, W: Write> super::Writer<'a, W> {
    ) -> BackendResult {
        self.write_wrapped_math_functions(module, func_ctx)?;
        self.write_wrapped_compose_functions(module, func_ctx.expressions)?;
+        self.write_wrapped_zero_value_functions(module, func_ctx.expressions)?;

        for (handle, _) in func_ctx.expressions.iter() {
            match func_ctx.expressions[handle] {
@ -1019,7 +1044,12 @@ impl<'a, W: Write> super::Writer<'a, W> {
                        crate::Expression::GlobalVariable(var_handle) => {
                            &module.global_variables[var_handle]
                        }
-                        ref other => unreachable!("Array length of base {:?}", other),
+                        ref other => {
+                            return Err(super::Error::Unimplemented(format!(
+                                "Array length of base {:?}",
+                                other
+                            )))
+                        }
                    };
                    let storage_access = match global_var.space {
                        crate::AddressSpace::Storage { access } => access,
@ -1283,4 +1313,71 @@ impl<'a, W: Write> super::Writer<'a, W> {

        Ok(())
    }
+
+    /// Writes the identifier of the wrapped zero-value helper for `zero_value.ty`.
+    ///
+    /// The identifier is the literal prefix `ZeroValue` followed by the HLSL type id
+    /// of the type. Only the name is written; the return type, parameter list and
+    /// call parentheses are left to the caller.
+    pub(super) fn write_wrapped_zero_value_function_name(
+        &mut self,
+        module: &crate::Module,
+        zero_value: WrappedZeroValue,
+    ) -> BackendResult {
+        let type_id =
+            crate::TypeInner::hlsl_type_id(zero_value.ty, module.to_ctx(), &self.names)?;
+        write!(self.out, "ZeroValue{type_id}")?;
+        Ok(())
+    }
+
+    /// Helper function that writes the wrapped function for `Expression::ZeroValue`.
+    ///
+    /// This is necessary since we might have a member access after the zero value expression, e.g.
+    /// `.y` (in practice this can come up when consuming SPIRV that's been produced by glslc).
+    ///
+    /// So we can't just write `(float4)0` since `(float4)0.y` won't parse correctly.
+    ///
+    /// Parenthesizing the expression like `((float4)0).y` would work... except DXC can't handle
+    /// cases like:
+    ///
+    /// ```text
+    /// tests\out\hlsl\access.hlsl:183:41: error: cannot compile this l-value expression yet
+    ///     t_1.am = (__mat4x2[2])((float4x2[2])0);
+    ///                                         ^
+    /// ```
+    ///
+    /// The emitted function takes no parameters and returns the default initializer of
+    /// `zero_value.ty`. For array types a `typedef` named `ret_<function name>` is written
+    /// first and then used as the function's return type.
+    fn write_wrapped_zero_value_function(
+        &mut self,
+        module: &crate::Module,
+        zero_value: WrappedZeroValue,
+    ) -> BackendResult {
+        use crate::back::INDENT;
+
+        // Write function return type and name
+        if let crate::TypeInner::Array { base, size, .. } = module.types[zero_value.ty].inner {
+            // Declare `typedef <type> ret_<function name><array size>;` ...
+            write!(self.out, "typedef ")?;
+            self.write_type(module, zero_value.ty)?;
+            write!(self.out, " ret_")?;
+            self.write_wrapped_zero_value_function_name(module, zero_value)?;
+            self.write_array_size(module, base, size)?;
+            writeln!(self.out, ";")?;
+
+            // ... and use that typedef as the return type.
+            write!(self.out, "ret_")?;
+            self.write_wrapped_zero_value_function_name(module, zero_value)?;
+        } else {
+            self.write_type(module, zero_value.ty)?;
+        }
+        write!(self.out, " ")?;
+        self.write_wrapped_zero_value_function_name(module, zero_value)?;
+
+        // Write function parameters (none) and start function body
+        writeln!(self.out, "() {{")?;
+
+        // Write the `return` statement yielding the default-initialized value.
+        write!(self.out, "{INDENT}return ")?;
+        self.write_default_init(module, zero_value.ty)?;
+        writeln!(self.out, ";")?;
+
+        // End of function body
+        writeln!(self.out, "}}")?;
+        // Write extra new line
+        writeln!(self.out)?;
+
+        Ok(())
+    }
 }
--- a/naga/src/back/hlsl/mod.rs
+++ b/naga/src/back/hlsl/mod.rs
@ -131,6 +131,13 @@ pub enum ShaderModel {
    V5_0,
    V5_1,
    V6_0,
+    V6_1,
+    V6_2,
+    V6_3,
+    V6_4,
+    V6_5,
+    V6_6,
+    V6_7,
 }

 impl ShaderModel {
@ -139,6 +146,13 @@ impl ShaderModel {
            Self::V5_0 => "5_0",
            Self::V5_1 => "5_1",
            Self::V6_0 => "6_0",
+            Self::V6_1 => "6_1",
+            Self::V6_2 => "6_2",
+            Self::V6_3 => "6_3",
+            Self::V6_4 => "6_4",
+            Self::V6_5 => "6_5",
+            Self::V6_6 => "6_6",
+            Self::V6_7 => "6_7",
        }
    }
 }
@ -247,10 +261,13 @@ pub enum Error {
    Unimplemented(String), // TODO: Error used only during development
    #[error("{0}")]
    Custom(String),
+    #[error("overrides should not be present at this stage")]
+    Override,
 }

 #[derive(Default)]
 struct Wrapped {
+    zero_values: crate::FastHashSet<help::WrappedZeroValue>,
    array_lengths: crate::FastHashSet<help::WrappedArrayLength>,
    image_queries: crate::FastHashSet<help::WrappedImageQuery>,
    constructors: crate::FastHashSet<help::WrappedConstructor>,
--- a/naga/src/back/hlsl/writer.rs
+++ b/naga/src/back/hlsl/writer.rs
@ -1,5 +1,8 @@
 use super::{
-    help::{WrappedArrayLength, WrappedConstructor, WrappedImageQuery, WrappedStructMatrixAccess},
+    help::{
+        WrappedArrayLength, WrappedConstructor, WrappedImageQuery, WrappedStructMatrixAccess,
+        WrappedZeroValue,
+    },
    storage::StoreValue,
    BackendResult, Error, Options,
 };
@ -77,6 +80,19 @@ enum Io {
    Output,
 }

+/// Returns `true` when `binding` is one of the subgroup built-ins
+/// (`SubgroupSize`, `SubgroupInvocationId`, `NumSubgroups` or `SubgroupId`).
+///
+/// Any other binding, as well as `None`, yields `false`.
+const fn is_subgroup_builtin_binding(binding: &Option<crate::Binding>) -> bool {
+    matches!(
+        *binding,
+        Some(crate::Binding::BuiltIn(
+            crate::BuiltIn::SubgroupSize
+                | crate::BuiltIn::SubgroupInvocationId
+                | crate::BuiltIn::NumSubgroups
+                | crate::BuiltIn::SubgroupId
+        ))
+    )
+}
+
 impl<'a, W: fmt::Write> super::Writer<'a, W> {
    pub fn new(out: W, options: &'a Options) -> Self {
        Self {
@ -137,16 +153,20 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                    | crate::MathFunction::Unpack2x16unorm
                    | crate::MathFunction::Unpack4x8snorm
                    | crate::MathFunction::Unpack4x8unorm
+                    | crate::MathFunction::Unpack4xI8
+                    | crate::MathFunction::Unpack4xU8
                    | crate::MathFunction::Pack2x16float
                    | crate::MathFunction::Pack2x16snorm
                    | crate::MathFunction::Pack2x16unorm
                    | crate::MathFunction::Pack4x8snorm
-                    | crate::MathFunction::Pack4x8unorm => {
+                    | crate::MathFunction::Pack4x8unorm
+                    | crate::MathFunction::Pack4xI8
+                    | crate::MathFunction::Pack4xU8 => {
                        self.need_bake_expressions.insert(arg);
                    }
                    crate::MathFunction::CountLeadingZeros => {
                        let inner = info[fun_handle].ty.inner_with(&module.types);
-                        if let Some(crate::ScalarKind::Sint) = inner.scalar_kind() {
+                        if let Some(ScalarKind::Sint) = inner.scalar_kind() {
                            self.need_bake_expressions.insert(arg);
                        }
                    }
@ -161,6 +181,19 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                }
            }
        }
+        for statement in func.body.iter() {
+            match *statement {
+                crate::Statement::SubgroupCollectiveOperation {
+                    op: _,
+                    collective_op: crate::CollectiveOperation::InclusiveScan,
+                    argument,
+                    result: _,
+                } => {
+                    self.need_bake_expressions.insert(argument);
+                }
+                _ => {}
+            }
+        }
    }

    pub fn write(
@ -168,6 +201,10 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
        module: &Module,
        module_info: &valid::ModuleInfo,
    ) -> Result<super::ReflectionInfo, Error> {
+        if !module.overrides.is_empty() {
+            return Err(Error::Override);
+        }
+
        self.reset(module);

        // Write special constants, if needed
@ -233,7 +270,8 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {

        self.write_special_functions(module)?;

-        self.write_wrapped_compose_functions(module, &module.const_expressions)?;
+        self.write_wrapped_compose_functions(module, &module.global_expressions)?;
+        self.write_wrapped_zero_value_functions(module, &module.global_expressions)?;

        // Write all named constants
        let mut constants = module
@ -397,31 +435,32 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
    // if they are struct, so that the `stage` argument here could be omitted.
    fn write_semantic(
        &mut self,
-        binding: &crate::Binding,
+        binding: &Option<crate::Binding>,
        stage: Option<(ShaderStage, Io)>,
    ) -> BackendResult {
        match *binding {
-            crate::Binding::BuiltIn(builtin) => {
+            Some(crate::Binding::BuiltIn(builtin)) if !is_subgroup_builtin_binding(binding) => {
                let builtin_str = builtin.to_hlsl_str()?;
                write!(self.out, " : {builtin_str}")?;
            }
-            crate::Binding::Location {
+            Some(crate::Binding::Location {
                second_blend_source: true,
                ..
-            } => {
+            }) => {
                write!(self.out, " : SV_Target1")?;
            }
-            crate::Binding::Location {
+            Some(crate::Binding::Location {
                location,
                second_blend_source: false,
                ..
-            } => {
-                if stage == Some((crate::ShaderStage::Fragment, Io::Output)) {
+            }) => {
+                if stage == Some((ShaderStage::Fragment, Io::Output)) {
                    write!(self.out, " : SV_Target{location}")?;
                } else {
                    write!(self.out, " : {LOCATION_SEMANTIC}{location}")?;
                }
            }
+            _ => {}
        }

        Ok(())
@ -442,17 +481,30 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
        write!(self.out, "struct {struct_name}")?;
        writeln!(self.out, " {{")?;
        for m in members.iter() {
+            if is_subgroup_builtin_binding(&m.binding) {
+                continue;
+            }
            write!(self.out, "{}", back::INDENT)?;
            if let Some(ref binding) = m.binding {
                self.write_modifier(binding)?;
            }
            self.write_type(module, m.ty)?;
            write!(self.out, " {}", &m.name)?;
-            if let Some(ref binding) = m.binding {
-                self.write_semantic(binding, Some(shader_stage))?;
-            }
+            self.write_semantic(&m.binding, Some(shader_stage))?;
            writeln!(self.out, ";")?;
        }
+        if members.iter().any(|arg| {
+            matches!(
+                arg.binding,
+                Some(crate::Binding::BuiltIn(crate::BuiltIn::SubgroupId))
+            )
+        }) {
+            writeln!(
+                self.out,
+                "{}uint __local_invocation_index : SV_GroupIndex;",
+                back::INDENT
+            )?;
+        }
        writeln!(self.out, "}};")?;
        writeln!(self.out)?;

@ -553,8 +605,8 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
    }

    /// Writes special interface structures for an entry point. The special structures have
-    /// all the fields flattened into them and sorted by binding. They are only needed for
-    /// VS outputs and FS inputs, so that these interfaces match.
+    /// all the fields flattened into them and sorted by binding. They are needed to emulate
+    /// subgroup built-ins and to make the interfaces between VS outputs and FS inputs match.
    fn write_ep_interface(
        &mut self,
        module: &Module,
@ -563,7 +615,13 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
        ep_name: &str,
    ) -> Result<EntryPointInterface, Error> {
        Ok(EntryPointInterface {
-            input: if !func.arguments.is_empty() && stage == ShaderStage::Fragment {
+            input: if !func.arguments.is_empty()
+                && (stage == ShaderStage::Fragment
+                    || func
+                        .arguments
+                        .iter()
+                        .any(|arg| is_subgroup_builtin_binding(&arg.binding)))
+            {
                Some(self.write_ep_input_struct(module, func, stage, ep_name)?)
            } else {
                None
@ -577,6 +635,38 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
        })
    }

+    /// Writes the expression used to initialize a single entry point argument (or
+    /// struct member) described by `fake_member`.
+    ///
+    /// Subgroup built-ins are not read from a same-named field of the input struct:
+    /// - `SubgroupSize` becomes `WaveGetLaneCount()`,
+    /// - `SubgroupInvocationId` becomes `WaveGetLaneIndex()`,
+    /// - `NumSubgroups` is the product of `ep.workgroup_size` divided by the lane
+    ///   count, rounded up,
+    /// - `SubgroupId` is the `__local_invocation_index` field of the input struct
+    ///   divided by the lane count.
+    ///
+    /// Every other member is read as `<ep_input.arg_name>.<fake_member.name>`.
+    fn write_ep_argument_initialization(
+        &mut self,
+        ep: &crate::EntryPoint,
+        ep_input: &EntryPointBinding,
+        fake_member: &EpStructMember,
+    ) -> BackendResult {
+        let input_name = &ep_input.arg_name;
+
+        // Pull out the built-in (if any) so the dispatch below stays flat.
+        let builtin = match fake_member.binding {
+            Some(crate::Binding::BuiltIn(builtin)) => Some(builtin),
+            _ => None,
+        };
+
+        match builtin {
+            Some(crate::BuiltIn::SubgroupSize) => {
+                write!(self.out, "WaveGetLaneCount()")?;
+            }
+            Some(crate::BuiltIn::SubgroupInvocationId) => {
+                write!(self.out, "WaveGetLaneIndex()")?;
+            }
+            Some(crate::BuiltIn::NumSubgroups) => {
+                // Total invocations per workgroup, known when the shader is written.
+                let invocations =
+                    ep.workgroup_size[0] * ep.workgroup_size[1] * ep.workgroup_size[2];
+                write!(
+                    self.out,
+                    "({invocations}u + WaveGetLaneCount() - 1u) / WaveGetLaneCount()"
+                )?;
+            }
+            Some(crate::BuiltIn::SubgroupId) => {
+                write!(
+                    self.out,
+                    "{input_name}.__local_invocation_index / WaveGetLaneCount()"
+                )?;
+            }
+            // Locations, unbound members and all remaining built-ins are plain
+            // fields of the input struct.
+            _ => {
+                write!(self.out, "{}.{}", input_name, fake_member.name)?;
+            }
+        }
+        Ok(())
+    }
+
    /// Write an entry point preface that initializes the arguments as specified in IR.
    fn write_ep_arguments_initialization(
        &mut self,
@ -584,6 +674,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
        func: &crate::Function,
        ep_index: u16,
    ) -> BackendResult {
+        let ep = &module.entry_points[ep_index as usize];
        let ep_input = match self.entry_point_io[ep_index as usize].input.take() {
            Some(ep_input) => ep_input,
            None => return Ok(()),
@ -597,8 +688,13 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
            match module.types[arg.ty].inner {
                TypeInner::Array { base, size, .. } => {
                    self.write_array_size(module, base, size)?;
-                    let fake_member = fake_iter.next().unwrap();
-                    writeln!(self.out, " = {}.{};", ep_input.arg_name, fake_member.name)?;
+                    write!(self.out, " = ")?;
+                    self.write_ep_argument_initialization(
+                        ep,
+                        &ep_input,
+                        fake_iter.next().unwrap(),
+                    )?;
+                    writeln!(self.out, ";")?;
                }
                TypeInner::Struct { ref members, .. } => {
                    write!(self.out, " = {{ ")?;
@ -606,14 +702,22 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                        if index != 0 {
                            write!(self.out, ", ")?;
                        }
-                        let fake_member = fake_iter.next().unwrap();
-                        write!(self.out, "{}.{}", ep_input.arg_name, fake_member.name)?;
+                        self.write_ep_argument_initialization(
+                            ep,
+                            &ep_input,
+                            fake_iter.next().unwrap(),
+                        )?;
                    }
                    writeln!(self.out, " }};")?;
                }
                _ => {
-                    let fake_member = fake_iter.next().unwrap();
-                    writeln!(self.out, " = {}.{};", ep_input.arg_name, fake_member.name)?;
+                    write!(self.out, " = ")?;
+                    self.write_ep_argument_initialization(
+                        ep,
+                        &ep_input,
+                        fake_iter.next().unwrap(),
+                    )?;
+                    writeln!(self.out, ";")?;
                }
            }
        }
@ -894,7 +998,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                    columns,
                    scalar,
                } if member.binding.is_none() && rows == crate::VectorSize::Bi => {
-                    let vec_ty = crate::TypeInner::Vector { size: rows, scalar };
+                    let vec_ty = TypeInner::Vector { size: rows, scalar };
                    let field_name_key = NameKey::StructMember(handle, index as u32);

                    for i in 0..columns as u8 {
@ -928,9 +1032,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                }
            }

-            if let Some(ref binding) = member.binding {
-                self.write_semantic(binding, shader_stage)?;
-            };
+            self.write_semantic(&member.binding, shader_stage)?;
            writeln!(self.out, ";")?;
        }

@ -1143,7 +1245,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
            }
            back::FunctionType::EntryPoint(ep_index) => {
                if let Some(ref ep_input) = self.entry_point_io[ep_index as usize].input {
-                    write!(self.out, "{} {}", ep_input.ty_name, ep_input.arg_name,)?;
+                    write!(self.out, "{} {}", ep_input.ty_name, ep_input.arg_name)?;
                } else {
                    let stage = module.entry_points[ep_index as usize].stage;
                    for (index, arg) in func.arguments.iter().enumerate() {
@ -1160,17 +1262,16 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                            self.write_array_size(module, base, size)?;
                        }

-                        if let Some(ref binding) = arg.binding {
-                            self.write_semantic(binding, Some((stage, Io::Input)))?;
-                        }
+                        self.write_semantic(&arg.binding, Some((stage, Io::Input)))?;
                    }
-
-                    if need_workgroup_variables_initialization {
-                        if !func.arguments.is_empty() {
-                            write!(self.out, ", ")?;
-                        }
-                        write!(self.out, "uint3 __local_invocation_id : SV_GroupThreadID")?;
+                }
+                if need_workgroup_variables_initialization {
+                    if self.entry_point_io[ep_index as usize].input.is_some()
+                        || !func.arguments.is_empty()
+                    {
+                        write!(self.out, ", ")?;
                    }
+                    write!(self.out, "uint3 __local_invocation_id : SV_GroupThreadID")?;
                }
            }
        }
@ -1180,11 +1281,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
        // Write semantic if it present
        if let back::FunctionType::EntryPoint(index) = func_ctx.ty {
            let stage = module.entry_points[index as usize].stage;
-            if let Some(crate::FunctionResult {
-                binding: Some(ref binding),
-                ..
-            }) = func.result
-            {
+            if let Some(crate::FunctionResult { ref binding, .. }) = func.result {
                self.write_semantic(binding, Some((stage, Io::Output)))?;
            }
        }
@ -1984,6 +2081,129 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                writeln!(self.out, "{level}}}")?
            }
            Statement::RayQuery { .. } => unreachable!(),
+            Statement::SubgroupBallot { result, predicate } => {
+                write!(self.out, "{level}")?;
+                let name = format!("{}{}", back::BAKE_PREFIX, result.index());
+                write!(self.out, "const uint4 {name} = ")?;
+                self.named_expressions.insert(result, name);
+
+                write!(self.out, "WaveActiveBallot(")?;
+                match predicate {
+                    Some(predicate) => self.write_expr(module, predicate, func_ctx)?,
+                    None => write!(self.out, "true")?,
+                }
+                writeln!(self.out, ");")?;
+            }
+            Statement::SubgroupCollectiveOperation {
+                op,
+                collective_op,
+                argument,
+                result,
+            } => {
+                write!(self.out, "{level}")?;
+                write!(self.out, "const ")?;
+                let name = format!("{}{}", back::BAKE_PREFIX, result.index());
+                match func_ctx.info[result].ty {
+                    proc::TypeResolution::Handle(handle) => self.write_type(module, handle)?,
+                    proc::TypeResolution::Value(ref value) => {
+                        self.write_value_type(module, value)?
+                    }
+                };
+                write!(self.out, " {name} = ")?;
+                self.named_expressions.insert(result, name);
+
+                match (collective_op, op) {
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => {
+                        write!(self.out, "WaveActiveAllTrue(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => {
+                        write!(self.out, "WaveActiveAnyTrue(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => {
+                        write!(self.out, "WaveActiveSum(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => {
+                        write!(self.out, "WaveActiveProduct(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => {
+                        write!(self.out, "WaveActiveMax(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => {
+                        write!(self.out, "WaveActiveMin(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => {
+                        write!(self.out, "WaveActiveBitAnd(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => {
+                        write!(self.out, "WaveActiveBitOr(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => {
+                        write!(self.out, "WaveActiveBitXor(")?
+                    }
+                    (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Add) => {
+                        write!(self.out, "WavePrefixSum(")?
+                    }
+                    (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Mul) => {
+                        write!(self.out, "WavePrefixProduct(")?
+                    }
+                    (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Add) => {
+                        self.write_expr(module, argument, func_ctx)?;
+                        write!(self.out, " + WavePrefixSum(")?;
+                    }
+                    (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Mul) => {
+                        self.write_expr(module, argument, func_ctx)?;
+                        write!(self.out, " * WavePrefixProduct(")?;
+                    }
+                    _ => unimplemented!(),
+                }
+                self.write_expr(module, argument, func_ctx)?;
+                writeln!(self.out, ");")?;
+            }
+            Statement::SubgroupGather {
+                mode,
+                argument,
+                result,
+            } => {
+                write!(self.out, "{level}")?;
+                write!(self.out, "const ")?;
+                let name = format!("{}{}", back::BAKE_PREFIX, result.index());
+                match func_ctx.info[result].ty {
+                    proc::TypeResolution::Handle(handle) => self.write_type(module, handle)?,
+                    proc::TypeResolution::Value(ref value) => {
+                        self.write_value_type(module, value)?
+                    }
+                };
+                write!(self.out, " {name} = ")?;
+                self.named_expressions.insert(result, name);
+
+                if matches!(mode, crate::GatherMode::BroadcastFirst) {
+                    write!(self.out, "WaveReadLaneFirst(")?;
+                    self.write_expr(module, argument, func_ctx)?;
+                } else {
+                    write!(self.out, "WaveReadLaneAt(")?;
+                    self.write_expr(module, argument, func_ctx)?;
+                    write!(self.out, ", ")?;
+                    match mode {
+                        crate::GatherMode::BroadcastFirst => unreachable!(),
+                        crate::GatherMode::Broadcast(index) | crate::GatherMode::Shuffle(index) => {
+                            self.write_expr(module, index, func_ctx)?;
+                        }
+                        crate::GatherMode::ShuffleDown(index) => {
+                            write!(self.out, "WaveGetLaneIndex() + ")?;
+                            self.write_expr(module, index, func_ctx)?;
+                        }
+                        crate::GatherMode::ShuffleUp(index) => {
+                            write!(self.out, "WaveGetLaneIndex() - ")?;
+                            self.write_expr(module, index, func_ctx)?;
+                        }
+                        crate::GatherMode::ShuffleXor(index) => {
+                            write!(self.out, "WaveGetLaneIndex() ^ ")?;
+                            self.write_expr(module, index, func_ctx)?;
+                        }
+                    }
+                }
+                writeln!(self.out, ");")?;
+            }
        }

        Ok(())
@ -1997,7 +2217,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
        self.write_possibly_const_expression(
            module,
            expr,
-            &module.const_expressions,
+            &module.global_expressions,
            |writer, expr| writer.write_const_expression(module, expr),
        )
    }
@ -2039,7 +2259,10 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                    self.write_const_expression(module, constant.init)?;
                }
            }
-            Expression::ZeroValue(ty) => self.write_default_init(module, ty)?,
+            Expression::ZeroValue(ty) => {
+                self.write_wrapped_zero_value_function_name(module, WrappedZeroValue { ty })?;
+                write!(self.out, "()")?;
+            }
            Expression::Compose { ty, ref components } => {
                match module.types[ty].inner {
                    TypeInner::Struct { .. } | TypeInner::Array { .. } => {
@ -2140,6 +2363,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                    |writer, expr| writer.write_expr(module, expr, func_ctx),
                )?;
            }
+            Expression::Override(_) => return Err(Error::Override),
            // All of the multiplication can be expressed as `mul`,
            // except vector * vector, which needs to use the "*" operator.
            Expression::Binary {
@ -2177,7 +2401,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                left,
                right,
            } if func_ctx.resolve_type(left, &module.types).scalar_kind()
-                == Some(crate::ScalarKind::Float) =>
+                == Some(ScalarKind::Float) =>
            {
                write!(self.out, "fmod(")?;
                self.write_expr(module, left, func_ctx)?;
@ -2188,7 +2412,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
            Expression::Binary { op, left, right } => {
                write!(self.out, "(")?;
                self.write_expr(module, left, func_ctx)?;
-                write!(self.out, " {} ", crate::back::binary_operation_str(op))?;
+                write!(self.out, " {} ", back::binary_operation_str(op))?;
                self.write_expr(module, right, func_ctx)?;
                write!(self.out, ")")?;
            }
@ -2588,7 +2812,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                        true
                    }
                    None => {
-                        if inner.scalar_width() == Some(64) {
+                        if inner.scalar_width() == Some(8) {
                            false
                        } else {
                            write!(self.out, "{}(", kind.to_hlsl_cast(),)?;
@ -2618,11 +2842,15 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                    Pack2x16unorm,
                    Pack4x8snorm,
                    Pack4x8unorm,
+                    Pack4xI8,
+                    Pack4xU8,
                    Unpack2x16float,
                    Unpack2x16snorm,
                    Unpack2x16unorm,
                    Unpack4x8snorm,
                    Unpack4x8unorm,
+                    Unpack4xI8,
+                    Unpack4xU8,
                    Regular(&'static str),
                    MissingIntOverload(&'static str),
                    MissingIntReturnType(&'static str),
@ -2704,12 +2932,16 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                    Mf::Pack2x16unorm => Function::Pack2x16unorm,
                    Mf::Pack4x8snorm => Function::Pack4x8snorm,
                    Mf::Pack4x8unorm => Function::Pack4x8unorm,
+                    Mf::Pack4xI8 => Function::Pack4xI8,
+                    Mf::Pack4xU8 => Function::Pack4xU8,
                    // Data Unpacking
                    Mf::Unpack2x16float => Function::Unpack2x16float,
                    Mf::Unpack2x16snorm => Function::Unpack2x16snorm,
                    Mf::Unpack2x16unorm => Function::Unpack2x16unorm,
                    Mf::Unpack4x8snorm => Function::Unpack4x8snorm,
                    Mf::Unpack4x8unorm => Function::Unpack4x8unorm,
+                    Mf::Unpack4xI8 => Function::Unpack4xI8,
+                    Mf::Unpack4xU8 => Function::Unpack4xU8,
                    _ => return Err(Error::Unimplemented(format!("write_expr_math {fun:?}"))),
                };

@ -2802,6 +3034,24 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                        self.write_expr(module, arg, func_ctx)?;
                        write!(self.out, "[3], 0.0, 1.0) * {scale}.0)) << 24)")?;
                    }
+                    fun @ (Function::Pack4xI8 | Function::Pack4xU8) => {
+                        let was_signed = matches!(fun, Function::Pack4xI8);
+                        if was_signed {
+                            write!(self.out, "uint(")?;
+                        }
+                        write!(self.out, "(")?;
+                        self.write_expr(module, arg, func_ctx)?;
+                        write!(self.out, "[0] & 0xFF) | ((")?;
+                        self.write_expr(module, arg, func_ctx)?;
+                        write!(self.out, "[1] & 0xFF) << 8) | ((")?;
+                        self.write_expr(module, arg, func_ctx)?;
+                        write!(self.out, "[2] & 0xFF) << 16) | ((")?;
+                        self.write_expr(module, arg, func_ctx)?;
+                        write!(self.out, "[3] & 0xFF) << 24)")?;
+                        if was_signed {
+                            write!(self.out, ")")?;
+                        }
+                    }

                    Function::Unpack2x16float => {
                        write!(self.out, "float2(f16tof32(")?;
@ -2854,6 +3104,20 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                        self.write_expr(module, arg, func_ctx)?;
                        write!(self.out, " >> 24) / {scale}.0)")?;
                    }
+                    fun @ (Function::Unpack4xI8 | Function::Unpack4xU8) => {
+                        if matches!(fun, Function::Unpack4xU8) {
+                            write!(self.out, "u")?;
+                        }
+                        write!(self.out, "int4(")?;
+                        self.write_expr(module, arg, func_ctx)?;
+                        write!(self.out, ", ")?;
+                        self.write_expr(module, arg, func_ctx)?;
+                        write!(self.out, " >> 8, ")?;
+                        self.write_expr(module, arg, func_ctx)?;
+                        write!(self.out, " >> 16, ")?;
+                        self.write_expr(module, arg, func_ctx)?;
+                        write!(self.out, " >> 24) << 24 >> 24")?;
+                    }
                    Function::Regular(fun_name) => {
                        write!(self.out, "{fun_name}(")?;
                        self.write_expr(module, arg, func_ctx)?;
@ -3129,7 +3393,9 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
            Expression::CallResult(_)
            | Expression::AtomicResult { .. }
            | Expression::WorkGroupUniformLoadResult { .. }
-            | Expression::RayQueryProceedResult => {}
+            | Expression::RayQueryProceedResult
+            | Expression::SubgroupBallotResult
+            | Expression::SubgroupOperationResult { .. } => {}
        }

        if !closing_bracket.is_empty() {
@ -3179,7 +3445,11 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
    }

    /// Helper function that writes the default zero initialization
-    fn write_default_init(&mut self, module: &Module, ty: Handle<crate::Type>) -> BackendResult {
+    pub(super) fn write_default_init(
+        &mut self,
+        module: &Module,
+        ty: Handle<crate::Type>,
+    ) -> BackendResult {
        write!(self.out, "(")?;
        self.write_type(module, ty)?;
        if let TypeInner::Array { base, size, .. } = module.types[ty].inner {
@ -3196,6 +3466,9 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
        if barrier.contains(crate::Barrier::WORK_GROUP) {
            writeln!(self.out, "{level}GroupMemoryBarrierWithGroupSync();")?;
        }
+        if barrier.contains(crate::Barrier::SUB_GROUP) {
+            // Does not exist in DirectX
+        }
        Ok(())
    }
 }
--- a/naga/src/back/mod.rs
+++ b/naga/src/back/mod.rs
@ -16,14 +16,36 @@ pub mod spv;
 #[cfg(feature = "wgsl-out")]
 pub mod wgsl;

-const COMPONENTS: &[char] = &['x', 'y', 'z', 'w'];
-const INDENT: &str = "    ";
-const BAKE_PREFIX: &str = "_e";
+#[cfg(any(
+    feature = "hlsl-out",
+    feature = "msl-out",
+    feature = "spv-out",
+    feature = "glsl-out"
+))]
+pub mod pipeline_constants;

-type NeedBakeExpressions = crate::FastHashSet<crate::Handle<crate::Expression>>;
+/// Names of vector components.
+pub const COMPONENTS: &[char] = &['x', 'y', 'z', 'w'];
+/// Indent for backends.
+pub const INDENT: &str = "    ";
+/// Prefix used for baking.
+pub const BAKE_PREFIX: &str = "_e";

+/// Expressions that need baking.
+pub type NeedBakeExpressions = crate::FastHashSet<crate::Handle<crate::Expression>>;
+
+/// Specifies the values of pipeline-overridable constants in the shader module.
+///
+/// If an `@id` attribute was specified on the declaration,
+/// the key must be the pipeline constant ID as a decimal ASCII number; if not,
+/// the key must be the constant's identifier name.
+///
+/// The value may represent any of WGSL's concrete scalar types.
+pub type PipelineConstants = std::collections::HashMap<String, f64>;
+
+/// Indentation level.
 #[derive(Clone, Copy)]
-struct Level(usize);
+pub struct Level(pub usize);

 impl Level {
    const fn next(&self) -> Self {
@ -52,7 +74,7 @@ impl std::fmt::Display for Level {
 /// [`EntryPoint`]: crate::EntryPoint
 /// [`Module`]: crate::Module
 /// [`Module::entry_points`]: crate::Module::entry_points
-enum FunctionType {
+pub enum FunctionType {
    /// A regular function.
    Function(crate::Handle<crate::Function>),
    /// An [`EntryPoint`], and its index in [`Module::entry_points`].
@ -63,7 +85,8 @@ enum FunctionType {
 }

 impl FunctionType {
-    fn is_compute_entry_point(&self, module: &crate::Module) -> bool {
+    /// Returns true if the function is an entry point for a compute shader.
+    pub fn is_compute_entry_point(&self, module: &crate::Module) -> bool {
        match *self {
            FunctionType::EntryPoint(index) => {
                module.entry_points[index as usize].stage == crate::ShaderStage::Compute
@ -74,19 +97,20 @@ impl FunctionType {
 }

 /// Helper structure that stores data needed when writing the function
-struct FunctionCtx<'a> {
+pub struct FunctionCtx<'a> {
    /// The current function being written
-    ty: FunctionType,
+    pub ty: FunctionType,
    /// Analysis about the function
-    info: &'a crate::valid::FunctionInfo,
+    pub info: &'a crate::valid::FunctionInfo,
    /// The expression arena of the current function being written
-    expressions: &'a crate::Arena<crate::Expression>,
+    pub expressions: &'a crate::Arena<crate::Expression>,
    /// Map of expressions that have associated variable names
-    named_expressions: &'a crate::NamedExpressions,
+    pub named_expressions: &'a crate::NamedExpressions,
 }

 impl FunctionCtx<'_> {
-    fn resolve_type<'a>(
+    /// Helper method that resolves the type of a given expression.
+    pub fn resolve_type<'a>(
        &'a self,
        handle: crate::Handle<crate::Expression>,
        types: &'a crate::UniqueArena<crate::Type>,
@ -95,7 +119,10 @@ impl FunctionCtx<'_> {
    }

    /// Helper method that generates a [`NameKey`](crate::proc::NameKey) for a local in the current function
-    const fn name_key(&self, local: crate::Handle<crate::LocalVariable>) -> crate::proc::NameKey {
+    pub const fn name_key(
+        &self,
+        local: crate::Handle<crate::LocalVariable>,
+    ) -> crate::proc::NameKey {
        match self.ty {
            FunctionType::Function(handle) => crate::proc::NameKey::FunctionLocal(handle, local),
            FunctionType::EntryPoint(idx) => crate::proc::NameKey::EntryPointLocal(idx, local),
@ -106,7 +133,7 @@ impl FunctionCtx<'_> {
    ///
    /// # Panics
    /// - If the number of function arguments is less than or equal to `arg`
-    const fn argument_key(&self, arg: u32) -> crate::proc::NameKey {
+    pub const fn argument_key(&self, arg: u32) -> crate::proc::NameKey {
        match self.ty {
            FunctionType::Function(handle) => crate::proc::NameKey::FunctionArgument(handle, arg),
            FunctionType::EntryPoint(ep_index) => {
@ -115,8 +142,8 @@ impl FunctionCtx<'_> {
        }
    }

-    // Returns true if the given expression points to a fixed-function pipeline input.
-    fn is_fixed_function_input(
+    /// Returns true if the given expression points to a fixed-function pipeline input.
+    pub fn is_fixed_function_input(
        &self,
        mut expression: crate::Handle<crate::Expression>,
        module: &crate::Module,
@ -162,7 +189,7 @@ impl crate::Expression {
    /// See the [module-level documentation][emit] for details.
    ///
    /// [emit]: index.html#expression-evaluation-time
-    const fn bake_ref_count(&self) -> usize {
+    pub const fn bake_ref_count(&self) -> usize {
        match *self {
            // accesses are never cached, only loads are
            crate::Expression::Access { .. } | crate::Expression::AccessIndex { .. } => usize::MAX,
@ -181,9 +208,7 @@ impl crate::Expression {
 }

 /// Helper function that returns the string corresponding to the [`BinaryOperator`](crate::BinaryOperator)
-/// # Notes
-/// Used by `glsl-out`, `msl-out`, `wgsl-out`, `hlsl-out`.
-const fn binary_operation_str(op: crate::BinaryOperator) -> &'static str {
+pub const fn binary_operation_str(op: crate::BinaryOperator) -> &'static str {
    use crate::BinaryOperator as Bo;
    match op {
        Bo::Add => "+",
@ -208,8 +233,6 @@ const fn binary_operation_str(op: crate::BinaryOperator) -> &'static str {
 }

 /// Helper function that returns the string corresponding to the [`VectorSize`](crate::VectorSize)
-/// # Notes
-/// Used by `msl-out`, `wgsl-out`, `hlsl-out`.
 const fn vector_size_str(size: crate::VectorSize) -> &'static str {
    match size {
        crate::VectorSize::Bi => "2",
@ -219,7 +242,8 @@ const fn vector_size_str(size: crate::VectorSize) -> &'static str {
 }

 impl crate::TypeInner {
-    const fn is_handle(&self) -> bool {
+    /// Returns true if this is a handle type, i.e. an image or a sampler.
+    pub const fn is_handle(&self) -> bool {
        match *self {
            crate::TypeInner::Image { .. } | crate::TypeInner::Sampler { .. } => true,
            _ => false,
@ -266,8 +290,9 @@ bitflags::bitflags! {
    }
 }

+/// The intersection test to use for ray queries.
 #[repr(u32)]
-enum RayIntersectionType {
+pub enum RayIntersectionType {
    Triangle = 1,
    BoundingBox = 4,
 }
--- a/naga/src/back/msl/mod.rs
+++ b/naga/src/back/msl/mod.rs
@ -143,6 +143,8 @@ pub enum Error {
    UnsupportedArrayOfType(Handle<crate::Type>),
    #[error("ray tracing is not supported prior to MSL 2.3")]
    UnsupportedRayTracing,
+    #[error("overrides should not be present at this stage")]
+    Override,
 }

 #[derive(Clone, Debug, PartialEq, thiserror::Error)]
@ -221,7 +223,7 @@ impl Default for Options {
 }

 /// A subset of options that are meant to be changed per pipeline.
-#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
+#[derive(Debug, Default, Clone)]
 #[cfg_attr(feature = "serialize", derive(serde::Serialize))]
 #[cfg_attr(feature = "deserialize", derive(serde::Deserialize))]
 pub struct PipelineOptions {
@ -434,6 +436,11 @@ impl ResolvedBinding {
                    Bi::WorkGroupId => "threadgroup_position_in_grid",
                    Bi::WorkGroupSize => "dispatch_threads_per_threadgroup",
                    Bi::NumWorkGroups => "threadgroups_per_grid",
+                    // subgroup
+                    Bi::NumSubgroups => "simdgroups_per_threadgroup",
+                    Bi::SubgroupId => "simdgroup_index_in_threadgroup",
+                    Bi::SubgroupSize => "threads_per_simdgroup",
+                    Bi::SubgroupInvocationId => "thread_index_in_simdgroup",
                    Bi::CullDistance | Bi::ViewIndex => {
                        return Err(Error::UnsupportedBuiltIn(built_in))
                    }
@ -526,7 +533,7 @@ pub fn write_string(
    options: &Options,
    pipeline_options: &PipelineOptions,
 ) -> Result<(String, TranslationInfo), Error> {
-    let mut w = writer::Writer::new(String::new());
+    let mut w = Writer::new(String::new());
    let info = w.write(module, info, options, pipeline_options)?;
    Ok((w.finish(), info))
 }
@ -536,3 +543,21 @@ fn test_error_size() {
    use std::mem::size_of;
    assert_eq!(size_of::<Error>(), 32);
 }
+
+impl crate::AtomicFunction {
+    /// Returns the MSL operation name fragment for this atomic function,
+    /// e.g. `fetch_add` for [`Self::Add`] or `exchange` for a plain exchange.
+    ///
+    /// NOTE(review): presumably the result is spliced into
+    /// `atomic_{key}_explicit` by the MSL writer — confirm at the call site.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::FeatureNotImplemented`] for a compare-exchange
+    /// (`Exchange` with `compare: Some(_)`), which this mapping does not cover.
+    fn to_msl(self) -> Result<&'static str, Error> {
+        Ok(match self {
+            Self::Add => "fetch_add",
+            Self::Subtract => "fetch_sub",
+            Self::And => "fetch_and",
+            Self::InclusiveOr => "fetch_or",
+            Self::ExclusiveOr => "fetch_xor",
+            Self::Min => "fetch_min",
+            Self::Max => "fetch_max",
+            Self::Exchange { compare: None } => "exchange",
+            // `Err(..)?` propagates the error out of `to_msl` immediately, so
+            // this arm never produces a name for the surrounding `Ok(..)`.
+            Self::Exchange { compare: Some(_) } => Err(Error::FeatureNotImplemented(
+                "atomic CompareExchange".to_string(),
+            ))?,
+        })
+    }
+}
--- a/naga/src/back/msl/writer.rs
+++ b/naga/src/back/msl/writer.rs
@ -1131,21 +1131,10 @@ impl<W: Write> Writer<W> {
        Ok(())
    }

-    fn put_atomic_fetch(
-        &mut self,
-        pointer: Handle<crate::Expression>,
-        key: &str,
-        value: Handle<crate::Expression>,
-        context: &ExpressionContext,
-    ) -> BackendResult {
-        self.put_atomic_operation(pointer, "fetch_", key, value, context)
-    }
-
    fn put_atomic_operation(
        &mut self,
        pointer: Handle<crate::Expression>,
-        key1: &str,
-        key2: &str,
+        key: &str,
        value: Handle<crate::Expression>,
        context: &ExpressionContext,
    ) -> BackendResult {
@ -1163,7 +1152,7 @@ impl<W: Write> Writer<W> {

        write!(
            self.out,
-            "{NAMESPACE}::atomic_{key1}{key2}_explicit({ATOMIC_REFERENCE}"
+            "{NAMESPACE}::atomic_{key}_explicit({ATOMIC_REFERENCE}"
        )?;
        self.put_access_chain(pointer, policy, context)?;
        write!(self.out, ", ")?;
@ -1248,7 +1237,7 @@ impl<W: Write> Writer<W> {
    ) -> BackendResult {
        self.put_possibly_const_expression(
            expr_handle,
-            &module.const_expressions,
+            &module.global_expressions,
            module,
            mod_info,
            &(module, mod_info),
@ -1431,6 +1420,7 @@ impl<W: Write> Writer<W> {
                    |writer, context, expr| writer.put_expression(expr, context, true),
                )?;
            }
+            crate::Expression::Override(_) => return Err(Error::Override),
            crate::Expression::Access { base, .. }
            | crate::Expression::AccessIndex { base, .. } => {
                // This is an acceptable place to generate a `ReadZeroSkipWrite` check.
@ -1606,7 +1596,7 @@ impl<W: Write> Writer<W> {
                write!(self.out, ")")?;
            }
            crate::Expression::Binary { op, left, right } => {
-                let op_str = crate::back::binary_operation_str(op);
+                let op_str = back::binary_operation_str(op);
                let kind = context
                    .resolve_type(left)
                    .scalar_kind()
@ -1838,12 +1828,16 @@ impl<W: Write> Writer<W> {
                    Mf::Pack2x16snorm => "pack_float_to_snorm2x16",
                    Mf::Pack2x16unorm => "pack_float_to_unorm2x16",
                    Mf::Pack2x16float => "",
+                    Mf::Pack4xI8 => "",
+                    Mf::Pack4xU8 => "",
                    // data unpacking
                    Mf::Unpack4x8snorm => "unpack_snorm4x8_to_float",
                    Mf::Unpack4x8unorm => "unpack_unorm4x8_to_float",
                    Mf::Unpack2x16snorm => "unpack_snorm2x16_to_float",
                    Mf::Unpack2x16unorm => "unpack_unorm2x16_to_float",
                    Mf::Unpack2x16float => "",
+                    Mf::Unpack4xI8 => "",
+                    Mf::Unpack4xU8 => "",
                };

                match fun {
@ -1863,133 +1857,177 @@ impl<W: Write> Writer<W> {
                    _ => {}
                }

-                if fun == Mf::Distance && scalar_argument {
-                    write!(self.out, "{NAMESPACE}::abs(")?;
-                    self.put_expression(arg, context, false)?;
-                    write!(self.out, " - ")?;
-                    self.put_expression(arg1.unwrap(), context, false)?;
-                    write!(self.out, ")")?;
-                } else if fun == Mf::FindLsb {
-                    let scalar = context.resolve_type(arg).scalar().unwrap();
-                    let constant = scalar.width * 8 + 1;
+                match fun {
+                    Mf::Distance if scalar_argument => {
+                        write!(self.out, "{NAMESPACE}::abs(")?;
+                        self.put_expression(arg, context, false)?;
+                        write!(self.out, " - ")?;
+                        self.put_expression(arg1.unwrap(), context, false)?;
+                        write!(self.out, ")")?;
+                    }
+                    Mf::FindLsb => {
+                        let scalar = context.resolve_type(arg).scalar().unwrap();
+                        let constant = scalar.width * 8 + 1;

-                    write!(self.out, "((({NAMESPACE}::ctz(")?;
-                    self.put_expression(arg, context, true)?;
-                    write!(self.out, ") + 1) % {constant}) - 1)")?;
-                } else if fun == Mf::FindMsb {
-                    let inner = context.resolve_type(arg);
-                    let scalar = inner.scalar().unwrap();
-                    let constant = scalar.width * 8 - 1;
-
-                    write!(
-                        self.out,
-                        "{NAMESPACE}::select({constant} - {NAMESPACE}::clz("
-                    )?;
-
-                    if scalar.kind == crate::ScalarKind::Sint {
-                        write!(self.out, "{NAMESPACE}::select(")?;
+                        write!(self.out, "((({NAMESPACE}::ctz(")?;
                        self.put_expression(arg, context, true)?;
-                        write!(self.out, ", ~")?;
+                        write!(self.out, ") + 1) % {constant}) - 1)")?;
+                    }
+                    Mf::FindMsb => {
+                        let inner = context.resolve_type(arg);
+                        let scalar = inner.scalar().unwrap();
+                        let constant = scalar.width * 8 - 1;
+
+                        write!(
+                            self.out,
+                            "{NAMESPACE}::select({constant} - {NAMESPACE}::clz("
+                        )?;
+
+                        if scalar.kind == crate::ScalarKind::Sint {
+                            write!(self.out, "{NAMESPACE}::select(")?;
+                            self.put_expression(arg, context, true)?;
+                            write!(self.out, ", ~")?;
+                            self.put_expression(arg, context, true)?;
+                            write!(self.out, ", ")?;
+                            self.put_expression(arg, context, true)?;
+                            write!(self.out, " < 0)")?;
+                        } else {
+                            self.put_expression(arg, context, true)?;
+                        }
+
+                        write!(self.out, "), ")?;
+
+                        // or metal will complain that select is ambiguous
+                        match *inner {
+                            crate::TypeInner::Vector { size, scalar } => {
+                                let size = back::vector_size_str(size);
+                                let name = scalar.to_msl_name();
+                                write!(self.out, "{name}{size}")?;
+                            }
+                            crate::TypeInner::Scalar(scalar) => {
+                                let name = scalar.to_msl_name();
+                                write!(self.out, "{name}")?;
+                            }
+                            _ => (),
+                        }
+
+                        write!(self.out, "(-1), ")?;
+                        self.put_expression(arg, context, true)?;
+                        write!(self.out, " == 0 || ")?;
+                        self.put_expression(arg, context, true)?;
+                        write!(self.out, " == -1)")?;
+                    }
+                    Mf::Unpack2x16float => {
+                        write!(self.out, "float2(as_type<half2>(")?;
+                        self.put_expression(arg, context, false)?;
+                        write!(self.out, "))")?;
+                    }
+                    Mf::Pack2x16float => {
+                        write!(self.out, "as_type<uint>(half2(")?;
+                        self.put_expression(arg, context, false)?;
+                        write!(self.out, "))")?;
+                    }
+                    Mf::ExtractBits => {
+                        // The behavior of ExtractBits is undefined when offset + count > bit_width. We need
+                        // to sanitize the offset and count first. If we don't do this, Apple chips
+                        // will return out-of-spec values if the extracted range is not within the bit width.
+                        //
+                        // This encodes the exact formula specified by the wgsl spec, without temporary values:
+                        // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin
+                        //
+                        // w = sizeof(x) * 8
+                        // o = min(offset, w)
+                        // tmp = w - o
+                        // c = min(count, tmp)
+                        //
+                        // bitfieldExtract(x, o, c)
+                        //
+                        // extract_bits(e, min(offset, w), min(count, w - min(offset, w)))
+
+                        let scalar_bits = context.resolve_type(arg).scalar_width().unwrap() * 8;
+
+                        write!(self.out, "{NAMESPACE}::extract_bits(")?;
+                        self.put_expression(arg, context, true)?;
+                        write!(self.out, ", {NAMESPACE}::min(")?;
+                        self.put_expression(arg1.unwrap(), context, true)?;
+                        write!(self.out, ", {scalar_bits}u), {NAMESPACE}::min(")?;
+                        self.put_expression(arg2.unwrap(), context, true)?;
+                        write!(self.out, ", {scalar_bits}u - {NAMESPACE}::min(")?;
+                        self.put_expression(arg1.unwrap(), context, true)?;
+                        write!(self.out, ", {scalar_bits}u)))")?;
+                    }
+                    Mf::InsertBits => {
+                        // The behavior of InsertBits has the same issue as ExtractBits.
+                        //
+                        // insertBits(e, newBits, min(offset, w), min(count, w - min(offset, w)))
+
+                        let scalar_bits = context.resolve_type(arg).scalar_width().unwrap() * 8;
+
+                        write!(self.out, "{NAMESPACE}::insert_bits(")?;
+                        self.put_expression(arg, context, true)?;
+                        write!(self.out, ", ")?;
+                        self.put_expression(arg1.unwrap(), context, true)?;
+                        write!(self.out, ", {NAMESPACE}::min(")?;
+                        self.put_expression(arg2.unwrap(), context, true)?;
+                        write!(self.out, ", {scalar_bits}u), {NAMESPACE}::min(")?;
+                        self.put_expression(arg3.unwrap(), context, true)?;
+                        write!(self.out, ", {scalar_bits}u - {NAMESPACE}::min(")?;
+                        self.put_expression(arg2.unwrap(), context, true)?;
+                        write!(self.out, ", {scalar_bits}u)))")?;
+                    }
+                    Mf::Radians => {
+                        write!(self.out, "((")?;
+                        self.put_expression(arg, context, false)?;
+                        write!(self.out, ") * 0.017453292519943295474)")?;
+                    }
+                    Mf::Degrees => {
+                        write!(self.out, "((")?;
+                        self.put_expression(arg, context, false)?;
+                        write!(self.out, ") * 57.295779513082322865)")?;
+                    }
+                    Mf::Modf | Mf::Frexp => {
+                        write!(self.out, "{fun_name}")?;
+                        self.put_call_parameters(iter::once(arg), context)?;
+                    }
+                    fun @ (Mf::Pack4xI8 | Mf::Pack4xU8) => {
+                        let was_signed = fun == Mf::Pack4xI8;
+                        if was_signed {
+                            write!(self.out, "uint(")?;
+                        }
+                        write!(self.out, "(")?;
+                        self.put_expression(arg, context, true)?;
+                        write!(self.out, "[0] & 0xFF) | ((")?;
+                        self.put_expression(arg, context, true)?;
+                        write!(self.out, "[1] & 0xFF) << 8) | ((")?;
+                        self.put_expression(arg, context, true)?;
+                        write!(self.out, "[2] & 0xFF) << 16) | ((")?;
+                        self.put_expression(arg, context, true)?;
+                        write!(self.out, "[3] & 0xFF) << 24)")?;
+                        if was_signed {
+                            write!(self.out, ")")?;
+                        }
+                    }
+                    fun @ (Mf::Unpack4xI8 | Mf::Unpack4xU8) => {
+                        if matches!(fun, Mf::Unpack4xU8) {
+                            write!(self.out, "u")?;
+                        }
+                        write!(self.out, "int4(")?;
                        self.put_expression(arg, context, true)?;
                        write!(self.out, ", ")?;
                        self.put_expression(arg, context, true)?;
-                        write!(self.out, " < 0)")?;
-                    } else {
+                        write!(self.out, " >> 8, ")?;
                        self.put_expression(arg, context, true)?;
+                        write!(self.out, " >> 16, ")?;
+                        self.put_expression(arg, context, true)?;
+                        write!(self.out, " >> 24) << 24 >> 24")?;
                    }
-
-                    write!(self.out, "), ")?;
-
-                    // or metal will complain that select is ambiguous
-                    match *inner {
-                        crate::TypeInner::Vector { size, scalar } => {
-                            let size = back::vector_size_str(size);
-                            let name = scalar.to_msl_name();
-                            write!(self.out, "{name}{size}")?;
-                        }
-                        crate::TypeInner::Scalar(scalar) => {
-                            let name = scalar.to_msl_name();
-                            write!(self.out, "{name}")?;
-                        }
-                        _ => (),
+                    _ => {
+                        write!(self.out, "{NAMESPACE}::{fun_name}")?;
+                        self.put_call_parameters(
+                            iter::once(arg).chain(arg1).chain(arg2).chain(arg3),
+                            context,
+                        )?;
                    }
-
-                    write!(self.out, "(-1), ")?;
-                    self.put_expression(arg, context, true)?;
-                    write!(self.out, " == 0 || ")?;
-                    self.put_expression(arg, context, true)?;
-                    write!(self.out, " == -1)")?;
-                } else if fun == Mf::Unpack2x16float {
-                    write!(self.out, "float2(as_type<half2>(")?;
-                    self.put_expression(arg, context, false)?;
-                    write!(self.out, "))")?;
-                } else if fun == Mf::Pack2x16float {
-                    write!(self.out, "as_type<uint>(half2(")?;
-                    self.put_expression(arg, context, false)?;
-                    write!(self.out, "))")?;
-                } else if fun == Mf::ExtractBits {
-                    // The behavior of ExtractBits is undefined when offset + count > bit_width. We need
-                    // to first sanitize the offset and count first. If we don't do this, Apple chips
-                    // will return out-of-spec values if the extracted range is not within the bit width.
-                    //
-                    // This encodes the exact formula specified by the wgsl spec, without temporary values:
-                    // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin
-                    //
-                    // w = sizeof(x) * 8
-                    // o = min(offset, w)
-                    // tmp = w - o
-                    // c = min(count, tmp)
-                    //
-                    // bitfieldExtract(x, o, c)
-                    //
-                    // extract_bits(e, min(offset, w), min(count, w - min(offset, w))))
-
-                    let scalar_bits = context.resolve_type(arg).scalar_width().unwrap();
-
-                    write!(self.out, "{NAMESPACE}::extract_bits(")?;
-                    self.put_expression(arg, context, true)?;
-                    write!(self.out, ", {NAMESPACE}::min(")?;
-                    self.put_expression(arg1.unwrap(), context, true)?;
-                    write!(self.out, ", {scalar_bits}u), {NAMESPACE}::min(")?;
-                    self.put_expression(arg2.unwrap(), context, true)?;
-                    write!(self.out, ", {scalar_bits}u - {NAMESPACE}::min(")?;
-                    self.put_expression(arg1.unwrap(), context, true)?;
-                    write!(self.out, ", {scalar_bits}u)))")?;
-                } else if fun == Mf::InsertBits {
-                    // The behavior of InsertBits has the same issue as ExtractBits.
-                    //
-                    // insertBits(e, newBits, min(offset, w), min(count, w - min(offset, w))))
-
-                    let scalar_bits = context.resolve_type(arg).scalar_width().unwrap();
-
-                    write!(self.out, "{NAMESPACE}::insert_bits(")?;
-                    self.put_expression(arg, context, true)?;
-                    write!(self.out, ", ")?;
-                    self.put_expression(arg1.unwrap(), context, true)?;
-                    write!(self.out, ", {NAMESPACE}::min(")?;
-                    self.put_expression(arg2.unwrap(), context, true)?;
-                    write!(self.out, ", {scalar_bits}u), {NAMESPACE}::min(")?;
-                    self.put_expression(arg3.unwrap(), context, true)?;
-                    write!(self.out, ", {scalar_bits}u - {NAMESPACE}::min(")?;
-                    self.put_expression(arg2.unwrap(), context, true)?;
-                    write!(self.out, ", {scalar_bits}u)))")?;
-                } else if fun == Mf::Radians {
-                    write!(self.out, "((")?;
-                    self.put_expression(arg, context, false)?;
-                    write!(self.out, ") * 0.017453292519943295474)")?;
-                } else if fun == Mf::Degrees {
-                    write!(self.out, "((")?;
-                    self.put_expression(arg, context, false)?;
-                    write!(self.out, ") * 57.295779513082322865)")?;
-                } else if fun == Mf::Modf || fun == Mf::Frexp {
-                    write!(self.out, "{fun_name}")?;
-                    self.put_call_parameters(iter::once(arg), context)?;
-                } else {
-                    write!(self.out, "{NAMESPACE}::{fun_name}")?;
-                    self.put_call_parameters(
-                        iter::once(arg).chain(arg1).chain(arg2).chain(arg3),
-                        context,
-                    )?;
                }
            }
            crate::Expression::As {
@ -2041,6 +2079,8 @@ impl<W: Write> Writer<W> {
            crate::Expression::CallResult(_)
            | crate::Expression::AtomicResult { .. }
            | crate::Expression::WorkGroupUniformLoadResult { .. }
+            | crate::Expression::SubgroupBallotResult
+            | crate::Expression::SubgroupOperationResult { .. }
            | crate::Expression::RayQueryProceedResult => {
                unreachable!()
            }
@ -2607,7 +2647,11 @@ impl<W: Write> Writer<W> {
                            }
                        }
                    }
-                    crate::MathFunction::FindMsb => {
+                    crate::MathFunction::FindMsb
+                    | crate::MathFunction::Pack4xI8
+                    | crate::MathFunction::Pack4xU8
+                    | crate::MathFunction::Unpack4xI8
+                    | crate::MathFunction::Unpack4xU8 => {
                        self.need_bake_expressions.insert(arg);
                    }
                    crate::MathFunction::ExtractBits => {
@ -2994,43 +3038,8 @@ impl<W: Write> Writer<W> {
                    let res_name = format!("{}{}", back::BAKE_PREFIX, result.index());
                    self.start_baking_expression(result, &context.expression, &res_name)?;
                    self.named_expressions.insert(result, res_name);
-                    match *fun {
-                        crate::AtomicFunction::Add => {
-                            self.put_atomic_fetch(pointer, "add", value, &context.expression)?;
-                        }
-                        crate::AtomicFunction::Subtract => {
-                            self.put_atomic_fetch(pointer, "sub", value, &context.expression)?;
-                        }
-                        crate::AtomicFunction::And => {
-                            self.put_atomic_fetch(pointer, "and", value, &context.expression)?;
-                        }
-                        crate::AtomicFunction::InclusiveOr => {
-                            self.put_atomic_fetch(pointer, "or", value, &context.expression)?;
-                        }
-                        crate::AtomicFunction::ExclusiveOr => {
-                            self.put_atomic_fetch(pointer, "xor", value, &context.expression)?;
-                        }
-                        crate::AtomicFunction::Min => {
-                            self.put_atomic_fetch(pointer, "min", value, &context.expression)?;
-                        }
-                        crate::AtomicFunction::Max => {
-                            self.put_atomic_fetch(pointer, "max", value, &context.expression)?;
-                        }
-                        crate::AtomicFunction::Exchange { compare: None } => {
-                            self.put_atomic_operation(
-                                pointer,
-                                "exchange",
-                                "",
-                                value,
-                                &context.expression,
-                            )?;
-                        }
-                        crate::AtomicFunction::Exchange { .. } => {
-                            return Err(Error::FeatureNotImplemented(
-                                "atomic CompareExchange".to_string(),
-                            ));
-                        }
-                    }
+                    let fun_str = fun.to_msl()?;
+                    self.put_atomic_operation(pointer, fun_str, value, &context.expression)?;
                    // done
                    writeln!(self.out, ";")?;
                }
@ -3144,6 +3153,121 @@ impl<W: Write> Writer<W> {
                        }
                    }
                }
+                crate::Statement::SubgroupBallot { result, predicate } => {
+                    write!(self.out, "{level}")?;
+                    let name = self.namer.call("");
+                    self.start_baking_expression(result, &context.expression, &name)?;
+                    self.named_expressions.insert(result, name);
+                    write!(self.out, "uint4((uint64_t){NAMESPACE}::simd_ballot(")?;
+                    if let Some(predicate) = predicate {
+                        self.put_expression(predicate, &context.expression, true)?;
+                    } else {
+                        write!(self.out, "true")?;
+                    }
+                    writeln!(self.out, "), 0, 0, 0);")?;
+                }
+                crate::Statement::SubgroupCollectiveOperation {
+                    op,
+                    collective_op,
+                    argument,
+                    result,
+                } => {
+                    write!(self.out, "{level}")?;
+                    let name = self.namer.call("");
+                    self.start_baking_expression(result, &context.expression, &name)?;
+                    self.named_expressions.insert(result, name);
+                    match (collective_op, op) {
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => {
+                            write!(self.out, "{NAMESPACE}::simd_all(")?
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => {
+                            write!(self.out, "{NAMESPACE}::simd_any(")?
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => {
+                            write!(self.out, "{NAMESPACE}::simd_sum(")?
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => {
+                            write!(self.out, "{NAMESPACE}::simd_product(")?
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => {
+                            write!(self.out, "{NAMESPACE}::simd_max(")?
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => {
+                            write!(self.out, "{NAMESPACE}::simd_min(")?
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => {
+                            write!(self.out, "{NAMESPACE}::simd_and(")?
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => {
+                            write!(self.out, "{NAMESPACE}::simd_or(")?
+                        }
+                        (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => {
+                            write!(self.out, "{NAMESPACE}::simd_xor(")?
+                        }
+                        (
+                            crate::CollectiveOperation::ExclusiveScan,
+                            crate::SubgroupOperation::Add,
+                        ) => write!(self.out, "{NAMESPACE}::simd_prefix_exclusive_sum(")?,
+                        (
+                            crate::CollectiveOperation::ExclusiveScan,
+                            crate::SubgroupOperation::Mul,
+                        ) => write!(self.out, "{NAMESPACE}::simd_prefix_exclusive_product(")?,
+                        (
+                            crate::CollectiveOperation::InclusiveScan,
+                            crate::SubgroupOperation::Add,
+                        ) => write!(self.out, "{NAMESPACE}::simd_prefix_inclusive_sum(")?,
+                        (
+                            crate::CollectiveOperation::InclusiveScan,
+                            crate::SubgroupOperation::Mul,
+                        ) => write!(self.out, "{NAMESPACE}::simd_prefix_inclusive_product(")?,
+                        _ => unimplemented!(),
+                    }
+                    self.put_expression(argument, &context.expression, true)?;
+                    writeln!(self.out, ");")?;
+                }
+                crate::Statement::SubgroupGather {
+                    mode,
+                    argument,
+                    result,
+                } => {
+                    write!(self.out, "{level}")?;
+                    let name = self.namer.call("");
+                    self.start_baking_expression(result, &context.expression, &name)?;
+                    self.named_expressions.insert(result, name);
+                    match mode {
+                        crate::GatherMode::BroadcastFirst => {
+                            write!(self.out, "{NAMESPACE}::simd_broadcast_first(")?;
+                        }
+                        crate::GatherMode::Broadcast(_) => {
+                            write!(self.out, "{NAMESPACE}::simd_broadcast(")?;
+                        }
+                        crate::GatherMode::Shuffle(_) => {
+                            write!(self.out, "{NAMESPACE}::simd_shuffle(")?;
+                        }
+                        crate::GatherMode::ShuffleDown(_) => {
+                            write!(self.out, "{NAMESPACE}::simd_shuffle_down(")?;
+                        }
+                        crate::GatherMode::ShuffleUp(_) => {
+                            write!(self.out, "{NAMESPACE}::simd_shuffle_up(")?;
+                        }
+                        crate::GatherMode::ShuffleXor(_) => {
+                            write!(self.out, "{NAMESPACE}::simd_shuffle_xor(")?;
+                        }
+                    }
+                    self.put_expression(argument, &context.expression, true)?;
+                    match mode {
+                        crate::GatherMode::BroadcastFirst => {}
+                        crate::GatherMode::Broadcast(index)
+                        | crate::GatherMode::Shuffle(index)
+                        | crate::GatherMode::ShuffleDown(index)
+                        | crate::GatherMode::ShuffleUp(index)
+                        | crate::GatherMode::ShuffleXor(index) => {
+                            write!(self.out, ", ")?;
+                            self.put_expression(index, &context.expression, true)?;
+                        }
+                    }
+                    writeln!(self.out, ");")?;
+                }
            }
        }

@ -3220,6 +3344,10 @@ impl<W: Write> Writer<W> {
        options: &Options,
        pipeline_options: &PipelineOptions,
    ) -> Result<TranslationInfo, Error> {
+        if !module.overrides.is_empty() {
+            return Err(Error::Override);
+        }
+
        self.names.clear();
        self.namer.reset(
            module,
@ -3897,7 +4025,7 @@ impl<W: Write> Writer<W> {
            // mapping.
            let mut flattened_member_names = FastHashMap::default();
            // Varyings' members get their own namespace
-            let mut varyings_namer = crate::proc::Namer::default();
+            let mut varyings_namer = proc::Namer::default();

            // List all the Naga `EntryPoint`'s `Function`'s arguments,
            // flattening structs into their members. In Metal, we will pass
@ -4487,6 +4615,12 @@ impl<W: Write> Writer<W> {
                "{level}{NAMESPACE}::threadgroup_barrier({NAMESPACE}::mem_flags::mem_threadgroup);",
            )?;
        }
+        if flags.contains(crate::Barrier::SUB_GROUP) {
+            writeln!(
+                self.out,
+                "{level}{NAMESPACE}::simdgroup_barrier({NAMESPACE}::mem_flags::mem_threadgroup);",
+            )?;
+        }
        Ok(())
    }
 }
@ -4722,7 +4856,7 @@ fn test_stack_size() {
    );
    let _ = module.functions.append(fun, Default::default());
    // analyse the module
-    let info = crate::valid::Validator::new(ValidationFlags::empty(), Capabilities::empty())
+    let info = valid::Validator::new(ValidationFlags::empty(), Capabilities::empty())
        .validate(&module)
        .unwrap();
    // process the module
@ -4757,8 +4891,8 @@ fn test_stack_size() {
        }
        let stack_size = addresses_end - addresses_start;
        // check the size (in debug only)
-        // last observed macOS value: 19152 (CI)
-        if !(9000..=20000).contains(&stack_size) {
+        // last observed macOS value: 22256 (CI)
+        if !(15000..=25000).contains(&stack_size) {
            panic!("`put_block` stack size {stack_size} has changed!");
        }
    }
--- a/naga/src/back/pipeline_constants.rs
+++ b/naga/src/back/pipeline_constants.rs
@ -0,0 +1,957 @@
+use super::PipelineConstants;
+use crate::{
+    proc::{ConstantEvaluator, ConstantEvaluatorError, Emitter},
+    valid::{Capabilities, ModuleInfo, ValidationError, ValidationFlags, Validator},
+    Arena, Block, Constant, Expression, Function, Handle, Literal, Module, Override, Range, Scalar,
+    Span, Statement, TypeInner, WithSpan,
+};
+use std::{borrow::Cow, collections::HashSet, mem};
+use thiserror::Error;
+
+/// Errors that can occur while replacing pipeline-overridable constants
+/// with concrete values.
+#[derive(Error, Debug, Clone)]
+#[cfg_attr(test, derive(PartialEq))]
+pub enum PipelineConstantError {
+    /// An override had neither a caller-supplied value nor an initializer.
+    /// The payload is the key (id or name) that was looked up.
+    #[error("Missing value for pipeline-overridable constant with identifier string: '{0}'")]
+    MissingValue(String),
+    /// The supplied `f64` was NaN or infinite but the destination is numeric.
+    #[error("Source f64 value needs to be finite (NaNs and Infinities are not allowed) for number destinations")]
+    SrcNeedsToBeFinite,
+    /// The supplied `f64` cannot be represented in the destination type.
+    #[error("Source f64 value doesn't fit in destination")]
+    DstRangeTooSmall,
+    /// Constant evaluation failed; the inner error is reported unchanged.
+    #[error(transparent)]
+    ConstantEvaluatorError(#[from] ConstantEvaluatorError),
+    /// Validation failed; the inner error is reported unchanged.
+    #[error(transparent)]
+    ValidationError(#[from] WithSpan<ValidationError>),
+}
+
+/// Replace all overrides in `module` with constants.
+///
+/// If no changes are needed, this just returns `Cow::Borrowed`
+/// references to `module` and `module_info`. Otherwise, it clones
+/// `module`, edits its [`global_expressions`] arena to contain only
+/// fully-evaluated expressions, and returns `Cow::Owned` values
+/// holding the simplified module and its validation results.
+///
+/// In either case, the module returned has an empty `overrides`
+/// arena, and the `global_expressions` arena contains only
+/// fully-evaluated expressions.
+///
+/// # Errors
+///
+/// - [`PipelineConstantError::MissingValue`] if an override has no entry
+///   in `pipeline_constants` and no initializer of its own.
+/// - [`PipelineConstantError::ConstantEvaluatorError`] if evaluating a
+///   global or function-local expression fails.
+/// - [`PipelineConstantError::ValidationError`] if the rewritten module
+///   fails the final re-validation.
+/// - Any error returned while converting a supplied value to a literal
+///   (presumably `SrcNeedsToBeFinite` / `DstRangeTooSmall`; the conversion
+///   helper is defined elsewhere in this file).
+///
+/// [`global_expressions`]: Module::global_expressions
+pub fn process_overrides<'a>(
+    module: &'a Module,
+    module_info: &'a ModuleInfo,
+    pipeline_constants: &PipelineConstants,
+) -> Result<(Cow<'a, Module>, Cow<'a, ModuleInfo>), PipelineConstantError> {
+    // Fast path: nothing to replace, so hand back the inputs untouched.
+    if module.overrides.is_empty() {
+        return Ok((Cow::Borrowed(module), Cow::Borrowed(module_info)));
+    }
+
+    let mut module = module.clone();
+
+    // A map from override handles to the handles of the constants
+    // we've replaced them with.
+    let mut override_map = Vec::with_capacity(module.overrides.len());
+
+    // A map from `module`'s original global expression handles to
+    // handles in the new, simplified global expression arena.
+    let mut adjusted_global_expressions = Vec::with_capacity(module.global_expressions.len());
+
+    // The set of constants whose initializer handles we've already
+    // updated to refer to the newly built global expression arena.
+    //
+    // All constants in `module` must have their `init` handles
+    // updated to point into the new, simplified global expression
+    // arena. Some of these we can most easily handle as a side effect
+    // during the simplification process, but we must handle the rest
+    // in a final fixup pass, guided by `adjusted_global_expressions`. We
+    // add their handles to this set, so that the final fixup step can
+    // leave them alone.
+    let mut adjusted_constant_initializers = HashSet::with_capacity(module.constants.len());
+
+    let mut global_expression_kind_tracker = crate::proc::ExpressionKindTracker::new();
+
+    // An iterator through the original overrides table, consumed in
+    // approximate tandem with the global expressions.
+    let mut override_iter = module.overrides.drain();
+
+    // Do two things in tandem:
+    //
+    // - Rebuild the global expression arena from scratch, fully
+    //   evaluating all expressions, and replacing each `Override`
+    //   expression in `module.global_expressions` with a `Constant`
+    //   expression.
+    //
+    // - Build a new `Constant` in `module.constants` to take the
+    //   place of each `Override`.
+    //
+    // Build a map from old global expression handles to their
+    // fully-evaluated counterparts in `adjusted_global_expressions` as we
+    // go.
+    //
+    // Why in tandem? Overrides refer to expressions, and expressions
+    // refer to overrides, so we can't disentangle the two into
+    // separate phases. However, we can take advantage of the fact
+    // that the overrides and expressions must form a DAG, and work
+    // our way from the leaves to the roots, replacing and evaluating
+    // as we go.
+    //
+    // Although the two loops are nested, this is really two
+    // alternating phases: we adjust and evaluate constant expressions
+    // until we hit an `Override` expression, at which point we switch
+    // to building `Constant`s for `Overrides` until we've handled the
+    // one used by the expression. Then we switch back to processing
+    // expressions. Because we know they form a DAG, we know the
+    // `Override` expressions we encounter can only have initializers
+    // referring to global expressions we've already simplified.
+    for (old_h, expr, span) in module.global_expressions.drain() {
+        let mut expr = match expr {
+            Expression::Override(h) => {
+                // Reuse the constant if this override was already converted;
+                // otherwise convert overrides in order until we reach `h`.
+                let c_h = if let Some(new_h) = override_map.get(h.index()) {
+                    *new_h
+                } else {
+                    let mut new_h = None;
+                    for entry in override_iter.by_ref() {
+                        let stop = entry.0 == h;
+                        new_h = Some(process_override(
+                            entry,
+                            pipeline_constants,
+                            &mut module,
+                            &mut override_map,
+                            &adjusted_global_expressions,
+                            &mut adjusted_constant_initializers,
+                            &mut global_expression_kind_tracker,
+                        )?);
+                        if stop {
+                            break;
+                        }
+                    }
+                    new_h.unwrap()
+                };
+                Expression::Constant(c_h)
+            }
+            Expression::Constant(c_h) => {
+                // `insert` returns `true` only the first time we see `c_h`,
+                // so each constant's initializer is remapped at most once.
+                if adjusted_constant_initializers.insert(c_h) {
+                    let init = &mut module.constants[c_h].init;
+                    *init = adjusted_global_expressions[init.index()];
+                }
+                expr
+            }
+            expr => expr,
+        };
+        let mut evaluator = ConstantEvaluator::for_wgsl_module(
+            &mut module,
+            &mut global_expression_kind_tracker,
+            false,
+        );
+        adjust_expr(&adjusted_global_expressions, &mut expr);
+        let h = evaluator.try_eval_and_append(expr, span)?;
+        // Old handles are visited in index order, so the map stays dense.
+        debug_assert_eq!(old_h.index(), adjusted_global_expressions.len());
+        adjusted_global_expressions.push(h);
+    }
+
+    // Finish processing any overrides we didn't visit in the loop above.
+    for entry in override_iter {
+        process_override(
+            entry,
+            pipeline_constants,
+            &mut module,
+            &mut override_map,
+            &adjusted_global_expressions,
+            &mut adjusted_constant_initializers,
+            &mut global_expression_kind_tracker,
+        )?;
+    }
+
+    // Update the initialization expression handles of all `Constant`s
+    // and `GlobalVariable`s. Skip `Constant`s we'd already updated en
+    // passant.
+    for (_, c) in module
+        .constants
+        .iter_mut()
+        .filter(|&(c_h, _)| !adjusted_constant_initializers.contains(&c_h))
+    {
+        c.init = adjusted_global_expressions[c.init.index()];
+    }
+
+    for (_, v) in module.global_variables.iter_mut() {
+        if let Some(ref mut init) = v.init {
+            *init = adjusted_global_expressions[init.index()];
+        }
+    }
+
+    // Temporarily move the functions out of `module` so that each one can
+    // be borrowed mutably alongside `&mut module`, then put them back.
+    let mut functions = mem::take(&mut module.functions);
+    for (_, function) in functions.iter_mut() {
+        process_function(&mut module, &override_map, function)?;
+    }
+    module.functions = functions;
+
+    // Same take-and-restore dance for the entry points.
+    let mut entry_points = mem::take(&mut module.entry_points);
+    for ep in entry_points.iter_mut() {
+        process_function(&mut module, &override_map, &mut ep.function)?;
+    }
+    module.entry_points = entry_points;
+
+    // Now that we've rewritten all the expressions, we need to
+    // recompute their types and other metadata. For the time being,
+    // do a full re-validation.
+    let mut validator = Validator::new(ValidationFlags::all(), Capabilities::all());
+    let module_info = validator.validate_no_overrides(&module)?;
+
+    Ok((Cow::Owned(module), Cow::Owned(module_info)))
+}
+
+/// Turn the override `old_h` into a [`Constant`] appended to `module`.
+///
+/// The constant's value is taken from `pipeline_constants` when it holds an
+/// entry for the override's key (its `id` rendered as a string if present,
+/// otherwise its name). Failing that, the override's own initializer is
+/// used, translated through `adjusted_global_expressions`. If neither
+/// exists, [`PipelineConstantError::MissingValue`] is returned.
+///
+/// The new constant's handle is pushed onto `override_map`, recorded in
+/// `adjusted_constant_initializers`, and returned.
+fn process_override(
+    (old_h, override_, span): (Handle<Override>, Override, Span),
+    pipeline_constants: &PipelineConstants,
+    module: &mut Module,
+    override_map: &mut Vec<Handle<Constant>>,
+    adjusted_global_expressions: &[Handle<Expression>],
+    adjusted_constant_initializers: &mut HashSet<Handle<Constant>>,
+    global_expression_kind_tracker: &mut crate::proc::ExpressionKindTracker,
+) -> Result<Handle<Constant>, PipelineConstantError> {
+    // Pick the lookup key: an explicit id wins over the name.
+    let key = match (override_.id, override_.name.as_ref()) {
+        (Some(id), _) => Cow::Owned(id.to_string()),
+        (None, Some(name)) => Cow::Borrowed(name),
+        (None, None) => unreachable!(),
+    };
+
+    // Resolve the initializer expression: a caller-supplied value comes
+    // first, then the override's own initializer, else it is an error.
+    let supplied = pipeline_constants.get::<str>(&key);
+    let init = match (supplied, override_.init) {
+        (Some(value), _) => {
+            let literal = match module.types[override_.ty].inner {
+                TypeInner::Scalar(scalar) => map_value_to_literal(*value, scalar)?,
+                _ => unreachable!(),
+            };
+            let literal_h = module
+                .global_expressions
+                .append(Expression::Literal(literal), Span::UNDEFINED);
+            global_expression_kind_tracker.insert(literal_h, crate::proc::ExpressionKind::Const);
+            literal_h
+        }
+        (None, Some(own_init)) => adjusted_global_expressions[own_init.index()],
+        (None, None) => return Err(PipelineConstantError::MissingValue(key.to_string())),
+    };
+
+    // Materialize the constant that stands in for the override from now on.
+    let new_h = module.constants.append(
+        Constant {
+            name: override_.name,
+            ty: override_.ty,
+            init,
+        },
+        span,
+    );
+    debug_assert_eq!(old_h.index(), override_map.len());
+    override_map.push(new_h);
+    adjusted_constant_initializers.insert(new_h);
+    Ok(new_h)
+}
+
+/// Rewrite `function` so that it no longer mentions any override.
+///
+/// Each `Expression::Override(h)` in the function's expression arena becomes
+/// `Expression::Constant(override_map[h.index()])`, and every expression is
+/// then passed through the constant evaluator, so expressions whose values
+/// are now known end up in evaluated form.
+///
+/// Because the arena is rebuilt, every handle stored in the body, the local
+/// variable initializers and `named_expressions` is remapped afterwards.
+fn process_function(
+    module: &mut Module,
+    override_map: &[Handle<Constant>],
+    function: &mut Function,
+) -> Result<(), ConstantEvaluatorError> {
+    // `remapped[old.index()]` is the handle that replaces `old` in the
+    // rebuilt local expression arena.
+    let mut remapped = Vec::with_capacity(function.expressions.len());
+
+    let mut kind_tracker = crate::proc::ExpressionKindTracker::new();
+
+    // Take the old arena out of the function; the evaluator below is given
+    // `function.expressions` to append into.
+    let mut old_expressions = mem::take(&mut function.expressions);
+
+    // The constant evaluator wants an emitter and a block, but both are
+    // throwaways here: the frontend has already wrapped these expressions
+    // in `Statement::Emit`s. At most, some already-emitted expressions may
+    // need to be dropped from those statements afterwards; see the docs of
+    // `filter_emits_in_block` for the reasoning.
+    let mut dummy_emitter = Emitter::default();
+    let mut dummy_block = Block::new();
+
+    let mut evaluator = ConstantEvaluator::for_wgsl_function(
+        module,
+        &mut function.expressions,
+        &mut kind_tracker,
+        &mut dummy_emitter,
+        &mut dummy_block,
+    );
+
+    for (old_h, expr, span) in old_expressions.drain() {
+        let mut expr = match expr {
+            Expression::Override(h) => Expression::Constant(override_map[h.index()]),
+            other => other,
+        };
+        adjust_expr(&remapped, &mut expr);
+        let new_h = evaluator.try_eval_and_append(expr, span)?;
+        debug_assert_eq!(old_h.index(), remapped.len());
+        remapped.push(new_h);
+    }
+
+    adjust_block(&remapped, &mut function.body);
+
+    filter_emits_in_block(&mut function.body, &function.expressions);
+
+    // Point local variable initializers at the rebuilt arena.
+    for (_, local) in function.local_variables.iter_mut() {
+        if let Some(init) = local.init.as_mut() {
+            *init = remapped[init.index()];
+        }
+    }
+
+    // The keys of `function.named_expressions` are expression handles, which
+    // have all changed, so the map is rebuilt entry by entry.
+    for (old_h, name) in mem::take(&mut function.named_expressions) {
+        function
+            .named_expressions
+            .insert(remapped[old_h.index()], name);
+    }
+
+    Ok(())
+}
+
+/// Remap every expression handle stored directly inside `expr`.
+///
+/// Each operand handle `h` is overwritten with `new_pos[h.index()]`.
+/// Expression kinds that hold no expression handles are left untouched.
+fn adjust_expr(new_pos: &[Handle<Expression>], expr: &mut Expression) {
+    // Remap one mandatory operand.
+    let adjust = |handle: &mut Handle<Expression>| {
+        *handle = new_pos[handle.index()];
+    };
+    // Remap one optional operand, if it is present.
+    let adjust_opt = |slot: &mut Option<Handle<Expression>>| {
+        if let Some(handle) = slot.as_mut() {
+            adjust(handle);
+        }
+    };
+    match expr {
+        Expression::Compose { components, ty: _ } => {
+            for component in components {
+                adjust(component);
+            }
+        }
+        Expression::Access { base, index } => {
+            adjust(base);
+            adjust(index);
+        }
+        Expression::AccessIndex { base, index: _ } => adjust(base),
+        Expression::Splat { value, size: _ } => adjust(value),
+        Expression::Swizzle {
+            vector,
+            size: _,
+            pattern: _,
+        } => adjust(vector),
+        Expression::Load { pointer } => adjust(pointer),
+        Expression::ImageSample {
+            image,
+            sampler,
+            coordinate,
+            array_index,
+            offset,
+            level,
+            depth_ref,
+            gather: _,
+        } => {
+            adjust(image);
+            adjust(sampler);
+            adjust(coordinate);
+            adjust_opt(array_index);
+            adjust_opt(offset);
+            match level {
+                crate::SampleLevel::Exact(handle) | crate::SampleLevel::Bias(handle) => {
+                    adjust(handle);
+                }
+                crate::SampleLevel::Gradient { x, y } => {
+                    adjust(x);
+                    adjust(y);
+                }
+                _ => {}
+            }
+            adjust_opt(depth_ref);
+        }
+        Expression::ImageLoad {
+            image,
+            coordinate,
+            array_index,
+            sample,
+            level,
+        } => {
+            adjust(image);
+            adjust(coordinate);
+            adjust_opt(array_index);
+            adjust_opt(sample);
+            adjust_opt(level);
+        }
+        Expression::ImageQuery { image, query } => {
+            adjust(image);
+            match query {
+                crate::ImageQuery::Size { level } => adjust_opt(level),
+                crate::ImageQuery::NumLevels
+                | crate::ImageQuery::NumLayers
+                | crate::ImageQuery::NumSamples => {}
+            }
+        }
+        Expression::Unary { expr, op: _ } => adjust(expr),
+        Expression::Binary { left, right, op: _ } => {
+            adjust(left);
+            adjust(right);
+        }
+        Expression::Select {
+            condition,
+            accept,
+            reject,
+        } => {
+            adjust(condition);
+            adjust(accept);
+            adjust(reject);
+        }
+        Expression::Derivative {
+            expr,
+            axis: _,
+            ctrl: _,
+        } => adjust(expr),
+        Expression::Relational { argument, fun: _ } => adjust(argument),
+        Expression::Math {
+            arg,
+            arg1,
+            arg2,
+            arg3,
+            fun: _,
+        } => {
+            adjust(arg);
+            adjust_opt(arg1);
+            adjust_opt(arg2);
+            adjust_opt(arg3);
+        }
+        Expression::As {
+            expr,
+            kind: _,
+            convert: _,
+        } => adjust(expr),
+        Expression::ArrayLength(expr) => adjust(expr),
+        Expression::RayQueryGetIntersection {
+            query,
+            committed: _,
+        } => adjust(query),
+        // None of these carry expression handles, so there is nothing to remap.
+        Expression::Literal(_)
+        | Expression::FunctionArgument(_)
+        | Expression::GlobalVariable(_)
+        | Expression::LocalVariable(_)
+        | Expression::CallResult(_)
+        | Expression::RayQueryProceedResult
+        | Expression::Constant(_)
+        | Expression::Override(_)
+        | Expression::ZeroValue(_)
+        | Expression::AtomicResult {
+            ty: _,
+            comparison: _,
+        }
+        | Expression::WorkGroupUniformLoadResult { ty: _ }
+        | Expression::SubgroupBallotResult
+        | Expression::SubgroupOperationResult { .. } => {}
+    }
+}
+
+/// Remap the expression handles of every statement in `block`, in order,
+/// by applying [`adjust_stmt`] with the same `new_pos` table.
+fn adjust_block(new_pos: &[Handle<Expression>], block: &mut Block) {
+    block
+        .iter_mut()
+        .for_each(|stmt| adjust_stmt(new_pos, stmt));
+}
+
+/// Replace every expression handle in `stmt` with its counterpart
+/// given by `new_pos`.
+///
+/// `new_pos` is indexed by the *old* handle's index and yields the handle
+/// that should take its place. Nested blocks (in `Block`, `If`, `Switch`
+/// and `Loop` statements) are processed recursively via [`adjust_block`].
+///
+/// # Panics
+///
+/// Panics if a handle's index is out of bounds for `new_pos`.
+fn adjust_stmt(new_pos: &[Handle<Expression>], stmt: &mut Statement) {
+    // Overwrite a single handle in place with its replacement.
+    let adjust = |expr: &mut Handle<Expression>| {
+        *expr = new_pos[expr.index()];
+    };
+    match *stmt {
+        Statement::Emit(ref mut range) => {
+            // Remap both endpoints and rebuild the range from them. A range
+            // for which `first_and_last` returns `None` is left untouched.
+            // NOTE(review): this presumes the remapped endpoints still
+            // delimit the same run of expressions - confirm with the caller.
+            if let Some((mut first, mut last)) = range.first_and_last() {
+                adjust(&mut first);
+                adjust(&mut last);
+                *range = Range::new_from_bounds(first, last);
+            }
+        }
+        Statement::Block(ref mut block) => {
+            adjust_block(new_pos, block);
+        }
+        Statement::If {
+            ref mut condition,
+            ref mut accept,
+            ref mut reject,
+        } => {
+            adjust(condition);
+            adjust_block(new_pos, accept);
+            adjust_block(new_pos, reject);
+        }
+        Statement::Switch {
+            ref mut selector,
+            ref mut cases,
+        } => {
+            adjust(selector);
+            // Only the case bodies are visited here.
+            for case in cases.iter_mut() {
+                adjust_block(new_pos, &mut case.body);
+            }
+        }
+        Statement::Loop {
+            ref mut body,
+            ref mut continuing,
+            ref mut break_if,
+        } => {
+            adjust_block(new_pos, body);
+            adjust_block(new_pos, continuing);
+            if let Some(e) = break_if.as_mut() {
+                adjust(e);
+            }
+        }
+        Statement::Return { ref mut value } => {
+            if let Some(e) = value.as_mut() {
+                adjust(e);
+            }
+        }
+        Statement::Store {
+            ref mut pointer,
+            ref mut value,
+        } => {
+            adjust(pointer);
+            adjust(value);
+        }
+        Statement::ImageStore {
+            ref mut image,
+            ref mut coordinate,
+            ref mut array_index,
+            ref mut value,
+        } => {
+            adjust(image);
+            adjust(coordinate);
+            if let Some(e) = array_index.as_mut() {
+                adjust(e);
+            }
+            adjust(value);
+        }
+        Statement::Atomic {
+            ref mut pointer,
+            ref mut value,
+            ref mut result,
+            ref mut fun,
+        } => {
+            adjust(pointer);
+            adjust(value);
+            adjust(result);
+            // Of the atomic functions, only an exchange with a comparison
+            // operand carries an additional expression handle.
+            match *fun {
+                crate::AtomicFunction::Exchange {
+                    compare: Some(ref mut compare),
+                } => {
+                    adjust(compare);
+                }
+                crate::AtomicFunction::Add
+                | crate::AtomicFunction::Subtract
+                | crate::AtomicFunction::And
+                | crate::AtomicFunction::ExclusiveOr
+                | crate::AtomicFunction::InclusiveOr
+                | crate::AtomicFunction::Min
+                | crate::AtomicFunction::Max
+                | crate::AtomicFunction::Exchange { compare: None } => {}
+            }
+        }
+        Statement::WorkGroupUniformLoad {
+            ref mut pointer,
+            ref mut result,
+        } => {
+            adjust(pointer);
+            adjust(result);
+        }
+        Statement::SubgroupBallot {
+            ref mut result,
+            ref mut predicate,
+        } => {
+            if let Some(ref mut predicate) = *predicate {
+                adjust(predicate);
+            }
+            adjust(result);
+        }
+        Statement::SubgroupCollectiveOperation {
+            ref mut argument,
+            ref mut result,
+            ..
+        } => {
+            adjust(argument);
+            adjust(result);
+        }
+        Statement::SubgroupGather {
+            ref mut mode,
+            ref mut argument,
+            ref mut result,
+        } => {
+            // Every gather mode except `BroadcastFirst` carries a handle
+            // that has to be remapped as well.
+            match *mode {
+                crate::GatherMode::BroadcastFirst => {}
+                crate::GatherMode::Broadcast(ref mut index)
+                | crate::GatherMode::Shuffle(ref mut index)
+                | crate::GatherMode::ShuffleDown(ref mut index)
+                | crate::GatherMode::ShuffleUp(ref mut index)
+                | crate::GatherMode::ShuffleXor(ref mut index) => {
+                    adjust(index);
+                }
+            }
+            adjust(argument);
+            adjust(result)
+        }
+        Statement::Call {
+            ref mut arguments,
+            ref mut result,
+            function: _,
+        } => {
+            // `function` is deliberately ignored; only the argument and
+            // result handles are rewritten.
+            for argument in arguments.iter_mut() {
+                adjust(argument);
+            }
+            if let Some(e) = result.as_mut() {
+                adjust(e);
+            }
+        }
+        Statement::RayQuery {
+            ref mut query,
+            ref mut fun,
+        } => {
+            adjust(query);
+            match *fun {
+                crate::RayQueryFunction::Initialize {
+                    ref mut acceleration_structure,
+                    ref mut descriptor,
+                } => {
+                    adjust(acceleration_structure);
+                    adjust(descriptor);
+                }
+                crate::RayQueryFunction::Proceed { ref mut result } => {
+                    adjust(result);
+                }
+                crate::RayQueryFunction::Terminate => {}
+            }
+        }
+        // Nothing is rewritten for these statements.
+        Statement::Break | Statement::Continue | Statement::Kill | Statement::Barrier(_) => {}
+    }
+}
+
+/// Adjust [`Emit`] statements in `block` to skip [`needs_pre_emit`] expressions we have introduced.
+///
+/// According to validation, [`Emit`] statements must not cover any expressions
+/// for which [`Expression::needs_pre_emit`] returns true. All expressions built
+/// by successful constant evaluation fall into that category, meaning that
+/// `process_function` will usually rewrite [`Override`] expressions and those
+/// that use their values into pre-emitted expressions, leaving any [`Emit`]
+/// statements that cover them invalid.
+///
+/// This function rewrites all [`Emit`] statements into zero or more new
+/// [`Emit`] statements covering only those expressions in the original range
+/// that are not pre-emitted.
+///
+/// Nested blocks (`Block`, `If`, `Switch` and `Loop` bodies) are filtered
+/// recursively. Every other statement is moved into the rebuilt block
+/// unchanged, and every statement keeps the span it had in the original.
+///
+/// [`Emit`]: Statement::Emit
+/// [`needs_pre_emit`]: Expression::needs_pre_emit
+/// [`Override`]: Expression::Override
+fn filter_emits_in_block(block: &mut Block, expressions: &Arena<Expression>) {
+    // Take ownership of the old statements and refill `block` from scratch.
+    let original = mem::replace(block, Block::with_capacity(block.len()));
+    for (stmt, span) in original.span_into_iter() {
+        match stmt {
+            Statement::Emit(range) => {
+                // `current` tracks the run of consecutive expressions that
+                // still need emitting, as `(first, last)`.
+                let mut current = None;
+                for expr_h in range {
+                    if expressions[expr_h].needs_pre_emit() {
+                        // A pre-emitted expression ends the run: flush it.
+                        if let Some((first, last)) = current {
+                            block.push(Statement::Emit(Range::new_from_bounds(first, last)), span);
+                        }
+
+                        current = None;
+                    } else if let Some((_, ref mut last)) = current {
+                        *last = expr_h;
+                    } else {
+                        current = Some((expr_h, expr_h));
+                    }
+                }
+                // Flush the run that reaches the end of the original range.
+                if let Some((first, last)) = current {
+                    block.push(Statement::Emit(Range::new_from_bounds(first, last)), span);
+                }
+            }
+            Statement::Block(mut child) => {
+                filter_emits_in_block(&mut child, expressions);
+                block.push(Statement::Block(child), span);
+            }
+            Statement::If {
+                condition,
+                mut accept,
+                mut reject,
+            } => {
+                filter_emits_in_block(&mut accept, expressions);
+                filter_emits_in_block(&mut reject, expressions);
+                block.push(
+                    Statement::If {
+                        condition,
+                        accept,
+                        reject,
+                    },
+                    span,
+                );
+            }
+            Statement::Switch {
+                selector,
+                mut cases,
+            } => {
+                for case in &mut cases {
+                    filter_emits_in_block(&mut case.body, expressions);
+                }
+                block.push(Statement::Switch { selector, cases }, span);
+            }
+            Statement::Loop {
+                mut body,
+                mut continuing,
+                break_if,
+            } => {
+                filter_emits_in_block(&mut body, expressions);
+                filter_emits_in_block(&mut continuing, expressions);
+                block.push(
+                    Statement::Loop {
+                        body,
+                        continuing,
+                        break_if,
+                    },
+                    span,
+                );
+            }
+            // `stmt` is already owned here, so move it rather than cloning
+            // it and dropping the original.
+            stmt => block.push(stmt, span),
+        }
+    }
+}
+
+/// Convert a pipeline-constant `value` into a [`Literal`] of type `scalar`.
+///
+/// The conversions follow the WebIDL rules linked on each arm:
+///
+/// - `bool`: `false` for `0.0`, `-0.0` and NaN, `true` for everything else.
+/// - `i32` / `u32`: the value must be finite; it is truncated toward zero and
+///   must then lie within the target type's range.
+/// - `f32`: the value must be finite and must still be finite after narrowing.
+/// - `f64`: the value must be finite.
+///
+/// # Errors
+///
+/// - [`PipelineConstantError::SrcNeedsToBeFinite`] if a numeric target is
+///   given NaN or an infinity.
+/// - [`PipelineConstantError::DstRangeTooSmall`] if the value does not fit in
+///   the target type.
+///
+/// # Panics
+///
+/// Panics if `scalar` is not one of the five scalar types handled below.
+fn map_value_to_literal(value: f64, scalar: Scalar) -> Result<Literal, PipelineConstantError> {
+    // Guard shared by every numeric target: NaN and infinities are rejected.
+    let require_finite = |candidate: f64| {
+        if candidate.is_finite() {
+            Ok(candidate)
+        } else {
+            Err(PipelineConstantError::SrcNeedsToBeFinite)
+        }
+    };
+
+    match scalar {
+        // https://webidl.spec.whatwg.org/#js-boolean
+        // note that in rust 0.0 == -0.0
+        Scalar::BOOL => Ok(Literal::Bool(!(value == 0.0 || value.is_nan()))),
+        // https://webidl.spec.whatwg.org/#js-long
+        Scalar::I32 => {
+            let truncated = require_finite(value)?.trunc();
+            if (f64::from(i32::MIN)..=f64::from(i32::MAX)).contains(&truncated) {
+                Ok(Literal::I32(truncated as i32))
+            } else {
+                Err(PipelineConstantError::DstRangeTooSmall)
+            }
+        }
+        // https://webidl.spec.whatwg.org/#js-unsigned-long
+        Scalar::U32 => {
+            let truncated = require_finite(value)?.trunc();
+            if (f64::from(u32::MIN)..=f64::from(u32::MAX)).contains(&truncated) {
+                Ok(Literal::U32(truncated as u32))
+            } else {
+                Err(PipelineConstantError::DstRangeTooSmall)
+            }
+        }
+        // https://webidl.spec.whatwg.org/#js-float
+        Scalar::F32 => {
+            let narrowed = require_finite(value)? as f32;
+            // A finite f64 beyond f32's range narrows to an infinity.
+            if narrowed.is_finite() {
+                Ok(Literal::F32(narrowed))
+            } else {
+                Err(PipelineConstantError::DstRangeTooSmall)
+            }
+        }
+        // https://webidl.spec.whatwg.org/#js-double
+        Scalar::F64 => require_finite(value).map(Literal::F64),
+        _ => unreachable!(),
+    }
+}
+
+/// Table-driven checks for [`map_value_to_literal`]: boolean coercion,
+/// rejection of non-finite inputs, and the range limits of each numeric type.
+#[test]
+fn test_map_value_to_literal() {
+    // bool: only zero (of either sign) and NaN map to `false`.
+    for (value, expected) in [
+        (0.0, false),
+        (-0.0, false),
+        (f64::NAN, false),
+        (1.0, true),
+        (f64::INFINITY, true),
+        (f64::NEG_INFINITY, true),
+    ] {
+        assert_eq!(
+            map_value_to_literal(value, Scalar::BOOL),
+            Ok(Literal::Bool(expected))
+        );
+    }
+
+    // Every numeric target rejects non-finite inputs.
+    for scalar in [Scalar::I32, Scalar::U32, Scalar::F32, Scalar::F64] {
+        for value in [f64::NAN, f64::INFINITY, f64::NEG_INFINITY] {
+            assert_eq!(
+                map_value_to_literal(value, scalar),
+                Err(PipelineConstantError::SrcNeedsToBeFinite)
+            );
+        }
+    }
+
+    // The largest f64 that still rounds to `f32::MAX`, and the next one up.
+    let f32_last_in_range = f64::from_bits(0x47efffffefffffff);
+    let f32_first_out_of_range = f64::from_bits(0x47effffff0000000);
+
+    let cases = [
+        // i32
+        (f64::from(i32::MIN), Scalar::I32, Ok(Literal::I32(i32::MIN))),
+        (f64::from(i32::MAX), Scalar::I32, Ok(Literal::I32(i32::MAX))),
+        (
+            f64::from(i32::MIN) - 1.0,
+            Scalar::I32,
+            Err(PipelineConstantError::DstRangeTooSmall),
+        ),
+        (
+            f64::from(i32::MAX) + 1.0,
+            Scalar::I32,
+            Err(PipelineConstantError::DstRangeTooSmall),
+        ),
+        // u32
+        (f64::from(u32::MIN), Scalar::U32, Ok(Literal::U32(u32::MIN))),
+        (f64::from(u32::MAX), Scalar::U32, Ok(Literal::U32(u32::MAX))),
+        (
+            f64::from(u32::MIN) - 1.0,
+            Scalar::U32,
+            Err(PipelineConstantError::DstRangeTooSmall),
+        ),
+        (
+            f64::from(u32::MAX) + 1.0,
+            Scalar::U32,
+            Err(PipelineConstantError::DstRangeTooSmall),
+        ),
+        // f32
+        (f64::from(f32::MIN), Scalar::F32, Ok(Literal::F32(f32::MIN))),
+        (f64::from(f32::MAX), Scalar::F32, Ok(Literal::F32(f32::MAX))),
+        (-f32_last_in_range, Scalar::F32, Ok(Literal::F32(f32::MIN))),
+        (f32_last_in_range, Scalar::F32, Ok(Literal::F32(f32::MAX))),
+        (
+            -f32_first_out_of_range,
+            Scalar::F32,
+            Err(PipelineConstantError::DstRangeTooSmall),
+        ),
+        (
+            f32_first_out_of_range,
+            Scalar::F32,
+            Err(PipelineConstantError::DstRangeTooSmall),
+        ),
+        // f64
+        (f64::MIN, Scalar::F64, Ok(Literal::F64(f64::MIN))),
+        (f64::MAX, Scalar::F64, Ok(Literal::F64(f64::MAX))),
+    ];
+    for (value, scalar, expected) in cases {
+        assert_eq!(map_value_to_literal(value, scalar), expected);
+    }
+}
--- a/naga/src/back/spv/block.rs
+++ b/naga/src/back/spv/block.rs
@ -239,6 +239,7 @@ impl<'w> BlockContext<'w> {
                let init = self.ir_module.constants[handle].init;
                self.writer.constant_ids[init.index()]
            }
+            crate::Expression::Override(_) => return Err(Error::Override),
            crate::Expression::ZeroValue(_) => self.writer.get_constant_null(result_type_id),
            crate::Expression::Compose { ty, ref components } => {
                self.temp_list.clear();
@ -1072,7 +1073,7 @@ impl<'w> BlockContext<'w> {
                        //
                        // bitfieldExtract(x, o, c)

-                        let bit_width = arg_ty.scalar_width().unwrap();
+                        let bit_width = arg_ty.scalar_width().unwrap() * 8;
                        let width_constant = self
                            .writer
                            .get_constant_scalar(crate::Literal::U32(bit_width as u32));
@ -1128,7 +1129,7 @@ impl<'w> BlockContext<'w> {
                    Mf::InsertBits => {
                        // The behavior of InsertBits has the same undefined behavior as ExtractBits.

-                        let bit_width = arg_ty.scalar_width().unwrap();
+                        let bit_width = arg_ty.scalar_width().unwrap() * 8;
                        let width_constant = self
                            .writer
                            .get_constant_scalar(crate::Literal::U32(bit_width as u32));
@ -1184,7 +1185,7 @@ impl<'w> BlockContext<'w> {
                    }
                    Mf::FindLsb => MathOp::Ext(spirv::GLOp::FindILsb),
                    Mf::FindMsb => {
-                        if arg_ty.scalar_width() == Some(32) {
+                        if arg_ty.scalar_width() == Some(4) {
                            let thing = match arg_scalar_kind {
                                Some(crate::ScalarKind::Uint) => spirv::GLOp::FindUMsb,
                                Some(crate::ScalarKind::Sint) => spirv::GLOp::FindSMsb,
@ -1200,11 +1201,158 @@ impl<'w> BlockContext<'w> {
                    Mf::Pack2x16float => MathOp::Ext(spirv::GLOp::PackHalf2x16),
                    Mf::Pack2x16unorm => MathOp::Ext(spirv::GLOp::PackUnorm2x16),
                    Mf::Pack2x16snorm => MathOp::Ext(spirv::GLOp::PackSnorm2x16),
+                    fun @ (Mf::Pack4xI8 | Mf::Pack4xU8) => {
+                        let (int_type, is_signed) = match fun {
+                            Mf::Pack4xI8 => (crate::ScalarKind::Sint, true),
+                            Mf::Pack4xU8 => (crate::ScalarKind::Uint, false),
+                            _ => unreachable!(),
+                        };
+                        let uint_type_id = self.get_type_id(LookupType::Local(LocalType::Value {
+                            vector_size: None,
+                            scalar: crate::Scalar {
+                                kind: crate::ScalarKind::Uint,
+                                width: 4,
+                            },
+                            pointer_space: None,
+                        }));
+
+                        let int_type_id = self.get_type_id(LookupType::Local(LocalType::Value {
+                            vector_size: None,
+                            scalar: crate::Scalar {
+                                kind: int_type,
+                                width: 4,
+                            },
+                            pointer_space: None,
+                        }));
+
+                        let mut last_instruction = Instruction::new(spirv::Op::Nop);
+
+                        let zero = self.writer.get_constant_scalar(crate::Literal::U32(0));
+                        let mut preresult = zero;
+                        block
+                            .body
+                            .reserve(usize::from(VEC_LENGTH) * (2 + usize::from(is_signed)));
+
+                        let eight = self.writer.get_constant_scalar(crate::Literal::U32(8));
+                        const VEC_LENGTH: u8 = 4;
+                        for i in 0..u32::from(VEC_LENGTH) {
+                            let offset =
+                                self.writer.get_constant_scalar(crate::Literal::U32(i * 8));
+                            let mut extracted = self.gen_id();
+                            block.body.push(Instruction::binary(
+                                spirv::Op::CompositeExtract,
+                                int_type_id,
+                                extracted,
+                                arg0_id,
+                                i,
+                            ));
+                            if is_signed {
+                                let casted = self.gen_id();
+                                block.body.push(Instruction::unary(
+                                    spirv::Op::Bitcast,
+                                    uint_type_id,
+                                    casted,
+                                    extracted,
+                                ));
+                                extracted = casted;
+                            }
+                            let is_last = i == u32::from(VEC_LENGTH - 1);
+                            if is_last {
+                                last_instruction = Instruction::quaternary(
+                                    spirv::Op::BitFieldInsert,
+                                    result_type_id,
+                                    id,
+                                    preresult,
+                                    extracted,
+                                    offset,
+                                    eight,
+                                )
+                            } else {
+                                let new_preresult = self.gen_id();
+                                block.body.push(Instruction::quaternary(
+                                    spirv::Op::BitFieldInsert,
+                                    result_type_id,
+                                    new_preresult,
+                                    preresult,
+                                    extracted,
+                                    offset,
+                                    eight,
+                                ));
+                                preresult = new_preresult;
+                            }
+                        }
+
+                        MathOp::Custom(last_instruction)
+                    }
                    Mf::Unpack4x8unorm => MathOp::Ext(spirv::GLOp::UnpackUnorm4x8),
                    Mf::Unpack4x8snorm => MathOp::Ext(spirv::GLOp::UnpackSnorm4x8),
                    Mf::Unpack2x16float => MathOp::Ext(spirv::GLOp::UnpackHalf2x16),
                    Mf::Unpack2x16unorm => MathOp::Ext(spirv::GLOp::UnpackUnorm2x16),
                    Mf::Unpack2x16snorm => MathOp::Ext(spirv::GLOp::UnpackSnorm2x16),
+                    fun @ (Mf::Unpack4xI8 | Mf::Unpack4xU8) => {
+                        let (int_type, extract_op, is_signed) = match fun {
+                            Mf::Unpack4xI8 => {
+                                (crate::ScalarKind::Sint, spirv::Op::BitFieldSExtract, true)
+                            }
+                            Mf::Unpack4xU8 => {
+                                (crate::ScalarKind::Uint, spirv::Op::BitFieldUExtract, false)
+                            }
+                            _ => unreachable!(),
+                        };
+
+                        let sint_type_id = self.get_type_id(LookupType::Local(LocalType::Value {
+                            vector_size: None,
+                            scalar: crate::Scalar {
+                                kind: crate::ScalarKind::Sint,
+                                width: 4,
+                            },
+                            pointer_space: None,
+                        }));
+
+                        let eight = self.writer.get_constant_scalar(crate::Literal::U32(8));
+                        let int_type_id = self.get_type_id(LookupType::Local(LocalType::Value {
+                            vector_size: None,
+                            scalar: crate::Scalar {
+                                kind: int_type,
+                                width: 4,
+                            },
+                            pointer_space: None,
+                        }));
+                        block
+                            .body
+                            .reserve(usize::from(VEC_LENGTH) * 2 + usize::from(is_signed));
+                        let arg_id = if is_signed {
+                            let new_arg_id = self.gen_id();
+                            block.body.push(Instruction::unary(
+                                spirv::Op::Bitcast,
+                                sint_type_id,
+                                new_arg_id,
+                                arg0_id,
+                            ));
+                            new_arg_id
+                        } else {
+                            arg0_id
+                        };
+
+                        const VEC_LENGTH: u8 = 4;
+                        let parts: [_; VEC_LENGTH as usize] =
+                            std::array::from_fn(|_| self.gen_id());
+                        for (i, part_id) in parts.into_iter().enumerate() {
+                            let index = self
+                                .writer
+                                .get_constant_scalar(crate::Literal::U32(i as u32 * 8));
+                            block.body.push(Instruction::ternary(
+                                extract_op,
+                                int_type_id,
+                                part_id,
+                                arg_id,
+                                index,
+                                eight,
+                            ));
+                        }
+
+                        MathOp::Custom(Instruction::composite_construct(result_type_id, id, &parts))
+                    }
                };

                block.body.push(match math_op {
@ -1278,7 +1426,9 @@ impl<'w> BlockContext<'w> {
            crate::Expression::CallResult(_)
            | crate::Expression::AtomicResult { .. }
            | crate::Expression::WorkGroupUniformLoadResult { .. }
-            | crate::Expression::RayQueryProceedResult => self.cached[expr_handle],
+            | crate::Expression::RayQueryProceedResult
+            | crate::Expression::SubgroupBallotResult
+            | crate::Expression::SubgroupOperationResult { .. } => self.cached[expr_handle],
            crate::Expression::As {
                expr,
                kind,
@ -1917,7 +2067,7 @@ impl<'w> BlockContext<'w> {
                ));
            };
            match *statement {
-                crate::Statement::Emit(ref range) => {
+                Statement::Emit(ref range) => {
                    for handle in range.clone() {
                        // omit const expressions as we've already cached those
                        if !self.expression_constness.is_const(handle) {
@ -1925,7 +2075,7 @@ impl<'w> BlockContext<'w> {
                        }
                    }
                }
-                crate::Statement::Block(ref block_statements) => {
+                Statement::Block(ref block_statements) => {
                    let scope_id = self.gen_id();
                    self.function.consume(block, Instruction::branch(scope_id));

@ -1940,7 +2090,7 @@ impl<'w> BlockContext<'w> {

                    block = Block::new(merge_id);
                }
-                crate::Statement::If {
+                Statement::If {
                    condition,
                    ref accept,
                    ref reject,
@ -1994,7 +2144,7 @@ impl<'w> BlockContext<'w> {

                    block = Block::new(merge_id);
                }
-                crate::Statement::Switch {
+                Statement::Switch {
                    selector,
                    ref cases,
                } => {
@ -2074,7 +2224,7 @@ impl<'w> BlockContext<'w> {

                    block = Block::new(merge_id);
                }
-                crate::Statement::Loop {
+                Statement::Loop {
                    ref body,
                    ref continuing,
                    break_if,
@ -2143,19 +2293,19 @@ impl<'w> BlockContext<'w> {

                    block = Block::new(merge_id);
                }
-                crate::Statement::Break => {
+                Statement::Break => {
                    self.function
                        .consume(block, Instruction::branch(loop_context.break_id.unwrap()));
                    return Ok(());
                }
-                crate::Statement::Continue => {
+                Statement::Continue => {
                    self.function.consume(
                        block,
                        Instruction::branch(loop_context.continuing_id.unwrap()),
                    );
                    return Ok(());
                }
-                crate::Statement::Return { value: Some(value) } => {
+                Statement::Return { value: Some(value) } => {
                    let value_id = self.cached[value];
                    let instruction = match self.function.entry_point_context {
                        // If this is an entry point, and we need to return anything,
@ -2174,18 +2324,18 @@ impl<'w> BlockContext<'w> {
                    self.function.consume(block, instruction);
                    return Ok(());
                }
-                crate::Statement::Return { value: None } => {
+                Statement::Return { value: None } => {
                    self.function.consume(block, Instruction::return_void());
                    return Ok(());
                }
-                crate::Statement::Kill => {
+                Statement::Kill => {
                    self.function.consume(block, Instruction::kill());
                    return Ok(());
                }
-                crate::Statement::Barrier(flags) => {
+                Statement::Barrier(flags) => {
                    self.writer.write_barrier(flags, &mut block);
                }
-                crate::Statement::Store { pointer, value } => {
+                Statement::Store { pointer, value } => {
                    let value_id = self.cached[value];
                    match self.write_expression_pointer(pointer, &mut block, None)? {
                        ExpressionPointer::Ready { pointer_id } => {
@ -2234,13 +2384,13 @@ impl<'w> BlockContext<'w> {
                        }
                    };
                }
-                crate::Statement::ImageStore {
+                Statement::ImageStore {
                    image,
                    coordinate,
                    array_index,
                    value,
                } => self.write_image_store(image, coordinate, array_index, value, &mut block)?,
-                crate::Statement::Call {
+                Statement::Call {
                    function: local_function,
                    ref arguments,
                    result,
@ -2266,7 +2416,7 @@ impl<'w> BlockContext<'w> {
                        &self.temp_list,
                    ));
                }
-                crate::Statement::Atomic {
+                Statement::Atomic {
                    pointer,
                    ref fun,
                    value,
@ -2446,7 +2596,7 @@ impl<'w> BlockContext<'w> {

                    block.body.push(instruction);
                }
-                crate::Statement::WorkGroupUniformLoad { pointer, result } => {
+                Statement::WorkGroupUniformLoad { pointer, result } => {
                    self.writer
                        .write_barrier(crate::Barrier::WORK_GROUP, &mut block);
                    let result_type_id = self.get_expression_type_id(&self.fun_info[result].ty);
@ -2486,9 +2636,30 @@ impl<'w> BlockContext<'w> {
                    self.writer
                        .write_barrier(crate::Barrier::WORK_GROUP, &mut block);
                }
-                crate::Statement::RayQuery { query, ref fun } => {
+                Statement::RayQuery { query, ref fun } => {
                    self.write_ray_query_function(query, fun, &mut block);
                }
+                Statement::SubgroupBallot {
+                    result,
+                    ref predicate,
+                } => {
+                    self.write_subgroup_ballot(predicate, result, &mut block)?;
+                }
+                Statement::SubgroupCollectiveOperation {
+                    ref op,
+                    ref collective_op,
+                    argument,
+                    result,
+                } => {
+                    self.write_subgroup_operation(op, collective_op, argument, result, &mut block)?;
+                }
+                Statement::SubgroupGather {
+                    ref mode,
+                    argument,
+                    result,
+                } => {
+                    self.write_subgroup_gather(mode, argument, result, &mut block)?;
+                }
            }
        }

--- a/naga/src/back/spv/helpers.rs
+++ b/naga/src/back/spv/helpers.rs
@ -10,8 +10,12 @@ pub(super) fn bytes_to_words(bytes: &[u8]) -> Vec<Word> {

 pub(super) fn string_to_words(input: &str) -> Vec<Word> {
    let bytes = input.as_bytes();
-    let mut words = bytes_to_words(bytes);

+    str_bytes_to_words(bytes)
+}
+
+pub(super) fn str_bytes_to_words(bytes: &[u8]) -> Vec<Word> {
+    let mut words = bytes_to_words(bytes);
    if bytes.len() % 4 == 0 {
        // nul-termination
        words.push(0x0u32);
@ -20,6 +24,21 @@ pub(super) fn string_to_words(input: &str) -> Vec<Word> {
    words
 }

+/// Splits `input` into consecutive byte chunks of at most `limit` bytes,
+/// cutting only on UTF-8 character boundaries so that every chunk is itself
+/// valid UTF-8.
+///
+/// Concatenating the returned chunks yields exactly `input.as_bytes()`; an
+/// empty `input` produces no chunks.
+///
+/// If `limit` is smaller than the character at the current position (this
+/// includes `limit == 0`), that single character is emitted as a chunk of its
+/// own even though it exceeds `limit`. Otherwise the loop could never advance
+/// and would push empty chunks forever.
+// The call below resolves to the local `StrUnstable` polyfill; the lint warns
+// that std may gain a method of the same name.
+#[allow(unstable_name_collisions)]
+pub(super) fn string_to_byte_chunks(input: &str, limit: usize) -> Vec<&[u8]> {
+    let mut offset: usize = 0;
+    let mut start: usize = 0;
+    let mut words = vec![];
+    while offset < input.len() {
+        // Saturate so that a huge `limit` cannot overflow the addition.
+        offset = input.floor_char_boundary(offset.saturating_add(limit));
+        if offset == start {
+            // No boundary lies within `limit` bytes: step to the end of the
+            // character starting at `start` to guarantee forward progress.
+            offset = (start + 1..=input.len())
+                .find(|&i| input.is_char_boundary(i))
+                .unwrap_or(input.len());
+        }
+        words.push(input[start..offset].as_bytes());
+        start = offset;
+    }
+
+    words
+}
+
 pub(super) const fn map_storage_class(space: crate::AddressSpace) -> spirv::StorageClass {
    match space {
        crate::AddressSpace::Handle => spirv::StorageClass::UniformConstant,
@ -107,3 +126,35 @@ pub fn global_needs_wrapper(ir_module: &crate::Module, var: &crate::GlobalVariab
        _ => true,
    }
 }
+
+/// HACK: polyfill taken from the unstable part of std; remove it when std's
+/// `floor_char_boundary` is stable.
+trait U8Internal {
+    /// Returns `true` when the byte is not a UTF-8 continuation byte, i.e.
+    /// when it is `< 128` or `>= 192`.
+    fn is_utf8_char_boundary(&self) -> bool;
+}
+
+impl U8Internal for u8 {
+    fn is_utf8_char_boundary(&self) -> bool {
+        // Continuation bytes are exactly those whose top two bits are `10`.
+        (*self & 0xC0) != 0x80
+    }
+}
+
+/// Local stand-in for the `floor_char_boundary` method of `str`.
+trait StrUnstable {
+    /// Returns the largest character boundary that is `<= index`, or the
+    /// string length when `index` is at or past the end.
+    fn floor_char_boundary(&self, index: usize) -> usize;
+}
+
+impl StrUnstable for str {
+    fn floor_char_boundary(&self, index: usize) -> usize {
+        let len = self.len();
+        if index >= len {
+            return len;
+        }
+
+        // Only the byte at `index` and the three bytes before it need to be
+        // inspected.
+        let window_start = index.saturating_sub(3);
+        let window = &self.as_bytes()[window_start..=index];
+        let last_boundary = window
+            .iter()
+            .rposition(|byte| byte.is_utf8_char_boundary());
+
+        // SAFETY: we know that the character boundary will be within four bytes
+        window_start + unsafe { last_boundary.unwrap_unchecked() }
+    }
+}
--- a/naga/src/back/spv/index.rs
+++ b/naga/src/back/spv/index.rs
@ -3,8 +3,9 @@ Bounds-checking for SPIR-V output.
 */

 use super::{
-    helpers::global_needs_wrapper, selection::Selection, Block, BlockContext, Error, IdGenerator,
-    Instruction, Word,
+    helpers::{global_needs_wrapper, map_storage_class},
+    selection::Selection,
+    Block, BlockContext, Error, IdGenerator, Instruction, Word,
 };
 use crate::{arena::Handle, proc::BoundsCheckPolicy};

@ -42,32 +43,113 @@ impl<'w> BlockContext<'w> {
        array: Handle<crate::Expression>,
        block: &mut Block,
    ) -> Result<Word, Error> {
-        // Naga IR permits runtime-sized arrays as global variables or as the
-        // final member of a struct that is a global variable. SPIR-V permits
-        // only the latter, so this back end wraps bare runtime-sized arrays
-        // in a made-up struct; see `helpers::global_needs_wrapper` and its uses.
-        // This code must handle both cases.
-        let (structure_id, last_member_index) = match self.ir_function.expressions[array] {
+        // Naga IR permits runtime-sized arrays as global variables, or as the
+        // final member of a struct that is a global variable, or one of these
+        // inside a buffer that is itself an element in a buffer bindings array.
+        // SPIR-V requires that runtime-sized arrays are wrapped in structs.
+        // See `helpers::global_needs_wrapper` and its uses.
+        let (opt_array_index_id, global_handle, opt_last_member_index) = match self
+            .ir_function
+            .expressions[array]
+        {
            crate::Expression::AccessIndex { base, index } => {
                match self.ir_function.expressions[base] {
-                    crate::Expression::GlobalVariable(handle) => (
-                        self.writer.global_variables[handle.index()].access_id,
-                        index,
-                    ),
-                    _ => return Err(Error::Validation("array length expression")),
+                    // The global variable is an array of buffer bindings of structs,
+                    // we are accessing one of them with a static index,
+                    // and the last member of it.
+                    crate::Expression::AccessIndex {
+                        base: base_outer,
+                        index: index_outer,
+                    } => match self.ir_function.expressions[base_outer] {
+                        crate::Expression::GlobalVariable(handle) => {
+                            let index_id = self.get_index_constant(index_outer);
+                            (Some(index_id), handle, Some(index))
+                        }
+                        _ => return Err(Error::Validation("array length expression case-1a")),
+                    },
+                    // The global variable is an array of buffer bindings of structs,
+                    // we are accessing one of them with a dynamic index,
+                    // and the last member of it.
+                    crate::Expression::Access {
+                        base: base_outer,
+                        index: index_outer,
+                    } => match self.ir_function.expressions[base_outer] {
+                        crate::Expression::GlobalVariable(handle) => {
+                            let index_id = self.cached[index_outer];
+                            (Some(index_id), handle, Some(index))
+                        }
+                        _ => return Err(Error::Validation("array length expression case-1b")),
+                    },
+                    // The global variable is a buffer, and we are accessing the last member.
+                    crate::Expression::GlobalVariable(handle) => {
+                        let global = &self.ir_module.global_variables[handle];
+                        match self.ir_module.types[global.ty].inner {
+                            // The global variable is an array of buffer bindings of run-time arrays.
+                            crate::TypeInner::BindingArray { .. } => (Some(index), handle, None),
+                            // The global variable is a struct, and we are accessing the last member
+                            _ => (None, handle, Some(index)),
+                        }
+                    }
+                    _ => return Err(Error::Validation("array length expression case-1c")),
                }
            }
+            // The global variable is an array of buffer bindings of arrays.
+            crate::Expression::Access { base, index } => match self.ir_function.expressions[base] {
+                crate::Expression::GlobalVariable(handle) => {
+                    let index_id = self.cached[index];
+                    let global = &self.ir_module.global_variables[handle];
+                    match self.ir_module.types[global.ty].inner {
+                        crate::TypeInner::BindingArray { .. } => (Some(index_id), handle, None),
+                        _ => return Err(Error::Validation("array length expression case-2a")),
+                    }
+                }
+                _ => return Err(Error::Validation("array length expression case-2b")),
+            },
+            // The global variable is a run-time array.
            crate::Expression::GlobalVariable(handle) => {
                let global = &self.ir_module.global_variables[handle];
                if !global_needs_wrapper(self.ir_module, global) {
-                    return Err(Error::Validation("array length expression"));
+                    return Err(Error::Validation("array length expression case-3"));
                }
-
-                (self.writer.global_variables[handle.index()].var_id, 0)
+                (None, handle, None)
            }
-            _ => return Err(Error::Validation("array length expression")),
+            _ => return Err(Error::Validation("array length expression case-4")),
        };

+        let gvar = self.writer.global_variables[global_handle.index()].clone();
+        let global = &self.ir_module.global_variables[global_handle];
+        let (last_member_index, gvar_id) = match opt_last_member_index {
+            Some(index) => (index, gvar.access_id),
+            None => {
+                if !global_needs_wrapper(self.ir_module, global) {
+                    return Err(Error::Validation(
+                        "pointer to a global that is not a wrapped array",
+                    ));
+                }
+                (0, gvar.var_id)
+            }
+        };
+        let structure_id = match opt_array_index_id {
+            // We are indexing inside a binding array, generate the access op.
+            Some(index_id) => {
+                let element_type_id = match self.ir_module.types[global.ty].inner {
+                    crate::TypeInner::BindingArray { base, size: _ } => {
+                        let class = map_storage_class(global.space);
+                        self.get_pointer_id(base, class)?
+                    }
+                    _ => return Err(Error::Validation("array length expression case-5")),
+                };
+                let structure_id = self.gen_id();
+                block.body.push(Instruction::access_chain(
+                    element_type_id,
+                    structure_id,
+                    gvar_id,
+                    &[index_id],
+                ));
+                structure_id
+            }
+            None => gvar_id,
+        };
        let length_id = self.gen_id();
        block.body.push(Instruction::array_length(
            self.writer.get_uint_type_id(),
--- a/naga/src/back/spv/instructions.rs
+++ b/naga/src/back/spv/instructions.rs
@ -43,6 +43,42 @@ impl super::Instruction {
        instruction
    }

+    /// Builds an `OpSourceContinued` instruction whose operands are the words
+    /// produced by `helpers::str_bytes_to_words` for `source`.
+    pub(super) fn source_continued(source: &[u8]) -> Self {
+        let operands = helpers::str_bytes_to_words(source);
+        let mut continued = Self::new(Op::SourceContinued);
+        continued.add_operands(operands);
+        continued
+    }
+
+    /// Emits the debug source as one `OpSource`, followed by as many
+    /// `OpSourceContinued` instructions as needed.
+    ///
+    /// When there is no debug info, or its source text is at most `u16::MAX`
+    /// bytes long, the result is the single instruction built by
+    /// `Self::source`. Otherwise the text is cut into chunks of at most
+    /// `u16::MAX` bytes: the first chunk goes into `OpSource` and each
+    /// remaining chunk into its own `OpSourceContinued`.
+    pub(super) fn source_auto_continued(
+        source_language: spirv::SourceLanguage,
+        version: u32,
+        source: &Option<DebugInfoInner>,
+    ) -> Vec<Self> {
+        let oversized = source
+            .as_ref()
+            .filter(|debug_info| debug_info.source_code.len() > u16::MAX as usize);
+
+        match oversized {
+            Some(debug_info) => {
+                let chunks =
+                    helpers::string_to_byte_chunks(debug_info.source_code, u16::MAX as usize);
+
+                let mut head = Self::new(Op::Source);
+                head.add_operand(source_language as u32);
+                head.add_operands(helpers::bytes_to_words(&version.to_le_bytes()));
+                head.add_operand(debug_info.source_file_id);
+                head.add_operands(helpers::str_bytes_to_words(chunks[0]));
+
+                std::iter::once(head)
+                    .chain(
+                        chunks[1..]
+                            .iter()
+                            .map(|chunk| Self::source_continued(chunk)),
+                    )
+                    .collect()
+            }
+            None => vec![Self::source(source_language, version, source)],
+        }
+    }
+
    pub(super) fn name(target_id: Word, name: &str) -> Self {
        let mut instruction = Self::new(Op::Name);
        instruction.add_operand(target_id);
@ -1037,6 +1073,73 @@ impl super::Instruction {
        instruction.add_operand(semantics_id);
        instruction
    }
+
+    // Group Instructions
+
+    /// Builds `OpGroupNonUniformBallot` with operands, in order: execution
+    /// scope, predicate.
+    pub(super) fn group_non_uniform_ballot(
+        result_type_id: Word,
+        id: Word,
+        exec_scope_id: Word,
+        predicate: Word,
+    ) -> Self {
+        let mut ballot = Self::new(Op::GroupNonUniformBallot);
+        ballot.set_type(result_type_id);
+        ballot.set_result(id);
+        for operand in [exec_scope_id, predicate] {
+            ballot.add_operand(operand);
+        }
+
+        ballot
+    }
+    /// Builds `OpGroupNonUniformBroadcastFirst` with operands, in order:
+    /// execution scope, value.
+    pub(super) fn group_non_uniform_broadcast_first(
+        result_type_id: Word,
+        id: Word,
+        exec_scope_id: Word,
+        value: Word,
+    ) -> Self {
+        let mut broadcast = Self::new(Op::GroupNonUniformBroadcastFirst);
+        broadcast.set_type(result_type_id);
+        broadcast.set_result(id);
+        for operand in [exec_scope_id, value] {
+            broadcast.add_operand(operand);
+        }
+
+        broadcast
+    }
+    /// Builds the instruction `op` with operands, in order: execution scope,
+    /// value, index. The caller chooses which opcode is emitted.
+    pub(super) fn group_non_uniform_gather(
+        op: Op,
+        result_type_id: Word,
+        id: Word,
+        exec_scope_id: Word,
+        value: Word,
+        index: Word,
+    ) -> Self {
+        let mut gather = Self::new(op);
+        gather.set_type(result_type_id);
+        gather.set_result(id);
+        for operand in [exec_scope_id, value, index] {
+            gather.add_operand(operand);
+        }
+
+        gather
+    }
+    /// Builds the instruction `op` with operands, in order: execution scope,
+    /// the group operation (only when `group_op` is `Some`), value.
+    pub(super) fn group_non_uniform_arithmetic(
+        op: Op,
+        result_type_id: Word,
+        id: Word,
+        exec_scope_id: Word,
+        group_op: Option<spirv::GroupOperation>,
+        value: Word,
+    ) -> Self {
+        let mut arithmetic = Self::new(op);
+        arithmetic.set_type(result_type_id);
+        arithmetic.set_result(id);
+        arithmetic.add_operand(exec_scope_id);
+        // The group-operation operand is left out entirely for `None`.
+        match group_op {
+            Some(group_op) => arithmetic.add_operand(group_op as u32),
+            None => {}
+        }
+        arithmetic.add_operand(value);
+
+        arithmetic
+    }
 }

 impl From<crate::StorageFormat> for spirv::ImageFormat {
--- a/naga/src/back/spv/mod.rs
+++ b/naga/src/back/spv/mod.rs
@ -13,6 +13,7 @@ mod layout;
 mod ray;
 mod recyclable;
 mod selection;
+mod subgroup;
 mod writer;

 pub use spirv::Capability;
@ -70,6 +71,8 @@ pub enum Error {
    FeatureNotImplemented(&'static str),
    #[error("module is not validated properly: {0}")]
    Validation(&'static str),
+    #[error("overrides should not be present at this stage")]
+    Override,
 }

 #[derive(Default)]
@ -245,7 +248,7 @@ impl LocalImageType {
 /// this, by converting everything possible to a `LocalType` before inspecting
 /// it.
 ///
-/// ## `Localtype` equality and SPIR-V `OpType` uniqueness
+/// ## `LocalType` equality and SPIR-V `OpType` uniqueness
 ///
 /// The definition of `Eq` on `LocalType` is carefully chosen to help us follow
 /// certain SPIR-V rules. SPIR-V §2.8 requires some classes of `OpType...`
@ -454,7 +457,7 @@ impl recyclable::Recyclable for CachedExpressions {

 #[derive(Eq, Hash, PartialEq)]
 enum CachedConstant {
-    Literal(crate::Literal),
+    Literal(crate::proc::HashableLiteral),
    Composite {
        ty: LookupType,
        constituent_ids: Vec<Word>,
@ -527,6 +530,42 @@ struct FunctionArgument {
    handle_id: Word,
 }

+/// Records which expressions the backend emits as one of these instructions:
+/// - OpConstantTrue
+/// - OpConstantFalse
+/// - OpConstant
+/// - OpConstantComposite
+/// - OpConstantNull
+struct ExpressionConstnessTracker {
+    /// Set of expression handle indices classified as constant.
+    inner: bit_set::BitSet,
+}
+
+impl ExpressionConstnessTracker {
+    /// Classifies every expression of `arena` in iteration order.
+    ///
+    /// `Literal`, `ZeroValue` and `Constant` are always constant. `Compose`
+    /// and `Splat` are constant only if all of their operands have already
+    /// been recorded as constant. Everything else is non-constant.
+    fn from_arena(arena: &crate::Arena<crate::Expression>) -> Self {
+        use crate::Expression as Ex;
+
+        let mut constant_exprs = bit_set::BitSet::new();
+        for (handle, expr) in arena.iter() {
+            let is_constant = match *expr {
+                Ex::Literal(_) | Ex::ZeroValue(_) | Ex::Constant(_) => true,
+                Ex::Compose { ref components, .. } => components
+                    .iter()
+                    .all(|component| constant_exprs.contains(component.index())),
+                Ex::Splat { value, .. } => constant_exprs.contains(value.index()),
+                _ => false,
+            };
+            if is_constant {
+                constant_exprs.insert(handle.index());
+            }
+        }
+
+        Self {
+            inner: constant_exprs,
+        }
+    }
+
+    /// Returns whether `value` was recorded as constant by `from_arena`.
+    fn is_const(&self, value: Handle<crate::Expression>) -> bool {
+        self.inner.contains(value.index())
+    }
+}
+
 /// General information needed to emit SPIR-V for Naga statements.
 struct BlockContext<'w> {
    /// The writer handling the module to which this code belongs.
@ -552,7 +591,7 @@ struct BlockContext<'w> {
    temp_list: Vec<Word>,

    /// Tracks the constness of `Expression`s residing in `self.ir_function.expressions`
-    expression_constness: crate::proc::ExpressionConstnessTracker,
+    expression_constness: ExpressionConstnessTracker,
 }

 impl BlockContext<'_> {
@ -576,6 +615,15 @@ impl BlockContext<'_> {
        self.writer
            .get_constant_scalar(crate::Literal::I32(scope as _))
    }
+
+    /// Forwards to the writer's `get_pointer_id`, supplying this module's
+    /// type arena.
+    fn get_pointer_id(
+        &mut self,
+        handle: Handle<crate::Type>,
+        class: spirv::StorageClass,
+    ) -> Result<Word, Error> {
+        let types = &self.ir_module.types;
+        self.writer.get_pointer_id(types, handle, class)
+    }
 }

 #[derive(Clone, Copy, Default)]
@ -708,7 +756,7 @@ impl<'a> Default for Options<'a> {
            flags,
            binding_map: BindingMap::default(),
            capabilities: None,
-            bounds_check_policies: crate::proc::BoundsCheckPolicies::default(),
+            bounds_check_policies: BoundsCheckPolicies::default(),
            zero_initialize_workgroup_memory: ZeroInitializeWorkgroupMemoryMode::Polyfill,
            debug_info: None,
        }
@ -716,7 +764,7 @@ impl<'a> Default for Options<'a> {
 }

 // A subset of options meant to be changed per pipeline.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone)]
 #[cfg_attr(feature = "serialize", derive(serde::Serialize))]
 #[cfg_attr(feature = "deserialize", derive(serde::Deserialize))]
 pub struct PipelineOptions {
--- a/naga/src/back/spv/subgroup.rs
+++ b/naga/src/back/spv/subgroup.rs
@ -0,0 +1,207 @@
+use super::{Block, BlockContext, Error, Instruction};
+use crate::{
+    arena::Handle,
+    back::spv::{LocalType, LookupType},
+    TypeInner,
+};
+
+impl<'w> BlockContext<'w> {
+    /// Emits `OpGroupNonUniformBallot` at subgroup scope and caches the new
+    /// id as the value of `result`.
+    ///
+    /// The result type is a four-component `u32` vector. When `predicate` is
+    /// `None`, the boolean constant `true` is used as the predicate.
+    ///
+    /// Fails if the `GroupNonUniformBallot` capability cannot be required.
+    pub(super) fn write_subgroup_ballot(
+        &mut self,
+        predicate: &Option<Handle<crate::Expression>>,
+        result: Handle<crate::Expression>,
+        block: &mut Block,
+    ) -> Result<(), Error> {
+        self.writer.require_any(
+            "GroupNonUniformBallot",
+            &[spirv::Capability::GroupNonUniformBallot],
+        )?;
+
+        let ballot_type = LookupType::Local(LocalType::Value {
+            vector_size: Some(crate::VectorSize::Quad),
+            scalar: crate::Scalar::U32,
+            pointer_space: None,
+        });
+        let ballot_type_id = self.get_type_id(ballot_type);
+        let scope_id = self.get_index_constant(spirv::Scope::Subgroup as u32);
+        let predicate_id = match *predicate {
+            Some(handle) => self.cached[handle],
+            None => self.writer.get_constant_scalar(crate::Literal::Bool(true)),
+        };
+
+        let ballot_id = self.gen_id();
+        block.body.push(Instruction::group_non_uniform_ballot(
+            ballot_type_id,
+            ballot_id,
+            scope_id,
+            predicate_id,
+        ));
+        self.cached[result] = ballot_id;
+        Ok(())
+    }
+    /// Emits the `OpGroupNonUniform*` instruction for a subgroup collective
+    /// operation at subgroup scope and caches the new id as the value of
+    /// `result`.
+    ///
+    /// `All` and `Any` require the `GroupNonUniformVote` capability and carry
+    /// no group-operation operand; every other operation requires
+    /// `GroupNonUniformArithmetic` and passes `collective_op` as the group
+    /// operation. The opcode is chosen from the scalar kind of the result
+    /// type.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a capability cannot be required, or
+    /// `Error::FeatureNotImplemented` if the result type / operation
+    /// combination has no lowering here (these cases used to panic).
+    pub(super) fn write_subgroup_operation(
+        &mut self,
+        op: &crate::SubgroupOperation,
+        collective_op: &crate::CollectiveOperation,
+        argument: Handle<crate::Expression>,
+        result: Handle<crate::Expression>,
+        block: &mut Block,
+    ) -> Result<(), Error> {
+        use crate::SubgroupOperation as sg;
+        match *op {
+            sg::All | sg::Any => {
+                self.writer.require_any(
+                    "GroupNonUniformVote",
+                    &[spirv::Capability::GroupNonUniformVote],
+                )?;
+            }
+            _ => {
+                self.writer.require_any(
+                    "GroupNonUniformArithmetic",
+                    &[spirv::Capability::GroupNonUniformArithmetic],
+                )?;
+            }
+        }
+
+        // Ids are generated in the same order as before so that the emitted
+        // module is unchanged for all supported inputs.
+        let id = self.gen_id();
+        let result_ty = &self.fun_info[result].ty;
+        let result_type_id = self.get_expression_type_id(result_ty);
+        let result_ty_inner = result_ty.inner_with(&self.ir_module.types);
+
+        let (is_scalar, scalar) = match *result_ty_inner {
+            TypeInner::Scalar(kind) => (true, kind),
+            TypeInner::Vector { scalar: kind, .. } => (false, kind),
+            _ => {
+                return Err(Error::FeatureNotImplemented(
+                    "subgroup operation on a type that is neither scalar nor vector",
+                ))
+            }
+        };
+
+        use crate::ScalarKind as sk;
+        let spirv_op = match (scalar.kind, *op) {
+            (sk::Bool, sg::All) if is_scalar => spirv::Op::GroupNonUniformAll,
+            (sk::Bool, sg::Any) if is_scalar => spirv::Op::GroupNonUniformAny,
+            (_, sg::All | sg::Any) => {
+                return Err(Error::FeatureNotImplemented(
+                    "subgroup all/any on a non-scalar or non-bool type",
+                ))
+            }
+
+            (sk::Sint | sk::Uint, sg::Add) => spirv::Op::GroupNonUniformIAdd,
+            (sk::Float, sg::Add) => spirv::Op::GroupNonUniformFAdd,
+            (sk::Sint | sk::Uint, sg::Mul) => spirv::Op::GroupNonUniformIMul,
+            (sk::Float, sg::Mul) => spirv::Op::GroupNonUniformFMul,
+            (sk::Sint, sg::Max) => spirv::Op::GroupNonUniformSMax,
+            (sk::Uint, sg::Max) => spirv::Op::GroupNonUniformUMax,
+            (sk::Float, sg::Max) => spirv::Op::GroupNonUniformFMax,
+            (sk::Sint, sg::Min) => spirv::Op::GroupNonUniformSMin,
+            (sk::Uint, sg::Min) => spirv::Op::GroupNonUniformUMin,
+            (sk::Float, sg::Min) => spirv::Op::GroupNonUniformFMin,
+            (_, sg::Add | sg::Mul | sg::Min | sg::Max) => {
+                return Err(Error::FeatureNotImplemented(
+                    "subgroup add/mul/min/max on a non-numeric type",
+                ))
+            }
+
+            (sk::Sint | sk::Uint, sg::And) => spirv::Op::GroupNonUniformBitwiseAnd,
+            (sk::Sint | sk::Uint, sg::Or) => spirv::Op::GroupNonUniformBitwiseOr,
+            (sk::Sint | sk::Uint, sg::Xor) => spirv::Op::GroupNonUniformBitwiseXor,
+            (sk::Bool, sg::And) => spirv::Op::GroupNonUniformLogicalAnd,
+            (sk::Bool, sg::Or) => spirv::Op::GroupNonUniformLogicalOr,
+            (sk::Bool, sg::Xor) => spirv::Op::GroupNonUniformLogicalXor,
+            (_, sg::And | sg::Or | sg::Xor) => {
+                return Err(Error::FeatureNotImplemented(
+                    "subgroup and/or/xor on a non-integer, non-bool type",
+                ))
+            }
+        };
+
+        let exec_scope_id = self.get_index_constant(spirv::Scope::Subgroup as u32);
+
+        use crate::CollectiveOperation as c;
+        // The vote instructions take no group-operation operand.
+        let group_op = match *op {
+            sg::All | sg::Any => None,
+            _ => Some(match *collective_op {
+                c::Reduce => spirv::GroupOperation::Reduce,
+                c::InclusiveScan => spirv::GroupOperation::InclusiveScan,
+                c::ExclusiveScan => spirv::GroupOperation::ExclusiveScan,
+            }),
+        };
+
+        let arg_id = self.cached[argument];
+        block.body.push(Instruction::group_non_uniform_arithmetic(
+            spirv_op,
+            result_type_id,
+            id,
+            exec_scope_id,
+            group_op,
+            arg_id,
+        ));
+        self.cached[result] = id;
+        Ok(())
+    }
+    /// Emits the instruction for a subgroup gather (`BroadcastFirst`,
+    /// `Broadcast` or one of the shuffles) at subgroup scope and caches the
+    /// new id as the value of `result`.
+    ///
+    /// `Broadcast` is lowered to `OpGroupNonUniformShuffle`, so it requires
+    /// the `GroupNonUniformShuffle` capability like `Shuffle` and
+    /// `ShuffleXor` do. Previously only `GroupNonUniformBallot` was requested
+    /// for it.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if a needed capability cannot be required.
+    pub(super) fn write_subgroup_gather(
+        &mut self,
+        mode: &crate::GatherMode,
+        argument: Handle<crate::Expression>,
+        result: Handle<crate::Expression>,
+        block: &mut Block,
+    ) -> Result<(), Error> {
+        use crate::GatherMode as Gm;
+
+        // Requested for every mode, as before; this already covers
+        // `BroadcastFirst`.
+        self.writer.require_any(
+            "GroupNonUniformBallot",
+            &[spirv::Capability::GroupNonUniformBallot],
+        )?;
+        match *mode {
+            Gm::BroadcastFirst => {}
+            Gm::Broadcast(_) | Gm::Shuffle(_) | Gm::ShuffleXor(_) => {
+                self.writer.require_any(
+                    "GroupNonUniformShuffle",
+                    &[spirv::Capability::GroupNonUniformShuffle],
+                )?;
+            }
+            Gm::ShuffleDown(_) | Gm::ShuffleUp(_) => {
+                self.writer.require_any(
+                    "GroupNonUniformShuffleRelative",
+                    &[spirv::Capability::GroupNonUniformShuffleRelative],
+                )?;
+            }
+        }
+
+        let id = self.gen_id();
+        let result_ty = &self.fun_info[result].ty;
+        let result_type_id = self.get_expression_type_id(result_ty);
+
+        let exec_scope_id = self.get_index_constant(spirv::Scope::Subgroup as u32);
+
+        let arg_id = self.cached[argument];
+
+        // Opcode and index expression for the indexed modes; `None` for
+        // `BroadcastFirst`, which takes no index.
+        let indexed = match *mode {
+            Gm::BroadcastFirst => None,
+            // Use shuffle to emit broadcast to allow the index to
+            // be dynamically uniform on Vulkan 1.1. The argument to
+            // OpGroupNonUniformBroadcast must be a constant pre SPIR-V
+            // 1.5 (vulkan 1.2)
+            Gm::Broadcast(index) | Gm::Shuffle(index) => {
+                Some((spirv::Op::GroupNonUniformShuffle, index))
+            }
+            Gm::ShuffleDown(index) => Some((spirv::Op::GroupNonUniformShuffleDown, index)),
+            Gm::ShuffleUp(index) => Some((spirv::Op::GroupNonUniformShuffleUp, index)),
+            Gm::ShuffleXor(index) => Some((spirv::Op::GroupNonUniformShuffleXor, index)),
+        };
+
+        match indexed {
+            None => {
+                block
+                    .body
+                    .push(Instruction::group_non_uniform_broadcast_first(
+                        result_type_id,
+                        id,
+                        exec_scope_id,
+                        arg_id,
+                    ));
+            }
+            Some((op, index)) => {
+                let index_id = self.cached[index];
+                block.body.push(Instruction::group_non_uniform_gather(
+                    op,
+                    result_type_id,
+                    id,
+                    exec_scope_id,
+                    arg_id,
+                    index_id,
+                ));
+            }
+        }
+        self.cached[result] = id;
+        Ok(())
+    }
+}
--- a/naga/src/back/spv/writer.rs
+++ b/naga/src/back/spv/writer.rs
@ -565,36 +565,38 @@ impl Writer {
            // Handle globals are pre-emitted and should be loaded automatically.
            //
            // Any that are binding arrays we skip as we cannot load the array, we must load the result after indexing.
-            let is_binding_array = match ir_module.types[var.ty].inner {
-                crate::TypeInner::BindingArray { .. } => true,
-                _ => false,
-            };
-
-            if var.space == crate::AddressSpace::Handle && !is_binding_array {
-                let var_type_id = self.get_type_id(LookupType::Handle(var.ty));
-                let id = self.id_gen.next();
-                prelude
-                    .body
-                    .push(Instruction::load(var_type_id, id, gv.var_id, None));
-                gv.access_id = gv.var_id;
-                gv.handle_id = id;
-            } else if global_needs_wrapper(ir_module, var) {
-                let class = map_storage_class(var.space);
-                let pointer_type_id = self.get_pointer_id(&ir_module.types, var.ty, class)?;
-                let index_id = self.get_index_constant(0);
-
-                let id = self.id_gen.next();
-                prelude.body.push(Instruction::access_chain(
-                    pointer_type_id,
-                    id,
-                    gv.var_id,
-                    &[index_id],
-                ));
-                gv.access_id = id;
-            } else {
-                // by default, the variable ID is accessed as is
-                gv.access_id = gv.var_id;
-            };
+            match ir_module.types[var.ty].inner {
+                crate::TypeInner::BindingArray { .. } => {
+                    gv.access_id = gv.var_id;
+                }
+                _ => {
+                    if var.space == crate::AddressSpace::Handle {
+                        let var_type_id = self.get_type_id(LookupType::Handle(var.ty));
+                        let id = self.id_gen.next();
+                        prelude
+                            .body
+                            .push(Instruction::load(var_type_id, id, gv.var_id, None));
+                        gv.access_id = gv.var_id;
+                        gv.handle_id = id;
+                    } else if global_needs_wrapper(ir_module, var) {
+                        let class = map_storage_class(var.space);
+                        let pointer_type_id =
+                            self.get_pointer_id(&ir_module.types, var.ty, class)?;
+                        let index_id = self.get_index_constant(0);
+                        let id = self.id_gen.next();
+                        prelude.body.push(Instruction::access_chain(
+                            pointer_type_id,
+                            id,
+                            gv.var_id,
+                            &[index_id],
+                        ));
+                        gv.access_id = id;
+                    } else {
+                        // by default, the variable ID is accessed as is
+                        gv.access_id = gv.var_id;
+                    };
+                }
+            }

            // work around borrow checking in the presence of `self.xxx()` calls
            self.global_variables[handle.index()] = gv;
@ -613,7 +615,7 @@ impl Writer {
            // Steal the Writer's temp list for a bit.
            temp_list: std::mem::take(&mut self.temp_list),
            writer: self,
-            expression_constness: crate::proc::ExpressionConstnessTracker::from_arena(
+            expression_constness: super::ExpressionConstnessTracker::from_arena(
                &ir_function.expressions,
            ),
        };
@ -968,6 +970,11 @@ impl Writer {
        handle: Handle<crate::Type>,
    ) -> Result<Word, Error> {
        let ty = &arena[handle];
+        // If it's a type that needs SPIR-V capabilities, request them now.
+        // This needs to happen regardless of the LocalType lookup succeeding,
+        // because some types which map to the same LocalType have different
+        // capability requirements. See https://github.com/gfx-rs/wgpu/issues/5569
+        self.request_type_capabilities(&ty.inner)?;
        let id = if let Some(local) = make_local(&ty.inner) {
            // This type can be represented as a `LocalType`, so check if we've
            // already written an instruction for it. If not, do so now, with
@ -983,10 +990,6 @@ impl Writer {

                    self.write_type_declaration_local(id, local);

-                    // If it's a type that needs SPIR-V capabilities, request them now,
-                    // so write_type_declaration_local can stay infallible.
-                    self.request_type_capabilities(&ty.inner)?;
-
                    id
                }
            }
@ -1148,7 +1151,7 @@ impl Writer {
    }

    pub(super) fn get_constant_scalar(&mut self, value: crate::Literal) -> Word {
-        let scalar = CachedConstant::Literal(value);
+        let scalar = CachedConstant::Literal(value.into());
        if let Some(&id) = self.cached_constants.get(&scalar) {
            return id;
        }
@ -1256,7 +1259,7 @@ impl Writer {
        ir_module: &crate::Module,
        mod_info: &ModuleInfo,
    ) -> Result<Word, Error> {
-        let id = match ir_module.const_expressions[handle] {
+        let id = match ir_module.global_expressions[handle] {
            crate::Expression::Literal(literal) => self.get_constant_scalar(literal),
            crate::Expression::Constant(constant) => {
                let constant = &ir_module.constants[constant];
@ -1270,7 +1273,7 @@ impl Writer {
                let component_ids: Vec<_> = crate::proc::flatten_compose(
                    ty,
                    components,
-                    &ir_module.const_expressions,
+                    &ir_module.global_expressions,
                    &ir_module.types,
                )
                .map(|component| self.constant_ids[component.index()])
@ -1308,7 +1311,11 @@ impl Writer {
            spirv::MemorySemantics::WORKGROUP_MEMORY,
            flags.contains(crate::Barrier::WORK_GROUP),
        );
-        let exec_scope_id = self.get_index_constant(spirv::Scope::Workgroup as u32);
+        let exec_scope_id = if flags.contains(crate::Barrier::SUB_GROUP) {
+            self.get_index_constant(spirv::Scope::Subgroup as u32)
+        } else {
+            self.get_index_constant(spirv::Scope::Workgroup as u32)
+        };
        let mem_scope_id = self.get_index_constant(memory_scope as u32);
        let semantics_id = self.get_index_constant(semantics.bits());
        block.body.push(Instruction::control_barrier(
@ -1583,6 +1590,41 @@ impl Writer {
                    Bi::WorkGroupId => BuiltIn::WorkgroupId,
                    Bi::WorkGroupSize => BuiltIn::WorkgroupSize,
                    Bi::NumWorkGroups => BuiltIn::NumWorkgroups,
+                    // Subgroup
+                    Bi::NumSubgroups => {
+                        self.require_any(
+                            "`num_subgroups` built-in",
+                            &[spirv::Capability::GroupNonUniform],
+                        )?;
+                        BuiltIn::NumSubgroups
+                    }
+                    Bi::SubgroupId => {
+                        self.require_any(
+                            "`subgroup_id` built-in",
+                            &[spirv::Capability::GroupNonUniform],
+                        )?;
+                        BuiltIn::SubgroupId
+                    }
+                    Bi::SubgroupSize => {
+                        self.require_any(
+                            "`subgroup_size` built-in",
+                            &[
+                                spirv::Capability::GroupNonUniform,
+                                spirv::Capability::SubgroupBallotKHR,
+                            ],
+                        )?;
+                        BuiltIn::SubgroupSize
+                    }
+                    Bi::SubgroupInvocationId => {
+                        self.require_any(
+                            "`subgroup_invocation_id` built-in",
+                            &[
+                                spirv::Capability::GroupNonUniform,
+                                spirv::Capability::SubgroupBallotKHR,
+                            ],
+                        )?;
+                        BuiltIn::SubgroupLocalInvocationId
+                    }
                };

                self.decorate(id, Decoration::BuiltIn, &[built_in as u32]);
@ -1858,9 +1900,15 @@ impl Writer {
            .iter()
            .flat_map(|entry| entry.function.arguments.iter())
            .any(|arg| has_view_index_check(ir_module, arg.binding.as_ref(), arg.ty));
-        let has_ray_query = ir_module.special_types.ray_desc.is_some()
+        let mut has_ray_query = ir_module.special_types.ray_desc.is_some()
            | ir_module.special_types.ray_intersection.is_some();

+        for (_, &crate::Type { ref inner, .. }) in ir_module.types.iter() {
+            if let &crate::TypeInner::AccelerationStructure | &crate::TypeInner::RayQuery = inner {
+                has_ray_query = true
+            }
+        }
+
        if self.physical_layout.version < 0x10300 && has_storage_buffers {
            // enable the storage buffer class on < SPV-1.3
            Instruction::extension("SPV_KHR_storage_buffer_storage_class")
@ -1891,7 +1939,7 @@ impl Writer {
                    source_code: debug_info.source_code,
                    source_file_id,
                });
-                self.debugs.push(Instruction::source(
+                self.debugs.append(&mut Instruction::source_auto_continued(
                    spirv::SourceLanguage::Unknown,
                    0,
                    &debug_info_inner,
@ -1906,8 +1954,8 @@ impl Writer {

        // write all const-expressions as constants
        self.constant_ids
-            .resize(ir_module.const_expressions.len(), 0);
-        for (handle, _) in ir_module.const_expressions.iter() {
+            .resize(ir_module.global_expressions.len(), 0);
+        for (handle, _) in ir_module.global_expressions.iter() {
            self.write_constant_expr(handle, ir_module, mod_info)?;
        }
        debug_assert!(self.constant_ids.iter().all(|&id| id != 0));
@ -2021,6 +2069,10 @@ impl Writer {
        debug_info: &Option<DebugInfo>,
        words: &mut Vec<Word>,
    ) -> Result<(), Error> {
+        if !ir_module.overrides.is_empty() {
+            return Err(Error::Override);
+        }
+
        self.reset();

        // Try to find the entry point and corresponding index
--- a/naga/src/back/wgsl/writer.rs
+++ b/naga/src/back/wgsl/writer.rs
@ -106,6 +106,12 @@ impl<W: Write> Writer<W> {
    }

    pub fn write(&mut self, module: &Module, info: &valid::ModuleInfo) -> BackendResult {
+        if !module.overrides.is_empty() {
+            return Err(Error::Unimplemented(
+                "Pipeline constants are not yet supported for this back-end".to_string(),
+            ));
+        }
+
        self.reset(module);

        // Save all ep result types
@ -918,8 +924,124 @@ impl<W: Write> Writer<W> {
                if barrier.contains(crate::Barrier::WORK_GROUP) {
                    writeln!(self.out, "{level}workgroupBarrier();")?;
                }
+
+                if barrier.contains(crate::Barrier::SUB_GROUP) {
+                    writeln!(self.out, "{level}subgroupBarrier();")?;
+                }
            }
            Statement::RayQuery { .. } => unreachable!(),
+            Statement::SubgroupBallot { result, predicate } => {
+                write!(self.out, "{level}")?;
+                let res_name = format!("{}{}", back::BAKE_PREFIX, result.index());
+                self.start_named_expr(module, result, func_ctx, &res_name)?;
+                self.named_expressions.insert(result, res_name);
+
+                write!(self.out, "subgroupBallot(")?;
+                if let Some(predicate) = predicate {
+                    self.write_expr(module, predicate, func_ctx)?;
+                }
+                writeln!(self.out, ");")?;
+            }
+            Statement::SubgroupCollectiveOperation {
+                op,
+                collective_op,
+                argument,
+                result,
+            } => {
+                write!(self.out, "{level}")?;
+                let res_name = format!("{}{}", back::BAKE_PREFIX, result.index());
+                self.start_named_expr(module, result, func_ctx, &res_name)?;
+                self.named_expressions.insert(result, res_name);
+
+                match (collective_op, op) {
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::All) => {
+                        write!(self.out, "subgroupAll(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Any) => {
+                        write!(self.out, "subgroupAny(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Add) => {
+                        write!(self.out, "subgroupAdd(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Mul) => {
+                        write!(self.out, "subgroupMul(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Max) => {
+                        write!(self.out, "subgroupMax(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Min) => {
+                        write!(self.out, "subgroupMin(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::And) => {
+                        write!(self.out, "subgroupAnd(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Or) => {
+                        write!(self.out, "subgroupOr(")?
+                    }
+                    (crate::CollectiveOperation::Reduce, crate::SubgroupOperation::Xor) => {
+                        write!(self.out, "subgroupXor(")?
+                    }
+                    (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Add) => {
+                        write!(self.out, "subgroupExclusiveAdd(")?
+                    }
+                    (crate::CollectiveOperation::ExclusiveScan, crate::SubgroupOperation::Mul) => {
+                        write!(self.out, "subgroupExclusiveMul(")?
+                    }
+                    (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Add) => {
+                        write!(self.out, "subgroupInclusiveAdd(")?
+                    }
+                    (crate::CollectiveOperation::InclusiveScan, crate::SubgroupOperation::Mul) => {
+                        write!(self.out, "subgroupInclusiveMul(")?
+                    }
+                    _ => unimplemented!(),
+                }
+                self.write_expr(module, argument, func_ctx)?;
+                writeln!(self.out, ");")?;
+            }
+            Statement::SubgroupGather {
+                mode,
+                argument,
+                result,
+            } => {
+                write!(self.out, "{level}")?;
+                let res_name = format!("{}{}", back::BAKE_PREFIX, result.index());
+                self.start_named_expr(module, result, func_ctx, &res_name)?;
+                self.named_expressions.insert(result, res_name);
+
+                match mode {
+                    crate::GatherMode::BroadcastFirst => {
+                        write!(self.out, "subgroupBroadcastFirst(")?;
+                    }
+                    crate::GatherMode::Broadcast(_) => {
+                        write!(self.out, "subgroupBroadcast(")?;
+                    }
+                    crate::GatherMode::Shuffle(_) => {
+                        write!(self.out, "subgroupShuffle(")?;
+                    }
+                    crate::GatherMode::ShuffleDown(_) => {
+                        write!(self.out, "subgroupShuffleDown(")?;
+                    }
+                    crate::GatherMode::ShuffleUp(_) => {
+                        write!(self.out, "subgroupShuffleUp(")?;
+                    }
+                    crate::GatherMode::ShuffleXor(_) => {
+                        write!(self.out, "subgroupShuffleXor(")?;
+                    }
+                }
+                self.write_expr(module, argument, func_ctx)?;
+                match mode {
+                    crate::GatherMode::BroadcastFirst => {}
+                    crate::GatherMode::Broadcast(index)
+                    | crate::GatherMode::Shuffle(index)
+                    | crate::GatherMode::ShuffleDown(index)
+                    | crate::GatherMode::ShuffleUp(index)
+                    | crate::GatherMode::ShuffleXor(index) => {
+                        write!(self.out, ", ")?;
+                        self.write_expr(module, index, func_ctx)?;
+                    }
+                }
+                writeln!(self.out, ");")?;
+            }
        }

        Ok(())
@ -974,7 +1096,7 @@ impl<W: Write> Writer<W> {
            Ex::Access { base, .. } | Ex::AccessIndex { base, .. } => {
                let base_ty = func_ctx.resolve_type(base, &module.types);
                match *base_ty {
-                    crate::TypeInner::Pointer { .. } | crate::TypeInner::ValuePointer { .. } => {
+                    TypeInner::Pointer { .. } | TypeInner::ValuePointer { .. } => {
                        Indirection::Reference
                    }
                    _ => Indirection::Ordinary,
@ -1070,7 +1192,7 @@ impl<W: Write> Writer<W> {
        self.write_possibly_const_expression(
            module,
            expr,
-            &module.const_expressions,
+            &module.global_expressions,
            |writer, expr| writer.write_const_expression(module, expr),
        )
    }
@ -1199,6 +1321,7 @@ impl<W: Write> Writer<W> {
                    |writer, expr| writer.write_expr(module, expr, func_ctx),
                )?;
            }
+            Expression::Override(_) => unreachable!(),
            Expression::FunctionArgument(pos) => {
                let name_key = func_ctx.argument_key(pos);
                let name = &self.names[&name_key];
@ -1593,12 +1716,16 @@ impl<W: Write> Writer<W> {
                    Mf::Pack2x16snorm => Function::Regular("pack2x16snorm"),
                    Mf::Pack2x16unorm => Function::Regular("pack2x16unorm"),
                    Mf::Pack2x16float => Function::Regular("pack2x16float"),
+                    Mf::Pack4xI8 => Function::Regular("pack4xI8"),
+                    Mf::Pack4xU8 => Function::Regular("pack4xU8"),
                    // data unpacking
                    Mf::Unpack4x8snorm => Function::Regular("unpack4x8snorm"),
                    Mf::Unpack4x8unorm => Function::Regular("unpack4x8unorm"),
                    Mf::Unpack2x16snorm => Function::Regular("unpack2x16snorm"),
                    Mf::Unpack2x16unorm => Function::Regular("unpack2x16unorm"),
                    Mf::Unpack2x16float => Function::Regular("unpack2x16float"),
+                    Mf::Unpack4xI8 => Function::Regular("unpack4xI8"),
+                    Mf::Unpack4xU8 => Function::Regular("unpack4xU8"),
                    Mf::Inverse | Mf::Outer => {
                        return Err(Error::UnsupportedMathFunction(fun));
                    }
@ -1691,6 +1818,8 @@ impl<W: Write> Writer<W> {
            Expression::CallResult(_)
            | Expression::AtomicResult { .. }
            | Expression::RayQueryProceedResult
+            | Expression::SubgroupBallotResult
+            | Expression::SubgroupOperationResult { .. }
            | Expression::WorkGroupUniformLoadResult { .. } => {}
        }

@ -1792,6 +1921,10 @@ fn builtin_str(built_in: crate::BuiltIn) -> Result<&'static str, Error> {
        Bi::SampleMask => "sample_mask",
        Bi::PrimitiveIndex => "primitive_index",
        Bi::ViewIndex => "view_index",
+        Bi::NumSubgroups => "num_subgroups",
+        Bi::SubgroupId => "subgroup_id",
+        Bi::SubgroupSize => "subgroup_size",
+        Bi::SubgroupInvocationId => "subgroup_invocation_id",
        Bi::BaseInstance
        | Bi::BaseVertex
        | Bi::ClipDistance
--- a/naga/src/block.rs
+++ b/naga/src/block.rs
@ -65,6 +65,12 @@ impl Block {
        self.span_info.splice(range.clone(), other.span_info);
        self.body.splice(range, other.body);
    }
+
+    /// Consumes the block, yielding each statement paired with its span.
+    pub fn span_into_iter(self) -> impl Iterator<Item = (Statement, Span)> {
+        self.body.into_iter().zip(self.span_info)
+    }
+
    pub fn span_iter(&self) -> impl Iterator<Item = (&Statement, &Span)> {
        let span_iter = self.span_info.iter();
        self.body.iter().zip(span_iter)
--- a/naga/src/compact/expressions.rs
+++ b/naga/src/compact/expressions.rs
@ -3,6 +3,7 @@ use crate::arena::{Arena, Handle};

 pub struct ExpressionTracer<'tracer> {
    pub constants: &'tracer Arena<crate::Constant>,
+    pub overrides: &'tracer Arena<crate::Override>,

    /// The arena in which we are currently tracing expressions.
    pub expressions: &'tracer Arena<crate::Expression>,
@ -20,11 +21,11 @@ pub struct ExpressionTracer<'tracer> {
    /// the module's constant expression arena.
    pub expressions_used: &'tracer mut HandleSet<crate::Expression>,

-    /// The used set for the module's `const_expressions` arena.
+    /// The used set for the module's `global_expressions` arena.
    ///
    /// If `None`, we are already tracing the constant expressions,
    /// and `expressions_used` already refers to their handle set.
-    pub const_expressions_used: Option<&'tracer mut HandleSet<crate::Expression>>,
+    pub global_expressions_used: Option<&'tracer mut HandleSet<crate::Expression>>,
 }

 impl<'tracer> ExpressionTracer<'tracer> {
@ -39,11 +40,11 @@ impl<'tracer> ExpressionTracer<'tracer> {
    /// marked.
    ///
    /// [fe]: crate::Function::expressions
-    /// [ce]: crate::Module::const_expressions
+    /// [ce]: crate::Module::global_expressions
    pub fn trace_expressions(&mut self) {
        log::trace!(
            "entering trace_expression of {}",
-            if self.const_expressions_used.is_some() {
+            if self.global_expressions_used.is_some() {
                "function expressions"
            } else {
                "const expressions"
@ -71,6 +72,7 @@ impl<'tracer> ExpressionTracer<'tracer> {
                | Ex::GlobalVariable(_)
                | Ex::LocalVariable(_)
                | Ex::CallResult(_)
+                | Ex::SubgroupBallotResult
                | Ex::RayQueryProceedResult => {}

                Ex::Constant(handle) => {
@ -83,11 +85,16 @@ impl<'tracer> ExpressionTracer<'tracer> {
                    // and the constant refers to the initializer, it must
                    // precede `expr` in the arena.
                    let init = self.constants[handle].init;
-                    match self.const_expressions_used {
+                    match self.global_expressions_used {
                        Some(ref mut used) => used.insert(init),
                        None => self.expressions_used.insert(init),
                    }
                }
+                Ex::Override(_) => {
+                    // All overrides are considered used by definition. We mark
+                    // their types and initialization expressions as used in
+                    // `compact::compact`, so we have no more work to do here.
+                }
                Ex::ZeroValue(ty) => self.types_used.insert(ty),
                Ex::Compose { ty, ref components } => {
                    self.types_used.insert(ty);
@ -116,7 +123,7 @@ impl<'tracer> ExpressionTracer<'tracer> {
                    self.expressions_used
                        .insert_iter([image, sampler, coordinate]);
                    self.expressions_used.insert_iter(array_index);
-                    match self.const_expressions_used {
+                    match self.global_expressions_used {
                        Some(ref mut used) => used.insert_iter(offset),
                        None => self.expressions_used.insert_iter(offset),
                    }
@ -186,6 +193,7 @@ impl<'tracer> ExpressionTracer<'tracer> {
                Ex::AtomicResult { ty, comparison: _ } => self.types_used.insert(ty),
                Ex::WorkGroupUniformLoadResult { ty } => self.types_used.insert(ty),
                Ex::ArrayLength(expr) => self.expressions_used.insert(expr),
+                Ex::SubgroupOperationResult { ty } => self.types_used.insert(ty),
                Ex::RayQueryGetIntersection {
                    query,
                    committed: _,
@ -217,8 +225,12 @@ impl ModuleMap {
            | Ex::GlobalVariable(_)
            | Ex::LocalVariable(_)
            | Ex::CallResult(_)
+            | Ex::SubgroupBallotResult
            | Ex::RayQueryProceedResult => {}

+            // All overrides are retained, so their handles never change.
+            Ex::Override(_) => {}
+
            // Expressions that contain handles that need to be adjusted.
            Ex::Constant(ref mut constant) => self.constants.adjust(constant),
            Ex::ZeroValue(ref mut ty) => self.types.adjust(ty),
@ -267,7 +279,7 @@ impl ModuleMap {
                adjust(coordinate);
                operand_map.adjust_option(array_index);
                if let Some(ref mut offset) = *offset {
-                    self.const_expressions.adjust(offset);
+                    self.global_expressions.adjust(offset);
                }
                self.adjust_sample_level(level, operand_map);
                operand_map.adjust_option(depth_ref);
@ -344,6 +356,7 @@ impl ModuleMap {
                comparison: _,
            } => self.types.adjust(ty),
            Ex::WorkGroupUniformLoadResult { ref mut ty } => self.types.adjust(ty),
+            Ex::SubgroupOperationResult { ref mut ty } => self.types.adjust(ty),
            Ex::ArrayLength(ref mut expr) => adjust(expr),
            Ex::RayQueryGetIntersection {
                ref mut query,
--- a/naga/src/compact/functions.rs
+++ b/naga/src/compact/functions.rs
@ -4,10 +4,11 @@ use super::{FunctionMap, ModuleMap};
 pub struct FunctionTracer<'a> {
    pub function: &'a crate::Function,
    pub constants: &'a crate::Arena<crate::Constant>,
+    pub overrides: &'a crate::Arena<crate::Override>,

    pub types_used: &'a mut HandleSet<crate::Type>,
    pub constants_used: &'a mut HandleSet<crate::Constant>,
-    pub const_expressions_used: &'a mut HandleSet<crate::Expression>,
+    pub global_expressions_used: &'a mut HandleSet<crate::Expression>,

    /// Function-local expressions used.
    pub expressions_used: HandleSet<crate::Expression>,
@ -47,12 +48,13 @@ impl<'a> FunctionTracer<'a> {
    fn as_expression(&mut self) -> super::expressions::ExpressionTracer {
        super::expressions::ExpressionTracer {
            constants: self.constants,
+            overrides: self.overrides,
            expressions: &self.function.expressions,

            types_used: self.types_used,
            constants_used: self.constants_used,
            expressions_used: &mut self.expressions_used,
-            const_expressions_used: Some(&mut self.const_expressions_used),
+            global_expressions_used: Some(&mut self.global_expressions_used),
        }
    }
 }
--- a/naga/src/compact/mod.rs
+++ b/naga/src/compact/mod.rs
@ -38,7 +38,7 @@ pub fn compact(module: &mut crate::Module) {
            log::trace!("tracing global {:?}", global.name);
            module_tracer.types_used.insert(global.ty);
            if let Some(init) = global.init {
-                module_tracer.const_expressions_used.insert(init);
+                module_tracer.global_expressions_used.insert(init);
            }
        }
    }
@ -50,7 +50,15 @@ pub fn compact(module: &mut crate::Module) {
    for (handle, constant) in module.constants.iter() {
        if constant.name.is_some() {
            module_tracer.constants_used.insert(handle);
-            module_tracer.const_expressions_used.insert(constant.init);
+            module_tracer.global_expressions_used.insert(constant.init);
+        }
+    }
+
+    // We treat all overrides as used by definition.
+    for (_, override_) in module.overrides.iter() {
+        module_tracer.types_used.insert(override_.ty);
+        if let Some(init) = override_.init {
+            module_tracer.global_expressions_used.insert(init);
        }
    }

@ -137,9 +145,9 @@ pub fn compact(module: &mut crate::Module) {

    // Drop unused constant expressions, reusing existing storage.
    log::trace!("adjusting constant expressions");
-    module.const_expressions.retain_mut(|handle, expr| {
-        if module_map.const_expressions.used(handle) {
-            module_map.adjust_expression(expr, &module_map.const_expressions);
+    module.global_expressions.retain_mut(|handle, expr| {
+        if module_map.global_expressions.used(handle) {
+            module_map.adjust_expression(expr, &module_map.global_expressions);
            true
        } else {
            false
@ -151,20 +159,29 @@ pub fn compact(module: &mut crate::Module) {
    module.constants.retain_mut(|handle, constant| {
        if module_map.constants.used(handle) {
            module_map.types.adjust(&mut constant.ty);
-            module_map.const_expressions.adjust(&mut constant.init);
+            module_map.global_expressions.adjust(&mut constant.init);
            true
        } else {
            false
        }
    });

+    // Adjust override types and initializers.
+    log::trace!("adjusting overrides");
+    for (_, override_) in module.overrides.iter_mut() {
+        module_map.types.adjust(&mut override_.ty);
+        if let Some(init) = override_.init.as_mut() {
+            module_map.global_expressions.adjust(init);
+        }
+    }
+
    // Adjust global variables' types and initializers.
    log::trace!("adjusting global variables");
    for (_, global) in module.global_variables.iter_mut() {
        log::trace!("adjusting global {:?}", global.name);
        module_map.types.adjust(&mut global.ty);
        if let Some(ref mut init) = global.init {
-            module_map.const_expressions.adjust(init);
+            module_map.global_expressions.adjust(init);
        }
    }

@ -193,7 +210,7 @@ struct ModuleTracer<'module> {
    module: &'module crate::Module,
    types_used: HandleSet<crate::Type>,
    constants_used: HandleSet<crate::Constant>,
-    const_expressions_used: HandleSet<crate::Expression>,
+    global_expressions_used: HandleSet<crate::Expression>,
 }

 impl<'module> ModuleTracer<'module> {
@ -202,7 +219,7 @@ impl<'module> ModuleTracer<'module> {
            module,
            types_used: HandleSet::for_arena(&module.types),
            constants_used: HandleSet::for_arena(&module.constants),
-            const_expressions_used: HandleSet::for_arena(&module.const_expressions),
+            global_expressions_used: HandleSet::for_arena(&module.global_expressions),
        }
    }

@ -233,12 +250,13 @@ impl<'module> ModuleTracer<'module> {

    fn as_const_expression(&mut self) -> expressions::ExpressionTracer {
        expressions::ExpressionTracer {
-            expressions: &self.module.const_expressions,
+            expressions: &self.module.global_expressions,
            constants: &self.module.constants,
+            overrides: &self.module.overrides,
            types_used: &mut self.types_used,
            constants_used: &mut self.constants_used,
-            expressions_used: &mut self.const_expressions_used,
-            const_expressions_used: None,
+            expressions_used: &mut self.global_expressions_used,
+            global_expressions_used: None,
        }
    }

@ -249,9 +267,10 @@ impl<'module> ModuleTracer<'module> {
        FunctionTracer {
            function,
            constants: &self.module.constants,
+            overrides: &self.module.overrides,
            types_used: &mut self.types_used,
            constants_used: &mut self.constants_used,
-            const_expressions_used: &mut self.const_expressions_used,
+            global_expressions_used: &mut self.global_expressions_used,
            expressions_used: HandleSet::for_arena(&function.expressions),
        }
    }
@ -260,7 +279,7 @@ impl<'module> ModuleTracer<'module> {
 struct ModuleMap {
    types: HandleMap<crate::Type>,
    constants: HandleMap<crate::Constant>,
-    const_expressions: HandleMap<crate::Expression>,
+    global_expressions: HandleMap<crate::Expression>,
 }

 impl From<ModuleTracer<'_>> for ModuleMap {
@ -268,7 +287,7 @@ impl From<ModuleTracer<'_>> for ModuleMap {
        ModuleMap {
            types: HandleMap::from_set(used.types_used),
            constants: HandleMap::from_set(used.constants_used),
-            const_expressions: HandleMap::from_set(used.const_expressions_used),
+            global_expressions: HandleMap::from_set(used.global_expressions_used),
        }
    }
 }
--- a/naga/src/compact/statements.rs
+++ b/naga/src/compact/statements.rs
@ -97,6 +97,39 @@ impl FunctionTracer<'_> {
                        self.expressions_used.insert(query);
                        self.trace_ray_query_function(fun);
                    }
+                    St::SubgroupBallot { result, predicate } => {
+                        if let Some(predicate) = predicate {
+                            self.expressions_used.insert(predicate)
+                        }
+                        self.expressions_used.insert(result)
+                    }
+                    St::SubgroupCollectiveOperation {
+                        op: _,
+                        collective_op: _,
+                        argument,
+                        result,
+                    } => {
+                        self.expressions_used.insert(argument);
+                        self.expressions_used.insert(result)
+                    }
+                    St::SubgroupGather {
+                        mode,
+                        argument,
+                        result,
+                    } => {
+                        match mode {
+                            crate::GatherMode::BroadcastFirst => {}
+                            crate::GatherMode::Broadcast(index)
+                            | crate::GatherMode::Shuffle(index)
+                            | crate::GatherMode::ShuffleDown(index)
+                            | crate::GatherMode::ShuffleUp(index)
+                            | crate::GatherMode::ShuffleXor(index) => {
+                                self.expressions_used.insert(index)
+                            }
+                        }
+                        self.expressions_used.insert(argument);
+                        self.expressions_used.insert(result)
+                    }

                    // Trivial statements.
                    St::Break
@ -250,6 +283,40 @@ impl FunctionMap {
                        adjust(query);
                        self.adjust_ray_query_function(fun);
                    }
+                    St::SubgroupBallot {
+                        ref mut result,
+                        ref mut predicate,
+                    } => {
+                        if let Some(ref mut predicate) = *predicate {
+                            adjust(predicate);
+                        }
+                        adjust(result);
+                    }
+                    St::SubgroupCollectiveOperation {
+                        op: _,
+                        collective_op: _,
+                        ref mut argument,
+                        ref mut result,
+                    } => {
+                        adjust(argument);
+                        adjust(result);
+                    }
+                    St::SubgroupGather {
+                        ref mut mode,
+                        ref mut argument,
+                        ref mut result,
+                    } => {
+                        match *mode {
+                            crate::GatherMode::BroadcastFirst => {}
+                            crate::GatherMode::Broadcast(ref mut index)
+                            | crate::GatherMode::Shuffle(ref mut index)
+                            | crate::GatherMode::ShuffleDown(ref mut index)
+                            | crate::GatherMode::ShuffleUp(ref mut index)
+                            | crate::GatherMode::ShuffleXor(ref mut index) => adjust(index),
+                        }
+                        adjust(argument);
+                        adjust(result);
+                    }

                    // Trivial statements.
                    St::Break
--- a/naga/src/error.rs
+++ b/naga/src/error.rs
@ -0,0 +1,74 @@
+use std::{error::Error, fmt};
+
+/// A shader error bundled with the shader text it refers to.
+///
+/// `Display` is implemented separately for each supported error type `E`
+/// (front-end parse errors and validation errors); every implementation
+/// renders `inner` against `source`.
+#[derive(Debug, Clone)]
+pub struct ShaderError<E> {
+    /// The source code of the shader.
+    pub source: String,
+    /// Optional name of the shader, used by the `Display` implementations;
+    /// an absent label is treated as an empty string there.
+    pub label: Option<String>,
+    /// The underlying error, boxed. Returned from `Error::source`.
+    pub inner: Box<E>,
+}
+
+#[cfg(feature = "wgsl-in")]
+impl fmt::Display for ShaderError<crate::front::wgsl::ParseError> {
+    /// Writes the WGSL parse error, rendered against the shader source by
+    /// `emit_to_string`, preceded by the quoted shader label.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Render the error report first; it only needs the shader text.
+        let string = self.inner.emit_to_string(&self.source);
+        // A missing label is printed as an empty name between the quotes.
+        let label = self.label.as_ref().map_or("", String::as_str);
+        write!(f, "\nShader '{label}' parsing {string}")
+    }
+}
+#[cfg(feature = "glsl-in")]
+impl fmt::Display for ShaderError<crate::front::glsl::ParseErrors> {
+    /// Writes the GLSL parse errors, rendered against the shader source by
+    /// `emit_to_string`, preceded by the quoted shader label.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Render the error report first; it only needs the shader text.
+        let string = self.inner.emit_to_string(&self.source);
+        // A missing label is printed as an empty name between the quotes.
+        let label = self.label.as_ref().map_or("", String::as_str);
+        write!(f, "\nShader '{label}' parsing {string}")
+    }
+}
+#[cfg(feature = "spv-in")]
+impl fmt::Display for ShaderError<crate::front::spv::Error> {
+    /// Writes the SPIR-V front-end error, rendered against `self.source` by
+    /// `emit_to_string`, preceded by the quoted shader label.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Render the error report first; it only needs the stored source text.
+        let string = self.inner.emit_to_string(&self.source);
+        // A missing label is printed as an empty name between the quotes.
+        let label = self.label.as_ref().map_or("", String::as_str);
+        write!(f, "\nShader '{label}' parsing {string}")
+    }
+}
+impl fmt::Display for ShaderError<crate::WithSpan<crate::valid::ValidationError>> {
+    /// Writes the validation error as an uncolored `codespan_reporting`
+    /// diagnostic, prefixed with `"\nShader validation "`.
+    ///
+    /// Each span recorded on the inner error becomes a primary label carrying
+    /// that span's description; the shader label is used as the file name.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a recorded span cannot be converted to a range, or if
+    /// emitting the diagnostic into the in-memory buffer fails.
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        use codespan_reporting::{
+            diagnostic::{Diagnostic, Label},
+            files::SimpleFile,
+            term,
+        };
+
+        // One primary label per (span, description) pair of the inner error.
+        let labels: Vec<_> = self
+            .inner
+            .spans()
+            .map(|&(span, ref desc)| {
+                Label::primary((), span.to_range().unwrap()).with_message(desc.to_owned())
+            })
+            .collect();
+        let diagnostic = Diagnostic::error().with_labels(labels);
+
+        // A single in-memory "file": the shader text, named after the label.
+        let name = self.label.as_deref().unwrap_or_default();
+        let files = SimpleFile::new(name, &self.source);
+        let config = term::Config::default();
+
+        // Emit into a byte buffer with color output disabled.
+        let mut buffer = termcolor::NoColor::new(Vec::new());
+        term::emit(&mut buffer, &config, &files, &diagnostic).expect("cannot write error");
+
+        let rendered = buffer.into_inner();
+        write!(f, "\nShader validation {}", String::from_utf8_lossy(&rendered))
+    }
+}
+/// Makes `ShaderError<E>` a standard error whose source is the wrapped `E`.
+///
+/// Only applies to those `E` for which `ShaderError<E>` implements `Display`.
+impl<E> Error for ShaderError<E>
+where
+    ShaderError<E>: fmt::Display,
+    E: Error + 'static,
+{
+    /// Returns the inner error as the cause of this error; never `None`.
+    fn source(&self) -> Option<&(dyn Error + 'static)> {
+        // Deref through the `Box` so the trait object is built from `E` itself.
+        // Coercing `&Box<E>` instead would make the object's concrete type
+        // `Box<E>`, and `downcast_ref::<E>()` on the result would always fail.
+        Some(&*self.inner)
+    }
+}
--- a/naga/src/front/glsl/builtins.rs
+++ b/naga/src/front/glsl/builtins.rs
@ -718,13 +718,13 @@ fn inject_standard_builtins(

            let ty = match fun {
                MathFunction::Pack4x8snorm | MathFunction::Pack4x8unorm => TypeInner::Vector {
-                    size: crate::VectorSize::Quad,
+                    size: VectorSize::Quad,
                    scalar: Scalar::F32,
                },
                MathFunction::Pack2x16unorm
                | MathFunction::Pack2x16snorm
                | MathFunction::Pack2x16float => TypeInner::Vector {
-                    size: crate::VectorSize::Bi,
+                    size: VectorSize::Bi,
                    scalar: Scalar::F32,
                },
                _ => unreachable!(),
--- a/Show More
+++ b/Show More