Bitfield Fixes (#5305)

2024-11-21 14:23:32 +00:00 · 2024-02-29 15:50:42 -05:00 · 2024-02-29 15:50:42 -05:00 · a5c0181c3a
commit a5c0181c3a
parent b020b984df
13 changed files with 802 additions and 272 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -120,6 +120,7 @@ Bottom level categories:
 - Fix timeout when presenting a surface where no work has been done. By @waywardmonkeys in [#5200](https://github.com/gfx-rs/wgpu/pull/5200)
 - Simplify and speed up the allocation of internal IDs. By @nical in [#5229](https://github.com/gfx-rs/wgpu/pull/5229)
 - Fix an issue where command encoders weren't properly freed if an error occurred during command encoding. By @ErichDonGubler in [#5251](https://github.com/gfx-rs/wgpu/pull/5251).
+- Fix behavior of `extractBits` and `insertBits` when `offset + count` overflows the bit width. By @cwfitzgerald in [#5305](https://github.com/gfx-rs/wgpu/pull/5305)
 - Fix registry leaks with de-duplicated resources. By @nical in [#5244](https://github.com/gfx-rs/wgpu/pull/5244)
 - Fix behavior of integer `clamp` when `min` argument > `max` argument. By @cwfitzgerald in [#5300](https://github.com/gfx-rs/wgpu/pull/5300).
 - Fix missing validation for `Device::clear_buffer` where `offset + size buffer.size` was not checked when `size` was omitted. By @ErichDonGubler in [#5282](https://github.com/gfx-rs/wgpu/pull/5282).
--- a/naga/src/back/glsl/mod.rs
+++ b/naga/src/back/glsl/mod.rs
@ -1290,7 +1290,14 @@ impl<'a, W: Write> Writer<'a, W> {

            let inner = expr_info.ty.inner_with(&self.module.types);

-            if let Expression::Math { fun, arg, arg1, .. } = *expr {
+            if let Expression::Math {
+                fun,
+                arg,
+                arg1,
+                arg2,
+                ..
+            } = *expr
+            {
                match fun {
                    crate::MathFunction::Dot => {
                        // if the expression is a Dot product with integer arguments,
@ -1305,6 +1312,14 @@ impl<'a, W: Write> Writer<'a, W> {
                            }
                        }
                    }
+                    crate::MathFunction::ExtractBits => {
+                        // Only argument 1 is re-used.
+                        self.need_bake_expressions.insert(arg1.unwrap());
+                    }
+                    crate::MathFunction::InsertBits => {
+                        // Only argument 2 is re-used.
+                        self.need_bake_expressions.insert(arg2.unwrap());
+                    }
                    crate::MathFunction::CountLeadingZeros => {
                        if let Some(crate::ScalarKind::Sint) = inner.scalar_kind() {
                            self.need_bake_expressions.insert(arg);
@ -3375,8 +3390,59 @@ impl<'a, W: Write> Writer<'a, W> {
                    }
                    Mf::CountOneBits => "bitCount",
                    Mf::ReverseBits => "bitfieldReverse",
-                    Mf::ExtractBits => "bitfieldExtract",
-                    Mf::InsertBits => "bitfieldInsert",
+                    Mf::ExtractBits => {
+                        // The behavior of ExtractBits is undefined when offset + count > bit_width. We need
+                        // to first sanitize the offset and count first. If we don't do this, AMD and Intel chips
+                        // will return out-of-spec values if the extracted range is not within the bit width.
+                        //
+                        // This encodes the exact formula specified by the wgsl spec, without temporary values:
+                        // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin
+                        //
+                        // w = sizeof(x) * 8
+                        // o = min(offset, w)
+                        // c = min(count, w - o)
+                        //
+                        // bitfieldExtract(x, o, c)
+                        //
+                        // extract_bits(e, min(offset, w), min(count, w - min(offset, w))))
+                        let scalar_bits = ctx
+                            .resolve_type(arg, &self.module.types)
+                            .scalar_width()
+                            .unwrap();
+
+                        write!(self.out, "bitfieldExtract(")?;
+                        self.write_expr(arg, ctx)?;
+                        write!(self.out, ", int(min(")?;
+                        self.write_expr(arg1.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u)), int(min(",)?;
+                        self.write_expr(arg2.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u - min(")?;
+                        self.write_expr(arg1.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u))))")?;
+
+                        return Ok(());
+                    }
+                    Mf::InsertBits => {
+                        // InsertBits has the same considerations as ExtractBits above
+                        let scalar_bits = ctx
+                            .resolve_type(arg, &self.module.types)
+                            .scalar_width()
+                            .unwrap();
+
+                        write!(self.out, "bitfieldInsert(")?;
+                        self.write_expr(arg, ctx)?;
+                        write!(self.out, ", ")?;
+                        self.write_expr(arg1.unwrap(), ctx)?;
+                        write!(self.out, ", int(min(")?;
+                        self.write_expr(arg2.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u)), int(min(",)?;
+                        self.write_expr(arg3.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u - min(")?;
+                        self.write_expr(arg2.unwrap(), ctx)?;
+                        write!(self.out, ", {scalar_bits}u))))")?;
+
+                        return Ok(());
+                    }
                    Mf::FindLsb => "findLSB",
                    Mf::FindMsb => "findMSB",
                    // data packing
--- a/naga/src/back/hlsl/help.rs
+++ b/naga/src/back/hlsl/help.rs
@ -26,7 +26,11 @@ int dim_1d = NagaDimensions1D(image_1d);
 ```
 */

-use super::{super::FunctionCtx, BackendResult};
+use super::{
+    super::FunctionCtx,
+    writer::{EXTRACT_BITS_FUNCTION, INSERT_BITS_FUNCTION},
+    BackendResult,
+};
 use crate::{arena::Handle, proc::NameKey};
 use std::fmt::Write;

@ -59,6 +63,13 @@ pub(super) struct WrappedMatCx2 {
    pub(super) columns: crate::VectorSize,
 }

+#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
+pub(super) struct WrappedMath {
+    pub(super) fun: crate::MathFunction,
+    pub(super) scalar: crate::Scalar,
+    pub(super) components: Option<u32>,
+}
+
 /// HLSL backend requires its own `ImageQuery` enum.
 ///
 /// It is used inside `WrappedImageQuery` and should be unique per ImageQuery function.
@ -851,12 +862,149 @@ impl<'a, W: Write> super::Writer<'a, W> {
        Ok(())
    }

+    pub(super) fn write_wrapped_math_functions(
+        &mut self,
+        module: &crate::Module,
+        func_ctx: &FunctionCtx,
+    ) -> BackendResult {
+        for (_, expression) in func_ctx.expressions.iter() {
+            if let crate::Expression::Math {
+                fun,
+                arg,
+                arg1: _arg1,
+                arg2: _arg2,
+                arg3: _arg3,
+            } = *expression
+            {
+                match fun {
+                    crate::MathFunction::ExtractBits => {
+                        // The behavior of our extractBits polyfill is undefined if offset + count > bit_width. We need
+                        // to first sanitize the offset and count first. If we don't do this, we will get out-of-spec
+                        // values if the extracted range is not within the bit width.
+                        //
+                        // This encodes the exact formula specified by the wgsl spec:
+                        // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin
+                        //
+                        // w = sizeof(x) * 8
+                        // o = min(offset, w)
+                        // c = min(count, w - o)
+                        //
+                        // bitfieldExtract(x, o, c)
+                        let arg_ty = func_ctx.resolve_type(arg, &module.types);
+                        let scalar = arg_ty.scalar().unwrap();
+                        let components = arg_ty.components();
+
+                        let wrapped = WrappedMath {
+                            fun,
+                            scalar,
+                            components,
+                        };
+
+                        if !self.wrapped.math.insert(wrapped) {
+                            continue;
+                        }
+
+                        // Write return type
+                        self.write_value_type(module, arg_ty)?;
+
+                        let scalar_width: u8 = scalar.width * 8;
+
+                        // Write function name and parameters
+                        writeln!(self.out, " {EXTRACT_BITS_FUNCTION}(")?;
+                        write!(self.out, "    ")?;
+                        self.write_value_type(module, arg_ty)?;
+                        writeln!(self.out, " e,")?;
+                        writeln!(self.out, "    uint offset,")?;
+                        writeln!(self.out, "    uint count")?;
+                        writeln!(self.out, ") {{")?;
+
+                        // Write function body
+                        writeln!(self.out, "    uint w = {scalar_width};")?;
+                        writeln!(self.out, "    uint o = min(offset, w);")?;
+                        writeln!(self.out, "    uint c = min(count, w - o);")?;
+                        writeln!(
+                            self.out,
+                            "    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));"
+                        )?;
+
+                        // End of function body
+                        writeln!(self.out, "}}")?;
+                    }
+                    crate::MathFunction::InsertBits => {
+                        // The behavior of our insertBits polyfill has the same constraints as the extractBits polyfill.
+
+                        let arg_ty = func_ctx.resolve_type(arg, &module.types);
+                        let scalar = arg_ty.scalar().unwrap();
+                        let components = arg_ty.components();
+
+                        let wrapped = WrappedMath {
+                            fun,
+                            scalar,
+                            components,
+                        };
+
+                        if !self.wrapped.math.insert(wrapped) {
+                            continue;
+                        }
+
+                        // Write return type
+                        self.write_value_type(module, arg_ty)?;
+
+                        let scalar_width: u8 = scalar.width * 8;
+                        let scalar_max: u64 = match scalar.width {
+                            1 => 0xFF,
+                            2 => 0xFFFF,
+                            4 => 0xFFFFFFFF,
+                            8 => 0xFFFFFFFFFFFFFFFF,
+                            _ => unreachable!(),
+                        };
+
+                        // Write function name and parameters
+                        writeln!(self.out, " {INSERT_BITS_FUNCTION}(")?;
+                        write!(self.out, "    ")?;
+                        self.write_value_type(module, arg_ty)?;
+                        writeln!(self.out, " e,")?;
+                        write!(self.out, "    ")?;
+                        self.write_value_type(module, arg_ty)?;
+                        writeln!(self.out, " newbits,")?;
+                        writeln!(self.out, "    uint offset,")?;
+                        writeln!(self.out, "    uint count")?;
+                        writeln!(self.out, ") {{")?;
+
+                        // Write function body
+                        writeln!(self.out, "    uint w = {scalar_width}u;")?;
+                        writeln!(self.out, "    uint o = min(offset, w);")?;
+                        writeln!(self.out, "    uint c = min(count, w - o);")?;
+
+                        // The `u` suffix on the literals is _extremely_ important. Otherwise it will use
+                        // i32 shifting instead of the intended u32 shifting.
+                        writeln!(
+                            self.out,
+                            "    uint mask = (({scalar_max}u >> ({scalar_width}u - c)) << o);"
+                        )?;
+                        writeln!(
+                            self.out,
+                            "    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));"
+                        )?;
+
+                        // End of function body
+                        writeln!(self.out, "}}")?;
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        Ok(())
+    }
+
    /// Helper function that writes various wrapped functions
    pub(super) fn write_wrapped_functions(
        &mut self,
        module: &crate::Module,
        func_ctx: &FunctionCtx,
    ) -> BackendResult {
+        self.write_wrapped_math_functions(module, func_ctx)?;
        self.write_wrapped_compose_functions(module, func_ctx.expressions)?;

        for (handle, _) in func_ctx.expressions.iter() {
--- a/naga/src/back/hlsl/keywords.rs
+++ b/naga/src/back/hlsl/keywords.rs
@ -817,6 +817,8 @@ pub const RESERVED: &[&str] = &[
    // Naga utilities
    super::writer::MODF_FUNCTION,
    super::writer::FREXP_FUNCTION,
+    super::writer::EXTRACT_BITS_FUNCTION,
+    super::writer::INSERT_BITS_FUNCTION,
 ];

 // DXC scalar types, from https://github.com/microsoft/DirectXShaderCompiler/blob/18c9e114f9c314f93e68fbc72ce207d4ed2e65ae/tools/clang/lib/AST/ASTContextHLSL.cpp#L48-L254
--- a/naga/src/back/hlsl/mod.rs
+++ b/naga/src/back/hlsl/mod.rs
@ -256,6 +256,7 @@ struct Wrapped {
    constructors: crate::FastHashSet<help::WrappedConstructor>,
    struct_matrix_access: crate::FastHashSet<help::WrappedStructMatrixAccess>,
    mat_cx2s: crate::FastHashSet<help::WrappedMatCx2>,
+    math: crate::FastHashSet<help::WrappedMath>,
 }

 impl Wrapped {
@ -265,6 +266,7 @@ impl Wrapped {
        self.constructors.clear();
        self.struct_matrix_access.clear();
        self.mat_cx2s.clear();
+        self.math.clear();
    }
 }

--- a/naga/src/back/hlsl/writer.rs
+++ b/naga/src/back/hlsl/writer.rs
@ -19,6 +19,8 @@ const SPECIAL_OTHER: &str = "other";

 pub(crate) const MODF_FUNCTION: &str = "naga_modf";
 pub(crate) const FREXP_FUNCTION: &str = "naga_frexp";
+pub(crate) const EXTRACT_BITS_FUNCTION: &str = "naga_extractBits";
+pub(crate) const INSERT_BITS_FUNCTION: &str = "naga_insertBits";

 struct EpStructMember {
    name: String,
@ -125,14 +127,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                self.need_bake_expressions.insert(fun_handle);
            }

-            if let Expression::Math {
-                fun,
-                arg,
-                arg1,
-                arg2,
-                arg3,
-            } = *expr
-            {
+            if let Expression::Math { fun, arg, .. } = *expr {
                match fun {
                    crate::MathFunction::Asinh
                    | crate::MathFunction::Acosh
@ -149,17 +144,6 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                    | crate::MathFunction::Pack4x8unorm => {
                        self.need_bake_expressions.insert(arg);
                    }
-                    crate::MathFunction::ExtractBits => {
-                        self.need_bake_expressions.insert(arg);
-                        self.need_bake_expressions.insert(arg1.unwrap());
-                        self.need_bake_expressions.insert(arg2.unwrap());
-                    }
-                    crate::MathFunction::InsertBits => {
-                        self.need_bake_expressions.insert(arg);
-                        self.need_bake_expressions.insert(arg1.unwrap());
-                        self.need_bake_expressions.insert(arg2.unwrap());
-                        self.need_bake_expressions.insert(arg3.unwrap());
-                    }
                    crate::MathFunction::CountLeadingZeros => {
                        let inner = info[fun_handle].ty.inner_with(&module.types);
                        if let Some(crate::ScalarKind::Sint) = inner.scalar_kind() {
@ -2620,8 +2604,6 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                enum Function {
                    Asincosh { is_sin: bool },
                    Atanh,
-                    ExtractBits,
-                    InsertBits,
                    Pack2x16float,
                    Pack2x16snorm,
                    Pack2x16unorm,
@ -2705,8 +2687,8 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                    Mf::ReverseBits => Function::MissingIntOverload("reversebits"),
                    Mf::FindLsb => Function::MissingIntReturnType("firstbitlow"),
                    Mf::FindMsb => Function::MissingIntReturnType("firstbithigh"),
-                    Mf::ExtractBits => Function::ExtractBits,
-                    Mf::InsertBits => Function::InsertBits,
+                    Mf::ExtractBits => Function::Regular(EXTRACT_BITS_FUNCTION),
+                    Mf::InsertBits => Function::Regular(INSERT_BITS_FUNCTION),
                    // Data Packing
                    Mf::Pack2x16float => Function::Pack2x16float,
                    Mf::Pack2x16snorm => Function::Pack2x16snorm,
@ -2742,70 +2724,6 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> {
                        self.write_expr(module, arg, func_ctx)?;
                        write!(self.out, "))")?;
                    }
-                    Function::ExtractBits => {
-                        // e: T,
-                        // offset: u32,
-                        // count: u32
-                        // T is u32 or i32 or vecN<u32> or vecN<i32>
-                        if let (Some(offset), Some(count)) = (arg1, arg2) {
-                            let scalar_width: u8 = 32;
-                            // Works for signed and unsigned
-                            // (count == 0 ? 0 : (e << (32 - count - offset)) >> (32 - count))
-                            write!(self.out, "(")?;
-                            self.write_expr(module, count, func_ctx)?;
-                            write!(self.out, " == 0 ? 0 : (")?;
-                            self.write_expr(module, arg, func_ctx)?;
-                            write!(self.out, " << ({scalar_width} - ")?;
-                            self.write_expr(module, count, func_ctx)?;
-                            write!(self.out, " - ")?;
-                            self.write_expr(module, offset, func_ctx)?;
-                            write!(self.out, ")) >> ({scalar_width} - ")?;
-                            self.write_expr(module, count, func_ctx)?;
-                            write!(self.out, "))")?;
-                        }
-                    }
-                    Function::InsertBits => {
-                        // e: T,
-                        // newbits: T,
-                        // offset: u32,
-                        // count: u32
-                        // returns T
-                        // T is i32, u32, vecN<i32>, or vecN<u32>
-                        if let (Some(newbits), Some(offset), Some(count)) = (arg1, arg2, arg3) {
-                            let scalar_width: u8 = 32;
-                            let scalar_max: u32 = 0xFFFFFFFF;
-                            // mask = ((0xFFFFFFFFu >> (32 - count)) << offset)
-                            // (count == 0 ? e : ((e & ~mask) | ((newbits << offset) & mask)))
-                            write!(self.out, "(")?;
-                            self.write_expr(module, count, func_ctx)?;
-                            write!(self.out, " == 0 ? ")?;
-                            self.write_expr(module, arg, func_ctx)?;
-                            write!(self.out, " : ")?;
-                            write!(self.out, "(")?;
-                            self.write_expr(module, arg, func_ctx)?;
-                            write!(self.out, " & ~")?;
-                            // mask
-                            write!(self.out, "(({scalar_max}u >> ({scalar_width}u - ")?;
-                            self.write_expr(module, count, func_ctx)?;
-                            write!(self.out, ")) << ")?;
-                            self.write_expr(module, offset, func_ctx)?;
-                            write!(self.out, ")")?;
-                            // end mask
-                            write!(self.out, ") | ((")?;
-                            self.write_expr(module, newbits, func_ctx)?;
-                            write!(self.out, " << ")?;
-                            self.write_expr(module, offset, func_ctx)?;
-                            write!(self.out, ") & ")?;
-                            // // mask
-                            write!(self.out, "(({scalar_max}u >> ({scalar_width}u - ")?;
-                            self.write_expr(module, count, func_ctx)?;
-                            write!(self.out, ")) << ")?;
-                            self.write_expr(module, offset, func_ctx)?;
-                            write!(self.out, ")")?;
-                            // // end mask
-                            write!(self.out, "))")?;
-                        }
-                    }
                    Function::Pack2x16float => {
                        write!(self.out, "(f32tof16(")?;
                        self.write_expr(module, arg, func_ctx)?;
--- a/naga/src/back/msl/writer.rs
+++ b/naga/src/back/msl/writer.rs
@ -1794,8 +1794,8 @@ impl<W: Write> Writer<W> {
                    Mf::CountLeadingZeros => "clz",
                    Mf::CountOneBits => "popcount",
                    Mf::ReverseBits => "reverse_bits",
-                    Mf::ExtractBits => "extract_bits",
-                    Mf::InsertBits => "insert_bits",
+                    Mf::ExtractBits => "",
+                    Mf::InsertBits => "",
                    Mf::FindLsb => "",
                    Mf::FindMsb => "",
                    // data packing
@ -1891,6 +1891,52 @@ impl<W: Write> Writer<W> {
                    write!(self.out, "as_type<uint>(half2(")?;
                    self.put_expression(arg, context, false)?;
                    write!(self.out, "))")?;
+                } else if fun == Mf::ExtractBits {
+                    // The behavior of ExtractBits is undefined when offset + count > bit_width. We need
+                    // to first sanitize the offset and count first. If we don't do this, Apple chips
+                    // will return out-of-spec values if the extracted range is not within the bit width.
+                    //
+                    // This encodes the exact formula specified by the wgsl spec, without temporary values:
+                    // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin
+                    //
+                    // w = sizeof(x) * 8
+                    // o = min(offset, w)
+                    // tmp = w - o
+                    // c = min(count, tmp)
+                    //
+                    // bitfieldExtract(x, o, c)
+                    //
+                    // extract_bits(e, min(offset, w), min(count, w - min(offset, w))))
+
+                    let scalar_bits = context.resolve_type(arg).scalar_width().unwrap();
+
+                    write!(self.out, "{NAMESPACE}::extract_bits(")?;
+                    self.put_expression(arg, context, true)?;
+                    write!(self.out, ", {NAMESPACE}::min(")?;
+                    self.put_expression(arg1.unwrap(), context, true)?;
+                    write!(self.out, ", {scalar_bits}u), {NAMESPACE}::min(")?;
+                    self.put_expression(arg2.unwrap(), context, true)?;
+                    write!(self.out, ", {scalar_bits}u - {NAMESPACE}::min(")?;
+                    self.put_expression(arg1.unwrap(), context, true)?;
+                    write!(self.out, ", {scalar_bits}u)))")?;
+                } else if fun == Mf::InsertBits {
+                    // The behavior of InsertBits has the same issue as ExtractBits.
+                    //
+                    // insertBits(e, newBits, min(offset, w), min(count, w - min(offset, w))))
+
+                    let scalar_bits = context.resolve_type(arg).scalar_width().unwrap();
+
+                    write!(self.out, "{NAMESPACE}::insert_bits(")?;
+                    self.put_expression(arg, context, true)?;
+                    write!(self.out, ", ")?;
+                    self.put_expression(arg1.unwrap(), context, true)?;
+                    write!(self.out, ", {NAMESPACE}::min(")?;
+                    self.put_expression(arg2.unwrap(), context, true)?;
+                    write!(self.out, ", {scalar_bits}u), {NAMESPACE}::min(")?;
+                    self.put_expression(arg3.unwrap(), context, true)?;
+                    write!(self.out, ", {scalar_bits}u - {NAMESPACE}::min(")?;
+                    self.put_expression(arg2.unwrap(), context, true)?;
+                    write!(self.out, ", {scalar_bits}u)))")?;
                } else if fun == Mf::Radians {
                    write!(self.out, "((")?;
                    self.put_expression(arg, context, false)?;
@ -2489,7 +2535,14 @@ impl<W: Write> Writer<W> {
                }
            }

-            if let Expression::Math { fun, arg, arg1, .. } = *expr {
+            if let Expression::Math {
+                fun,
+                arg,
+                arg1,
+                arg2,
+                ..
+            } = *expr
+            {
                match fun {
                    crate::MathFunction::Dot => {
                        // WGSL's `dot` function works on any `vecN` type, but Metal's only
@ -2514,6 +2567,14 @@ impl<W: Write> Writer<W> {
                    crate::MathFunction::FindMsb => {
                        self.need_bake_expressions.insert(arg);
                    }
+                    crate::MathFunction::ExtractBits => {
+                        // Only argument 1 is re-used.
+                        self.need_bake_expressions.insert(arg1.unwrap());
+                    }
+                    crate::MathFunction::InsertBits => {
+                        // Only argument 2 is re-used.
+                        self.need_bake_expressions.insert(arg2.unwrap());
+                    }
                    crate::MathFunction::Sign => {
                        // WGSL's `sign` function works also on signed ints, but Metal's only
                        // works on floating points, so we emit inline code for integer `sign`
--- a/naga/src/back/spv/block.rs
+++ b/naga/src/back/spv/block.rs
@ -1050,24 +1050,131 @@ impl<'w> BlockContext<'w> {
                            Some(crate::ScalarKind::Sint) => spirv::Op::BitFieldSExtract,
                            other => unimplemented!("Unexpected sign({:?})", other),
                        };
+
+                        // The behavior of ExtractBits is undefined when offset + count > bit_width. We need
+                        // to first sanitize the offset and count first. If we don't do this, AMD and Intel
+                        // will return out-of-spec values if the extracted range is not within the bit width.
+                        //
+                        // This encodes the exact formula specified by the wgsl spec:
+                        // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin
+                        //
+                        // w = sizeof(x) * 8
+                        // o = min(offset, w)
+                        // tmp = w - o
+                        // c = min(count, tmp)
+                        //
+                        // bitfieldExtract(x, o, c)
+
+                        let bit_width = arg_ty.scalar_width().unwrap();
+                        let width_constant = self
+                            .writer
+                            .get_constant_scalar(crate::Literal::U32(bit_width as u32));
+
+                        let u32_type = self.get_type_id(LookupType::Local(LocalType::Value {
+                            vector_size: None,
+                            scalar: crate::Scalar {
+                                kind: crate::ScalarKind::Uint,
+                                width: 4,
+                            },
+                            pointer_space: None,
+                        }));
+
+                        // o = min(offset, w)
+                        let offset_id = self.gen_id();
+                        block.body.push(Instruction::ext_inst(
+                            self.writer.gl450_ext_inst_id,
+                            spirv::GLOp::UMin,
+                            u32_type,
+                            offset_id,
+                            &[arg1_id, width_constant],
+                        ));
+
+                        // tmp = w - o
+                        let max_count_id = self.gen_id();
+                        block.body.push(Instruction::binary(
+                            spirv::Op::ISub,
+                            u32_type,
+                            max_count_id,
+                            width_constant,
+                            offset_id,
+                        ));
+
+                        // c = min(count, tmp)
+                        let count_id = self.gen_id();
+                        block.body.push(Instruction::ext_inst(
+                            self.writer.gl450_ext_inst_id,
+                            spirv::GLOp::UMin,
+                            u32_type,
+                            count_id,
+                            &[arg2_id, max_count_id],
+                        ));
+
                        MathOp::Custom(Instruction::ternary(
                            op,
                            result_type_id,
                            id,
                            arg0_id,
-                            arg1_id,
-                            arg2_id,
+                            offset_id,
+                            count_id,
                        ))
                    }
-                    Mf::InsertBits => MathOp::Custom(Instruction::quaternary(
+                    Mf::InsertBits => {
+                        // The behavior of InsertBits has the same undefined behavior as ExtractBits.
+
+                        let bit_width = arg_ty.scalar_width().unwrap();
+                        let width_constant = self
+                            .writer
+                            .get_constant_scalar(crate::Literal::U32(bit_width as u32));
+
+                        let u32_type = self.get_type_id(LookupType::Local(LocalType::Value {
+                            vector_size: None,
+                            scalar: crate::Scalar {
+                                kind: crate::ScalarKind::Uint,
+                                width: 4,
+                            },
+                            pointer_space: None,
+                        }));
+
+                        // o = min(offset, w)
+                        let offset_id = self.gen_id();
+                        block.body.push(Instruction::ext_inst(
+                            self.writer.gl450_ext_inst_id,
+                            spirv::GLOp::UMin,
+                            u32_type,
+                            offset_id,
+                            &[arg2_id, width_constant],
+                        ));
+
+                        // tmp = w - o
+                        let max_count_id = self.gen_id();
+                        block.body.push(Instruction::binary(
+                            spirv::Op::ISub,
+                            u32_type,
+                            max_count_id,
+                            width_constant,
+                            offset_id,
+                        ));
+
+                        // c = min(count, tmp)
+                        let count_id = self.gen_id();
+                        block.body.push(Instruction::ext_inst(
+                            self.writer.gl450_ext_inst_id,
+                            spirv::GLOp::UMin,
+                            u32_type,
+                            count_id,
+                            &[arg3_id, max_count_id],
+                        ));
+
+                        MathOp::Custom(Instruction::quaternary(
                            spirv::Op::BitFieldInsert,
                            result_type_id,
                            id,
                            arg0_id,
                            arg1_id,
-                        arg2_id,
-                        arg3_id,
-                    )),
+                            offset_id,
+                            count_id,
+                        ))
+                    }
                    Mf::FindLsb => MathOp::Ext(spirv::GLOp::FindILsb),
                    Mf::FindMsb => MathOp::Ext(match arg_scalar_kind {
                        Some(crate::ScalarKind::Uint) => spirv::GLOp::FindUMsb,
--- a/naga/src/lib.rs
+++ b/naga/src/lib.rs
@ -491,7 +491,7 @@ pub enum ScalarKind {
 }

 /// Characteristics of a scalar type.
-#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
 #[cfg_attr(feature = "serialize", derive(Serialize))]
 #[cfg_attr(feature = "deserialize", derive(Deserialize))]
 #[cfg_attr(feature = "arbitrary", derive(Arbitrary))]
--- a/naga/tests/out/glsl/bits.main.Compute.glsl
+++ b/naga/tests/out/glsl/bits.main.Compute.glsl
@ -39,44 +39,44 @@ void main() {
    f2_ = unpackHalf2x16(_e46);
    int _e48 = i;
    int _e49 = i;
-    i = bitfieldInsert(_e48, _e49, int(5u), int(10u));
+    i = bitfieldInsert(_e48, _e49, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    ivec2 _e53 = i2_;
    ivec2 _e54 = i2_;
-    i2_ = bitfieldInsert(_e53, _e54, int(5u), int(10u));
+    i2_ = bitfieldInsert(_e53, _e54, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    ivec3 _e58 = i3_;
    ivec3 _e59 = i3_;
-    i3_ = bitfieldInsert(_e58, _e59, int(5u), int(10u));
+    i3_ = bitfieldInsert(_e58, _e59, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    ivec4 _e63 = i4_;
    ivec4 _e64 = i4_;
-    i4_ = bitfieldInsert(_e63, _e64, int(5u), int(10u));
+    i4_ = bitfieldInsert(_e63, _e64, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    uint _e68 = u;
    uint _e69 = u;
-    u = bitfieldInsert(_e68, _e69, int(5u), int(10u));
+    u = bitfieldInsert(_e68, _e69, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    uvec2 _e73 = u2_;
    uvec2 _e74 = u2_;
-    u2_ = bitfieldInsert(_e73, _e74, int(5u), int(10u));
+    u2_ = bitfieldInsert(_e73, _e74, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    uvec3 _e78 = u3_;
    uvec3 _e79 = u3_;
-    u3_ = bitfieldInsert(_e78, _e79, int(5u), int(10u));
+    u3_ = bitfieldInsert(_e78, _e79, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    uvec4 _e83 = u4_;
    uvec4 _e84 = u4_;
-    u4_ = bitfieldInsert(_e83, _e84, int(5u), int(10u));
+    u4_ = bitfieldInsert(_e83, _e84, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    int _e88 = i;
-    i = bitfieldExtract(_e88, int(5u), int(10u));
+    i = bitfieldExtract(_e88, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    ivec2 _e92 = i2_;
-    i2_ = bitfieldExtract(_e92, int(5u), int(10u));
+    i2_ = bitfieldExtract(_e92, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    ivec3 _e96 = i3_;
-    i3_ = bitfieldExtract(_e96, int(5u), int(10u));
+    i3_ = bitfieldExtract(_e96, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    ivec4 _e100 = i4_;
-    i4_ = bitfieldExtract(_e100, int(5u), int(10u));
+    i4_ = bitfieldExtract(_e100, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    uint _e104 = u;
-    u = bitfieldExtract(_e104, int(5u), int(10u));
+    u = bitfieldExtract(_e104, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    uvec2 _e108 = u2_;
-    u2_ = bitfieldExtract(_e108, int(5u), int(10u));
+    u2_ = bitfieldExtract(_e108, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    uvec3 _e112 = u3_;
-    u3_ = bitfieldExtract(_e112, int(5u), int(10u));
+    u3_ = bitfieldExtract(_e112, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    uvec4 _e116 = u4_;
-    u4_ = bitfieldExtract(_e116, int(5u), int(10u));
+    u4_ = bitfieldExtract(_e116, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u))));
    int _e120 = i;
    i = findLSB(_e120);
    uvec2 _e122 = u2_;
--- a/naga/tests/out/hlsl/bits.hlsl
+++ b/naga/tests/out/hlsl/bits.hlsl
@ -1,3 +1,179 @@
+int naga_insertBits(
+    int e,
+    int newbits,
+    uint offset,
+    uint count
+) {
+    uint w = 32u;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    uint mask = ((4294967295u >> (32u - c)) << o);
+    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));
+}
+int2 naga_insertBits(
+    int2 e,
+    int2 newbits,
+    uint offset,
+    uint count
+) {
+    uint w = 32u;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    uint mask = ((4294967295u >> (32u - c)) << o);
+    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));
+}
+int3 naga_insertBits(
+    int3 e,
+    int3 newbits,
+    uint offset,
+    uint count
+) {
+    uint w = 32u;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    uint mask = ((4294967295u >> (32u - c)) << o);
+    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));
+}
+int4 naga_insertBits(
+    int4 e,
+    int4 newbits,
+    uint offset,
+    uint count
+) {
+    uint w = 32u;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    uint mask = ((4294967295u >> (32u - c)) << o);
+    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));
+}
+uint naga_insertBits(
+    uint e,
+    uint newbits,
+    uint offset,
+    uint count
+) {
+    uint w = 32u;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    uint mask = ((4294967295u >> (32u - c)) << o);
+    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));
+}
+uint2 naga_insertBits(
+    uint2 e,
+    uint2 newbits,
+    uint offset,
+    uint count
+) {
+    uint w = 32u;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    uint mask = ((4294967295u >> (32u - c)) << o);
+    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));
+}
+uint3 naga_insertBits(
+    uint3 e,
+    uint3 newbits,
+    uint offset,
+    uint count
+) {
+    uint w = 32u;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    uint mask = ((4294967295u >> (32u - c)) << o);
+    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));
+}
+uint4 naga_insertBits(
+    uint4 e,
+    uint4 newbits,
+    uint offset,
+    uint count
+) {
+    uint w = 32u;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    uint mask = ((4294967295u >> (32u - c)) << o);
+    return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));
+}
+int naga_extractBits(
+    int e,
+    uint offset,
+    uint count
+) {
+    uint w = 32;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));
+}
+int2 naga_extractBits(
+    int2 e,
+    uint offset,
+    uint count
+) {
+    uint w = 32;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));
+}
+int3 naga_extractBits(
+    int3 e,
+    uint offset,
+    uint count
+) {
+    uint w = 32;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));
+}
+int4 naga_extractBits(
+    int4 e,
+    uint offset,
+    uint count
+) {
+    uint w = 32;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));
+}
+uint naga_extractBits(
+    uint e,
+    uint offset,
+    uint count
+) {
+    uint w = 32;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));
+}
+uint2 naga_extractBits(
+    uint2 e,
+    uint offset,
+    uint count
+) {
+    uint w = 32;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));
+}
+uint3 naga_extractBits(
+    uint3 e,
+    uint offset,
+    uint count
+) {
+    uint w = 32;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));
+}
+uint4 naga_extractBits(
+    uint4 e,
+    uint offset,
+    uint count
+) {
+    uint w = 32;
+    uint o = min(offset, w);
+    uint c = min(count, w - o);
+    return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));
+}
 [numthreads(1, 1, 1)]
 void main()
 {
@ -34,44 +210,44 @@ void main()
    f2_ = float2(f16tof32(_expr46), f16tof32((_expr46) >> 16));
    int _expr48 = i;
    int _expr49 = i;
-    i = (10u == 0 ? _expr48 : (_expr48 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr49 << 5u) & ((4294967295u >> (32u - 10u)) << 5u)));
+    i = naga_insertBits(_expr48, _expr49, 5u, 10u);
    int2 _expr53 = i2_;
    int2 _expr54 = i2_;
-    i2_ = (10u == 0 ? _expr53 : (_expr53 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr54 << 5u) & ((4294967295u >> (32u - 10u)) << 5u)));
+    i2_ = naga_insertBits(_expr53, _expr54, 5u, 10u);
    int3 _expr58 = i3_;
    int3 _expr59 = i3_;
-    i3_ = (10u == 0 ? _expr58 : (_expr58 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr59 << 5u) & ((4294967295u >> (32u - 10u)) << 5u)));
+    i3_ = naga_insertBits(_expr58, _expr59, 5u, 10u);
    int4 _expr63 = i4_;
    int4 _expr64 = i4_;
-    i4_ = (10u == 0 ? _expr63 : (_expr63 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr64 << 5u) & ((4294967295u >> (32u - 10u)) << 5u)));
+    i4_ = naga_insertBits(_expr63, _expr64, 5u, 10u);
    uint _expr68 = u;
    uint _expr69 = u;
-    u = (10u == 0 ? _expr68 : (_expr68 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr69 << 5u) & ((4294967295u >> (32u - 10u)) << 5u)));
+    u = naga_insertBits(_expr68, _expr69, 5u, 10u);
    uint2 _expr73 = u2_;
    uint2 _expr74 = u2_;
-    u2_ = (10u == 0 ? _expr73 : (_expr73 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr74 << 5u) & ((4294967295u >> (32u - 10u)) << 5u)));
+    u2_ = naga_insertBits(_expr73, _expr74, 5u, 10u);
    uint3 _expr78 = u3_;
    uint3 _expr79 = u3_;
-    u3_ = (10u == 0 ? _expr78 : (_expr78 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr79 << 5u) & ((4294967295u >> (32u - 10u)) << 5u)));
+    u3_ = naga_insertBits(_expr78, _expr79, 5u, 10u);
    uint4 _expr83 = u4_;
    uint4 _expr84 = u4_;
-    u4_ = (10u == 0 ? _expr83 : (_expr83 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr84 << 5u) & ((4294967295u >> (32u - 10u)) << 5u)));
+    u4_ = naga_insertBits(_expr83, _expr84, 5u, 10u);
    int _expr88 = i;
-    i = (10u == 0 ? 0 : (_expr88 << (32 - 10u - 5u)) >> (32 - 10u));
+    i = naga_extractBits(_expr88, 5u, 10u);
    int2 _expr92 = i2_;
-    i2_ = (10u == 0 ? 0 : (_expr92 << (32 - 10u - 5u)) >> (32 - 10u));
+    i2_ = naga_extractBits(_expr92, 5u, 10u);
    int3 _expr96 = i3_;
-    i3_ = (10u == 0 ? 0 : (_expr96 << (32 - 10u - 5u)) >> (32 - 10u));
+    i3_ = naga_extractBits(_expr96, 5u, 10u);
    int4 _expr100 = i4_;
-    i4_ = (10u == 0 ? 0 : (_expr100 << (32 - 10u - 5u)) >> (32 - 10u));
+    i4_ = naga_extractBits(_expr100, 5u, 10u);
    uint _expr104 = u;
-    u = (10u == 0 ? 0 : (_expr104 << (32 - 10u - 5u)) >> (32 - 10u));
+    u = naga_extractBits(_expr104, 5u, 10u);
    uint2 _expr108 = u2_;
-    u2_ = (10u == 0 ? 0 : (_expr108 << (32 - 10u - 5u)) >> (32 - 10u));
+    u2_ = naga_extractBits(_expr108, 5u, 10u);
    uint3 _expr112 = u3_;
-    u3_ = (10u == 0 ? 0 : (_expr112 << (32 - 10u - 5u)) >> (32 - 10u));
+    u3_ = naga_extractBits(_expr112, 5u, 10u);
    uint4 _expr116 = u4_;
-    u4_ = (10u == 0 ? 0 : (_expr116 << (32 - 10u - 5u)) >> (32 - 10u));
+    u4_ = naga_extractBits(_expr116, 5u, 10u);
    int _expr120 = i;
    i = asint(firstbitlow(_expr120));
    uint2 _expr122 = u2_;
--- a/naga/tests/out/msl/bits.msl
+++ b/naga/tests/out/msl/bits.msl
@ -39,44 +39,44 @@ kernel void main_(
    f2_ = float2(as_type<half2>(_e46));
    int _e48 = i;
    int _e49 = i;
-    i = metal::insert_bits(_e48, _e49, 5u, 10u);
+    i = metal::insert_bits(_e48, _e49, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::int2 _e53 = i2_;
    metal::int2 _e54 = i2_;
-    i2_ = metal::insert_bits(_e53, _e54, 5u, 10u);
+    i2_ = metal::insert_bits(_e53, _e54, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::int3 _e58 = i3_;
    metal::int3 _e59 = i3_;
-    i3_ = metal::insert_bits(_e58, _e59, 5u, 10u);
+    i3_ = metal::insert_bits(_e58, _e59, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::int4 _e63 = i4_;
    metal::int4 _e64 = i4_;
-    i4_ = metal::insert_bits(_e63, _e64, 5u, 10u);
+    i4_ = metal::insert_bits(_e63, _e64, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    uint _e68 = u;
    uint _e69 = u;
-    u = metal::insert_bits(_e68, _e69, 5u, 10u);
+    u = metal::insert_bits(_e68, _e69, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::uint2 _e73 = u2_;
    metal::uint2 _e74 = u2_;
-    u2_ = metal::insert_bits(_e73, _e74, 5u, 10u);
+    u2_ = metal::insert_bits(_e73, _e74, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::uint3 _e78 = u3_;
    metal::uint3 _e79 = u3_;
-    u3_ = metal::insert_bits(_e78, _e79, 5u, 10u);
+    u3_ = metal::insert_bits(_e78, _e79, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::uint4 _e83 = u4_;
    metal::uint4 _e84 = u4_;
-    u4_ = metal::insert_bits(_e83, _e84, 5u, 10u);
+    u4_ = metal::insert_bits(_e83, _e84, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    int _e88 = i;
-    i = metal::extract_bits(_e88, 5u, 10u);
+    i = metal::extract_bits(_e88, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::int2 _e92 = i2_;
-    i2_ = metal::extract_bits(_e92, 5u, 10u);
+    i2_ = metal::extract_bits(_e92, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::int3 _e96 = i3_;
-    i3_ = metal::extract_bits(_e96, 5u, 10u);
+    i3_ = metal::extract_bits(_e96, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::int4 _e100 = i4_;
-    i4_ = metal::extract_bits(_e100, 5u, 10u);
+    i4_ = metal::extract_bits(_e100, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    uint _e104 = u;
-    u = metal::extract_bits(_e104, 5u, 10u);
+    u = metal::extract_bits(_e104, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::uint2 _e108 = u2_;
-    u2_ = metal::extract_bits(_e108, 5u, 10u);
+    u2_ = metal::extract_bits(_e108, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::uint3 _e112 = u3_;
-    u3_ = metal::extract_bits(_e112, 5u, 10u);
+    u3_ = metal::extract_bits(_e112, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    metal::uint4 _e116 = u4_;
-    u4_ = metal::extract_bits(_e116, 5u, 10u);
+    u4_ = metal::extract_bits(_e116, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u)));
    int _e120 = i;
    i = (((metal::ctz(_e120) + 1) % 33) - 1);
    metal::uint2 _e122 = u2_;
--- a/naga/tests/out/spv/bits.spvasm
+++ b/naga/tests/out/spv/bits.spvasm
@ -1,7 +1,7 @@
 ; SPIR-V
 ; Version: 1.1
 ; Generator: rspirv
-; Bound: 155
+; Bound: 204
 OpCapability Shader
 %1 = OpExtInstImport "GLSL.std.450"
 OpMemoryModel Logical GLSL450
@ -43,6 +43,7 @@ OpExecutionMode %15 LocalSize 1 1 1
 %45 = OpTypePointer Function %10
 %47 = OpTypePointer Function %11
 %49 = OpTypePointer Function %13
+%74 = OpConstant  %7  32
 %15 = OpFunction  %2  None %16
 %14 = OpLabel
 %48 = OpVariable  %49  Function %27
@ -89,125 +90,173 @@ OpStore %46 %68
 OpStore %46 %70
 %71 = OpLoad  %3  %30
 %72 = OpLoad  %3  %30
-%73 = OpBitFieldInsert  %3  %71 %72 %28 %29
+%75 = OpExtInst  %7  %1 UMin %28 %74
+%76 = OpISub  %7  %74 %75
+%77 = OpExtInst  %7  %1 UMin %29 %76
+%73 = OpBitFieldInsert  %3  %71 %72 %75 %77
 OpStore %30 %73
-%74 = OpLoad  %4  %32
-%75 = OpLoad  %4  %32
-%76 = OpBitFieldInsert  %4  %74 %75 %28 %29
-OpStore %32 %76
-%77 = OpLoad  %5  %34
-%78 = OpLoad  %5  %34
-%79 = OpBitFieldInsert  %5  %77 %78 %28 %29
-OpStore %34 %79
-%80 = OpLoad  %6  %36
-%81 = OpLoad  %6  %36
-%82 = OpBitFieldInsert  %6  %80 %81 %28 %29
-OpStore %36 %82
-%83 = OpLoad  %7  %38
-%84 = OpLoad  %7  %38
-%85 = OpBitFieldInsert  %7  %83 %84 %28 %29
-OpStore %38 %85
-%86 = OpLoad  %8  %40
-%87 = OpLoad  %8  %40
-%88 = OpBitFieldInsert  %8  %86 %87 %28 %29
-OpStore %40 %88
-%89 = OpLoad  %9  %42
-%90 = OpLoad  %9  %42
-%91 = OpBitFieldInsert  %9  %89 %90 %28 %29
-OpStore %42 %91
-%92 = OpLoad  %10  %44
-%93 = OpLoad  %10  %44
-%94 = OpBitFieldInsert  %10  %92 %93 %28 %29
-OpStore %44 %94
-%95 = OpLoad  %3  %30
-%96 = OpBitFieldSExtract  %3  %95 %28 %29
-OpStore %30 %96
-%97 = OpLoad  %4  %32
-%98 = OpBitFieldSExtract  %4  %97 %28 %29
-OpStore %32 %98
-%99 = OpLoad  %5  %34
-%100 = OpBitFieldSExtract  %5  %99 %28 %29
-OpStore %34 %100
-%101 = OpLoad  %6  %36
-%102 = OpBitFieldSExtract  %6  %101 %28 %29
-OpStore %36 %102
-%103 = OpLoad  %7  %38
-%104 = OpBitFieldUExtract  %7  %103 %28 %29
-OpStore %38 %104
-%105 = OpLoad  %8  %40
-%106 = OpBitFieldUExtract  %8  %105 %28 %29
-OpStore %40 %106
-%107 = OpLoad  %9  %42
-%108 = OpBitFieldUExtract  %9  %107 %28 %29
-OpStore %42 %108
-%109 = OpLoad  %10  %44
-%110 = OpBitFieldUExtract  %10  %109 %28 %29
-OpStore %44 %110
-%111 = OpLoad  %3  %30
-%112 = OpExtInst  %3  %1 FindILsb %111
-OpStore %30 %112
-%113 = OpLoad  %8  %40
-%114 = OpExtInst  %8  %1 FindILsb %113
-OpStore %40 %114
-%115 = OpLoad  %5  %34
-%116 = OpExtInst  %5  %1 FindSMsb %115
-OpStore %34 %116
-%117 = OpLoad  %9  %42
-%118 = OpExtInst  %9  %1 FindUMsb %117
-OpStore %42 %118
-%119 = OpLoad  %3  %30
-%120 = OpExtInst  %3  %1 FindSMsb %119
-OpStore %30 %120
-%121 = OpLoad  %7  %38
-%122 = OpExtInst  %7  %1 FindUMsb %121
-OpStore %38 %122
-%123 = OpLoad  %3  %30
-%124 = OpBitCount  %3  %123
-OpStore %30 %124
+%78 = OpLoad  %4  %32
+%79 = OpLoad  %4  %32
+%81 = OpExtInst  %7  %1 UMin %28 %74
+%82 = OpISub  %7  %74 %81
+%83 = OpExtInst  %7  %1 UMin %29 %82
+%80 = OpBitFieldInsert  %4  %78 %79 %81 %83
+OpStore %32 %80
+%84 = OpLoad  %5  %34
+%85 = OpLoad  %5  %34
+%87 = OpExtInst  %7  %1 UMin %28 %74
+%88 = OpISub  %7  %74 %87
+%89 = OpExtInst  %7  %1 UMin %29 %88
+%86 = OpBitFieldInsert  %5  %84 %85 %87 %89
+OpStore %34 %86
+%90 = OpLoad  %6  %36
+%91 = OpLoad  %6  %36
+%93 = OpExtInst  %7  %1 UMin %28 %74
+%94 = OpISub  %7  %74 %93
+%95 = OpExtInst  %7  %1 UMin %29 %94
+%92 = OpBitFieldInsert  %6  %90 %91 %93 %95
+OpStore %36 %92
+%96 = OpLoad  %7  %38
+%97 = OpLoad  %7  %38
+%99 = OpExtInst  %7  %1 UMin %28 %74
+%100 = OpISub  %7  %74 %99
+%101 = OpExtInst  %7  %1 UMin %29 %100
+%98 = OpBitFieldInsert  %7  %96 %97 %99 %101
+OpStore %38 %98
+%102 = OpLoad  %8  %40
+%103 = OpLoad  %8  %40
+%105 = OpExtInst  %7  %1 UMin %28 %74
+%106 = OpISub  %7  %74 %105
+%107 = OpExtInst  %7  %1 UMin %29 %106
+%104 = OpBitFieldInsert  %8  %102 %103 %105 %107
+OpStore %40 %104
+%108 = OpLoad  %9  %42
+%109 = OpLoad  %9  %42
+%111 = OpExtInst  %7  %1 UMin %28 %74
+%112 = OpISub  %7  %74 %111
+%113 = OpExtInst  %7  %1 UMin %29 %112
+%110 = OpBitFieldInsert  %9  %108 %109 %111 %113
+OpStore %42 %110
+%114 = OpLoad  %10  %44
+%115 = OpLoad  %10  %44
+%117 = OpExtInst  %7  %1 UMin %28 %74
+%118 = OpISub  %7  %74 %117
+%119 = OpExtInst  %7  %1 UMin %29 %118
+%116 = OpBitFieldInsert  %10  %114 %115 %117 %119
+OpStore %44 %116
+%120 = OpLoad  %3  %30
+%122 = OpExtInst  %7  %1 UMin %28 %74
+%123 = OpISub  %7  %74 %122
+%124 = OpExtInst  %7  %1 UMin %29 %123
+%121 = OpBitFieldSExtract  %3  %120 %122 %124
+OpStore %30 %121
 %125 = OpLoad  %4  %32
-%126 = OpBitCount  %4  %125
+%127 = OpExtInst  %7  %1 UMin %28 %74
+%128 = OpISub  %7  %74 %127
+%129 = OpExtInst  %7  %1 UMin %29 %128
+%126 = OpBitFieldSExtract  %4  %125 %127 %129
 OpStore %32 %126
-%127 = OpLoad  %5  %34
-%128 = OpBitCount  %5  %127
-OpStore %34 %128
-%129 = OpLoad  %6  %36
-%130 = OpBitCount  %6  %129
-OpStore %36 %130
-%131 = OpLoad  %7  %38
-%132 = OpBitCount  %7  %131
-OpStore %38 %132
-%133 = OpLoad  %8  %40
-%134 = OpBitCount  %8  %133
-OpStore %40 %134
-%135 = OpLoad  %9  %42
-%136 = OpBitCount  %9  %135
-OpStore %42 %136
-%137 = OpLoad  %10  %44
-%138 = OpBitCount  %10  %137
-OpStore %44 %138
-%139 = OpLoad  %3  %30
-%140 = OpBitReverse  %3  %139
-OpStore %30 %140
-%141 = OpLoad  %4  %32
-%142 = OpBitReverse  %4  %141
-OpStore %32 %142
-%143 = OpLoad  %5  %34
-%144 = OpBitReverse  %5  %143
-OpStore %34 %144
-%145 = OpLoad  %6  %36
-%146 = OpBitReverse  %6  %145
-OpStore %36 %146
-%147 = OpLoad  %7  %38
-%148 = OpBitReverse  %7  %147
-OpStore %38 %148
-%149 = OpLoad  %8  %40
-%150 = OpBitReverse  %8  %149
-OpStore %40 %150
-%151 = OpLoad  %9  %42
-%152 = OpBitReverse  %9  %151
-OpStore %42 %152
-%153 = OpLoad  %10  %44
-%154 = OpBitReverse  %10  %153
-OpStore %44 %154
+%130 = OpLoad  %5  %34
+%132 = OpExtInst  %7  %1 UMin %28 %74
+%133 = OpISub  %7  %74 %132
+%134 = OpExtInst  %7  %1 UMin %29 %133
+%131 = OpBitFieldSExtract  %5  %130 %132 %134
+OpStore %34 %131
+%135 = OpLoad  %6  %36
+%137 = OpExtInst  %7  %1 UMin %28 %74
+%138 = OpISub  %7  %74 %137
+%139 = OpExtInst  %7  %1 UMin %29 %138
+%136 = OpBitFieldSExtract  %6  %135 %137 %139
+OpStore %36 %136
+%140 = OpLoad  %7  %38
+%142 = OpExtInst  %7  %1 UMin %28 %74
+%143 = OpISub  %7  %74 %142
+%144 = OpExtInst  %7  %1 UMin %29 %143
+%141 = OpBitFieldUExtract  %7  %140 %142 %144
+OpStore %38 %141
+%145 = OpLoad  %8  %40
+%147 = OpExtInst  %7  %1 UMin %28 %74
+%148 = OpISub  %7  %74 %147
+%149 = OpExtInst  %7  %1 UMin %29 %148
+%146 = OpBitFieldUExtract  %8  %145 %147 %149
+OpStore %40 %146
+%150 = OpLoad  %9  %42
+%152 = OpExtInst  %7  %1 UMin %28 %74
+%153 = OpISub  %7  %74 %152
+%154 = OpExtInst  %7  %1 UMin %29 %153
+%151 = OpBitFieldUExtract  %9  %150 %152 %154
+OpStore %42 %151
+%155 = OpLoad  %10  %44
+%157 = OpExtInst  %7  %1 UMin %28 %74
+%158 = OpISub  %7  %74 %157
+%159 = OpExtInst  %7  %1 UMin %29 %158
+%156 = OpBitFieldUExtract  %10  %155 %157 %159
+OpStore %44 %156
+%160 = OpLoad  %3  %30
+%161 = OpExtInst  %3  %1 FindILsb %160
+OpStore %30 %161
+%162 = OpLoad  %8  %40
+%163 = OpExtInst  %8  %1 FindILsb %162
+OpStore %40 %163
+%164 = OpLoad  %5  %34
+%165 = OpExtInst  %5  %1 FindSMsb %164
+OpStore %34 %165
+%166 = OpLoad  %9  %42
+%167 = OpExtInst  %9  %1 FindUMsb %166
+OpStore %42 %167
+%168 = OpLoad  %3  %30
+%169 = OpExtInst  %3  %1 FindSMsb %168
+OpStore %30 %169
+%170 = OpLoad  %7  %38
+%171 = OpExtInst  %7  %1 FindUMsb %170
+OpStore %38 %171
+%172 = OpLoad  %3  %30
+%173 = OpBitCount  %3  %172
+OpStore %30 %173
+%174 = OpLoad  %4  %32
+%175 = OpBitCount  %4  %174
+OpStore %32 %175
+%176 = OpLoad  %5  %34
+%177 = OpBitCount  %5  %176
+OpStore %34 %177
+%178 = OpLoad  %6  %36
+%179 = OpBitCount  %6  %178
+OpStore %36 %179
+%180 = OpLoad  %7  %38
+%181 = OpBitCount  %7  %180
+OpStore %38 %181
+%182 = OpLoad  %8  %40
+%183 = OpBitCount  %8  %182
+OpStore %40 %183
+%184 = OpLoad  %9  %42
+%185 = OpBitCount  %9  %184
+OpStore %42 %185
+%186 = OpLoad  %10  %44
+%187 = OpBitCount  %10  %186
+OpStore %44 %187
+%188 = OpLoad  %3  %30
+%189 = OpBitReverse  %3  %188
+OpStore %30 %189
+%190 = OpLoad  %4  %32
+%191 = OpBitReverse  %4  %190
+OpStore %32 %191
+%192 = OpLoad  %5  %34
+%193 = OpBitReverse  %5  %192
+OpStore %34 %193
+%194 = OpLoad  %6  %36
+%195 = OpBitReverse  %6  %194
+OpStore %36 %195
+%196 = OpLoad  %7  %38
+%197 = OpBitReverse  %7  %196
+OpStore %38 %197
+%198 = OpLoad  %8  %40
+%199 = OpBitReverse  %8  %198
+OpStore %40 %199
+%200 = OpLoad  %9  %42
+%201 = OpBitReverse  %9  %200
+OpStore %42 %201
+%202 = OpLoad  %10  %44
+%203 = OpBitReverse  %10  %202
+OpStore %44 %203
 OpReturn
 OpFunctionEnd