diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a7646745..1e445ef1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -120,6 +120,7 @@ Bottom level categories: - Fix timeout when presenting a surface where no work has been done. By @waywardmonkeys in [#5200](https://github.com/gfx-rs/wgpu/pull/5200) - Simplify and speed up the allocation of internal IDs. By @nical in [#5229](https://github.com/gfx-rs/wgpu/pull/5229) - Fix an issue where command encoders weren't properly freed if an error occurred during command encoding. By @ErichDonGubler in [#5251](https://github.com/gfx-rs/wgpu/pull/5251). +- Fix behavior of `extractBits` and `insertBits` when `offset + count` overflows the bit width. By @cwfitzgerald in [#5305](https://github.com/gfx-rs/wgpu/pull/5305) - Fix registry leaks with de-duplicated resources. By @nical in [#5244](https://github.com/gfx-rs/wgpu/pull/5244) - Fix behavior of integer `clamp` when `min` argument > `max` argument. By @cwfitzgerald in [#5300](https://github.com/gfx-rs/wgpu/pull/5300). - Fix missing validation for `Device::clear_buffer` where `offset + size buffer.size` was not checked when `size` was omitted. By @ErichDonGubler in [#5282](https://github.com/gfx-rs/wgpu/pull/5282). diff --git a/naga/src/back/glsl/mod.rs b/naga/src/back/glsl/mod.rs index 9b716482a..f0a3d905b 100644 --- a/naga/src/back/glsl/mod.rs +++ b/naga/src/back/glsl/mod.rs @@ -1290,7 +1290,14 @@ impl<'a, W: Write> Writer<'a, W> { let inner = expr_info.ty.inner_with(&self.module.types); - if let Expression::Math { fun, arg, arg1, .. } = *expr { + if let Expression::Math { + fun, + arg, + arg1, + arg2, + .. + } = *expr + { match fun { crate::MathFunction::Dot => { // if the expression is a Dot product with integer arguments, @@ -1305,6 +1312,14 @@ impl<'a, W: Write> Writer<'a, W> { } } } + crate::MathFunction::ExtractBits => { + // Only argument 1 is re-used. + self.need_bake_expressions.insert(arg1.unwrap()); + } + crate::MathFunction::InsertBits => { + // Only argument 2 is re-used. + self.need_bake_expressions.insert(arg2.unwrap()); + } crate::MathFunction::CountLeadingZeros => { if let Some(crate::ScalarKind::Sint) = inner.scalar_kind() { self.need_bake_expressions.insert(arg); @@ -3375,8 +3390,59 @@ impl<'a, W: Write> Writer<'a, W> { } Mf::CountOneBits => "bitCount", Mf::ReverseBits => "bitfieldReverse", - Mf::ExtractBits => "bitfieldExtract", - Mf::InsertBits => "bitfieldInsert", + Mf::ExtractBits => { + // The behavior of ExtractBits is undefined when offset + count > bit_width. We need + // to first sanitize the offset and count first. If we don't do this, AMD and Intel chips + // will return out-of-spec values if the extracted range is not within the bit width. + // + // This encodes the exact formula specified by the wgsl spec, without temporary values: + // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin + // + // w = sizeof(x) * 8 + // o = min(offset, w) + // c = min(count, w - o) + // + // bitfieldExtract(x, o, c) + // + // extract_bits(e, min(offset, w), min(count, w - min(offset, w)))) + let scalar_bits = ctx + .resolve_type(arg, &self.module.types) + .scalar_width() + .unwrap(); + + write!(self.out, "bitfieldExtract(")?; + self.write_expr(arg, ctx)?; + write!(self.out, ", int(min(")?; + self.write_expr(arg1.unwrap(), ctx)?; + write!(self.out, ", {scalar_bits}u)), int(min(",)?; + self.write_expr(arg2.unwrap(), ctx)?; + write!(self.out, ", {scalar_bits}u - min(")?; + self.write_expr(arg1.unwrap(), ctx)?; + write!(self.out, ", {scalar_bits}u))))")?; + + return Ok(()); + } + Mf::InsertBits => { + // InsertBits has the same considerations as ExtractBits above + let scalar_bits = ctx + .resolve_type(arg, &self.module.types) + .scalar_width() + .unwrap(); + + write!(self.out, "bitfieldInsert(")?; + self.write_expr(arg, ctx)?; + write!(self.out, ", ")?; + self.write_expr(arg1.unwrap(), ctx)?; + write!(self.out, ", int(min(")?; + self.write_expr(arg2.unwrap(), ctx)?; + write!(self.out, ", {scalar_bits}u)), int(min(",)?; + self.write_expr(arg3.unwrap(), ctx)?; + write!(self.out, ", {scalar_bits}u - min(")?; + self.write_expr(arg2.unwrap(), ctx)?; + write!(self.out, ", {scalar_bits}u))))")?; + + return Ok(()); + } Mf::FindLsb => "findLSB", Mf::FindMsb => "findMSB", // data packing diff --git a/naga/src/back/hlsl/help.rs b/naga/src/back/hlsl/help.rs index fa6062a1a..4dd9ea598 100644 --- a/naga/src/back/hlsl/help.rs +++ b/naga/src/back/hlsl/help.rs @@ -26,7 +26,11 @@ int dim_1d = NagaDimensions1D(image_1d); ``` */ -use super::{super::FunctionCtx, BackendResult}; +use super::{ + super::FunctionCtx, + writer::{EXTRACT_BITS_FUNCTION, INSERT_BITS_FUNCTION}, + BackendResult, +}; use crate::{arena::Handle, proc::NameKey}; use std::fmt::Write; @@ -59,6 +63,13 @@ pub(super) struct WrappedMatCx2 { pub(super) columns: crate::VectorSize, } +#[derive(Clone, Copy, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)] +pub(super) struct WrappedMath { + pub(super) fun: crate::MathFunction, + pub(super) scalar: crate::Scalar, + pub(super) components: Option, +} + /// HLSL backend requires its own `ImageQuery` enum. /// /// It is used inside `WrappedImageQuery` and should be unique per ImageQuery function. @@ -851,12 +862,149 @@ impl<'a, W: Write> super::Writer<'a, W> { Ok(()) } + pub(super) fn write_wrapped_math_functions( + &mut self, + module: &crate::Module, + func_ctx: &FunctionCtx, + ) -> BackendResult { + for (_, expression) in func_ctx.expressions.iter() { + if let crate::Expression::Math { + fun, + arg, + arg1: _arg1, + arg2: _arg2, + arg3: _arg3, + } = *expression + { + match fun { + crate::MathFunction::ExtractBits => { + // The behavior of our extractBits polyfill is undefined if offset + count > bit_width. We need + // to first sanitize the offset and count first. If we don't do this, we will get out-of-spec + // values if the extracted range is not within the bit width. + // + // This encodes the exact formula specified by the wgsl spec: + // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin + // + // w = sizeof(x) * 8 + // o = min(offset, w) + // c = min(count, w - o) + // + // bitfieldExtract(x, o, c) + let arg_ty = func_ctx.resolve_type(arg, &module.types); + let scalar = arg_ty.scalar().unwrap(); + let components = arg_ty.components(); + + let wrapped = WrappedMath { + fun, + scalar, + components, + }; + + if !self.wrapped.math.insert(wrapped) { + continue; + } + + // Write return type + self.write_value_type(module, arg_ty)?; + + let scalar_width: u8 = scalar.width * 8; + + // Write function name and parameters + writeln!(self.out, " {EXTRACT_BITS_FUNCTION}(")?; + write!(self.out, " ")?; + self.write_value_type(module, arg_ty)?; + writeln!(self.out, " e,")?; + writeln!(self.out, " uint offset,")?; + writeln!(self.out, " uint count")?; + writeln!(self.out, ") {{")?; + + // Write function body + writeln!(self.out, " uint w = {scalar_width};")?; + writeln!(self.out, " uint o = min(offset, w);")?; + writeln!(self.out, " uint c = min(count, w - o);")?; + writeln!( + self.out, + " return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c));" + )?; + + // End of function body + writeln!(self.out, "}}")?; + } + crate::MathFunction::InsertBits => { + // The behavior of our insertBits polyfill has the same constraints as the extractBits polyfill. + + let arg_ty = func_ctx.resolve_type(arg, &module.types); + let scalar = arg_ty.scalar().unwrap(); + let components = arg_ty.components(); + + let wrapped = WrappedMath { + fun, + scalar, + components, + }; + + if !self.wrapped.math.insert(wrapped) { + continue; + } + + // Write return type + self.write_value_type(module, arg_ty)?; + + let scalar_width: u8 = scalar.width * 8; + let scalar_max: u64 = match scalar.width { + 1 => 0xFF, + 2 => 0xFFFF, + 4 => 0xFFFFFFFF, + 8 => 0xFFFFFFFFFFFFFFFF, + _ => unreachable!(), + }; + + // Write function name and parameters + writeln!(self.out, " {INSERT_BITS_FUNCTION}(")?; + write!(self.out, " ")?; + self.write_value_type(module, arg_ty)?; + writeln!(self.out, " e,")?; + write!(self.out, " ")?; + self.write_value_type(module, arg_ty)?; + writeln!(self.out, " newbits,")?; + writeln!(self.out, " uint offset,")?; + writeln!(self.out, " uint count")?; + writeln!(self.out, ") {{")?; + + // Write function body + writeln!(self.out, " uint w = {scalar_width}u;")?; + writeln!(self.out, " uint o = min(offset, w);")?; + writeln!(self.out, " uint c = min(count, w - o);")?; + + // The `u` suffix on the literals is _extremely_ important. Otherwise it will use + // i32 shifting instead of the intended u32 shifting. + writeln!( + self.out, + " uint mask = (({scalar_max}u >> ({scalar_width}u - c)) << o);" + )?; + writeln!( + self.out, + " return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask)));" + )?; + + // End of function body + writeln!(self.out, "}}")?; + } + _ => {} + } + } + } + + Ok(()) + } + /// Helper function that writes various wrapped functions pub(super) fn write_wrapped_functions( &mut self, module: &crate::Module, func_ctx: &FunctionCtx, ) -> BackendResult { + self.write_wrapped_math_functions(module, func_ctx)?; self.write_wrapped_compose_functions(module, func_ctx.expressions)?; for (handle, _) in func_ctx.expressions.iter() { diff --git a/naga/src/back/hlsl/keywords.rs b/naga/src/back/hlsl/keywords.rs index 059e533ff..2cb715c42 100644 --- a/naga/src/back/hlsl/keywords.rs +++ b/naga/src/back/hlsl/keywords.rs @@ -817,6 +817,8 @@ pub const RESERVED: &[&str] = &[ // Naga utilities super::writer::MODF_FUNCTION, super::writer::FREXP_FUNCTION, + super::writer::EXTRACT_BITS_FUNCTION, + super::writer::INSERT_BITS_FUNCTION, ]; // DXC scalar types, from https://github.com/microsoft/DirectXShaderCompiler/blob/18c9e114f9c314f93e68fbc72ce207d4ed2e65ae/tools/clang/lib/AST/ASTContextHLSL.cpp#L48-L254 diff --git a/naga/src/back/hlsl/mod.rs b/naga/src/back/hlsl/mod.rs index 37ddbd3d6..f37a223f4 100644 --- a/naga/src/back/hlsl/mod.rs +++ b/naga/src/back/hlsl/mod.rs @@ -256,6 +256,7 @@ struct Wrapped { constructors: crate::FastHashSet, struct_matrix_access: crate::FastHashSet, mat_cx2s: crate::FastHashSet, + math: crate::FastHashSet, } impl Wrapped { @@ -265,6 +266,7 @@ impl Wrapped { self.constructors.clear(); self.struct_matrix_access.clear(); self.mat_cx2s.clear(); + self.math.clear(); } } diff --git a/naga/src/back/hlsl/writer.rs b/naga/src/back/hlsl/writer.rs index 43f721283..4860651f7 100644 --- a/naga/src/back/hlsl/writer.rs +++ b/naga/src/back/hlsl/writer.rs @@ -19,6 +19,8 @@ const SPECIAL_OTHER: &str = "other"; pub(crate) const MODF_FUNCTION: &str = "naga_modf"; pub(crate) const FREXP_FUNCTION: &str = "naga_frexp"; +pub(crate) const EXTRACT_BITS_FUNCTION: &str = "naga_extractBits"; +pub(crate) const INSERT_BITS_FUNCTION: &str = "naga_insertBits"; struct EpStructMember { name: String, @@ -125,14 +127,7 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { self.need_bake_expressions.insert(fun_handle); } - if let Expression::Math { - fun, - arg, - arg1, - arg2, - arg3, - } = *expr - { + if let Expression::Math { fun, arg, .. } = *expr { match fun { crate::MathFunction::Asinh | crate::MathFunction::Acosh @@ -149,17 +144,6 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { | crate::MathFunction::Pack4x8unorm => { self.need_bake_expressions.insert(arg); } - crate::MathFunction::ExtractBits => { - self.need_bake_expressions.insert(arg); - self.need_bake_expressions.insert(arg1.unwrap()); - self.need_bake_expressions.insert(arg2.unwrap()); - } - crate::MathFunction::InsertBits => { - self.need_bake_expressions.insert(arg); - self.need_bake_expressions.insert(arg1.unwrap()); - self.need_bake_expressions.insert(arg2.unwrap()); - self.need_bake_expressions.insert(arg3.unwrap()); - } crate::MathFunction::CountLeadingZeros => { let inner = info[fun_handle].ty.inner_with(&module.types); if let Some(crate::ScalarKind::Sint) = inner.scalar_kind() { @@ -2620,8 +2604,6 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { enum Function { Asincosh { is_sin: bool }, Atanh, - ExtractBits, - InsertBits, Pack2x16float, Pack2x16snorm, Pack2x16unorm, @@ -2705,8 +2687,8 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { Mf::ReverseBits => Function::MissingIntOverload("reversebits"), Mf::FindLsb => Function::MissingIntReturnType("firstbitlow"), Mf::FindMsb => Function::MissingIntReturnType("firstbithigh"), - Mf::ExtractBits => Function::ExtractBits, - Mf::InsertBits => Function::InsertBits, + Mf::ExtractBits => Function::Regular(EXTRACT_BITS_FUNCTION), + Mf::InsertBits => Function::Regular(INSERT_BITS_FUNCTION), // Data Packing Mf::Pack2x16float => Function::Pack2x16float, Mf::Pack2x16snorm => Function::Pack2x16snorm, @@ -2742,70 +2724,6 @@ impl<'a, W: fmt::Write> super::Writer<'a, W> { self.write_expr(module, arg, func_ctx)?; write!(self.out, "))")?; } - Function::ExtractBits => { - // e: T, - // offset: u32, - // count: u32 - // T is u32 or i32 or vecN or vecN - if let (Some(offset), Some(count)) = (arg1, arg2) { - let scalar_width: u8 = 32; - // Works for signed and unsigned - // (count == 0 ? 0 : (e << (32 - count - offset)) >> (32 - count)) - write!(self.out, "(")?; - self.write_expr(module, count, func_ctx)?; - write!(self.out, " == 0 ? 0 : (")?; - self.write_expr(module, arg, func_ctx)?; - write!(self.out, " << ({scalar_width} - ")?; - self.write_expr(module, count, func_ctx)?; - write!(self.out, " - ")?; - self.write_expr(module, offset, func_ctx)?; - write!(self.out, ")) >> ({scalar_width} - ")?; - self.write_expr(module, count, func_ctx)?; - write!(self.out, "))")?; - } - } - Function::InsertBits => { - // e: T, - // newbits: T, - // offset: u32, - // count: u32 - // returns T - // T is i32, u32, vecN, or vecN - if let (Some(newbits), Some(offset), Some(count)) = (arg1, arg2, arg3) { - let scalar_width: u8 = 32; - let scalar_max: u32 = 0xFFFFFFFF; - // mask = ((0xFFFFFFFFu >> (32 - count)) << offset) - // (count == 0 ? e : ((e & ~mask) | ((newbits << offset) & mask))) - write!(self.out, "(")?; - self.write_expr(module, count, func_ctx)?; - write!(self.out, " == 0 ? ")?; - self.write_expr(module, arg, func_ctx)?; - write!(self.out, " : ")?; - write!(self.out, "(")?; - self.write_expr(module, arg, func_ctx)?; - write!(self.out, " & ~")?; - // mask - write!(self.out, "(({scalar_max}u >> ({scalar_width}u - ")?; - self.write_expr(module, count, func_ctx)?; - write!(self.out, ")) << ")?; - self.write_expr(module, offset, func_ctx)?; - write!(self.out, ")")?; - // end mask - write!(self.out, ") | ((")?; - self.write_expr(module, newbits, func_ctx)?; - write!(self.out, " << ")?; - self.write_expr(module, offset, func_ctx)?; - write!(self.out, ") & ")?; - // // mask - write!(self.out, "(({scalar_max}u >> ({scalar_width}u - ")?; - self.write_expr(module, count, func_ctx)?; - write!(self.out, ")) << ")?; - self.write_expr(module, offset, func_ctx)?; - write!(self.out, ")")?; - // // end mask - write!(self.out, "))")?; - } - } Function::Pack2x16float => { write!(self.out, "(f32tof16(")?; self.write_expr(module, arg, func_ctx)?; diff --git a/naga/src/back/msl/writer.rs b/naga/src/back/msl/writer.rs index 7542ae794..ac1c654a3 100644 --- a/naga/src/back/msl/writer.rs +++ b/naga/src/back/msl/writer.rs @@ -1794,8 +1794,8 @@ impl Writer { Mf::CountLeadingZeros => "clz", Mf::CountOneBits => "popcount", Mf::ReverseBits => "reverse_bits", - Mf::ExtractBits => "extract_bits", - Mf::InsertBits => "insert_bits", + Mf::ExtractBits => "", + Mf::InsertBits => "", Mf::FindLsb => "", Mf::FindMsb => "", // data packing @@ -1891,6 +1891,52 @@ impl Writer { write!(self.out, "as_type(half2(")?; self.put_expression(arg, context, false)?; write!(self.out, "))")?; + } else if fun == Mf::ExtractBits { + // The behavior of ExtractBits is undefined when offset + count > bit_width. We need + // to first sanitize the offset and count first. If we don't do this, Apple chips + // will return out-of-spec values if the extracted range is not within the bit width. + // + // This encodes the exact formula specified by the wgsl spec, without temporary values: + // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin + // + // w = sizeof(x) * 8 + // o = min(offset, w) + // tmp = w - o + // c = min(count, tmp) + // + // bitfieldExtract(x, o, c) + // + // extract_bits(e, min(offset, w), min(count, w - min(offset, w)))) + + let scalar_bits = context.resolve_type(arg).scalar_width().unwrap(); + + write!(self.out, "{NAMESPACE}::extract_bits(")?; + self.put_expression(arg, context, true)?; + write!(self.out, ", {NAMESPACE}::min(")?; + self.put_expression(arg1.unwrap(), context, true)?; + write!(self.out, ", {scalar_bits}u), {NAMESPACE}::min(")?; + self.put_expression(arg2.unwrap(), context, true)?; + write!(self.out, ", {scalar_bits}u - {NAMESPACE}::min(")?; + self.put_expression(arg1.unwrap(), context, true)?; + write!(self.out, ", {scalar_bits}u)))")?; + } else if fun == Mf::InsertBits { + // The behavior of InsertBits has the same issue as ExtractBits. + // + // insertBits(e, newBits, min(offset, w), min(count, w - min(offset, w)))) + + let scalar_bits = context.resolve_type(arg).scalar_width().unwrap(); + + write!(self.out, "{NAMESPACE}::insert_bits(")?; + self.put_expression(arg, context, true)?; + write!(self.out, ", ")?; + self.put_expression(arg1.unwrap(), context, true)?; + write!(self.out, ", {NAMESPACE}::min(")?; + self.put_expression(arg2.unwrap(), context, true)?; + write!(self.out, ", {scalar_bits}u), {NAMESPACE}::min(")?; + self.put_expression(arg3.unwrap(), context, true)?; + write!(self.out, ", {scalar_bits}u - {NAMESPACE}::min(")?; + self.put_expression(arg2.unwrap(), context, true)?; + write!(self.out, ", {scalar_bits}u)))")?; } else if fun == Mf::Radians { write!(self.out, "((")?; self.put_expression(arg, context, false)?; @@ -2489,7 +2535,14 @@ impl Writer { } } - if let Expression::Math { fun, arg, arg1, .. } = *expr { + if let Expression::Math { + fun, + arg, + arg1, + arg2, + .. + } = *expr + { match fun { crate::MathFunction::Dot => { // WGSL's `dot` function works on any `vecN` type, but Metal's only @@ -2514,6 +2567,14 @@ impl Writer { crate::MathFunction::FindMsb => { self.need_bake_expressions.insert(arg); } + crate::MathFunction::ExtractBits => { + // Only argument 1 is re-used. + self.need_bake_expressions.insert(arg1.unwrap()); + } + crate::MathFunction::InsertBits => { + // Only argument 2 is re-used. + self.need_bake_expressions.insert(arg2.unwrap()); + } crate::MathFunction::Sign => { // WGSL's `sign` function works also on signed ints, but Metal's only // works on floating points, so we emit inline code for integer `sign` diff --git a/naga/src/back/spv/block.rs b/naga/src/back/spv/block.rs index cbb8e92e7..d8c04c88c 100644 --- a/naga/src/back/spv/block.rs +++ b/naga/src/back/spv/block.rs @@ -1050,24 +1050,131 @@ impl<'w> BlockContext<'w> { Some(crate::ScalarKind::Sint) => spirv::Op::BitFieldSExtract, other => unimplemented!("Unexpected sign({:?})", other), }; + + // The behavior of ExtractBits is undefined when offset + count > bit_width. We need + // to first sanitize the offset and count first. If we don't do this, AMD and Intel + // will return out-of-spec values if the extracted range is not within the bit width. + // + // This encodes the exact formula specified by the wgsl spec: + // https://gpuweb.github.io/gpuweb/wgsl/#extractBits-unsigned-builtin + // + // w = sizeof(x) * 8 + // o = min(offset, w) + // tmp = w - o + // c = min(count, tmp) + // + // bitfieldExtract(x, o, c) + + let bit_width = arg_ty.scalar_width().unwrap(); + let width_constant = self + .writer + .get_constant_scalar(crate::Literal::U32(bit_width as u32)); + + let u32_type = self.get_type_id(LookupType::Local(LocalType::Value { + vector_size: None, + scalar: crate::Scalar { + kind: crate::ScalarKind::Uint, + width: 4, + }, + pointer_space: None, + })); + + // o = min(offset, w) + let offset_id = self.gen_id(); + block.body.push(Instruction::ext_inst( + self.writer.gl450_ext_inst_id, + spirv::GLOp::UMin, + u32_type, + offset_id, + &[arg1_id, width_constant], + )); + + // tmp = w - o + let max_count_id = self.gen_id(); + block.body.push(Instruction::binary( + spirv::Op::ISub, + u32_type, + max_count_id, + width_constant, + offset_id, + )); + + // c = min(count, tmp) + let count_id = self.gen_id(); + block.body.push(Instruction::ext_inst( + self.writer.gl450_ext_inst_id, + spirv::GLOp::UMin, + u32_type, + count_id, + &[arg2_id, max_count_id], + )); + MathOp::Custom(Instruction::ternary( op, result_type_id, id, arg0_id, - arg1_id, - arg2_id, + offset_id, + count_id, + )) + } + Mf::InsertBits => { + // The behavior of InsertBits has the same undefined behavior as ExtractBits. + + let bit_width = arg_ty.scalar_width().unwrap(); + let width_constant = self + .writer + .get_constant_scalar(crate::Literal::U32(bit_width as u32)); + + let u32_type = self.get_type_id(LookupType::Local(LocalType::Value { + vector_size: None, + scalar: crate::Scalar { + kind: crate::ScalarKind::Uint, + width: 4, + }, + pointer_space: None, + })); + + // o = min(offset, w) + let offset_id = self.gen_id(); + block.body.push(Instruction::ext_inst( + self.writer.gl450_ext_inst_id, + spirv::GLOp::UMin, + u32_type, + offset_id, + &[arg2_id, width_constant], + )); + + // tmp = w - o + let max_count_id = self.gen_id(); + block.body.push(Instruction::binary( + spirv::Op::ISub, + u32_type, + max_count_id, + width_constant, + offset_id, + )); + + // c = min(count, tmp) + let count_id = self.gen_id(); + block.body.push(Instruction::ext_inst( + self.writer.gl450_ext_inst_id, + spirv::GLOp::UMin, + u32_type, + count_id, + &[arg3_id, max_count_id], + )); + + MathOp::Custom(Instruction::quaternary( + spirv::Op::BitFieldInsert, + result_type_id, + id, + arg0_id, + arg1_id, + offset_id, + count_id, )) } - Mf::InsertBits => MathOp::Custom(Instruction::quaternary( - spirv::Op::BitFieldInsert, - result_type_id, - id, - arg0_id, - arg1_id, - arg2_id, - arg3_id, - )), Mf::FindLsb => MathOp::Ext(spirv::GLOp::FindILsb), Mf::FindMsb => MathOp::Ext(match arg_scalar_kind { Some(crate::ScalarKind::Uint) => spirv::GLOp::FindUMsb, diff --git a/naga/src/lib.rs b/naga/src/lib.rs index 65703f684..8773f1225 100644 --- a/naga/src/lib.rs +++ b/naga/src/lib.rs @@ -491,7 +491,7 @@ pub enum ScalarKind { } /// Characteristics of a scalar type. -#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] #[cfg_attr(feature = "serialize", derive(Serialize))] #[cfg_attr(feature = "deserialize", derive(Deserialize))] #[cfg_attr(feature = "arbitrary", derive(Arbitrary))] diff --git a/naga/tests/out/glsl/bits.main.Compute.glsl b/naga/tests/out/glsl/bits.main.Compute.glsl index f991f532a..a5cc0f7c6 100644 --- a/naga/tests/out/glsl/bits.main.Compute.glsl +++ b/naga/tests/out/glsl/bits.main.Compute.glsl @@ -39,44 +39,44 @@ void main() { f2_ = unpackHalf2x16(_e46); int _e48 = i; int _e49 = i; - i = bitfieldInsert(_e48, _e49, int(5u), int(10u)); + i = bitfieldInsert(_e48, _e49, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); ivec2 _e53 = i2_; ivec2 _e54 = i2_; - i2_ = bitfieldInsert(_e53, _e54, int(5u), int(10u)); + i2_ = bitfieldInsert(_e53, _e54, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); ivec3 _e58 = i3_; ivec3 _e59 = i3_; - i3_ = bitfieldInsert(_e58, _e59, int(5u), int(10u)); + i3_ = bitfieldInsert(_e58, _e59, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); ivec4 _e63 = i4_; ivec4 _e64 = i4_; - i4_ = bitfieldInsert(_e63, _e64, int(5u), int(10u)); + i4_ = bitfieldInsert(_e63, _e64, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); uint _e68 = u; uint _e69 = u; - u = bitfieldInsert(_e68, _e69, int(5u), int(10u)); + u = bitfieldInsert(_e68, _e69, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); uvec2 _e73 = u2_; uvec2 _e74 = u2_; - u2_ = bitfieldInsert(_e73, _e74, int(5u), int(10u)); + u2_ = bitfieldInsert(_e73, _e74, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); uvec3 _e78 = u3_; uvec3 _e79 = u3_; - u3_ = bitfieldInsert(_e78, _e79, int(5u), int(10u)); + u3_ = bitfieldInsert(_e78, _e79, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); uvec4 _e83 = u4_; uvec4 _e84 = u4_; - u4_ = bitfieldInsert(_e83, _e84, int(5u), int(10u)); + u4_ = bitfieldInsert(_e83, _e84, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); int _e88 = i; - i = bitfieldExtract(_e88, int(5u), int(10u)); + i = bitfieldExtract(_e88, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); ivec2 _e92 = i2_; - i2_ = bitfieldExtract(_e92, int(5u), int(10u)); + i2_ = bitfieldExtract(_e92, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); ivec3 _e96 = i3_; - i3_ = bitfieldExtract(_e96, int(5u), int(10u)); + i3_ = bitfieldExtract(_e96, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); ivec4 _e100 = i4_; - i4_ = bitfieldExtract(_e100, int(5u), int(10u)); + i4_ = bitfieldExtract(_e100, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); uint _e104 = u; - u = bitfieldExtract(_e104, int(5u), int(10u)); + u = bitfieldExtract(_e104, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); uvec2 _e108 = u2_; - u2_ = bitfieldExtract(_e108, int(5u), int(10u)); + u2_ = bitfieldExtract(_e108, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); uvec3 _e112 = u3_; - u3_ = bitfieldExtract(_e112, int(5u), int(10u)); + u3_ = bitfieldExtract(_e112, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); uvec4 _e116 = u4_; - u4_ = bitfieldExtract(_e116, int(5u), int(10u)); + u4_ = bitfieldExtract(_e116, int(min(5u, 32u)), int(min(10u, 32u - min(5u, 32u)))); int _e120 = i; i = findLSB(_e120); uvec2 _e122 = u2_; diff --git a/naga/tests/out/hlsl/bits.hlsl b/naga/tests/out/hlsl/bits.hlsl index 8ae2f7e1f..7cfaeddea 100644 --- a/naga/tests/out/hlsl/bits.hlsl +++ b/naga/tests/out/hlsl/bits.hlsl @@ -1,3 +1,179 @@ +int naga_insertBits( + int e, + int newbits, + uint offset, + uint count +) { + uint w = 32u; + uint o = min(offset, w); + uint c = min(count, w - o); + uint mask = ((4294967295u >> (32u - c)) << o); + return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask))); +} +int2 naga_insertBits( + int2 e, + int2 newbits, + uint offset, + uint count +) { + uint w = 32u; + uint o = min(offset, w); + uint c = min(count, w - o); + uint mask = ((4294967295u >> (32u - c)) << o); + return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask))); +} +int3 naga_insertBits( + int3 e, + int3 newbits, + uint offset, + uint count +) { + uint w = 32u; + uint o = min(offset, w); + uint c = min(count, w - o); + uint mask = ((4294967295u >> (32u - c)) << o); + return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask))); +} +int4 naga_insertBits( + int4 e, + int4 newbits, + uint offset, + uint count +) { + uint w = 32u; + uint o = min(offset, w); + uint c = min(count, w - o); + uint mask = ((4294967295u >> (32u - c)) << o); + return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask))); +} +uint naga_insertBits( + uint e, + uint newbits, + uint offset, + uint count +) { + uint w = 32u; + uint o = min(offset, w); + uint c = min(count, w - o); + uint mask = ((4294967295u >> (32u - c)) << o); + return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask))); +} +uint2 naga_insertBits( + uint2 e, + uint2 newbits, + uint offset, + uint count +) { + uint w = 32u; + uint o = min(offset, w); + uint c = min(count, w - o); + uint mask = ((4294967295u >> (32u - c)) << o); + return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask))); +} +uint3 naga_insertBits( + uint3 e, + uint3 newbits, + uint offset, + uint count +) { + uint w = 32u; + uint o = min(offset, w); + uint c = min(count, w - o); + uint mask = ((4294967295u >> (32u - c)) << o); + return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask))); +} +uint4 naga_insertBits( + uint4 e, + uint4 newbits, + uint offset, + uint count +) { + uint w = 32u; + uint o = min(offset, w); + uint c = min(count, w - o); + uint mask = ((4294967295u >> (32u - c)) << o); + return (c == 0 ? e : ((e & ~mask) | ((newbits << o) & mask))); +} +int naga_extractBits( + int e, + uint offset, + uint count +) { + uint w = 32; + uint o = min(offset, w); + uint c = min(count, w - o); + return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c)); +} +int2 naga_extractBits( + int2 e, + uint offset, + uint count +) { + uint w = 32; + uint o = min(offset, w); + uint c = min(count, w - o); + return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c)); +} +int3 naga_extractBits( + int3 e, + uint offset, + uint count +) { + uint w = 32; + uint o = min(offset, w); + uint c = min(count, w - o); + return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c)); +} +int4 naga_extractBits( + int4 e, + uint offset, + uint count +) { + uint w = 32; + uint o = min(offset, w); + uint c = min(count, w - o); + return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c)); +} +uint naga_extractBits( + uint e, + uint offset, + uint count +) { + uint w = 32; + uint o = min(offset, w); + uint c = min(count, w - o); + return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c)); +} +uint2 naga_extractBits( + uint2 e, + uint offset, + uint count +) { + uint w = 32; + uint o = min(offset, w); + uint c = min(count, w - o); + return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c)); +} +uint3 naga_extractBits( + uint3 e, + uint offset, + uint count +) { + uint w = 32; + uint o = min(offset, w); + uint c = min(count, w - o); + return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c)); +} +uint4 naga_extractBits( + uint4 e, + uint offset, + uint count +) { + uint w = 32; + uint o = min(offset, w); + uint c = min(count, w - o); + return (c == 0 ? 0 : (e << (w - c - o)) >> (w - c)); +} [numthreads(1, 1, 1)] void main() { @@ -34,44 +210,44 @@ void main() f2_ = float2(f16tof32(_expr46), f16tof32((_expr46) >> 16)); int _expr48 = i; int _expr49 = i; - i = (10u == 0 ? _expr48 : (_expr48 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr49 << 5u) & ((4294967295u >> (32u - 10u)) << 5u))); + i = naga_insertBits(_expr48, _expr49, 5u, 10u); int2 _expr53 = i2_; int2 _expr54 = i2_; - i2_ = (10u == 0 ? _expr53 : (_expr53 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr54 << 5u) & ((4294967295u >> (32u - 10u)) << 5u))); + i2_ = naga_insertBits(_expr53, _expr54, 5u, 10u); int3 _expr58 = i3_; int3 _expr59 = i3_; - i3_ = (10u == 0 ? _expr58 : (_expr58 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr59 << 5u) & ((4294967295u >> (32u - 10u)) << 5u))); + i3_ = naga_insertBits(_expr58, _expr59, 5u, 10u); int4 _expr63 = i4_; int4 _expr64 = i4_; - i4_ = (10u == 0 ? _expr63 : (_expr63 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr64 << 5u) & ((4294967295u >> (32u - 10u)) << 5u))); + i4_ = naga_insertBits(_expr63, _expr64, 5u, 10u); uint _expr68 = u; uint _expr69 = u; - u = (10u == 0 ? _expr68 : (_expr68 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr69 << 5u) & ((4294967295u >> (32u - 10u)) << 5u))); + u = naga_insertBits(_expr68, _expr69, 5u, 10u); uint2 _expr73 = u2_; uint2 _expr74 = u2_; - u2_ = (10u == 0 ? _expr73 : (_expr73 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr74 << 5u) & ((4294967295u >> (32u - 10u)) << 5u))); + u2_ = naga_insertBits(_expr73, _expr74, 5u, 10u); uint3 _expr78 = u3_; uint3 _expr79 = u3_; - u3_ = (10u == 0 ? _expr78 : (_expr78 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr79 << 5u) & ((4294967295u >> (32u - 10u)) << 5u))); + u3_ = naga_insertBits(_expr78, _expr79, 5u, 10u); uint4 _expr83 = u4_; uint4 _expr84 = u4_; - u4_ = (10u == 0 ? _expr83 : (_expr83 & ~((4294967295u >> (32u - 10u)) << 5u)) | ((_expr84 << 5u) & ((4294967295u >> (32u - 10u)) << 5u))); + u4_ = naga_insertBits(_expr83, _expr84, 5u, 10u); int _expr88 = i; - i = (10u == 0 ? 0 : (_expr88 << (32 - 10u - 5u)) >> (32 - 10u)); + i = naga_extractBits(_expr88, 5u, 10u); int2 _expr92 = i2_; - i2_ = (10u == 0 ? 0 : (_expr92 << (32 - 10u - 5u)) >> (32 - 10u)); + i2_ = naga_extractBits(_expr92, 5u, 10u); int3 _expr96 = i3_; - i3_ = (10u == 0 ? 0 : (_expr96 << (32 - 10u - 5u)) >> (32 - 10u)); + i3_ = naga_extractBits(_expr96, 5u, 10u); int4 _expr100 = i4_; - i4_ = (10u == 0 ? 0 : (_expr100 << (32 - 10u - 5u)) >> (32 - 10u)); + i4_ = naga_extractBits(_expr100, 5u, 10u); uint _expr104 = u; - u = (10u == 0 ? 0 : (_expr104 << (32 - 10u - 5u)) >> (32 - 10u)); + u = naga_extractBits(_expr104, 5u, 10u); uint2 _expr108 = u2_; - u2_ = (10u == 0 ? 0 : (_expr108 << (32 - 10u - 5u)) >> (32 - 10u)); + u2_ = naga_extractBits(_expr108, 5u, 10u); uint3 _expr112 = u3_; - u3_ = (10u == 0 ? 0 : (_expr112 << (32 - 10u - 5u)) >> (32 - 10u)); + u3_ = naga_extractBits(_expr112, 5u, 10u); uint4 _expr116 = u4_; - u4_ = (10u == 0 ? 0 : (_expr116 << (32 - 10u - 5u)) >> (32 - 10u)); + u4_ = naga_extractBits(_expr116, 5u, 10u); int _expr120 = i; i = asint(firstbitlow(_expr120)); uint2 _expr122 = u2_; diff --git a/naga/tests/out/msl/bits.msl b/naga/tests/out/msl/bits.msl index 7d73568b7..20f0f8de9 100644 --- a/naga/tests/out/msl/bits.msl +++ b/naga/tests/out/msl/bits.msl @@ -39,44 +39,44 @@ kernel void main_( f2_ = float2(as_type(_e46)); int _e48 = i; int _e49 = i; - i = metal::insert_bits(_e48, _e49, 5u, 10u); + i = metal::insert_bits(_e48, _e49, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::int2 _e53 = i2_; metal::int2 _e54 = i2_; - i2_ = metal::insert_bits(_e53, _e54, 5u, 10u); + i2_ = metal::insert_bits(_e53, _e54, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::int3 _e58 = i3_; metal::int3 _e59 = i3_; - i3_ = metal::insert_bits(_e58, _e59, 5u, 10u); + i3_ = metal::insert_bits(_e58, _e59, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::int4 _e63 = i4_; metal::int4 _e64 = i4_; - i4_ = metal::insert_bits(_e63, _e64, 5u, 10u); + i4_ = metal::insert_bits(_e63, _e64, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); uint _e68 = u; uint _e69 = u; - u = metal::insert_bits(_e68, _e69, 5u, 10u); + u = metal::insert_bits(_e68, _e69, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::uint2 _e73 = u2_; metal::uint2 _e74 = u2_; - u2_ = metal::insert_bits(_e73, _e74, 5u, 10u); + u2_ = metal::insert_bits(_e73, _e74, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::uint3 _e78 = u3_; metal::uint3 _e79 = u3_; - u3_ = metal::insert_bits(_e78, _e79, 5u, 10u); + u3_ = metal::insert_bits(_e78, _e79, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::uint4 _e83 = u4_; metal::uint4 _e84 = u4_; - u4_ = metal::insert_bits(_e83, _e84, 5u, 10u); + u4_ = metal::insert_bits(_e83, _e84, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); int _e88 = i; - i = metal::extract_bits(_e88, 5u, 10u); + i = metal::extract_bits(_e88, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::int2 _e92 = i2_; - i2_ = metal::extract_bits(_e92, 5u, 10u); + i2_ = metal::extract_bits(_e92, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::int3 _e96 = i3_; - i3_ = metal::extract_bits(_e96, 5u, 10u); + i3_ = metal::extract_bits(_e96, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::int4 _e100 = i4_; - i4_ = metal::extract_bits(_e100, 5u, 10u); + i4_ = metal::extract_bits(_e100, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); uint _e104 = u; - u = metal::extract_bits(_e104, 5u, 10u); + u = metal::extract_bits(_e104, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::uint2 _e108 = u2_; - u2_ = metal::extract_bits(_e108, 5u, 10u); + u2_ = metal::extract_bits(_e108, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::uint3 _e112 = u3_; - u3_ = metal::extract_bits(_e112, 5u, 10u); + u3_ = metal::extract_bits(_e112, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); metal::uint4 _e116 = u4_; - u4_ = metal::extract_bits(_e116, 5u, 10u); + u4_ = metal::extract_bits(_e116, metal::min(5u, 32u), metal::min(10u, 32u - metal::min(5u, 32u))); int _e120 = i; i = (((metal::ctz(_e120) + 1) % 33) - 1); metal::uint2 _e122 = u2_; diff --git a/naga/tests/out/spv/bits.spvasm b/naga/tests/out/spv/bits.spvasm index a77c4470a..33e2bb9e5 100644 --- a/naga/tests/out/spv/bits.spvasm +++ b/naga/tests/out/spv/bits.spvasm @@ -1,7 +1,7 @@ ; SPIR-V ; Version: 1.1 ; Generator: rspirv -; Bound: 155 +; Bound: 204 OpCapability Shader %1 = OpExtInstImport "GLSL.std.450" OpMemoryModel Logical GLSL450 @@ -43,6 +43,7 @@ OpExecutionMode %15 LocalSize 1 1 1 %45 = OpTypePointer Function %10 %47 = OpTypePointer Function %11 %49 = OpTypePointer Function %13 +%74 = OpConstant %7 32 %15 = OpFunction %2 None %16 %14 = OpLabel %48 = OpVariable %49 Function %27 @@ -89,125 +90,173 @@ OpStore %46 %68 OpStore %46 %70 %71 = OpLoad %3 %30 %72 = OpLoad %3 %30 -%73 = OpBitFieldInsert %3 %71 %72 %28 %29 +%75 = OpExtInst %7 %1 UMin %28 %74 +%76 = OpISub %7 %74 %75 +%77 = OpExtInst %7 %1 UMin %29 %76 +%73 = OpBitFieldInsert %3 %71 %72 %75 %77 OpStore %30 %73 -%74 = OpLoad %4 %32 -%75 = OpLoad %4 %32 -%76 = OpBitFieldInsert %4 %74 %75 %28 %29 -OpStore %32 %76 -%77 = OpLoad %5 %34 -%78 = OpLoad %5 %34 -%79 = OpBitFieldInsert %5 %77 %78 %28 %29 -OpStore %34 %79 -%80 = OpLoad %6 %36 -%81 = OpLoad %6 %36 -%82 = OpBitFieldInsert %6 %80 %81 %28 %29 -OpStore %36 %82 -%83 = OpLoad %7 %38 -%84 = OpLoad %7 %38 -%85 = OpBitFieldInsert %7 %83 %84 %28 %29 -OpStore %38 %85 -%86 = OpLoad %8 %40 -%87 = OpLoad %8 %40 -%88 = OpBitFieldInsert %8 %86 %87 %28 %29 -OpStore %40 %88 -%89 = OpLoad %9 %42 -%90 = OpLoad %9 %42 -%91 = OpBitFieldInsert %9 %89 %90 %28 %29 -OpStore %42 %91 -%92 = OpLoad %10 %44 -%93 = OpLoad %10 %44 -%94 = OpBitFieldInsert %10 %92 %93 %28 %29 -OpStore %44 %94 -%95 = OpLoad %3 %30 -%96 = OpBitFieldSExtract %3 %95 %28 %29 -OpStore %30 %96 -%97 = OpLoad %4 %32 -%98 = OpBitFieldSExtract %4 %97 %28 %29 -OpStore %32 %98 -%99 = OpLoad %5 %34 -%100 = OpBitFieldSExtract %5 %99 %28 %29 -OpStore %34 %100 -%101 = OpLoad %6 %36 -%102 = OpBitFieldSExtract %6 %101 %28 %29 -OpStore %36 %102 -%103 = OpLoad %7 %38 -%104 = OpBitFieldUExtract %7 %103 %28 %29 -OpStore %38 %104 -%105 = OpLoad %8 %40 -%106 = OpBitFieldUExtract %8 %105 %28 %29 -OpStore %40 %106 -%107 = OpLoad %9 %42 -%108 = OpBitFieldUExtract %9 %107 %28 %29 -OpStore %42 %108 -%109 = OpLoad %10 %44 -%110 = OpBitFieldUExtract %10 %109 %28 %29 -OpStore %44 %110 -%111 = OpLoad %3 %30 -%112 = OpExtInst %3 %1 FindILsb %111 -OpStore %30 %112 -%113 = OpLoad %8 %40 -%114 = OpExtInst %8 %1 FindILsb %113 -OpStore %40 %114 -%115 = OpLoad %5 %34 -%116 = OpExtInst %5 %1 FindSMsb %115 -OpStore %34 %116 -%117 = OpLoad %9 %42 -%118 = OpExtInst %9 %1 FindUMsb %117 -OpStore %42 %118 -%119 = OpLoad %3 %30 -%120 = OpExtInst %3 %1 FindSMsb %119 -OpStore %30 %120 -%121 = OpLoad %7 %38 -%122 = OpExtInst %7 %1 FindUMsb %121 -OpStore %38 %122 -%123 = OpLoad %3 %30 -%124 = OpBitCount %3 %123 -OpStore %30 %124 +%78 = OpLoad %4 %32 +%79 = OpLoad %4 %32 +%81 = OpExtInst %7 %1 UMin %28 %74 +%82 = OpISub %7 %74 %81 +%83 = OpExtInst %7 %1 UMin %29 %82 +%80 = OpBitFieldInsert %4 %78 %79 %81 %83 +OpStore %32 %80 +%84 = OpLoad %5 %34 +%85 = OpLoad %5 %34 +%87 = OpExtInst %7 %1 UMin %28 %74 +%88 = OpISub %7 %74 %87 +%89 = OpExtInst %7 %1 UMin %29 %88 +%86 = OpBitFieldInsert %5 %84 %85 %87 %89 +OpStore %34 %86 +%90 = OpLoad %6 %36 +%91 = OpLoad %6 %36 +%93 = OpExtInst %7 %1 UMin %28 %74 +%94 = OpISub %7 %74 %93 +%95 = OpExtInst %7 %1 UMin %29 %94 +%92 = OpBitFieldInsert %6 %90 %91 %93 %95 +OpStore %36 %92 +%96 = OpLoad %7 %38 +%97 = OpLoad %7 %38 +%99 = OpExtInst %7 %1 UMin %28 %74 +%100 = OpISub %7 %74 %99 +%101 = OpExtInst %7 %1 UMin %29 %100 +%98 = OpBitFieldInsert %7 %96 %97 %99 %101 +OpStore %38 %98 +%102 = OpLoad %8 %40 +%103 = OpLoad %8 %40 +%105 = OpExtInst %7 %1 UMin %28 %74 +%106 = OpISub %7 %74 %105 +%107 = OpExtInst %7 %1 UMin %29 %106 +%104 = OpBitFieldInsert %8 %102 %103 %105 %107 +OpStore %40 %104 +%108 = OpLoad %9 %42 +%109 = OpLoad %9 %42 +%111 = OpExtInst %7 %1 UMin %28 %74 +%112 = OpISub %7 %74 %111 +%113 = OpExtInst %7 %1 UMin %29 %112 +%110 = OpBitFieldInsert %9 %108 %109 %111 %113 +OpStore %42 %110 +%114 = OpLoad %10 %44 +%115 = OpLoad %10 %44 +%117 = OpExtInst %7 %1 UMin %28 %74 +%118 = OpISub %7 %74 %117 +%119 = OpExtInst %7 %1 UMin %29 %118 +%116 = OpBitFieldInsert %10 %114 %115 %117 %119 +OpStore %44 %116 +%120 = OpLoad %3 %30 +%122 = OpExtInst %7 %1 UMin %28 %74 +%123 = OpISub %7 %74 %122 +%124 = OpExtInst %7 %1 UMin %29 %123 +%121 = OpBitFieldSExtract %3 %120 %122 %124 +OpStore %30 %121 %125 = OpLoad %4 %32 -%126 = OpBitCount %4 %125 +%127 = OpExtInst %7 %1 UMin %28 %74 +%128 = OpISub %7 %74 %127 +%129 = OpExtInst %7 %1 UMin %29 %128 +%126 = OpBitFieldSExtract %4 %125 %127 %129 OpStore %32 %126 -%127 = OpLoad %5 %34 -%128 = OpBitCount %5 %127 -OpStore %34 %128 -%129 = OpLoad %6 %36 -%130 = OpBitCount %6 %129 -OpStore %36 %130 -%131 = OpLoad %7 %38 -%132 = OpBitCount %7 %131 -OpStore %38 %132 -%133 = OpLoad %8 %40 -%134 = OpBitCount %8 %133 -OpStore %40 %134 -%135 = OpLoad %9 %42 -%136 = OpBitCount %9 %135 -OpStore %42 %136 -%137 = OpLoad %10 %44 -%138 = OpBitCount %10 %137 -OpStore %44 %138 -%139 = OpLoad %3 %30 -%140 = OpBitReverse %3 %139 -OpStore %30 %140 -%141 = OpLoad %4 %32 -%142 = OpBitReverse %4 %141 -OpStore %32 %142 -%143 = OpLoad %5 %34 -%144 = OpBitReverse %5 %143 -OpStore %34 %144 -%145 = OpLoad %6 %36 -%146 = OpBitReverse %6 %145 -OpStore %36 %146 -%147 = OpLoad %7 %38 -%148 = OpBitReverse %7 %147 -OpStore %38 %148 -%149 = OpLoad %8 %40 -%150 = OpBitReverse %8 %149 -OpStore %40 %150 -%151 = OpLoad %9 %42 -%152 = OpBitReverse %9 %151 -OpStore %42 %152 -%153 = OpLoad %10 %44 -%154 = OpBitReverse %10 %153 -OpStore %44 %154 +%130 = OpLoad %5 %34 +%132 = OpExtInst %7 %1 UMin %28 %74 +%133 = OpISub %7 %74 %132 +%134 = OpExtInst %7 %1 UMin %29 %133 +%131 = OpBitFieldSExtract %5 %130 %132 %134 +OpStore %34 %131 +%135 = OpLoad %6 %36 +%137 = OpExtInst %7 %1 UMin %28 %74 +%138 = OpISub %7 %74 %137 +%139 = OpExtInst %7 %1 UMin %29 %138 +%136 = OpBitFieldSExtract %6 %135 %137 %139 +OpStore %36 %136 +%140 = OpLoad %7 %38 +%142 = OpExtInst %7 %1 UMin %28 %74 +%143 = OpISub %7 %74 %142 +%144 = OpExtInst %7 %1 UMin %29 %143 +%141 = OpBitFieldUExtract %7 %140 %142 %144 +OpStore %38 %141 +%145 = OpLoad %8 %40 +%147 = OpExtInst %7 %1 UMin %28 %74 +%148 = OpISub %7 %74 %147 +%149 = OpExtInst %7 %1 UMin %29 %148 +%146 = OpBitFieldUExtract %8 %145 %147 %149 +OpStore %40 %146 +%150 = OpLoad %9 %42 +%152 = OpExtInst %7 %1 UMin %28 %74 +%153 = OpISub %7 %74 %152 +%154 = OpExtInst %7 %1 UMin %29 %153 +%151 = OpBitFieldUExtract %9 %150 %152 %154 +OpStore %42 %151 +%155 = OpLoad %10 %44 +%157 = OpExtInst %7 %1 UMin %28 %74 +%158 = OpISub %7 %74 %157 +%159 = OpExtInst %7 %1 UMin %29 %158 +%156 = OpBitFieldUExtract %10 %155 %157 %159 +OpStore %44 %156 +%160 = OpLoad %3 %30 +%161 = OpExtInst %3 %1 FindILsb %160 +OpStore %30 %161 +%162 = OpLoad %8 %40 +%163 = OpExtInst %8 %1 FindILsb %162 +OpStore %40 %163 +%164 = OpLoad %5 %34 +%165 = OpExtInst %5 %1 FindSMsb %164 +OpStore %34 %165 +%166 = OpLoad %9 %42 +%167 = OpExtInst %9 %1 FindUMsb %166 +OpStore %42 %167 +%168 = OpLoad %3 %30 +%169 = OpExtInst %3 %1 FindSMsb %168 +OpStore %30 %169 +%170 = OpLoad %7 %38 +%171 = OpExtInst %7 %1 FindUMsb %170 +OpStore %38 %171 +%172 = OpLoad %3 %30 +%173 = OpBitCount %3 %172 +OpStore %30 %173 +%174 = OpLoad %4 %32 +%175 = OpBitCount %4 %174 +OpStore %32 %175 +%176 = OpLoad %5 %34 +%177 = OpBitCount %5 %176 +OpStore %34 %177 +%178 = OpLoad %6 %36 +%179 = OpBitCount %6 %178 +OpStore %36 %179 +%180 = OpLoad %7 %38 +%181 = OpBitCount %7 %180 +OpStore %38 %181 +%182 = OpLoad %8 %40 +%183 = OpBitCount %8 %182 +OpStore %40 %183 +%184 = OpLoad %9 %42 +%185 = OpBitCount %9 %184 +OpStore %42 %185 +%186 = OpLoad %10 %44 +%187 = OpBitCount %10 %186 +OpStore %44 %187 +%188 = OpLoad %3 %30 +%189 = OpBitReverse %3 %188 +OpStore %30 %189 +%190 = OpLoad %4 %32 +%191 = OpBitReverse %4 %190 +OpStore %32 %191 +%192 = OpLoad %5 %34 +%193 = OpBitReverse %5 %192 +OpStore %34 %193 +%194 = OpLoad %6 %36 +%195 = OpBitReverse %6 %194 +OpStore %36 %195 +%196 = OpLoad %7 %38 +%197 = OpBitReverse %7 %196 +OpStore %38 %197 +%198 = OpLoad %8 %40 +%199 = OpBitReverse %8 %198 +OpStore %40 %199 +%200 = OpLoad %9 %42 +%201 = OpBitReverse %9 %200 +OpStore %42 %201 +%202 = OpLoad %10 %44 +%203 = OpBitReverse %10 %202 +OpStore %44 %203 OpReturn OpFunctionEnd \ No newline at end of file