Merge pull request #1431 from rust-lang/even_more_simd_intrinsics
Implement another batch of vendor intrinsics
Commit 710c67909d
@@ -22,6 +22,11 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             // Spin loop hint
         }
 
+        "llvm.x86.avx.vzeroupper" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper&ig_expand=7218
+            // Do nothing. It is a perf hint anyway.
+        }
+
         // Used by is_x86_feature_detected!();
         "llvm.x86.xgetbv" => {
             intrinsic_args!(fx, args => (xcr_no); intrinsic);
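Background on the xgetbv arm: XGETBV with ecx = 0 reads XCR0, whose low bits record which register states the OS has enabled. A rough sketch of the check that feature detection performs on the returned value (illustrative only; `xgetbv` is a hypothetical helper, not code from this commit):

    let xcr0 = xgetbv(0);                   // edx:eax combined into a u64
    let avx_usable = xcr0 & 0b110 == 0b110; // bit 1: SSE state, bit 2: AVX state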
@@ -69,6 +74,103 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             ret.write_cvalue(fx, val);
         }
 
+        "llvm.x86.avx2.gather.d.d"
+        | "llvm.x86.avx2.gather.d.q"
+        | "llvm.x86.avx2.gather.d.ps"
+        | "llvm.x86.avx2.gather.d.pd"
+        | "llvm.x86.avx2.gather.d.d.256"
+        | "llvm.x86.avx2.gather.d.q.256"
+        | "llvm.x86.avx2.gather.d.ps.256"
+        | "llvm.x86.avx2.gather.d.pd.256"
+        | "llvm.x86.avx2.gather.q.d"
+        | "llvm.x86.avx2.gather.q.q"
+        | "llvm.x86.avx2.gather.q.ps"
+        | "llvm.x86.avx2.gather.q.pd"
+        | "llvm.x86.avx2.gather.q.d.256"
+        | "llvm.x86.avx2.gather.q.q.256"
+        | "llvm.x86.avx2.gather.q.ps.256"
+        | "llvm.x86.avx2.gather.q.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd&ig_expand=3818
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd&ig_expand=3819
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd&ig_expand=3821
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd&ig_expand=3822
+            // ...
+
+            intrinsic_args!(fx, args => (src, ptr, index, mask, scale); intrinsic);
+
+            let (src_lane_count, src_lane_ty) = src.layout().ty.simd_size_and_type(fx.tcx);
+            let (index_lane_count, index_lane_ty) = index.layout().ty.simd_size_and_type(fx.tcx);
+            let (mask_lane_count, mask_lane_ty) = mask.layout().ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert_eq!(src_lane_ty, ret_lane_ty);
+            assert!(index_lane_ty.is_integral());
+            assert_eq!(src_lane_count, mask_lane_count);
+            assert_eq!(src_lane_count, ret_lane_count);
+
+            let lane_clif_ty = fx.clif_type(ret_lane_ty).unwrap();
+            let index_lane_clif_ty = fx.clif_type(index_lane_ty).unwrap();
+            let mask_lane_clif_ty = fx.clif_type(mask_lane_ty).unwrap();
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            let ptr = ptr.load_scalar(fx);
+            let scale = scale.load_scalar(fx);
+            let scale = fx.bcx.ins().uextend(types::I64, scale);
+            for lane_idx in 0..std::cmp::min(src_lane_count, index_lane_count) {
+                let src_lane = src.value_lane(fx, lane_idx).load_scalar(fx);
+                let index_lane = index.value_lane(fx, lane_idx).load_scalar(fx);
+                let mask_lane = mask.value_lane(fx, lane_idx).load_scalar(fx);
+                let mask_lane =
+                    fx.bcx.ins().bitcast(mask_lane_clif_ty.as_int(), MemFlags::new(), mask_lane);
+
+                let if_enabled = fx.bcx.create_block();
+                let if_disabled = fx.bcx.create_block();
+                let next = fx.bcx.create_block();
+                let res_lane = fx.bcx.append_block_param(next, lane_clif_ty);
+
+                let mask_lane = match mask_lane_clif_ty {
+                    types::I32 | types::F32 => {
+                        fx.bcx.ins().band_imm(mask_lane, 0x8000_0000u64 as i64)
+                    }
+                    types::I64 | types::F64 => {
+                        fx.bcx.ins().band_imm(mask_lane, 0x8000_0000_0000_0000u64 as i64)
+                    }
+                    _ => unreachable!(),
+                };
+                fx.bcx.ins().brif(mask_lane, if_enabled, &[], if_disabled, &[]);
+                fx.bcx.seal_block(if_enabled);
+                fx.bcx.seal_block(if_disabled);
+
+                fx.bcx.switch_to_block(if_enabled);
+                let index_lane = if index_lane_clif_ty != types::I64 {
+                    fx.bcx.ins().sextend(types::I64, index_lane)
+                } else {
+                    index_lane
+                };
+                let offset = fx.bcx.ins().imul(index_lane, scale);
+                let lane_ptr = fx.bcx.ins().iadd(ptr, offset);
+                let res = fx.bcx.ins().load(lane_clif_ty, MemFlags::trusted(), lane_ptr, 0);
+                fx.bcx.ins().jump(next, &[res]);
+
+                fx.bcx.switch_to_block(if_disabled);
+                fx.bcx.ins().jump(next, &[src_lane]);
+
+                fx.bcx.seal_block(next);
+                fx.bcx.switch_to_block(next);
+
+                fx.bcx.ins().nop();
+
+                ret.place_lane(fx, lane_idx)
+                    .write_cvalue(fx, CValue::by_val(res_lane, ret_lane_layout));
+            }
+
+            for lane_idx in std::cmp::min(src_lane_count, index_lane_count)..ret_lane_count {
+                let zero_lane = fx.bcx.ins().iconst(mask_lane_clif_ty.as_int(), 0);
+                let zero_lane = fx.bcx.ins().bitcast(mask_lane_clif_ty, MemFlags::new(), zero_lane);
+                ret.place_lane(fx, lane_idx)
+                    .write_cvalue(fx, CValue::by_val(zero_lane, ret_lane_layout));
+            }
+        }
+
         "llvm.x86.sse.cmp.ps" | "llvm.x86.sse2.cmp.pd" => {
             let (x, y, kind) = match args {
                 [x, y, kind] => (x, y, kind),
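Per lane, the gather lowering above behaves like this scalar model (an illustrative sketch, not code from this commit; `res`, `base` and the element type `T` are hypothetical names):

    // for i in 0..min(src_lanes, index_lanes):
    //     res[i] = if sign_bit(mask[i]) { unsafe { *((base + index[i] as i64 * scale) as *const T) } } else { src[i] }
    // every remaining lane of res is zeroed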
@@ -273,16 +375,31 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             );
         }
         "llvm.x86.ssse3.pabs.b.128" | "llvm.x86.ssse3.pabs.w.128" | "llvm.x86.ssse3.pabs.d.128" => {
-            let a = match args {
-                [a] => a,
-                _ => bug!("wrong number of args for intrinsic {intrinsic}"),
-            };
-            let a = codegen_operand(fx, a);
+            intrinsic_args!(fx, args => (a); intrinsic);
 
             simd_for_each_lane(fx, a, ret, &|fx, _lane_ty, _res_lane_ty, lane| {
                 fx.bcx.ins().iabs(lane)
             });
         }
+        "llvm.x86.sse2.cvttps2dq" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32&ig_expand=2429
+            intrinsic_args!(fx, args => (a); intrinsic);
+            let a = a.load_scalar(fx);
+
+            // Using inline asm instead of fcvt_to_sint_sat as unrepresentable values are turned
+            // into 0x80000000 for which Cranelift doesn't have a native instruction.
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("cvttps2dq xmm0, xmm0"))],
+                &[CInlineAsmOperand::InOut {
+                    reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                    _late: true,
+                    in_value: a,
+                    out_place: Some(ret),
+                }],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
         "llvm.x86.addcarry.32" | "llvm.x86.addcarry.64" => {
             intrinsic_args!(fx, args => (c_in, a, b); intrinsic);
             let c_in = c_in.load_scalar(fx);
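Why inline asm for cvttps2dq: the hardware instruction returns the "integer indefinite" value 0x8000_0000 (i32::MIN) for NaN and out-of-range inputs, whereas Cranelift's fcvt_to_sint_sat saturates large positive inputs to i32::MAX, so the two disagree exactly where Rust code can observe the difference. A worked example (not from the commit):

    // _mm_cvttps_epi32([f32::NAN, 3e9, -3e9, 1.5]) == [i32::MIN, i32::MIN, i32::MIN, 1]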
@@ -439,12 +556,12 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             let ret_lane_layout = fx.layout_of(fx.tcx.types.i32);
             for out_lane_idx in 0..lane_count / 2 {
                 let a_lane0 = a.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
-                let a_lane0 = fx.bcx.ins().uextend(types::I32, a_lane0);
+                let a_lane0 = fx.bcx.ins().sextend(types::I32, a_lane0);
                 let b_lane0 = b.value_lane(fx, out_lane_idx * 2).load_scalar(fx);
                 let b_lane0 = fx.bcx.ins().sextend(types::I32, b_lane0);
 
                 let a_lane1 = a.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
-                let a_lane1 = fx.bcx.ins().uextend(types::I32, a_lane1);
+                let a_lane1 = fx.bcx.ins().sextend(types::I32, a_lane1);
                 let b_lane1 = b.value_lane(fx, out_lane_idx * 2 + 1).load_scalar(fx);
                 let b_lane1 = fx.bcx.ins().sextend(types::I32, b_lane1);
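The uextend to sextend change matters because the lanes being multiplied here are signed 16-bit values. A hand-worked case (not from the commit): an `a` lane holding -1i16 uextends to 65535 but sextends to -1, so a product such as (-1) * 2 was previously computed as 131070 instead of -2.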
@@ -599,14 +716,14 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             assert_eq!(ret_lane_ty, fx.tcx.types.i16);
             assert_eq!(lane_count * 2, ret_lane_count);
 
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
             let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
 
             for idx in 0..lane_count {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
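Two bugs are fixed above. `i64::from(i16::MIN as u16)` produced +32768 rather than the I32 bit pattern of -32768, and `umin` compared the sign-extended lanes as unsigned. Hand-worked lane (not from the commit): packing the i32 value -1, the old code computed smax(-1, +32768) = 32768 and then umin(32768, 32767) = 32767, so -1 packed to 32767; the fixed code yields smax(-1, -32768) = -1 and smin(-1, 32767) = -1, as signed saturation requires.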
@@ -616,7 +733,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -643,8 +760,8 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
 
             for idx in 0..lane_count {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().umax(lane, min_u16);
-                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let sat = fx.bcx.ins().smax(lane, min_u16);
+                let sat = fx.bcx.ins().smin(sat, max_u16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
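For unsigned saturation the inputs are still signed, so the clamps must compare signed. Hand-worked lane (not from the commit): -5 is 0xFFFF_FFFB as a bit pattern, so the old umax(-5, 0) kept -5 and umin(-5, 65535) then clamped it to 65535; with smax/smin the lane correctly saturates to 0.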
@@ -653,8 +770,8 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
 
             for idx in 0..lane_count {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
-                let sat = fx.bcx.ins().umax(lane, min_u16);
-                let sat = fx.bcx.ins().umin(sat, max_u16);
+                let sat = fx.bcx.ins().smax(lane, min_u16);
+                let sat = fx.bcx.ins().smin(sat, max_u16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -675,14 +792,14 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             assert_eq!(ret_lane_ty, fx.tcx.types.i16);
             assert_eq!(lane_count * 2, ret_lane_count);
 
-            let min_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MIN as u16));
-            let max_i16 = fx.bcx.ins().iconst(types::I32, i64::from(i16::MAX as u16));
+            let min_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MIN) as u32 as i64);
+            let max_i16 = fx.bcx.ins().iconst(types::I32, i32::from(i16::MAX) as u32 as i64);
             let ret_lane_layout = fx.layout_of(fx.tcx.types.i16);
 
             for idx in 0..lane_count / 2 {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -692,7 +809,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count / 2 {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -702,7 +819,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count / 2 {
                 let lane = a.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -712,7 +829,7 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             for idx in 0..lane_count / 2 {
                 let lane = b.value_lane(fx, idx).load_scalar(fx);
                 let sat = fx.bcx.ins().smax(lane, min_i16);
-                let sat = fx.bcx.ins().umin(sat, max_i16);
+                let sat = fx.bcx.ins().smin(sat, max_i16);
                 let res = fx.bcx.ins().ireduce(types::I16, sat);
 
                 let res_lane = CValue::by_val(res, ret_lane_layout);
@@ -720,6 +837,215 @@ pub(crate) fn codegen_x86_llvm_intrinsic_call<'tcx>(
             }
         }
 
+        "llvm.x86.fma.vfmaddsub.ps"
+        | "llvm.x86.fma.vfmaddsub.pd"
+        | "llvm.x86.fma.vfmaddsub.ps.256"
+        | "llvm.x86.fma.vfmaddsub.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_ps&ig_expand=3205
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmaddsub_pd&ig_expand=3181
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_ps&ig_expand=3209
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmaddsub_pd&ig_expand=3185
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            assert_eq!(a.layout(), c.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(lane_count, ret_lane_count);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            for idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, idx).load_scalar(fx);
+                let b_lane = b.value_lane(fx, idx).load_scalar(fx);
+                let c_lane = c.value_lane(fx, idx).load_scalar(fx);
+
+                let mul = fx.bcx.ins().fmul(a_lane, b_lane);
+                let res = if idx & 1 == 0 {
+                    fx.bcx.ins().fsub(mul, c_lane)
+                } else {
+                    fx.bcx.ins().fadd(mul, c_lane)
+                };
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+        }
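// Annotation (not part of the diff): the alternation above matches Intel's
// fmaddsub definition: even lanes compute a*b - c, odd lanes a*b + c, so for
// 4 x f32: res = [a0*b0 - c0, a1*b1 + c1, a2*b2 - c2, a3*b3 + c3].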
+
+        "llvm.x86.fma.vfmsubadd.ps"
+        | "llvm.x86.fma.vfmsubadd.pd"
+        | "llvm.x86.fma.vfmsubadd.ps.256"
+        | "llvm.x86.fma.vfmsubadd.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_ps&ig_expand=3325
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fmsubadd_pd&ig_expand=3301
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_ps&ig_expand=3329
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fmsubadd_pd&ig_expand=3305
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            assert_eq!(a.layout(), c.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(lane_count, ret_lane_count);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            for idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, idx).load_scalar(fx);
+                let b_lane = b.value_lane(fx, idx).load_scalar(fx);
+                let c_lane = c.value_lane(fx, idx).load_scalar(fx);
+
+                let mul = fx.bcx.ins().fmul(a_lane, b_lane);
+                let res = if idx & 1 == 0 {
+                    fx.bcx.ins().fadd(mul, c_lane)
+                } else {
+                    fx.bcx.ins().fsub(mul, c_lane)
+                };
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.x86.fma.vfnmadd.ps"
+        | "llvm.x86.fma.vfnmadd.pd"
+        | "llvm.x86.fma.vfnmadd.ps.256"
+        | "llvm.x86.fma.vfnmadd.pd.256" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_ps&ig_expand=3391
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_fnmadd_pd&ig_expand=3367
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_ps&ig_expand=3395
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_fnmadd_pd&ig_expand=3371
+            intrinsic_args!(fx, args => (a, b, c); intrinsic);
+
+            assert_eq!(a.layout(), b.layout());
+            assert_eq!(a.layout(), c.layout());
+            let layout = a.layout();
+
+            let (lane_count, lane_ty) = layout.ty.simd_size_and_type(fx.tcx);
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            assert!(lane_ty.is_floating_point());
+            assert!(ret_lane_ty.is_floating_point());
+            assert_eq!(lane_count, ret_lane_count);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+
+            for idx in 0..lane_count {
+                let a_lane = a.value_lane(fx, idx).load_scalar(fx);
+                let b_lane = b.value_lane(fx, idx).load_scalar(fx);
+                let c_lane = c.value_lane(fx, idx).load_scalar(fx);
+
+                let mul = fx.bcx.ins().fmul(a_lane, b_lane);
+                let neg_mul = fx.bcx.ins().fneg(mul);
+                let res = fx.bcx.ins().fadd(neg_mul, c_lane);
+
+                let res_lane = CValue::by_val(res, ret_lane_layout);
+                ret.place_lane(fx, idx).write_cvalue(fx, res_lane);
+            }
+        }
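// Annotation (not part of the diff): fneg + fadd implement, per lane,
// res[i] = -(a[i] * b[i]) + c[i], the defining formula of vfnmadd.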
+
+        "llvm.x86.sse42.pcmpestri128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri&ig_expand=939
+            intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let la = la.load_scalar(fx);
+            let b = b.load_scalar(fx);
+            let lb = lb.load_scalar(fx);
+
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(span, "Index argument for `_mm_cmpestri` is not a constant");
+            };
+
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("pcmpestri xmm0, xmm1, {imm8}"))],
+                &[
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        value: a,
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: b,
+                    },
+                    // Implicit argument to the pcmpestri intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+                        value: la,
+                    },
+                    // Implicit argument to the pcmpestri intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+                        value: lb,
+                    },
+                    // Implicit result of the pcmpestri intrinsic
+                    CInlineAsmOperand::Out {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::cx)),
+                        late: true,
+                        place: Some(ret),
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
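// Annotation (not part of the diff): the register choices mirror the hardware
// definition of PCMPESTRI: the explicit-length string compares take the length
// of the xmm0 operand in eax and the length of the xmm1 operand in edx, and
// return the match index in ecx.
//     eax = la, edx = lb  ->  ecx = index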
+
+        "llvm.x86.sse42.pcmpestrm128" => {
+            // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm&ig_expand=940
+            intrinsic_args!(fx, args => (a, la, b, lb, _imm8); intrinsic);
+
+            let a = a.load_scalar(fx);
+            let la = la.load_scalar(fx);
+            let b = b.load_scalar(fx);
+            let lb = lb.load_scalar(fx);
+
+            let imm8 = if let Some(imm8) = crate::constant::mir_operand_get_const_val(fx, &args[4])
+            {
+                imm8
+            } else {
+                fx.tcx.sess.span_fatal(span, "Index argument for `_mm_cmpestrm` is not a constant");
+            };
+
+            let imm8 = imm8.try_to_u8().unwrap_or_else(|_| panic!("kind not scalar: {:?}", imm8));
+
+            codegen_inline_asm_inner(
+                fx,
+                &[InlineAsmTemplatePiece::String(format!("pcmpestrm xmm0, xmm1, {imm8}"))],
+                &[
+                    CInlineAsmOperand::InOut {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm0)),
+                        _late: true,
+                        in_value: a,
+                        out_place: Some(ret),
+                    },
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::xmm1)),
+                        value: b,
+                    },
+                    // Implicit argument to the pcmpestrm intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::ax)),
+                        value: la,
+                    },
+                    // Implicit argument to the pcmpestrm intrinsic
+                    CInlineAsmOperand::In {
+                        reg: InlineAsmRegOrRegClass::Reg(InlineAsmReg::X86(X86InlineAsmReg::dx)),
+                        value: lb,
+                    },
+                ],
+                InlineAsmOptions::NOSTACK | InlineAsmOptions::PURE | InlineAsmOptions::NOMEM,
+            );
+        }
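// Annotation (not part of the diff): unlike pcmpestri, pcmpestrm returns its
// result mask in xmm0, hence `a` is tied to xmm0 as an InOut operand with the
// return place as its output.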
"llvm.x86.pclmulqdq" => {
|
||||
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128&ig_expand=772
|
||||
intrinsic_args!(fx, args => (a, b, _imm8); intrinsic);
|
||||
|