diff --git a/example/std_example.rs b/example/std_example.rs
index 33523a12871..e28da13e4c8 100644
--- a/example/std_example.rs
+++ b/example/std_example.rs
@@ -68,6 +68,7 @@ unsafe fn test_simd() {
     test_mm256_movemask_epi8();
     test_mm_add_epi8();
     test_mm_add_pd();
+    test_mm_cvtepi8_epi16();
 
     let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)));
     assert_eq!(mask1, 1);
@@ -170,6 +171,18 @@ pub unsafe fn assert_eq_m128d(a: __m128d, b: __m128d) {
     }
 }
 
+#[target_feature(enable = "sse4.1")]
+unsafe fn test_mm_cvtepi8_epi16() {
+    let a = _mm_set1_epi8(10);
+    let r = _mm_cvtepi8_epi16(a);
+    let e = _mm_set1_epi16(10);
+    assert_eq_m128i(r, e);
+    let a = _mm_set1_epi8(-10);
+    let r = _mm_cvtepi8_epi16(a);
+    let e = _mm_set1_epi16(-10);
+    assert_eq_m128i(r, e);
+}
+
 #[derive(PartialEq)]
 enum LoopState {
     Continue(()),
diff --git a/src/base.rs b/src/base.rs
index dd0594611cf..8612177166e 100644
--- a/src/base.rs
+++ b/src/base.rs
@@ -436,7 +436,6 @@ fn trans_stmt<'a, 'tcx: 'a>(
                         let discr = trans_get_discriminant(fx, place, fx.layout_of(to_ty));
                         lval.write_cvalue(fx, discr);
                     } else {
-                        let from_clif_ty = fx.clif_type(from_ty).unwrap();
                         let to_clif_ty = fx.clif_type(to_ty).unwrap();
                         let from = operand.load_scalar(fx);
 
@@ -447,43 +446,7 @@ fn trans_stmt<'a, 'tcx: 'a>(
                             _ => panic!("{}", from_ty),
                         };
 
-                        let res = if from_clif_ty.is_int() && to_clif_ty.is_int() {
-                            // int-like -> int-like
-                            crate::common::clif_intcast(
-                                fx,
-                                from,
-                                to_clif_ty,
-                                signed,
-                            )
-                        } else if from_clif_ty.is_int() && to_clif_ty.is_float() {
-                            // int-like -> float
-                            if signed {
-                                fx.bcx.ins().fcvt_from_sint(to_clif_ty, from)
-                            } else {
-                                fx.bcx.ins().fcvt_from_uint(to_clif_ty, from)
-                            }
-                        } else if from_clif_ty.is_float() && to_clif_ty.is_int() {
-                            // float -> int-like
-                            let from = operand.load_scalar(fx);
-                            if signed {
-                                fx.bcx.ins().fcvt_to_sint_sat(to_clif_ty, from)
-                            } else {
-                                fx.bcx.ins().fcvt_to_uint_sat(to_clif_ty, from)
-                            }
-                        } else if from_clif_ty.is_float() && to_clif_ty.is_float() {
-                            // float -> float
-                            match (from_clif_ty, to_clif_ty) {
-                                (types::F32, types::F64) => {
-                                    fx.bcx.ins().fpromote(types::F64, from)
-                                }
-                                (types::F64, types::F32) => {
-                                    fx.bcx.ins().fdemote(types::F32, from)
-                                }
-                                _ => from,
-                            }
-                        } else {
-                            unimpl!("rval misc {:?} {:?}", from_ty, to_ty)
-                        };
+                        let res = clif_int_or_float_cast(fx, from, to_clif_ty, signed);
                         lval.write_cvalue(fx, CValue::by_val(res, dest_layout));
                     }
                 }
diff --git a/src/cast.rs b/src/cast.rs
new file mode 100644
index 00000000000..d14faf4bc33
--- /dev/null
+++ b/src/cast.rs
@@ -0,0 +1,93 @@
+use crate::prelude::*;
+
+pub fn clif_intcast<'a, 'tcx: 'a>(
+    fx: &mut FunctionCx<'a, 'tcx, impl Backend>,
+    val: Value,
+    to: Type,
+    signed: bool,
+) -> Value {
+    let from = fx.bcx.func.dfg.value_type(val);
+    match (from, to) {
+        // equal
+        (_, _) if from == to => val,
+
+        // extend
+        (_, types::I128) => {
+            let wider = if from == types::I64 {
+                val
+            } else if signed {
+                fx.bcx.ins().sextend(types::I64, val)
+            } else {
+                fx.bcx.ins().uextend(types::I64, val)
+            };
+            let zero = fx.bcx.ins().iconst(types::I64, 0);
+            fx.bcx.ins().iconcat(wider, zero)
+        }
+        (_, _) if to.wider_or_equal(from) => {
+            if signed {
+                fx.bcx.ins().sextend(to, val)
+            } else {
+                fx.bcx.ins().uextend(to, val)
+            }
+        }
+
+        // reduce
+        (types::I128, _) => {
+            let (lsb, _msb) = fx.bcx.ins().isplit(val);
+            if to == types::I64 {
+                lsb
+            } else {
+                fx.bcx.ins().ireduce(to, lsb)
+            }
+        }
+        (_, _) => {
+            fx.bcx.ins().ireduce(to, val)
+        }
+    }
+}
+
+pub fn clif_int_or_float_cast(
+    fx: &mut FunctionCx<'_, '_, impl Backend>,
+    from: Value,
+    to_ty: Type,
+    signed: bool,
+) -> Value {
+    let from_ty = fx.bcx.func.dfg.value_type(from);
+
+    if from_ty.is_int() && to_ty.is_int() {
+        // int-like -> int-like
+        clif_intcast(
+            fx,
+            from,
+            to_ty,
+            signed,
+        )
+    } else if from_ty.is_int() && to_ty.is_float() {
+        // int-like -> float
+        if signed {
+            fx.bcx.ins().fcvt_from_sint(to_ty, from)
+        } else {
+            fx.bcx.ins().fcvt_from_uint(to_ty, from)
+        }
+    } else if from_ty.is_float() && to_ty.is_int() {
+        // float -> int-like
+        if signed {
+            fx.bcx.ins().fcvt_to_sint_sat(to_ty, from)
+        } else {
+            fx.bcx.ins().fcvt_to_uint_sat(to_ty, from)
+        }
+    } else if from_ty.is_float() && to_ty.is_float() {
+        // float -> float
+        match (from_ty, to_ty) {
+            (types::F32, types::F64) => {
+                fx.bcx.ins().fpromote(types::F64, from)
+            }
+            (types::F64, types::F32) => {
+                fx.bcx.ins().fdemote(types::F32, from)
+            }
+            _ => from,
+        }
+    } else {
+        unreachable!("cast value from {:?} to {:?}", from_ty, to_ty);
+    }
+}
diff --git a/src/common.rs b/src/common.rs
index 6a00d134198..28287390c61 100644
--- a/src/common.rs
+++ b/src/common.rs
@@ -73,52 +73,6 @@ pub fn codegen_select(bcx: &mut FunctionBuilder, cond: Value, lhs: Value, rhs: V
     }
 }
 
-pub fn clif_intcast<'a, 'tcx: 'a>(
-    fx: &mut FunctionCx<'a, 'tcx, impl Backend>,
-    val: Value,
-    to: Type,
-    signed: bool,
-) -> Value {
-    let from = fx.bcx.func.dfg.value_type(val);
-    match (from, to) {
-        // equal
-        (_, _) if from == to => val,
-
-        // extend
-        (_, types::I128) => {
-            let wider = if from == types::I64 {
-                val
-            } else if signed {
-                fx.bcx.ins().sextend(types::I64, val)
-            } else {
-                fx.bcx.ins().uextend(types::I64, val)
-            };
-            let zero = fx.bcx.ins().iconst(types::I64, 0);
-            fx.bcx.ins().iconcat(wider, zero)
-        }
-        (_, _) if to.wider_or_equal(from) => {
-            if signed {
-                fx.bcx.ins().sextend(to, val)
-            } else {
-                fx.bcx.ins().uextend(to, val)
-            }
-        }
-
-        // reduce
-        (types::I128, _) => {
-            let (lsb, _msb) = fx.bcx.ins().isplit(val);
-            if to == types::I64 {
-                lsb
-            } else {
-                fx.bcx.ins().ireduce(to, lsb)
-            }
-        }
-        (_, _) => {
-            fx.bcx.ins().ireduce(to, val)
-        }
-    }
-}
-
 fn resolve_normal_value_imm(func: &Function, val: Value) -> Option<i64> {
     if let ValueDef::Result(inst, 0 /*param*/) = func.dfg.value_def(val) {
         if let InstructionData::UnaryImm {
diff --git a/src/intrinsics.rs b/src/intrinsics.rs
index a456cac1d74..7da7e738f52 100644
--- a/src/intrinsics.rs
+++ b/src/intrinsics.rs
@@ -853,8 +853,27 @@ pub fn codegen_intrinsic_call<'a, 'tcx: 'a>(
             ret.write_cvalue(fx, val);
         };
 
-        simd_cast, (c x) {
-            ret.write_cvalue(fx, x.unchecked_cast_to(ret.layout()));
+        simd_cast, (c a) {
+            let (lane_layout, lane_count) = lane_type_and_count(fx, a.layout(), intrinsic);
+            let (ret_lane_layout, ret_lane_count) = lane_type_and_count(fx, ret.layout(), intrinsic);
+            assert_eq!(lane_count, ret_lane_count);
+
+            let ret_lane_ty = fx.clif_type(ret_lane_layout.ty).unwrap();
+
+            let signed = match lane_layout.ty.sty {
+                ty::Uint(..) => false,
+                ty::Int(..) => true,
+                ty::Float(..) => false, // `signed` is unused for floats
+                _ => panic!("{}", lane_layout.ty),
+            };
+
+            for lane in 0..lane_count {
+                let lane = mir::Field::new(lane.try_into().unwrap());
+
+                let a_lane = a.value_field(fx, lane).load_scalar(fx);
+                let res = clif_int_or_float_cast(fx, a_lane, ret_lane_ty, signed);
+                ret.place_field(fx, lane).write_cvalue(fx, CValue::by_val(res, ret_lane_layout));
+            }
         };
 
         simd_eq, (c x, c y) {
diff --git a/src/lib.rs b/src/lib.rs
index cdaabcf9a39..eab10e5ca5e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -33,6 +33,7 @@ mod allocator;
 mod analyze;
 mod archive;
 mod base;
+mod cast;
 mod codegen_i128;
 mod common;
 mod constant;
@@ -94,6 +95,7 @@ mod prelude {
 
     pub use crate::abi::*;
     pub use crate::base::{trans_operand, trans_place};
+    pub use crate::cast::*;
     pub use crate::common::*;
     pub use crate::debuginfo::{DebugContext, FunctionDebugContext};
     pub use crate::trap::*;