diff --git a/compiler/rustc_codegen_gcc/src/builder.rs b/compiler/rustc_codegen_gcc/src/builder.rs
index 76846692459..c8b7616e645 100644
--- a/compiler/rustc_codegen_gcc/src/builder.rs
+++ b/compiler/rustc_codegen_gcc/src/builder.rs
@@ -989,10 +989,14 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
             OperandValue::Ref(place.val)
         } else if place.layout.is_gcc_immediate() {
             let load = self.load(place.layout.gcc_type(self), place.val.llval, place.val.align);
-            if let abi::BackendRepr::Scalar(ref scalar) = place.layout.backend_repr {
-                scalar_load_metadata(self, load, scalar);
-            }
-            OperandValue::Immediate(self.to_immediate(load, place.layout))
+            OperandValue::Immediate(
+                if let abi::BackendRepr::Scalar(ref scalar) = place.layout.backend_repr {
+                    scalar_load_metadata(self, load, scalar);
+                    self.to_immediate_scalar(load, *scalar)
+                } else {
+                    load
+                },
+            )
         } else if let abi::BackendRepr::ScalarPair(ref a, ref b) = place.layout.backend_repr {
             let b_offset = a.size(self).align_to(b.align(self).abi);
 
diff --git a/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs b/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs
index a1123fafe2f..5322b731d8b 100644
--- a/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs
+++ b/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs
@@ -9,7 +9,7 @@ use gccjit::FunctionType;
 use gccjit::{ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp};
 #[cfg(feature = "master")]
 use rustc_abi::ExternAbi;
-use rustc_abi::HasDataLayout;
+use rustc_abi::{BackendRepr, HasDataLayout};
 use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::base::wants_msvc_seh;
 use rustc_codegen_ssa::common::IntPredicate;
@@ -181,14 +181,19 @@ impl<'a, 'gcc, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'a, 'gcc, 'tc
             sym::volatile_load | sym::unaligned_volatile_load => {
                 let tp_ty = fn_args.type_at(0);
                 let ptr = args[0].immediate();
+                let layout = self.layout_of(tp_ty);
                 let load = if let PassMode::Cast { cast: ref ty, pad_i32: _ } = fn_abi.ret.mode {
                     let gcc_ty = ty.gcc_type(self);
                     self.volatile_load(gcc_ty, ptr)
                 } else {
-                    self.volatile_load(self.layout_of(tp_ty).gcc_type(self), ptr)
+                    self.volatile_load(layout.gcc_type(self), ptr)
                 };
                 // TODO(antoyo): set alignment.
-                self.to_immediate(load, self.layout_of(tp_ty))
+                if let BackendRepr::Scalar(scalar) = layout.backend_repr {
+                    self.to_immediate_scalar(load, scalar)
+                } else {
+                    load
+                }
             }
             sym::volatile_store => {
                 let dst = args[0].deref(self.cx());
diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs
index 5d1a133e808..e1609e31c07 100644
--- a/compiler/rustc_codegen_llvm/src/builder.rs
+++ b/compiler/rustc_codegen_llvm/src/builder.rs
@@ -746,10 +746,12 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
                 let load = self.load(llty, place.val.llval, place.val.align);
                 if let abi::BackendRepr::Scalar(scalar) = place.layout.backend_repr {
                     scalar_load_metadata(self, load, scalar, place.layout, Size::ZERO);
+                    self.to_immediate_scalar(load, scalar)
+                } else {
+                    load
                 }
-                load
             });
-            OperandValue::Immediate(self.to_immediate(llval, place.layout))
+            OperandValue::Immediate(llval)
         } else if let abi::BackendRepr::ScalarPair(a, b) = place.layout.backend_repr {
             let b_offset = a.size(self).align_to(b.align(self).abi);
 
@@ -943,6 +945,8 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     }
 
     fn unchecked_utrunc(&mut self, val: &'ll Value, dest_ty: &'ll Type) -> &'ll Value {
+        debug_assert_ne!(self.val_ty(val), dest_ty);
+
         let trunc = self.trunc(val, dest_ty);
         if llvm_util::get_version() >= (19, 0, 0) {
             unsafe {
@@ -955,6 +959,8 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> {
     }
 
     fn unchecked_strunc(&mut self, val: &'ll Value, dest_ty: &'ll Type) -> &'ll Value {
+        debug_assert_ne!(self.val_ty(val), dest_ty);
+
         let trunc = self.trunc(val, dest_ty);
         if llvm_util::get_version() >= (19, 0, 0) {
             unsafe {
diff --git a/compiler/rustc_codegen_ssa/src/mir/block.rs b/compiler/rustc_codegen_ssa/src/mir/block.rs
index 616d748a299..0620f08fc73 100644
--- a/compiler/rustc_codegen_ssa/src/mir/block.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/block.rs
@@ -1040,7 +1040,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                             let (idx, _) = op.layout.non_1zst_field(bx).expect(
                                 "not exactly one non-1-ZST field in a `DispatchFromDyn` type",
                             );
-                            op = op.extract_field(bx, idx);
+                            op = op.extract_field(self, bx, idx);
                         }
 
                         // Now that we have `*dyn Trait` or `&dyn Trait`, split it up into its
@@ -1072,7 +1072,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                             let (idx, _) = op.layout.non_1zst_field(bx).expect(
                                 "not exactly one non-1-ZST field in a `DispatchFromDyn` type",
                             );
-                            op = op.extract_field(bx, idx);
+                            op = op.extract_field(self, bx, idx);
                         }
 
                         // Make sure that we've actually unwrapped the rcvr down
@@ -1572,9 +1572,9 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                     if scalar.is_bool() {
                         bx.range_metadata(llval, WrappingRange { start: 0, end: 1 });
                     }
+                    // We store bools as `i8` so we need to truncate to `i1`.
+                    llval = bx.to_immediate_scalar(llval, scalar);
                 }
-                // We store bools as `i8` so we need to truncate to `i1`.
-                llval = bx.to_immediate(llval, arg.layout);
             }
         }
 
@@ -1604,7 +1604,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
         } else {
             // If the tuple is immediate, the elements are as well.
             for i in 0..tuple.layout.fields.count() {
-                let op = tuple.extract_field(bx, i);
+                let op = tuple.extract_field(self, bx, i);
                 self.codegen_argument(bx, op, llargs, &args[i]);
             }
         }
diff --git a/compiler/rustc_codegen_ssa/src/mir/operand.rs b/compiler/rustc_codegen_ssa/src/mir/operand.rs
index 9ca7d4f8f00..958a52a2cb1 100644
--- a/compiler/rustc_codegen_ssa/src/mir/operand.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/operand.rs
@@ -1,15 +1,14 @@
-use std::assert_matches::assert_matches;
 use std::fmt;
 
 use arrayvec::ArrayVec;
 use either::Either;
 use rustc_abi as abi;
 use rustc_abi::{Align, BackendRepr, Size};
-use rustc_middle::bug;
 use rustc_middle::mir::interpret::{Pointer, Scalar, alloc_range};
 use rustc_middle::mir::{self, ConstValue};
 use rustc_middle::ty::Ty;
 use rustc_middle::ty::layout::{LayoutOf, TyAndLayout};
+use rustc_middle::{bug, span_bug};
 use tracing::debug;
 
 use super::place::{PlaceRef, PlaceValue};
@@ -352,79 +351,83 @@ impl<'a, 'tcx, V: CodegenObject> OperandRef<'tcx, V> {
 
     pub(crate) fn extract_field<Bx: BuilderMethods<'a, 'tcx, Value = V>>(
         &self,
+        fx: &mut FunctionCx<'a, 'tcx, Bx>,
         bx: &mut Bx,
         i: usize,
     ) -> Self {
         let field = self.layout.field(bx.cx(), i);
         let offset = self.layout.fields.offset(i);
 
-        let mut val = match (self.val, self.layout.backend_repr) {
-            // If the field is ZST, it has no data.
-            _ if field.is_zst() => OperandValue::ZeroSized,
-
-            // Newtype of a scalar, scalar pair or vector.
-            (OperandValue::Immediate(_) | OperandValue::Pair(..), _)
-                if field.size == self.layout.size =>
-            {
-                assert_eq!(offset.bytes(), 0);
-                self.val
+        let val = if field.is_zst() {
+            OperandValue::ZeroSized
+        } else if field.size == self.layout.size {
+            assert_eq!(offset.bytes(), 0);
+            if let Some(field_val) = fx.codegen_transmute_operand(bx, *self, field) {
+                field_val
+            } else {
+                // We have to go through memory for things like a
+                // newtype vector of array, e.g. `#[repr(simd)] struct S([i32; 4]);`
+                let place = PlaceRef::alloca(bx, field);
+                self.val.store(bx, place.val.with_type(self.layout));
+                bx.load_operand(place).val
             }
-
-            // Extract a scalar component from a pair.
-            (OperandValue::Pair(a_llval, b_llval), BackendRepr::ScalarPair(a, b)) => {
-                if offset.bytes() == 0 {
-                    assert_eq!(field.size, a.size(bx.cx()));
-                    OperandValue::Immediate(a_llval)
-                } else {
-                    assert_eq!(offset, a.size(bx.cx()).align_to(b.align(bx.cx()).abi));
-                    assert_eq!(field.size, b.size(bx.cx()));
-                    OperandValue::Immediate(b_llval)
+        } else {
+            let (in_scalar, imm) = match (self.val, self.layout.backend_repr) {
+                // Extract a scalar component from a pair.
+                (OperandValue::Pair(a_llval, b_llval), BackendRepr::ScalarPair(a, b)) => {
+                    if offset.bytes() == 0 {
+                        assert_eq!(field.size, a.size(bx.cx()));
+                        (Some(a), a_llval)
+                    } else {
+                        assert_eq!(offset, a.size(bx.cx()).align_to(b.align(bx.cx()).abi));
+                        assert_eq!(field.size, b.size(bx.cx()));
+                        (Some(b), b_llval)
+                    }
                 }
-            }
 
-            // `#[repr(simd)]` types are also immediate.
-            (OperandValue::Immediate(llval), BackendRepr::Vector { .. }) => {
-                OperandValue::Immediate(bx.extract_element(llval, bx.cx().const_usize(i as u64)))
-            }
+                // `#[repr(simd)]` types are also immediate.
+                (OperandValue::Immediate(llval), BackendRepr::Vector { .. }) => {
+                    (None, bx.extract_element(llval, bx.cx().const_usize(i as u64)))
+                }
 
-            _ => bug!("OperandRef::extract_field({:?}): not applicable", self),
+                _ => {
+                    span_bug!(fx.mir.span, "OperandRef::extract_field({:?}): not applicable", self)
+                }
+            };
+            OperandValue::Immediate(match field.backend_repr {
+                BackendRepr::Vector { .. } => imm,
+                BackendRepr::Scalar(out_scalar) => {
+                    let Some(in_scalar) = in_scalar else {
+                        span_bug!(
+                            fx.mir.span,
+                            "OperandRef::extract_field({:?}): missing input scalar for output scalar",
+                            self
+                        )
+                    };
+                    if in_scalar != out_scalar {
+                        // If the backend and backend_immediate types might differ,
+                        // flip back to the backend type then to the new immediate.
+                        // This avoids nop truncations, but still handles things like
+                        // bools in union fields, which need to be truncated.
+                        let backend = bx.from_immediate(imm);
+                        bx.to_immediate_scalar(backend, out_scalar)
+                    } else {
+                        imm
+                    }
+                }
+                BackendRepr::Memory { sized: true } => {
+                    span_bug!(
+                        fx.mir.span,
+                        "Projecting into a simd type with padding doesn't work; \
+                         See <https://github.com/rust-lang/rust/issues/137108>",
+                    );
+                }
+                BackendRepr::Uninhabited
+                | BackendRepr::ScalarPair(_, _)
+                | BackendRepr::Memory { sized: false } => bug!(),
+            })
         };
 
-        match (&mut val, field.backend_repr) {
-            (OperandValue::ZeroSized, _) => {}
-            (
-                OperandValue::Immediate(llval),
-                BackendRepr::Scalar(_) | BackendRepr::ScalarPair(..) | BackendRepr::Vector { .. },
-            ) => {
-                // Bools in union fields needs to be truncated.
-                *llval = bx.to_immediate(*llval, field);
-            }
-            (OperandValue::Pair(a, b), BackendRepr::ScalarPair(a_abi, b_abi)) => {
-                // Bools in union fields needs to be truncated.
-                *a = bx.to_immediate_scalar(*a, a_abi);
-                *b = bx.to_immediate_scalar(*b, b_abi);
-            }
-            // Newtype vector of array, e.g. #[repr(simd)] struct S([i32; 4]);
-            (OperandValue::Immediate(llval), BackendRepr::Memory { sized: true }) => {
-                assert_matches!(self.layout.backend_repr, BackendRepr::Vector { .. });
-
-                let llfield_ty = bx.cx().backend_type(field);
-
-                // Can't bitcast an aggregate, so round trip through memory.
-                let llptr = bx.alloca(field.size, field.align.abi);
-                bx.store(*llval, llptr, field.align.abi);
-                *llval = bx.load(llfield_ty, llptr, field.align.abi);
-            }
-            (
-                OperandValue::Immediate(_),
-                BackendRepr::Uninhabited | BackendRepr::Memory { sized: false },
-            ) => {
-                bug!()
-            }
-            (OperandValue::Pair(..), _) => bug!(),
-            (OperandValue::Ref(..), _) => bug!(),
-        }
-
         OperandRef { val, layout: field }
     }
 }
@@ -587,7 +590,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
                                 "Bad PlaceRef: destructing pointers should use cast/PtrMetadata, \
                                  but tried to access field {f:?} of pointer {o:?}",
                             );
-                            o = o.extract_field(bx, f.index());
+                            o = o.extract_field(self, bx, f.index());
                         }
                         mir::ProjectionElem::Index(_)
                         | mir::ProjectionElem::ConstantIndex { .. } => {
diff --git a/compiler/rustc_codegen_ssa/src/mir/rvalue.rs b/compiler/rustc_codegen_ssa/src/mir/rvalue.rs
index 4c5b183cfe9..daa4fa90ed7 100644
--- a/compiler/rustc_codegen_ssa/src/mir/rvalue.rs
+++ b/compiler/rustc_codegen_ssa/src/mir/rvalue.rs
@@ -231,7 +231,7 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
     ///
     /// Returns `None` for cases that can't work in that framework, such as for
     /// `Immediate`->`Ref` that needs an `alloc` to get the location.
-    fn codegen_transmute_operand(
+    pub(crate) fn codegen_transmute_operand(
         &mut self,
         bx: &mut Bx,
         operand: OperandRef<'tcx, Bx::Value>,
@@ -260,6 +260,8 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
             OperandValue::Ref(source_place_val) => {
                 assert_eq!(source_place_val.llextra, None);
                 assert_matches!(operand_kind, OperandValueKind::Ref);
+                // The existing alignment is part of `source_place_val`,
+                // so that alignment will be used, not `cast`'s.
                 Some(bx.load_operand(source_place_val.with_type(cast)).val)
             }
             OperandValue::ZeroSized => {
diff --git a/compiler/rustc_codegen_ssa/src/traits/builder.rs b/compiler/rustc_codegen_ssa/src/traits/builder.rs
index 345db313022..2b00ba01946 100644
--- a/compiler/rustc_codegen_ssa/src/traits/builder.rs
+++ b/compiler/rustc_codegen_ssa/src/traits/builder.rs
@@ -1,7 +1,7 @@
 use std::assert_matches::assert_matches;
 use std::ops::Deref;
 
-use rustc_abi::{Align, BackendRepr, Scalar, Size, WrappingRange};
+use rustc_abi::{Align, Scalar, Size, WrappingRange};
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrs;
 use rustc_middle::ty::layout::{FnAbiOf, LayoutOf, TyAndLayout};
 use rustc_middle::ty::{Instance, Ty};
@@ -223,13 +223,6 @@ pub trait BuilderMethods<'a, 'tcx>:
     ) -> (Self::Value, Self::Value);
 
     fn from_immediate(&mut self, val: Self::Value) -> Self::Value;
-    fn to_immediate(&mut self, val: Self::Value, layout: TyAndLayout<'_>) -> Self::Value {
-        if let BackendRepr::Scalar(scalar) = layout.backend_repr {
-            self.to_immediate_scalar(val, scalar)
-        } else {
-            val
-        }
-    }
     fn to_immediate_scalar(&mut self, val: Self::Value, scalar: Scalar) -> Self::Value;
 
     fn alloca(&mut self, size: Size, align: Align) -> Self::Value;
diff --git a/tests/codegen/simd/project-to-simd-array-field.rs b/tests/codegen/simd/project-to-simd-array-field.rs
new file mode 100644
index 00000000000..29fab640633
--- /dev/null
+++ b/tests/codegen/simd/project-to-simd-array-field.rs
@@ -0,0 +1,31 @@
+//@compile-flags: -Copt-level=3
+
+#![crate_type = "lib"]
+#![feature(repr_simd, core_intrinsics)]
+
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy)]
+#[repr(simd)]
+struct i32x4([i32; 4]);
+
+#[inline(always)]
+fn to_array4(a: i32x4) -> [i32; 4] {
+    a.0
+}
+
+// CHECK-LABEL: simd_add_self_then_return_array(
+// CHECK-SAME: ptr{{.+}}sret{{.+}}%[[RET:.+]],
+// CHECK-SAME: ptr{{.+}}%a)
+#[no_mangle]
+pub fn simd_add_self_then_return_array(a: &i32x4) -> [i32; 4] {
+    // It would be nice to just ban `.0` into simd types,
+    // but until we do, this has to keep working.
+    // See also <https://github.com/rust-lang/rust/issues/105439>
+
+    // CHECK: %[[T1:.+]] = load <4 x i32>, ptr %a
+    // CHECK: %[[T2:.+]] = shl <4 x i32> %[[T1]], {{splat \(i32 1\)|<i32 1, i32 1, i32 1, i32 1>}}
+    // CHECK: store <4 x i32> %[[T2]], ptr %[[RET]]
+    let a = *a;
+    let b = unsafe { core::intrinsics::simd::simd_add(a, a) };
+    to_array4(b)
+}
diff --git a/tests/crashes/project-to-simd-array-field.rs b/tests/crashes/project-to-simd-array-field.rs
new file mode 100644
index 00000000000..6dc916c41db
--- /dev/null
+++ b/tests/crashes/project-to-simd-array-field.rs
@@ -0,0 +1,33 @@
+//@ known-bug: #137108
+//@compile-flags: -Copt-level=3
+
+// If you fix this, put it in the corresponding codegen test,
+// not in a UI test like the readme says.
+
+#![crate_type = "lib"]
+
+#![feature(repr_simd, core_intrinsics)]
+
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy)]
+#[repr(simd)]
+struct i32x3([i32; 3]);
+
+const _: () = { assert!(size_of::<i32x3>() == 16) };
+
+#[inline(always)]
+fn to_array3(a: i32x3) -> [i32; 3] {
+    a.0
+}
+
+// CHECK-LABEL: simd_add_self_then_return_array_packed(
+// CHECK-SAME: ptr{{.+}}sret{{.+}}%[[RET:.+]],
+// CHECK-SAME: ptr{{.+}}%a)
+#[no_mangle]
+pub fn simd_add_self_then_return_array_packed(a: i32x3) -> [i32; 3] {
+    // CHECK: %[[T1:.+]] = load <3 x i32>, ptr %a
+    // CHECK: %[[T2:.+]] = shl <3 x i32> %[[T1]], <i32 1, i32 1, i32 1>
+    // CHECK: store <3 x i32> %[[T2]], ptr %[[RET]]
+    let b = unsafe { core::intrinsics::simd::simd_add(a, a) };
+    to_array3(b)
+}