Add support for target builtins

2024-11-02 07:22:42 +00:00 · 2022-02-06 17:04:24 -05:00 · 2022-02-06 17:04:24 -05:00 · 02970a6ca8
commit 02970a6ca8
parent 14c33f592a
20 changed files with 778 additions and 336 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@ perf.data.old
 /rust
 /simple-raytracer
 /regex
+/rand
 gimple*
 *asm
 res
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -41,7 +41,7 @@ dependencies = [
 [[package]]
 name = "gccjit"
 version = "1.0.0"
-source = "git+https://github.com/antoyo/gccjit.rs#bdecdecfb8a02ec861a39a350f990faa33bd31c3"
+source = "git+https://github.com/antoyo/gccjit.rs#f24e1f49d99430941d8a747275b41c9a7930e049"
 dependencies = [
 "gccjit_sys",
 ]
@@ -49,7 +49,7 @@ dependencies = [
 [[package]]
 name = "gccjit_sys"
 version = "0.0.1"
-source = "git+https://github.com/antoyo/gccjit.rs#bdecdecfb8a02ec861a39a350f990faa33bd31c3"
+source = "git+https://github.com/antoyo/gccjit.rs#f24e1f49d99430941d8a747275b41c9a7930e049"
 dependencies = [
 "libc 0.1.12",
 ]
--- a/config.sh
+++ b/config.sh
@@ -2,7 +2,7 @@ set -e

 export CARGO_INCREMENTAL=0

-if [ -f ./gcc_path ]; then 
+if [ -f ./gcc_path ]; then
    export GCC_PATH=$(cat gcc_path)
 else
    echo 'Please put the path to your custom build of libgccjit in the file `gcc_path`, see Readme.md for details'
@@ -38,7 +38,7 @@ if [[ "$HOST_TRIPLE" != "$TARGET_TRIPLE" ]]; then
   fi
 fi

-export RUSTFLAGS="$linker -Cpanic=abort -Csymbol-mangling-version=v0 -Cdebuginfo=2 -Clto=off -Zpanic-abort-tests -Zcodegen-backend=$(pwd)/target/${CHANNEL:-debug}/librustc_codegen_gcc.$dylib_ext --sysroot $(pwd)/build_sysroot/sysroot"
+export RUSTFLAGS="$CG_RUSTFLAGS $linker -Cpanic=abort -Csymbol-mangling-version=v0 -Cdebuginfo=2 -Clto=off -Zpanic-abort-tests -Zcodegen-backend=$(pwd)/target/${CHANNEL:-debug}/librustc_codegen_gcc.$dylib_ext --sysroot $(pwd)/build_sysroot/sysroot"

 # FIXME(antoyo): remove once the atomic shim is gone
 if [[ `uname` == 'Darwin' ]]; then
--- a/crate_patches/0002-rand-Disable-failing-test.patch
+++ b/crate_patches/0002-rand-Disable-failing-test.patch
@@ -0,0 +1,32 @@
+From a8fb97120d71252538b6b026695df40d02696bdb Mon Sep 17 00:00:00 2001
+From: bjorn3 <bjorn3@users.noreply.github.com>
+Date: Sat, 15 Aug 2020 20:04:38 +0200
+Subject: [PATCH] [rand] Disable failing test
+
+---
+ src/distributions/uniform.rs | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
+index 480b859..c80bb6f 100644
+--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
+@@ -1085,7 +1085,7 @@ mod tests {
+             _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly")
+         }
+     }
+-    
++
+     #[test]
+     #[cfg(feature = "serde1")]
+     fn test_uniform_serialization() {
+@@ -1314,6 +1314,7 @@ mod tests {
+         not(target_arch = "wasm32"),
+         not(target_arch = "asmjs")
+     ))]
++    #[ignore] // FIXME
+     fn test_float_assertions() {
+         use super::SampleUniform;
+         use std::panic::catch_unwind;
+-- 
+2.20.1
--- a/example/std_example.rs
+++ b/example/std_example.rs
@@ -93,9 +93,9 @@ fn main() {

    println!("{:?}", std::intrinsics::caller_location());

-    /*unsafe {
+    unsafe {
        test_simd();
-    }*/
+    }

    Box::pin(move |mut _task_context| {
        yield ();
@@ -104,7 +104,7 @@ fn main() {
    println!("End");
 }

-/*#[target_feature(enable = "sse2")]
+#[target_feature(enable = "sse2")]
 unsafe fn test_simd() {
    let x = _mm_setzero_si128();
    let y = _mm_set1_epi16(7);
@@ -112,7 +112,7 @@ unsafe fn test_simd() {
    let cmp_eq = _mm_cmpeq_epi8(y, y);
    let cmp_lt = _mm_cmplt_epi8(y, y);

-    /*assert_eq!(std::mem::transmute::<_, [u16; 8]>(or), [7, 7, 7, 7, 7, 7, 7, 7]);
+    assert_eq!(std::mem::transmute::<_, [u16; 8]>(or), [7, 7, 7, 7, 7, 7, 7, 7]);
    assert_eq!(std::mem::transmute::<_, [u16; 8]>(cmp_eq), [0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff]);
    assert_eq!(std::mem::transmute::<_, [u16; 8]>(cmp_lt), [0, 0, 0, 0, 0, 0, 0, 0]);

@@ -124,14 +124,14 @@ unsafe fn test_simd() {
    test_mm_cvtepi8_epi16();
    test_mm_cvtsi128_si64();

-    // FIXME(#666) implement `#[rustc_arg_required_const(..)]` support
-    //test_mm_extract_epi8();
+    test_mm_extract_epi8();
+    test_mm_insert_epi16();

    let mask1 = _mm_movemask_epi8(dbg!(_mm_setr_epi8(255u8 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)));
-    assert_eq!(mask1, 1);*/
-}*/
+    assert_eq!(mask1, 1);
+}

-/*#[target_feature(enable = "sse2")]
+#[target_feature(enable = "sse2")]
 unsafe fn test_mm_slli_si128() {
    #[rustfmt::skip]
    let a = _mm_setr_epi8(
@@ -155,22 +155,9 @@ unsafe fn test_mm_slli_si128() {
    );
    let r = _mm_slli_si128(a, 16);
    assert_eq_m128i(r, _mm_set1_epi8(0));
-
-    #[rustfmt::skip]
-    let a = _mm_setr_epi8(
-        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-    );
-    let r = _mm_slli_si128(a, -1);
-    assert_eq_m128i(_mm_set1_epi8(0), r);
-
-    #[rustfmt::skip]
-    let a = _mm_setr_epi8(
-        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-    );
-    let r = _mm_slli_si128(a, -0x80000000);
-    assert_eq_m128i(r, _mm_set1_epi8(0));
 }

+
 #[target_feature(enable = "sse2")]
 unsafe fn test_mm_movemask_epi8() {
    #[rustfmt::skip]
@@ -254,10 +241,19 @@ unsafe fn test_mm_extract_epi8() {
        8, 9, 10, 11, 12, 13, 14, 15
    );
    let r1 = _mm_extract_epi8(a, 0);
-    let r2 = _mm_extract_epi8(a, 19);
+    let r2 = _mm_extract_epi8(a, 3);
    assert_eq!(r1, 0xFF);
    assert_eq!(r2, 3);
-}*/
+}
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "sse2")]
+unsafe fn test_mm_insert_epi16() {
+    let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+    let r = _mm_insert_epi16(a, 9, 0);
+    let e = _mm_setr_epi16(9, 1, 2, 3, 4, 5, 6, 7);
+    assert_eq_m128i(r, e);
+}

 #[derive(PartialEq)]
 enum LoopState {
--- a/patches/0024-core-Disable-portable-simd-test.patch
+++ b/patches/0024-core-Disable-portable-simd-test.patch
@@ -7,167 +7,6 @@ Subject: [PATCH] [core] Disable portable-simd test
 library/core/tests/lib.rs | 1 -
 1 file changed, 1 deletion(-)

-diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs
-index aa1ad93..95fbf55 100644
---- a/library/core/src/lib.rs
-+++ b/library/core/src/lib.rs
-@@ -398,23 +398,4 @@ pub mod arch {
-     }
- }
- 
--// Pull in the `core_simd` crate directly into libcore. The contents of
--// `core_simd` are in a different repository: rust-lang/portable-simd.
--//
--// `core_simd` depends on libcore, but the contents of this module are
--// set up in such a way that directly pulling it here works such that the
--// crate uses this crate as its libcore.
--#[path = "../../portable-simd/crates/core_simd/src/mod.rs"]
--#[allow(missing_debug_implementations, dead_code, unsafe_op_in_unsafe_fn, unused_unsafe)]
--#[allow(rustdoc::bare_urls)]
--#[unstable(feature = "portable_simd", issue = "86656")]
--mod core_simd;
--
--#[doc = include_str!("../../portable-simd/crates/core_simd/src/core_simd_docs.md")]
--#[unstable(feature = "portable_simd", issue = "86656")]
--pub mod simd {
--    #[unstable(feature = "portable_simd", issue = "86656")]
--    pub use crate::core_simd::simd::*;
--}
--
- include!("primitive_docs.rs");
-diff --git a/library/core/src/slice/mod.rs b/library/core/src/slice/mod.rs
-index cd38c3a..ad632dc 100644
---- a/library/core/src/slice/mod.rs
-+++ b/library/core/src/slice/mod.rs
-@@ -17,6 +17,5 @@ use crate::ptr;
- use crate::result::Result;
- use crate::result::Result::{Err, Ok};
--use crate::simd::{self, Simd};
- use crate::slice;
- 
- #[unstable(
-@@ -3475,121 +3474,6 @@ impl<T> [T] {
-         }
-     }
- 
--    /// Split a slice into a prefix, a middle of aligned SIMD types, and a suffix.
--    ///
--    /// This is a safe wrapper around [`slice::align_to`], so has the same weak
--    /// postconditions as that method.  You're only assured that
--    /// `self.len() == prefix.len() + middle.len() * LANES + suffix.len()`.
--    ///
--    /// Notably, all of the following are possible:
--    /// - `prefix.len() >= LANES`.
--    /// - `middle.is_empty()` despite `self.len() >= 3 * LANES`.
--    /// - `suffix.len() >= LANES`.
--    ///
--    /// That said, this is a safe method, so if you're only writing safe code,
--    /// then this can at most cause incorrect logic, not unsoundness.
--    ///
--    /// # Panics
--    ///
--    /// This will panic if the size of the SIMD type is different from
--    /// `LANES` times that of the scalar.
--    ///
--    /// At the time of writing, the trait restrictions on `Simd<T, LANES>` keeps
--    /// that from ever happening, as only power-of-two numbers of lanes are
--    /// supported.  It's possible that, in the future, those restrictions might
--    /// be lifted in a way that would make it possible to see panics from this
--    /// method for something like `LANES == 3`.
--    ///
--    /// # Examples
--    ///
--    /// ```
--    /// #![feature(portable_simd)]
--    ///
--    /// let short = &[1, 2, 3];
--    /// let (prefix, middle, suffix) = short.as_simd::<4>();
--    /// assert_eq!(middle, []); // Not enough elements for anything in the middle
--    ///
--    /// // They might be split in any possible way between prefix and suffix
--    /// let it = prefix.iter().chain(suffix).copied();
--    /// assert_eq!(it.collect::<Vec<_>>(), vec![1, 2, 3]);
--    ///
--    /// fn basic_simd_sum(x: &[f32]) -> f32 {
--    ///     use std::ops::Add;
--    ///     use std::simd::f32x4;
--    ///     let (prefix, middle, suffix) = x.as_simd();
--    ///     let sums = f32x4::from_array([
--    ///         prefix.iter().copied().sum(),
--    ///         0.0,
--    ///         0.0,
--    ///         suffix.iter().copied().sum(),
--    ///     ]);
--    ///     let sums = middle.iter().copied().fold(sums, f32x4::add);
--    ///     sums.reduce_sum()
--    /// }
--    ///
--    /// let numbers: Vec<f32> = (1..101).map(|x| x as _).collect();
--    /// assert_eq!(basic_simd_sum(&numbers[1..99]), 4949.0);
--    /// ```
--    #[unstable(feature = "portable_simd", issue = "86656")]
--    pub fn as_simd<const LANES: usize>(&self) -> (&[T], &[Simd<T, LANES>], &[T])
--    where
--        Simd<T, LANES>: AsRef<[T; LANES]>,
--        T: simd::SimdElement,
--        simd::LaneCount<LANES>: simd::SupportedLaneCount,
--    {
--        // These are expected to always match, as vector types are laid out like
--        // arrays per <https://llvm.org/docs/LangRef.html#vector-type>, but we
--        // might as well double-check since it'll optimize away anyhow.
--        assert_eq!(mem::size_of::<Simd<T, LANES>>(), mem::size_of::<[T; LANES]>());
--
--        // SAFETY: The simd types have the same layout as arrays, just with
--        // potentially-higher alignment, so the de-facto transmutes are sound.
--        unsafe { self.align_to() }
--    }
--
--    /// Split a slice into a prefix, a middle of aligned SIMD types, and a suffix.
--    ///
--    /// This is a safe wrapper around [`slice::align_to_mut`], so has the same weak
--    /// postconditions as that method.  You're only assured that
--    /// `self.len() == prefix.len() + middle.len() * LANES + suffix.len()`.
--    ///
--    /// Notably, all of the following are possible:
--    /// - `prefix.len() >= LANES`.
--    /// - `middle.is_empty()` despite `self.len() >= 3 * LANES`.
--    /// - `suffix.len() >= LANES`.
--    ///
--    /// That said, this is a safe method, so if you're only writing safe code,
--    /// then this can at most cause incorrect logic, not unsoundness.
--    ///
--    /// This is the mutable version of [`slice::as_simd`]; see that for examples.
--    ///
--    /// # Panics
--    ///
--    /// This will panic if the size of the SIMD type is different from
--    /// `LANES` times that of the scalar.
--    ///
--    /// At the time of writing, the trait restrictions on `Simd<T, LANES>` keeps
--    /// that from ever happening, as only power-of-two numbers of lanes are
--    /// supported.  It's possible that, in the future, those restrictions might
--    /// be lifted in a way that would make it possible to see panics from this
--    /// method for something like `LANES == 3`.
--    #[unstable(feature = "portable_simd", issue = "86656")]
--    pub fn as_simd_mut<const LANES: usize>(&mut self) -> (&mut [T], &mut [Simd<T, LANES>], &mut [T])
--    where
--        Simd<T, LANES>: AsMut<[T; LANES]>,
--        T: simd::SimdElement,
--        simd::LaneCount<LANES>: simd::SupportedLaneCount,
--    {
--        // These are expected to always match, as vector types are laid out like
--        // arrays per <https://llvm.org/docs/LangRef.html#vector-type>, but we
--        // might as well double-check since it'll optimize away anyhow.
--        assert_eq!(mem::size_of::<Simd<T, LANES>>(), mem::size_of::<[T; LANES]>());
--
--        // SAFETY: The simd types have the same layout as arrays, just with
--        // potentially-higher alignment, so the de-facto transmutes are sound.
--        unsafe { self.align_to_mut() }
--    }
--
-     /// Checks if the elements of this slice are sorted.
-     ///
-     /// That is, for each element `a` and its following element `b`, `a <= b` must hold. If the
 diff --git a/library/core/tests/lib.rs b/library/core/tests/lib.rs
 index 06c7be0..359e2e7 100644
 --- a/library/core/tests/lib.rs
@@ -188,41 +27,3 @@ index 06c7be0..359e2e7 100644
 mod slice;
 mod str;
 mod str_lossy;
-diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs
-index 5dc586d..b6fc48f 100644
---- a/library/std/src/lib.rs
-+++ b/library/std/src/lib.rs
-@@ -312,6 +312,5 @@
- #![feature(panic_can_unwind)]
- #![feature(panic_unwind)]
- #![feature(platform_intrinsics)]
--#![feature(portable_simd)]
- #![feature(prelude_import)]
- #![feature(ptr_as_uninit)]
-@@ -508,23 +508,6 @@ pub mod time;
- #[unstable(feature = "once_cell", issue = "74465")]
- pub mod lazy;
- 
--// Pull in `std_float` crate  into libstd. The contents of
--// `std_float` are in a different repository: rust-lang/portable-simd.
--#[path = "../../portable-simd/crates/std_float/src/lib.rs"]
--#[allow(missing_debug_implementations, dead_code, unsafe_op_in_unsafe_fn, unused_unsafe)]
--#[allow(rustdoc::bare_urls)]
--#[unstable(feature = "portable_simd", issue = "86656")]
--mod std_float;
--
--#[doc = include_str!("../../portable-simd/crates/core_simd/src/core_simd_docs.md")]
--#[unstable(feature = "portable_simd", issue = "86656")]
--pub mod simd {
--    #[doc(inline)]
--    pub use crate::std_float::StdFloat;
--    #[doc(inline)]
--    pub use core::simd::*;
--}
--
- #[stable(feature = "futures_api", since = "1.36.0")]
- pub mod task {
-     //! Types and Traits for working with asynchronous tasks.
--- 
-2.26.2.7.g19db9cfb68
-
--- a/prepare.sh
+++ b/prepare.sh
@@ -5,6 +5,13 @@ source prepare_build.sh

 cargo install hyperfine || echo "Skipping hyperfine install"

+git clone https://github.com/rust-random/rand.git || echo "rust-random/rand has already been cloned"
+pushd rand
+git checkout -- .
+git checkout 0f933f9c7176e53b2a3c7952ded484e1783f0bf1
+git am ../crate_patches/*-rand-*.patch
+popd
+
 git clone https://github.com/rust-lang/regex.git || echo "rust-lang/regex has already been cloned"
 pushd regex
 git checkout -- .
--- a/rustc_patches/compile_test.patch
+++ b/rustc_patches/compile_test.patch
@@ -0,0 +1,14 @@
+diff --git a/src/tools/compiletest/src/header.rs b/src/tools/compiletest/src/header.rs
+index 887d27fd6dca4..2c2239f2b83d1 100644
+--- a/src/tools/compiletest/src/header.rs
+++ b/src/tools/compiletest/src/header.rs
+@@ -806,8 +806,8 @@ pub fn make_test_description<R: Read>(
+     cfg: Option<&str>,
+ ) -> test::TestDesc {
+     let mut ignore = false;
+     #[cfg(not(bootstrap))]
+-    let ignore_message: Option<String> = None;
++    let ignore_message: Option<&str> = None;
+     let mut should_fail = false;
+
+     let rustc_has_profiler_support = env::var_os("RUSTC_PROFILER_SUPPORT").is_some();
--- a/src/base.rs
+++ b/src/base.rs
@@ -78,6 +78,11 @@ pub fn compile_codegen_unit<'tcx>(tcx: TyCtxt<'tcx>, cgu_name: Symbol, supports_
        let context = Context::default();
        // TODO(antoyo): only set on x86 platforms.
        context.add_command_line_option("-masm=intel");
+        // TODO(antoyo): only add the following cli argument if the feature is supported.
+        context.add_command_line_option("-mavx2");
+        // FIXME(antoyo): the following causes an illegal instruction on vmovdqu64 in std_example on my CPU.
+        // Only add if the CPU supports it.
+        //context.add_command_line_option("-mavx512f");
        for arg in &tcx.sess.opts.cg.llvm_args {
            context.add_command_line_option(arg);
        }
--- a/src/builder.rs
+++ b/src/builder.rs
@@ -3,7 +3,6 @@ use std::cell::Cell;
 use std::convert::TryFrom;
 use std::ops::Deref;

-use gccjit::FunctionType;
 use gccjit::{
    BinaryOp,
    Block,
@@ -224,10 +223,14 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
            .map(|(index, (expected_ty, &actual_val))| {
                let actual_ty = actual_val.get_type();
                if expected_ty != actual_ty {
-                    if on_stack_param_indices.contains(&index) {
+                    if !actual_ty.is_vector() && !expected_ty.is_vector() && actual_ty.is_integral() && expected_ty.is_integral() && actual_ty.get_size() != expected_ty.get_size() {
+                        self.context.new_cast(None, actual_val, expected_ty)
+                    }
+                    else if on_stack_param_indices.contains(&index) {
                        actual_val.dereference(None).to_rvalue()
                    }
                    else {
+                        assert!(!((actual_ty.is_vector() && !expected_ty.is_vector()) || (!actual_ty.is_vector() && expected_ty.is_vector())), "{:?} ({}) -> {:?} ({}), index: {:?}[{}]", actual_ty, actual_ty.is_vector(), expected_ty, expected_ty.is_vector(), func_ptr, index);
                        self.bitcast(actual_val, expected_ty)
                    }
                }
@@ -286,15 +289,10 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
        // gccjit requires to use the result of functions, even when it's not used.
        // That's why we assign the result to a local or call add_eval().
        let gcc_func = func_ptr.get_type().dyncast_function_ptr_type().expect("function ptr");
-        let mut return_type = gcc_func.get_return_type();
+        let return_type = gcc_func.get_return_type();
        let void_type = self.context.new_type::<()>();
        let current_func = self.block.get_function();

-        // FIXME(antoyo): As a temporary workaround for unsupported LLVM intrinsics.
-        if gcc_func.get_param_count() == 0 && format!("{:?}", func_ptr) == "__builtin_ia32_pmovmskb128" {
-            return_type = self.int_type;
-        }
-
        if return_type != void_type {
            unsafe { RETURN_VALUE_COUNT += 1 };
            let result = current_func.new_local(None, return_type, &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT }));
@@ -302,13 +300,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
            result.to_rvalue()
        }
        else {
-            if gcc_func.get_param_count() == 0 {
-                // FIXME(antoyo): As a temporary workaround for unsupported LLVM intrinsics.
-                self.block.add_eval(None, self.cx.context.new_call_through_ptr(None, func_ptr, &[]));
-            }
-            else {
-                self.block.add_eval(None, self.cx.context.new_call_through_ptr(None, func_ptr, &args));
-            }
+            self.block.add_eval(None, self.cx.context.new_call_through_ptr(None, func_ptr, &args));
            // Return dummy value when not having return value.
            let result = current_func.new_local(None, self.isize_type, "dummyValueThatShouldNeverBeUsed");
            self.block.add_assignment(None, result, self.context.new_rvalue_from_long(self.isize_type, 0));
@@ -529,12 +521,12 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
    }

    fn frem(&mut self, a: RValue<'gcc>, b: RValue<'gcc>) -> RValue<'gcc> {
-        if a.get_type() == self.cx.float_type {
+        if a.get_type().is_compatible_with(self.cx.float_type) {
            let fmodf = self.context.get_builtin_function("fmodf");
            // FIXME(antoyo): this seems to produce the wrong result.
            return self.context.new_call(None, fmodf, &[a, b]);
        }
-        assert_eq!(a.get_type(), self.cx.double_type);
+        assert_eq!(a.get_type().unqualified(), self.cx.double_type);

        let fmod = self.context.get_builtin_function("fmod");
        return self.context.new_call(None, fmod, &[a, b]);
@@ -657,7 +649,7 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
        // NOTE: instead of returning the dereference here, we have to assign it to a variable in
        // the current basic block. Otherwise, it could be used in another basic block, causing a
        // dereference after a drop, for instance.
-        // TODO(antoyo): handle align.
+        // TODO(antoyo): handle align of the load instruction.
        let deref = ptr.dereference(None).to_rvalue();
        let value_type = deref.get_type();
        unsafe { RETURN_VALUE_COUNT += 1 };
@@ -797,9 +789,16 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
        self.store_with_flags(val, ptr, align, MemFlags::empty())
    }

-    fn store_with_flags(&mut self, val: RValue<'gcc>, ptr: RValue<'gcc>, _align: Align, _flags: MemFlags) -> RValue<'gcc> {
+    fn store_with_flags(&mut self, val: RValue<'gcc>, ptr: RValue<'gcc>, align: Align, _flags: MemFlags) -> RValue<'gcc> {
        let ptr = self.check_store(val, ptr);
-        self.llbb().add_assignment(None, ptr.dereference(None), val);
+        let destination = ptr.dereference(None);
+        // NOTE: libgccjit does not support specifying the alignment on the assignment, so we cast
+        // to type so it gets the proper alignment.
+        let destination_type = destination.to_rvalue().get_type().unqualified();
+        let aligned_type = destination_type.get_aligned(align.bytes()).make_pointer();
+        let aligned_destination = self.cx.context.new_bitcast(None, ptr, aligned_type);
+        let aligned_destination = aligned_destination.dereference(None);
+        self.llbb().add_assignment(None, aligned_destination, val);
        // TODO(antoyo): handle align and flags.
        // NOTE: dummy value here since it's never used. FIXME(antoyo): API should not return a value here?
        self.cx.context.new_rvalue_zero(self.type_i32())
@@ -1288,14 +1287,75 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {

 impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
    pub fn shuffle_vector(&mut self, v1: RValue<'gcc>, v2: RValue<'gcc>, mask: RValue<'gcc>) -> RValue<'gcc> {
-        let return_type = v1.get_type();
-        let params = [
-            self.context.new_parameter(None, return_type, "v1"),
-            self.context.new_parameter(None, return_type, "v2"),
-            self.context.new_parameter(None, mask.get_type(), "mask"),
-        ];
-        let shuffle = self.context.new_function(None, FunctionType::Extern, return_type, &params, "_mm_shuffle_epi8", false);
-        self.context.new_call(None, shuffle, &[v1, v2, mask])
+        let struct_type = mask.get_type().is_struct().expect("mask of struct type");
+
+        // TODO(antoyo): use a recursive unqualified() here.
+        let vector_type = v1.get_type().unqualified().dyncast_vector().expect("vector type");
+        let element_type = vector_type.get_element_type();
+        let vec_num_units = vector_type.get_num_units();
+
+        let mask_num_units = struct_type.get_field_count();
+        let mut vector_elements = vec![];
+        let mask_element_type =
+            if element_type.is_integral() {
+                element_type
+            }
+            else {
+                self.int_type
+            };
+        for i in 0..mask_num_units {
+            let field = struct_type.get_field(i as i32);
+            vector_elements.push(self.context.new_cast(None, mask.access_field(None, field).to_rvalue(), mask_element_type));
+        }
+
+        // NOTE: the mask needs to be the same length as the input vectors, so add the missing
+        // elements in the mask if needed.
+        for _ in mask_num_units..vec_num_units {
+            vector_elements.push(self.context.new_rvalue_zero(mask_element_type));
+        }
+
+        let array_type = self.context.new_array_type(None, element_type, vec_num_units as i32);
+        let result_type = self.context.new_vector_type(element_type, mask_num_units as u64);
+        let (v1, v2) =
+            if vec_num_units < mask_num_units {
+                // NOTE: the mask needs to be the same length as the input vectors, so join the 2
+                // vectors and create a dummy second vector.
+                let array = self.context.new_bitcast(None, v1, array_type);
+                let mut elements = vec![];
+                for i in 0..vec_num_units {
+                    elements.push(self.context.new_array_access(None, array, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
+                }
+                let array = self.context.new_bitcast(None, v2, array_type);
+                for i in 0..vec_num_units {
+                    elements.push(self.context.new_array_access(None, array, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
+                }
+                let v1 = self.context.new_rvalue_from_vector(None, result_type, &elements);
+                let zero = self.context.new_rvalue_zero(element_type);
+                let v2 = self.context.new_rvalue_from_vector(None, result_type, &vec![zero; mask_num_units]);
+                (v1, v2)
+            }
+            else {
+                (v1, v2)
+            };
+
+        let new_mask_num_units = std::cmp::max(mask_num_units, vec_num_units);
+        let mask_type = self.context.new_vector_type(mask_element_type, new_mask_num_units as u64);
+        let mask = self.context.new_rvalue_from_vector(None, mask_type, &vector_elements);
+        let result = self.context.new_rvalue_vector_perm(None, v1, v2, mask);
+
+        if vec_num_units != mask_num_units {
+            // NOTE: if padding was added, only select the number of elements of the masks to
+            // remove that padding in the result.
+            let mut elements = vec![];
+            let array = self.context.new_bitcast(None, result, array_type);
+            for i in 0..mask_num_units {
+                elements.push(self.context.new_array_access(None, array, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
+            }
+            self.context.new_rvalue_from_vector(None, result_type, &elements)
+        }
+        else {
+            result
+        }
    }
 }

--- a/src/common.rs
+++ b/src/common.rs
@@ -322,6 +322,8 @@ pub trait TypeReflection<'gcc, 'tcx>  {

    fn is_f32(&self, cx: &CodegenCx<'gcc, 'tcx>) -> bool;
    fn is_f64(&self, cx: &CodegenCx<'gcc, 'tcx>) -> bool;
+
+    fn is_vector(&self) -> bool;
 }

 impl<'gcc, 'tcx> TypeReflection<'gcc, 'tcx> for Type<'gcc> {
@@ -392,4 +394,21 @@ impl<'gcc, 'tcx> TypeReflection<'gcc, 'tcx> for Type<'gcc> {
    fn is_f64(&self, cx: &CodegenCx<'gcc, 'tcx>) -> bool {
        self.unqualified() == cx.context.new_type::<f64>()
    }
+
+    fn is_vector(&self) -> bool {
+        let mut typ = self.clone();
+        loop {
+            if typ.dyncast_vector().is_some() {
+                return true;
+            }
+
+            let old_type = typ;
+            typ = typ.unqualified();
+            if old_type == typ {
+                break;
+            }
+        }
+
+        false
+    }
 }
--- a/src/consts.rs
+++ b/src/consts.rs
@@ -25,7 +25,14 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
                }
            }
        }
-        self.context.new_bitcast(None, value, typ)
+        // NOTE: since bitcast makes a value non-constant, don't bitcast if not necessary as some
+        // SIMD builtins require a constant value.
+        if value.get_type() != typ {
+            self.context.new_bitcast(None, value, typ)
+        }
+        else {
+            value
+        }
    }
 }

@@ -171,8 +178,9 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
                Some(kind) if !self.tcx.sess.fewer_names() => {
                    let name = self.generate_local_symbol_name(kind);
                    // TODO(antoyo): check if it's okay that no link_section is set.
-                    // TODO(antoyo): set alignment here as well.
-                    let global = self.declare_private_global(&name[..], self.val_ty(cv));
+
+                    let typ = self.val_ty(cv).get_aligned(align.bytes());
+                    let global = self.declare_private_global(&name[..], typ);
                    global
                }
                _ => {
--- a/src/context.rs
+++ b/src/context.rs
@@ -269,11 +269,11 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
    }

    pub fn is_native_int_type_or_bool(&self, typ: Type<'gcc>) -> bool {
-        self.is_native_int_type(typ) || typ == self.bool_type
+        self.is_native_int_type(typ) || typ.is_compatible_with(self.bool_type)
    }

    pub fn is_int_type_or_bool(&self, typ: Type<'gcc>) -> bool {
-        self.is_native_int_type(typ) || self.is_non_native_int_type(typ) || typ == self.bool_type
+        self.is_native_int_type(typ) || self.is_non_native_int_type(typ) || typ.is_compatible_with(self.bool_type)
    }

    pub fn sess(&self) -> &Session {
--- a/src/int.rs
+++ b/src/int.rs
@@ -153,8 +153,14 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
        let a_type = a.get_type();
        let b_type = b.get_type();
        if self.is_native_int_type_or_bool(a_type) && self.is_native_int_type_or_bool(b_type) {
-            if a.get_type() != b.get_type() {
-                b = self.context.new_cast(None, b, a.get_type());
+            if a_type != b_type {
+                if a_type.is_vector() {
+                    // Vector types need to be bitcast.
+                    b = self.context.new_bitcast(None, b, a.get_type());
+                }
+                else {
+                    b = self.context.new_cast(None, b, a.get_type());
+                }
            }
            self.context.new_binary_op(None, operation, a_type, a, b)
        }
@@ -593,7 +599,10 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
        let b_type = b.get_type();
        let a_native = self.is_native_int_type_or_bool(a_type);
        let b_native = self.is_native_int_type_or_bool(b_type);
-        if a_native && b_native {
+        if a_type.is_vector() && b_type.is_vector() {
+            self.context.new_binary_op(None, operation, a_type, a, b)
+        }
+        else if a_native && b_native {
            if a_type != b_type {
                b = self.context.new_cast(None, b, a_type);
            }
@@ -639,6 +648,9 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
        else {
            // Since u128 and i128 are the only types that can be unsupported, we know the type of
            // value and the destination type have the same size, so a bitcast is fine.
+
+            // TODO(antoyo): perhaps use __builtin_convertvector for vector casting. (This is elsewhere,
+            // though.)
            self.context.new_bitcast(None, value, dest_typ)
        }
    }
--- a/src/intrinsic/llvm.rs
+++ b/src/intrinsic/llvm.rs
@ -3,20 +3,122 @@ use gccjit::Function;
 use crate::context::CodegenCx;

 pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function<'gcc> {
-    let _gcc_name =
+    let gcc_name = // name of the GCC builtin chosen for the LLVM intrinsic `name`
        match name {
-            "llvm.x86.xgetbv" => {
-                let gcc_name = "__builtin_trap";
-                let func = cx.context.get_builtin_function(gcc_name);
-                cx.functions.borrow_mut().insert(gcc_name.to_string(), func);
-                return func;
-            },
+            "llvm.x86.xgetbv" => "__builtin_ia32_xgetbv",
            // NOTE: this doc specifies the equivalent GCC builtins: http://huonw.github.io/llvmint/llvmint/x86/index.html
+            "llvm.x86.sse2.pmovmskb.128" => "__builtin_ia32_pmovmskb128",
+            "llvm.x86.avx2.pmovmskb" => "__builtin_ia32_pmovmskb256",
            "llvm.x86.sse2.cmp.pd" => "__builtin_ia32_cmppd",
            "llvm.x86.sse2.movmsk.pd" => "__builtin_ia32_movmskpd",
-            "llvm.x86.sse2.pmovmskb.128" => "__builtin_ia32_pmovmskb128",
-            _ => unimplemented!("unsupported LLVM intrinsic {}", name)
+            "llvm.x86.ssse3.pshuf.b.128" => "__builtin_ia32_pshufb128",
+            "llvm.x86.sse2.pause" => "__builtin_ia32_pause",
+            "llvm.x86.avx2.pshuf.b" => "__builtin_ia32_pshufb256",
+            "llvm.x86.avx2.pslli.d" => "__builtin_ia32_pslldi256",
+            "llvm.x86.avx2.psrli.d" => "__builtin_ia32_psrldi256",
+            "llvm.x86.avx.vzeroupper" => "__builtin_ia32_vzeroupper",
+            "llvm.x86.avx2.vperm2i128" => "__builtin_ia32_permti256",
+            "llvm.x86.avx2.psrli.w" => "__builtin_ia32_psrlwi256",
+            "llvm.x86.sse2.storeu.dq" => "__builtin_ia32_storedqu",
+            "llvm.x86.sse2.psrli.w" => "__builtin_ia32_psrlwi128",
+            "llvm.x86.avx2.pabs.d" => "__builtin_ia32_pabsd256",
+            "llvm.x86.sse2.psrli.q" => "__builtin_ia32_psrlqi128",
+            "llvm.x86.avx2.pabs.w" => "__builtin_ia32_pabsw256",
+            "llvm.x86.avx2.pblendvb" => "__builtin_ia32_pblendvb256",
+            "llvm.x86.avx2.pabs.b" => "__builtin_ia32_pabsb256",
+            "llvm.x86.avx2.psrli.q" => "__builtin_ia32_psrlqi256",
+            "llvm.x86.sse41.pblendvb" => "__builtin_ia32_pblendvb128",
+            "llvm.x86.avx2.pavg.w" => "__builtin_ia32_pavgw256",
+            "llvm.x86.avx2.pavg.b" => "__builtin_ia32_pavgb256",
+            "llvm.x86.avx2.phadd.w" => "__builtin_ia32_phaddw256",
+            "llvm.x86.avx2.phadd.d" => "__builtin_ia32_phaddd256",
+            "llvm.x86.avx2.phadd.sw" => "__builtin_ia32_phaddsw256",
+            "llvm.x86.avx2.phsub.w" => "__builtin_ia32_phsubw256",
+            "llvm.x86.avx2.phsub.d" => "__builtin_ia32_phsubd256",
+            "llvm.x86.avx2.phsub.sw" => "__builtin_ia32_phsubsw256",
+            "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gatherd_d",
+            "llvm.x86.avx2.gather.d.d.256" => "__builtin_ia32_gatherd_d256",
+            "llvm.x86.avx2.gather.d.ps" => "__builtin_ia32_gatherd_ps",
+            "llvm.x86.avx2.gather.d.ps.256" => "__builtin_ia32_gatherd_ps256",
+            "llvm.x86.avx2.gather.d.q" => "__builtin_ia32_gatherd_q",
+            "llvm.x86.avx2.gather.d.q.256" => "__builtin_ia32_gatherd_q256",
+            "llvm.x86.avx2.gather.d.pd" => "__builtin_ia32_gatherd_pd",
+            "llvm.x86.avx2.gather.d.pd.256" => "__builtin_ia32_gatherd_pd256",
+            "llvm.x86.avx2.gather.q.d" => "__builtin_ia32_gatherq_d",
+            "llvm.x86.avx2.gather.q.d.256" => "__builtin_ia32_gatherq_d256",
+            "llvm.x86.avx2.gather.q.ps" => "__builtin_ia32_gatherq_ps",
+            "llvm.x86.avx2.gather.q.ps.256" => "__builtin_ia32_gatherq_ps256",
+            "llvm.x86.avx2.gather.q.q" => "__builtin_ia32_gatherq_q",
+            "llvm.x86.avx2.gather.q.q.256" => "__builtin_ia32_gatherq_q256",
+            "llvm.x86.avx2.gather.q.pd" => "__builtin_ia32_gatherq_pd",
+            "llvm.x86.avx2.gather.q.pd.256" => "__builtin_ia32_gatherq_pd256",
+            "llvm.x86.avx2.pmadd.wd" => "__builtin_ia32_pmaddwd256",
+            "llvm.x86.avx2.pmadd.ub.sw" => "__builtin_ia32_pmaddubsw256",
+            "llvm.x86.avx2.maskload.d" => "__builtin_ia32_maskloadd",
+            "llvm.x86.avx2.maskload.d.256" => "__builtin_ia32_maskloadd256",
+            "llvm.x86.avx2.maskload.q" => "__builtin_ia32_maskloadq",
+            "llvm.x86.avx2.maskload.q.256" => "__builtin_ia32_maskloadq256",
+            "llvm.x86.avx2.maskstore.d" => "__builtin_ia32_maskstored",
+            "llvm.x86.avx2.maskstore.d.256" => "__builtin_ia32_maskstored256",
+            "llvm.x86.avx2.maskstore.q" => "__builtin_ia32_maskstoreq",
+            "llvm.x86.avx2.maskstore.q.256" => "__builtin_ia32_maskstoreq256",
+            "llvm.x86.avx2.pmaxs.w" => "__builtin_ia32_pmaxsw256",
+            "llvm.x86.avx2.pmaxs.d" => "__builtin_ia32_pmaxsd256",
+            "llvm.x86.avx2.pmaxs.b" => "__builtin_ia32_pmaxsb256",
+            "llvm.x86.avx2.pmaxu.w" => "__builtin_ia32_pmaxuw256",
+            "llvm.x86.avx2.pmaxu.d" => "__builtin_ia32_pmaxud256",
+            "llvm.x86.avx2.pmaxu.b" => "__builtin_ia32_pmaxub256",
+            "llvm.x86.avx2.pmins.w" => "__builtin_ia32_pminsw256",
+            "llvm.x86.avx2.pmins.d" => "__builtin_ia32_pminsd256",
+            "llvm.x86.avx2.pmins.b" => "__builtin_ia32_pminsb256",
+            "llvm.x86.avx2.pminu.w" => "__builtin_ia32_pminuw256",
+            "llvm.x86.avx2.pminu.d" => "__builtin_ia32_pminud256",
+            "llvm.x86.avx2.pminu.b" => "__builtin_ia32_pminub256",
+            "llvm.x86.avx2.mpsadbw" => "__builtin_ia32_mpsadbw256",
+            "llvm.x86.avx2.pmul.dq" => "__builtin_ia32_pmuldq256",
+            "llvm.x86.avx2.pmulu.dq" => "__builtin_ia32_pmuludq256",
+            "llvm.x86.avx2.pmulh.w" => "__builtin_ia32_pmulhw256",
+            "llvm.x86.avx2.pmulhu.w" => "__builtin_ia32_pmulhuw256",
+            "llvm.x86.avx2.pmul.hr.sw" => "__builtin_ia32_pmulhrsw256",
+            "llvm.x86.avx2.packsswb" => "__builtin_ia32_packsswb256",
+            "llvm.x86.avx2.packssdw" => "__builtin_ia32_packssdw256",
+            "llvm.x86.avx2.packuswb" => "__builtin_ia32_packuswb256",
+            "llvm.x86.avx2.packusdw" => "__builtin_ia32_packusdw256",
+            "llvm.x86.avx2.permd" => "__builtin_ia32_permvarsi256",
+            "llvm.x86.avx2.permps" => "__builtin_ia32_permvarsf256",
+            "llvm.x86.avx2.psad.bw" => "__builtin_ia32_psadbw256",
+            "llvm.x86.avx2.psign.w" => "__builtin_ia32_psignw256",
+            "llvm.x86.avx2.psign.d" => "__builtin_ia32_psignd256",
+            "llvm.x86.avx2.psign.b" => "__builtin_ia32_psignb256",
+            "llvm.x86.avx2.psll.w" => "__builtin_ia32_psllw256",
+            "llvm.x86.avx2.psll.d" => "__builtin_ia32_pslld256",
+            "llvm.x86.avx2.psll.q" => "__builtin_ia32_psllq256",
+            "llvm.x86.avx2.pslli.w" => "__builtin_ia32_psllwi256",
+            "llvm.x86.avx2.pslli.q" => "__builtin_ia32_psllqi256",
+            "llvm.x86.avx2.psllv.d" => "__builtin_ia32_psllv4si",
+            "llvm.x86.avx2.psllv.d.256" => "__builtin_ia32_psllv8si",
+            "llvm.x86.avx2.psllv.q" => "__builtin_ia32_psllv2di",
+            "llvm.x86.avx2.psllv.q.256" => "__builtin_ia32_psllv4di",
+            "llvm.x86.avx2.psra.w" => "__builtin_ia32_psraw256",
+            "llvm.x86.avx2.psra.d" => "__builtin_ia32_psrad256",
+            "llvm.x86.avx2.psrai.w" => "__builtin_ia32_psrawi256",
+            "llvm.x86.avx2.psrai.d" => "__builtin_ia32_psradi256",
+            "llvm.x86.avx2.psrav.d" => "__builtin_ia32_psrav4si",
+            "llvm.x86.avx2.psrav.d.256" => "__builtin_ia32_psrav8si",
+            "llvm.x86.avx2.psrl.w" => "__builtin_ia32_psrlw256",
+            "llvm.x86.avx2.psrl.d" => "__builtin_ia32_psrld256",
+            "llvm.x86.avx2.psrl.q" => "__builtin_ia32_psrlq256",
+            "llvm.x86.avx2.psrlv.d" => "__builtin_ia32_psrlv4si",
+            "llvm.x86.avx2.psrlv.d.256" => "__builtin_ia32_psrlv8si",
+            "llvm.x86.avx2.psrlv.q" => "__builtin_ia32_psrlv2di",
+            "llvm.x86.avx2.psrlv.q.256" => "__builtin_ia32_psrlv4di",
+            "llvm.x86.sse.sqrt.ss" => "__builtin_ia32_sqrtss",
+
+            "llvm.sqrt.v2f64" => "__builtin_ia32_sqrtpd", // "llvm.<op>.v<len><elem>" form, the shape simd_simple_float_intrinsic builds
+            _ => unimplemented!("***** unsupported LLVM intrinsic {}", name), // any name missing from the table panics here
        };

-    unimplemented!();
+    let func = cx.context.get_target_builtin_function(gcc_name); // fetched through the target-builtin lookup of the gccjit context
+    cx.functions.borrow_mut().insert(gcc_name.to_string(), func); // recorded in cx.functions, keyed by the GCC name (not the LLVM one)
+    func
 }
--- a/src/intrinsic/simd.rs
+++ b/src/intrinsic/simd.rs
@ -1,4 +1,6 @@
-use gccjit::{RValue, Type};
+use std::cmp::Ordering;
+
+use gccjit::{RValue, Type, ToRValue};
 use rustc_codegen_ssa::base::compare_simd_types;
 use rustc_codegen_ssa::common::{TypeKind, span_invalid_monomorphization_error};
 use rustc_codegen_ssa::mir::operand::OperandRef;
@ -10,6 +12,7 @@ use rustc_middle::ty::{self, Ty};
 use rustc_span::{Span, Symbol, sym};

 use crate::builder::Builder;
+use crate::intrinsic;

 pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>, name: Symbol, callee_ty: Ty<'tcx>, args: &[OperandRef<'tcx, RValue<'gcc>>], ret_ty: Ty<'tcx>, llret_ty: Type<'gcc>, span: Span) -> Result<RValue<'gcc>, ()> {
    // macros for error handling:
@ -100,9 +103,27 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
    }

    if let Some(stripped) = name_str.strip_prefix("simd_shuffle") {
-        let n: u64 = stripped.parse().unwrap_or_else(|_| {
-            span_bug!(span, "bad `simd_shuffle` instruction only caught in codegen?")
-        });
+        let n: u64 =
+            if stripped.is_empty() {
+                // Make sure this is actually an array, since typeck only checks the length-suffixed
+                // version of this intrinsic.
+                match args[2].layout.ty.kind() {
+                    ty::Array(ty, len) if matches!(ty.kind(), ty::Uint(ty::UintTy::U32)) => {
+                        len.try_eval_usize(bx.cx.tcx, ty::ParamEnv::reveal_all()).unwrap_or_else(|| {
+                            span_bug!(span, "could not evaluate shuffle index array length")
+                        })
+                    }
+                    _ => return_error!(
+                        "simd_shuffle index must be an array of `u32`, got `{}`",
+                        args[2].layout.ty
+                    ),
+                }
+            }
+            else {
+                stripped.parse().unwrap_or_else(|_| {
+                    span_bug!(span, "bad `simd_shuffle` instruction only caught in codegen?")
+                })
+            };

        require_simd!(ret_ty, "return");

@ -133,6 +154,202 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
        ));
    }

+    if name == sym::simd_insert {
+        require!(
+            in_elem == arg_tys[2],
+            "expected inserted type `{}` (element of input `{}`), found `{}`",
+            in_elem,
+            in_ty,
+            arg_tys[2]
+        );
+        let vector = args[0].immediate();
+        let index = args[1].immediate();
+        let value = args[2].immediate();
+        // TODO(antoyo): use a recursive unqualified() here.
+        let vector_type = vector.get_type().unqualified().dyncast_vector().expect("vector type");
+        let element_type = vector_type.get_element_type();
+        // NOTE: we cannot cast to an array and assign to its element here because the value might
+        // not be an l-value. So, call a builtin to set the element.
+        // TODO(antoyo): perhaps we could create a new vector or maybe there's a GIMPLE instruction for that?
+        let func_name =
+            match in_len {
+                2 => {
+                    if element_type == bx.i64_type {
+                        "__builtin_ia32_vec_set_v2di"
+                    }
+                    else {
+                        unimplemented!();
+                    }
+                },
+                4 => {
+                    if element_type == bx.i32_type {
+                        "__builtin_ia32_vec_set_v4si"
+                    }
+                    else {
+                        unimplemented!();
+                    }
+                },
+                8 => {
+                    if element_type == bx.i16_type {
+                        "__builtin_ia32_vec_set_v8hi"
+                    }
+                    else {
+                        unimplemented!();
+                    }
+                },
+                _ => unimplemented!("Len: {}", in_len),
+            };
+        let builtin = bx.context.get_target_builtin_function(func_name);
+        let param1_type = builtin.get_param(0).to_rvalue().get_type();
+        let vector =
+            if vector.get_type() != param1_type {
+                bx.context.new_bitcast(None, vector, param1_type)
+            }
+            else {
+                vector
+            };
+        let result = bx.context.new_call(None, builtin, &[vector, value, bx.context.new_cast(None, index, bx.int_type)]);
+        return Ok(bx.context.new_bitcast(None, result, vector.get_type()));
+    }
+    if name == sym::simd_extract {
+        require!(
+            ret_ty == in_elem,
+            "expected return type `{}` (element of input `{}`), found `{}`",
+            in_elem,
+            in_ty,
+            ret_ty
+        );
+        let vector = args[0].immediate();
+        return Ok(bx.context.new_vector_access(None, vector, args[1].immediate()).to_rvalue());
+    }
+
+    if name == sym::simd_cast {
+        require_simd!(ret_ty, "return");
+        let (out_len, out_elem) = ret_ty.simd_size_and_type(bx.tcx());
+        require!(
+            in_len == out_len,
+            "expected return type with length {} (same as input type `{}`), \
+                  found `{}` with length {}",
+            in_len,
+            in_ty,
+            ret_ty,
+            out_len
+        );
+        // casting cares about nominal type, not just structural type
+        if in_elem == out_elem {
+            return Ok(args[0].immediate());
+        }
+
+        enum Style {
+            Float,
+            Int(/* is signed? */ bool),
+            Unsupported,
+        }
+
+        let (in_style, in_width) = match in_elem.kind() {
+            // vectors of pointer-sized integers should've been
+            // disallowed before here, so this unwrap is safe.
+            ty::Int(i) => (
+                Style::Int(true),
+                i.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
+            ),
+            ty::Uint(u) => (
+                Style::Int(false),
+                u.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
+            ),
+            ty::Float(f) => (Style::Float, f.bit_width()),
+            _ => (Style::Unsupported, 0),
+        };
+        let (out_style, out_width) = match out_elem.kind() {
+            ty::Int(i) => (
+                Style::Int(true),
+                i.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
+            ),
+            ty::Uint(u) => (
+                Style::Int(false),
+                u.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
+            ),
+            ty::Float(f) => (Style::Float, f.bit_width()),
+            _ => (Style::Unsupported, 0),
+        };
+
+        let extend = |in_type, out_type| {
+            let vector_type = bx.context.new_vector_type(out_type, 8);
+            let vector = args[0].immediate();
+            let array_type = bx.context.new_array_type(None, in_type, 8);
+            let array = bx.context.new_bitcast(None, vector, array_type);
+
+            let cast_vec_element = |index| {
+                let index = bx.context.new_rvalue_from_int(bx.int_type, index);
+                bx.context.new_cast(None, bx.context.new_array_access(None, array, index).to_rvalue(), out_type)
+            };
+
+            bx.context.new_rvalue_from_vector(None, vector_type, &[
+                cast_vec_element(0),
+                cast_vec_element(1),
+                cast_vec_element(2),
+                cast_vec_element(3),
+                cast_vec_element(4),
+                cast_vec_element(5),
+                cast_vec_element(6),
+                cast_vec_element(7),
+            ])
+        };
+
+        match (in_style, out_style) {
+            (Style::Int(in_is_signed), Style::Int(_)) => {
+                return Ok(match in_width.cmp(&out_width) {
+                    Ordering::Greater => bx.trunc(args[0].immediate(), llret_ty),
+                    Ordering::Equal => args[0].immediate(),
+                    Ordering::Less => {
+                        if in_is_signed {
+                            match (in_width, out_width) {
+                                // FIXME(antoyo): the function _mm_cvtepi8_epi16 should directly
+                                // call an intrinsic equivalent to __builtin_ia32_pmovsxbw128 so that
+                                // we can generate a call to it.
+                                (8, 16) => extend(bx.i8_type, bx.i16_type),
+                                (8, 32) => extend(bx.i8_type, bx.i32_type),
+                                (8, 64) => extend(bx.i8_type, bx.i64_type),
+                                (16, 32) => extend(bx.i16_type, bx.i32_type),
+                                (32, 64) => extend(bx.i32_type, bx.i64_type),
+                                (16, 64) => extend(bx.i16_type, bx.i64_type),
+                                _ => unimplemented!("in: {}, out: {}", in_width, out_width),
+                            }
+                        } else {
+                            match (in_width, out_width) {
+                                (8, 16) => extend(bx.u8_type, bx.u16_type),
+                                (8, 32) => extend(bx.u8_type, bx.u32_type),
+                                (8, 64) => extend(bx.u8_type, bx.u64_type),
+                                (16, 32) => extend(bx.u16_type, bx.u32_type),
+                                (16, 64) => extend(bx.u16_type, bx.u64_type),
+                                (32, 64) => extend(bx.u32_type, bx.u64_type),
+                                _ => unimplemented!("in: {}, out: {}", in_width, out_width),
+                            }
+                        }
+                    }
+                });
+            }
+            (Style::Int(_), Style::Float) => {
+                unimplemented!();
+            }
+            (Style::Float, Style::Int(_)) => {
+                unimplemented!();
+            }
+            (Style::Float, Style::Float) => {
+                unimplemented!();
+            }
+            _ => { /* Unsupported. Fallthrough. */ }
+        }
+        require!(
+            false,
+            "unsupported cast from `{}` with element `{}` to `{}` with element `{}`",
+            in_ty,
+            in_elem,
+            ret_ty,
+            out_elem
+        );
+    }
+
    macro_rules! arith_binary {
        ($($name: ident: $($($p: ident),* => $call: ident),*;)*) => {
            $(if name == sym::$name {
@ -150,6 +367,105 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
        }
    }

+    fn simd_simple_float_intrinsic<'gcc, 'tcx>( // Emits a call to the "llvm.<op>.v<len><elem>" function selected by the float SIMD intrinsic `name`.
+        name: Symbol,
+        in_elem: Ty<'_>,
+        in_ty: Ty<'_>,
+        in_len: u64,
+        bx: &mut Builder<'_, 'gcc, 'tcx>,
+        span: Span,
+        args: &[OperandRef<'tcx, RValue<'gcc>>],
+    ) -> Result<RValue<'gcc>, ()> {
+        macro_rules! emit_error { // reports an "invalid monomorphization of `name` intrinsic" error at `span`
+            ($msg: tt) => {
+                emit_error!($msg, )
+            };
+            ($msg: tt, $($fmt: tt)*) => {
+                span_invalid_monomorphization_error(
+                    bx.sess(), span,
+                    &format!(concat!("invalid monomorphization of `{}` intrinsic: ", $msg),
+                             name, $($fmt)*));
+            }
+        }
+        macro_rules! return_error { // emit_error! followed by `return Err(())`
+            ($($fmt: tt)*) => {
+                {
+                    emit_error!($($fmt)*);
+                    return Err(());
+                }
+            }
+        }
+
+        let (elem_ty_str, elem_ty) = // ("f32" | "f64" suffix used in the symbol name, backend type of one element)
+            if let ty::Float(f) = in_elem.kind() {
+                let elem_ty = bx.cx.type_float_from_ty(*f);
+                match f.bit_width() {
+                    32 => ("f32", elem_ty),
+                    64 => ("f64", elem_ty),
+                    _ => { // any other float width is reported as an error, not a panic
+                        return_error!(
+                            "unsupported element type `{}` of floating-point vector `{}`",
+                            f.name_str(),
+                            in_ty
+                        );
+                    }
+                }
+            }
+            else { // non-float element types are rejected with an error
+                return_error!("`{}` is not a floating-point type", in_ty);
+            };
+
+        let vec_ty = bx.cx.type_vector(elem_ty, in_len);
+
+        let (intr_name, fn_ty) = // (operation name placed in the "llvm.*" symbol, type of the function that gets called)
+            match name {
+                sym::simd_ceil => ("ceil", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_fabs => ("fabs", bx.type_func(&[vec_ty], vec_ty)), // TODO(antoyo): pand with 170141183420855150465331762880109871103
+                sym::simd_fcos => ("cos", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_fexp2 => ("exp2", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_fexp => ("exp", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_flog10 => ("log10", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_flog2 => ("log2", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_flog => ("log", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_floor => ("floor", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_fma => ("fma", bx.type_func(&[vec_ty, vec_ty, vec_ty], vec_ty)),
+                sym::simd_fpowi => ("powi", bx.type_func(&[vec_ty, bx.type_i32()], vec_ty)), // second parameter is a scalar i32, not a vector
+                sym::simd_fpow => ("pow", bx.type_func(&[vec_ty, vec_ty], vec_ty)),
+                sym::simd_fsin => ("sin", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_fsqrt => ("sqrt", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_round => ("round", bx.type_func(&[vec_ty], vec_ty)),
+                sym::simd_trunc => ("trunc", bx.type_func(&[vec_ty], vec_ty)),
+                _ => return_error!("unrecognized intrinsic `{}`", name),
+            };
+        let llvm_name = &format!("llvm.{0}.v{1}{2}", intr_name, in_len, elem_ty_str); // e.g. "llvm.sqrt.v2f64"
+        let function = intrinsic::llvm::intrinsic(llvm_name, &bx.cx); // NOTE(review): that table only lists "llvm.sqrt.v2f64" in this form; every other op/len/elem combination hits its unimplemented!() arm
+        let function: RValue<'gcc> = unsafe { std::mem::transmute(function) }; // NOTE(review): no SAFETY justification; presumably relies on gccjit Function and RValue sharing a layout — TODO confirm
+        let c = bx.call(fn_ty, function, &args.iter().map(|arg| arg.immediate()).collect::<Vec<_>>(), None); // every operand is passed as its immediate value
+        Ok(c)
+    }
+
+    if std::matches!(
+        name,
+        sym::simd_ceil
+            | sym::simd_fabs
+            | sym::simd_fcos
+            | sym::simd_fexp2
+            | sym::simd_fexp
+            | sym::simd_flog10
+            | sym::simd_flog2
+            | sym::simd_flog
+            | sym::simd_floor
+            | sym::simd_fma
+            | sym::simd_fpow
+            | sym::simd_fpowi
+            | sym::simd_fsin
+            | sym::simd_fsqrt
+            | sym::simd_round
+            | sym::simd_trunc
+    ) {
+        return simd_simple_float_intrinsic(name, in_elem, in_ty, in_len, bx, span, args);
+    }
+
    arith_binary! {
        simd_add: Uint, Int => add, Float => fadd;
        simd_sub: Uint, Int => sub, Float => fsub;
@ -184,5 +500,41 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(bx: &mut Builder<'a, 'gcc, 'tcx>,
        simd_neg: Int => neg, Float => fneg;
    }

+    if name == sym::simd_saturating_add || name == sym::simd_saturating_sub {
+        let lhs = args[0].immediate();
+        let rhs = args[1].immediate();
+        let is_add = name == sym::simd_saturating_add;
+        let ptr_bits = bx.tcx().data_layout.pointer_size.bits() as _;
+        let (signed, elem_width, elem_ty) = match *in_elem.kind() {
+            ty::Int(i) => (true, i.bit_width().unwrap_or(ptr_bits), bx.cx.type_int_from_ty(i)),
+            ty::Uint(i) => (false, i.bit_width().unwrap_or(ptr_bits), bx.cx.type_uint_from_ty(i)),
+            _ => {
+                return_error!(
+                    "expected element type `{}` of vector type `{}` \
+                     to be a signed or unsigned integer type",
+                    arg_tys[0].simd_size_and_type(bx.tcx()).1,
+                    arg_tys[0]
+                );
+            }
+        };
+        let builtin_name =
+            match (signed, is_add, in_len, elem_width) {
+                (true, true, 32, 8) => "__builtin_ia32_paddsb256", // TODO(antoyo): cast arguments to unsigned.
+                (false, true, 32, 8) => "__builtin_ia32_paddusb256",
+                (true, true, 16, 16) => "__builtin_ia32_paddsw256",
+                (false, true, 16, 16) => "__builtin_ia32_paddusw256",
+                (true, false, 16, 16) => "__builtin_ia32_psubsw256",
+                (false, false, 16, 16) => "__builtin_ia32_psubusw256",
+                (true, false, 32, 8) => "__builtin_ia32_psubsb256",
+                (false, false, 32, 8) => "__builtin_ia32_psubusb256",
+                _ => unimplemented!("signed: {}, is_add: {}, in_len: {}, elem_width: {}", signed, is_add, in_len, elem_width),
+            };
+        let vec_ty = bx.cx.type_vector(elem_ty, in_len as u64);
+
+        let func = bx.context.get_target_builtin_function(builtin_name);
+        let result = bx.context.new_call(None, func, &[lhs, rhs]);
+        return Ok(bx.context.new_bitcast(None, result, vec_ty));
+    }
+
    unimplemented!("simd {}", name);
 }
--- a/src/lib.rs
+++ b/src/lib.rs
@ -205,7 +205,7 @@ impl WriteBackendMethods for GccCodegenBackend {
    fn run_fat_lto(_cgcx: &CodegenContext<Self>, mut modules: Vec<FatLTOInput<Self>>, _cached_modules: Vec<(SerializedModule<Self::ModuleBuffer>, WorkProduct)>) -> Result<LtoModuleCodegen<Self>, FatalError> {
        // TODO(antoyo): implement LTO by sending -flto to libgccjit and adding the appropriate gcc linker plugins.
        // NOTE: implemented elsewhere.
-        // TODO: what is implemented elsewhere ^ ?
+        // TODO(antoyo): what is implemented elsewhere ^ ?
        let module =
            match modules.remove(0) {
                FatLTOInput::InMemory(module) => module,
@ -299,9 +299,17 @@ pub fn target_features(sess: &Session) -> Vec<Symbol> {
                if sess.is_nightly_build() || gate.is_none() { Some(feature) } else { None }
            },
        )
-        .filter(|_feature| {
+        .filter(|feature| {
            // TODO(antoyo): implement a way to get enabled feature in libgccjit.
-            false
+            // Probably using the equivalent of __builtin_cpu_supports.
+            feature.contains("sse") || feature.contains("avx")
+            /*
+               adx, aes, avx, avx2, avx512bf16, avx512bitalg, avx512bw, avx512cd, avx512dq, avx512er, avx512f, avx512gfni,
+               avx512ifma, avx512pf, avx512vaes, avx512vbmi, avx512vbmi2, avx512vl, avx512vnni, avx512vp2intersect, avx512vpclmulqdq,
+               avx512vpopcntdq, bmi1, bmi2, cmpxchg16b, ermsb, f16c, fma, fxsr, lzcnt, movbe, pclmulqdq, popcnt, rdrand, rdseed, rtm,
+               sha, sse, sse2, sse3, sse4.1, sse4.2, sse4a, ssse3, tbm, xsave, xsavec, xsaveopt, xsaves
+             */
+            //false
        })
        .map(|feature| Symbol::intern(feature))
        .collect()
--- a/src/type_.rs
+++ b/src/type_.rs
@ -3,10 +3,11 @@ use std::convert::TryInto;
 use gccjit::{RValue, Struct, Type};
 use rustc_codegen_ssa::traits::{BaseTypeMethods, DerivedTypeMethods};
 use rustc_codegen_ssa::common::TypeKind;
-use rustc_middle::bug;
+use rustc_middle::{bug, ty};
 use rustc_middle::ty::layout::TyAndLayout;
 use rustc_target::abi::{AddressSpace, Align, Integer, Size};

+use crate::common::TypeReflection;
 use crate::context::CodegenCx;
 use crate::type_of::LayoutGccExt;

@ -60,6 +61,17 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
        let ity = Integer::approximate_align(self, align);
        self.type_from_integer(ity)
    }
+
+    pub fn type_vector(&self, ty: Type<'gcc>, len: u64) -> Type<'gcc> { // Vector type built from element type `ty` and length `len`.
+        self.context.new_vector_type(ty, len) // plain forward to the gccjit context; no caching or validation here
+    }
+
+    pub fn type_float_from_ty(&self, t: ty::FloatTy) -> Type<'gcc> { // Backend float type for a rustc `FloatTy`.
+        match t { // no wildcard arm: every FloatTy variant is listed explicitly
+            ty::FloatTy::F32 => self.type_f32(),
+            ty::FloatTy::F64 => self.type_f64(),
+        }
+    }
 }

 impl<'gcc, 'tcx> BaseTypeMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
@ -127,7 +139,7 @@ impl<'gcc, 'tcx> BaseTypeMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
        else if typ.is_compatible_with(self.double_type) {
            TypeKind::Double
        }
-        else if typ.dyncast_vector().is_some() {
+        else if typ.is_vector() {
            TypeKind::Vector
        }
        else {
@ -141,7 +153,7 @@ impl<'gcc, 'tcx> BaseTypeMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
    }

    fn type_ptr_to_ext(&self, ty: Type<'gcc>, _address_space: AddressSpace) -> Type<'gcc> {
-        // TODO(antoyo): use address_space
+        // TODO(antoyo): use address_space, perhaps with TYPE_ADDR_SPACE? Until then the argument is ignored and a plain pointer to `ty` is returned.
        ty.make_pointer()
    }

@ -167,10 +179,10 @@ impl<'gcc, 'tcx> BaseTypeMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
    fn float_width(&self, typ: Type<'gcc>) -> usize {
        let f32 = self.context.new_type::<f32>();
        let f64 = self.context.new_type::<f64>();
-        if typ == f32 {
+        if typ.is_compatible_with(f32) {
            32
        }
-        else if typ == f64 {
+        else if typ.is_compatible_with(f64) {
            64
        }
        else {
--- a/src/type_of.rs
+++ b/src/type_of.rs
@ -24,6 +24,28 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
            I128 => self.type_u128(),
        }
    }
+
+    pub fn type_int_from_ty(&self, t: ty::IntTy) -> Type<'gcc> { // Backend integer type for a rustc signed `IntTy`.
+        match t { // no wildcard arm: every IntTy variant is listed explicitly
+            ty::IntTy::Isize => self.type_isize(),
+            ty::IntTy::I8 => self.type_i8(),
+            ty::IntTy::I16 => self.type_i16(),
+            ty::IntTy::I32 => self.type_i32(),
+            ty::IntTy::I64 => self.type_i64(),
+            ty::IntTy::I128 => self.type_i128(),
+        }
+    }
+
+    pub fn type_uint_from_ty(&self, t: ty::UintTy) -> Type<'gcc> { // Backend integer type for a rustc unsigned `UintTy`.
+        match t { // NOTE(review): every arm goes through the *signed* constructors (same calls as type_int_from_ty), although simd.rs uses separate u8_type/u16_type/... values — confirm this is intended
+            ty::UintTy::Usize => self.type_isize(),
+            ty::UintTy::U8 => self.type_i8(),
+            ty::UintTy::U16 => self.type_i16(),
+            ty::UintTy::U32 => self.type_i32(),
+            ty::UintTy::U64 => self.type_i64(),
+            ty::UintTy::U128 => self.type_i128(),
+        }
+    }
 }

 pub fn uncached_gcc_type<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, layout: TyAndLayout<'tcx>, defer: &mut Option<(Struct<'gcc>, TyAndLayout<'tcx>)>) -> Type<'gcc> {
--- a/test.sh
+++ b/test.sh
@ -97,25 +97,6 @@ function std_tests() {
 #echo "[BUILD] sysroot in release mode"
 #./build_sysroot/build_sysroot.sh --release

-# TODO(antoyo): uncomment when it works.
-#pushd simple-raytracer
-#if [[ "$HOST_TRIPLE" = "$TARGET_TRIPLE" ]]; then
-    #echo "[BENCH COMPILE] ebobby/simple-raytracer"
-    #hyperfine --runs ${RUN_RUNS:-10} --warmup 1 --prepare "rm -r target/*/debug || true" \
-    #"RUSTFLAGS='' cargo build --target $TARGET_TRIPLE" \
-    #"../cargo.sh build"
-
-    #echo "[BENCH RUN] ebobby/simple-raytracer"
-    #cp ./target/*/debug/main ./raytracer_cg_gccjit
-    #hyperfine --runs ${RUN_RUNS:-10} ./raytracer_cg_llvm ./raytracer_cg_gccjit
-#else
-    #echo "[BENCH COMPILE] ebobby/simple-raytracer (skipped)"
-    #echo "[COMPILE] ebobby/simple-raytracer"
-    #../cargo.sh build
-    #echo "[BENCH RUN] ebobby/simple-raytracer (skipped)"
-#fi
-#popd
-
 function test_libcore() {
    pushd build_sysroot/sysroot_src/library/core/tests
    echo "[TEST] libcore"
@ -124,19 +105,6 @@ function test_libcore() {
    popd
 }

-# TODO(antoyo): uncomment when it works.
-#pushd regex
-#echo "[TEST] rust-lang/regex example shootout-regex-dna"
-#../cargo.sh clean
-## Make sure `[codegen mono items] start` doesn't poison the diff
-#../cargo.sh build --example shootout-regex-dna
-#cat examples/regexdna-input.txt | ../cargo.sh run --example shootout-regex-dna | grep -v "Spawned thread" > res.txt
-#diff -u res.txt examples/regexdna-output.txt
-
-#echo "[TEST] rust-lang/regex tests"
-#../cargo.sh test --tests -- --exclude-should-panic --test-threads 1 -Zunstable-options
-#popd
-
 #echo
 #echo "[BENCH COMPILE] mod_bench"

@ -153,6 +121,40 @@ function test_libcore() {
 #echo "[BENCH RUN] mod_bench"
 #hyperfine --runs ${RUN_RUNS:-10} ./target/out/mod_bench{,_inline} ./target/out/mod_bench_llvm_*

+# Runs the tests of external crates through `../cargo.sh`: the `rand`
+# workspace tests, then the `regex` shootout-regex-dna example (its output is
+# diffed against the expected file) and the `regex` test suite.
+# Expects `rand/` and `regex/` directories in the current directory.
+# The simple-raytracer benchmark is kept below but is commented out.
+function extended_sysroot_tests() {
+    pushd rand
+    cargo clean
+    echo "[TEST] rust-random/rand"
+    ../cargo.sh test --workspace
+    popd
+
+    #pushd simple-raytracer
+    #echo "[BENCH COMPILE] ebobby/simple-raytracer"
+    #hyperfine --runs "${RUN_RUNS:-10}" --warmup 1 --prepare "cargo clean" \
+    #"RUSTC=rustc RUSTFLAGS='' cargo build" \
+    #"../cargo.sh build"
+
+    #echo "[BENCH RUN] ebobby/simple-raytracer"
+    #cp ./target/debug/main ./raytracer_cg_gcc
+    #hyperfine --runs "${RUN_RUNS:-10}" ./raytracer_cg_llvm ./raytracer_cg_gcc
+    #popd
+
+    pushd regex
+    echo "[TEST] rust-lang/regex example shootout-regex-dna"
+    cargo clean
+    # NOTE(review): this export is never undone, so CG_RUSTFLAGS stays set for
+    # everything that runs later in the same shell, and any value the caller
+    # had already put in CG_RUSTFLAGS is overwritten. Confirm this is intended.
+    export CG_RUSTFLAGS="--cap-lints warn" # newer aho_corasick versions throw a deprecation warning
+    # Make sure `[codegen mono items] start` doesn't poison the diff
+    ../cargo.sh build --example shootout-regex-dna
+    # Feed the sample input to the example, drop the "Spawned thread" lines
+    # from its output, and store the rest in res.txt for the comparison below.
+    cat examples/regexdna-input.txt \
+        | ../cargo.sh run --example shootout-regex-dna \
+        | grep -v "Spawned thread" > res.txt
+    # diff exits non-zero when the produced output differs from the expected one.
+    diff -u res.txt examples/regexdna-output.txt
+
+    echo "[TEST] rust-lang/regex tests"
+    ../cargo.sh test --tests -- --exclude-should-panic --test-threads 1 -Zunstable-options -q
+    popd
+}
+
 function test_rustc() {
    echo
    echo "[TEST] rust-lang/rust"
@ -165,23 +167,7 @@ function test_rustc() {
    git checkout $(rustc -V | cut -d' ' -f3 | tr -d '(')
    export RUSTFLAGS=

-    git apply - <<EOF
-diff --git a/src/tools/compiletest/src/header.rs b/src/tools/compiletest/src/header.rs
-index 887d27fd6dca4..2c2239f2b83d1 100644
--- a/src/tools/compiletest/src/header.rs
-+++ b/src/tools/compiletest/src/header.rs
-@@ -806,8 +806,8 @@ pub fn make_test_description<R: Read>(
-     cfg: Option<&str>,
- ) -> test::TestDesc {
-     let mut ignore = false;
-     #[cfg(not(bootstrap))]
-    let ignore_message: Option<String> = None;
-+    let ignore_message: Option<&str> = None;
-     let mut should_fail = false;
-
-     let rustc_has_profiler_support = env::var_os("RUSTC_PROFILER_SUPPORT").is_some();
-
-EOF
+    git apply ../rustc_patches/compile_test.patch || true

    rm config.toml || true

@ -205,7 +191,7 @@ EOF

    git checkout -- src/test/ui/issues/auxiliary/issue-3136-a.rs # contains //~ERROR, but shouldn't be removed

-    rm -r src/test/ui/{abi*,extern/,panic-runtime/,panics/,unsized-locals/,proc-macro/,threads-sendsync/,thinlto/,simd*,borrowck/,test*,*lto*.rs} || true
+    rm -r src/test/ui/{abi*,extern/,panic-runtime/,panics/,unsized-locals/,proc-macro/,threads-sendsync/,thinlto/,borrowck/,test*,*lto*.rs} || true
    for test in $(rg --files-with-matches "catch_unwind|should_panic|thread|lto" src/test/ui); do
      rm $test
    done
@ -239,6 +225,10 @@ case $1 in
        std_tests
        ;;

+    "--extended-tests")
+        extended_sysroot_tests
+        ;;
+
    "--build-sysroot")
        build_sysroot
        ;;
@ -249,6 +239,7 @@ case $1 in
        build_sysroot
        std_tests
        test_libcore
+        extended_sysroot_tests
        test_rustc
        ;;
 esac