Merge commit '08a6d6e16b5efe217123e780398969946266268f' into sync-cg_gcc-2023-03-04

2024-11-21 22:34:05 +00:00 · 2023-03-05 12:03:19 -05:00 · 2023-03-05 12:03:19 -05:00 · 6bb2af0e6d
commit 6bb2af0e6d
parent f15f0ea739 08a6d6e16b
61 changed files with 5730 additions and 1123 deletions
--- a/compiler/rustc_codegen_gcc/.github/workflows/ci.yml
+++ b/compiler/rustc_codegen_gcc/.github/workflows/ci.yml
@ -4,36 +4,72 @@ on:
  - push
  - pull_request

+permissions:
+  contents: read
+
+env:
+  # Enable backtraces for easier debugging
+  RUST_BACKTRACE: 1
+
 jobs:
  build:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04

    strategy:
      fail-fast: false
      matrix:
-        libgccjit_version: ["libgccjit.so", "libgccjit_without_int128.so", "libgccjit12.so"]
+        libgccjit_version:
+          - { gcc: "libgccjit.so", extra: "", env_extra: "", artifacts_branch: "master" }
+          - { gcc: "libgccjit_without_int128.so", extra: "", env_extra: "", artifacts_branch: "master-without-128bit-integers" }
+          - { gcc: "libgccjit12.so", extra: "--no-default-features", env_extra: "TEST_FLAGS='-Cpanic=abort -Zpanic-abort-tests'", artifacts_branch: "gcc12" }
+        commands: [
+          "--mini-tests",
+          "--std-tests",
+          # FIXME: re-enable asm tests when GCC can emit in the right syntax.
+          # "--asm-tests",
+          "--test-libcore",
+          "--extended-rand-tests",
+          "--extended-regex-example-tests",
+          "--extended-regex-tests",
+          "--test-successful-rustc --nb-parts 2 --current-part 0",
+          "--test-successful-rustc --nb-parts 2 --current-part 1",
+          "--test-failing-rustc",
+        ]

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3

-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
      with:
        repository: llvm/llvm-project
        path: llvm

    - name: Install packages
-      run: sudo apt-get install ninja-build ripgrep
+      # `llvm-14-tools` is needed to install the `FileCheck` binary which is used for asm tests.
+      run: sudo apt-get install ninja-build ripgrep llvm-14-tools
+
+    - name: Install libgccjit12
+      if: matrix.libgccjit_version.gcc == 'libgccjit12.so'
+      run: sudo apt-get install libgccjit-12-dev

    - name: Download artifact
+      if: matrix.libgccjit_version.gcc != 'libgccjit12.so'
      uses: dawidd6/action-download-artifact@v2
      with:
          workflow: main.yml
-          name: ${{ matrix.libgccjit_version }}
+          name: ${{ matrix.libgccjit_version.gcc }}
          path: gcc-build
          repo: antoyo/gcc
+          branch: ${{ matrix.libgccjit_version.artifacts_branch }}
+          event: push
          search_artifacts: true # Because, otherwise, the action only checks the last job run, and that won't work since we want multiple artifacts.

    - name: Setup path to libgccjit
+      if: matrix.libgccjit_version.gcc == 'libgccjit12.so'
+      run: echo /usr/lib/gcc/x86_64-linux-gnu/12 > gcc_path
+
+    - name: Setup path to libgccjit
+      if: matrix.libgccjit_version.gcc != 'libgccjit12.so'
      run: |
          echo $(readlink -f gcc-build) > gcc_path
          # NOTE: the filename is still libgccjit.so even when the artifact name is different.
@ -48,49 +84,44 @@ jobs:
    - name: Set RUST_COMPILER_RT_ROOT
      run: echo "RUST_COMPILER_RT_ROOT="${{ env.workspace }}/llvm/compiler-rt >> $GITHUB_ENV

-    # https://github.com/actions/cache/issues/133
-    - name: Fixup owner of ~/.cargo/
-      # Don't remove the trailing /. It is necessary to follow the symlink.
-      run: sudo chown -R $(whoami):$(id -ng) ~/.cargo/
-
    - name: Cache cargo installed crates
-      uses: actions/cache@v1.1.2
+      uses: actions/cache@v3
      with:
        path: ~/.cargo/bin
        key: cargo-installed-crates2-ubuntu-latest

    - name: Cache cargo registry
-      uses: actions/cache@v1
+      uses: actions/cache@v3
      with:
        path: ~/.cargo/registry
        key: ${{ runner.os }}-cargo-registry2-${{ hashFiles('**/Cargo.lock') }}

    - name: Cache cargo index
-      uses: actions/cache@v1
+      uses: actions/cache@v3
      with:
        path: ~/.cargo/git
        key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}

    - name: Cache cargo target dir
-      uses: actions/cache@v1.1.2
+      uses: actions/cache@v3
      with:
        path: target
        key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('rust-toolchain') }}

-    - name: Build
-      if: matrix.libgccjit_version != 'libgccjit12.so'
-      run: |
-        ./prepare_build.sh
-        ./build.sh
-        cargo test
-        ./clean_all.sh
+    #- name: Cache rust repository
+      ## We only clone the rust repository for rustc tests
+      #if: ${{ contains(matrix.commands, 'rustc') }}
+      #uses: actions/cache@v3
+      #id: cache-rust-repository
+      #with:
+        #path: rust
+        #key: ${{ runner.os }}-packages-${{ hashFiles('rust/.git/HEAD') }}

    - name: Build
-      if: matrix.libgccjit_version == 'libgccjit12.so'
      run: |
        ./prepare_build.sh
-        ./build.sh --no-default-features
-        cargo test --no-default-features
+        ${{ matrix.libgccjit_version.env_extra }} ./build.sh ${{ matrix.libgccjit_version.extra }}
+        ${{ matrix.libgccjit_version.env_extra }} cargo test ${{ matrix.libgccjit_version.extra }}
        ./clean_all.sh

    - name: Prepare dependencies
@ -106,26 +137,16 @@ jobs:
        command: build
        args: --release

-    - name: Test
-      if: matrix.libgccjit_version != 'libgccjit12.so'
+    - name: Add more failing tests for GCC 12
+      if: ${{ matrix.libgccjit_version.gcc == 'libgccjit12.so' }}
+      run: cat failing-ui-tests12.txt >> failing-ui-tests.txt
+
+    - name: Run tests
      run: |
-        # Enable backtraces for easier debugging
-        export RUST_BACKTRACE=1
+        ${{ matrix.libgccjit_version.env_extra }} ./test.sh --release --clean --build-sysroot ${{ matrix.commands }} ${{ matrix.libgccjit_version.extra }}

-        # Reduce amount of benchmark runs as they are slow
-        export COMPILE_RUNS=2
-        export RUN_RUNS=2
-
-        ./test.sh --release
-
-    - name: Test
-      if: matrix.libgccjit_version == 'libgccjit12.so'
-      run: |
-        # Enable backtraces for easier debugging
-        export RUST_BACKTRACE=1
-
-        # Reduce amount of benchmark runs as they are slow
-        export COMPILE_RUNS=2
-        export RUN_RUNS=2
-
-        ./test.sh --release --no-default-features
+  duplicates:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - run: python tools/check_intrinsics_duplicates.py
--- a/compiler/rustc_codegen_gcc/.github/workflows/release.yml
+++ b/compiler/rustc_codegen_gcc/.github/workflows/release.yml
@ -0,0 +1,111 @@
+name: CI with sysroot compiled in release mode
+
+on:
+  - push
+  - pull_request
+
+permissions:
+  contents: read
+
+env:
+  # Enable backtraces for easier debugging
+  RUST_BACKTRACE: 1
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        libgccjit_version:
+          - { gcc: "libgccjit.so", artifacts_branch: "master" }
+        commands: [
+          "--test-successful-rustc --nb-parts 2 --current-part 0",
+          "--test-successful-rustc --nb-parts 2 --current-part 1",
+        ]
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: actions/checkout@v3
+      with:
+        repository: llvm/llvm-project
+        path: llvm
+
+    - name: Install packages
+      run: sudo apt-get install ninja-build ripgrep
+
+    - name: Download artifact
+      uses: dawidd6/action-download-artifact@v2
+      with:
+          workflow: main.yml
+          name: ${{ matrix.libgccjit_version.gcc }}
+          path: gcc-build
+          repo: antoyo/gcc
+          branch: ${{ matrix.libgccjit_version.artifacts_branch }}
+          event: push
+          search_artifacts: true # Because, otherwise, the action only checks the last job run, and that won't work since we want multiple artifacts.
+
+    - name: Setup path to libgccjit
+      run: |
+          echo $(readlink -f gcc-build) > gcc_path
+          # NOTE: the filename is still libgccjit.so even when the artifact name is different.
+          ln gcc-build/libgccjit.so gcc-build/libgccjit.so.0
+
+    - name: Set env
+      run: |
+        echo "LIBRARY_PATH=$(cat gcc_path)" >> $GITHUB_ENV
+        echo "LD_LIBRARY_PATH=$(cat gcc_path)" >> $GITHUB_ENV
+        echo "workspace="$GITHUB_WORKSPACE >> $GITHUB_ENV
+
+    - name: Set RUST_COMPILER_RT_ROOT
+      run: echo "RUST_COMPILER_RT_ROOT="${{ env.workspace }}/llvm/compiler-rt >> $GITHUB_ENV
+
+    - name: Cache cargo installed crates
+      uses: actions/cache@v3
+      with:
+        path: ~/.cargo/bin
+        key: cargo-installed-crates2-ubuntu-latest
+
+    - name: Cache cargo registry
+      uses: actions/cache@v3
+      with:
+        path: ~/.cargo/registry
+        key: ${{ runner.os }}-cargo-registry2-${{ hashFiles('**/Cargo.lock') }}
+
+    - name: Cache cargo index
+      uses: actions/cache@v3
+      with:
+        path: ~/.cargo/git
+        key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}
+
+    - name: Cache cargo target dir
+      uses: actions/cache@v3
+      with:
+        path: target
+        key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('rust-toolchain') }}
+
+    - name: Build
+      run: |
+        ./prepare_build.sh
+        ./build.sh --release --release-sysroot
+        cargo test
+        ./clean_all.sh
+
+    - name: Prepare dependencies
+      run: |
+        git config --global user.email "user@example.com"
+        git config --global user.name "User"
+        ./prepare.sh
+
+    # Compile is a separate step, as the actions-rs/cargo action supports error annotations
+    - name: Compile
+      uses: actions-rs/cargo@v1.0.3
+      with:
+        command: build
+        args: --release
+
+    - name: Run tests
+      run: |
+        ./test.sh --release --clean --release-sysroot --build-sysroot ${{ matrix.commands }}
--- a/compiler/rustc_codegen_gcc/.github/workflows/stdarch.yml
+++ b/compiler/rustc_codegen_gcc/.github/workflows/stdarch.yml
@ -0,0 +1,116 @@
+name: stdarch tests with sysroot compiled in release mode
+
+on:
+  - push
+  - pull_request
+
+permissions:
+  contents: read
+
+env:
+  # Enable backtraces for easier debugging
+  RUST_BACKTRACE: 1
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    strategy:
+      # Keep running the remaining matrix entries even if one of them fails.
+      fail-fast: false
+      matrix:
+        # NOTE: there is deliberately no `commands` axis in this workflow: its
+        # test steps run a fixed set of commands and never read
+        # `matrix.commands`, so such an axis would only make the exact same
+        # job run once per entry.
+        libgccjit_version:
+          - { gcc: "libgccjit.so", artifacts_branch: "master" }
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: actions/checkout@v3
+      with:
+        repository: llvm/llvm-project
+        path: llvm
+
+    - name: Install packages
+      run: sudo apt-get install ninja-build ripgrep
+
+    - name: Download artifact
+      uses: dawidd6/action-download-artifact@v2
+      with:
+          workflow: main.yml
+          name: ${{ matrix.libgccjit_version.gcc }}
+          path: gcc-build
+          repo: antoyo/gcc
+          branch: ${{ matrix.libgccjit_version.artifacts_branch }}
+          event: push
+          search_artifacts: true # Because, otherwise, the action only checks the last job run, and that won't work since we want multiple artifacts.
+
+    - name: Setup path to libgccjit
+      run: |
+          echo $(readlink -f gcc-build) > gcc_path
+          # NOTE: the filename is still libgccjit.so even when the artifact name is different.
+          ln gcc-build/libgccjit.so gcc-build/libgccjit.so.0
+
+    - name: Set env
+      run: |
+        echo "LIBRARY_PATH=$(cat gcc_path)" >> $GITHUB_ENV
+        echo "LD_LIBRARY_PATH=$(cat gcc_path)" >> $GITHUB_ENV
+        echo "workspace="$GITHUB_WORKSPACE >> $GITHUB_ENV
+
+    - name: Set RUST_COMPILER_RT_ROOT
+      run: echo "RUST_COMPILER_RT_ROOT="${{ env.workspace }}/llvm/compiler-rt >> $GITHUB_ENV
+
+    - name: Cache cargo installed crates
+      uses: actions/cache@v3
+      with:
+        path: ~/.cargo/bin
+        key: cargo-installed-crates2-ubuntu-latest
+
+    - name: Cache cargo registry
+      uses: actions/cache@v3
+      with:
+        path: ~/.cargo/registry
+        key: ${{ runner.os }}-cargo-registry2-${{ hashFiles('**/Cargo.lock') }}
+
+    - name: Cache cargo index
+      uses: actions/cache@v3
+      with:
+        path: ~/.cargo/git
+        key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }}
+
+    - name: Cache cargo target dir
+      uses: actions/cache@v3
+      with:
+        path: target
+        key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('rust-toolchain') }}
+
+    - name: Build
+      run: |
+        ./prepare_build.sh
+        ./build.sh --release --release-sysroot
+        cargo test
+        ./clean_all.sh
+
+    - name: Prepare dependencies
+      run: |
+        git config --global user.email "user@example.com"
+        git config --global user.name "User"
+        ./prepare.sh
+
+    # Compile is a separate step, as the actions-rs/cargo action supports error annotations
+    - name: Compile
+      uses: actions-rs/cargo@v1.0.3
+      with:
+        command: build
+        args: --release
+
+    - name: Run tests
+      run: |
+        ./test.sh --release --clean --release-sysroot --build-sysroot --mini-tests --std-tests --test-libcore
+
+    - name: Run stdarch tests
+      run: |
+        cd build_sysroot/sysroot_src/library/stdarch/
+        CHANNEL=release TARGET=x86_64-unknown-linux-gnu ../../../../cargo.sh test
--- a/compiler/rustc_codegen_gcc/Cargo.lock
+++ b/compiler/rustc_codegen_gcc/Cargo.lock
@ -208,6 +208,7 @@ version = "0.1.0"
 dependencies = [
 "gccjit",
 "lang_tester",
+ "smallvec",
 "tempfile",
 ]

@ -220,6 +221,12 @@ dependencies = [
 "winapi-util",
 ]

+[[package]]
+name = "smallvec"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
+
 [[package]]
 name = "tempfile"
 version = "3.2.0"
--- a/compiler/rustc_codegen_gcc/Cargo.toml
+++ b/compiler/rustc_codegen_gcc/Cargo.toml
@ -27,6 +27,8 @@ gccjit = { git = "https://github.com/antoyo/gccjit.rs" }
 # Local copy.
 #gccjit = { path = "../gccjit.rs" }

+smallvec = { version = "1.6.1", features = ["union", "may_dangle"] }
+
 [dev-dependencies]
 lang_tester = "0.3.9"
 tempfile = "3.1.0"
--- a/compiler/rustc_codegen_gcc/Readme.md
+++ b/compiler/rustc_codegen_gcc/Readme.md
@ -1,5 +1,7 @@
 # WIP libgccjit codegen backend for rust

+[![Chat on IRC](https://img.shields.io/badge/irc.libera.chat-%23rustc__codegen__gcc-blue.svg)](https://web.libera.chat/#rustc_codegen_gcc)
+
 This is a GCC codegen for rustc, which means it can be loaded by the existing rustc frontend, but benefits from GCC: more architectures are supported and GCC's optimizations are used.

 **Despite its name, libgccjit can be used for ahead-of-time compilation, as is used here.**
@ -16,21 +18,61 @@ The patches in [this repository](https://github.com/antoyo/libgccjit-patches) ne
 (Those patches should work when applied on master, but in case it doesn't work, they are known to work when applied on 079c23cfe079f203d5df83fea8e92a60c7d7e878.)
 You can also use my [fork of gcc](https://github.com/antoyo/gcc) which already includes these patches.**

+To build it (most of these instructions come from [here](https://gcc.gnu.org/onlinedocs/jit/internals/index.html), so don't hesitate to take a look there if you encounter an issue):
+
+```bash
+$ git clone https://github.com/antoyo/gcc
+$ sudo apt install flex libmpfr-dev libgmp-dev libmpc3 libmpc-dev
+$ mkdir gcc-build gcc-install
+$ cd gcc-build
+# `--enable-checking=release` enables extra checks which help to find bugs.
+$ ../gcc/configure \
+    --enable-host-shared \
+    --enable-languages=jit \
+    --enable-checking=release \
+    --disable-bootstrap \
+    --disable-multilib \
+    --prefix=$(pwd)/../gcc-install
+$ make -j4 # You can replace `4` with another number depending on how many cores you have.
+```
+
+If you want to run libgccjit tests, you will need to also enable the C++ language in the `configure`:
+
+```bash
+--enable-languages=jit,c++
+```
+
+Then to run libgccjit tests:
+
+```bash
+$ cd gcc # from the `gcc-build` folder
+$ make check-jit
+# To run one specific test:
+$ make check-jit RUNTESTFLAGS="-v -v -v jit.exp=jit.dg/test-asm.cc"
+```
+
 **Put the path to your custom build of libgccjit in the file `gcc_path`.**

 ```bash
-$ git clone https://github.com/rust-lang/rustc_codegen_gcc.git
-$ cd rustc_codegen_gcc
+$ dirname $(readlink -f `find . -name libgccjit.so`) > gcc_path
+```
+
+You also need to set RUST_COMPILER_RT_ROOT:
+
+```bash
 $ git clone https://github.com/llvm/llvm-project llvm --depth 1 --single-branch
 $ export RUST_COMPILER_RT_ROOT="$PWD/llvm/compiler-rt"
-$ ./prepare_build.sh # download and patch sysroot src
-$ ./build.sh --release
+```
+
+Then you can run commands like this:
+
+```bash
+$ ./prepare.sh # download and patch sysroot src and install hyperfine for benchmarking
+$ LIBRARY_PATH=$(cat gcc_path) LD_LIBRARY_PATH=$(cat gcc_path) ./build.sh --release
 ```

 To run the tests:

 ```bash
-$ ./prepare.sh # download and patch sysroot src and install hyperfine for benchmarking
 $ ./test.sh --release
 ```

@ -120,13 +162,52 @@ To print a debug representation of a tree:
 debug_tree(expr);
 ```

+(defined in print-tree.h)
+
+To print a debug representation of a gimple struct:
+
+```c
+debug_gimple_stmt(gimple_struct)
+```
+
 To get the `rustc` command to run in `gdb`, add the `--verbose` flag to `cargo build`.

+To have the correct file paths in `gdb` instead of `/usr/src/debug/gcc/libstdc++-v3/libsupc++/eh_personality.cc`:
+
+Maybe by calling the following at the beginning of gdb:
+
+```
+set substitute-path /usr/src/debug/gcc /path/to/gcc-repo/gcc
+```
+
+TODO(antoyo): but that's not what I remember I was doing.
+
 ### How to use a custom-build rustc

 * Build the stage2 compiler (`rustup toolchain link debug-current build/x86_64-unknown-linux-gnu/stage2`).
 * Clean and rebuild the codegen with `debug-current` in the file `rust-toolchain`.

+### How to install a forked git-subtree
+
+Using git-subtree with `rustc` requires a patched git to make it work.
+The PR that is needed is [here](https://github.com/gitgitgadget/git/pull/493).
+Use the following instructions to install it:
+
+```
+git clone git@github.com:tqc/git.git
+cd git
+git checkout tqc/subtree
+make
+make install
+cd contrib/subtree
+make
+cp git-subtree ~/bin
+```
+
+### How to use [mem-trace](https://github.com/antoyo/mem-trace)
+
+`rustc` needs to be built without `jemalloc` so that `mem-trace` can overload `malloc`: since `jemalloc` is linked statically, an `LD_PRELOAD`-ed library won't get a chance to intercept the calls to `malloc`.
+
 ### How to build a cross-compiling libgccjit

 #### Building libgccjit
@ -142,6 +223,5 @@ To get the `rustc` command to run in `gdb`, add the `--verbose` flag to `cargo b
 * Since rustc doesn't support this architecture yet, set it back to `TARGET_TRIPLE="mips-unknown-linux-gnu"` (or another target having the same attributes). Alternatively, create a [target specification file](https://book.avr-rust.com/005.1-the-target-specification-json-file.html) (note that the `arch` specified in this file must be supported by the rust compiler).
 * Set `linker='-Clinker=m68k-linux-gcc'`.
 * Set the path to the cross-compiling libgccjit in `gcc_path`.
- * Disable the 128-bit integer types if the target doesn't support them by using `let i128_type = context.new_type::<i64>();` in `context.rs` (same for u128_type).
 * Comment the line: `context.add_command_line_option("-masm=intel");` in src/base.rs.
 * (might not be necessary) Disable the compilation of libstd.so (and possibly libcore.so?).
--- a/compiler/rustc_codegen_gcc/build_sysroot/build_sysroot.sh
+++ b/compiler/rustc_codegen_gcc/build_sysroot/build_sysroot.sh
@ -16,7 +16,7 @@ rm Cargo.lock test_target/Cargo.lock 2>/dev/null || true
 rm -r sysroot/ 2>/dev/null || true

 # Build libs
-export RUSTFLAGS="$RUSTFLAGS -Z force-unstable-if-unmarked -Cpanic=abort"
+export RUSTFLAGS="$RUSTFLAGS -Z force-unstable-if-unmarked"
 if [[ "$1" == "--release" ]]; then
    sysroot_channel='release'
    RUSTFLAGS="$RUSTFLAGS -Zmir-opt-level=3" cargo build --target $TARGET_TRIPLE --release
--- a/compiler/rustc_codegen_gcc/config.sh
+++ b/compiler/rustc_codegen_gcc/config.sh
@ -38,7 +38,7 @@ if [[ "$HOST_TRIPLE" != "$TARGET_TRIPLE" ]]; then
   fi
 fi

-export RUSTFLAGS="$CG_RUSTFLAGS $linker -Cpanic=abort -Csymbol-mangling-version=v0 -Cdebuginfo=2 -Clto=off -Zpanic-abort-tests -Zcodegen-backend=$(pwd)/target/${CHANNEL:-debug}/librustc_codegen_gcc.$dylib_ext --sysroot $(pwd)/build_sysroot/sysroot"
+export RUSTFLAGS="$CG_RUSTFLAGS $linker -Csymbol-mangling-version=v0 -Cdebuginfo=2 -Clto=off -Zcodegen-backend=$(pwd)/target/${CHANNEL:-debug}/librustc_codegen_gcc.$dylib_ext --sysroot $(pwd)/build_sysroot/sysroot $TEST_FLAGS"

 # FIXME(antoyo): remove once the atomic shim is gone
 if [[ `uname` == 'Darwin' ]]; then
--- a/compiler/rustc_codegen_gcc/example/alloc_example.rs
+++ b/compiler/rustc_codegen_gcc/example/alloc_example.rs
@ -1,4 +1,4 @@
-#![feature(start, box_syntax, core_intrinsics, alloc_error_handler)]
+#![feature(start, box_syntax, core_intrinsics, alloc_error_handler, lang_items)]
 #![no_std]

 extern crate alloc;
@ -18,16 +18,22 @@ extern "C" {

 #[panic_handler]
 fn panic_handler(_: &core::panic::PanicInfo) -> ! {
-    unsafe {
-        core::intrinsics::abort();
-    }
+    core::intrinsics::abort();
 }

 #[alloc_error_handler]
 fn alloc_error_handler(_: alloc::alloc::Layout) -> ! {
-    unsafe {
-        core::intrinsics::abort();
-    }
+    core::intrinsics::abort();
+}
+
+#[lang = "eh_personality"]
+fn eh_personality() -> ! {
+    loop {}
+}
+
+#[no_mangle]
+unsafe extern "C" fn _Unwind_Resume() {
+    core::intrinsics::unreachable();
 }

 #[start]
--- a/compiler/rustc_codegen_gcc/example/mini_core.rs
+++ b/compiler/rustc_codegen_gcc/example/mini_core.rs
@ -1,6 +1,6 @@
 #![feature(
    no_core, lang_items, intrinsics, unboxed_closures, type_ascription, extern_types,
-    untagged_unions, decl_macro, rustc_attrs, transparent_unions, auto_traits,
+    decl_macro, rustc_attrs, transparent_unions, auto_traits,
    thread_local
 )]
 #![no_core]
@ -17,6 +17,9 @@ pub trait Sized {}
 #[lang = "destruct"]
 pub trait Destruct {}

+#[lang = "tuple_trait"]
+pub trait Tuple {}
+
 #[lang = "unsize"]
 pub trait Unsize<T: ?Sized> {}

@ -39,14 +42,14 @@ impl<'a, T: ?Sized+Unsize<U>, U: ?Sized> DispatchFromDyn<&'a mut U> for &'a mut
 impl<T: ?Sized+Unsize<U>, U: ?Sized> DispatchFromDyn<*const U> for *const T {}
 // *mut T -> *mut U
 impl<T: ?Sized+Unsize<U>, U: ?Sized> DispatchFromDyn<*mut U> for *mut T {}
-impl<T: ?Sized + Unsize<U>, U: ?Sized> DispatchFromDyn<Box<U>> for Box<T> {}
+impl<T: ?Sized + Unsize<U>, U: ?Sized> DispatchFromDyn<Box<U, ()>> for Box<T, ()> {}

 #[lang = "receiver"]
 pub trait Receiver {}

 impl<T: ?Sized> Receiver for &T {}
 impl<T: ?Sized> Receiver for &mut T {}
-impl<T: ?Sized> Receiver for Box<T> {}
+impl<T: ?Sized, A: Allocator> Receiver for Box<T, A> {}

 #[lang = "copy"]
 pub unsafe trait Copy {}
@ -396,7 +399,7 @@ pub struct PhantomData<T: ?Sized>;

 #[lang = "fn_once"]
 #[rustc_paren_sugar]
-pub trait FnOnce<Args> {
+pub trait FnOnce<Args: Tuple> {
    #[lang = "fn_once_output"]
    type Output;

@ -405,13 +408,21 @@ pub trait FnOnce<Args> {

 #[lang = "fn_mut"]
 #[rustc_paren_sugar]
-pub trait FnMut<Args>: FnOnce<Args> {
+pub trait FnMut<Args: Tuple>: FnOnce<Args> {
    extern "rust-call" fn call_mut(&mut self, args: Args) -> Self::Output;
 }

 #[lang = "panic"]
 #[track_caller]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
+    unsafe {
+        libc::puts("Panicking\n\0" as *const str as *const u8);
+        intrinsics::abort();
+    }
+}
+
+#[lang = "panic_cannot_unwind"]
+fn panic_cannot_unwind() -> ! {
    unsafe {
        libc::puts("Panicking\n\0" as *const str as *const u8);
        intrinsics::abort();
@ -450,17 +461,32 @@ pub trait Deref {
 pub trait Allocator {
 }

+impl Allocator for () {}
+
 pub struct Global;

 impl Allocator for Global {}

-#[lang = "owned_box"]
-pub struct Box<
-    T: ?Sized,
-    A: Allocator = Global,
->(*mut T, A);
+#[repr(transparent)]
+#[rustc_layout_scalar_valid_range_start(1)]
+#[rustc_nonnull_optimization_guaranteed]
+pub struct NonNull<T: ?Sized>(pub *const T);

-impl<T: ?Sized + Unsize<U>, U: ?Sized> CoerceUnsized<Box<U>> for Box<T> {}
+impl<T: ?Sized, U: ?Sized> CoerceUnsized<NonNull<U>> for NonNull<T> where T: Unsize<U> {}
+impl<T: ?Sized, U: ?Sized> DispatchFromDyn<NonNull<U>> for NonNull<T> where T: Unsize<U> {}
+
+pub struct Unique<T: ?Sized> {
+    pub pointer: NonNull<T>,
+    pub _marker: PhantomData<T>,
+}
+
+impl<T: ?Sized, U: ?Sized> CoerceUnsized<Unique<U>> for Unique<T> where T: Unsize<U> {}
+impl<T: ?Sized, U: ?Sized> DispatchFromDyn<Unique<U>> for Unique<T> where T: Unsize<U> {}
+
+#[lang = "owned_box"]
+pub struct Box<T: ?Sized, A: Allocator = Global>(Unique<T>, A);
+
+impl<T: ?Sized + Unsize<U>, U: ?Sized, A: Allocator> CoerceUnsized<Box<U, A>> for Box<T, A> {}

 impl<T: ?Sized, A: Allocator> Drop for Box<T, A> {
    fn drop(&mut self) {
@ -468,7 +494,7 @@ impl<T: ?Sized, A: Allocator> Drop for Box<T, A> {
    }
 }

-impl<T> Deref for Box<T> {
+impl<T: ?Sized, A: Allocator> Deref for Box<T, A> {
    type Target = T;

    fn deref(&self) -> &Self::Target {
@ -482,8 +508,8 @@ unsafe fn allocate(size: usize, _align: usize) -> *mut u8 {
 }

 #[lang = "box_free"]
-unsafe fn box_free<T: ?Sized, A: Allocator>(ptr: *mut T, alloc: A) {
-    libc::free(ptr as *mut u8);
+unsafe fn box_free<T: ?Sized>(ptr: Unique<T>, _alloc: ()) {
+    libc::free(ptr.pointer.0 as *mut u8);
 }

 #[lang = "drop"]
@ -505,17 +531,25 @@ pub union MaybeUninit<T> {
 }

 pub mod intrinsics {
+    use crate::Sized;
+
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
+        #[rustc_safe_intrinsic]
        pub fn size_of<T>() -> usize;
-        pub fn size_of_val<T: ?::Sized>(val: *const T) -> usize;
+        pub fn size_of_val<T: ?Sized>(val: *const T) -> usize;
+        #[rustc_safe_intrinsic]
        pub fn min_align_of<T>() -> usize;
-        pub fn min_align_of_val<T: ?::Sized>(val: *const T) -> usize;
+        pub fn min_align_of_val<T: ?Sized>(val: *const T) -> usize;
        pub fn copy<T>(src: *const T, dst: *mut T, count: usize);
        pub fn transmute<T, U>(e: T) -> U;
        pub fn ctlz_nonzero<T>(x: T) -> T;
-        pub fn needs_drop<T: ?::Sized>() -> bool;
+        #[rustc_safe_intrinsic]
+        pub fn needs_drop<T: ?Sized>() -> bool;
+        #[rustc_safe_intrinsic]
        pub fn bitreverse<T>(x: T) -> T;
+        #[rustc_safe_intrinsic]
        pub fn bswap<T>(x: T) -> T;
        pub fn write_bytes<T>(dst: *mut T, val: u8, count: usize);
        pub fn unreachable() -> !;
--- a/compiler/rustc_codegen_gcc/example/mini_core_hello_world.rs
+++ b/compiler/rustc_codegen_gcc/example/mini_core_hello_world.rs
@ -85,6 +85,7 @@ fn start<T: Termination + 'static>(
    main: fn() -> T,
    argc: isize,
    argv: *const *const u8,
+    _sigpipe: u8,
 ) -> isize {
    if argc == 3 {
        unsafe { puts(*argv); }
@ -228,6 +229,7 @@ fn main() {
    } as Box<dyn SomeTrait>;

    const FUNC_REF: Option<fn()> = Some(main);
+    #[allow(unreachable_code)]
    match FUNC_REF {
        Some(_) => {},
        None => assert!(false),
--- a/compiler/rustc_codegen_gcc/example/mod_bench.rs
+++ b/compiler/rustc_codegen_gcc/example/mod_bench.rs
@ -6,9 +6,7 @@ extern {}

 #[panic_handler]
 fn panic_handler(_: &core::panic::PanicInfo) -> ! {
-    unsafe {
-        core::intrinsics::abort();
-    }
+    core::intrinsics::abort();
 }

 #[lang="eh_personality"]
@ -32,6 +30,6 @@ fn main(_argc: isize, _argv: *const *const u8) -> isize {
 #[inline(never)]
 fn black_box(i: u32) {
    if i != 1 {
-        unsafe { core::intrinsics::abort(); }
+        core::intrinsics::abort();
    }
 }
--- a/compiler/rustc_codegen_gcc/example/std_example.rs
+++ b/compiler/rustc_codegen_gcc/example/std_example.rs
@ -1,5 +1,6 @@
 #![feature(core_intrinsics, generators, generator_trait, is_sorted)]

+#[cfg(feature="master")]
 use std::arch::x86_64::*;
 use std::io::Write;
 use std::ops::Generator;
--- a/compiler/rustc_codegen_gcc/failing-ui-tests.txt
+++ b/compiler/rustc_codegen_gcc/failing-ui-tests.txt
@ -0,0 +1,68 @@
+tests/ui/allocator/custom-in-block.rs
+tests/ui/allocator/custom-in-submodule.rs
+tests/ui/allocator/custom.rs
+tests/ui/allocator/hygiene.rs
+tests/ui/allocator/no_std-alloc-error-handler-custom.rs
+tests/ui/allocator/no_std-alloc-error-handler-default.rs
+tests/ui/allocator/xcrate-use.rs
+tests/ui/allocator/xcrate-use2.rs
+tests/ui/asm/may_unwind.rs
+tests/ui/asm/x86_64/multiple-clobber-abi.rs
+tests/ui/debuginfo/debuginfo-emit-llvm-ir-and-split-debuginfo.rs
+tests/ui/functions-closures/parallel-codegen-closures.rs
+tests/ui/linkage-attr/linkage1.rs
+tests/ui/lto/dylib-works.rs
+tests/ui/numbers-arithmetic/saturating-float-casts.rs
+tests/ui/polymorphization/promoted-function.rs
+tests/ui/process/nofile-limit.rs
+tests/ui/sepcomp/sepcomp-cci.rs
+tests/ui/sepcomp/sepcomp-extern.rs
+tests/ui/sepcomp/sepcomp-fns-backwards.rs
+tests/ui/sepcomp/sepcomp-fns.rs
+tests/ui/sepcomp/sepcomp-statics.rs
+tests/ui/simd/intrinsic/generic-arithmetic-pass.rs
+tests/ui/sse2.rs
+tests/ui/target-feature/missing-plusminus.rs
+tests/ui/asm/x86_64/may_unwind.rs
+tests/ui/backtrace.rs
+tests/ui/catch-unwind-bang.rs
+tests/ui/cfg/cfg-panic-abort.rs
+tests/ui/drop/dynamic-drop-async.rs
+tests/ui/drop/repeat-drop.rs
+tests/ui/fmt/format-args-capture.rs
+tests/ui/generator/panic-drops-resume.rs
+tests/ui/generator/panic-drops.rs
+tests/ui/intrinsics/panic-uninitialized-zeroed.rs
+tests/ui/iterators/iter-sum-overflow-debug.rs
+tests/ui/iterators/iter-sum-overflow-overflow-checks.rs
+tests/ui/mir/mir_calls_to_shims.rs
+tests/ui/mir/mir_drop_order.rs
+tests/ui/mir/mir_let_chains_drop_order.rs
+tests/ui/oom_unwind.rs
+tests/ui/panic-runtime/abort-link-to-unwinding-crates.rs
+tests/ui/panic-runtime/abort.rs
+tests/ui/panic-runtime/link-to-abort.rs
+tests/ui/unwind-no-uwtable.rs
+tests/ui/parser/unclosed-delimiter-in-dep.rs
+tests/ui/runtime/rt-explody-panic-payloads.rs
+tests/ui/simd/intrinsic/ptr-cast.rs
+tests/ui/binding/fn-arg-incomplete-pattern-drop-order.rs
+tests/ui/consts/missing_span_in_backtrace.rs
+tests/ui/drop/dynamic-drop.rs
+tests/ui/dyn-star/box.rs
+tests/ui/issues/issue-40883.rs
+tests/ui/issues/issue-43853.rs
+tests/ui/issues/issue-47364.rs
+tests/ui/macros/rfc-2011-nicer-assert-messages/assert-without-captures-does-not-create-unnecessary-code.rs
+tests/ui/rfc-2091-track-caller/std-panic-locations.rs
+tests/ui/rfcs/rfc1857-drop-order.rs
+tests/ui/simd/issue-17170.rs
+tests/ui/simd/issue-39720.rs
+tests/ui/simd/issue-89193.rs
+tests/ui/statics/issue-91050-1.rs
+tests/ui/statics/issue-91050-2.rs
+tests/ui/alloc-error/default-alloc-error-hook.rs
+tests/ui/generator/panic-safe.rs
+tests/ui/issues/issue-14875.rs
+tests/ui/issues/issue-29948.rs
+tests/ui/panic-while-printing.rs
--- a/compiler/rustc_codegen_gcc/failing-ui-tests12.txt
+++ b/compiler/rustc_codegen_gcc/failing-ui-tests12.txt
@ -0,0 +1,39 @@
+tests/ui/asm/x86_64/issue-96797.rs
+tests/ui/intrinsics/const-eval-select-x86_64.rs
+tests/ui/packed/packed-struct-drop-aligned.rs
+tests/ui/packed/packed-struct-generic-layout.rs
+tests/ui/packed/packed-struct-layout.rs
+tests/ui/packed/packed-struct-optimized-enum.rs
+tests/ui/packed/packed-struct-size.rs
+tests/ui/packed/packed-struct-vec.rs
+tests/ui/packed/packed-tuple-struct-layout.rs
+tests/ui/simd/array-type.rs
+tests/ui/simd/intrinsic/float-minmax-pass.rs
+tests/ui/simd/intrinsic/generic-arithmetic-saturating-pass.rs
+tests/ui/simd/intrinsic/generic-as.rs
+tests/ui/simd/intrinsic/generic-cast-pass.rs
+tests/ui/simd/intrinsic/generic-cast-pointer-width.rs
+tests/ui/simd/intrinsic/generic-comparison-pass.rs
+tests/ui/simd/intrinsic/generic-elements-pass.rs
+tests/ui/simd/intrinsic/generic-reduction-pass.rs
+tests/ui/simd/intrinsic/generic-select-pass.rs
+tests/ui/simd/intrinsic/inlining-issue67557-ice.rs
+tests/ui/simd/intrinsic/inlining-issue67557.rs
+tests/ui/simd/monomorphize-shuffle-index.rs
+tests/ui/simd/shuffle.rs
+tests/ui/simd/simd-bitmask.rs
+tests/ui/generator/resume-after-return.rs
+tests/ui/iterators/iter-step-overflow-debug.rs
+tests/ui/macros/rfc-2011-nicer-assert-messages/all-expr-kinds.rs
+tests/ui/numbers-arithmetic/next-power-of-two-overflow-debug.rs
+tests/ui/privacy/reachable-unnameable-items.rs
+tests/ui/rfc-1937-termination-trait/termination-trait-in-test.rs
+tests/ui/async-await/async-fn-size-moved-locals.rs
+tests/ui/async-await/async-fn-size-uninit-locals.rs
+tests/ui/cfg/cfg-panic.rs
+tests/ui/generator/size-moved-locals.rs
+tests/ui/macros/rfc-2011-nicer-assert-messages/all-not-available-cases.rs
+tests/ui/simd/intrinsic/generic-gather-pass.rs
+tests/ui/simd/issue-85915-simd-ptrs.rs
+tests/ui/issues/issue-68010-large-zst-consts.rs
+tests/ui/rust-2018/proc-macro-crate-in-paths.rs
--- a/compiler/rustc_codegen_gcc/locales/en-US.ftl
+++ b/compiler/rustc_codegen_gcc/locales/en-US.ftl
@ -60,3 +60,6 @@ codegen_gcc_invalid_monomorphization_unsupported_cast =

 codegen_gcc_invalid_monomorphization_unsupported_operation =
    invalid monomorphization of `{$name}` intrinsic: unsupported operation on `{$in_ty}` with element `{$in_elem}`
+
+codegen_gcc_invalid_minimum_alignment =
+    invalid minimum global alignment: {$err}
--- a/compiler/rustc_codegen_gcc/patches/0001-Add-stdarch-Cargo.toml-for-testing.patch
+++ b/compiler/rustc_codegen_gcc/patches/0001-Add-stdarch-Cargo.toml-for-testing.patch
@ -0,0 +1,39 @@
+From c3821e02fbd6cb5ad6e06d759fccdc9073712375 Mon Sep 17 00:00:00 2001
+From: Antoni Boucher <bouanto@zoho.com>
+Date: Tue, 7 Jun 2022 21:40:13 -0400
+Subject: [PATCH] Add stdarch Cargo.toml for testing
+
+---
+ library/stdarch/Cargo.toml | 20 ++++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+ create mode 100644 library/stdarch/Cargo.toml
+
+diff --git a/library/stdarch/Cargo.toml b/library/stdarch/Cargo.toml
+new file mode 100644
+index 0000000..fbe0a95
+--- /dev/null
+++ b/library/stdarch/Cargo.toml
+@@ -0,0 +1,20 @@
+[workspace]
+members = [
+  "crates/core_arch",
+  "crates/std_detect",
+  "crates/stdarch-gen",
+  "examples/"
+]
+exclude = [
+  "crates/wasm-assert-instr-tests"
+]
+
+[profile.release]
+debug = true
+opt-level = 3
+incremental = true
+
+[profile.bench]
+debug = 1
+opt-level = 3
+incremental = true
+-- 
+2.26.2.7.g19db9cfb68.dirty
+
--- a/compiler/rustc_codegen_gcc/patches/0001-Disable-examples.patch
+++ b/compiler/rustc_codegen_gcc/patches/0001-Disable-examples.patch
@ -0,0 +1,25 @@
+From a2d53a324a02c04b76c0e9d39dc15cd443a3b8b2 Mon Sep 17 00:00:00 2001
+From: Antoni Boucher <bouanto@zoho.com>
+Date: Fri, 25 Nov 2022 11:18:11 -0500
+Subject: [PATCH] Disable examples
+
+---
+ library/stdarch/Cargo.toml | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/library/stdarch/Cargo.toml b/library/stdarch/Cargo.toml
+index fbe0a95..748d72d 100644
+--- a/library/stdarch/Cargo.toml
+++ b/library/stdarch/Cargo.toml
+@@ -3,7 +3,7 @@ members = [
+   "crates/core_arch",
+   "crates/std_detect",
+   "crates/stdarch-gen",
+-  "examples/"
+  #"examples/"
+ ]
+ exclude = [
+   "crates/wasm-assert-instr-tests"
+-- 
+2.26.2.7.g19db9cfb68.dirty
+
--- a/compiler/rustc_codegen_gcc/patches/0022-core-Disable-not-compiling-tests.patch
+++ b/compiler/rustc_codegen_gcc/patches/0022-core-Disable-not-compiling-tests.patch
@ -18,7 +18,7 @@ new file mode 100644
 index 0000000..46fd999
 --- /dev/null
 +++ b/library/core/tests/Cargo.toml
-@@ -0,0 +1,8 @@
+@@ -0,0 +1,12 @@
 +[package]
 +name = "core"
 +version = "0.0.0"
@ -27,37 +27,18 @@ index 0000000..46fd999
 +[lib]
 +name = "coretests"
 +path = "lib.rs"
-diff --git a/library/core/tests/num/flt2dec/mod.rs b/library/core/tests/num/flt2dec/mod.rs
-index a35897e..f0bf645 100644
--- a/library/core/tests/num/flt2dec/mod.rs
-+++ b/library/core/tests/num/flt2dec/mod.rs
-@@ -13,7 +13,6 @@ mod strategy {
-     mod dragon;
-     mod grisu;
- }
-mod random;
- 
- pub fn decode_finite<T: DecodableFloat>(v: T) -> Decoded {
-     match decode(v).1 {
-diff --git a/library/core/tests/slice.rs b/library/core/tests/slice.rs
-index 6609bc3..241b497 100644
--- a/library/core/tests/slice.rs
-+++ b/library/core/tests/slice.rs
-@@ -1209,6 +1209,7 @@ fn brute_force_rotate_test_1() {
-     }
- }
- 
-+/*
- #[test]
- #[cfg(not(target_arch = "wasm32"))]
- fn sort_unstable() {
-@@ -1394,6 +1395,7 @@ fn partition_at_index() {
-     v.select_nth_unstable(0);
-     assert!(v == [0xDEADBEEF]);
- }
-+*/
- 
- #[test]
- #[should_panic(expected = "index 0 greater than length of slice")]
+
+[dependencies]
+rand = { version = "0.8.5", default-features = false }
+rand_xorshift = { version = "0.3.0", default-features = false }
+diff --git a/library/core/tests/lib.rs b/library/core/tests/lib.rs
+index 42a26ae..5ac1042 100644
+--- a/library/core/tests/lib.rs
+++ b/library/core/tests/lib.rs
+@@ -1,3 +1,4 @@
+#![cfg(test)]
+ #![feature(alloc_layout_extra)]
+ #![feature(array_chunks)]
+ #![feature(array_methods)]
 --
 2.21.0 (Apple Git-122)
--- a/compiler/rustc_codegen_gcc/patches/0024-core-Disable-portable-simd-test.patch
+++ b/compiler/rustc_codegen_gcc/patches/0024-core-Disable-portable-simd-test.patch
@ -1,28 +0,0 @@
-From b1ae000f6da1abd3b8e9b80c40bc11c89b8ae93c Mon Sep 17 00:00:00 2001
-From: bjorn3 <bjorn3@users.noreply.github.com>
-Date: Thu, 30 Dec 2021 16:54:40 +0100
-Subject: [PATCH] [core] Disable portable-simd test
-
---
- library/core/tests/lib.rs | 1 -
- 1 file changed, 1 deletion(-)
-
-diff --git a/library/core/tests/lib.rs b/library/core/tests/lib.rs
-index 06c7be0..359e2e7 100644
--- a/library/core/tests/lib.rs
-+++ b/library/core/tests/lib.rs
-@@ -75,7 +75,6 @@
- #![feature(never_type)]
- #![feature(unwrap_infallible)]
-#![feature(portable_simd)]
- #![feature(ptr_metadata)]
- #![feature(once_cell)]
- #![feature(option_result_contains)]
-@@ -127,7 +126,6 @@ mod pin;
- mod pin_macro;
- mod ptr;
- mod result;
-mod simd;
- mod slice;
- mod str;
- mod str_lossy;
--- a/compiler/rustc_codegen_gcc/rust-toolchain
+++ b/compiler/rustc_codegen_gcc/rust-toolchain
@ -1,3 +1,3 @@
 [toolchain]
-channel = "nightly-2022-06-06"
+channel = "nightly-2023-03-02"
 components = ["rust-src", "rustc-dev", "llvm-tools-preview"]
--- a/compiler/rustc_codegen_gcc/rustc_patches/compile_test.patch
+++ b/compiler/rustc_codegen_gcc/rustc_patches/compile_test.patch
@ -1,14 +0,0 @@
-diff --git a/src/tools/compiletest/src/header.rs b/src/tools/compiletest/src/header.rs
-index 887d27fd6dca4..2c2239f2b83d1 100644
--- a/src/tools/compiletest/src/header.rs
-+++ b/src/tools/compiletest/src/header.rs
-@@ -806,8 +806,8 @@ pub fn make_test_description<R: Read>(
-     cfg: Option<&str>,
- ) -> test::TestDesc {
-     let mut ignore = false;
-     #[cfg(not(bootstrap))]
-    let ignore_message: Option<String> = None;
-+    let ignore_message: Option<&str> = None;
-     let mut should_fail = false;
-
-     let rustc_has_profiler_support = env::var_os("RUSTC_PROFILER_SUPPORT").is_some();
--- a/compiler/rustc_codegen_gcc/src/allocator.rs
+++ b/compiler/rustc_codegen_gcc/src/allocator.rs
@ -1,3 +1,5 @@
+#[cfg(feature="master")]
+use gccjit::FnAttribute;
 use gccjit::{FunctionType, GlobalKind, ToRValue};
 use rustc_ast::expand::allocator::{AllocatorKind, AllocatorTy, ALLOCATOR_METHODS};
 use rustc_middle::bug;
@ -50,7 +52,8 @@ pub(crate) unsafe fn codegen(tcx: TyCtxt<'_>, mods: &mut GccContext, _module_nam
        let func = context.new_function(None, FunctionType::Exported, output.unwrap_or(void), &args, name, false);

        if tcx.sess.target.options.default_hidden_visibility {
-            // TODO(antoyo): set visibility.
+            #[cfg(feature="master")]
+            func.add_attribute(FnAttribute::Visibility(gccjit::Visibility::Hidden));
        }
        if tcx.sess.must_emit_unwind_tables() {
            // TODO(antoyo): emit unwind tables.
@ -61,7 +64,8 @@ pub(crate) unsafe fn codegen(tcx: TyCtxt<'_>, mods: &mut GccContext, _module_nam
            .map(|(index, typ)| context.new_parameter(None, *typ, &format!("param{}", index)))
            .collect();
        let callee = context.new_function(None, FunctionType::Extern, output.unwrap_or(void), &args, callee, false);
-        // TODO(antoyo): set visibility.
+        #[cfg(feature="master")]
+        callee.add_attribute(FnAttribute::Visibility(gccjit::Visibility::Hidden));

        let block = func.new_block("entry");

@ -90,12 +94,18 @@ pub(crate) unsafe fn codegen(tcx: TyCtxt<'_>, mods: &mut GccContext, _module_nam
        .collect();
    let func = context.new_function(None, FunctionType::Exported, void, &args, name, false);

+    if tcx.sess.target.default_hidden_visibility {
+        #[cfg(feature="master")]
+        func.add_attribute(FnAttribute::Visibility(gccjit::Visibility::Hidden));
+    }
+
    let callee = alloc_error_handler_kind.fn_name(sym::oom);
    let args: Vec<_> = types.iter().enumerate()
        .map(|(index, typ)| context.new_parameter(None, *typ, &format!("param{}", index)))
        .collect();
    let callee = context.new_function(None, FunctionType::Extern, void, &args, callee, false);
-    //llvm::LLVMRustSetVisibility(callee, llvm::Visibility::Hidden);
+    #[cfg(feature="master")]
+    callee.add_attribute(FnAttribute::Visibility(gccjit::Visibility::Hidden));

    let block = func.new_block("entry");

--- a/compiler/rustc_codegen_gcc/src/asm.rs
+++ b/compiler/rustc_codegen_gcc/src/asm.rs
@ -157,7 +157,7 @@ impl<'a, 'gcc, 'tcx> AsmBuilderMethods<'tcx> for Builder<'a, 'gcc, 'tcx> {
                    use ConstraintOrRegister::*;

                    let (constraint, ty) = match (reg_to_gcc(reg), place) {
-                        (Constraint(constraint), Some(place)) => (constraint, place.layout.gcc_type(self.cx, false)),
+                        (Constraint(constraint), Some(place)) => (constraint, place.layout.gcc_type(self.cx)),
                        // When `reg` is a class and not an explicit register but the out place is not specified,
                        // we need to create an unused output variable to assign the output to. This var
                        // needs to be of a type that's "compatible" with the register class, but specific type
@ -226,7 +226,7 @@ impl<'a, 'gcc, 'tcx> AsmBuilderMethods<'tcx> for Builder<'a, 'gcc, 'tcx> {
                    // This decision is also backed by the fact that LLVM needs in and out
                    // values to be of *exactly the same type*, not just "compatible".
                    // I'm not sure if GCC is so picky too, but better safe than sorry.
-                    let ty = in_value.layout.gcc_type(self.cx, false);
+                    let ty = in_value.layout.gcc_type(self.cx);
                    let tmp_var = self.current_func().new_local(None, ty, "output_register");

                    // If the out_place is None (i.e `inout(reg) _` syntax was used), we translate
@ -286,7 +286,7 @@ impl<'a, 'gcc, 'tcx> AsmBuilderMethods<'tcx> for Builder<'a, 'gcc, 'tcx> {
                            continue
                        };

-                        let ty = out_place.layout.gcc_type(self.cx, false);
+                        let ty = out_place.layout.gcc_type(self.cx);
                        let tmp_var = self.current_func().new_local(None, ty, "output_register");
                        tmp_var.set_register_name(reg_name);

@ -306,7 +306,7 @@ impl<'a, 'gcc, 'tcx> AsmBuilderMethods<'tcx> for Builder<'a, 'gcc, 'tcx> {
                // `in("explicit register") var`
                InlineAsmOperandRef::In { reg, value } => {
                    if let ConstraintOrRegister::Register(reg_name) = reg_to_gcc(reg) {
-                        let ty = value.layout.gcc_type(self.cx, false);
+                        let ty = value.layout.gcc_type(self.cx);
                        let reg_var = self.current_func().new_local(None, ty, "input_register");
                        reg_var.set_register_name(reg_name);
                        self.llbb().add_assignment(None, reg_var, value.immediate());
@ -325,7 +325,7 @@ impl<'a, 'gcc, 'tcx> AsmBuilderMethods<'tcx> for Builder<'a, 'gcc, 'tcx> {
                InlineAsmOperandRef::InOut { reg, late, in_value, out_place } => {
                    if let ConstraintOrRegister::Register(reg_name) = reg_to_gcc(reg) {
                        // See explanation in the first pass.
-                        let ty = in_value.layout.gcc_type(self.cx, false);
+                        let ty = in_value.layout.gcc_type(self.cx);
                        let tmp_var = self.current_func().new_local(None, ty, "output_register");
                        tmp_var.set_register_name(reg_name);

@ -353,8 +353,7 @@ impl<'a, 'gcc, 'tcx> AsmBuilderMethods<'tcx> for Builder<'a, 'gcc, 'tcx> {
                    inputs.push(AsmInOperand {
                        constraint: "X".into(),
                        rust_idx,
-                        val: self.cx.rvalue_as_function(get_fn(self.cx, instance))
-                            .get_address(None),
+                        val: get_fn(self.cx, instance).get_address(None),
                    });
                }

@ -382,15 +381,19 @@ impl<'a, 'gcc, 'tcx> AsmBuilderMethods<'tcx> for Builder<'a, 'gcc, 'tcx> {
        for piece in template {
            match *piece {
                InlineAsmTemplatePiece::String(ref string) => {
-                    // TODO(@Commeownist): switch to `Iterator::intersperse` once it's stable
-                    let mut iter = string.split('%');
-                    if let Some(s) = iter.next() {
-                        template_str.push_str(s);
-                    }
-
-                    for s in iter {
-                        template_str.push_str("%%");
-                        template_str.push_str(s);
+                    for char in string.chars() {
+                        // TODO(antoyo): might also need to escape | if rustc doesn't do it.
+                        let escaped_char =
+                            match char {
+                                '%' => "%%",
+                                '{' => "%{",
+                                '}' => "%}",
+                                _ => {
+                                    template_str.push(char);
+                                    continue;
+                                },
+                            };
+                        template_str.push_str(escaped_char);
                    }
                }
                InlineAsmTemplatePiece::Placeholder { operand_idx, modifier, span: _ } => {
@ -565,39 +568,52 @@ fn reg_to_gcc(reg: InlineAsmRegOrRegClass) -> ConstraintOrRegister {
                _ => unimplemented!(),
            }
        },
+        // The constraint codes below can be retrieved from https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
        InlineAsmRegOrRegClass::RegClass(reg) => match reg {
-            InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => unimplemented!(),
-            InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::reg) => unimplemented!(),
-            InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::vreg) => unimplemented!(),
-            InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::vreg_low16) => unimplemented!(),
-            InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => unimplemented!(),
+            InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::reg) => "r",
+            InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::vreg) => "w",
+            InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::vreg_low16) => "x",
+            InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => {
+                unreachable!("clobber-only")
+            }
+            InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => "r",
            InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg)
            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low16)
-            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::qreg_low8) => unimplemented!(),
-            InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16)
+            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::qreg_low8)
+            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16)
            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low8)
-            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::qreg_low4) => unimplemented!(),
-            InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg)
-            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::qreg) => unimplemented!(),
-            InlineAsmRegClass::Avr(_) => unimplemented!(),
-            InlineAsmRegClass::Bpf(_) => unimplemented!(),
-            InlineAsmRegClass::Hexagon(HexagonInlineAsmRegClass::reg) => unimplemented!(),
-            InlineAsmRegClass::Mips(MipsInlineAsmRegClass::reg) => unimplemented!(),
-            InlineAsmRegClass::Mips(MipsInlineAsmRegClass::freg) => unimplemented!(),
-            InlineAsmRegClass::Msp430(_) => unimplemented!(),
-            InlineAsmRegClass::Nvptx(NvptxInlineAsmRegClass::reg16) => unimplemented!(),
-            InlineAsmRegClass::Nvptx(NvptxInlineAsmRegClass::reg32) => unimplemented!(),
-            InlineAsmRegClass::Nvptx(NvptxInlineAsmRegClass::reg64) => unimplemented!(),
-            InlineAsmRegClass::PowerPC(PowerPCInlineAsmRegClass::reg) => unimplemented!(),
-            InlineAsmRegClass::PowerPC(PowerPCInlineAsmRegClass::reg_nonzero) => unimplemented!(),
-            InlineAsmRegClass::PowerPC(PowerPCInlineAsmRegClass::freg) => unimplemented!(),
+            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::qreg_low4)
+            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg)
+            | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::qreg) => "t",
+            InlineAsmRegClass::Avr(AvrInlineAsmRegClass::reg) => "r",
+            InlineAsmRegClass::Avr(AvrInlineAsmRegClass::reg_upper) => "d",
+            InlineAsmRegClass::Avr(AvrInlineAsmRegClass::reg_pair) => "r",
+            InlineAsmRegClass::Avr(AvrInlineAsmRegClass::reg_iw) => "w",
+            InlineAsmRegClass::Avr(AvrInlineAsmRegClass::reg_ptr) => "e",
+            InlineAsmRegClass::Bpf(BpfInlineAsmRegClass::reg) => "r",
+            InlineAsmRegClass::Bpf(BpfInlineAsmRegClass::wreg) => "w",
+            InlineAsmRegClass::Hexagon(HexagonInlineAsmRegClass::reg) => "r",
+            InlineAsmRegClass::Mips(MipsInlineAsmRegClass::reg) => "d", // more specific than "r"
+            InlineAsmRegClass::Mips(MipsInlineAsmRegClass::freg) => "f",
+            InlineAsmRegClass::Msp430(Msp430InlineAsmRegClass::reg) => "r",
+            // https://github.com/gcc-mirror/gcc/blob/master/gcc/config/nvptx/nvptx.md -> look for
+            // "define_constraint".
+            InlineAsmRegClass::Nvptx(NvptxInlineAsmRegClass::reg16) => "h",
+            InlineAsmRegClass::Nvptx(NvptxInlineAsmRegClass::reg32) => "r",
+            InlineAsmRegClass::Nvptx(NvptxInlineAsmRegClass::reg64) => "l",
+
+            InlineAsmRegClass::PowerPC(PowerPCInlineAsmRegClass::reg) => "r",
+            InlineAsmRegClass::PowerPC(PowerPCInlineAsmRegClass::reg_nonzero) => "b",
+            InlineAsmRegClass::PowerPC(PowerPCInlineAsmRegClass::freg) => "f",
            InlineAsmRegClass::PowerPC(PowerPCInlineAsmRegClass::cr)
            | InlineAsmRegClass::PowerPC(PowerPCInlineAsmRegClass::xer) => {
                unreachable!("clobber-only")
            },
-            InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::reg) => unimplemented!(),
-            InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::freg) => unimplemented!(),
-            InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::vreg) => unimplemented!(),
+            InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::reg) => "r",
+            InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::freg) => "f",
+            InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::vreg) => {
+                unreachable!("clobber-only")
+            }
            InlineAsmRegClass::X86(X86InlineAsmRegClass::reg) => "r",
            InlineAsmRegClass::X86(X86InlineAsmRegClass::reg_abcd) => "Q",
            InlineAsmRegClass::X86(X86InlineAsmRegClass::reg_byte) => "q",
@ -605,16 +621,18 @@ fn reg_to_gcc(reg: InlineAsmRegOrRegClass) -> ConstraintOrRegister {
            | InlineAsmRegClass::X86(X86InlineAsmRegClass::ymm_reg) => "x",
            InlineAsmRegClass::X86(X86InlineAsmRegClass::zmm_reg) => "v",
            InlineAsmRegClass::X86(X86InlineAsmRegClass::kreg) => "Yk",
-            InlineAsmRegClass::X86(X86InlineAsmRegClass::kreg0) => unimplemented!(),
-            InlineAsmRegClass::Wasm(WasmInlineAsmRegClass::local) => unimplemented!(),
            InlineAsmRegClass::X86(
-                X86InlineAsmRegClass::x87_reg | X86InlineAsmRegClass::mmx_reg | X86InlineAsmRegClass::tmm_reg,
+                X86InlineAsmRegClass::kreg0
+                | X86InlineAsmRegClass::x87_reg
+                | X86InlineAsmRegClass::mmx_reg
+                | X86InlineAsmRegClass::tmm_reg,
            ) => unreachable!("clobber-only"),
            InlineAsmRegClass::SpirV(SpirVInlineAsmRegClass::reg) => {
                bug!("GCC backend does not support SPIR-V")
            }
-            InlineAsmRegClass::S390x(S390xInlineAsmRegClass::reg) => unimplemented!(),
-            InlineAsmRegClass::S390x(S390xInlineAsmRegClass::freg) => unimplemented!(),
+            InlineAsmRegClass::Wasm(WasmInlineAsmRegClass::local) => "r",
+            InlineAsmRegClass::S390x(S390xInlineAsmRegClass::reg) => "r",
+            InlineAsmRegClass::S390x(S390xInlineAsmRegClass::freg) => "f",
            InlineAsmRegClass::Err => unreachable!(),
        }
    };
@ -692,21 +710,23 @@ impl<'gcc, 'tcx> AsmMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
            && options.contains(InlineAsmOptions::ATT_SYNTAX);

        // Build the template string
-        let mut template_str = String::new();
+        let mut template_str = ".pushsection .text\n".to_owned();
+        if att_dialect {
+            template_str.push_str(".att_syntax\n");
+        }
        for piece in template {
            match *piece {
                InlineAsmTemplatePiece::String(ref string) => {
-                    for line in string.lines() {
+                    let mut index = 0;
+                    while index < string.len() {
                        // NOTE: gcc does not allow inline comments, so remove them.
-                        let line =
-                            if let Some(index) = line.rfind("//") {
-                                &line[..index]
-                            }
-                            else {
-                                line
-                            };
-                        template_str.push_str(line);
-                        template_str.push('\n');
+                        let comment_index = string[index..].find("//")
+                            .map(|comment_index| comment_index + index)
+                            .unwrap_or(string.len());
+                        template_str.push_str(&string[index..comment_index]);
+                        index = string[comment_index..].find('\n')
+                            .map(|index| index + comment_index)
+                            .unwrap_or(string.len());
                    }
                },
                InlineAsmTemplatePiece::Placeholder { operand_idx, modifier: _, span: _ } => {
@ -719,6 +739,8 @@ impl<'gcc, 'tcx> AsmMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
                        }

                        GlobalAsmOperandRef::SymFn { instance } => {
+                            let function = get_fn(self, instance);
+                            self.add_used_function(function);
                            // TODO(@Amanieu): Additional mangling is needed on
                            // some targets to add a leading underscore (Mach-O)
                            // or byte count suffixes (x86 Windows).
@ -727,6 +749,7 @@ impl<'gcc, 'tcx> AsmMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
                        }

                        GlobalAsmOperandRef::SymStatic { def_id } => {
+                            // TODO(antoyo): set the global variable as used.
                            // TODO(@Amanieu): Additional mangling is needed on
                            // some targets to add a leading underscore (Mach-O).
                            let instance = Instance::mono(self.tcx, def_id);
@ -738,48 +761,51 @@ impl<'gcc, 'tcx> AsmMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
            }
        }

-        let template_str =
-            if att_dialect {
-                format!(".att_syntax\n\t{}\n\t.intel_syntax noprefix", template_str)
-            }
-            else {
-                template_str
-            };
+        if att_dialect {
+            template_str.push_str("\n\t.intel_syntax noprefix");
+        }
        // NOTE: seems like gcc will put the asm in the wrong section, so set it to .text manually.
-        let template_str = format!(".pushsection .text\n{}\n.popsection", template_str);
+        template_str.push_str("\n.popsection");
        self.context.add_top_level_asm(None, &template_str);
    }
 }

 fn modifier_to_gcc(arch: InlineAsmArch, reg: InlineAsmRegClass, modifier: Option<char>) -> Option<char> {
+    // The modifiers can be retrieved from
+    // https://gcc.gnu.org/onlinedocs/gcc/Modifiers.html#Modifiers
    match reg {
        InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::reg) => modifier,
-        InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => modifier,
        InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::vreg)
        | InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::vreg_low16) => {
-            unimplemented!()
+            if modifier == Some('v') { None } else { modifier }
        }
-        InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg)  => unimplemented!(),
+        InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => {
+            unreachable!("clobber-only")
+        }
+        InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => None,
        InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg)
-        | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16) => unimplemented!(),
+        | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16) => None,
        InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg)
        | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low16)
-        | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low8) => unimplemented!(),
+        | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low8) => Some('P'),
        InlineAsmRegClass::Arm(ArmInlineAsmRegClass::qreg)
        | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::qreg_low8)
        | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::qreg_low4) => {
-            unimplemented!()
+            if modifier.is_none() {
+                Some('q')
+            } else {
+                modifier
+            }
        }
-        InlineAsmRegClass::Avr(_) => unimplemented!(),
-        InlineAsmRegClass::Bpf(_) => unimplemented!(),
-        InlineAsmRegClass::Hexagon(_) => unimplemented!(),
-        InlineAsmRegClass::Mips(_) => unimplemented!(),
-        InlineAsmRegClass::Msp430(_) => unimplemented!(),
-        InlineAsmRegClass::Nvptx(_) => unimplemented!(),
-        InlineAsmRegClass::PowerPC(_) => unimplemented!(),
+        InlineAsmRegClass::Hexagon(_) => None,
+        InlineAsmRegClass::Mips(_) => None,
+        InlineAsmRegClass::Nvptx(_) => None,
+        InlineAsmRegClass::PowerPC(_) => None,
        InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::reg)
-        | InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::freg) => unimplemented!(),
-        InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::vreg) => unimplemented!(),
+        | InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::freg) => None,
+        InlineAsmRegClass::RiscV(RiscVInlineAsmRegClass::vreg) => {
+            unreachable!("clobber-only")
+        }
        InlineAsmRegClass::X86(X86InlineAsmRegClass::reg)
        | InlineAsmRegClass::X86(X86InlineAsmRegClass::reg_abcd) => match modifier {
            None => if arch == InlineAsmArch::X86_64 { Some('q') } else { Some('k') },
@ -803,16 +829,29 @@ fn modifier_to_gcc(arch: InlineAsmArch, reg: InlineAsmRegClass, modifier: Option
            _ => unreachable!(),
        },
        InlineAsmRegClass::X86(X86InlineAsmRegClass::kreg) => None,
-        InlineAsmRegClass::X86(X86InlineAsmRegClass::kreg0) => None,
-        InlineAsmRegClass::X86(X86InlineAsmRegClass::x87_reg | X86InlineAsmRegClass::mmx_reg | X86InlineAsmRegClass::tmm_reg) => {
+        InlineAsmRegClass::X86(
+            X86InlineAsmRegClass::x87_reg
+            | X86InlineAsmRegClass::mmx_reg
+            | X86InlineAsmRegClass::kreg0
+            | X86InlineAsmRegClass::tmm_reg,
+        ) => {
            unreachable!("clobber-only")
        }
-        InlineAsmRegClass::Wasm(WasmInlineAsmRegClass::local) => unimplemented!(),
+        InlineAsmRegClass::Wasm(WasmInlineAsmRegClass::local) => None,
+        InlineAsmRegClass::Bpf(_) => None,
+        InlineAsmRegClass::Avr(AvrInlineAsmRegClass::reg_pair)
+        | InlineAsmRegClass::Avr(AvrInlineAsmRegClass::reg_iw)
+        | InlineAsmRegClass::Avr(AvrInlineAsmRegClass::reg_ptr) => match modifier {
+            Some('h') => Some('B'),
+            Some('l') => Some('A'),
+            _ => None,
+        },
+        InlineAsmRegClass::Avr(_) => None,
+        InlineAsmRegClass::S390x(_) => None,
+        InlineAsmRegClass::Msp430(_) => None,
        InlineAsmRegClass::SpirV(SpirVInlineAsmRegClass::reg) => {
            bug!("LLVM backend does not support SPIR-V")
-        },
-        InlineAsmRegClass::S390x(S390xInlineAsmRegClass::reg) => unimplemented!(),
-        InlineAsmRegClass::S390x(S390xInlineAsmRegClass::freg) => unimplemented!(),
+        }
        InlineAsmRegClass::Err => unreachable!(),
    }
 }
--- a/compiler/rustc_codegen_gcc/src/attributes.rs
+++ b/compiler/rustc_codegen_gcc/src/attributes.rs
@ -0,0 +1,112 @@
+#[cfg(feature="master")]
+use gccjit::FnAttribute;
+use gccjit::Function;
+use rustc_attr::InstructionSetAttr;
+use rustc_codegen_ssa::target_features::tied_target_features;
+use rustc_data_structures::fx::FxHashMap;
+use rustc_middle::ty;
+use rustc_session::Session;
+use rustc_span::symbol::sym;
+use smallvec::{smallvec, SmallVec};
+
+use crate::context::CodegenCx;
+
+// Given a map from target_features to whether they are enabled or disabled, ensure only valid
+// combinations are allowed: returns the first tied group whose members disagree, or `None` if every group is consistent.
+pub fn check_tied_features(sess: &Session, features: &FxHashMap<&str, bool>) -> Option<&'static [&'static str]> {
+    for tied in tied_target_features(sess) {
+        // Tied features must be set to the same value, or not set at all
+        let mut tied_iter = tied.iter();
+        let enabled = features.get(tied_iter.next().unwrap()); // State of the group's first feature (`None` if absent from the map); panics on an empty group.
+        if tied_iter.any(|feature| enabled != features.get(feature)) { // Any remaining member in a different state makes the group invalid.
+            return Some(tied);
+        }
+    }
+    None
+}
+
+// Translates the target feature name `s` into the feature name(s) used on the GCC side for the session's architecture. TODO(antoyo): maybe move to a new module gcc_util.
+// To find a list of GCC's names, check https://gcc.gnu.org/onlinedocs/gcc/Function-Attributes.html
+fn to_gcc_features<'a>(sess: &Session, s: &'a str) -> SmallVec<[&'a str; 2]> {
+    let arch = if sess.target.arch == "x86_64" { "x86" } else { &*sess.target.arch }; // "x86_64" shares the "x86" entries below.
+    match (arch, s) {
+        ("x86", "sse4.2") => smallvec!["sse4.2", "crc32"],
+        ("x86", "pclmulqdq") => smallvec!["pclmul"],
+        ("x86", "rdrand") => smallvec!["rdrnd"],
+        ("x86", "bmi1") => smallvec!["bmi"],
+        ("x86", "cmpxchg16b") => smallvec!["cx16"],
+        ("x86", "avx512vaes") => smallvec!["vaes"],
+        ("x86", "avx512gfni") => smallvec!["gfni"],
+        ("x86", "avx512vpclmulqdq") => smallvec!["vpclmulqdq"],
+        // NOTE: seems like GCC requires 'avx512bw' for 'avx512vbmi2'.
+        ("x86", "avx512vbmi2") => smallvec!["avx512vbmi2", "avx512bw"],
+        // NOTE: seems like GCC requires 'avx512bw' for 'avx512bitalg'.
+        ("x86", "avx512bitalg") => smallvec!["avx512bitalg", "avx512bw"],
+        ("aarch64", "rcpc2") => smallvec!["rcpc-immo"],
+        ("aarch64", "dpb") => smallvec!["ccpp"],
+        ("aarch64", "dpb2") => smallvec!["ccdp"],
+        ("aarch64", "frintts") => smallvec!["fptoint"],
+        ("aarch64", "fcma") => smallvec!["complxnum"],
+        ("aarch64", "pmuv3") => smallvec!["perfmon"],
+        ("aarch64", "paca") => smallvec!["pauth"],
+        ("aarch64", "pacg") => smallvec!["pauth"],
+        // Rust ties fp and neon together. In LLVM neon implicitly enables fp,
+        // but we manually enable neon when a feature only implicitly enables fp
+        ("aarch64", "f32mm") => smallvec!["f32mm", "neon"],
+        ("aarch64", "f64mm") => smallvec!["f64mm", "neon"],
+        ("aarch64", "fhm") => smallvec!["fp16fml", "neon"],
+        ("aarch64", "fp16") => smallvec!["fullfp16", "neon"],
+        ("aarch64", "jsconv") => smallvec!["jsconv", "neon"],
+        ("aarch64", "sve") => smallvec!["sve", "neon"],
+        ("aarch64", "sve2") => smallvec!["sve2", "neon"],
+        ("aarch64", "sve2-aes") => smallvec!["sve2-aes", "neon"],
+        ("aarch64", "sve2-sm4") => smallvec!["sve2-sm4", "neon"],
+        ("aarch64", "sve2-sha3") => smallvec!["sve2-sha3", "neon"],
+        ("aarch64", "sve2-bitperm") => smallvec!["sve2-bitperm", "neon"],
+        (_, s) => smallvec![s], // No special case: the name is returned unchanged.
+    }
+}
+
+/// Sets the GCC `target` attribute of `func` from the target features and instruction-set attribute of `instance`,
+/// followed by the global backend features. Emits an error and returns early when tied features are inconsistent.
+pub fn from_fn_attrs<'gcc, 'tcx>(
+    cx: &CodegenCx<'gcc, 'tcx>,
+    #[cfg_attr(not(feature="master"), allow(unused_variables))] // `func` is only used when the `master` feature is enabled.
+    func: Function<'gcc>,
+    instance: ty::Instance<'tcx>,
+) {
+    let codegen_fn_attrs = cx.tcx.codegen_fn_attrs(instance.def_id());
+
+    let function_features =
+        codegen_fn_attrs.target_features.iter().map(|features| features.as_str()).collect::<Vec<&str>>();
+
+    if let Some(features) = check_tied_features(cx.tcx.sess, &function_features.iter().map(|features| (*features, true)).collect()) { // Every feature listed on the function is passed as enabled.
+        let span = cx.tcx
+            .get_attr(instance.def_id(), sym::target_feature)
+            .map_or_else(|| cx.tcx.def_span(instance.def_id()), |a| a.span); // Falls back to the definition's span when there is no `target_feature` attribute.
+        let msg = format!("the target features {} must all be either enabled or disabled together", features.join(", "));
+        let mut err = cx.tcx.sess.struct_span_err(span, &msg);
+        err.help("add the missing features in a `target_feature` attribute");
+        err.emit();
+        return; // No attribute is added to a function with an invalid feature set.
+    }
+
+    let mut function_features = function_features
+        .iter()
+        .flat_map(|feat| to_gcc_features(cx.tcx.sess, feat).into_iter()) // One Rust feature may expand to several names.
+        .chain(codegen_fn_attrs.instruction_set.iter().map(|x| match x {
+            InstructionSetAttr::ArmA32 => "-thumb-mode", // TODO(antoyo): support removing feature.
+            InstructionSetAttr::ArmT32 => "thumb-mode",
+        }))
+        .collect::<Vec<_>>();
+
+    // TODO(antoyo): check if we really need global backend features. (Maybe they could be applied
+    // globally?)
+    let mut global_features = cx.tcx.global_backend_features(()).iter().map(|s| s.as_str());
+    function_features.extend(&mut global_features); // Global features are appended after the function's own.
+    let target_features = function_features.join(","); // Joined with commas into a single attribute string.
+    if !target_features.is_empty() {
+        #[cfg(feature="master")]
+        func.add_attribute(FnAttribute::Target(&target_features));
+    }
+}
--- a/compiler/rustc_codegen_gcc/src/back/write.rs
+++ b/compiler/rustc_codegen_gcc/src/back/write.rs
@ -57,6 +57,7 @@ pub(crate) unsafe fn codegen(cgcx: &CodegenContext<GccCodegenBackend>, _diag_han
                if env::var("CG_GCCJIT_DUMP_TO_FILE").as_deref() == Ok("1") {
                    let _ = fs::create_dir("/tmp/gccjit_dumps");
                    let path = &format!("/tmp/gccjit_dumps/{}.c", module.name);
+                    context.set_debug_info(true);
                    context.dump_to_file(path, true);
                }
                context.compile_to_file(OutputKind::ObjectFile, obj_out.to_str().expect("path to str"));
--- a/compiler/rustc_codegen_gcc/src/base.rs
+++ b/compiler/rustc_codegen_gcc/src/base.rs
@ -8,6 +8,8 @@ use gccjit::{
 };
 use rustc_middle::dep_graph;
 use rustc_middle::ty::TyCtxt;
+#[cfg(feature="master")]
+use rustc_middle::mir::mono::Visibility;
 use rustc_middle::mir::mono::Linkage;
 use rustc_codegen_ssa::{ModuleCodegen, ModuleKind};
 use rustc_codegen_ssa::base::maybe_create_entry_wrapper;
@ -20,6 +22,15 @@ use crate::GccContext;
 use crate::builder::Builder;
 use crate::context::CodegenCx;

+#[cfg(feature="master")] // Only compiled when the `master` feature is enabled.
+pub fn visibility_to_gcc(linkage: Visibility) -> gccjit::Visibility { // Maps a rustc `Visibility` to the gccjit variant of the same name; the parameter holds a visibility despite being named `linkage`.
+    match linkage {
+        Visibility::Default => gccjit::Visibility::Default,
+        Visibility::Hidden => gccjit::Visibility::Hidden,
+        Visibility::Protected => gccjit::Visibility::Protected,
+    }
+}
+
 pub fn global_linkage_to_gcc(linkage: Linkage) -> GlobalKind {
    match linkage {
        Linkage::External => GlobalKind::Imported,
@ -76,16 +87,34 @@ pub fn compile_codegen_unit(tcx: TyCtxt<'_>, cgu_name: Symbol, supports_128bit_i
        // Instantiate monomorphizations without filling out definitions yet...
        //let llvm_module = ModuleLlvm::new(tcx, &cgu_name.as_str());
        let context = Context::default();
+
+        context.add_command_line_option("-fexceptions");
+        context.add_driver_option("-fexceptions");
+
        // TODO(antoyo): only set on x86 platforms.
        context.add_command_line_option("-masm=intel");
        // TODO(antoyo): only add the following cli argument if the feature is supported.
        context.add_command_line_option("-msse2");
        context.add_command_line_option("-mavx2");
-        context.add_command_line_option("-msha");
-        context.add_command_line_option("-mpclmul");
        // FIXME(antoyo): the following causes an illegal instruction on vmovdqu64 in std_example on my CPU.
        // Only add if the CPU supports it.
-        //context.add_command_line_option("-mavx512f");
+        context.add_command_line_option("-msha");
+        context.add_command_line_option("-mpclmul");
+        context.add_command_line_option("-mfma");
+        context.add_command_line_option("-mfma4");
+        context.add_command_line_option("-m64");
+        context.add_command_line_option("-mbmi");
+        context.add_command_line_option("-mgfni");
+        //context.add_command_line_option("-mavxvnni"); // The CI doesn't support this option.
+        context.add_command_line_option("-mf16c");
+        context.add_command_line_option("-maes");
+        context.add_command_line_option("-mxsavec");
+        context.add_command_line_option("-mbmi2");
+        context.add_command_line_option("-mrtm");
+        context.add_command_line_option("-mvaes");
+        context.add_command_line_option("-mvpclmulqdq");
+        context.add_command_line_option("-mavx");
+
        for arg in &tcx.sess.opts.cg.llvm_args {
            context.add_command_line_option(arg);
        }
@ -95,12 +124,20 @@ pub fn compile_codegen_unit(tcx: TyCtxt<'_>, cgu_name: Symbol, supports_128bit_i
        context.add_command_line_option("-fno-semantic-interposition");
        // NOTE: Rust relies on LLVM not doing TBAA (https://github.com/rust-lang/unsafe-code-guidelines/issues/292).
        context.add_command_line_option("-fno-strict-aliasing");
+        // NOTE: Rust relies on LLVM doing wrapping on overflow.
+        context.add_command_line_option("-fwrapv");

        if tcx.sess.opts.unstable_opts.function_sections.unwrap_or(tcx.sess.target.function_sections) {
            context.add_command_line_option("-ffunction-sections");
            context.add_command_line_option("-fdata-sections");
        }

+        if env::var("CG_GCCJIT_DUMP_RTL").as_deref() == Ok("1") {
+            context.add_command_line_option("-fdump-rtl-vregs");
+        }
+        if env::var("CG_GCCJIT_DUMP_TREE_ALL").as_deref() == Ok("1") {
+            context.add_command_line_option("-fdump-tree-all");
+        }
        if env::var("CG_GCCJIT_DUMP_CODE").as_deref() == Ok("1") {
            context.set_dump_code_on_compile(true);
        }
@ -115,7 +152,7 @@ pub fn compile_codegen_unit(tcx: TyCtxt<'_>, cgu_name: Symbol, supports_128bit_i
            context.set_keep_intermediates(true);
        }

-        // TODO(bjorn3): Remove once unwinding is properly implemented
+        // NOTE: The codegen generates unreachable blocks.
        context.set_allow_unreachable_blocks(true);

        {
--- a/compiler/rustc_codegen_gcc/src/builder.rs
+++ b/compiler/rustc_codegen_gcc/src/builder.rs
@ -217,7 +217,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {

                let actual_ty = actual_val.get_type();
                if expected_ty != actual_ty {
-                    if !actual_ty.is_vector() && !expected_ty.is_vector() && actual_ty.is_integral() && expected_ty.is_integral() && actual_ty.get_size() != expected_ty.get_size() {
+                    if !actual_ty.is_vector() && !expected_ty.is_vector() && (actual_ty.is_integral() && expected_ty.is_integral()) || (actual_ty.get_pointee().is_some() && expected_ty.get_pointee().is_some()) {
                        self.context.new_cast(None, actual_val, expected_ty)
                    }
                    else if on_stack_param_indices.contains(&index) {
@ -226,6 +226,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                    else {
                        assert!(!((actual_ty.is_vector() && !expected_ty.is_vector()) || (!actual_ty.is_vector() && expected_ty.is_vector())), "{:?} ({}) -> {:?} ({}), index: {:?}[{}]", actual_ty, actual_ty.is_vector(), expected_ty, expected_ty.is_vector(), func_ptr, index);
                        // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
+                        // TODO: remove bitcast now that vector types can be compared?
                        self.bitcast(actual_val, expected_ty)
                    }
                }
@ -279,21 +280,30 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
    }

    fn function_ptr_call(&mut self, func_ptr: RValue<'gcc>, args: &[RValue<'gcc>], _funclet: Option<&Funclet>) -> RValue<'gcc> {
-        let args = self.check_ptr_call("call", func_ptr, args);
+        let gcc_func = func_ptr.get_type().dyncast_function_ptr_type().expect("function ptr");
+        let func_name = format!("{:?}", func_ptr);
+        let previous_arg_count = args.len();
+        let orig_args = args;
+        let args = {
+            let function_address_names = self.function_address_names.borrow();
+            let original_function_name = function_address_names.get(&func_ptr);
+            llvm::adjust_intrinsic_arguments(&self, gcc_func, args.into(), &func_name, original_function_name)
+        };
+        let args_adjusted = args.len() != previous_arg_count;
+        let args = self.check_ptr_call("call", func_ptr, &*args);

        // gccjit requires to use the result of functions, even when it's not used.
        // That's why we assign the result to a local or call add_eval().
-        let gcc_func = func_ptr.get_type().dyncast_function_ptr_type().expect("function ptr");
        let return_type = gcc_func.get_return_type();
        let void_type = self.context.new_type::<()>();
        let current_func = self.block.get_function();

        if return_type != void_type {
            unsafe { RETURN_VALUE_COUNT += 1 };
-            let result = current_func.new_local(None, return_type, &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT }));
-            let func_name = format!("{:?}", func_ptr);
-            let args = llvm::adjust_intrinsic_arguments(&self, gcc_func, args, &func_name);
-            self.block.add_assignment(None, result, self.cx.context.new_call_through_ptr(None, func_ptr, &args));
+            let return_value = self.cx.context.new_call_through_ptr(None, func_ptr, &args);
+            let return_value = llvm::adjust_intrinsic_return_value(&self, return_value, &func_name, &args, args_adjusted, orig_args);
+            let result = current_func.new_local(None, return_value.get_type(), &format!("ptrReturnValue{}", unsafe { RETURN_VALUE_COUNT }));
+            self.block.add_assignment(None, result, return_value);
            result.to_rvalue()
        }
        else {
@ -366,10 +376,10 @@ impl<'tcx> FnAbiOfHelpers<'tcx> for Builder<'_, '_, 'tcx> {
    }
 }

-impl<'gcc, 'tcx> Deref for Builder<'_, 'gcc, 'tcx> {
+impl<'a, 'gcc, 'tcx> Deref for Builder<'a, 'gcc, 'tcx> {
    type Target = CodegenCx<'gcc, 'tcx>;

-    fn deref(&self) -> &Self::Target {
+    fn deref<'b>(&'b self) -> &'a Self::Target {
        self.cx
    }
 }
@ -387,7 +397,7 @@ impl<'gcc, 'tcx> BackendTypes for Builder<'_, 'gcc, 'tcx> {
 }

 impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
-    fn build(cx: &'a CodegenCx<'gcc, 'tcx>, block: Block<'gcc>) -> Self {
+    fn build(cx: &'a CodegenCx<'gcc, 'tcx>, block: Block<'gcc>) -> Builder<'a, 'gcc, 'tcx> {
        Builder::with_cx(cx, block)
    }

@ -444,17 +454,36 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
        self.block.end_with_switch(None, value, default_block, &gcc_cases);
    }

-    fn invoke(
-        &mut self,
-        typ: Type<'gcc>,
-        fn_abi: Option<&FnAbi<'tcx, Ty<'tcx>>>,
-        func: RValue<'gcc>,
-        args: &[RValue<'gcc>],
-        then: Block<'gcc>,
-        catch: Block<'gcc>,
-        _funclet: Option<&Funclet>,
-    ) -> RValue<'gcc> {
-        // TODO(bjorn3): Properly implement unwinding.
+    #[cfg(feature="master")] // Emits the call in a separate "try" block, registers it on the current block with `catch` as handler, and returns the local holding the call result.
+    fn invoke(&mut self, typ: Type<'gcc>, _fn_abi: Option<&FnAbi<'tcx, Ty<'tcx>>>, func: RValue<'gcc>, args: &[RValue<'gcc>], then: Block<'gcc>, catch: Block<'gcc>, _funclet: Option<&Funclet>) -> RValue<'gcc> {
+        let try_block = self.current_func().new_block("try");
+
+        let current_block = self.block.clone(); // Saved so the builder can be pointed back at it after the call is built.
+        self.block = try_block; // Presumably makes `self.call` emit into the try block — confirm against `call`.
+        let call = self.call(typ, None, func, args, None); // TODO(antoyo): use funclet here?
+        self.block = current_block;
+
+        let return_value = self.current_func()
+            .new_local(None, call.get_type(), "invokeResult"); // Function-level local that receives the call result.
+
+        try_block.add_assignment(None, return_value, call);
+
+        try_block.end_with_jump(None, then); // The try block continues to `then` after the assignment.
+
+        if self.cleanup_blocks.borrow().contains(&catch) { // `cleanup_landing_pad` records cleanup blocks in this set.
+            self.block.add_try_finally(None, try_block, catch);
+        }
+        else {
+            self.block.add_try_catch(None, try_block, catch);
+        }
+
+        self.block.end_with_jump(None, then); // The original block is also terminated with a jump to `then`.
+
+        return_value.to_rvalue()
+    }
+
+    #[cfg(not(feature="master"))]
+    fn invoke(&mut self, typ: Type<'gcc>, fn_abi: Option<&FnAbi<'tcx, Ty<'tcx>>>, func: RValue<'gcc>, args: &[RValue<'gcc>], then: Block<'gcc>, catch: Block<'gcc>, _funclet: Option<&Funclet>) -> RValue<'gcc> {
        let call_site = self.call(typ, None, func, args, None);
        let condition = self.context.new_rvalue_from_int(self.bool_type, 1);
        self.llbb().end_with_conditional(None, condition, then, catch);
@ -542,6 +571,31 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
    }

    fn frem(&mut self, a: RValue<'gcc>, b: RValue<'gcc>) -> RValue<'gcc> {
+        // TODO(antoyo): add check in libgccjit since using the binary operator % causes the following error:
+        // during RTL pass: expand
+        // libgccjit.so: error: in expmed_mode_index, at expmed.h:240
+        // 0x7f0101d58dc6 expmed_mode_index
+        //     ../../../gcc/gcc/expmed.h:240
+        // 0x7f0101d58e35 expmed_op_cost_ptr
+        //     ../../../gcc/gcc/expmed.h:262
+        // 0x7f0101d594a1 sdiv_cost_ptr
+        //     ../../../gcc/gcc/expmed.h:531
+        // 0x7f0101d594f3 sdiv_cost
+        //     ../../../gcc/gcc/expmed.h:549
+        // 0x7f0101d6af7e expand_divmod(int, tree_code, machine_mode, rtx_def*, rtx_def*, rtx_def*, int, optab_methods)
+        //     ../../../gcc/gcc/expmed.cc:4356
+        // 0x7f0101d94f9e expand_expr_divmod
+        //     ../../../gcc/gcc/expr.cc:8929
+        // 0x7f0101d97a26 expand_expr_real_2(separate_ops*, rtx_def*, machine_mode, expand_modifier)
+        //     ../../../gcc/gcc/expr.cc:9566
+        // 0x7f0101bef6ef expand_gimple_stmt_1
+        //     ../../../gcc/gcc/cfgexpand.cc:3967
+        // 0x7f0101bef910 expand_gimple_stmt
+        //     ../../../gcc/gcc/cfgexpand.cc:4028
+        // 0x7f0101bf6ee7 expand_gimple_basic_block
+        //     ../../../gcc/gcc/cfgexpand.cc:6069
+        // 0x7f0101bf9194 execute
+        //     ../../../gcc/gcc/cfgexpand.cc:6795
        if a.get_type().is_compatible_with(self.cx.float_type) {
            let fmodf = self.context.get_builtin_function("fmodf");
            // FIXME(antoyo): this seems to produce the wrong result.
@ -616,24 +670,29 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
        a * b
    }

-    fn fadd_fast(&mut self, _lhs: RValue<'gcc>, _rhs: RValue<'gcc>) -> RValue<'gcc> {
-        unimplemented!();
+    fn fadd_fast(&mut self, lhs: RValue<'gcc>, rhs: RValue<'gcc>) -> RValue<'gcc> {
+        // NOTE: it seems like we cannot enable fast-mode for a single operation in GCC.
+        lhs + rhs
    }

-    fn fsub_fast(&mut self, _lhs: RValue<'gcc>, _rhs: RValue<'gcc>) -> RValue<'gcc> {
-        unimplemented!();
+    fn fsub_fast(&mut self, lhs: RValue<'gcc>, rhs: RValue<'gcc>) -> RValue<'gcc> {
+        // NOTE: it seems like we cannot enable fast-mode for a single operation in GCC.
+        lhs - rhs
    }

-    fn fmul_fast(&mut self, _lhs: RValue<'gcc>, _rhs: RValue<'gcc>) -> RValue<'gcc> {
-        unimplemented!();
+    fn fmul_fast(&mut self, lhs: RValue<'gcc>, rhs: RValue<'gcc>) -> RValue<'gcc> {
+        // NOTE: it seems like we cannot enable fast-mode for a single operation in GCC.
+        lhs * rhs
    }

-    fn fdiv_fast(&mut self, _lhs: RValue<'gcc>, _rhs: RValue<'gcc>) -> RValue<'gcc> {
-        unimplemented!();
+    fn fdiv_fast(&mut self, lhs: RValue<'gcc>, rhs: RValue<'gcc>) -> RValue<'gcc> {
+        // NOTE: it seems like we cannot enable fast-mode for a single operation in GCC.
+        lhs / rhs
    }

-    fn frem_fast(&mut self, _lhs: RValue<'gcc>, _rhs: RValue<'gcc>) -> RValue<'gcc> {
-        unimplemented!();
+    fn frem_fast(&mut self, lhs: RValue<'gcc>, rhs: RValue<'gcc>) -> RValue<'gcc> {
+        // NOTE: it seems like we cannot enable fast-mode for a single operation in GCC.
+        self.frem(lhs, rhs)
    }

    fn checked_binop(&mut self, oop: OverflowOp, typ: Ty<'_>, lhs: Self::Value, rhs: Self::Value) -> (Self::Value, Self::Value) {
@ -722,7 +781,7 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
            }
            else if place.layout.is_gcc_immediate() {
                let load = self.load(
-                    place.layout.gcc_type(self, false),
+                    place.layout.gcc_type(self),
                    place.llval,
                    place.align,
                );
@ -733,7 +792,7 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
            }
            else if let abi::Abi::ScalarPair(ref a, ref b) = place.layout.abi {
                let b_offset = a.size(self).align_to(b.align(self).abi);
-                let pair_type = place.layout.gcc_type(self, false);
+                let pair_type = place.layout.gcc_type(self);

                let mut load = |i, scalar: &abi::Scalar, align| {
                    let llptr = self.struct_gep(pair_type, place.llval, i as u64);
@ -833,26 +892,31 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
    }

    fn gep(&mut self, _typ: Type<'gcc>, ptr: RValue<'gcc>, indices: &[RValue<'gcc>]) -> RValue<'gcc> {
-        let mut result = ptr;
+        let ptr_type = ptr.get_type();
+        let mut pointee_type = ptr.get_type();
+        // NOTE: we cannot use array indexing here like in inbounds_gep because array indexing is
+        // always considered in bounds in GCC (TODO(antoyo): to be verified).
+        // So, we have to cast to a number.
+        let mut result = self.context.new_bitcast(None, ptr, self.sizet_type);
+        // FIXME(antoyo): if there were more than 1 index, this code is probably wrong and would
+        // require dereferencing the pointer.
        for index in indices {
-            result = self.context.new_array_access(None, result, *index).get_address(None).to_rvalue();
+            pointee_type = pointee_type.get_pointee().expect("pointee type");
+            let pointee_size = self.context.new_rvalue_from_int(index.get_type(), pointee_type.get_size() as i32);
+            result = result + self.gcc_int_cast(*index * pointee_size, self.sizet_type);
        }
-        result
+        self.context.new_bitcast(None, result, ptr_type)
    }

    fn inbounds_gep(&mut self, _typ: Type<'gcc>, ptr: RValue<'gcc>, indices: &[RValue<'gcc>]) -> RValue<'gcc> {
-        // FIXME(antoyo): would be safer if doing the same thing (loop) as gep.
-        // TODO(antoyo): specify inbounds somehow.
-        match indices.len() {
-            1 => {
-                self.context.new_array_access(None, ptr, indices[0]).get_address(None)
-            },
-            2 => {
-                let array = ptr.dereference(None); // TODO(antoyo): assert that first index is 0?
-                self.context.new_array_access(None, array, indices[1]).get_address(None)
-            },
-            _ => unimplemented!(),
+        // NOTE: array indexing is always considered in bounds in GCC (TODO(antoyo): to be verified).
+        let mut indices = indices.into_iter();
+        let index = indices.next().expect("first index in inbounds_gep");
+        let mut result = self.context.new_array_access(None, ptr, *index);
+        for index in indices {
+            result = self.context.new_array_access(None, result, *index);
        }
+        result.get_address(None)
    }

    fn struct_gep(&mut self, value_type: Type<'gcc>, ptr: RValue<'gcc>, idx: u64) -> RValue<'gcc> {
@ -1034,8 +1098,19 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
        unimplemented!();
    }

-    fn extract_element(&mut self, _vec: RValue<'gcc>, _idx: RValue<'gcc>) -> RValue<'gcc> {
-        unimplemented!();
+    #[cfg(feature="master")] // Variant used when the `master` feature is enabled.
+    fn extract_element(&mut self, vec: RValue<'gcc>, idx: RValue<'gcc>) -> RValue<'gcc> {
+        self.context.new_vector_access(None, vec, idx).to_rvalue() // Reads element `idx` of `vec` through a vector access.
+    }
+
+    #[cfg(not(feature="master"))] // Fallback when the `master` feature is off: index the vector through an array bitcast.
+    fn extract_element(&mut self, vec: RValue<'gcc>, idx: RValue<'gcc>) -> RValue<'gcc> {
+        let vector_type = vec.get_type().unqualified().dyncast_vector().expect("Called extract_element on a non-vector type"); // Panics when `vec` is not of a vector type.
+        let element_type = vector_type.get_element_type();
+        let vec_num_units = vector_type.get_num_units();
+        let array_type = self.context.new_array_type(None, element_type, vec_num_units as u64); // Array with the same element type and element count as the vector.
+        let array = self.context.new_bitcast(None, vec, array_type).to_rvalue();
+        self.context.new_array_access(None, array, idx).to_rvalue() // Element `idx` of the reinterpreted vector.
+    }

    fn vector_splat(&mut self, _num_elts: usize, _elt: RValue<'gcc>) -> RValue<'gcc> {
@ -1116,22 +1191,52 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
    }

    fn set_personality_fn(&mut self, _personality: RValue<'gcc>) {
-        // TODO(antoyo)
+        #[cfg(feature="master")]
+        {
+            let personality = self.rvalue_as_function(_personality);
+            self.current_func().set_personality_function(personality);
+        }
    }

+    #[cfg(feature="master")] // Sets the personality function, records the current block as a cleanup block, and returns (exception pointer cast to a u8 pointer, zero placeholder).
+    fn cleanup_landing_pad(&mut self, pers_fn: RValue<'gcc>) -> (RValue<'gcc>, RValue<'gcc>) {
+        self.set_personality_fn(pers_fn);
+
+        // NOTE: insert the current block in a variable so that a later call to invoke knows to
+        // generate a try/finally instead of a try/catch for this block.
+        self.cleanup_blocks.borrow_mut().insert(self.block);
+
+        let eh_pointer_builtin = self.cx.context.get_target_builtin_function("__builtin_eh_pointer");
+        let zero = self.cx.context.new_rvalue_zero(self.int_type);
+        let ptr = self.cx.context.new_call(None, eh_pointer_builtin, &[zero]); // Calls the builtin with 0 as its only argument.
+
+        let value1_type = self.u8_type.make_pointer();
+        let ptr = self.cx.context.new_cast(None, ptr, value1_type); // The builtin's result is cast to a pointer to u8.
+        let value1 = ptr;
+        let value2 = zero; // TODO(antoyo): set the proper value here (the type of exception?).
+
+        (value1, value2)
+    }
+
+    #[cfg(not(feature="master"))]
    fn cleanup_landing_pad(&mut self, _pers_fn: RValue<'gcc>) -> (RValue<'gcc>, RValue<'gcc>) {
-        (
-            self.current_func().new_local(None, self.u8_type.make_pointer(), "landing_pad0")
-                .to_rvalue(),
-            self.current_func().new_local(None, self.i32_type, "landing_pad1").to_rvalue(),
-        )
-        // TODO(antoyo): Properly implement unwinding.
-        // the above is just to make the compilation work as it seems
-        // rustc_codegen_ssa now calls the unwinding builder methods even on panic=abort.
+        let value1 = self.current_func().new_local(None, self.u8_type.make_pointer(), "landing_pad0")
+                .to_rvalue();
+        let value2 = self.current_func().new_local(None, self.i32_type, "landing_pad1").to_rvalue();
+        (value1, value2)
    }

+    #[cfg(feature="master")] // Calls `__builtin_unwind_resume` with `exn0` and then ends the block via `self.unreachable()`; `_exn1` is unused.
+    fn resume(&mut self, exn0: RValue<'gcc>, _exn1: RValue<'gcc>) {
+        let exn_type = exn0.get_type();
+        let exn = self.context.new_cast(None, exn0, exn_type); // NOTE(review): the cast target is `exn0`'s own type, so this looks like a no-op — confirm.
+        let unwind_resume = self.context.get_target_builtin_function("__builtin_unwind_resume");
+        self.llbb().add_eval(None, self.context.new_call(None, unwind_resume, &[exn])); // The call result is evaluated and discarded.
+        self.unreachable();
+    }
+
+    #[cfg(not(feature="master"))]
    fn resume(&mut self, _exn0: RValue<'gcc>, _exn1: RValue<'gcc>) {
-        // TODO(bjorn3): Properly implement unwinding.
        self.unreachable();
    }

@ -1160,6 +1265,15 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> {
    fn atomic_cmpxchg(&mut self, dst: RValue<'gcc>, cmp: RValue<'gcc>, src: RValue<'gcc>, order: AtomicOrdering, failure_order: AtomicOrdering, weak: bool) -> RValue<'gcc> {
        let expected = self.current_func().new_local(None, cmp.get_type(), "expected");
        self.llbb().add_assignment(None, expected, cmp);
+        // NOTE: gcc doesn't support a failure memory model that is stronger than the success
+        // memory model.
+        let order =
+            if failure_order as i32 > order as i32 {
+                failure_order
+            }
+            else {
+                order
+            };
        let success = self.compare_exchange(dst, expected, src, order, failure_order, weak);

        let pair_type = self.cx.type_struct(&[src.get_type(), self.bool_type], false);
@ -1469,7 +1583,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {

    #[cfg(feature="master")]
    pub fn shuffle_vector(&mut self, v1: RValue<'gcc>, v2: RValue<'gcc>, mask: RValue<'gcc>) -> RValue<'gcc> {
-        let struct_type = mask.get_type().is_struct().expect("mask of struct type");
+        let struct_type = mask.get_type().is_struct().expect("mask should be of struct type");

        // TODO(antoyo): use a recursive unqualified() here.
        let vector_type = v1.get_type().unqualified().dyncast_vector().expect("vector type");
@ -1501,22 +1615,17 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
            vector_elements.push(self.context.new_rvalue_zero(mask_element_type));
        }

-        let array_type = self.context.new_array_type(None, element_type, vec_num_units as i32);
        let result_type = self.context.new_vector_type(element_type, mask_num_units as u64);
        let (v1, v2) =
            if vec_num_units < mask_num_units {
                // NOTE: the mask needs to be the same length as the input vectors, so join the 2
                // vectors and create a dummy second vector.
-                // TODO(antoyo): switch to using new_vector_access.
-                let array = self.context.new_bitcast(None, v1, array_type);
                let mut elements = vec![];
                for i in 0..vec_num_units {
-                    elements.push(self.context.new_array_access(None, array, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
+                    elements.push(self.context.new_vector_access(None, v1, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
                }
-                // TODO(antoyo): switch to using new_vector_access.
-                let array = self.context.new_bitcast(None, v2, array_type);
                for i in 0..(mask_num_units - vec_num_units) {
-                    elements.push(self.context.new_array_access(None, array, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
+                    elements.push(self.context.new_vector_access(None, v2, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
                }
                let v1 = self.context.new_rvalue_from_vector(None, result_type, &elements);
                let zero = self.context.new_rvalue_zero(element_type);
@ -1536,10 +1645,8 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
            // NOTE: if padding was added, only select the number of elements of the masks to
            // remove that padding in the result.
            let mut elements = vec![];
-            // TODO(antoyo): switch to using new_vector_access.
-            let array = self.context.new_bitcast(None, result, array_type);
            for i in 0..mask_num_units {
-                elements.push(self.context.new_array_access(None, array, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
+                elements.push(self.context.new_vector_access(None, result, self.context.new_rvalue_from_int(self.int_type, i as i32)).to_rvalue());
            }
            self.context.new_rvalue_from_vector(None, result_type, &elements)
        }
@ -1558,18 +1665,20 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
    where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc>
    {
        let vector_type = src.get_type().unqualified().dyncast_vector().expect("vector type");
+        let element_type = vector_type.get_element_type();
+        let mask_element_type = self.type_ix(element_type.get_size() as u64 * 8);
        let element_count = vector_type.get_num_units();
        let mut vector_elements = vec![];
        for i in 0..element_count {
            vector_elements.push(i);
        }
-        let mask_type = self.context.new_vector_type(self.int_type, element_count as u64);
+        let mask_type = self.context.new_vector_type(mask_element_type, element_count as u64);
        let mut shift = 1;
        let mut res = src;
        while shift < element_count {
            let vector_elements: Vec<_> =
                vector_elements.iter()
-                    .map(|i| self.context.new_rvalue_from_int(self.int_type, ((i + shift) % element_count) as i32))
+                    .map(|i| self.context.new_rvalue_from_int(mask_element_type, ((i + shift) % element_count) as i32))
                    .collect();
            let mask = self.context.new_rvalue_from_vector(None, mask_type, &vector_elements);
            let shifted = self.context.new_rvalue_vector_perm(None, res, res, mask);
@ -1581,7 +1690,7 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
    }

    #[cfg(not(feature="master"))]
-    pub fn vector_reduce<F>(&mut self, src: RValue<'gcc>, op: F) -> RValue<'gcc>
+    pub fn vector_reduce<F>(&mut self, _src: RValue<'gcc>, _op: F) -> RValue<'gcc>
    where F: Fn(RValue<'gcc>, RValue<'gcc>, &'gcc Context<'gcc>) -> RValue<'gcc>
    {
        unimplemented!();
@ -1595,15 +1704,47 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
        unimplemented!();
    }

+    /// Add-reduces the lanes of the vector `src` into `acc`.
+    ///
+    /// The lanes are read one by one with `new_vector_access` and added to the accumulator in
+    /// lane order (lane 0 first), so the result is `acc + src[0] + src[1] + ...`.
+    ///
+    /// Panics if the type of `src` is not a vector type.
+    #[cfg(feature="master")]
+    pub fn vector_reduce_fadd(&mut self, acc: RValue<'gcc>, src: RValue<'gcc>) -> RValue<'gcc> {
+        let vector_type = src.get_type().unqualified().dyncast_vector().expect("vector type");
+        let element_count = vector_type.get_num_units();
+        // NOTE: a range already is an iterator, no `.into_iter()` conversion is needed.
+        (0..element_count)
+            .map(|i| self.context
+                .new_vector_access(None, src, self.context.new_rvalue_from_int(self.int_type, i as _))
+                .to_rvalue())
+            .fold(acc, |x, i| x + i)
+    }
+
+    /// Fallback used without the `master` feature: not implemented, calling it panics.
+    #[cfg(not(feature="master"))]
+    pub fn vector_reduce_fadd(&mut self, _acc: RValue<'gcc>, _src: RValue<'gcc>) -> RValue<'gcc> {
+        unimplemented!();
+    }
+
    pub fn vector_reduce_fmul_fast(&mut self, _acc: RValue<'gcc>, _src: RValue<'gcc>) -> RValue<'gcc> {
+        // Not implemented yet: both arguments are ignored and calling this panics.
        unimplemented!();
    }

+    /// Multiply-reduces the lanes of the vector `src` into `acc`.
+    ///
+    /// The lanes are read one by one with `new_vector_access` and multiplied into the accumulator
+    /// in lane order (lane 0 first), so the result is `acc * src[0] * src[1] * ...`.
+    ///
+    /// Panics if the type of `src` is not a vector type.
+    #[cfg(feature="master")]
+    pub fn vector_reduce_fmul(&mut self, acc: RValue<'gcc>, src: RValue<'gcc>) -> RValue<'gcc> {
+        let vector_type = src.get_type().unqualified().dyncast_vector().expect("vector type");
+        let element_count = vector_type.get_num_units();
+        // NOTE: a range already is an iterator, no `.into_iter()` conversion is needed.
+        (0..element_count)
+            .map(|i| self.context
+                .new_vector_access(None, src, self.context.new_rvalue_from_int(self.int_type, i as _))
+                .to_rvalue())
+            .fold(acc, |x, i| x * i)
+    }
+
+    /// Fallback used without the `master` feature: not implemented, calling it panics.
+    #[cfg(not(feature="master"))]
+    pub fn vector_reduce_fmul(&mut self, _acc: RValue<'gcc>, _src: RValue<'gcc>) -> RValue<'gcc> {
+        unimplemented!();
+    }
+
    // Inspired by Hacker's Delight min implementation.
    pub fn vector_reduce_min(&mut self, src: RValue<'gcc>) -> RValue<'gcc> {
        self.vector_reduce(src, |a, b, context| {
+            // `difference_or_zero(a, b)` keeps `a - b` in the lanes where `b >= a` and is expected
+            // to be 0 elsewhere (assumes the vector comparison yields all-ones/zero lanes).
+            // Adding it to `b` therefore gives `a` where `a <= b` and `b` otherwise: the minimum.
            let differences_or_zeros = difference_or_zero(a, b, context);
-            context.new_binary_op(None, BinaryOp::Minus, a.get_type(), a, differences_or_zeros)
+            context.new_binary_op(None, BinaryOp::Plus, b.get_type(), b, differences_or_zeros)
        })
    }

@ -1611,38 +1752,148 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
    pub fn vector_reduce_max(&mut self, src: RValue<'gcc>) -> RValue<'gcc> {
        self.vector_reduce(src, |a, b, context| {
+            // `difference_or_zero(a, b)` keeps `a - b` in the lanes where `b >= a` and is expected
+            // to be 0 elsewhere (assumes the vector comparison yields all-ones/zero lanes).
+            // Subtracting it from `a` therefore gives `b` where `b >= a` and `a` otherwise: the maximum.
            let differences_or_zeros = difference_or_zero(a, b, context);
-            context.new_binary_op(None, BinaryOp::Plus, b.get_type(), b, differences_or_zeros)
+            context.new_binary_op(None, BinaryOp::Minus, a.get_type(), a, differences_or_zeros)
        })
    }

+    /// Lane-wise minimum or maximum (selected by `direction`) of the vectors `a` and `b`.
+    ///
+    /// NaN lanes of `b` are first replaced by the matching lanes of `a`, so that the comparison
+    /// and the final blend work on numeric values wherever `a` provides one.
+    fn vector_extremum(&mut self, a: RValue<'gcc>, b: RValue<'gcc>, direction: ExtremumOperation) -> RValue<'gcc> {
+        let value_type = a.get_type();
+
+        // A lane of `b` is NaN when it compares unequal to itself.
+        let b_is_nan = self.context.new_comparison(None, ComparisonOp::NotEquals, b, b);
+        let bits_type = b_is_nan.get_type();
+        let b_is_number = self.context.new_unary_op(None, UnaryOp::BitwiseNegate, bits_type, b_is_nan);
+        // The blends below are bitwise, so reinterpret both inputs with the type of the mask.
+        let a_bits = self.context.new_bitcast(None, a, bits_type);
+        let b_bits = self.context.new_bitcast(None, b, bits_type);
+        let patched_b_bits = (b_is_nan & a_bits) | (b_is_number & b_bits);
+        let patched_b = self.context.new_bitcast(None, patched_b_bits, value_type);
+
+        // Take `a` in the lanes where it wins the comparison and the patched `b` everywhere else.
+        let a_wins = self.context.new_comparison(
+            None,
+            match direction {
+                ExtremumOperation::Min => ComparisonOp::LessThan,
+                ExtremumOperation::Max => ComparisonOp::GreaterThan,
+            },
+            a,
+            patched_b,
+        );
+        let a_loses = self.context.new_unary_op(None, UnaryOp::BitwiseNegate, a_wins.get_type(), a_wins);
+        let blended_bits = (a_wins & a_bits) | (a_loses & patched_b_bits);
+        self.context.new_bitcast(None, blended_bits, value_type)
+    }
+
+    /// Lane-wise minimum of the vectors `a` and `b`.
+    /// Delegates to `vector_extremum` with `ExtremumOperation::Min`.
+    pub fn vector_fmin(&mut self, a: RValue<'gcc>, b: RValue<'gcc>) -> RValue<'gcc> {
+        self.vector_extremum(a, b, ExtremumOperation::Min)
+    }
+
+    /// Reduces the vector `src` to a single value by keeping the smaller lane at each step.
+    ///
+    /// Starts from lane 0 and folds the remaining lanes in order; each step compares the running
+    /// value against the next lane with `LessThan` and picks one of the two through `select`.
+    ///
+    /// Panics if the type of `src` is not a vector type.
+    #[cfg(feature="master")]
+    pub fn vector_reduce_fmin(&mut self, src: RValue<'gcc>) -> RValue<'gcc> {
+        let lane_count = src.get_type().unqualified().dyncast_vector().expect("vector type").get_num_units();
+        let first_lane = self.context.new_vector_access(None, src, self.context.new_rvalue_zero(self.int_type)).to_rvalue();
+        (1..lane_count).fold(first_lane, |smallest, lane| {
+            let index = self.context.new_rvalue_from_int(self.int_type, lane as _);
+            let candidate = self.context.new_vector_access(None, src, index).to_rvalue();
+            // Keep the running value only when it is strictly smaller than the candidate.
+            let keep_current = self.context.new_comparison(None, ComparisonOp::LessThan, smallest, candidate);
+            self.select(keep_current, smallest, candidate)
+        })
+    }
+
+    /// Fallback used without the `master` feature: not implemented, calling it panics.
+    #[cfg(not(feature="master"))]
+    pub fn vector_reduce_fmin(&mut self, _src: RValue<'gcc>) -> RValue<'gcc> {
+        unimplemented!();
+    }
+
+    /// Lane-wise maximum of the vectors `a` and `b`.
+    /// Delegates to `vector_extremum` with `ExtremumOperation::Max`.
+    pub fn vector_fmax(&mut self, a: RValue<'gcc>, b: RValue<'gcc>) -> RValue<'gcc> {
+        self.vector_extremum(a, b, ExtremumOperation::Max)
+    }
+
+    /// Reduces the vector `src` to a single value by keeping the larger lane at each step.
+    ///
+    /// Starts from lane 0 and folds the remaining lanes in order; each step compares the running
+    /// value against the next lane with `GreaterThan` and picks one of the two through `select`.
+    ///
+    /// Panics if the type of `src` is not a vector type.
+    #[cfg(feature="master")]
+    pub fn vector_reduce_fmax(&mut self, src: RValue<'gcc>) -> RValue<'gcc> {
+        let lane_count = src.get_type().unqualified().dyncast_vector().expect("vector type").get_num_units();
+        let first_lane = self.context.new_vector_access(None, src, self.context.new_rvalue_zero(self.int_type)).to_rvalue();
+        (1..lane_count).fold(first_lane, |largest, lane| {
+            let index = self.context.new_rvalue_from_int(self.int_type, lane as _);
+            let candidate = self.context.new_vector_access(None, src, index).to_rvalue();
+            // Keep the running value only when it is strictly greater than the candidate.
+            let keep_current = self.context.new_comparison(None, ComparisonOp::GreaterThan, largest, candidate);
+            self.select(keep_current, largest, candidate)
+        })
+    }
+
+    /// Fallback used without the `master` feature: not implemented, calling it panics.
+    #[cfg(not(feature="master"))]
+    pub fn vector_reduce_fmax(&mut self, _src: RValue<'gcc>) -> RValue<'gcc> {
+        unimplemented!();
+    }
+
    pub fn vector_select(&mut self, cond: RValue<'gcc>, then_val: RValue<'gcc>, else_val: RValue<'gcc>) -> RValue<'gcc> {
        // cond is a vector of integers, not of bools.
+        // The result is built as `(masks & then_val) | (!masks & else_val)` where `masks` is the
+        // `cond != 0` comparison (assumes that comparison yields all-ones/zero lanes).
-        let cond_type = cond.get_type();
-        let vector_type = cond_type.unqualified().dyncast_vector().expect("vector type");
+        let vector_type = cond.get_type().unqualified().dyncast_vector().expect("vector type");
        let num_units = vector_type.get_num_units();
        let element_type = vector_type.get_element_type();
+
+        #[cfg(feature="master")]
+        let (cond, element_type) = {
+            let then_val_vector_type = then_val.get_type().dyncast_vector().expect("vector type");
+            let then_val_element_type = then_val_vector_type.get_element_type();
+            let then_val_element_size = then_val_element_type.get_size();
+
+            // NOTE: the mask needs to be of the same size as the other arguments in order for the &
+            // operation to work.
+            if then_val_element_size != element_type.get_size() {
+                let new_element_type = self.type_ix(then_val_element_size as u64 * 8);
+                let new_vector_type = self.context.new_vector_type(new_element_type, num_units as u64);
+                let cond = self.context.convert_vector(None, cond, new_vector_type);
+                (cond, new_element_type)
+            }
+            else {
+                (cond, element_type)
+            }
+        };
+
+        let cond_type = cond.get_type();
+
        let zeros = vec![self.context.new_rvalue_zero(element_type); num_units];
        let zeros = self.context.new_rvalue_from_vector(None, cond_type, &zeros);

+        let result_type = then_val.get_type();
+
        let masks = self.context.new_comparison(None, ComparisonOp::NotEquals, cond, zeros);
+        // NOTE: masks is a vector of integers, but the values can be vectors of floats, so use bitcast to make
+        // the & operation work.
+        let then_val = self.bitcast_if_needed(then_val, masks.get_type());
        let then_vals = masks & then_val;

-        let ones = vec![self.context.new_rvalue_one(element_type); num_units];
-        let ones = self.context.new_rvalue_from_vector(None, cond_type, &ones);
-        let inverted_masks = masks + ones;
+        let minus_ones = vec![self.context.new_rvalue_from_int(element_type, -1); num_units];
+        let minus_ones = self.context.new_rvalue_from_vector(None, cond_type, &minus_ones);
+        // XOR with -1 in every lane flips every bit of the masks.
+        let inverted_masks = masks ^ minus_ones;
        // NOTE: sometimes, the type of else_val can be different than the type of then_val in
        // libgccjit (vector of int vs vector of int32_t), but they should be the same for the AND
        // operation to work.
+        // TODO: remove bitcast now that vector types can be compared?
        let else_val = self.context.new_bitcast(None, else_val, then_val.get_type());
        let else_vals = inverted_masks & else_val;

-        then_vals | else_vals
+        // The blend was computed with the type of the masks: cast back to the original type of
+        // `then_val` if it differs.
+        let res = then_vals | else_vals;
+        self.bitcast_if_needed(res, result_type)
    }
 }

 fn difference_or_zero<'gcc>(a: RValue<'gcc>, b: RValue<'gcc>, context: &'gcc Context<'gcc>) -> RValue<'gcc> {
+    // Returns `a - b` bitwise-ANDed with the `b >= a` comparison mask, i.e. the difference is kept
+    // in the lanes where `b >= a` (presumably zeroed in the other lanes; this relies on the
+    // comparison yielding all-ones/zero lanes, confirm for the GCC vector comparison).
    let difference = a - b;
    let masks = context.new_comparison(None, ComparisonOp::GreaterThanEquals, b, a);
+    // NOTE: masks is a vector of integers, but the values can be vectors of floats, so use bitcast to make
+    // the & operation work.
+    let a_type = a.get_type();
+    let masks =
+        if masks.get_type() != a_type {
+            context.new_bitcast(None, masks, a_type)
+        }
+        else {
+            masks
+        };
    difference & masks
 }

--- a/compiler/rustc_codegen_gcc/src/callee.rs
+++ b/compiler/rustc_codegen_gcc/src/callee.rs
@ -1,9 +1,10 @@
-use gccjit::{FunctionType, RValue};
-use rustc_codegen_ssa::traits::BaseTypeMethods;
+#[cfg(feature="master")]
+use gccjit::{FnAttribute, Visibility};
+use gccjit::{FunctionType, Function};
 use rustc_middle::ty::{self, Instance, TypeVisitableExt};
 use rustc_middle::ty::layout::{FnAbiOf, HasTyCtxt};

-use crate::abi::FnAbiGccExt;
+use crate::attributes;
 use crate::context::CodegenCx;

 /// Codegens a reference to a fn/method item, monomorphizing and
@ -13,22 +14,26 @@ use crate::context::CodegenCx;
 ///
 /// - `cx`: the crate context
 /// - `instance`: the instance to be instantiated
-pub fn get_fn<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, instance: Instance<'tcx>) -> RValue<'gcc> {
+pub fn get_fn<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, instance: Instance<'tcx>) -> Function<'gcc> {
    let tcx = cx.tcx();

    assert!(!instance.substs.needs_infer());
    assert!(!instance.substs.has_escaping_bound_vars());

+    let sym = tcx.symbol_name(instance).name;
+
    if let Some(&func) = cx.function_instances.borrow().get(&instance) {
        return func;
    }

-    let sym = tcx.symbol_name(instance).name;
-
    let fn_abi = cx.fn_abi_of_instance(instance, ty::List::empty());

    let func =
-        if let Some(func) = cx.get_declared_value(&sym) {
+        if let Some(_func) = cx.get_declared_value(&sym) {
+            // FIXME(antoyo): we never reach this because get_declared_value only returns global variables
+            // and here we try to get a function.
+            unreachable!();
+            /*
            // Create a fn pointer with the new signature.
            let ptrty = fn_abi.ptr_to_gcc_type(cx);

@ -61,13 +66,105 @@ pub fn get_fn<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, instance: Instance<'tcx>)
            }
            else {
                func
-            }
+            }*/
        }
        else {
            cx.linkage.set(FunctionType::Extern);
            let func = cx.declare_fn(&sym, &fn_abi);

+            attributes::from_fn_attrs(cx, func, instance);
+
+            let instance_def_id = instance.def_id();
+
            // TODO(antoyo): set linkage and attributes.
+
+            // Apply an appropriate linkage/visibility value to our item that we
+            // just declared.
+            //
+            // This is sort of subtle. Inside our codegen unit we started off
+            // compilation by predefining all our own `MonoItem` instances. That
+            // is, everything we're codegenning ourselves is already defined. That
+            // means that anything we're actually codegenning in this codegen unit
+            // will have hit the above branch in `get_declared_value`. As a result,
+            // we're guaranteed here that we're declaring a symbol that won't get
+            // defined, or in other words we're referencing a value from another
+            // codegen unit or even another crate.
+            //
+            // So because this is a foreign value we blanket apply an external
+            // linkage directive because it's coming from a different object file.
+            // The visibility here is where it gets tricky. This symbol could be
+            // referencing some foreign crate or foreign library (an `extern`
+            // block) in which case we want to leave the default visibility. We may
+            // also, though, have multiple codegen units. It could be a
+            // monomorphization, in which case its expected visibility depends on
+            // whether we are sharing generics or not. The important thing here is
+            // that the visibility we apply to the declaration is the same one that
+            // has been applied to the definition (wherever that definition may be).
+            let is_generic = instance.substs.non_erasable_generics().next().is_some();
+
+            if is_generic {
+                // This is a monomorphization. Its expected visibility depends
+                // on whether we are in share-generics mode.
+
+                if cx.tcx.sess.opts.share_generics() {
+                    // We are in share_generics mode.
+
+                    if let Some(instance_def_id) = instance_def_id.as_local() {
+                        // This is a definition from the current crate. If the
+                        // definition is unreachable for downstream crates or
+                        // the current crate does not re-export generics, the
+                        // definition of the instance will have been declared
+                        // as `hidden`.
+                        if cx.tcx.is_unreachable_local_definition(instance_def_id)
+                            || !cx.tcx.local_crate_exports_generics()
+                        {
+                            #[cfg(feature="master")]
+                            func.add_attribute(FnAttribute::Visibility(Visibility::Hidden));
+                        }
+                    } else {
+                        // This is a monomorphization of a generic function
+                        // defined in an upstream crate.
+                        if instance.upstream_monomorphization(tcx).is_some() {
+                            // This is instantiated in another crate. It cannot
+                            // be `hidden`.
+                        } else {
+                            // This is a local instantiation of an upstream definition.
+                            // If the current crate does not re-export it
+                            // (because it is a C library or an executable), it
+                            // will have been declared `hidden`.
+                            if !cx.tcx.local_crate_exports_generics() {
+                                #[cfg(feature="master")]
+                                func.add_attribute(FnAttribute::Visibility(Visibility::Hidden));
+                            }
+                        }
+                    }
+                } else {
+                    // When not sharing generics, all instances are in the same
+                    // crate and have hidden visibility
+                    #[cfg(feature="master")]
+                    func.add_attribute(FnAttribute::Visibility(Visibility::Hidden));
+                }
+            } else {
+                // This is a non-generic function
+                if cx.tcx.is_codegened_item(instance_def_id) {
+                    // This is a function that is instantiated in the local crate
+
+                    if instance_def_id.is_local() {
+                        // This is a function that is defined in the local crate.
+                        // If it is not reachable, it is hidden.
+                        if !cx.tcx.is_reachable_non_generic(instance_def_id) {
+                            #[cfg(feature="master")]
+                            func.add_attribute(FnAttribute::Visibility(Visibility::Hidden));
+                        }
+                    } else {
+                        // This is a function from an upstream crate that has
+                        // been instantiated here. These are always hidden.
+                        #[cfg(feature="master")]
+                        func.add_attribute(FnAttribute::Visibility(Visibility::Hidden));
+                    }
+                }
+            }
+
            func
        };

--- a/compiler/rustc_codegen_gcc/src/common.rs
+++ b/compiler/rustc_codegen_gcc/src/common.rs
@ -36,7 +36,7 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
 pub fn bytes_in_context<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, bytes: &[u8]) -> RValue<'gcc> {
    let context = &cx.context;
    let byte_type = context.new_type::<u8>();
-    let typ = context.new_array_type(None, byte_type, bytes.len() as i32);
+    let typ = context.new_array_type(None, byte_type, bytes.len() as u64);
    let elements: Vec<_> =
        bytes.iter()
        .map(|&byte| context.new_rvalue_from_int(byte_type, byte as i32))
@ -115,8 +115,8 @@ impl<'gcc, 'tcx> ConstMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
        self.const_uint(self.usize_type, i)
    }

-    fn const_u8(&self, _i: u8) -> RValue<'gcc> {
-        unimplemented!();
+    fn const_u8(&self, i: u8) -> RValue<'gcc> {
+        // Widen the byte to `u64` and build the constant through `const_uint` with the type
+        // returned by `type_u8`.
+        self.const_uint(self.type_u8(), i as u64)
    }

    fn const_real(&self, typ: Type<'gcc>, val: f64) -> RValue<'gcc> {
@ -133,7 +133,7 @@ impl<'gcc, 'tcx> ConstMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
            .1;
        let len = s.len();
        let cs = self.const_ptrcast(str_global.get_address(None),
-            self.type_ptr_to(self.layout_of(self.tcx.types.str_).gcc_type(self, true)),
+            self.type_ptr_to(self.layout_of(self.tcx.types.str_).gcc_type(self)),
        );
        (cs, self.const_usize(len as u64))
    }
@ -174,8 +174,18 @@ impl<'gcc, 'tcx> ConstMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
                }

                let value = self.const_uint_big(self.type_ix(bitsize), data);
-                // TODO(bjorn3): assert size is correct
-                self.const_bitcast(value, ty)
+                let bytesize = layout.size(self).bytes();
+                if bitsize > 1 && ty.is_integral() && bytesize as u32 == ty.get_size() {
+                    // NOTE: since the intrinsic _xabort is called with a bitcast, which
+                    // is non-const, but expects a constant, do a normal cast instead of a bitcast.
+                    // FIXME(antoyo): fix bitcast to work in constant contexts.
+                    // TODO(antoyo): perhaps only use bitcast for pointers?
+                    self.context.new_cast(None, value, ty)
+                }
+                else {
+                    // TODO(bjorn3): assert size is correct
+                    self.const_bitcast(value, ty)
+                }
            }
            Scalar::Ptr(ptr, _size) => {
                let (alloc_id, offset) = ptr.into_parts();
@ -227,11 +237,11 @@ impl<'gcc, 'tcx> ConstMethods<'tcx> for CodegenCx<'gcc, 'tcx> {

    fn from_const_alloc(&self, layout: TyAndLayout<'tcx>, alloc: ConstAllocation<'tcx>, offset: Size) -> PlaceRef<'tcx, RValue<'gcc>> {
        assert_eq!(alloc.inner().align, layout.align.abi);
-        let ty = self.type_ptr_to(layout.gcc_type(self, true));
+        let ty = self.type_ptr_to(layout.gcc_type(self));
        let value =
            if layout.size == Size::ZERO {
                let value = self.const_usize(alloc.inner().align.bytes());
-                self.context.new_cast(None, value, ty)
+                self.const_bitcast(value, ty)
            }
            else {
                let init = const_alloc_to_gcc(self, alloc);
--- a/compiler/rustc_codegen_gcc/src/consts.rs
+++ b/compiler/rustc_codegen_gcc/src/consts.rs
@ -1,8 +1,8 @@
-use gccjit::{GlobalKind, LValue, RValue, ToRValue, Type};
+#[cfg(feature = "master")]
+use gccjit::FnAttribute;
+use gccjit::{Function, GlobalKind, LValue, RValue, ToRValue, Type};
 use rustc_codegen_ssa::traits::{BaseTypeMethods, ConstMethods, DerivedTypeMethods, StaticMethods};
-use rustc_hir as hir;
-use rustc_hir::Node;
-use rustc_middle::{bug, span_bug};
+use rustc_middle::span_bug;
 use rustc_middle::middle::codegen_fn_attrs::{CodegenFnAttrFlags, CodegenFnAttrs};
 use rustc_middle::mir::mono::MonoItem;
 use rustc_middle::ty::{self, Instance, Ty};
@ -13,6 +13,7 @@ use rustc_target::abi::{self, Align, HasDataLayout, Primitive, Size, WrappingRan

 use crate::base;
 use crate::context::CodegenCx;
+use crate::errors::InvalidMinimumAlignment;
 use crate::type_of::LayoutGccExt;

 impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
@ -30,6 +31,21 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
    }
 }

+/// Sets the alignment of the global `gv` to `align`, raised to the target's minimum global
+/// alignment when the target specifies one.
+///
+/// If the target's minimum cannot be converted to an `Align`, an `InvalidMinimumAlignment`
+/// error is emitted and `align` is used unchanged.
+fn set_global_alignment<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, gv: LValue<'gcc>, mut align: Align) {
+    // The target may require greater alignment for globals than the type does.
+    // Note: GCC and Clang also allow `__attribute__((aligned))` on variables,
+    // which can force it to be smaller. Rust doesn't support this yet.
+    align = match cx.sess().target.min_global_align.map(Align::from_bits) {
+        Some(Ok(min_align)) => align.max(min_align),
+        Some(Err(err)) => {
+            cx.sess().emit_err(InvalidMinimumAlignment { err });
+            align
+        }
+        None => align,
+    };
+    gv.set_alignment(align.bytes() as i32);
+}
+
 impl<'gcc, 'tcx> StaticMethods for CodegenCx<'gcc, 'tcx> {
    fn static_addr_of(&self, cv: RValue<'gcc>, align: Align, kind: Option<&str>) -> RValue<'gcc> {
        // TODO(antoyo): implement a proper rvalue comparison in libgccjit instead of doing the
@ -79,9 +95,9 @@ impl<'gcc, 'tcx> StaticMethods for CodegenCx<'gcc, 'tcx> {

        let instance = Instance::mono(self.tcx, def_id);
        let ty = instance.ty(self.tcx, ty::ParamEnv::reveal_all());
-        let gcc_type = self.layout_of(ty).gcc_type(self, true);
+        let gcc_type = self.layout_of(ty).gcc_type(self);

-        // TODO(antoyo): set alignment.
+        set_global_alignment(self, global, self.align_of(ty));

        let value = self.bitcast_if_needed(value, gcc_type);
        global.global_set_initializer_rvalue(value);
@ -158,12 +174,19 @@ impl<'gcc, 'tcx> StaticMethods for CodegenCx<'gcc, 'tcx> {
        // TODO(antoyo)
    }

-    fn add_compiler_used_global(&self, _global: RValue<'gcc>) {
-        // TODO(antoyo)
+    fn add_compiler_used_global(&self, global: RValue<'gcc>) {
+        // NOTE: GCC does not seem to distinguish between `compiler.used` and `used`, so this
+        // simply forwards to `add_used_global`.
+        self.add_used_global(global);
    }
 }

 impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
+    /// Adds the `FnAttribute::Used` attribute to `function`.
+    ///
+    /// The attribute is only added when the `master` feature is enabled; otherwise the body is
+    /// empty, which is why the unused-variable lint is silenced for that configuration.
+    #[cfg_attr(not(feature="master"), allow(unused_variables))]
+    pub fn add_used_function(&self, function: Function<'gcc>) {
+        #[cfg(feature = "master")]
+        function.add_attribute(FnAttribute::Used);
+    }
+
    pub fn static_addr_of_mut(&self, cv: RValue<'gcc>, align: Align, kind: Option<&str>) -> RValue<'gcc> {
        let global =
            match kind {
@ -208,82 +231,59 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
        let sym = self.tcx.symbol_name(instance).name;

        let global =
-            if let Some(def_id) = def_id.as_local() {
-                let id = self.tcx.hir().local_def_id_to_hir_id(def_id);
-                let llty = self.layout_of(ty).gcc_type(self, true);
-                // FIXME: refactor this to work without accessing the HIR
-                let global = match self.tcx.hir().get(id) {
-                    Node::Item(&hir::Item { span, kind: hir::ItemKind::Static(..), .. }) => {
-                        if let Some(global) = self.get_declared_value(&sym) {
-                            if self.val_ty(global) != self.type_ptr_to(llty) {
-                                span_bug!(span, "Conflicting types for static");
-                            }
-                        }
-
-                        let is_tls = fn_attrs.flags.contains(CodegenFnAttrFlags::THREAD_LOCAL);
-                        let global = self.declare_global(
-                            &sym,
-                            llty,
-                            GlobalKind::Exported,
-                            is_tls,
-                            fn_attrs.link_section,
-                        );
-
-                        if !self.tcx.is_reachable_non_generic(def_id) {
-                            // TODO(antoyo): set visibility.
-                        }
-
-                        global
-                    }
-
-                    Node::ForeignItem(&hir::ForeignItem {
-                        span: _,
-                        kind: hir::ForeignItemKind::Static(..),
-                        ..
-                    }) => {
-                        let fn_attrs = self.tcx.codegen_fn_attrs(def_id);
-                        check_and_apply_linkage(&self, &fn_attrs, ty, sym)
-                    }
-
-                    item => bug!("get_static: expected static, found {:?}", item),
-                };
-
-                global
-            }
-            else {
-                // FIXME(nagisa): perhaps the map of externs could be offloaded to llvm somehow?
-                //debug!("get_static: sym={} item_attr={:?}", sym, self.tcx.item_attrs(def_id));
-
-                let attrs = self.tcx.codegen_fn_attrs(def_id);
-                let global = check_and_apply_linkage(&self, &attrs, ty, sym);
-
-                let needs_dll_storage_attr = false; // TODO(antoyo)
-
-                // If this assertion triggers, there's something wrong with commandline
-                // argument validation.
-                debug_assert!(
-                    !(self.tcx.sess.opts.cg.linker_plugin_lto.enabled()
-                        && self.tcx.sess.target.options.is_like_msvc
-                        && self.tcx.sess.opts.cg.prefer_dynamic)
-                );
-
-                if needs_dll_storage_attr {
-                    // This item is external but not foreign, i.e., it originates from an external Rust
-                    // crate. Since we don't know whether this crate will be linked dynamically or
-                    // statically in the final application, we always mark such symbols as 'dllimport'.
-                    // If final linkage happens to be static, we rely on compiler-emitted __imp_ stubs
-                    // to make things work.
-                    //
-                    // However, in some scenarios we defer emission of statics to downstream
-                    // crates, so there are cases where a static with an upstream DefId
-                    // is actually present in the current crate. We can find out via the
-                    // is_codegened_item query.
-                    if !self.tcx.is_codegened_item(def_id) {
-                        unimplemented!();
-                    }
+            if def_id.is_local() && !self.tcx.is_foreign_item(def_id) {
+            let llty = self.layout_of(ty).gcc_type(self);
+            if let Some(global) = self.get_declared_value(sym) {
+                if self.val_ty(global) != self.type_ptr_to(llty) {
+                    span_bug!(self.tcx.def_span(def_id), "Conflicting types for static");
                }
-                global
-            };
+            }
+
+            let is_tls = fn_attrs.flags.contains(CodegenFnAttrFlags::THREAD_LOCAL);
+            let global = self.declare_global(
+                &sym,
+                llty,
+                GlobalKind::Exported,
+                is_tls,
+                fn_attrs.link_section,
+            );
+
+            if !self.tcx.is_reachable_non_generic(def_id) {
+                // TODO(antoyo): set visibility.
+            }
+
+            global
+        } else {
+            check_and_apply_linkage(&self, &fn_attrs, ty, sym)
+        };
+
+        if !def_id.is_local() {
+            let needs_dll_storage_attr = false; // TODO(antoyo)
+
+            // If this assertion triggers, there's something wrong with commandline
+            // argument validation.
+            debug_assert!(
+                !(self.tcx.sess.opts.cg.linker_plugin_lto.enabled()
+                    && self.tcx.sess.target.options.is_like_msvc
+                    && self.tcx.sess.opts.cg.prefer_dynamic)
+            );
+
+            if needs_dll_storage_attr {
+                // This item is external but not foreign, i.e., it originates from an external Rust
+                // crate. Since we don't know whether this crate will be linked dynamically or
+                // statically in the final application, we always mark such symbols as 'dllimport'.
+                // If final linkage happens to be static, we rely on compiler-emitted __imp_ stubs
+                // to make things work.
+                //
+                // However, in some scenarios we defer emission of statics to downstream
+                // crates, so there are cases where a static with an upstream DefId
+                // is actually present in the current crate. We can find out via the
+                // is_codegened_item query.
+                if !self.tcx.is_codegened_item(def_id) {
+                    unimplemented!();
+                }
+            }
+        }

        // TODO(antoyo): set dll storage class.

@ -357,7 +357,7 @@ pub fn codegen_static_initializer<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, def_id

 fn check_and_apply_linkage<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, attrs: &CodegenFnAttrs, ty: Ty<'tcx>, sym: &str) -> LValue<'gcc> {
    let is_tls = attrs.flags.contains(CodegenFnAttrFlags::THREAD_LOCAL);
-    let llty = cx.layout_of(ty).gcc_type(cx, true);
+    let gcc_type = cx.layout_of(ty).gcc_type(cx);
    if let Some(linkage) = attrs.import_linkage {
        // Declare a symbol `foo` with the desired linkage.
        let global1 = cx.declare_global_with_linkage(&sym, cx.type_i8(), base::global_linkage_to_gcc(linkage));
@ -370,9 +370,10 @@ fn check_and_apply_linkage<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, attrs: &Codeg
        // zero.
        let mut real_name = "_rust_extern_with_linkage_".to_string();
        real_name.push_str(&sym);
-        let global2 = cx.define_global(&real_name, llty, is_tls, attrs.link_section);
+        let global2 = cx.define_global(&real_name, gcc_type, is_tls, attrs.link_section);
        // TODO(antoyo): set linkage.
-        global2.global_set_initializer_rvalue(global1.get_address(None));
+        let value = cx.const_ptrcast(global1.get_address(None), gcc_type);
+        global2.global_set_initializer_rvalue(value);
        // TODO(antoyo): use global_set_initializer() when it will work.
        global2
    }
@ -386,6 +387,6 @@ fn check_and_apply_linkage<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, attrs: &Codeg
        // don't do this then linker errors can be generated where the linker
        // complains that one object files has a thread local version of the
        // symbol and another one doesn't.
-        cx.declare_global(&sym, llty, GlobalKind::Imported, is_tls, attrs.link_section)
+        cx.declare_global(&sym, gcc_type, GlobalKind::Imported, is_tls, attrs.link_section)
    }
 }
--- a/compiler/rustc_codegen_gcc/src/context.rs
+++ b/compiler/rustc_codegen_gcc/src/context.rs
@ -1,9 +1,10 @@
 use std::cell::{Cell, RefCell};

-use gccjit::{Block, CType, Context, Function, FunctionPtrType, FunctionType, LValue, RValue, Struct, Type};
+use gccjit::{Block, CType, Context, Function, FunctionPtrType, FunctionType, LValue, RValue, Type};
 use rustc_codegen_ssa::base::wants_msvc_seh;
 use rustc_codegen_ssa::traits::{
    BackendTypes,
+    BaseTypeMethods,
    MiscMethods,
 };
 use rustc_data_structures::base_n;
@ -11,7 +12,7 @@ use rustc_data_structures::fx::{FxHashMap, FxHashSet};
 use rustc_middle::span_bug;
 use rustc_middle::mir::mono::CodegenUnit;
 use rustc_middle::ty::{self, Instance, ParamEnv, PolyExistentialTraitRef, Ty, TyCtxt};
-use rustc_middle::ty::layout::{FnAbiError, FnAbiOfHelpers, FnAbiRequest, HasParamEnv, HasTyCtxt, LayoutError, TyAndLayout, LayoutOfHelpers};
+use rustc_middle::ty::layout::{FnAbiError, FnAbiOf, FnAbiOfHelpers, FnAbiRequest, HasParamEnv, HasTyCtxt, LayoutError, TyAndLayout, LayoutOfHelpers};
 use rustc_session::Session;
 use rustc_span::{Span, source_map::respan};
 use rustc_target::abi::{call::FnAbi, HasDataLayout, PointeeInfo, Size, TargetDataLayout, VariantIdx};
@ -33,6 +34,7 @@ pub struct CodegenCx<'gcc, 'tcx> {
    // TODO(bjorn3): Can this field be removed?
    pub current_func: RefCell<Option<Function<'gcc>>>,
    pub normal_function_addresses: RefCell<FxHashSet<RValue<'gcc>>>,
+    pub function_address_names: RefCell<FxHashMap<RValue<'gcc>, String>>,

    pub functions: RefCell<FxHashMap<String, Function<'gcc>>>,
    pub intrinsics: RefCell<FxHashMap<String, Function<'gcc>>>,
@ -78,12 +80,10 @@ pub struct CodegenCx<'gcc, 'tcx> {

    pub struct_types: RefCell<FxHashMap<Vec<Type<'gcc>>, Type<'gcc>>>,

-    pub types_with_fields_to_set: RefCell<FxHashMap<Type<'gcc>, (Struct<'gcc>, TyAndLayout<'tcx>)>>,
-
    /// Cache instances of monomorphic and polymorphic items
    pub instances: RefCell<FxHashMap<Instance<'tcx>, LValue<'gcc>>>,
    /// Cache function instances of monomorphic and polymorphic items
-    pub function_instances: RefCell<FxHashMap<Instance<'tcx>, RValue<'gcc>>>,
+    pub function_instances: RefCell<FxHashMap<Instance<'tcx>, Function<'gcc>>>,
    /// Cache generated vtables
    pub vtables: RefCell<FxHashMap<(Ty<'tcx>, Option<ty::PolyExistentialTraitRef<'tcx>>), RValue<'gcc>>>,

@ -110,6 +110,7 @@ pub struct CodegenCx<'gcc, 'tcx> {
    local_gen_sym_counter: Cell<usize>,

    eh_personality: Cell<Option<RValue<'gcc>>>,
+    pub rust_try_fn: Cell<Option<(Type<'gcc>, Function<'gcc>)>>,

    pub pointee_infos: RefCell<FxHashMap<(Ty<'tcx>, Size), Option<PointeeInfo>>>,

@ -119,6 +120,8 @@ pub struct CodegenCx<'gcc, 'tcx> {
    /// they can be dereferenced later.
    /// FIXME(antoyo): fix the rustc API to avoid having this hack.
    pub structs_as_pointer: RefCell<FxHashSet<RValue<'gcc>>>,
+
+    pub cleanup_blocks: RefCell<FxHashSet<Block<'gcc>>>,
 }

 impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
@ -194,6 +197,7 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
            context,
            current_func: RefCell::new(None),
            normal_function_addresses: Default::default(),
+            function_address_names: Default::default(),
            functions: RefCell::new(functions),
            intrinsics: RefCell::new(FxHashMap::default()),

@ -243,11 +247,12 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
            types: Default::default(),
            tcx,
            struct_types: Default::default(),
-            types_with_fields_to_set: Default::default(),
            local_gen_sym_counter: Cell::new(0),
            eh_personality: Cell::new(None),
+            rust_try_fn: Cell::new(None),
            pointee_infos: Default::default(),
            structs_as_pointer: Default::default(),
+            cleanup_blocks: Default::default(),
        }
    }

@ -327,8 +332,9 @@ impl<'gcc, 'tcx> MiscMethods<'tcx> for CodegenCx<'gcc, 'tcx> {

    fn get_fn(&self, instance: Instance<'tcx>) -> RValue<'gcc> {
        let func = get_fn(self, instance);
-        *self.current_func.borrow_mut() = Some(self.rvalue_as_function(func));
-        func
+        // Record the function resolved by the free `get_fn` helper in `current_func`.
+        *self.current_func.borrow_mut() = Some(func);
+        // FIXME(antoyo): this is a wrong cast. That requires changing the compiler API.
+        // The transmute reinterprets the function handle as an `RValue` so that it matches
+        // this method's declared return type.
+        unsafe { std::mem::transmute(func) }
    }

    fn get_fn_addr(&self, instance: Instance<'tcx>) -> RValue<'gcc> {
@ -339,8 +345,7 @@ impl<'gcc, 'tcx> MiscMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
                self.intrinsics.borrow()[func_name].clone()
            }
            else {
-                let func = get_fn(self, instance);
-                self.rvalue_as_function(func)
+                get_fn(self, instance)
            };
        let ptr = func.get_address(None);

@ -348,6 +353,7 @@ impl<'gcc, 'tcx> MiscMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
        // FIXME(antoyo): the rustc API seems to call get_fn_addr() when not needed (e.g. for FFI).

        self.normal_function_addresses.borrow_mut().insert(ptr);
+        self.function_address_names.borrow_mut().insert(ptr, func_name.to_string());

        ptr
    }
@ -377,31 +383,40 @@ impl<'gcc, 'tcx> MiscMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
            return llpersonality;
        }
        let tcx = self.tcx;
-        let llfn = match tcx.lang_items().eh_personality() {
-            Some(def_id) if !wants_msvc_seh(self.sess()) => self.get_fn_addr(
-                ty::Instance::resolve(
-                    tcx,
-                    ty::ParamEnv::reveal_all(),
-                    def_id,
-                    ty::List::empty(),
-                )
-                .unwrap().unwrap(),
-            ),
-            _ => {
-                let _name = if wants_msvc_seh(self.sess()) {
-                    "__CxxFrameHandler3"
-                } else {
-                    "rust_eh_personality"
-                };
-                //let func = self.declare_func(name, self.type_i32(), &[], true);
-                // FIXME(antoyo): this hack should not be needed. That will probably be removed when
-                // unwinding support is added.
-                self.context.new_rvalue_from_int(self.int_type, 0)
-            }
-        };
+        let func =
+            match tcx.lang_items().eh_personality() {
+                Some(def_id) if !wants_msvc_seh(self.sess()) => {
+                    let instance =
+                        ty::Instance::resolve(
+                            tcx,
+                            ty::ParamEnv::reveal_all(),
+                            def_id,
+                            ty::List::empty(),
+                        )
+                        .unwrap().unwrap();
+
+                    let symbol_name = tcx.symbol_name(instance).name;
+                    let fn_abi = self.fn_abi_of_instance(instance, ty::List::empty());
+                    self.linkage.set(FunctionType::Extern);
+                    let func = self.declare_fn(symbol_name, &fn_abi);
+                    let func: RValue<'gcc> = unsafe { std::mem::transmute(func) };
+                    func
+                },
+                _ => {
+                    let name =
+                        if wants_msvc_seh(self.sess()) {
+                            "__CxxFrameHandler3"
+                        }
+                        else {
+                            "rust_eh_personality"
+                        };
+                    let func = self.declare_func(name, self.type_i32(), &[], true);
+                    unsafe { std::mem::transmute(func) }
+                }
+            };
        // TODO(antoyo): apply target cpu attributes.
-        self.eh_personality.set(Some(llfn));
-        llfn
+        self.eh_personality.set(Some(func));
+        func
    }

    fn sess(&self) -> &Session {
--- a/compiler/rustc_codegen_gcc/src/declare.rs
+++ b/compiler/rustc_codegen_gcc/src/declare.rs
@ -38,12 +38,10 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
        global
    }

-    /*pub fn declare_func(&self, name: &str, return_type: Type<'gcc>, params: &[Type<'gcc>], variadic: bool) -> RValue<'gcc> {
-        self.linkage.set(FunctionType::Exported);
-        let func = declare_raw_fn(self, name, () /*llvm::CCallConv*/, return_type, params, variadic);
-        // FIXME(antoyo): this is a wrong cast. That requires changing the compiler API.
-        unsafe { std::mem::transmute(func) }
-    }*/
+    /// Declares a function named `name` from an explicit return type, parameter types and
+    /// variadic flag, and returns the resulting `Function` handle.
+    ///
+    /// `self.linkage` is set to `FunctionType::Extern` before delegating to `declare_raw_fn`.
+    /// NOTE(review): presumably `declare_raw_fn` reads `self.linkage` -- confirm in its body.
+    pub fn declare_func(&self, name: &str, return_type: Type<'gcc>, params: &[Type<'gcc>], variadic: bool) -> Function<'gcc> {
+        self.linkage.set(FunctionType::Extern);
+        declare_raw_fn(self, name, () /*llvm::CCallConv*/, return_type, params, variadic)
+    }

    pub fn declare_global(&self, name: &str, ty: Type<'gcc>, global_kind: GlobalKind, is_tls: bool, link_section: Option<Symbol>) -> LValue<'gcc> {
        let global = self.context.new_global(None, global_kind, ty, name);
@ -79,12 +77,11 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
        unsafe { std::mem::transmute(func) }
    }

-    pub fn declare_fn(&self, name: &str, fn_abi: &FnAbi<'tcx, Ty<'tcx>>) -> RValue<'gcc> {
+    /// Declares a function named `name` whose signature is derived from `fn_abi`.
+    ///
+    /// The `on_stack_param_indices` value returned by `fn_abi.gcc_type` is stored in
+    /// `on_stack_function_params`, keyed by the new function, which is then returned.
+    pub fn declare_fn(&self, name: &str, fn_abi: &FnAbi<'tcx, Ty<'tcx>>) -> Function<'gcc> {
        let (return_type, params, variadic, on_stack_param_indices) = fn_abi.gcc_type(self);
        let func = declare_raw_fn(self, name, () /*fn_abi.llvm_cconv()*/, return_type, &params, variadic);
        self.on_stack_function_params.borrow_mut().insert(func, on_stack_param_indices);
-        // FIXME(antoyo): this is a wrong cast. That requires changing the compiler API.
-        unsafe { std::mem::transmute(func) }
+        func
    }

    pub fn define_global(&self, name: &str, ty: Type<'gcc>, is_tls: bool, link_section: Option<Symbol>) -> LValue<'gcc> {
--- a/compiler/rustc_codegen_gcc/src/errors.rs
+++ b/compiler/rustc_codegen_gcc/src/errors.rs
@ -221,3 +221,9 @@ pub(crate) struct UnwindingInlineAsm {
    #[primary_span]
    pub span: Span,
 }
+
+/// Diagnostic that uses the `codegen_gcc_invalid_minimum_alignment` message.
+///
+/// NOTE(review): `err` presumably carries the text of the underlying alignment error for
+/// interpolation into that message -- confirm against the message definition and emit site.
+#[derive(Diagnostic)]
+#[diag(codegen_gcc_invalid_minimum_alignment)]
+pub(crate) struct InvalidMinimumAlignment {
+    pub err: String,
+}
--- a/compiler/rustc_codegen_gcc/src/int.rs
+++ b/compiler/rustc_codegen_gcc/src/int.rs
@ -389,18 +389,22 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
                };
            self.context.new_comparison(None, op, cmp, self.context.new_rvalue_from_int(self.int_type, limit))
        }
+        else if a_type.get_pointee().is_some() && b_type.get_pointee().is_some() {
+            // NOTE: gcc cannot compare pointers to different objects, but rustc does that, so cast them to usize.
+            lhs = self.context.new_bitcast(None, lhs, self.usize_type);
+            rhs = self.context.new_bitcast(None, rhs, self.usize_type);
+            self.context.new_comparison(None, op.to_gcc_comparison(), lhs, rhs)
+        }
        else {
-            let left_type = lhs.get_type();
-            let right_type = rhs.get_type();
-            if left_type != right_type {
+            if a_type != b_type {
                // NOTE: because libgccjit cannot compare function pointers.
-                if left_type.dyncast_function_ptr_type().is_some() && right_type.dyncast_function_ptr_type().is_some() {
+                if a_type.dyncast_function_ptr_type().is_some() && b_type.dyncast_function_ptr_type().is_some() {
                    lhs = self.context.new_cast(None, lhs, self.usize_type.make_pointer());
                    rhs = self.context.new_cast(None, rhs, self.usize_type.make_pointer());
                }
                // NOTE: hack because we try to cast a vector type to the same vector type.
-                else if format!("{:?}", left_type) != format!("{:?}", right_type) {
-                    rhs = self.context.new_cast(None, rhs, left_type);
+                else if format!("{:?}", a_type) != format!("{:?}", b_type) {
+                    rhs = self.context.new_cast(None, rhs, a_type);
                }
            }
            self.context.new_comparison(None, op.to_gcc_comparison(), lhs, rhs)
--- a/compiler/rustc_codegen_gcc/src/intrinsic/archs.rs
+++ b/compiler/rustc_codegen_gcc/src/intrinsic/archs.rs
--- a/compiler/rustc_codegen_gcc/src/intrinsic/llvm.rs
+++ b/compiler/rustc_codegen_gcc/src/intrinsic/llvm.rs
@ -1,159 +1,387 @@
 use std::borrow::Cow;

-use gccjit::{Function, FunctionPtrType, RValue, ToRValue};
+use gccjit::{Function, FunctionPtrType, RValue, ToRValue, UnaryOp};
+use rustc_codegen_ssa::traits::BuilderMethods;

 use crate::{context::CodegenCx, builder::Builder};

-pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, gcc_func: FunctionPtrType<'gcc>, mut args: Cow<'b, [RValue<'gcc>]>, func_name: &str) -> Cow<'b, [RValue<'gcc>]> {
+pub fn adjust_intrinsic_arguments<'a, 'b, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, gcc_func: FunctionPtrType<'gcc>, mut args: Cow<'b, [RValue<'gcc>]>, func_name: &str, original_function_name: Option<&String>) -> Cow<'b, [RValue<'gcc>]> {
    // Some LLVM intrinsics do not map 1-to-1 to GCC intrinsics, so we add the missing
    // arguments here.
    if gcc_func.get_param_count() != args.len() {
        match &*func_name {
-            "__builtin_ia32_pmuldq512_mask" | "__builtin_ia32_pmuludq512_mask"
-                // FIXME(antoyo): the following intrinsics has 4 (or 5) arguments according to the doc, but is defined with 2 (or 3) arguments in library/stdarch/crates/core_arch/src/x86/avx512f.rs.
+            // NOTE: the following intrinsics have a different number of parameters in LLVM and GCC.
+            "__builtin_ia32_prold512_mask" | "__builtin_ia32_pmuldq512_mask" | "__builtin_ia32_pmuludq512_mask"
                | "__builtin_ia32_pmaxsd512_mask" | "__builtin_ia32_pmaxsq512_mask" | "__builtin_ia32_pmaxsq256_mask"
-                | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
-                | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask" | "__builtin_ia32_pmaxuq256_mask"
-                | "__builtin_ia32_pmaxuq128_mask"
+                | "__builtin_ia32_pmaxsq128_mask" | "__builtin_ia32_pmaxud512_mask" | "__builtin_ia32_pmaxuq512_mask"
                | "__builtin_ia32_pminsd512_mask" | "__builtin_ia32_pminsq512_mask" | "__builtin_ia32_pminsq256_mask"
-                | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
-                | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask" | "__builtin_ia32_pminuq256_mask"
-                | "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask"
+                | "__builtin_ia32_pminsq128_mask" | "__builtin_ia32_pminud512_mask" | "__builtin_ia32_pminuq512_mask"
+                | "__builtin_ia32_prolq512_mask" | "__builtin_ia32_prorq512_mask" | "__builtin_ia32_pslldi512_mask"
+                | "__builtin_ia32_psrldi512_mask" | "__builtin_ia32_psllqi512_mask" | "__builtin_ia32_psrlqi512_mask"
+                | "__builtin_ia32_pslld512_mask" | "__builtin_ia32_psrld512_mask" | "__builtin_ia32_psllq512_mask"
+                | "__builtin_ia32_psrlq512_mask" | "__builtin_ia32_psrad512_mask" | "__builtin_ia32_psraq512_mask"
+                | "__builtin_ia32_psradi512_mask" | "__builtin_ia32_psraqi512_mask" | "__builtin_ia32_psrav16si_mask"
+                | "__builtin_ia32_psrav8di_mask" | "__builtin_ia32_prolvd512_mask" | "__builtin_ia32_prorvd512_mask"
+                | "__builtin_ia32_prolvq512_mask" | "__builtin_ia32_prorvq512_mask" | "__builtin_ia32_psllv16si_mask"
+                | "__builtin_ia32_psrlv16si_mask" | "__builtin_ia32_psllv8di_mask" | "__builtin_ia32_psrlv8di_mask"
+                | "__builtin_ia32_permvarsi512_mask" | "__builtin_ia32_vpermilvarps512_mask"
+                | "__builtin_ia32_vpermilvarpd512_mask" | "__builtin_ia32_permvardi512_mask"
+                | "__builtin_ia32_permvarsf512_mask" | "__builtin_ia32_permvarqi512_mask"
+                | "__builtin_ia32_permvarqi256_mask" | "__builtin_ia32_permvarqi128_mask"
+                | "__builtin_ia32_vpmultishiftqb512_mask" | "__builtin_ia32_vpmultishiftqb256_mask"
+                | "__builtin_ia32_vpmultishiftqb128_mask"
                => {
-                    // TODO: refactor by separating those intrinsics outside of this branch.
-                    let add_before_last_arg =
-                        match &*func_name {
-                            "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
-                                | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask"
-                                | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => true,
-                            _ => false,
-                        };
-                    let new_first_arg_is_zero =
-                        match &*func_name {
-                            "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask"
-                                | "__builtin_ia32_pminuq256_mask" | "__builtin_ia32_pminuq128_mask" => true,
-                            _ => false
-                        };
-                    let arg3_index =
-                        match &*func_name {
-                            "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 1,
-                            _ => 2,
-                        };
-                    let mut new_args = args.to_vec();
-                    let arg3_type = gcc_func.get_param_type(arg3_index);
-                    let first_arg =
-                        if new_first_arg_is_zero {
-                            let vector_type = arg3_type.dyncast_vector().expect("vector type");
-                            let zero = builder.context.new_rvalue_zero(vector_type.get_element_type());
-                            let num_units = vector_type.get_num_units();
-                            builder.context.new_rvalue_from_vector(None, arg3_type, &vec![zero; num_units])
-                        }
-                        else {
-                            builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue()
-                        };
-                    if add_before_last_arg {
-                        new_args.insert(new_args.len() - 1, first_arg);
+                let mut new_args = args.to_vec();
+                let arg3_type = gcc_func.get_param_type(2);
+                let first_arg = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue();
+                new_args.push(first_arg);
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                new_args.push(minus_one);
+                args = new_args.into();
+            },
+            "__builtin_ia32_pmaxuq256_mask" | "__builtin_ia32_pmaxuq128_mask" | "__builtin_ia32_pminuq256_mask"
+                | "__builtin_ia32_pminuq128_mask" | "__builtin_ia32_prold256_mask" | "__builtin_ia32_prold128_mask"
+                | "__builtin_ia32_prord512_mask" | "__builtin_ia32_prord256_mask" | "__builtin_ia32_prord128_mask"
+                | "__builtin_ia32_prolq256_mask" | "__builtin_ia32_prolq128_mask" | "__builtin_ia32_prorq256_mask"
+                | "__builtin_ia32_prorq128_mask" | "__builtin_ia32_psraq256_mask" | "__builtin_ia32_psraq128_mask"
+                | "__builtin_ia32_psraqi256_mask" | "__builtin_ia32_psraqi128_mask" | "__builtin_ia32_psravq256_mask"
+                | "__builtin_ia32_psravq128_mask" | "__builtin_ia32_prolvd256_mask" | "__builtin_ia32_prolvd128_mask"
+                | "__builtin_ia32_prorvd256_mask" | "__builtin_ia32_prorvd128_mask" | "__builtin_ia32_prolvq256_mask"
+                | "__builtin_ia32_prolvq128_mask" | "__builtin_ia32_prorvq256_mask" | "__builtin_ia32_prorvq128_mask"
+                | "__builtin_ia32_permvardi256_mask" | "__builtin_ia32_permvardf512_mask" | "__builtin_ia32_permvardf256_mask"
+                | "__builtin_ia32_pmulhuw512_mask" | "__builtin_ia32_pmulhw512_mask" | "__builtin_ia32_pmulhrsw512_mask"
+                | "__builtin_ia32_pmaxuw512_mask" | "__builtin_ia32_pmaxub512_mask" | "__builtin_ia32_pmaxsw512_mask"
+                | "__builtin_ia32_pmaxsb512_mask" | "__builtin_ia32_pminuw512_mask" | "__builtin_ia32_pminub512_mask"
+                | "__builtin_ia32_pminsw512_mask" | "__builtin_ia32_pminsb512_mask"
+                | "__builtin_ia32_pmaddwd512_mask" | "__builtin_ia32_pmaddubsw512_mask" | "__builtin_ia32_packssdw512_mask"
+                | "__builtin_ia32_packsswb512_mask" | "__builtin_ia32_packusdw512_mask" | "__builtin_ia32_packuswb512_mask"
+                | "__builtin_ia32_pavgw512_mask" | "__builtin_ia32_pavgb512_mask" | "__builtin_ia32_psllw512_mask"
+                | "__builtin_ia32_psllwi512_mask" | "__builtin_ia32_psllv32hi_mask" | "__builtin_ia32_psrlw512_mask"
+                | "__builtin_ia32_psrlwi512_mask" | "__builtin_ia32_psllv16hi_mask" | "__builtin_ia32_psllv8hi_mask"
+                | "__builtin_ia32_psrlv32hi_mask" | "__builtin_ia32_psraw512_mask" | "__builtin_ia32_psrawi512_mask"
+                | "__builtin_ia32_psrlv16hi_mask" | "__builtin_ia32_psrlv8hi_mask" | "__builtin_ia32_psrav32hi_mask"
+                | "__builtin_ia32_permvarhi512_mask" | "__builtin_ia32_pshufb512_mask" | "__builtin_ia32_psrav16hi_mask"
+                | "__builtin_ia32_psrav8hi_mask" | "__builtin_ia32_permvarhi256_mask" | "__builtin_ia32_permvarhi128_mask"
+                => {
+                let mut new_args = args.to_vec();
+                let arg3_type = gcc_func.get_param_type(2);
+                let vector_type = arg3_type.dyncast_vector().expect("vector type");
+                let zero = builder.context.new_rvalue_zero(vector_type.get_element_type());
+                let num_units = vector_type.get_num_units();
+                let first_arg = builder.context.new_rvalue_from_vector(None, arg3_type, &vec![zero; num_units]);
+                new_args.push(first_arg);
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                new_args.push(minus_one);
+                args = new_args.into();
+            },
+            "__builtin_ia32_dbpsadbw512_mask" | "__builtin_ia32_dbpsadbw256_mask" | "__builtin_ia32_dbpsadbw128_mask" => {
+                let mut new_args = args.to_vec();
+                let arg4_type = gcc_func.get_param_type(3);
+                let vector_type = arg4_type.dyncast_vector().expect("vector type");
+                let zero = builder.context.new_rvalue_zero(vector_type.get_element_type());
+                let num_units = vector_type.get_num_units();
+                let first_arg = builder.context.new_rvalue_from_vector(None, arg4_type, &vec![zero; num_units]);
+                new_args.push(first_arg);
+                let arg5_type = gcc_func.get_param_type(4);
+                let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1);
+                new_args.push(minus_one);
+                args = new_args.into();
+            },
+            "__builtin_ia32_vplzcntd_512_mask" | "__builtin_ia32_vplzcntd_256_mask" | "__builtin_ia32_vplzcntd_128_mask"
+                | "__builtin_ia32_vplzcntq_512_mask" | "__builtin_ia32_vplzcntq_256_mask" | "__builtin_ia32_vplzcntq_128_mask" => {
+                let mut new_args = args.to_vec();
+                // Remove last arg as it doesn't seem to be used in GCC and is always false.
+                new_args.pop();
+                let arg2_type = gcc_func.get_param_type(1);
+                let vector_type = arg2_type.dyncast_vector().expect("vector type");
+                let zero = builder.context.new_rvalue_zero(vector_type.get_element_type());
+                let num_units = vector_type.get_num_units();
+                let first_arg = builder.context.new_rvalue_from_vector(None, arg2_type, &vec![zero; num_units]);
+                new_args.push(first_arg);
+                let arg3_type = gcc_func.get_param_type(2);
+                let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1);
+                new_args.push(minus_one);
+                args = new_args.into();
+            },
+            "__builtin_ia32_vpconflictsi_512_mask" | "__builtin_ia32_vpconflictsi_256_mask"
+                | "__builtin_ia32_vpconflictsi_128_mask" | "__builtin_ia32_vpconflictdi_512_mask"
+                | "__builtin_ia32_vpconflictdi_256_mask" | "__builtin_ia32_vpconflictdi_128_mask" => {
+                let mut new_args = args.to_vec();
+                let arg2_type = gcc_func.get_param_type(1);
+                let vector_type = arg2_type.dyncast_vector().expect("vector type");
+                let zero = builder.context.new_rvalue_zero(vector_type.get_element_type());
+                let num_units = vector_type.get_num_units();
+                let first_arg = builder.context.new_rvalue_from_vector(None, arg2_type, &vec![zero; num_units]);
+                new_args.push(first_arg);
+                let arg3_type = gcc_func.get_param_type(2);
+                let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1);
+                new_args.push(minus_one);
+                args = new_args.into();
+            },
+            "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask"
+                | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask"
+                | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => {
+                let mut new_args = args.to_vec();
+                let arg5_type = gcc_func.get_param_type(4);
+                let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1);
+                new_args.push(minus_one);
+                args = new_args.into();
+            },
+            "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
+                let mut new_args = args.to_vec();
+
+                let mut last_arg = None;
+                if args.len() == 4 {
+                    last_arg = new_args.pop();
+                }
+
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                new_args.push(minus_one);
+
+                if args.len() == 3 {
+                    // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 map to
+                    // the same GCC intrinsic, but the former has 3 parameters and the
+                    // latter has 4 so it doesn't require this additional argument.
+                    let arg5_type = gcc_func.get_param_type(4);
+                    new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4));
+                }
+
+                if let Some(last_arg) = last_arg {
+                    new_args.push(last_arg);
+                }
+
+                args = new_args.into();
+            },
+            "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
+                | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
+                | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
+                | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask"
+                | "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
+                |  "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" => {
+                let mut new_args = args.to_vec();
+                let last_arg = new_args.pop().expect("last arg");
+                let arg3_type = gcc_func.get_param_type(2);
+                let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue();
+                new_args.push(undefined);
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                new_args.push(minus_one);
+                new_args.push(last_arg);
+                args = new_args.into();
+            },
+            "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => {
+                let mut new_args = args.to_vec();
+                let last_arg = new_args.pop().expect("last arg");
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                new_args.push(minus_one);
+                new_args.push(last_arg);
+                args = new_args.into();
+            },
+            "__builtin_ia32_vpermi2vard512_mask" | "__builtin_ia32_vpermi2vard256_mask"
+                | "__builtin_ia32_vpermi2vard128_mask" | "__builtin_ia32_vpermi2varq512_mask"
+                | "__builtin_ia32_vpermi2varq256_mask" | "__builtin_ia32_vpermi2varq128_mask"
+                | "__builtin_ia32_vpermi2varps512_mask" | "__builtin_ia32_vpermi2varps256_mask"
+                | "__builtin_ia32_vpermi2varps128_mask" | "__builtin_ia32_vpermi2varpd512_mask"
+                | "__builtin_ia32_vpermi2varpd256_mask" | "__builtin_ia32_vpermi2varpd128_mask" | "__builtin_ia32_vpmadd52huq512_mask"
+                | "__builtin_ia32_vpmadd52luq512_mask" | "__builtin_ia32_vpmadd52huq256_mask" | "__builtin_ia32_vpmadd52luq256_mask"
+                | "__builtin_ia32_vpmadd52huq128_mask"
+                => {
+                let mut new_args = args.to_vec();
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                new_args.push(minus_one);
+                args = new_args.into();
+            },
+            "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask"
+                | "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => {
+                let mut new_args = args.to_vec();
+                let last_arg = new_args.pop().expect("last arg");
+                let arg2_type = gcc_func.get_param_type(1);
+                let undefined = builder.current_func().new_local(None, arg2_type, "undefined_for_intrinsic").to_rvalue();
+                new_args.push(undefined);
+                let arg3_type = gcc_func.get_param_type(2);
+                let minus_one = builder.context.new_rvalue_from_int(arg3_type, -1);
+                new_args.push(minus_one);
+                new_args.push(last_arg);
+                args = new_args.into();
+            },
+            "__builtin_ia32_stmxcsr" => {
+                args = vec![].into();
+            },
+            "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" | "__builtin_ia32_addcarryx_u32" | "__builtin_ia32_sbb_u32" => {
+                let mut new_args = args.to_vec();
+                let arg2_type = gcc_func.get_param_type(1);
+                let variable = builder.current_func().new_local(None, arg2_type, "addcarryResult");
+                new_args.push(variable.get_address(None));
+                args = new_args.into();
+            },
+            "__builtin_ia32_vpermt2varqi512_mask" | "__builtin_ia32_vpermt2varqi256_mask"
+                | "__builtin_ia32_vpermt2varqi128_mask" | "__builtin_ia32_vpermt2varhi512_mask"
+                | "__builtin_ia32_vpermt2varhi256_mask" | "__builtin_ia32_vpermt2varhi128_mask"
+                => {
+                let new_args = args.to_vec();
+                let arg4_type = gcc_func.get_param_type(3);
+                let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
+                args = vec![new_args[1], new_args[0], new_args[2], minus_one].into();
+            },
+            "__builtin_ia32_xrstor" | "__builtin_ia32_xsavec" => {
+                let new_args = args.to_vec();
+                let thirty_two = builder.context.new_rvalue_from_int(new_args[1].get_type(), 32);
+                let arg2 = new_args[1] << thirty_two | new_args[2];
+                let arg2_type = gcc_func.get_param_type(1);
+                let arg2 = builder.context.new_cast(None, arg2, arg2_type);
+                args = vec![new_args[0], arg2].into();
+            },
+            "__builtin_prefetch" => {
+                let mut new_args = args.to_vec();
+                new_args.pop();
+                args = new_args.into();
+            },
+            _ => (),
+        }
+    }
+    else {
+        match &*func_name {
+            "__builtin_ia32_rndscaless_mask_round" | "__builtin_ia32_rndscalesd_mask_round" => {
+                let new_args = args.to_vec();
+                let arg3_type = gcc_func.get_param_type(2);
+                let arg3 = builder.context.new_cast(None, new_args[4], arg3_type);
+                let arg4_type = gcc_func.get_param_type(3);
+                let arg4 = builder.context.new_bitcast(None, new_args[2], arg4_type);
+                args = vec![new_args[0], new_args[1], arg3, arg4, new_args[3], new_args[5]].into();
+            },
+            // NOTE: the LLVM intrinsic receives 3 floats, but the GCC builtin requires 3 vectors.
+            // FIXME: the intrinsics like _mm_mask_fmadd_sd should probably directly call the GCC
+            // intrinsic to avoid this.
+            "__builtin_ia32_vfmaddss3_round" => {
+                let new_args = args.to_vec();
+                let arg1_type = gcc_func.get_param_type(0);
+                let arg2_type = gcc_func.get_param_type(1);
+                let arg3_type = gcc_func.get_param_type(2);
+                let a = builder.context.new_rvalue_from_vector(None, arg1_type, &[new_args[0]; 4]);
+                let b = builder.context.new_rvalue_from_vector(None, arg2_type, &[new_args[1]; 4]);
+                let c = builder.context.new_rvalue_from_vector(None, arg3_type, &[new_args[2]; 4]);
+                args = vec![a, b, c, new_args[3]].into();
+            },
+            "__builtin_ia32_vfmaddsd3_round" => {
+                let new_args = args.to_vec();
+                let arg1_type = gcc_func.get_param_type(0);
+                let arg2_type = gcc_func.get_param_type(1);
+                let arg3_type = gcc_func.get_param_type(2);
+                let a = builder.context.new_rvalue_from_vector(None, arg1_type, &[new_args[0]; 2]);
+                let b = builder.context.new_rvalue_from_vector(None, arg2_type, &[new_args[1]; 2]);
+                let c = builder.context.new_rvalue_from_vector(None, arg3_type, &[new_args[2]; 2]);
+                args = vec![a, b, c, new_args[3]].into();
+            },
+            "__builtin_ia32_vfmaddsubpd256" | "__builtin_ia32_vfmaddsubps" | "__builtin_ia32_vfmaddsubps256"
+                | "__builtin_ia32_vfmaddsubpd" => {
+                if let Some(original_function_name) = original_function_name {
+                    match &**original_function_name {
+                        "llvm.x86.fma.vfmsubadd.pd.256" | "llvm.x86.fma.vfmsubadd.ps" | "llvm.x86.fma.vfmsubadd.ps.256"
+                            | "llvm.x86.fma.vfmsubadd.pd" => {
+                            // NOTE: since both llvm.x86.fma.vfmsubadd.ps and llvm.x86.fma.vfmaddsub.ps map to
+                            // __builtin_ia32_vfmaddsubps, only negate the third argument if this comes from a
+                            // subadd LLVM intrinsic, e.g. _mm256_fmsubadd_pd.
+                            let mut new_args = args.to_vec();
+                            let arg3 = &mut new_args[2];
+                            *arg3 = builder.context.new_unary_op(None, UnaryOp::Minus, arg3.get_type(), *arg3);
+                            args = new_args.into();
+                        },
+                        _ => (),
                    }
-                    else {
-                        new_args.push(first_arg);
-                    }
-                    let arg4_index =
-                        match &*func_name {
-                            "__builtin_ia32_sqrtps512_mask" | "__builtin_ia32_sqrtpd512_mask" => 2,
-                            _ => 3,
-                        };
-                    let arg4_type = gcc_func.get_param_type(arg4_index);
-                    let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
-                    if add_before_last_arg {
-                        new_args.insert(new_args.len() - 1, minus_one);
-                    }
-                    else {
-                        new_args.push(minus_one);
-                    }
-                    args = new_args.into();
-                },
-                "__builtin_ia32_pternlogd512_mask" | "__builtin_ia32_pternlogd256_mask"
-                    | "__builtin_ia32_pternlogd128_mask" | "__builtin_ia32_pternlogq512_mask"
-                    | "__builtin_ia32_pternlogq256_mask" | "__builtin_ia32_pternlogq128_mask" => {
-                        let mut new_args = args.to_vec();
-                        let arg5_type = gcc_func.get_param_type(4);
-                        let minus_one = builder.context.new_rvalue_from_int(arg5_type, -1);
-                        new_args.push(minus_one);
-                        args = new_args.into();
-                    },
-                    "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
-                        let mut new_args = args.to_vec();
-
-                        let mut last_arg = None;
-                        if args.len() == 4 {
-                            last_arg = new_args.pop();
-                        }
-
-                        let arg4_type = gcc_func.get_param_type(3);
-                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
-                        new_args.push(minus_one);
-
-                        if args.len() == 3 {
-                            // Both llvm.fma.v16f32 and llvm.x86.avx512.vfmadd.ps.512 maps to
-                            // the same GCC intrinsic, but the former has 3 parameters and the
-                            // latter has 4 so it doesn't require this additional argument.
-                            let arg5_type = gcc_func.get_param_type(4);
-                            new_args.push(builder.context.new_rvalue_from_int(arg5_type, 4));
-                        }
-
-                        if let Some(last_arg) = last_arg {
-                            new_args.push(last_arg);
-                        }
-
-                        args = new_args.into();
-                    },
-                    "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
-                        | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
-                        | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
-                        | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask" => {
-                        let mut new_args = args.to_vec();
-                        let last_arg = new_args.pop().expect("last arg");
-                        let arg3_type = gcc_func.get_param_type(2);
-                        let undefined = builder.current_func().new_local(None, arg3_type, "undefined_for_intrinsic").to_rvalue();
-                        new_args.push(undefined);
-                        let arg4_type = gcc_func.get_param_type(3);
-                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
-                        new_args.push(minus_one);
-                        new_args.push(last_arg);
-                        args = new_args.into();
-                    },
-                    "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => {
-                        let mut new_args = args.to_vec();
-                        let last_arg = new_args.pop().expect("last arg");
-                        let arg4_type = gcc_func.get_param_type(3);
-                        let minus_one = builder.context.new_rvalue_from_int(arg4_type, -1);
-                        new_args.push(minus_one);
-                        new_args.push(last_arg);
-                        args = new_args.into();
-                    },
-                    _ => (),
+                }
+            },
+            "__builtin_ia32_ldmxcsr" => {
+                // The builtin __builtin_ia32_ldmxcsr takes an integer value while llvm.x86.sse.ldmxcsr takes a pointer,
+                // so dereference the pointer.
+                let mut new_args = args.to_vec();
+                let uint_ptr_type = builder.uint_type.make_pointer();
+                let arg1 = builder.context.new_cast(None, args[0], uint_ptr_type);
+                new_args[0] = arg1.dereference(None).to_rvalue();
+                args = new_args.into();
+            },
+            "__builtin_ia32_rcp14sd_mask" | "__builtin_ia32_rcp14ss_mask" | "__builtin_ia32_rsqrt14sd_mask"
+                | "__builtin_ia32_rsqrt14ss_mask" => {
+                let new_args = args.to_vec();
+                args = vec![new_args[1], new_args[0], new_args[2], new_args[3]].into();
+            },
+            "__builtin_ia32_sqrtsd_mask_round" | "__builtin_ia32_sqrtss_mask_round" => {
+                let new_args = args.to_vec();
+                args = vec![new_args[1], new_args[0], new_args[2], new_args[3], new_args[4]].into();
+            },
+            _ => (),
        }
    }

    args
 }

+/// Post-processes the value returned by the GCC builtin `func_name` for the builtins whose
+/// result needs reshaping; every other builtin gets `return_value` back untouched.
+///
+/// * `return_value`: the value produced by the call to the builtin.
+/// * `args`: the arguments the builtin was called with.
+/// * `args_adjusted`: whether the arguments were adjusted before the call.
+/// * `orig_args`: the arguments as they were before any adjustment.
+pub fn adjust_intrinsic_return_value<'a, 'gcc, 'tcx>(builder: &Builder<'a, 'gcc, 'tcx>, mut return_value: RValue<'gcc>, func_name: &str, args: &[RValue<'gcc>], args_adjusted: bool, orig_args: &[RValue<'gcc>]) -> RValue<'gcc> {
+    match func_name {
+        // With the "master" feature, keep only element 0 of the builtin's result.
+        "__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => {
+            #[cfg(feature="master")]
+            {
+                let index = builder.context.new_rvalue_zero(builder.int_type);
+                let first_element = builder.context.new_vector_access(None, return_value, index);
+                return_value = first_element.to_rvalue();
+            }
+        },
+        // llvm.x86.addcarry.32 and llvm.x86.addcarryx.u32 share one GCC builtin, yet only the
+        // former needs its return value reworked. The two LLVM intrinsics have different
+        // argument counts, so "the arguments were adjusted" is what tells them apart.
+        "__builtin_ia32_addcarryx_u64" | "__builtin_ia32_sbb_u64" | "__builtin_ia32_addcarryx_u32" | "__builtin_ia32_sbb_u32"
+            if args_adjusted => {
+            let result_ptr = args.last().expect("last arg");
+            let carry_field = builder.context.new_field(None, builder.u8_type, "carryFlag");
+            let result_field = builder.context.new_field(None, args[1].get_type(), "carryResult");
+            let pair_type = builder.context.new_struct_type(None, "addcarryResult", &[carry_field, result_field]);
+            let pair_as_type = pair_type.as_type();
+            // Pair the builtin's own return value with what it wrote through its last argument.
+            let written_result = result_ptr.dereference(None).to_rvalue();
+            return_value = builder.context.new_struct_constructor(None, pair_as_type, None, &[return_value, written_result]);
+        },
+        "__builtin_ia32_stmxcsr" => {
+            // __builtin_ia32_stmxcsr returns its result, whereas llvm.x86.sse.stmxcsr stores it
+            // through a pointer argument. That argument was dropped for the call (the builtin
+            // takes none), so the destination pointer is recovered from the original arguments.
+            let uint_ptr_type = builder.uint_type.make_pointer();
+            let destination = builder.context.new_cast(None, orig_args[0], uint_ptr_type);
+            builder.llbb().add_assignment(None, destination.dereference(None), return_value);
+            // The result now lives behind the pointer; hand back a dummy value so the builtin
+            // is not called a second time.
+            return builder.context.new_rvalue_zero(builder.int_type);
+        },
+        _ => (),
+    }
+
+    return_value
+}
+
+
 pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
-    // NOTE: these intrinsics have missing parameters before the last one, so ignore the
-    // last argument type check.
    // FIXME(antoyo): find a way to refactor in order to avoid this hack.
    match func_name {
+        // NOTE: these intrinsics have missing parameters before the last one, so ignore the
+        // last argument type check.
        "__builtin_ia32_maxps512_mask" | "__builtin_ia32_maxpd512_mask"
            | "__builtin_ia32_minps512_mask" | "__builtin_ia32_minpd512_mask" | "__builtin_ia32_sqrtps512_mask"
            | "__builtin_ia32_sqrtpd512_mask" | "__builtin_ia32_addps512_mask" | "__builtin_ia32_addpd512_mask"
            | "__builtin_ia32_subps512_mask" | "__builtin_ia32_subpd512_mask"
            | "__builtin_ia32_mulps512_mask" | "__builtin_ia32_mulpd512_mask"
            | "__builtin_ia32_divps512_mask" | "__builtin_ia32_divpd512_mask"
-            | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask" => {
+            | "__builtin_ia32_vfmaddsubps512_mask" | "__builtin_ia32_vfmaddsubpd512_mask"
+            | "__builtin_ia32_cvtdq2ps512_mask" | "__builtin_ia32_cvtudq2ps512_mask" => {
                if index == args_len - 1 {
                    return true;
                }
            },
+        "__builtin_ia32_rndscaless_mask_round" | "__builtin_ia32_rndscalesd_mask_round" => {
+            if index == 2 || index == 3 {
+                return true;
+            }
+        },
        "__builtin_ia32_vfmaddps512_mask" | "__builtin_ia32_vfmaddpd512_mask" => {
            // Since there are two LLVM intrinsics that map to each of these GCC builtins and only
            // one of them has a missing parameter before the last one, we check the number of
@ -162,6 +390,14 @@ pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
                return true;
            }
        },
+        // NOTE: the LLVM intrinsic receives 3 floats, but the GCC builtin requires 3 vectors.
+        "__builtin_ia32_vfmaddss3_round" | "__builtin_ia32_vfmaddsd3_round" => return true,
+        "__builtin_ia32_vplzcntd_512_mask" | "__builtin_ia32_vplzcntd_256_mask" | "__builtin_ia32_vplzcntd_128_mask"
+            | "__builtin_ia32_vplzcntq_512_mask" | "__builtin_ia32_vplzcntq_256_mask" | "__builtin_ia32_vplzcntq_128_mask" => {
+            if index == args_len - 1 {
+                return true;
+            }
+        },
        _ => (),
    }

@ -171,7 +407,7 @@ pub fn ignore_arg_cast(func_name: &str, index: usize, args_len: usize) -> bool {
 #[cfg(not(feature="master"))]
 pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function<'gcc> {
    match name {
-        "llvm.x86.xgetbv" => {
+        "llvm.x86.xgetbv" | "llvm.x86.sse2.pause" => {
            let gcc_name = "__builtin_trap";
            let func = cx.context.get_builtin_function(gcc_name);
            cx.functions.borrow_mut().insert(gcc_name.to_string(), func);
@ -183,24 +419,26 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function

 #[cfg(feature="master")]
 pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function<'gcc> {
+    match name {
+        "llvm.prefetch" => {
+            let gcc_name = "__builtin_prefetch";
+            let func = cx.context.get_builtin_function(gcc_name);
+            cx.functions.borrow_mut().insert(gcc_name.to_string(), func);
+            return func
+        },
+        _ => (),
+    }
+
    let gcc_name = match name {
        "llvm.x86.xgetbv" => "__builtin_ia32_xgetbv",
        // NOTE: this doc specifies the equivalent GCC builtins: http://huonw.github.io/llvmint/llvmint/x86/index.html
        "llvm.sqrt.v2f64" => "__builtin_ia32_sqrtpd",
        "llvm.x86.avx512.pmul.dq.512" => "__builtin_ia32_pmuldq512_mask",
        "llvm.x86.avx512.pmulu.dq.512" => "__builtin_ia32_pmuludq512_mask",
-        "llvm.x86.avx512.mask.pmaxs.q.256" => "__builtin_ia32_pmaxsq256_mask",
-        "llvm.x86.avx512.mask.pmaxs.q.128" => "__builtin_ia32_pmaxsq128_mask",
        "llvm.x86.avx512.max.ps.512" => "__builtin_ia32_maxps512_mask",
        "llvm.x86.avx512.max.pd.512" => "__builtin_ia32_maxpd512_mask",
-        "llvm.x86.avx512.mask.pmaxu.q.256" => "__builtin_ia32_pmaxuq256_mask",
-        "llvm.x86.avx512.mask.pmaxu.q.128" => "__builtin_ia32_pmaxuq128_mask",
-        "llvm.x86.avx512.mask.pmins.q.256" => "__builtin_ia32_pminsq256_mask",
-        "llvm.x86.avx512.mask.pmins.q.128" => "__builtin_ia32_pminsq128_mask",
        "llvm.x86.avx512.min.ps.512" => "__builtin_ia32_minps512_mask",
        "llvm.x86.avx512.min.pd.512" => "__builtin_ia32_minpd512_mask",
-        "llvm.x86.avx512.mask.pminu.q.256" => "__builtin_ia32_pminuq256_mask",
-        "llvm.x86.avx512.mask.pminu.q.128" => "__builtin_ia32_pminuq128_mask",
        "llvm.fma.v16f32" => "__builtin_ia32_vfmaddps512_mask",
        "llvm.fma.v8f64" => "__builtin_ia32_vfmaddpd512_mask",
        "llvm.x86.avx512.vfmaddsub.ps.512" => "__builtin_ia32_vfmaddsubps512_mask",
@ -221,6 +459,153 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
        "llvm.x86.avx512.div.pd.512" => "__builtin_ia32_divpd512_mask",
        "llvm.x86.avx512.vfmadd.ps.512" => "__builtin_ia32_vfmaddps512_mask",
        "llvm.x86.avx512.vfmadd.pd.512" => "__builtin_ia32_vfmaddpd512_mask",
+        "llvm.x86.avx512.sitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtdq2ps512_mask",
+        "llvm.x86.avx512.uitofp.round.v16f32.v16i32" => "__builtin_ia32_cvtudq2ps512_mask",
+        "llvm.x86.avx512.mask.ucmp.d.512" => "__builtin_ia32_ucmpd512_mask",
+        "llvm.x86.avx512.mask.ucmp.d.256" => "__builtin_ia32_ucmpd256_mask",
+        "llvm.x86.avx512.mask.ucmp.d.128" => "__builtin_ia32_ucmpd128_mask",
+        "llvm.x86.avx512.mask.cmp.d.512" => "__builtin_ia32_cmpd512_mask",
+        "llvm.x86.avx512.mask.cmp.d.256" => "__builtin_ia32_cmpd256_mask",
+        "llvm.x86.avx512.mask.cmp.d.128" => "__builtin_ia32_cmpd128_mask",
+        "llvm.x86.avx512.mask.ucmp.q.512" => "__builtin_ia32_ucmpq512_mask",
+        "llvm.x86.avx512.mask.ucmp.q.256" => "__builtin_ia32_ucmpq256_mask",
+        "llvm.x86.avx512.mask.ucmp.q.128" => "__builtin_ia32_ucmpq128_mask",
+        "llvm.x86.avx512.mask.cmp.q.512" => "__builtin_ia32_cmpq512_mask",
+        "llvm.x86.avx512.mask.cmp.q.256" => "__builtin_ia32_cmpq256_mask",
+        "llvm.x86.avx512.mask.cmp.q.128" => "__builtin_ia32_cmpq128_mask",
+        "llvm.x86.avx512.mask.max.ss.round" => "__builtin_ia32_maxss_mask_round",
+        "llvm.x86.avx512.mask.max.sd.round" => "__builtin_ia32_maxsd_mask_round",
+        "llvm.x86.avx512.mask.min.ss.round" => "__builtin_ia32_minss_mask_round",
+        "llvm.x86.avx512.mask.min.sd.round" => "__builtin_ia32_minsd_mask_round",
+        "llvm.x86.avx512.mask.sqrt.ss" => "__builtin_ia32_sqrtss_mask_round",
+        "llvm.x86.avx512.mask.sqrt.sd" => "__builtin_ia32_sqrtsd_mask_round",
+        "llvm.x86.avx512.mask.getexp.ss" => "__builtin_ia32_getexpss_mask_round",
+        "llvm.x86.avx512.mask.getexp.sd" => "__builtin_ia32_getexpsd_mask_round",
+        "llvm.x86.avx512.mask.getmant.ss" => "__builtin_ia32_getmantss_mask_round",
+        "llvm.x86.avx512.mask.getmant.sd" => "__builtin_ia32_getmantsd_mask_round",
+        "llvm.x86.avx512.mask.rndscale.ss" => "__builtin_ia32_rndscaless_mask_round",
+        "llvm.x86.avx512.mask.rndscale.sd" => "__builtin_ia32_rndscalesd_mask_round",
+        "llvm.x86.avx512.mask.scalef.ss" => "__builtin_ia32_scalefss_mask_round",
+        "llvm.x86.avx512.mask.scalef.sd" => "__builtin_ia32_scalefsd_mask_round",
+        "llvm.x86.avx512.vfmadd.f32" => "__builtin_ia32_vfmaddss3_round",
+        "llvm.x86.avx512.vfmadd.f64" => "__builtin_ia32_vfmaddsd3_round",
+        "llvm.ceil.v4f64" => "__builtin_ia32_ceilpd256",
+        "llvm.ceil.v8f32" => "__builtin_ia32_ceilps256",
+        "llvm.floor.v4f64" => "__builtin_ia32_floorpd256",
+        "llvm.floor.v8f32" => "__builtin_ia32_floorps256",
+        "llvm.sqrt.v4f64" => "__builtin_ia32_sqrtpd256",
+        "llvm.x86.sse.stmxcsr" => "__builtin_ia32_stmxcsr",
+        "llvm.x86.sse.ldmxcsr" => "__builtin_ia32_ldmxcsr",
+        "llvm.ctpop.v16i32" => "__builtin_ia32_vpopcountd_v16si",
+        "llvm.ctpop.v8i32" => "__builtin_ia32_vpopcountd_v8si",
+        "llvm.ctpop.v4i32" => "__builtin_ia32_vpopcountd_v4si",
+        "llvm.ctpop.v8i64" => "__builtin_ia32_vpopcountq_v8di",
+        "llvm.ctpop.v4i64" => "__builtin_ia32_vpopcountq_v4di",
+        "llvm.ctpop.v2i64" => "__builtin_ia32_vpopcountq_v2di",
+        "llvm.x86.addcarry.64" => "__builtin_ia32_addcarryx_u64",
+        "llvm.x86.subborrow.64" => "__builtin_ia32_sbb_u64",
+        "llvm.floor.v2f64" => "__builtin_ia32_floorpd",
+        "llvm.floor.v4f32" => "__builtin_ia32_floorps",
+        "llvm.ceil.v2f64" => "__builtin_ia32_ceilpd",
+        "llvm.ceil.v4f32" => "__builtin_ia32_ceilps",
+        "llvm.fma.v2f64" => "__builtin_ia32_vfmaddpd",
+        "llvm.fma.v4f64" => "__builtin_ia32_vfmaddpd256",
+        "llvm.fma.v4f32" => "__builtin_ia32_vfmaddps",
+        "llvm.fma.v8f32" => "__builtin_ia32_vfmaddps256",
+        "llvm.ctlz.v16i32" => "__builtin_ia32_vplzcntd_512_mask",
+        "llvm.ctlz.v8i32" => "__builtin_ia32_vplzcntd_256_mask",
+        "llvm.ctlz.v4i32" => "__builtin_ia32_vplzcntd_128_mask",
+        "llvm.ctlz.v8i64" => "__builtin_ia32_vplzcntq_512_mask",
+        "llvm.ctlz.v4i64" => "__builtin_ia32_vplzcntq_256_mask",
+        "llvm.ctlz.v2i64" => "__builtin_ia32_vplzcntq_128_mask",
+        "llvm.ctpop.v32i16" => "__builtin_ia32_vpopcountw_v32hi",
+        "llvm.x86.fma.vfmsub.sd" => "__builtin_ia32_vfmsubsd3",
+        "llvm.x86.fma.vfmsub.ss" => "__builtin_ia32_vfmsubss3",
+        "llvm.x86.fma.vfmsubadd.pd" => "__builtin_ia32_vfmaddsubpd",
+        "llvm.x86.fma.vfmsubadd.pd.256" => "__builtin_ia32_vfmaddsubpd256",
+        "llvm.x86.fma.vfmsubadd.ps" => "__builtin_ia32_vfmaddsubps",
+        "llvm.x86.fma.vfmsubadd.ps.256" => "__builtin_ia32_vfmaddsubps256",
+        "llvm.x86.fma.vfnmadd.sd" => "__builtin_ia32_vfnmaddsd3",
+        "llvm.x86.fma.vfnmadd.ss" => "__builtin_ia32_vfnmaddss3",
+        "llvm.x86.fma.vfnmsub.sd" => "__builtin_ia32_vfnmsubsd3",
+        "llvm.x86.fma.vfnmsub.ss" => "__builtin_ia32_vfnmsubss3",
+        "llvm.x86.avx512.conflict.d.512" => "__builtin_ia32_vpconflictsi_512_mask",
+        "llvm.x86.avx512.conflict.d.256" => "__builtin_ia32_vpconflictsi_256_mask",
+        "llvm.x86.avx512.conflict.d.128" => "__builtin_ia32_vpconflictsi_128_mask",
+        "llvm.x86.avx512.conflict.q.512" => "__builtin_ia32_vpconflictdi_512_mask",
+        "llvm.x86.avx512.conflict.q.256" => "__builtin_ia32_vpconflictdi_256_mask",
+        "llvm.x86.avx512.conflict.q.128" => "__builtin_ia32_vpconflictdi_128_mask",
+        "llvm.x86.avx512.vpermi2var.qi.512" => "__builtin_ia32_vpermt2varqi512_mask",
+        "llvm.x86.avx512.vpermi2var.qi.256" => "__builtin_ia32_vpermt2varqi256_mask",
+        "llvm.x86.avx512.vpermi2var.qi.128" => "__builtin_ia32_vpermt2varqi128_mask",
+        "llvm.x86.avx512.permvar.qi.512" => "__builtin_ia32_permvarqi512_mask",
+        "llvm.x86.avx512.permvar.qi.256" => "__builtin_ia32_permvarqi256_mask",
+        "llvm.x86.avx512.permvar.qi.128" => "__builtin_ia32_permvarqi128_mask",
+        "llvm.x86.avx512.pmultishift.qb.512" => "__builtin_ia32_vpmultishiftqb512_mask",
+        "llvm.x86.avx512.pmultishift.qb.256" => "__builtin_ia32_vpmultishiftqb256_mask",
+        "llvm.x86.avx512.pmultishift.qb.128" => "__builtin_ia32_vpmultishiftqb128_mask",
+        "llvm.ctpop.v16i16" => "__builtin_ia32_vpopcountw_v16hi",
+        "llvm.ctpop.v8i16" => "__builtin_ia32_vpopcountw_v8hi",
+        "llvm.ctpop.v64i8" => "__builtin_ia32_vpopcountb_v64qi",
+        "llvm.ctpop.v32i8" => "__builtin_ia32_vpopcountb_v32qi",
+        "llvm.ctpop.v16i8" => "__builtin_ia32_vpopcountb_v16qi",
+        "llvm.x86.avx512.mask.vpshufbitqmb.512" => "__builtin_ia32_vpshufbitqmb512_mask",
+        "llvm.x86.avx512.mask.vpshufbitqmb.256" => "__builtin_ia32_vpshufbitqmb256_mask",
+        "llvm.x86.avx512.mask.vpshufbitqmb.128" => "__builtin_ia32_vpshufbitqmb128_mask",
+        "llvm.x86.avx512.mask.ucmp.w.512" => "__builtin_ia32_ucmpw512_mask",
+        "llvm.x86.avx512.mask.ucmp.w.256" => "__builtin_ia32_ucmpw256_mask",
+        "llvm.x86.avx512.mask.ucmp.w.128" => "__builtin_ia32_ucmpw128_mask",
+        "llvm.x86.avx512.mask.ucmp.b.512" => "__builtin_ia32_ucmpb512_mask",
+        "llvm.x86.avx512.mask.ucmp.b.256" => "__builtin_ia32_ucmpb256_mask",
+        "llvm.x86.avx512.mask.ucmp.b.128" => "__builtin_ia32_ucmpb128_mask",
+        "llvm.x86.avx512.mask.cmp.w.512" => "__builtin_ia32_cmpw512_mask",
+        "llvm.x86.avx512.mask.cmp.w.256" => "__builtin_ia32_cmpw256_mask",
+        "llvm.x86.avx512.mask.cmp.w.128" => "__builtin_ia32_cmpw128_mask",
+        "llvm.x86.avx512.mask.cmp.b.512" => "__builtin_ia32_cmpb512_mask",
+        "llvm.x86.avx512.mask.cmp.b.256" => "__builtin_ia32_cmpb256_mask",
+        "llvm.x86.avx512.mask.cmp.b.128" => "__builtin_ia32_cmpb128_mask",
+        "llvm.x86.xrstor" => "__builtin_ia32_xrstor",
+        "llvm.x86.xsavec" => "__builtin_ia32_xsavec",
+        "llvm.x86.addcarry.32" => "__builtin_ia32_addcarryx_u32",
+        "llvm.x86.subborrow.32" => "__builtin_ia32_sbb_u32",
+        "llvm.x86.avx512.mask.compress.store.w.512" => "__builtin_ia32_compressstoreuhi512_mask",
+        "llvm.x86.avx512.mask.compress.store.w.256" => "__builtin_ia32_compressstoreuhi256_mask",
+        "llvm.x86.avx512.mask.compress.store.w.128" => "__builtin_ia32_compressstoreuhi128_mask",
+        "llvm.x86.avx512.mask.compress.store.b.512" => "__builtin_ia32_compressstoreuqi512_mask",
+        "llvm.x86.avx512.mask.compress.store.b.256" => "__builtin_ia32_compressstoreuqi256_mask",
+        "llvm.x86.avx512.mask.compress.store.b.128" => "__builtin_ia32_compressstoreuqi128_mask",
+        "llvm.x86.avx512.mask.compress.w.512" => "__builtin_ia32_compresshi512_mask",
+        "llvm.x86.avx512.mask.compress.w.256" => "__builtin_ia32_compresshi256_mask",
+        "llvm.x86.avx512.mask.compress.w.128" => "__builtin_ia32_compresshi128_mask",
+        "llvm.x86.avx512.mask.compress.b.512" => "__builtin_ia32_compressqi512_mask",
+        "llvm.x86.avx512.mask.compress.b.256" => "__builtin_ia32_compressqi256_mask",
+        "llvm.x86.avx512.mask.compress.b.128" => "__builtin_ia32_compressqi128_mask",
+        "llvm.x86.avx512.mask.expand.w.512" => "__builtin_ia32_expandhi512_mask",
+        "llvm.x86.avx512.mask.expand.w.256" => "__builtin_ia32_expandhi256_mask",
+        "llvm.x86.avx512.mask.expand.w.128" => "__builtin_ia32_expandhi128_mask",
+        "llvm.x86.avx512.mask.expand.b.512" => "__builtin_ia32_expandqi512_mask",
+        "llvm.x86.avx512.mask.expand.b.256" => "__builtin_ia32_expandqi256_mask",
+        "llvm.x86.avx512.mask.expand.b.128" => "__builtin_ia32_expandqi128_mask",
+        "llvm.fshl.v8i64" => "__builtin_ia32_vpshldv_v8di",
+        "llvm.fshl.v4i64" => "__builtin_ia32_vpshldv_v4di",
+        "llvm.fshl.v2i64" => "__builtin_ia32_vpshldv_v2di",
+        "llvm.fshl.v16i32" => "__builtin_ia32_vpshldv_v16si",
+        "llvm.fshl.v8i32" => "__builtin_ia32_vpshldv_v8si",
+        "llvm.fshl.v4i32" => "__builtin_ia32_vpshldv_v4si",
+        "llvm.fshl.v32i16" => "__builtin_ia32_vpshldv_v32hi",
+        "llvm.fshl.v16i16" => "__builtin_ia32_vpshldv_v16hi",
+        "llvm.fshl.v8i16" => "__builtin_ia32_vpshldv_v8hi",
+        "llvm.fshr.v8i64" => "__builtin_ia32_vpshrdv_v8di",
+        "llvm.fshr.v4i64" => "__builtin_ia32_vpshrdv_v4di",
+        "llvm.fshr.v2i64" => "__builtin_ia32_vpshrdv_v2di",
+        "llvm.fshr.v16i32" => "__builtin_ia32_vpshrdv_v16si",
+        "llvm.fshr.v8i32" => "__builtin_ia32_vpshrdv_v8si",
+        "llvm.fshr.v4i32" => "__builtin_ia32_vpshrdv_v4si",
+        "llvm.fshr.v32i16" => "__builtin_ia32_vpshrdv_v32hi",
+        "llvm.fshr.v16i16" => "__builtin_ia32_vpshrdv_v16hi",
+        "llvm.fshr.v8i16" => "__builtin_ia32_vpshrdv_v8hi",
+        "llvm.x86.fma.vfmadd.sd" => "__builtin_ia32_vfmaddsd3",
+        "llvm.x86.fma.vfmadd.ss" => "__builtin_ia32_vfmaddss3",

        // The above doc points to unknown builtins for the following, so override them:
        "llvm.x86.avx2.gather.d.d" => "__builtin_ia32_gathersiv4si",
@ -239,7 +624,151 @@ pub fn intrinsic<'gcc, 'tcx>(name: &str, cx: &CodegenCx<'gcc, 'tcx>) -> Function
        "llvm.x86.avx2.gather.q.q.256" => "__builtin_ia32_gatherdiv4di",
        "llvm.x86.avx2.gather.q.pd" => "__builtin_ia32_gatherdiv2df",
        "llvm.x86.avx2.gather.q.pd.256" => "__builtin_ia32_gatherdiv4df",
-        "" => "",
+        "llvm.x86.avx512.pslli.d.512" => "__builtin_ia32_pslldi512_mask",
+        "llvm.x86.avx512.psrli.d.512" => "__builtin_ia32_psrldi512_mask",
+        "llvm.x86.avx512.pslli.q.512" => "__builtin_ia32_psllqi512_mask",
+        "llvm.x86.avx512.psrli.q.512" => "__builtin_ia32_psrlqi512_mask",
+        "llvm.x86.avx512.psll.d.512" => "__builtin_ia32_pslld512_mask",
+        "llvm.x86.avx512.psrl.d.512" => "__builtin_ia32_psrld512_mask",
+        "llvm.x86.avx512.psll.q.512" => "__builtin_ia32_psllq512_mask",
+        "llvm.x86.avx512.psrl.q.512" => "__builtin_ia32_psrlq512_mask",
+        "llvm.x86.avx512.psra.d.512" => "__builtin_ia32_psrad512_mask",
+        "llvm.x86.avx512.psra.q.512" => "__builtin_ia32_psraq512_mask",
+        "llvm.x86.avx512.psra.q.256" => "__builtin_ia32_psraq256_mask",
+        "llvm.x86.avx512.psra.q.128" => "__builtin_ia32_psraq128_mask",
+        "llvm.x86.avx512.psrai.d.512" => "__builtin_ia32_psradi512_mask",
+        "llvm.x86.avx512.psrai.q.512" => "__builtin_ia32_psraqi512_mask",
+        "llvm.x86.avx512.psrai.q.256" => "__builtin_ia32_psraqi256_mask",
+        "llvm.x86.avx512.psrai.q.128" => "__builtin_ia32_psraqi128_mask",
+        "llvm.x86.avx512.psrav.d.512" => "__builtin_ia32_psrav16si_mask",
+        "llvm.x86.avx512.psrav.q.512" => "__builtin_ia32_psrav8di_mask",
+        "llvm.x86.avx512.psrav.q.256" => "__builtin_ia32_psravq256_mask",
+        "llvm.x86.avx512.psrav.q.128" => "__builtin_ia32_psravq128_mask",
+        "llvm.x86.avx512.psllv.d.512" => "__builtin_ia32_psllv16si_mask",
+        "llvm.x86.avx512.psrlv.d.512" => "__builtin_ia32_psrlv16si_mask",
+        "llvm.x86.avx512.psllv.q.512" => "__builtin_ia32_psllv8di_mask",
+        "llvm.x86.avx512.psrlv.q.512" => "__builtin_ia32_psrlv8di_mask",
+        "llvm.x86.avx512.permvar.si.512" => "__builtin_ia32_permvarsi512_mask",
+        "llvm.x86.avx512.vpermilvar.ps.512" => "__builtin_ia32_vpermilvarps512_mask",
+        "llvm.x86.avx512.vpermilvar.pd.512" => "__builtin_ia32_vpermilvarpd512_mask",
+        "llvm.x86.avx512.permvar.di.512" => "__builtin_ia32_permvardi512_mask",
+        "llvm.x86.avx512.permvar.di.256" => "__builtin_ia32_permvardi256_mask",
+        "llvm.x86.avx512.permvar.sf.512" => "__builtin_ia32_permvarsf512_mask",
+        "llvm.x86.avx512.permvar.df.512" => "__builtin_ia32_permvardf512_mask",
+        "llvm.x86.avx512.permvar.df.256" => "__builtin_ia32_permvardf256_mask",
+        "llvm.x86.avx512.vpermi2var.d.512" => "__builtin_ia32_vpermi2vard512_mask",
+        "llvm.x86.avx512.vpermi2var.d.256" => "__builtin_ia32_vpermi2vard256_mask",
+        "llvm.x86.avx512.vpermi2var.d.128" => "__builtin_ia32_vpermi2vard128_mask",
+        "llvm.x86.avx512.vpermi2var.q.512" => "__builtin_ia32_vpermi2varq512_mask",
+        "llvm.x86.avx512.vpermi2var.q.256" => "__builtin_ia32_vpermi2varq256_mask",
+        "llvm.x86.avx512.vpermi2var.q.128" => "__builtin_ia32_vpermi2varq128_mask",
+        "llvm.x86.avx512.vpermi2var.ps.512" => "__builtin_ia32_vpermi2varps512_mask",
+        "llvm.x86.avx512.vpermi2var.ps.256" => "__builtin_ia32_vpermi2varps256_mask",
+        "llvm.x86.avx512.vpermi2var.ps.128" => "__builtin_ia32_vpermi2varps128_mask",
+        "llvm.x86.avx512.vpermi2var.pd.512" => "__builtin_ia32_vpermi2varpd512_mask",
+        "llvm.x86.avx512.vpermi2var.pd.256" => "__builtin_ia32_vpermi2varpd256_mask",
+        "llvm.x86.avx512.vpermi2var.pd.128" => "__builtin_ia32_vpermi2varpd128_mask",
+        "llvm.x86.avx512.mask.add.ss.round" => "__builtin_ia32_addss_mask_round",
+        "llvm.x86.avx512.mask.add.sd.round" => "__builtin_ia32_addsd_mask_round",
+        "llvm.x86.avx512.mask.sub.ss.round" => "__builtin_ia32_subss_mask_round",
+        "llvm.x86.avx512.mask.sub.sd.round" => "__builtin_ia32_subsd_mask_round",
+        "llvm.x86.avx512.mask.mul.ss.round" => "__builtin_ia32_mulss_mask_round",
+        "llvm.x86.avx512.mask.mul.sd.round" => "__builtin_ia32_mulsd_mask_round",
+        "llvm.x86.avx512.mask.div.ss.round" => "__builtin_ia32_divss_mask_round",
+        "llvm.x86.avx512.mask.div.sd.round" => "__builtin_ia32_divsd_mask_round",
+        "llvm.x86.avx512.mask.cvtss2sd.round" => "__builtin_ia32_cvtss2sd_mask_round",
+        "llvm.x86.avx512.mask.cvtsd2ss.round" => "__builtin_ia32_cvtsd2ss_mask_round",
+        "llvm.x86.avx512.mask.range.ss" => "__builtin_ia32_rangess128_mask_round",
+        "llvm.x86.avx512.mask.range.sd" => "__builtin_ia32_rangesd128_mask_round",
+        "llvm.x86.avx512.rcp28.ss" => "__builtin_ia32_rcp28ss_mask_round",
+        "llvm.x86.avx512.rcp28.sd" => "__builtin_ia32_rcp28sd_mask_round",
+        "llvm.x86.avx512.rsqrt28.ss" => "__builtin_ia32_rsqrt28ss_mask_round",
+        "llvm.x86.avx512.rsqrt28.sd" => "__builtin_ia32_rsqrt28sd_mask_round",
+        "llvm.x86.avx512fp16.mask.add.sh.round" => "__builtin_ia32_addsh_mask_round",
+        "llvm.x86.avx512fp16.mask.div.sh.round" => "__builtin_ia32_divsh_mask_round",
+        "llvm.x86.avx512fp16.mask.getmant.sh" => "__builtin_ia32_getmantsh_mask_round",
+        "llvm.x86.avx512fp16.mask.max.sh.round" => "__builtin_ia32_maxsh_mask_round",
+        "llvm.x86.avx512fp16.mask.min.sh.round" => "__builtin_ia32_minsh_mask_round",
+        "llvm.x86.avx512fp16.mask.mul.sh.round" => "__builtin_ia32_mulsh_mask_round",
+        "llvm.x86.avx512fp16.mask.rndscale.sh" => "__builtin_ia32_rndscalesh_mask_round",
+        "llvm.x86.avx512fp16.mask.scalef.sh" => "__builtin_ia32_scalefsh_mask_round",
+        "llvm.x86.avx512fp16.mask.sub.sh.round" => "__builtin_ia32_subsh_mask_round",
+        "llvm.x86.avx512fp16.mask.vcvtsd2sh.round" => "__builtin_ia32_vcvtsd2sh_mask_round",
+        "llvm.x86.avx512fp16.mask.vcvtsh2sd.round" => "__builtin_ia32_vcvtsh2sd_mask_round",
+        "llvm.x86.avx512fp16.mask.vcvtsh2ss.round" => "__builtin_ia32_vcvtsh2ss_mask_round",
+        "llvm.x86.avx512fp16.mask.vcvtss2sh.round" => "__builtin_ia32_vcvtss2sh_mask_round",
+        "llvm.x86.aesni.aesenc.256" => "__builtin_ia32_vaesenc_v32qi",
+        "llvm.x86.aesni.aesenclast.256" => "__builtin_ia32_vaesenclast_v32qi",
+        "llvm.x86.aesni.aesdec.256" => "__builtin_ia32_vaesdec_v32qi",
+        "llvm.x86.aesni.aesdeclast.256" => "__builtin_ia32_vaesdeclast_v32qi",
+        "llvm.x86.aesni.aesenc.512" => "__builtin_ia32_vaesenc_v64qi",
+        "llvm.x86.aesni.aesenclast.512" => "__builtin_ia32_vaesenclast_v64qi",
+        "llvm.x86.aesni.aesdec.512" => "__builtin_ia32_vaesdec_v64qi",
+        "llvm.x86.aesni.aesdeclast.512" => "__builtin_ia32_vaesdeclast_v64qi",
+        "llvm.x86.avx512bf16.cvtne2ps2bf16.128" => "__builtin_ia32_cvtne2ps2bf16_v8bf",
+        "llvm.x86.avx512bf16.cvtne2ps2bf16.256" => "__builtin_ia32_cvtne2ps2bf16_v16bf",
+        "llvm.x86.avx512bf16.cvtne2ps2bf16.512" => "__builtin_ia32_cvtne2ps2bf16_v32bf",
+        "llvm.x86.avx512bf16.cvtneps2bf16.256" => "__builtin_ia32_cvtneps2bf16_v8sf",
+        "llvm.x86.avx512bf16.cvtneps2bf16.512" => "__builtin_ia32_cvtneps2bf16_v16sf",
+        "llvm.x86.avx512bf16.dpbf16ps.128" => "__builtin_ia32_dpbf16ps_v4sf",
+        "llvm.x86.avx512bf16.dpbf16ps.256" => "__builtin_ia32_dpbf16ps_v8sf",
+        "llvm.x86.avx512bf16.dpbf16ps.512" => "__builtin_ia32_dpbf16ps_v16sf",
+        "llvm.x86.pclmulqdq.512" => "__builtin_ia32_vpclmulqdq_v8di",
+        "llvm.x86.pclmulqdq.256" => "__builtin_ia32_vpclmulqdq_v4di",
+        "llvm.x86.avx512.pmulhu.w.512" => "__builtin_ia32_pmulhuw512_mask",
+        "llvm.x86.avx512.pmulh.w.512" => "__builtin_ia32_pmulhw512_mask",
+        "llvm.x86.avx512.pmul.hr.sw.512" => "__builtin_ia32_pmulhrsw512_mask",
+        "llvm.x86.avx512.pmaddw.d.512" => "__builtin_ia32_pmaddwd512_mask",
+        "llvm.x86.avx512.pmaddubs.w.512" => "__builtin_ia32_pmaddubsw512_mask",
+        "llvm.x86.avx512.packssdw.512" => "__builtin_ia32_packssdw512_mask",
+        "llvm.x86.avx512.packsswb.512" => "__builtin_ia32_packsswb512_mask",
+        "llvm.x86.avx512.packusdw.512" => "__builtin_ia32_packusdw512_mask",
+        "llvm.x86.avx512.packuswb.512" => "__builtin_ia32_packuswb512_mask",
+        "llvm.x86.avx512.pavg.w.512" => "__builtin_ia32_pavgw512_mask",
+        "llvm.x86.avx512.pavg.b.512" => "__builtin_ia32_pavgb512_mask",
+        "llvm.x86.avx512.psll.w.512" => "__builtin_ia32_psllw512_mask",
+        "llvm.x86.avx512.pslli.w.512" => "__builtin_ia32_psllwi512_mask",
+        "llvm.x86.avx512.psllv.w.512" => "__builtin_ia32_psllv32hi_mask",
+        "llvm.x86.avx512.psllv.w.256" => "__builtin_ia32_psllv16hi_mask",
+        "llvm.x86.avx512.psllv.w.128" => "__builtin_ia32_psllv8hi_mask",
+        "llvm.x86.avx512.psrl.w.512" => "__builtin_ia32_psrlw512_mask",
+        "llvm.x86.avx512.psrli.w.512" => "__builtin_ia32_psrlwi512_mask",
+        "llvm.x86.avx512.psrlv.w.512" => "__builtin_ia32_psrlv32hi_mask",
+        "llvm.x86.avx512.psrlv.w.256" => "__builtin_ia32_psrlv16hi_mask",
+        "llvm.x86.avx512.psrlv.w.128" => "__builtin_ia32_psrlv8hi_mask",
+        "llvm.x86.avx512.psra.w.512" => "__builtin_ia32_psraw512_mask",
+        "llvm.x86.avx512.psrai.w.512" => "__builtin_ia32_psrawi512_mask",
+        "llvm.x86.avx512.psrav.w.512" => "__builtin_ia32_psrav32hi_mask",
+        "llvm.x86.avx512.psrav.w.256" => "__builtin_ia32_psrav16hi_mask",
+        "llvm.x86.avx512.psrav.w.128" => "__builtin_ia32_psrav8hi_mask",
+        "llvm.x86.avx512.vpermi2var.hi.512" => "__builtin_ia32_vpermt2varhi512_mask",
+        "llvm.x86.avx512.vpermi2var.hi.256" => "__builtin_ia32_vpermt2varhi256_mask",
+        "llvm.x86.avx512.vpermi2var.hi.128" => "__builtin_ia32_vpermt2varhi128_mask",
+        "llvm.x86.avx512.permvar.hi.512" => "__builtin_ia32_permvarhi512_mask",
+        "llvm.x86.avx512.permvar.hi.256" => "__builtin_ia32_permvarhi256_mask",
+        "llvm.x86.avx512.permvar.hi.128" => "__builtin_ia32_permvarhi128_mask",
+        "llvm.x86.avx512.pshuf.b.512" => "__builtin_ia32_pshufb512_mask",
+        "llvm.x86.avx512.dbpsadbw.512" => "__builtin_ia32_dbpsadbw512_mask",
+        "llvm.x86.avx512.dbpsadbw.256" => "__builtin_ia32_dbpsadbw256_mask",
+        "llvm.x86.avx512.dbpsadbw.128" => "__builtin_ia32_dbpsadbw128_mask",
+        "llvm.x86.avx512.vpmadd52h.uq.512" => "__builtin_ia32_vpmadd52huq512_mask",
+        "llvm.x86.avx512.vpmadd52l.uq.512" => "__builtin_ia32_vpmadd52luq512_mask",
+        "llvm.x86.avx512.vpmadd52h.uq.256" => "__builtin_ia32_vpmadd52huq256_mask",
+        "llvm.x86.avx512.vpmadd52l.uq.256" => "__builtin_ia32_vpmadd52luq256_mask",
+        "llvm.x86.avx512.vpmadd52h.uq.128" => "__builtin_ia32_vpmadd52huq128_mask",
+        "llvm.x86.avx512.vpdpwssd.512" => "__builtin_ia32_vpdpwssd_v16si",
+        "llvm.x86.avx512.vpdpwssd.256" => "__builtin_ia32_vpdpwssd_v8si",
+        "llvm.x86.avx512.vpdpwssd.128" => "__builtin_ia32_vpdpwssd_v4si",
+        "llvm.x86.avx512.vpdpwssds.512" => "__builtin_ia32_vpdpwssds_v16si",
+        "llvm.x86.avx512.vpdpwssds.256" => "__builtin_ia32_vpdpwssds_v8si",
+        "llvm.x86.avx512.vpdpwssds.128" => "__builtin_ia32_vpdpwssds_v4si",
+        "llvm.x86.avx512.vpdpbusd.512" => "__builtin_ia32_vpdpbusd_v16si",
+        "llvm.x86.avx512.vpdpbusd.256" => "__builtin_ia32_vpdpbusd_v8si",
+        "llvm.x86.avx512.vpdpbusd.128" => "__builtin_ia32_vpdpbusd_v4si",
+        "llvm.x86.avx512.vpdpbusds.512" => "__builtin_ia32_vpdpbusds_v16si",
+        "llvm.x86.avx512.vpdpbusds.256" => "__builtin_ia32_vpdpbusds_v8si",
+        "llvm.x86.avx512.vpdpbusds.128" => "__builtin_ia32_vpdpbusds_v4si",
+
        // NOTE: this file is generated by https://github.com/GuillaumeGomez/llvmint/blob/master/generate_list.py
        _ => include!("archs.rs"),
    };
--- a/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs
+++ b/compiler/rustc_codegen_gcc/src/intrinsic/mod.rs
@ -1,6 +1,9 @@
 pub mod llvm;
 mod simd;

+#[cfg(feature="master")]
+use std::iter;
+
 use gccjit::{ComparisonOp, Function, RValue, ToRValue, Type, UnaryOp, FunctionType};
 use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::base::wants_msvc_seh;
@ -8,15 +11,23 @@ use rustc_codegen_ssa::common::IntPredicate;
 use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
 use rustc_codegen_ssa::mir::place::PlaceRef;
 use rustc_codegen_ssa::traits::{ArgAbiMethods, BaseTypeMethods, BuilderMethods, ConstMethods, IntrinsicCallMethods};
+#[cfg(feature="master")]
+use rustc_codegen_ssa::traits::{DerivedTypeMethods, MiscMethods};
 use rustc_middle::bug;
 use rustc_middle::ty::{self, Instance, Ty};
 use rustc_middle::ty::layout::LayoutOf;
+#[cfg(feature="master")]
+use rustc_middle::ty::layout::{FnAbiOf, HasTyCtxt};
 use rustc_span::{Span, Symbol, symbol::kw, sym};
 use rustc_target::abi::HasDataLayout;
 use rustc_target::abi::call::{ArgAbi, FnAbi, PassMode};
 use rustc_target::spec::PanicStrategy;
+#[cfg(feature="master")]
+use rustc_target::spec::abi::Abi;

 use crate::abi::GccType;
+#[cfg(feature="master")]
+use crate::abi::FnAbiGccExt;
 use crate::builder::Builder;
 use crate::common::{SignType, TypeReflection};
 use crate::context::CodegenCx;
@ -91,7 +102,7 @@ impl<'a, 'gcc, 'tcx> IntrinsicCallMethods<'tcx> for Builder<'a, 'gcc, 'tcx> {
        let name = tcx.item_name(def_id);
        let name_str = name.as_str();

-        let llret_ty = self.layout_of(ret_ty).gcc_type(self, true);
+        let llret_ty = self.layout_of(ret_ty).gcc_type(self);
        let result = PlaceRef::new_sized(llresult, fn_abi.ret.layout);

        let simple = get_simple_intrinsic(self, name);
@ -404,7 +415,7 @@ impl<'gcc, 'tcx> ArgAbiExt<'gcc, 'tcx> for ArgAbi<'tcx, Ty<'tcx>> {
    /// Gets the LLVM type for a place of the original Rust type of
    /// this argument/return, i.e., the result of `type_of::type_of`.
    fn memory_ty(&self, cx: &CodegenCx<'gcc, 'tcx>) -> Type<'gcc> {
-        self.layout.gcc_type(cx, true)
+        self.layout.gcc_type(cx)
    }

    /// Stores a direct/indirect value described by this ArgAbi into a
@ -1120,10 +1131,8 @@ impl<'a, 'gcc, 'tcx> Builder<'a, 'gcc, 'tcx> {
    }
 }

-fn try_intrinsic<'gcc, 'tcx>(bx: &mut Builder<'_, 'gcc, 'tcx>, try_func: RValue<'gcc>, data: RValue<'gcc>, _catch_func: RValue<'gcc>, dest: RValue<'gcc>) {
-    // NOTE: the `|| true` here is to use the panic=abort strategy with panic=unwind too
-    if bx.sess().panic_strategy() == PanicStrategy::Abort || true {
-        // TODO(bjorn3): Properly implement unwinding and remove the `|| true` once this is done.
+fn try_intrinsic<'a, 'b, 'gcc, 'tcx>(bx: &'b mut Builder<'a, 'gcc, 'tcx>, try_func: RValue<'gcc>, data: RValue<'gcc>, _catch_func: RValue<'gcc>, dest: RValue<'gcc>) {
+    if bx.sess().panic_strategy() == PanicStrategy::Abort {
        bx.call(bx.type_void(), None, try_func, &[data], None);
        // Return 0 unconditionally from the intrinsic call;
        // we can never unwind.
@ -1134,6 +1143,141 @@ fn try_intrinsic<'gcc, 'tcx>(bx: &mut Builder<'_, 'gcc, 'tcx>, try_func: RValue<
        unimplemented!();
    }
    else {
+        #[cfg(feature="master")]
+        codegen_gnu_try(bx, try_func, data, _catch_func, dest);
+        #[cfg(not(feature="master"))]
        unimplemented!();
    }
 }
+
+// Emits the standard `try` intrinsic for Rust using the GNU-like model of exceptions (e.g., the
+// normal semantics of LLVM's `landingpad` and `invoke` instructions): calls the `__rust_try` shim
+// with `(try_func, data, catch_func)` and stores its `i32` result (0 = normal, 1 = caught) in `dest`.
+//
+// This codegen is a little surprising because we always call a shim
+// function instead of inlining the call to `invoke` manually here. This is done
+// because in LLVM we're only allowed to have one personality per function
+// definition. The call to the `try` intrinsic is being inlined into the
+// function calling it, and that function may already have other personality
+// functions in play. By calling a shim we're guaranteed that our shim will have
+// the right personality function.
+#[cfg(feature="master")]
+fn codegen_gnu_try<'gcc>(bx: &mut Builder<'_, 'gcc, '_>, try_func: RValue<'gcc>, data: RValue<'gcc>, catch_func: RValue<'gcc>, dest: RValue<'gcc>) {
+    let cx: &CodegenCx<'gcc, '_> = bx.cx;
+    let (llty, func) = get_rust_try_fn(cx, &mut |mut bx| {
+        // Codegens the shim described above (`bx` here is the shim's own builder):
+        //
+        //   bx:
+        //      invoke %try_func(%data) normal %then unwind %catch
+        //
+        //   then:
+        //      ret 0
+        //
+        //   catch:
+        //      %ptr = __builtin_eh_pointer(0)
+        //      call %catch_func(%data, %ptr)
+        //      ret 1
+        let then = bx.append_sibling_block("then");
+        let catch = bx.append_sibling_block("catch");
+
+        let func = bx.current_func();
+        let try_func = func.get_param(0).to_rvalue(); // shim parameter 0 (shadows the outer `try_func`)
+        let data = func.get_param(1).to_rvalue(); // shim parameter 1 (shadows the outer `data`)
+        let catch_func = func.get_param(2).to_rvalue(); // shim parameter 2 (shadows the outer `catch_func`)
+        let try_func_ty = bx.type_func(&[bx.type_i8p()], bx.type_void());
+
+        let current_block = bx.block.clone(); // remember the entry block: the invoke is emitted there last
+
+        bx.switch_to_block(then);
+        bx.ret(bx.const_i32(0));
+
+        // Catch block: set the personality function, then fetch the pointer to
+        // the exception object being thrown via `__builtin_eh_pointer`.
+        //
+        // That pointer is passed to `catch_func` together with `data`.
+        bx.switch_to_block(catch);
+        bx.set_personality_fn(bx.eh_personality());
+
+        let eh_pointer_builtin = bx.cx.context.get_target_builtin_function("__builtin_eh_pointer");
+        let zero = bx.cx.context.new_rvalue_zero(bx.int_type);
+        let ptr = bx.cx.context.new_call(None, eh_pointer_builtin, &[zero]);
+        let catch_ty = bx.type_func(&[bx.type_i8p(), bx.type_i8p()], bx.type_void());
+        bx.call(catch_ty, None, catch_func, &[data, ptr], None);
+        bx.ret(bx.const_i32(1));
+
+        // NOTE: the blocks must be filled before adding the try/catch, otherwise gcc will not
+        // generate a try/catch.
+        // FIXME(antoyo): add a check in the libgccjit API to prevent this.
+        bx.switch_to_block(current_block);
+        bx.invoke(try_func_ty, None, try_func, &[data], then, catch, None);
+    });
+
+    let func = unsafe { std::mem::transmute(func) }; // NOTE(review): reinterprets the `Function` handle as the callee value `bx.call` takes — confirm the layouts match
+
+    // Note that no invoke is used here because by definition this function
+    // can't panic (that's what it's catching).
+    let ret = bx.call(llty, None, func, &[try_func, data, catch_func], None);
+    let i32_align = bx.tcx().data_layout.i32_align.abi;
+    bx.store(ret, dest, i32_align);
+}
+
+
+// Helper function used to get a handle, as a `(type, function)` pair, to the `__rust_try`
+// function used to catch exceptions; on first use it is built by `gen_fn` with `codegen`.
+//
+// This function is only generated once and is then cached in `cx.rust_try_fn`.
+#[cfg(feature="master")]
+fn get_rust_try_fn<'a, 'gcc, 'tcx>(cx: &'a CodegenCx<'gcc, 'tcx>, codegen: &mut dyn FnMut(Builder<'a, 'gcc, 'tcx>)) -> (Type<'gcc>, Function<'gcc>) {
+    if let Some(llfn) = cx.rust_try_fn.get() {
+        return llfn; // cache hit: `codegen` is not invoked
+    }
+
+    // Define the type up front for the signature of the rust_try function.
+    let tcx = cx.tcx;
+    let i8p = tcx.mk_mut_ptr(tcx.types.i8);
+    // `unsafe fn(*mut i8) -> ()`
+    let try_fn_ty = tcx.mk_fn_ptr(ty::Binder::dummy(tcx.mk_fn_sig(
+        iter::once(i8p),
+        tcx.mk_unit(),
+        false,
+        rustc_hir::Unsafety::Unsafe,
+        Abi::Rust,
+    )));
+    // `unsafe fn(*mut i8, *mut i8) -> ()`
+    let catch_fn_ty = tcx.mk_fn_ptr(ty::Binder::dummy(tcx.mk_fn_sig(
+        [i8p, i8p].iter().cloned(),
+        tcx.mk_unit(),
+        false,
+        rustc_hir::Unsafety::Unsafe,
+        Abi::Rust,
+    )));
+    // `unsafe fn(unsafe fn(*mut i8) -> (), *mut i8, unsafe fn(*mut i8, *mut i8) -> ()) -> i32`
+    let rust_fn_sig = ty::Binder::dummy(cx.tcx.mk_fn_sig(
+        [try_fn_ty, i8p, catch_fn_ty],
+        tcx.types.i32,
+        false,
+        rustc_hir::Unsafety::Unsafe,
+        Abi::Rust,
+    ));
+    let rust_try = gen_fn(cx, "__rust_try", rust_fn_sig, codegen);
+    cx.rust_try_fn.set(Some(rust_try)); // store the pair so later calls return early above
+    rust_try
+}
+
+// Helper function to give a `Builder` (on a fresh "entry-block") to a closure to codegen a shim function
+// declared as `name` with signature `rust_fn_sig`. Currently primarily used for the `try` intrinsic functions above.
+#[cfg(feature="master")]
+fn gen_fn<'a, 'gcc, 'tcx>(cx: &'a CodegenCx<'gcc, 'tcx>, name: &str, rust_fn_sig: ty::PolyFnSig<'tcx>, codegen: &mut dyn FnMut(Builder<'a, 'gcc, 'tcx>)) -> (Type<'gcc>, Function<'gcc>) {
+    let fn_abi = cx.fn_abi_of_fn_ptr(rust_fn_sig, ty::List::empty());
+    let (typ, _, _, _) = fn_abi.gcc_type(cx); // only the first tuple field is used; it is returned as the type
+    // FIXME(eddyb) find a nicer way to do this.
+    cx.linkage.set(FunctionType::Internal); // set before `declare_fn` — presumably `declare_fn` reads `cx.linkage` (TODO confirm)
+    let func = cx.declare_fn(name, fn_abi);
+    let func_val = unsafe { std::mem::transmute(func) }; // NOTE(review): reinterprets the declared handle as the type the calls below expect — confirm the layouts match
+    cx.set_frame_pointer_type(func_val);
+    cx.apply_target_cpu_attr(func_val);
+    let block = Builder::append_block(cx, func_val, "entry-block");
+    let bx = Builder::build(cx, block);
+    codegen(bx); // the closure fills in the function body
+    (typ, func)
+}
--- a/compiler/rustc_codegen_gcc/src/intrinsic/simd.rs
+++ b/compiler/rustc_codegen_gcc/src/intrinsic/simd.rs
@ -1,8 +1,13 @@
-use std::cmp::Ordering;
+#[cfg(feature="master")]
+use gccjit::{ComparisonOp, UnaryOp};
+use gccjit::ToRValue;
+use gccjit::{BinaryOp, RValue, Type};

-use gccjit::{BinaryOp, RValue, ToRValue, Type};
 use rustc_codegen_ssa::base::compare_simd_types;
-use rustc_codegen_ssa::common::TypeKind;
+use rustc_codegen_ssa::common::{IntPredicate, TypeKind};
+#[cfg(feature="master")]
+use rustc_codegen_ssa::errors::ExpectedPointerMutability;
+use rustc_codegen_ssa::errors::InvalidMonomorphization;
 use rustc_codegen_ssa::mir::operand::OperandRef;
 use rustc_codegen_ssa::mir::place::PlaceRef;
 use rustc_codegen_ssa::traits::{BaseTypeMethods, BuilderMethods};
@ -14,18 +19,21 @@ use rustc_span::{sym, Span, Symbol};
 use rustc_target::abi::Align;

 use crate::builder::Builder;
+#[cfg(feature="master")]
+use crate::context::CodegenCx;
+#[cfg(feature="master")]
+use crate::errors::{InvalidMonomorphizationExpectedSignedUnsigned, InvalidMonomorphizationInsertedType};
 use crate::errors::{
-    InvalidMonomorphizationExpectedSignedUnsigned, InvalidMonomorphizationExpectedSimd,
-    InvalidMonomorphizationInsertedType, InvalidMonomorphizationInvalidBitmask,
+    InvalidMonomorphizationExpectedSimd,
+    InvalidMonomorphizationInvalidBitmask,
    InvalidMonomorphizationInvalidFloatVector, InvalidMonomorphizationMaskType,
    InvalidMonomorphizationMismatchedLengths, InvalidMonomorphizationNotFloat,
    InvalidMonomorphizationReturnElement, InvalidMonomorphizationReturnIntegerType,
    InvalidMonomorphizationReturnLength, InvalidMonomorphizationReturnLengthInputType,
    InvalidMonomorphizationReturnType, InvalidMonomorphizationSimdShuffle,
-    InvalidMonomorphizationUnrecognized, InvalidMonomorphizationUnsupportedCast,
-    InvalidMonomorphizationUnsupportedElement, InvalidMonomorphizationUnsupportedOperation,
+    InvalidMonomorphizationUnrecognized, InvalidMonomorphizationUnsupportedElement,
+    InvalidMonomorphizationUnsupportedOperation,
 };
-use crate::intrinsic;

 pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
    bx: &mut Builder<'a, 'gcc, 'tcx>,
@ -105,14 +113,19 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
        let arg1_vector_type = arg1_type.unqualified().dyncast_vector().expect("vector type");
        let arg1_element_type = arg1_vector_type.get_element_type();

+        // NOTE: since the arguments can be vectors of floats, make sure the mask is a vector of
+        // integers.
+        let mask_element_type = bx.type_ix(arg1_element_type.get_size() as u64 * 8);
+        let vector_mask_type = bx.context.new_vector_type(mask_element_type, arg1_vector_type.get_num_units() as u64);
+
        let mut elements = vec![];
        let one = bx.context.new_rvalue_one(mask.get_type());
        for _ in 0..len {
-            let element = bx.context.new_cast(None, mask & one, arg1_element_type);
+            let element = bx.context.new_cast(None, mask & one, mask_element_type);
            elements.push(element);
            mask = mask >> one;
        }
-        let vector_mask = bx.context.new_rvalue_from_vector(None, arg1_type, &elements);
+        let vector_mask = bx.context.new_rvalue_from_vector(None, vector_mask_type, &elements);

        return Ok(bx.vector_select(vector_mask, arg1, args[2].immediate()));
    }
@ -210,48 +223,12 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
        let vector = args[0].immediate();
        let index = args[1].immediate();
        let value = args[2].immediate();
-        // TODO(antoyo): use a recursive unqualified() here.
-        let vector_type = vector.get_type().unqualified().dyncast_vector().expect("vector type");
-        let element_type = vector_type.get_element_type();
-        // NOTE: we cannot cast to an array and assign to its element here because the value might
-        // not be an l-value. So, call a builtin to set the element.
-        // TODO(antoyo): perhaps we could create a new vector or maybe there's a GIMPLE instruction for that?
-        // TODO(antoyo): don't use target specific builtins here.
-        let func_name = match in_len {
-            2 => {
-                if element_type == bx.i64_type {
-                    "__builtin_ia32_vec_set_v2di"
-                } else {
-                    unimplemented!();
-                }
-            }
-            4 => {
-                if element_type == bx.i32_type {
-                    "__builtin_ia32_vec_set_v4si"
-                } else {
-                    unimplemented!();
-                }
-            }
-            8 => {
-                if element_type == bx.i16_type {
-                    "__builtin_ia32_vec_set_v8hi"
-                } else {
-                    unimplemented!();
-                }
-            }
-            _ => unimplemented!("Len: {}", in_len),
-        };
-        let builtin = bx.context.get_target_builtin_function(func_name);
-        let param1_type = builtin.get_param(0).to_rvalue().get_type();
-        // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
-        let vector = bx.cx.bitcast_if_needed(vector, param1_type);
-        let result = bx.context.new_call(
-            None,
-            builtin,
-            &[vector, value, bx.context.new_cast(None, index, bx.int_type)],
-        );
-        // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
-        return Ok(bx.context.new_bitcast(None, result, vector.get_type()));
+        let variable = bx.current_func().new_local(None, vector.get_type(), "new_vector");
+        bx.llbb().add_assignment(None, variable, vector);
+        let lvalue = bx.context.new_vector_access(None, variable.to_rvalue(), index);
+        // TODO(antoyo): if simd_insert is constant, use BIT_REF.
+        bx.llbb().add_assignment(None, lvalue, value);
+        return Ok(variable.to_rvalue());
    }

    #[cfg(feature = "master")]
@ -280,7 +257,8 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
        return Ok(bx.vector_select(args[0].immediate(), args[1].immediate(), args[2].immediate()));
    }

-    if name == sym::simd_cast {
+    #[cfg(feature="master")]
+    if name == sym::simd_cast || name == sym::simd_as {
        require_simd!(ret_ty, "return");
        let (out_len, out_elem) = ret_ty.simd_size_and_type(bx.tcx());
        require!(
@ -301,125 +279,40 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(

        enum Style {
            Float,
-            Int(/* is signed? */ bool),
+            Int,
            Unsupported,
        }

-        let (in_style, in_width) = match in_elem.kind() {
-            // vectors of pointer-sized integers should've been
-            // disallowed before here, so this unwrap is safe.
-            ty::Int(i) => (
-                Style::Int(true),
-                i.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
-            ),
-            ty::Uint(u) => (
-                Style::Int(false),
-                u.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
-            ),
-            ty::Float(f) => (Style::Float, f.bit_width()),
-            _ => (Style::Unsupported, 0),
-        };
-        let (out_style, out_width) = match out_elem.kind() {
-            ty::Int(i) => (
-                Style::Int(true),
-                i.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
-            ),
-            ty::Uint(u) => (
-                Style::Int(false),
-                u.normalize(bx.tcx().sess.target.pointer_width).bit_width().unwrap(),
-            ),
-            ty::Float(f) => (Style::Float, f.bit_width()),
-            _ => (Style::Unsupported, 0),
-        };
-
-        let extend = |in_type, out_type| {
-            let vector_type = bx.context.new_vector_type(out_type, 8);
-            let vector = args[0].immediate();
-            let array_type = bx.context.new_array_type(None, in_type, 8);
-            // TODO(antoyo): switch to using new_vector_access or __builtin_convertvector for vector casting.
-            let array = bx.context.new_bitcast(None, vector, array_type);
-
-            let cast_vec_element = |index| {
-                let index = bx.context.new_rvalue_from_int(bx.int_type, index);
-                bx.context.new_cast(
-                    None,
-                    bx.context.new_array_access(None, array, index).to_rvalue(),
-                    out_type,
-                )
+        let in_style =
+            match in_elem.kind() {
+                ty::Int(_) | ty::Uint(_) => Style::Int,
+                ty::Float(_) => Style::Float,
+                 _ => Style::Unsupported,
            };

-            bx.context.new_rvalue_from_vector(
-                None,
-                vector_type,
-                &[
-                    cast_vec_element(0),
-                    cast_vec_element(1),
-                    cast_vec_element(2),
-                    cast_vec_element(3),
-                    cast_vec_element(4),
-                    cast_vec_element(5),
-                    cast_vec_element(6),
-                    cast_vec_element(7),
-                ],
-            )
-        };
+        let out_style =
+            match out_elem.kind() {
+                ty::Int(_) | ty::Uint(_) => Style::Int,
+                ty::Float(_) => Style::Float,
+                 _ => Style::Unsupported,
+            };

        match (in_style, out_style) {
-            (Style::Int(in_is_signed), Style::Int(_)) => {
-                return Ok(match in_width.cmp(&out_width) {
-                    Ordering::Greater => bx.trunc(args[0].immediate(), llret_ty),
-                    Ordering::Equal => args[0].immediate(),
-                    Ordering::Less => {
-                        if in_is_signed {
-                            match (in_width, out_width) {
-                                // FIXME(antoyo): the function _mm_cvtepi8_epi16 should directly
-                                // call an intrinsic equivalent to __builtin_ia32_pmovsxbw128 so that
-                                // we can generate a call to it.
-                                (8, 16) => extend(bx.i8_type, bx.i16_type),
-                                (8, 32) => extend(bx.i8_type, bx.i32_type),
-                                (8, 64) => extend(bx.i8_type, bx.i64_type),
-                                (16, 32) => extend(bx.i16_type, bx.i32_type),
-                                (32, 64) => extend(bx.i32_type, bx.i64_type),
-                                (16, 64) => extend(bx.i16_type, bx.i64_type),
-                                _ => unimplemented!("in: {}, out: {}", in_width, out_width),
-                            }
-                        } else {
-                            match (in_width, out_width) {
-                                (8, 16) => extend(bx.u8_type, bx.u16_type),
-                                (8, 32) => extend(bx.u8_type, bx.u32_type),
-                                (8, 64) => extend(bx.u8_type, bx.u64_type),
-                                (16, 32) => extend(bx.u16_type, bx.u32_type),
-                                (16, 64) => extend(bx.u16_type, bx.u64_type),
-                                (32, 64) => extend(bx.u32_type, bx.u64_type),
-                                _ => unimplemented!("in: {}, out: {}", in_width, out_width),
-                            }
-                        }
+            (Style::Unsupported, Style::Unsupported) => {
+                require!(
+                    false,
+                    InvalidMonomorphization::UnsupportedCast {
+                        span,
+                        name,
+                        in_ty,
+                        in_elem,
+                        ret_ty,
+                        out_elem
                    }
-                });
-            }
-            (Style::Int(_), Style::Float) => {
-                // TODO: add support for internal functions in libgccjit to get access to IFN_VEC_CONVERT which is
-                // doing like __builtin_convertvector?
-                // Or maybe provide convert_vector as an API since it might not easy to get the
-                // types of internal functions.
-                unimplemented!();
-            }
-            (Style::Float, Style::Int(_)) => {
-                unimplemented!();
-            }
-            (Style::Float, Style::Float) => {
-                unimplemented!();
-            }
-            _ => { /* Unsupported. Fallthrough. */ }
+                );
+            },
+            _ => return Ok(bx.context.convert_vector(None, args[0].immediate(), llret_ty)),
        }
-        return_error!(InvalidMonomorphizationUnsupportedCast {
-            span,
-            name,
-            in_ty,
-            in_elem,
-            ret_ty,
-            out_elem
-        });
    }

    macro_rules! arith_binary {
@ -436,6 +329,71 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
        }
    }

+    if name == sym::simd_bitmask {
+        // The `fn simd_bitmask(vector) -> unsigned integer` intrinsic takes a
+        // vector mask and returns the most significant bit (MSB) of each lane in the form
+        // of either:
+        // * an unsigned integer
+        // * an array of `u8`
+        // If the vector has fewer than 8 lanes, a u8 is returned with zeroed trailing bits.
+        //
+        // The bit order of the result depends on the byte endianness, LSB-first for little
+        // endian and MSB-first for big endian.
+        // NOTE(review): the loop below always stores lane `i` at bit `i`; no big-endian
+        // specific handling is visible in this block -- confirm whether that is intended.
+
+        let vector = args[0].immediate();
+        let vector_type = vector.get_type().dyncast_vector().expect("vector type");
+        let elem_type = vector_type.get_element_type();
+
+        // The integer result has one bit per lane but is never narrower than 8 bits;
+        // `expected_bytes` is that bit count divided by 8, rounded up.
+        let expected_int_bits = in_len.max(8);
+        let expected_bytes = expected_int_bits / 8 + ((expected_int_bits % 8 > 0) as u64);
+
+        // FIXME(antoyo): that's not going to work for masks bigger than 128 bits.
+        let result_type = bx.type_ix(expected_int_bits);
+        let mut result = bx.context.new_rvalue_zero(result_type);
+
+        // Presumably `get_size()` is in bytes, which makes `elem_size` the lane width in bits
+        // and `sign_shift` the shift that moves a lane's MSB down to bit 0.
+        let elem_size = elem_type.get_size() * 8;
+        let sign_shift = bx.context.new_rvalue_from_int(elem_type, elem_size as i32 - 1);
+        let one = bx.context.new_rvalue_one(elem_type);
+
+        // For every lane: extract it, shift its MSB down to bit 0, mask off everything else
+        // (the `& one` discards any bits the right shift may have filled in), cast to the
+        // result type and OR it into bit `shift` of `result`. `shift` advances in lockstep
+        // with `i`.
+        let mut shift = 0;
+        for i in 0..in_len {
+            let elem = bx.extract_element(vector, bx.context.new_rvalue_from_int(bx.int_type, i as i32));
+            let shifted = elem >> sign_shift;
+            let masked = shifted & one;
+            result = result | (bx.context.new_cast(None, masked, result_type) << bx.context.new_rvalue_from_int(result_type, shift));
+            shift += 1;
+        }
+
+        match ret_ty.kind() {
+            ty::Uint(i) if i.bit_width() == Some(expected_int_bits) => {
+                // The guard matched the return width against `expected_int_bits`, the width
+                // `result` was built with, so `result` is returned without any extension.
+                return Ok(result);
+            }
+            ty::Array(elem, len)
+                if matches!(elem.kind(), ty::Uint(ty::UintTy::U8))
+                    && len.try_eval_target_usize(bx.tcx, ty::ParamEnv::reveal_all())
+                        == Some(expected_bytes) =>
+            {
+                // Zero-extend iN to the array length:
+                let ze = bx.zext(result, bx.type_ix(expected_bytes * 8));
+
+                // Convert the integer to a byte array
+                // (store it to a stack slot, then reload that slot through a pointer to
+                // an array of `expected_bytes` 8-bit elements).
+                let ptr = bx.alloca(bx.type_ix(expected_bytes * 8), Align::ONE);
+                bx.store(ze, ptr, Align::ONE);
+                let array_ty = bx.type_array(bx.type_i8(), expected_bytes);
+                let ptr = bx.pointercast(ptr, bx.cx.type_ptr_to(array_ty));
+                return Ok(bx.load(array_ty, ptr, Align::ONE));
+            }
+            // Any other return type is rejected with a monomorphization error.
+            _ => return_error!(InvalidMonomorphization::CannotReturn {
+                span,
+                name,
+                ret_ty,
+                expected_int_bits,
+                expected_bytes
+            }),
+        }
+    }
+
    fn simd_simple_float_intrinsic<'gcc, 'tcx>(
        name: Symbol,
        in_elem: Ty<'_>,
@ -451,55 +409,66 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
                return Err(());
            }};
        }
-        let (elem_ty_str, elem_ty) = if let ty::Float(f) = in_elem.kind() {
-            let elem_ty = bx.cx.type_float_from_ty(*f);
-            match f.bit_width() {
-                32 => ("f32", elem_ty),
-                64 => ("f64", elem_ty),
-                _ => {
-                    return_error!(InvalidMonomorphizationInvalidFloatVector {
-                        span,
-                        name,
-                        elem_ty: f.name_str(),
-                        vec_ty: in_ty
-                    });
+        let (elem_ty_str, elem_ty) =
+            if let ty::Float(f) = in_elem.kind() {
+                let elem_ty = bx.cx.type_float_from_ty(*f);
+                match f.bit_width() {
+                    32 => ("f", elem_ty),
+                    64 => ("", elem_ty),
+                    _ => {
+                        return_error!(InvalidMonomorphizationInvalidFloatVector { span, name, elem_ty: f.name_str(), vec_ty: in_ty });
+                    }
                }
            }
-        } else {
-            return_error!(InvalidMonomorphizationNotFloat { span, name, ty: in_ty });
-        };
+            else {
+                return_error!(InvalidMonomorphizationNotFloat { span, name, ty: in_ty });
+            };

        let vec_ty = bx.cx.type_vector(elem_ty, in_len);

-        let (intr_name, fn_ty) = match name {
-            sym::simd_ceil => ("ceil", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_fabs => ("fabs", bx.type_func(&[vec_ty], vec_ty)), // TODO(antoyo): pand with 170141183420855150465331762880109871103
-            sym::simd_fcos => ("cos", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_fexp2 => ("exp2", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_fexp => ("exp", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_flog10 => ("log10", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_flog2 => ("log2", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_flog => ("log", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_floor => ("floor", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_fma => ("fma", bx.type_func(&[vec_ty, vec_ty, vec_ty], vec_ty)),
-            sym::simd_fpowi => ("powi", bx.type_func(&[vec_ty, bx.type_i32()], vec_ty)),
-            sym::simd_fpow => ("pow", bx.type_func(&[vec_ty, vec_ty], vec_ty)),
-            sym::simd_fsin => ("sin", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_fsqrt => ("sqrt", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_round => ("round", bx.type_func(&[vec_ty], vec_ty)),
-            sym::simd_trunc => ("trunc", bx.type_func(&[vec_ty], vec_ty)),
-            _ => return_error!(InvalidMonomorphizationUnrecognized { span, name }),
-        };
-        let llvm_name = &format!("llvm.{0}.v{1}{2}", intr_name, in_len, elem_ty_str);
-        let function = intrinsic::llvm::intrinsic(llvm_name, &bx.cx);
-        let function: RValue<'gcc> = unsafe { std::mem::transmute(function) };
-        let c = bx.call(
-            fn_ty,
-            None,
-            function,
-            &args.iter().map(|arg| arg.immediate()).collect::<Vec<_>>(),
-            None,
-        );
+        let intr_name =
+            match name {
+                sym::simd_ceil => "ceil",
+                sym::simd_fabs => "fabs", // TODO(antoyo): pand with 170141183420855150465331762880109871103
+                sym::simd_fcos => "cos",
+                sym::simd_fexp2 => "exp2",
+                sym::simd_fexp => "exp",
+                sym::simd_flog10 => "log10",
+                sym::simd_flog2 => "log2",
+                sym::simd_flog => "log",
+                sym::simd_floor => "floor",
+                sym::simd_fma => "fma",
+                sym::simd_fpowi => "__builtin_powi",
+                sym::simd_fpow => "pow",
+                sym::simd_fsin => "sin",
+                sym::simd_fsqrt => "sqrt",
+                sym::simd_round => "round",
+                sym::simd_trunc => "trunc",
+                _ => return_error!(InvalidMonomorphizationUnrecognized { span, name })
+            };
+        let builtin_name = format!("{}{}", intr_name, elem_ty_str);
+        let funcs = bx.cx.functions.borrow();
+        let function = funcs.get(&builtin_name).unwrap_or_else(|| panic!("unable to find builtin function {}", builtin_name));
+
+        // TODO(antoyo): add platform-specific behavior here for architectures that have these
+        // intrinsics as instructions (for instance, gpus)
+        let mut vector_elements = vec![];
+        for i in 0..in_len {
+            let index = bx.context.new_rvalue_from_long(bx.ulong_type, i as i64);
+            // we have to treat fpowi specially, since fpowi's second argument is always an i32
+            let arguments = if name == sym::simd_fpowi {
+                vec![
+                    bx.extract_element(args[0].immediate(), index).to_rvalue(),
+                    args[1].immediate(),
+                ]
+            } else {
+                args.iter()
+                    .map(|arg| bx.extract_element(arg.immediate(), index).to_rvalue())
+                    .collect()
+            };
+            vector_elements.push(bx.context.new_call(None, *function, &arguments));
+        }
+        let c = bx.context.new_rvalue_from_vector(None, vec_ty, &vector_elements);
        Ok(c)
    }

@ -525,6 +494,297 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
        return simd_simple_float_intrinsic(name, in_elem, in_ty, in_len, bx, span, args);
    }

+    // Builds the GCC vector type that has `vec_len` lanes of the GCC type matching the
+    // Rust scalar type `elem_ty`. Only integer and float element types are expected;
+    // any other kind reaches `unreachable!()`.
+    #[cfg(feature="master")]
+    fn vector_ty<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, elem_ty: Ty<'tcx>, vec_len: u64) -> Type<'gcc> {
+        // FIXME: use cx.layout_of(ty).llvm_type() ?
+        let lane_type = match elem_ty.kind() {
+            ty::Int(int_ty) => cx.type_int_from_ty(*int_ty),
+            ty::Uint(uint_ty) => cx.type_uint_from_ty(*uint_ty),
+            ty::Float(float_ty) => cx.type_float_from_ty(*float_ty),
+            _ => unreachable!(),
+        };
+        cx.type_vector(lane_type, vec_len)
+    }
+
+    // Shared helper for `simd_gather` and `simd_scatter`.
+    //
+    // Builds a vector by dereferencing each of the `in_len` lanes of `pointers`, then
+    // combines that vector with `default` through `bx.shuffle_vector`, using a shuffle
+    // mask derived from `mask`.
+    //
+    // * `default`: vector passed as the other shuffle operand.
+    // * `pointers`: vector whose lanes are reinterpreted as pointers to the element type.
+    // * `mask`: vector whose lanes decide the shuffle index for each lane.
+    // * `pointer_count`: when greater than 1 the loaded lanes are typed as `usize`;
+    //   otherwise they use the vector type derived from `underlying_ty`.
+    // * `invert`: when true the loaded vector is the first shuffle operand and `default`
+    //   the second; when false the order is reversed.
+    //
+    // NOTE(review): the first loop builds a dereference for every lane without consulting
+    // `mask` -- confirm that this is acceptable for lanes the mask disables.
+    #[cfg(feature="master")]
+    fn gather<'a, 'gcc, 'tcx>(default: RValue<'gcc>, pointers: RValue<'gcc>, mask: RValue<'gcc>, pointer_count: usize, bx: &mut Builder<'a, 'gcc, 'tcx>, in_len: u64, underlying_ty: Ty<'tcx>, invert: bool) -> RValue<'gcc> {
+        let vector_type =
+            if pointer_count > 1 {
+                bx.context.new_vector_type(bx.usize_type, in_len)
+            }
+            else {
+                vector_ty(bx, underlying_ty, in_len)
+            };
+        let elem_type = vector_type.dyncast_vector().expect("vector type").get_element_type();
+
+        // Read one value per lane: take lane `i` of `pointers`, bitcast it to a pointer to
+        // the element type and dereference it.
+        let mut values = vec![];
+        for i in 0..in_len {
+            let index = bx.context.new_rvalue_from_long(bx.i32_type, i as i64);
+            let int = bx.context.new_vector_access(None, pointers, index).to_rvalue();
+
+            let ptr_type = elem_type.make_pointer();
+            let ptr = bx.context.new_bitcast(None, int, ptr_type);
+            let value = ptr.dereference(None).to_rvalue();
+            values.push(value);
+        }
+
+        let vector = bx.context.new_rvalue_from_vector(None, vector_type, &values);
+
+        // Build the shuffle mask as a struct of `in_len` i32 fields. Field `i` holds
+        // `i + (in_len & mask[i])`.
+        // NOTE(review): presumably mask lanes are either 0 or all-ones, so the index is
+        // either `i` (first shuffle operand) or `i + in_len` (second operand) -- confirm.
+        let mut mask_types = vec![];
+        let mut mask_values = vec![];
+        for i in 0..in_len {
+            let index = bx.context.new_rvalue_from_long(bx.i32_type, i as i64);
+            mask_types.push(bx.context.new_field(None, bx.i32_type, "m"));
+            let mask_value = bx.context.new_vector_access(None, mask, index).to_rvalue();
+            let masked = bx.context.new_rvalue_from_int(bx.i32_type, in_len as i32) & mask_value;
+            let value = index + masked;
+            mask_values.push(value);
+        }
+        let mask_type = bx.context.new_struct_type(None, "mask_type", &mask_types);
+        let mask = bx.context.new_struct_constructor(None, mask_type.as_type(), None, &mask_values);
+
+        // `invert` only swaps which of the two vectors is the first shuffle operand.
+        if invert {
+            bx.shuffle_vector(vector, default, mask)
+        }
+        else {
+            bx.shuffle_vector(default, vector, mask)
+        }
+    }
+
+    #[cfg(feature="master")]
+    if name == sym::simd_gather {
+        // simd_gather(values: <N x T>, pointers: <N x *_ T>,
+        //             mask: <N x i{M}>) -> <N x T>
+        // * N: number of elements in the input vectors
+        // * T: type of the element to load
+        // * M: any integer width is supported, will be truncated to i1
+
+        // All types must be simd vector types
+        require_simd!(in_ty, "first");
+        require_simd!(arg_tys[1], "second");
+        require_simd!(arg_tys[2], "third");
+        require_simd!(ret_ty, "return");
+
+        // Of the same length:
+        let (out_len, _) = arg_tys[1].simd_size_and_type(bx.tcx());
+        let (out_len2, _) = arg_tys[2].simd_size_and_type(bx.tcx());
+        require!(
+            in_len == out_len,
+            InvalidMonomorphization::SecondArgumentLength {
+                span,
+                name,
+                in_len,
+                in_ty,
+                arg_ty: arg_tys[1],
+                out_len
+            }
+        );
+        require!(
+            in_len == out_len2,
+            InvalidMonomorphization::ThirdArgumentLength {
+                span,
+                name,
+                in_len,
+                in_ty,
+                arg_ty: arg_tys[2],
+                out_len: out_len2
+            }
+        );
+
+        // The return type must match the first argument type
+        require!(
+            ret_ty == in_ty,
+            InvalidMonomorphization::ExpectedReturnType { span, name, in_ty, ret_ty }
+        );
+
+        // Returns the number of raw-pointer layers wrapped around `t`
+        // (0 when `t` is not a raw pointer).
+        fn ptr_count(t: Ty<'_>) -> usize {
+            let mut depth = 0;
+            let mut current = t;
+            while let ty::RawPtr(pointee) = current.kind() {
+                depth += 1;
+                current = pointee.ty;
+            }
+            depth
+        }
+
+        // Strips every raw-pointer layer from `t` and returns the innermost pointee type
+        // (`t` itself when it is not a raw pointer).
+        fn non_ptr(t: Ty<'_>) -> Ty<'_> {
+            let mut current = t;
+            while let ty::RawPtr(pointee) = current.kind() {
+                current = pointee.ty;
+            }
+            current
+        }
+
+        // The second argument must be a simd vector with an element type that's a pointer
+        // to the element type of the first argument
+        let (_, element_ty0) = arg_tys[0].simd_size_and_type(bx.tcx());
+        let (_, element_ty1) = arg_tys[1].simd_size_and_type(bx.tcx());
+        let (pointer_count, underlying_ty) = match element_ty1.kind() {
+            ty::RawPtr(p) if p.ty == in_elem => (ptr_count(element_ty1), non_ptr(element_ty1)),
+            _ => {
+                require!(
+                    false,
+                    InvalidMonomorphization::ExpectedElementType {
+                        span,
+                        name,
+                        expected_element: element_ty1,
+                        second_arg: arg_tys[1],
+                        in_elem,
+                        in_ty,
+                        mutability: ExpectedPointerMutability::Not,
+                    }
+                );
+                unreachable!();
+            }
+        };
+        assert!(pointer_count > 0);
+        assert_eq!(pointer_count - 1, ptr_count(element_ty0));
+        assert_eq!(underlying_ty, non_ptr(element_ty0));
+
+        // The element type of the third argument must be a signed integer type of any width:
+        let (_, element_ty2) = arg_tys[2].simd_size_and_type(bx.tcx());
+        match element_ty2.kind() {
+            ty::Int(_) => (),
+            _ => {
+                require!(
+                    false,
+                    InvalidMonomorphization::ThirdArgElementType {
+                        span,
+                        name,
+                        expected_element: element_ty2,
+                        third_arg: arg_tys[2]
+                    }
+                );
+            }
+        }
+
+        return Ok(gather(args[0].immediate(), args[1].immediate(), args[2].immediate(), pointer_count, bx, in_len, underlying_ty, false));
+    }
+
+    #[cfg(feature="master")]
+    if name == sym::simd_scatter {
+        // simd_scatter(values: <N x T>, pointers: <N x *mut T>,
+        //             mask: <N x i{M}>) -> ()
+        // * N: number of elements in the input vectors
+        // * T: type of the element to load
+        // * M: any integer width is supported, will be truncated to i1
+
+        // All types must be simd vector types
+        require_simd!(in_ty, "first");
+        require_simd!(arg_tys[1], "second");
+        require_simd!(arg_tys[2], "third");
+
+        // Of the same length:
+        let (element_len1, _) = arg_tys[1].simd_size_and_type(bx.tcx());
+        let (element_len2, _) = arg_tys[2].simd_size_and_type(bx.tcx());
+        require!(
+            in_len == element_len1,
+            InvalidMonomorphization::SecondArgumentLength {
+                span,
+                name,
+                in_len,
+                in_ty,
+                arg_ty: arg_tys[1],
+                out_len: element_len1
+            }
+        );
+        require!(
+            in_len == element_len2,
+            InvalidMonomorphization::ThirdArgumentLength {
+                span,
+                name,
+                in_len,
+                in_ty,
+                arg_ty: arg_tys[2],
+                out_len: element_len2
+            }
+        );
+
+        // Returns the number of raw-pointer layers wrapped around `t`
+        // (0 when `t` is not a raw pointer).
+        fn ptr_count(t: Ty<'_>) -> usize {
+            let mut depth = 0;
+            let mut current = t;
+            while let ty::RawPtr(pointee) = current.kind() {
+                depth += 1;
+                current = pointee.ty;
+            }
+            depth
+        }
+
+        // Strips every raw-pointer layer from `t` and returns the innermost pointee type
+        // (`t` itself when it is not a raw pointer).
+        fn non_ptr(t: Ty<'_>) -> Ty<'_> {
+            let mut current = t;
+            while let ty::RawPtr(pointee) = current.kind() {
+                current = pointee.ty;
+            }
+            current
+        }
+
+        // The second argument must be a simd vector with an element type that's a pointer
+        // to the element type of the first argument
+        let (_, element_ty0) = arg_tys[0].simd_size_and_type(bx.tcx());
+        let (_, element_ty1) = arg_tys[1].simd_size_and_type(bx.tcx());
+        let (_, element_ty2) = arg_tys[2].simd_size_and_type(bx.tcx());
+        let (pointer_count, underlying_ty) = match element_ty1.kind() {
+            ty::RawPtr(p) if p.ty == in_elem && p.mutbl == hir::Mutability::Mut => {
+                (ptr_count(element_ty1), non_ptr(element_ty1))
+            }
+            _ => {
+                require!(
+                    false,
+                    InvalidMonomorphization::ExpectedElementType {
+                        span,
+                        name,
+                        expected_element: element_ty1,
+                        second_arg: arg_tys[1],
+                        in_elem,
+                        in_ty,
+                        mutability: ExpectedPointerMutability::Mut,
+                    }
+                );
+                unreachable!();
+            }
+        };
+        assert!(pointer_count > 0);
+        assert_eq!(pointer_count - 1, ptr_count(element_ty0));
+        assert_eq!(underlying_ty, non_ptr(element_ty0));
+
+        // The element type of the third argument must be a signed integer type of any width:
+        match element_ty2.kind() {
+            ty::Int(_) => (),
+            _ => {
+                require!(
+                    false,
+                    InvalidMonomorphization::ThirdArgElementType {
+                        span,
+                        name,
+                        expected_element: element_ty2,
+                        third_arg: arg_tys[2]
+                    }
+                );
+            }
+        }
+
+        let result = gather(args[0].immediate(), args[1].immediate(), args[2].immediate(), pointer_count, bx, in_len, underlying_ty, true);
+
+        let pointers = args[1].immediate();
+
+        let vector_type =
+            if pointer_count > 1 {
+                bx.context.new_vector_type(bx.usize_type, in_len)
+            }
+            else {
+                vector_ty(bx, underlying_ty, in_len)
+            };
+        let elem_type = vector_type.dyncast_vector().expect("vector type").get_element_type();
+
+        for i in 0..in_len {
+            let index = bx.context.new_rvalue_from_int(bx.int_type, i as i32);
+            let value = bx.context.new_vector_access(None, result, index);
+
+            let int = bx.context.new_vector_access(None, pointers, index).to_rvalue();
+            let ptr_type = elem_type.make_pointer();
+            let ptr = bx.context.new_bitcast(None, int, ptr_type);
+            bx.llbb().add_assignment(None, ptr.dereference(None), value);
+        }
+
+        return Ok(bx.context.new_rvalue_zero(bx.i32_type));
+    }
+
    arith_binary! {
        simd_add: Uint, Int => add, Float => fadd;
        simd_sub: Uint, Int => sub, Float => fsub;
@ -536,6 +796,8 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
        simd_and: Uint, Int => and;
        simd_or: Uint, Int => or; // FIXME(antoyo): calling `or` might not work on vectors.
        simd_xor: Uint, Int => xor;
+        simd_fmin: Float => vector_fmin;
+        simd_fmax: Float => vector_fmax;
    }

    macro_rules! arith_unary {
@ -562,10 +824,11 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
        let rhs = args[1].immediate();
        let is_add = name == sym::simd_saturating_add;
        let ptr_bits = bx.tcx().data_layout.pointer_size.bits() as _;
-        let (signed, elem_width, elem_ty) = match *in_elem.kind() {
-            ty::Int(i) => (true, i.bit_width().unwrap_or(ptr_bits), bx.cx.type_int_from_ty(i)),
-            ty::Uint(i) => (false, i.bit_width().unwrap_or(ptr_bits), bx.cx.type_uint_from_ty(i)),
-            _ => {
+        let (signed, elem_width, elem_ty) =
+            match *in_elem.kind() {
+                ty::Int(i) => (true, i.bit_width().unwrap_or(ptr_bits) / 8, bx.cx.type_int_from_ty(i)),
+                ty::Uint(i) => (false, i.bit_width().unwrap_or(ptr_bits) / 8, bx.cx.type_uint_from_ty(i)),
+                _ => {
                return_error!(InvalidMonomorphizationExpectedSignedUnsigned {
                    span,
                    name,
@ -574,33 +837,78 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
                });
            }
        };
-        let builtin_name = match (signed, is_add, in_len, elem_width) {
-            (true, true, 32, 8) => "__builtin_ia32_paddsb256", // TODO(antoyo): cast arguments to unsigned.
-            (false, true, 32, 8) => "__builtin_ia32_paddusb256",
-            (true, true, 16, 16) => "__builtin_ia32_paddsw256",
-            (false, true, 16, 16) => "__builtin_ia32_paddusw256",
-            (true, false, 16, 16) => "__builtin_ia32_psubsw256",
-            (false, false, 16, 16) => "__builtin_ia32_psubusw256",
-            (true, false, 32, 8) => "__builtin_ia32_psubsb256",
-            (false, false, 32, 8) => "__builtin_ia32_psubusb256",
-            _ => unimplemented!(
-                "signed: {}, is_add: {}, in_len: {}, elem_width: {}",
-                signed,
-                is_add,
-                in_len,
-                elem_width
-            ),
-        };
-        let vec_ty = bx.cx.type_vector(elem_ty, in_len as u64);

-        let func = bx.context.get_target_builtin_function(builtin_name);
-        let param1_type = func.get_param(0).to_rvalue().get_type();
-        let param2_type = func.get_param(1).to_rvalue().get_type();
-        let lhs = bx.cx.bitcast_if_needed(lhs, param1_type);
-        let rhs = bx.cx.bitcast_if_needed(rhs, param2_type);
-        let result = bx.context.new_call(None, func, &[lhs, rhs]);
-        // TODO(antoyo): perhaps use __builtin_convertvector for vector casting.
-        return Ok(bx.context.new_bitcast(None, result, vec_ty));
+        let result =
+            match (signed, is_add) {
+                (false, true) => {
+                    let res = lhs + rhs;
+                    let cmp = bx.context.new_comparison(None, ComparisonOp::LessThan, res, lhs);
+                    res | cmp
+                },
+                (true, true) => {
+                    // Algorithm from: https://codereview.stackexchange.com/questions/115869/saturated-signed-addition
+                    // TODO(antoyo): improve using conditional operators if possible.
+                    let arg_type = lhs.get_type();
+                    // TODO(antoyo): convert lhs and rhs to unsigned.
+                    let sum = lhs + rhs;
+                    let vector_type = arg_type.dyncast_vector().expect("vector type");
+                    let unit = vector_type.get_num_units();
+                    let a = bx.context.new_rvalue_from_int(elem_ty, ((elem_width as i32) << 3) - 1);
+                    let width = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![a; unit]);
+
+                    let xor1 = lhs ^ rhs;
+                    let xor2 = lhs ^ sum;
+                    let and = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, xor1) & xor2;
+                    let mask = and >> width;
+
+                    let one = bx.context.new_rvalue_one(elem_ty);
+                    let ones = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![one; unit]);
+                    let shift1 = ones << width;
+                    let shift2 = sum >> width;
+                    let mask_min = shift1 ^ shift2;
+
+                    let and1 = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, mask) & sum;
+                    let and2 = mask & mask_min;
+
+                    and1 + and2
+                },
+                (false, false) => {
+                    let res = lhs - rhs;
+                    let cmp = bx.context.new_comparison(None, ComparisonOp::LessThanEquals, res, lhs);
+                    res & cmp
+                },
+                (true, false) => {
+                    let arg_type = lhs.get_type();
+                    // TODO(antoyo): this uses the same algorithm from saturating add, but add the
+                    // negative of the right operand. Find a proper subtraction algorithm.
+                    let rhs = bx.context.new_unary_op(None, UnaryOp::Minus, arg_type, rhs);
+
+                    // TODO(antoyo): convert lhs and rhs to unsigned.
+                    let sum = lhs + rhs;
+                    let vector_type = arg_type.dyncast_vector().expect("vector type");
+                    let unit = vector_type.get_num_units();
+                    let a = bx.context.new_rvalue_from_int(elem_ty, ((elem_width as i32) << 3) - 1);
+                    let width = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![a; unit]);
+
+                    let xor1 = lhs ^ rhs;
+                    let xor2 = lhs ^ sum;
+                    let and = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, xor1) & xor2;
+                    let mask = and >> width;
+
+                    let one = bx.context.new_rvalue_one(elem_ty);
+                    let ones = bx.context.new_rvalue_from_vector(None, lhs.get_type(), &vec![one; unit]);
+                    let shift1 = ones << width;
+                    let shift2 = sum >> width;
+                    let mask_min = shift1 ^ shift2;
+
+                    let and1 = bx.context.new_unary_op(None, UnaryOp::BitwiseNegate, arg_type, mask) & sum;
+                    let and2 = mask & mask_min;
+
+                    and1 + and2
+                }
+            };
+
+        return Ok(result);
    }

    macro_rules! arith_red {
@ -650,33 +958,50 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
        add,
        0.0 // TODO: Use this argument.
    );
-    arith_red!(simd_reduce_mul_unordered: BinaryOp::Mult, vector_reduce_fmul_fast, false, mul, 1.0);
+    arith_red!(
+        simd_reduce_mul_unordered: BinaryOp::Mult,
+        vector_reduce_fmul_fast,
+        false,
+        mul,
+        1.0
+    );
+    arith_red!(
+        simd_reduce_add_ordered: BinaryOp::Plus,
+        vector_reduce_fadd,
+        true,
+        add,
+        0.0
+    );
+    arith_red!(
+        simd_reduce_mul_ordered: BinaryOp::Mult,
+        vector_reduce_fmul,
+        true,
+        mul,
+        1.0
+    );
+

    macro_rules! minmax_red {
-        ($name:ident: $reduction:ident) => {
+        ($name:ident: $int_red:ident, $float_red:ident) => {
            if name == sym::$name {
                require!(
                    ret_ty == in_elem,
                    InvalidMonomorphizationReturnType { span, name, in_elem, in_ty, ret_ty }
                );
                return match in_elem.kind() {
-                    ty::Int(_) | ty::Uint(_) | ty::Float(_) => {
-                        Ok(bx.$reduction(args[0].immediate()))
-                    }
-                    _ => return_error!(InvalidMonomorphizationUnsupportedElement {
-                        span,
-                        name,
-                        in_ty,
-                        elem_ty: in_elem,
-                        ret_ty
-                    }),
+                    ty::Int(_) | ty::Uint(_) => Ok(bx.$int_red(args[0].immediate())),
+                    ty::Float(_) => Ok(bx.$float_red(args[0].immediate())),
+                    _ => return_error!(InvalidMonomorphizationUnsupportedElement { span, name, in_ty, elem_ty: in_elem, ret_ty }),
                };
            }
        };
    }

-    minmax_red!(simd_reduce_min: vector_reduce_min);
-    minmax_red!(simd_reduce_max: vector_reduce_max);
+    minmax_red!(simd_reduce_min: vector_reduce_min, vector_reduce_fmin);
+    minmax_red!(simd_reduce_max: vector_reduce_max, vector_reduce_fmax);
+    // TODO(sadlerap): revisit these intrinsics to generate more optimal reductions
+    minmax_red!(simd_reduce_min_nanless: vector_reduce_min, vector_reduce_fmin);
+    minmax_red!(simd_reduce_max_nanless: vector_reduce_max, vector_reduce_fmax);

    macro_rules! bitwise_red {
        ($name:ident : $op:expr, $boolean:expr) => {
@ -699,15 +1024,12 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(
                        }),
                    }

-                    // boolean reductions operate on vectors of i1s:
-                    let i1 = bx.type_i1();
-                    let i1xn = bx.type_vector(i1, in_len as u64);
-                    bx.trunc(args[0].immediate(), i1xn)
+                    args[0].immediate()
                };
                return match in_elem.kind() {
                    ty::Int(_) | ty::Uint(_) => {
                        let r = bx.vector_reduce_op(input, $op);
-                        Ok(if !$boolean { r } else { bx.zext(r, bx.type_bool()) })
+                        Ok(if !$boolean { r } else { bx.icmp(IntPredicate::IntNE, r, bx.context.new_rvalue_zero(r.get_type())) })
                    }
                    _ => return_error!(InvalidMonomorphizationUnsupportedElement {
                        span,
@ -723,6 +1045,9 @@ pub fn generic_simd_intrinsic<'a, 'gcc, 'tcx>(

    bitwise_red!(simd_reduce_and: BinaryOp::BitwiseAnd, false);
    bitwise_red!(simd_reduce_or: BinaryOp::BitwiseOr, false);
+    bitwise_red!(simd_reduce_xor: BinaryOp::BitwiseXor, false);
+    bitwise_red!(simd_reduce_all: BinaryOp::BitwiseAnd, true);
+    bitwise_red!(simd_reduce_any: BinaryOp::BitwiseOr, true);

    unimplemented!("simd {}", name);
 }
--- a/compiler/rustc_codegen_gcc/src/lib.rs
+++ b/compiler/rustc_codegen_gcc/src/lib.rs
@ -1,7 +1,7 @@
 /*
 * TODO(antoyo): implement equality in libgccjit based on https://zpz.github.io/blog/overloading-equality-operator-in-cpp-class-hierarchy/ (for type equality?)
 * TODO(antoyo): support #[inline] attributes.
- * TODO(antoyo): support LTO (gcc's equivalent to Thin LTO is enabled by -fwhopr: https://stackoverflow.com/questions/64954525/does-gcc-have-thin-lto).
+ * TODO(antoyo): support LTO (gcc's equivalent to Full LTO is -flto -flto-partition=one — https://documentation.suse.com/sbp/all/html/SBP-GCC-10/index.html).
 *
 * TODO(antoyo): remove the patches.
 */
@ -23,6 +23,7 @@

 extern crate rustc_apfloat;
 extern crate rustc_ast;
+extern crate rustc_attr;
 extern crate rustc_codegen_ssa;
 extern crate rustc_data_structures;
 extern crate rustc_errors;
@ -43,6 +44,7 @@ mod abi;
 mod allocator;
 mod archive;
 mod asm;
+mod attributes;
 mod back;
 mod base;
 mod builder;
@ -314,9 +316,12 @@ pub fn target_features(sess: &Session, allow_unstable: bool) -> Vec<Symbol> {
        .filter(|_feature| {
            // TODO(antoyo): implement a way to get enabled feature in libgccjit.
            // Probably using the equivalent of __builtin_cpu_supports.
+            // TODO(antoyo): maybe use whatever outputs the following command:
+            // gcc -march=native -Q --help=target
            #[cfg(feature="master")]
            {
-                _feature.contains("sse") || _feature.contains("avx")
+                // NOTE: the CPU in the CI doesn't support sse4a, so disable it to make the stdarch tests pass in the CI.
+                (_feature.contains("sse") || _feature.contains("avx")) && !_feature.contains("avx512") && !_feature.contains("sse4a")
            }
            #[cfg(not(feature="master"))]
            {
--- a/compiler/rustc_codegen_gcc/src/mono_item.rs
+++ b/compiler/rustc_codegen_gcc/src/mono_item.rs
@ -1,38 +1,66 @@
+#[cfg(feature="master")]
+use gccjit::{VarAttribute, FnAttribute};
 use rustc_codegen_ssa::traits::PreDefineMethods;
+use rustc_hir::def_id::{DefId, LOCAL_CRATE};
 use rustc_middle::middle::codegen_fn_attrs::CodegenFnAttrFlags;
 use rustc_middle::mir::mono::{Linkage, Visibility};
 use rustc_middle::ty::{self, Instance, TypeVisitableExt};
 use rustc_middle::ty::layout::{FnAbiOf, LayoutOf};
-use rustc_span::def_id::DefId;

+use crate::attributes;
 use crate::base;
 use crate::context::CodegenCx;
 use crate::type_of::LayoutGccExt;

 impl<'gcc, 'tcx> PreDefineMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
-    fn predefine_static(&self, def_id: DefId, _linkage: Linkage, _visibility: Visibility, symbol_name: &str) {
+    #[cfg_attr(not(feature="master"), allow(unused_variables))]
+    fn predefine_static(&self, def_id: DefId, _linkage: Linkage, visibility: Visibility, symbol_name: &str) {
        let attrs = self.tcx.codegen_fn_attrs(def_id);
        let instance = Instance::mono(self.tcx, def_id);
        let ty = instance.ty(self.tcx, ty::ParamEnv::reveal_all());
-        let gcc_type = self.layout_of(ty).gcc_type(self, true);
+        let gcc_type = self.layout_of(ty).gcc_type(self);

        let is_tls = attrs.flags.contains(CodegenFnAttrFlags::THREAD_LOCAL);
        let global = self.define_global(symbol_name, gcc_type, is_tls, attrs.link_section);
+        #[cfg(feature="master")]
+        global.add_attribute(VarAttribute::Visibility(base::visibility_to_gcc(visibility)));

-        // TODO(antoyo): set linkage and visibility.
+        // TODO(antoyo): set linkage.
        self.instances.borrow_mut().insert(instance, global);
    }

-    fn predefine_fn(&self, instance: Instance<'tcx>, linkage: Linkage, _visibility: Visibility, symbol_name: &str) {
+    #[cfg_attr(not(feature="master"), allow(unused_variables))]
+    fn predefine_fn(&self, instance: Instance<'tcx>, linkage: Linkage, visibility: Visibility, symbol_name: &str) {
        assert!(!instance.substs.needs_infer());

        let fn_abi = self.fn_abi_of_instance(instance, ty::List::empty());
        self.linkage.set(base::linkage_to_gcc(linkage));
-        let _decl = self.declare_fn(symbol_name, &fn_abi);
+        let decl = self.declare_fn(symbol_name, &fn_abi);
        //let attrs = self.tcx.codegen_fn_attrs(instance.def_id());

+        attributes::from_fn_attrs(self, decl, instance);
+
+        // If we're compiling the compiler-builtins crate, i.e., the equivalent of
+        // compiler-rt, then we want to implicitly compile everything with hidden
+        // visibility as we're going to link this object all over the place but
+        // don't want the symbols to get exported.
+        if linkage != Linkage::Internal
+            && linkage != Linkage::Private
+            && self.tcx.is_compiler_builtins(LOCAL_CRATE)
+        {
+            #[cfg(feature="master")]
+            decl.add_attribute(FnAttribute::Visibility(gccjit::Visibility::Hidden));
+        }
+        else {
+            #[cfg(feature="master")]
+            decl.add_attribute(FnAttribute::Visibility(base::visibility_to_gcc(visibility)));
+        }
+
        // TODO(antoyo): call set_link_section() to allow initializing argc/argv.
        // TODO(antoyo): set unique comdat.
        // TODO(antoyo): use inline attribute from there in linkage.set() above.
+
+        self.functions.borrow_mut().insert(symbol_name.to_string(), decl);
+        self.function_instances.borrow_mut().insert(instance, unsafe { std::mem::transmute(decl) });
    }
 }
--- a/compiler/rustc_codegen_gcc/src/type_.rs
+++ b/compiler/rustc_codegen_gcc/src/type_.rs
@ -1,5 +1,3 @@
-use std::convert::TryInto;
-
 use gccjit::{RValue, Struct, Type};
 use rustc_codegen_ssa::traits::{BaseTypeMethods, DerivedTypeMethods, TypeMembershipMethods};
 use rustc_codegen_ssa::common::TypeKind;
@ -202,8 +200,9 @@ impl<'gcc, 'tcx> BaseTypeMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
        value.get_type()
    }

-    fn type_array(&self, ty: Type<'gcc>, mut len: u64) -> Type<'gcc> {
-        if let Some(struct_type) = ty.is_struct() {
+    fn type_array(&self, ty: Type<'gcc>, len: u64) -> Type<'gcc> {
+        // TODO: remove this as well?
+        /*if let Some(struct_type) = ty.is_struct() {
            if struct_type.get_field_count() == 0 {
                // NOTE: since gccjit only supports i32 for the array size and libcore's tests uses a
                // size of usize::MAX in test_binary_search, we workaround this by setting the size to
@ -211,14 +210,7 @@ impl<'gcc, 'tcx> BaseTypeMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
                // FIXME(antoyo): fix gccjit API.
                len = 0;
            }
-        }
-
-        // NOTE: see note above. Some other test uses usize::MAX.
-        if len == u64::MAX {
-            len = 0;
-        }
-
-        let len: i32 = len.try_into().expect("array len");
+        }*/

        self.context.new_array_type(None, ty, len)
    }
@ -247,10 +239,6 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
    pub fn type_named_struct(&self, name: &str) -> Struct<'gcc> {
        self.context.new_opaque_struct_type(None, name)
    }
-
-    pub fn type_bool(&self) -> Type<'gcc> {
-        self.context.new_type::<bool>()
-    }
 }

 pub fn struct_fields<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, layout: TyAndLayout<'tcx>) -> (Vec<Type<'gcc>>, bool) {
@ -273,7 +261,7 @@ pub fn struct_fields<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, layout: TyAndLayout
        assert_eq!(offset.align_to(padding_align) + padding, target_offset);
        result.push(cx.type_padding_filler(padding, padding_align));

-        result.push(field.gcc_type(cx, !field.ty.is_any_ptr())); // FIXME(antoyo): might need to check if the type is inside another, like Box<Type>.
+        result.push(field.gcc_type(cx));
        offset = target_offset + field.size;
        prev_effective_align = effective_field_align;
    }
--- a/compiler/rustc_codegen_gcc/src/type_of.rs
+++ b/compiler/rustc_codegen_gcc/src/type_of.rs
@ -6,7 +6,7 @@ use rustc_middle::bug;
 use rustc_middle::ty::{self, Ty, TypeVisitableExt};
 use rustc_middle::ty::layout::{FnAbiOf, LayoutOf, TyAndLayout};
 use rustc_middle::ty::print::with_no_trimmed_paths;
-use rustc_target::abi::{self, Abi, F32, F64, FieldsShape, Int, Integer, Pointer, PointeeInfo, Size, TyAbiInterface, Variants};
+use rustc_target::abi::{self, Abi, Align, F32, F64, FieldsShape, Int, Integer, Pointer, PointeeInfo, Size, TyAbiInterface, Variants};
 use rustc_target::abi::call::{CastTarget, FnAbi, Reg};

 use crate::abi::{FnAbiGccExt, GccType};
@ -50,11 +50,25 @@ impl<'gcc, 'tcx> CodegenCx<'gcc, 'tcx> {
    }
 }

-pub fn uncached_gcc_type<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, layout: TyAndLayout<'tcx>, defer: &mut Option<(Struct<'gcc>, TyAndLayout<'tcx>)>) -> Type<'gcc> {
+impl<'a, 'tcx> CodegenCx<'a, 'tcx> {
+    pub fn align_of(&self, ty: Ty<'tcx>) -> Align {
+        self.layout_of(ty).align.abi
+    }
+}
+
+fn uncached_gcc_type<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, layout: TyAndLayout<'tcx>, defer: &mut Option<(Struct<'gcc>, TyAndLayout<'tcx>)>) -> Type<'gcc> {
    match layout.abi {
        Abi::Scalar(_) => bug!("handled elsewhere"),
        Abi::Vector { ref element, count } => {
            let element = layout.scalar_gcc_type_at(cx, element, Size::ZERO);
+            let element =
+                // NOTE: gcc doesn't allow pointer types in vectors.
+                if element.get_pointee().is_some() {
+                    cx.usize_type
+                }
+                else {
+                    element
+                };
            return cx.context.new_vector_type(element, count);
        },
        Abi::ScalarPair(..) => {
@ -114,7 +128,7 @@ pub fn uncached_gcc_type<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, layout: TyAndLa
                },
            }
        }
-        FieldsShape::Array { count, .. } => cx.type_array(layout.field(cx, 0).gcc_type(cx, true), count),
+        FieldsShape::Array { count, .. } => cx.type_array(layout.field(cx, 0).gcc_type(cx), count),
        FieldsShape::Arbitrary { .. } =>
            match name {
                None => {
@ -133,7 +147,7 @@ pub fn uncached_gcc_type<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, layout: TyAndLa
 pub trait LayoutGccExt<'tcx> {
    fn is_gcc_immediate(&self) -> bool;
    fn is_gcc_scalar_pair(&self) -> bool;
-    fn gcc_type<'gcc>(&self, cx: &CodegenCx<'gcc, 'tcx>, set_fields: bool) -> Type<'gcc>;
+    fn gcc_type<'gcc>(&self, cx: &CodegenCx<'gcc, 'tcx>) -> Type<'gcc>;
    fn immediate_gcc_type<'gcc>(&self, cx: &CodegenCx<'gcc, 'tcx>) -> Type<'gcc>;
    fn scalar_gcc_type_at<'gcc>(&self, cx: &CodegenCx<'gcc, 'tcx>, scalar: &abi::Scalar, offset: Size) -> Type<'gcc>;
    fn scalar_pair_element_gcc_type<'gcc>(&self, cx: &CodegenCx<'gcc, 'tcx>, index: usize, immediate: bool) -> Type<'gcc>;
@ -168,8 +182,7 @@ impl<'tcx> LayoutGccExt<'tcx> for TyAndLayout<'tcx> {
    /// with the inner-most trailing unsized field using the "minimal unit"
    /// of that field's type - this is useful for taking the address of
    /// that field and ensuring the struct has the right alignment.
-    //TODO(antoyo): do we still need the set_fields parameter?
-    fn gcc_type<'gcc>(&self, cx: &CodegenCx<'gcc, 'tcx>, set_fields: bool) -> Type<'gcc> {
+    fn gcc_type<'gcc>(&self, cx: &CodegenCx<'gcc, 'tcx>) -> Type<'gcc> {
        if let Abi::Scalar(ref scalar) = self.abi {
            // Use a different cache for scalars because pointers to DSTs
            // can be either fat or thin (data pointers of fat pointers).
@ -179,10 +192,10 @@ impl<'tcx> LayoutGccExt<'tcx> for TyAndLayout<'tcx> {
            let ty =
                match *self.ty.kind() {
                    ty::Ref(_, ty, _) | ty::RawPtr(ty::TypeAndMut { ty, .. }) => {
-                        cx.type_ptr_to(cx.layout_of(ty).gcc_type(cx, set_fields))
+                        cx.type_ptr_to(cx.layout_of(ty).gcc_type(cx))
                    }
                    ty::Adt(def, _) if def.is_box() => {
-                        cx.type_ptr_to(cx.layout_of(self.ty.boxed_ty()).gcc_type(cx, true))
+                        cx.type_ptr_to(cx.layout_of(self.ty.boxed_ty()).gcc_type(cx))
                    }
                    ty::FnPtr(sig) => cx.fn_ptr_backend_type(&cx.fn_abi_of_fn_ptr(sig, ty::List::empty())),
                    _ => self.scalar_gcc_type_at(cx, scalar, Size::ZERO),
@ -199,13 +212,6 @@ impl<'tcx> LayoutGccExt<'tcx> for TyAndLayout<'tcx> {
            };
        let cached_type = cx.types.borrow().get(&(self.ty, variant_index)).cloned();
        if let Some(ty) = cached_type {
-            let type_to_set_fields = cx.types_with_fields_to_set.borrow_mut().remove(&ty);
-            if let Some((struct_type, layout)) = type_to_set_fields {
-                // Since we might be trying to generate a type containing another type which is not
-                // completely generated yet, we deferred setting the fields until now.
-                let (fields, packed) = struct_fields(cx, layout);
-                cx.set_struct_body(struct_type, &fields, packed);
-            }
            return ty;
        }

@ -222,7 +228,7 @@ impl<'tcx> LayoutGccExt<'tcx> for TyAndLayout<'tcx> {
                if let Some(v) = variant_index {
                    layout = layout.for_variant(cx, v);
                }
-                layout.gcc_type(cx, true)
+                layout.gcc_type(cx)
            }
            else {
                uncached_gcc_type(cx, *self, &mut defer)
@ -230,9 +236,9 @@ impl<'tcx> LayoutGccExt<'tcx> for TyAndLayout<'tcx> {

        cx.types.borrow_mut().insert((self.ty, variant_index), ty);

-        if let Some((ty, layout)) = defer {
+        if let Some((deferred_ty, layout)) = defer {
            let (fields, packed) = struct_fields(cx, layout);
-            cx.set_struct_body(ty, &fields, packed);
+            cx.set_struct_body(deferred_ty, &fields, packed);
        }

        ty
@ -244,7 +250,7 @@ impl<'tcx> LayoutGccExt<'tcx> for TyAndLayout<'tcx> {
                return cx.type_i1();
            }
        }
-        self.gcc_type(cx, true)
+        self.gcc_type(cx)
    }

    fn scalar_gcc_type_at<'gcc>(&self, cx: &CodegenCx<'gcc, 'tcx>, scalar: &abi::Scalar, offset: Size) -> Type<'gcc> {
@ -273,7 +279,7 @@ impl<'tcx> LayoutGccExt<'tcx> for TyAndLayout<'tcx> {
        // pointee types, to avoid bitcasting every `OperandRef::deref`.
        match self.ty.kind() {
            ty::Ref(..) | ty::RawPtr(_) => {
-                return self.field(cx, index).gcc_type(cx, true);
+                return self.field(cx, index).gcc_type(cx);
            }
            // only wide pointer boxes are handled as pointers
            // thin pointer boxes with scalar allocators are handled by the general logic below
@ -343,7 +349,7 @@ impl<'tcx> LayoutGccExt<'tcx> for TyAndLayout<'tcx> {

 impl<'gcc, 'tcx> LayoutTypeMethods<'tcx> for CodegenCx<'gcc, 'tcx> {
    fn backend_type(&self, layout: TyAndLayout<'tcx>) -> Type<'gcc> {
-        layout.gcc_type(self, true)
+        layout.gcc_type(self)
    }

    fn immediate_backend_type(&self, layout: TyAndLayout<'tcx>) -> Type<'gcc> {
--- a/compiler/rustc_codegen_gcc/test.sh
+++ b/compiler/rustc_codegen_gcc/test.sh
@ -17,17 +17,20 @@ export LIBRARY_PATH="$GCC_PATH"
 flags=
 gcc_master_branch=1
 channel="debug"
-func=all
+funcs=()
 build_only=0
+nb_parts=0
+current_part=0

 while [[ $# -gt 0 ]]; do
    case $1 in
        --release)
            codegen_channel=release
+            channel="release"
            shift
            ;;
        --release-sysroot)
-            sysroot_channel=release
+            sysroot_channel="--release"
            shift
            ;;
        --no-default-features)
@ -40,43 +43,83 @@ while [[ $# -gt 0 ]]; do
            flags="$flags --features $1"
            shift
            ;;
-        --release)
-            channel="release"
+        "--test-rustc")
+            funcs+=(test_rustc)
            shift
            ;;
-        "--test-rustc")
-            func=test_rustc
+        "--test-successful-rustc")
+            funcs+=(test_successful_rustc)
+            shift
+            ;;
+        "--test-failing-rustc")
+            funcs+=(test_failing_rustc)
            shift
            ;;

        "--test-libcore")
-            func=test_libcore
+            funcs+=(test_libcore)
            shift
            ;;

        "--clean-ui-tests")
-            func=clean_ui_tests
+            funcs+=(clean_ui_tests)
+            shift
+            ;;
+        "--clean")
+            funcs+=(clean)
            shift
            ;;

        "--std-tests")
-            func=std_tests
+            funcs+=(std_tests)
+            shift
+            ;;
+
+        "--asm-tests")
+            funcs+=(asm_tests)
            shift
            ;;

        "--extended-tests")
-            func=extended_sysroot_tests
+            funcs+=(extended_sysroot_tests)
+            shift
+            ;;
+        "--extended-rand-tests")
+            funcs+=(extended_rand_tests)
+            shift
+            ;;
+        "--extended-regex-example-tests")
+            funcs+=(extended_regex_example_tests)
+            shift
+            ;;
+        "--extended-regex-tests")
+            funcs+=(extended_regex_tests)
+            shift
+            ;;
+
+        "--mini-tests")
+            funcs+=(mini_tests)
            shift
            ;;

        "--build-sysroot")
-            func=build_sysroot
+            funcs+=(build_sysroot)
            shift
            ;;
        "--build")
            build_only=1
            shift
            ;;
+        "--nb-parts")
+            shift
+            nb_parts=$1
+            shift
+            ;;
+        "--current-part")
+            shift
+            current_part=$1
+            shift
+            ;;
        *)
            echo "Unknown option $1"
            exit 1
@ -87,7 +130,6 @@ done
 if [[ $channel == "release" ]]; then
    export CHANNEL='release'
    CARGO_INCREMENTAL=1 cargo rustc --release $flags
-    shift
 else
    echo $LD_LIBRARY_PATH
    export CHANNEL='debug'
@ -95,6 +137,7 @@ else
 fi

 if (( $build_only == 1 )); then
+    echo "Since it's 'build-only', exiting..."
    exit
 fi

@ -119,7 +162,7 @@ function mini_tests() {

 function build_sysroot() {
    echo "[BUILD] sysroot"
-    time ./build_sysroot/build_sysroot.sh
+    time ./build_sysroot/build_sysroot.sh $sysroot_channel
 }

 function std_tests() {
@ -148,17 +191,57 @@ function std_tests() {
    $RUN_WRAPPER ./target/out/std_example --target $TARGET_TRIPLE

    echo "[AOT] subslice-patterns-const-eval"
-    $RUSTC example/subslice-patterns-const-eval.rs --crate-type bin -Cpanic=abort --target $TARGET_TRIPLE
+    $RUSTC example/subslice-patterns-const-eval.rs --crate-type bin $TEST_FLAGS --target $TARGET_TRIPLE
    $RUN_WRAPPER ./target/out/subslice-patterns-const-eval

    echo "[AOT] track-caller-attribute"
-    $RUSTC example/track-caller-attribute.rs --crate-type bin -Cpanic=abort --target $TARGET_TRIPLE
+    $RUSTC example/track-caller-attribute.rs --crate-type bin $TEST_FLAGS --target $TARGET_TRIPLE
    $RUN_WRAPPER ./target/out/track-caller-attribute

    echo "[BUILD] mod_bench"
    $RUSTC example/mod_bench.rs --crate-type bin --target $TARGET_TRIPLE
 }

+function setup_rustc() {
+    rust_toolchain=$(cat rust-toolchain | grep channel | sed 's/channel = "\(.*\)"/\1/')
+
+    git clone https://github.com/rust-lang/rust.git || true
+    cd rust
+    git fetch
+    git checkout $(rustc -V | cut -d' ' -f3 | tr -d '(')
+    export RUSTFLAGS=
+
+    rm config.toml || true
+
+    cat > config.toml <<EOF
+[rust]
+codegen-backends = []
+deny-warnings = false
+
+[build]
+cargo = "$(which cargo)"
+local-rebuild = true
+rustc = "$HOME/.rustup/toolchains/$rust_toolchain-$TARGET_TRIPLE/bin/rustc"
+
+[target.x86_64-unknown-linux-gnu]
+llvm-filecheck = "`which FileCheck-10 || which FileCheck-11 || which FileCheck-12 || which FileCheck-13 || which FileCheck-14`"
+
+[llvm]
+download-ci-llvm = false
+EOF
+
+    rustc -V | cut -d' ' -f3 | tr -d '('
+    git checkout $(rustc -V | cut -d' ' -f3 | tr -d '(') tests
+}
+
+function asm_tests() {
+    setup_rustc
+
+    echo "[TEST] rustc test suite"
+    RUSTC_ARGS="-Zpanic-abort-tests -Csymbol-mangling-version=v0 -Zcodegen-backend="$(pwd)"/../target/"$CHANNEL"/librustc_codegen_gcc."$dylib_ext" --sysroot "$(pwd)"/../build_sysroot/sysroot -Cpanic=abort"
+    COMPILETEST_FORCE_STAGE0=1 ./x.py test --run always --stage 0 tests/assembly/asm --rustc-args "$RUSTC_ARGS"
+}
+
 # FIXME(antoyo): linker gives multiple definitions error on Linux
 #echo "[BUILD] sysroot in release mode"
 #./build_sysroot/build_sysroot.sh --release
@ -187,7 +270,7 @@ function test_libcore() {
 #echo "[BENCH RUN] mod_bench"
 #hyperfine --runs ${RUN_RUNS:-10} ./target/out/mod_bench{,_inline} ./target/out/mod_bench_llvm_*

-function extended_sysroot_tests() {
+function extended_rand_tests() {
    if (( $gcc_master_branch == 0 )); then
        return
    fi
@ -197,17 +280,12 @@ function extended_sysroot_tests() {
    echo "[TEST] rust-random/rand"
    ../cargo.sh test --workspace
    popd
+}

-    #pushd simple-raytracer
-    #echo "[BENCH COMPILE] ebobby/simple-raytracer"
-    #hyperfine --runs "${RUN_RUNS:-10}" --warmup 1 --prepare "cargo clean" \
-    #"RUSTC=rustc RUSTFLAGS='' cargo build" \
-    #"../cargo.sh build"
-
-    #echo "[BENCH RUN] ebobby/simple-raytracer"
-    #cp ./target/debug/main ./raytracer_cg_gcc
-    #hyperfine --runs "${RUN_RUNS:-10}" ./raytracer_cg_llvm ./raytracer_cg_gcc
-    #popd
+function extended_regex_example_tests() {
+    if (( $gcc_master_branch == 0 )); then
+        return
+    fi

    pushd regex
    echo "[TEST] rust-lang/regex example shootout-regex-dna"
@ -219,41 +297,43 @@ function extended_sysroot_tests() {
        | ../cargo.sh run --example shootout-regex-dna \
        | grep -v "Spawned thread" > res.txt
    diff -u res.txt examples/regexdna-output.txt
+    popd
+}

+function extended_regex_tests() {
+    if (( $gcc_master_branch == 0 )); then
+        return
+    fi
+
+    pushd regex
    echo "[TEST] rust-lang/regex tests"
+    export CG_RUSTFLAGS="--cap-lints warn" # newer aho_corasick versions throw a deprecation warning
    ../cargo.sh test --tests -- --exclude-should-panic --test-threads 1 -Zunstable-options -q
    popd
 }

+function extended_sysroot_tests() {
+    #pushd simple-raytracer
+    #echo "[BENCH COMPILE] ebobby/simple-raytracer"
+    #hyperfine --runs "${RUN_RUNS:-10}" --warmup 1 --prepare "cargo clean" \
+    #"RUSTC=rustc RUSTFLAGS='' cargo build" \
+    #"../cargo.sh build"
+
+    #echo "[BENCH RUN] ebobby/simple-raytracer"
+    #cp ./target/debug/main ./raytracer_cg_gcc
+    #hyperfine --runs "${RUN_RUNS:-10}" ./raytracer_cg_llvm ./raytracer_cg_gcc
+    #popd
+
+    extended_rand_tests
+    extended_regex_example_tests
+    extended_regex_tests
+}
+
 function test_rustc() {
    echo
    echo "[TEST] rust-lang/rust"

-    rust_toolchain=$(cat rust-toolchain | grep channel | sed 's/channel = "\(.*\)"/\1/')
-
-    git clone https://github.com/rust-lang/rust.git || true
-    cd rust
-    git fetch
-    git checkout $(rustc -V | cut -d' ' -f3 | tr -d '(')
-    export RUSTFLAGS=
-
-    git apply ../rustc_patches/compile_test.patch || true
-
-    rm config.toml || true
-
-    cat > config.toml <<EOF
-[rust]
-codegen-backends = []
-deny-warnings = false
-
-[build]
-cargo = "$(which cargo)"
-local-rebuild = true
-rustc = "$HOME/.rustup/toolchains/$rust_toolchain-$TARGET_TRIPLE/bin/rustc"
-EOF
-
-    rustc -V | cut -d' ' -f3 | tr -d '('
-    git checkout $(rustc -V | cut -d' ' -f3 | tr -d '(') tests
+    setup_rustc

    for test in $(rg -i --files-with-matches "//(\[\w+\])?~|// error-pattern:|// build-fail|// run-fail|-Cllvm-args" tests/ui); do
      rm $test
@ -261,21 +341,61 @@ EOF

    git checkout -- tests/ui/issues/auxiliary/issue-3136-a.rs # contains //~ERROR, but shouldn't be removed

-    rm -r tests/ui/{abi*,extern/,panic-runtime/,panics/,unsized-locals/,proc-macro/,threads-sendsync/,thinlto/,borrowck/,test*,*lto*.rs} || true
-    for test in $(rg --files-with-matches "catch_unwind|should_panic|thread|lto" tests/ui); do
+    rm -r tests/ui/{abi*,extern/,unsized-locals/,proc-macro/,threads-sendsync/,thinlto/,borrowck/,chalkify/bugs/,test*,*lto*.rs,consts/const-float-bits-reject-conv.rs,consts/issue-miri-1910.rs} || true
+    rm tests/ui/mir/mir_heavy_promoted.rs # this test is OOM-killed in the CI.
+    for test in $(rg --files-with-matches "thread|lto" tests/ui); do
      rm $test
    done
+    git checkout tests/ui/lto/auxiliary/dylib.rs
    git checkout tests/ui/type-alias-impl-trait/auxiliary/cross_crate_ice.rs
    git checkout tests/ui/type-alias-impl-trait/auxiliary/cross_crate_ice2.rs
+    git checkout tests/ui/macros/rfc-2011-nicer-assert-messages/auxiliary/common.rs

-    RUSTC_ARGS="-Zpanic-abort-tests -Csymbol-mangling-version=v0 -Zcodegen-backend="$(pwd)"/../target/"$CHANNEL"/librustc_codegen_gcc."$dylib_ext" --sysroot "$(pwd)"/../build_sysroot/sysroot -Cpanic=abort"
+    RUSTC_ARGS="$TEST_FLAGS -Csymbol-mangling-version=v0 -Zcodegen-backend="$(pwd)"/../target/"$CHANNEL"/librustc_codegen_gcc."$dylib_ext" --sysroot "$(pwd)"/../build_sysroot/sysroot"
+
+    if [ $# -eq 0 ]; then
+        # No argument supplied to the function. Doing nothing.
+        echo "No argument provided. Keeping all UI tests"
+    elif [ $1 = "0" ]; then
+        # Removing the failing tests.
+        xargs -a ../failing-ui-tests.txt -d'\n' rm
+    else
+        # Removing all tests.
+        find tests/ui -type f -name '*.rs' -not -path '*/auxiliary/*' -delete
+        # Putting back only the failing ones.
+        xargs -a ../failing-ui-tests.txt -d'\n' git checkout --
+    fi
+
+    if [ $nb_parts -gt 0 ]; then
+        echo "Splitting ui_test into $nb_parts parts (and running part $current_part)"
+        find tests/ui -type f -name '*.rs' -not -path "*/auxiliary/*" > ui_tests
+        # Sort the list so that every run splits the tests into the same parts.
+        sort ui_tests -o ui_tests
+        count=$((`wc -l < ui_tests` / $nb_parts))
+        # The division above rounds down, so add one test per part: otherwise, whenever the number
+        # of tests is not a multiple of $nb_parts, the leftover tests would be skipped.
+        count=$((count + 1))
+        split -d -l $count -a 1 ui_tests ui_tests.split
+        # Removing all tests.
+        find tests/ui -type f -name '*.rs' -not -path "*/auxiliary/*" -delete
+        # Putting back only the ones we want to test.
+        xargs -a "ui_tests.split$current_part" -d'\n' git checkout --
+    fi

    echo "[TEST] rustc test suite"
    COMPILETEST_FORCE_STAGE0=1 ./x.py test --run always --stage 0 tests/ui/ --rustc-args "$RUSTC_ARGS"
 }

+function test_failing_rustc() {
+    test_rustc "1"
+}
+
+function test_successful_rustc() {
+    test_rustc "0"
+}
+
 function clean_ui_tests() {
-    find rust/build/x86_64-unknown-linux-gnu/test/ui/ -name stamp -exec rm -rf {} \;
+    find rust/build/x86_64-unknown-linux-gnu/test/ui/ -name stamp -delete
 }

 function all() {
@ -283,9 +403,17 @@ function all() {
    mini_tests
    build_sysroot
    std_tests
+    #asm_tests
    test_libcore
    extended_sysroot_tests
    test_rustc
 }

-$func
+if [ ${#funcs[@]} -eq 0 ]; then
+    echo "No command passed, running '--all'..."
+    all
+else
+    for t in ${funcs[@]}; do
+        $t
+    done
+fi
--- a/compiler/rustc_codegen_gcc/tests/lang_tests_common.rs
+++ b/compiler/rustc_codegen_gcc/tests/lang_tests_common.rs
@ -46,11 +46,15 @@ pub fn main_inner(profile: Profile) {
                &format!("-Zcodegen-backend={}/target/debug/librustc_codegen_gcc.so", current_dir),
                "--sysroot", &format!("{}/build_sysroot/sysroot/", current_dir),
                "-Zno-parallel-llvm",
-                "-C", "panic=abort",
                "-C", "link-arg=-lc",
                "-o", exe.to_str().expect("to_str"),
                path.to_str().expect("to_str"),
            ]);
+            if let Some(flags) = option_env!("TEST_FLAGS") {
+                for flag in flags.split_whitespace() {
+                    compiler.arg(&flag);
+                }
+            }
            match profile {
                Profile::Debug => {}
                Profile::Release => {
--- a/compiler/rustc_codegen_gcc/tests/run/abort1.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/abort1.rs
@ -33,6 +33,7 @@ mod intrinsics {
    use super::Sized;

    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
--- a/compiler/rustc_codegen_gcc/tests/run/abort2.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/abort2.rs
@ -33,6 +33,7 @@ mod intrinsics {
    use super::Sized;

    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
--- a/compiler/rustc_codegen_gcc/tests/run/array.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/array.rs
@ -79,7 +79,7 @@ pub unsafe fn drop_in_place<T: ?Sized>(to_drop: *mut T) {
 #[lang = "panic"]
 #[track_caller]
 #[no_mangle]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
    unsafe {
        libc::puts("Panicking\0" as *const str as *const u8);
        intrinsics::abort();
@ -105,6 +105,7 @@ fn panic_bounds_check(index: usize, len: usize) -> ! {

 mod intrinsics {
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
--- a/compiler/rustc_codegen_gcc/tests/run/assign.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/assign.rs
@ -57,6 +57,7 @@ mod libc {

 mod intrinsics {
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
@ -64,7 +65,7 @@ mod intrinsics {
 #[lang = "panic"]
 #[track_caller]
 #[no_mangle]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
    unsafe {
        libc::puts("Panicking\0" as *const str as *const u8);
        libc::fflush(libc::stdout);
--- a/compiler/rustc_codegen_gcc/tests/run/closure.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/closure.rs
@ -97,10 +97,14 @@ fn panic_bounds_check(index: usize, len: usize) -> ! {

 mod intrinsics {
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }

+#[lang = "tuple_trait"]
+pub trait Tuple {}
+
 #[lang = "unsize"]
 pub trait Unsize<T: ?Sized> {}

@ -114,7 +118,7 @@ impl<T: ?Sized + Unsize<U>, U: ?Sized> CoerceUnsized<*mut U> for *mut T {}

 #[lang = "fn_once"]
 #[rustc_paren_sugar]
-pub trait FnOnce<Args> {
+pub trait FnOnce<Args: Tuple> {
    #[lang = "fn_once_output"]
    type Output;

@ -123,7 +127,7 @@ pub trait FnOnce<Args> {

 #[lang = "fn_mut"]
 #[rustc_paren_sugar]
-pub trait FnMut<Args>: FnOnce<Args> {
+pub trait FnMut<Args: Tuple>: FnOnce<Args> {
    extern "rust-call" fn call_mut(&mut self, args: Args) -> Self::Output;
 }

@ -177,7 +181,7 @@ impl Add for isize {
 #[lang = "panic"]
 #[track_caller]
 #[no_mangle]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
    unsafe {
        libc::puts("Panicking\0" as *const str as *const u8);
        intrinsics::abort();
--- a/compiler/rustc_codegen_gcc/tests/run/condition.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/condition.rs
@ -82,7 +82,7 @@ pub unsafe fn drop_in_place<T: ?Sized>(to_drop: *mut T) {
 #[lang = "panic"]
 #[track_caller]
 #[no_mangle]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
    unsafe {
        libc::puts("Panicking\0" as *const str as *const u8);
        intrinsics::abort();
@ -108,6 +108,7 @@ fn panic_bounds_check(index: usize, len: usize) -> ! {

 mod intrinsics {
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
--- a/compiler/rustc_codegen_gcc/tests/run/fun_ptr.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/fun_ptr.rs
@ -76,7 +76,7 @@ pub unsafe fn drop_in_place<T: ?Sized>(to_drop: *mut T) {
 #[lang = "panic"]
 #[track_caller]
 #[no_mangle]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
    unsafe {
        libc::puts("Panicking\0" as *const str as *const u8);
        intrinsics::abort();
@ -102,6 +102,7 @@ fn panic_bounds_check(index: usize, len: usize) -> ! {

 mod intrinsics {
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
--- a/compiler/rustc_codegen_gcc/tests/run/int.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/int.rs
@ -3,22 +3,14 @@
 // Run-time:
 //   status: 0

-#![feature(const_black_box, core_intrinsics, start)]
-
-#![no_std]
-
-#[panic_handler]
-fn panic_handler(_: &core::panic::PanicInfo) -> ! {
-    core::intrinsics::abort();
-}
+#![feature(const_black_box)]

 /*
 * Code
 */

-#[start]
-fn main(_argc: isize, _argv: *const *const u8) -> isize {
-    use core::hint::black_box;
+fn main() {
+    use std::hint::black_box;

    macro_rules! check {
        ($ty:ty, $expr:expr) => {
@ -335,6 +327,4 @@ fn main(_argc: isize, _argv: *const *const u8) -> isize {
        const VAL5: T = 73236519889708027473620326106273939584_i128;
        check_ops128!();
    }
-
-    0
 }
--- a/compiler/rustc_codegen_gcc/tests/run/int_overflow.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/int_overflow.rs
@ -55,6 +55,7 @@ mod libc {

 mod intrinsics {
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
@ -62,7 +63,7 @@ mod intrinsics {
 #[lang = "panic"]
 #[track_caller]
 #[no_mangle]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
    unsafe {
        // Panicking is expected iff overflow checking is enabled.
        #[cfg(debug_assertions)]
--- a/compiler/rustc_codegen_gcc/tests/run/mut_ref.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/mut_ref.rs
@ -59,6 +59,7 @@ mod libc {

 mod intrinsics {
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
@ -66,7 +67,7 @@ mod intrinsics {
 #[lang = "panic"]
 #[track_caller]
 #[no_mangle]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
    unsafe {
        libc::puts("Panicking\0" as *const str as *const u8);
        libc::fflush(libc::stdout);
--- a/compiler/rustc_codegen_gcc/tests/run/operations.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/operations.rs
@ -65,6 +65,7 @@ mod libc {

 mod intrinsics {
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
@ -72,7 +73,7 @@ mod intrinsics {
 #[lang = "panic"]
 #[track_caller]
 #[no_mangle]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
    unsafe {
        libc::puts("Panicking\0" as *const str as *const u8);
        libc::fflush(libc::stdout);
--- a/compiler/rustc_codegen_gcc/tests/run/ptr_cast.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/ptr_cast.rs
@ -76,7 +76,7 @@ pub unsafe fn drop_in_place<T: ?Sized>(to_drop: *mut T) {
 #[lang = "panic"]
 #[track_caller]
 #[no_mangle]
-pub fn panic(_msg: &str) -> ! {
+pub fn panic(_msg: &'static str) -> ! {
    unsafe {
        libc::puts("Panicking\0" as *const str as *const u8);
        intrinsics::abort();
@ -102,6 +102,7 @@ fn panic_bounds_check(index: usize, len: usize) -> ! {

 mod intrinsics {
    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
--- a/compiler/rustc_codegen_gcc/tests/run/slice.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/slice.rs
@ -102,6 +102,7 @@ mod intrinsics {
    use super::Sized;

    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
--- a/compiler/rustc_codegen_gcc/tests/run/static.rs
+++ b/compiler/rustc_codegen_gcc/tests/run/static.rs
@ -45,6 +45,7 @@ mod intrinsics {
    use super::Sized;

    extern "rust-intrinsic" {
+        #[rustc_safe_intrinsic]
        pub fn abort() -> !;
    }
 }
--- a/compiler/rustc_codegen_gcc/tools/check_intrinsics_duplicates.py
+++ b/compiler/rustc_codegen_gcc/tools/check_intrinsics_duplicates.py
@ -0,0 +1,67 @@
+import sys
+
+
+def check_duplicates():  # Flag manual intrinsics that duplicate auto ones; returns exit status.
+    auto_content = ""
+    manual_content = ""
+    # Both paths below are relative to the current working directory.
+    with open("src/intrinsic/llvm.rs", "r", encoding="utf8") as f:
+        manual_content = f.read()
+    with open("src/intrinsic/archs.rs", "r", encoding="utf8") as f:
+        auto_content = f.read()
+    # Collect the auto code entries as {intrinsic name: GCC translation}.
+    intrinsics_map = {}
+    for line in auto_content.splitlines():
+        line = line.strip()
+        if not line.startswith('"'):
+            continue
+        parts = line.split('"')
+        if len(parts) != 5:  # keep only lines with exactly four `"`, i.e. two quoted strings
+            continue
+        intrinsics_map[parts[1]] = parts[3]
+    # Nothing recognized in archs.rs: there is nothing to compare against, so fail.
+    if len(intrinsics_map) == 0:
+        print("No intrinsics found in auto code... Aborting.")
+        return 1
+    print("Found {} intrinsics in auto code".format(len(intrinsics_map)))
+    errors = []
+    lines = manual_content.splitlines()
+    pos = 0
+    found = 0  # entries (lines with two quoted strings) seen after the marker
+    while pos < len(lines):
+        line = lines[pos].strip()
+        # Marker line: start of the manual `match` from intrinsic names to GCC names.
+        if line == "let gcc_name = match name {":
+            while pos < len(lines):
+                line = lines[pos].strip()
+                pos += 1  # pos is now the 1-based number of `line`, as printed in messages
+                if line == "};":
+                    # End of the match block: print the errors and decide the exit status.
+                    if found == 0:
+                        print("No intrinsics found in manual code even though we found the "
+                            "marker... Aborting...")
+                        return 1
+                    for error in errors:
+                        print("ERROR => {}".format(error))
+                    return 1 if len(errors) != 0 else 0
+                parts = line.split('"')
+                if len(parts) != 5:
+                    continue
+                found += 1
+                if parts[1] in intrinsics_map:
+                    if parts[3] != intrinsics_map[parts[1]]:  # printed only; not counted as an error
+                        print("Same intrinsics (`{}` at line {}) but different GCC "
+                            "translations: `{}` != `{}`".format(
+                                parts[1], pos, intrinsics_map[parts[1]], parts[3]))
+                    else:
+                        errors.append("Duplicated intrinsics: `{}` at line {}. Please remove it "
+                            " from manual code".format(parts[1], pos))
+            # No closing `};` before end of file: errors are not printed here, only reflected in the status.
+            return 1 if len(errors) != 0 else 0
+        pos += 1
+    print("No intrinsics found in manual code... Aborting")  # the marker line was never found
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(check_duplicates())
--- a/compiler/rustc_codegen_gcc/tools/generate_intrinsics.py
+++ b/compiler/rustc_codegen_gcc/tools/generate_intrinsics.py
@ -13,7 +13,7 @@ def run_command(command, cwd=None):
        sys.exit(1)


-def clone_repository(repo_name, path, repo_url, sub_path=None):
+def clone_repository(repo_name, path, repo_url, sub_paths=None):
    if os.path.exists(path):
        while True:
            choice = input("There is already a `{}` folder, do you want to update it? [y/N]".format(path))
@ -27,12 +27,12 @@ def clone_repository(repo_name, path, repo_url, sub_path=None):
            else:
                print("Didn't understand answer...")
    print("Cloning {} repository...".format(repo_name))
-    if sub_path is None:
+    if sub_paths is None:
        run_command(["git", "clone", repo_url, "--depth", "1", path])
    else:
        run_command(["git", "clone", repo_url, "--filter=tree:0", "--no-checkout", path])
        run_command(["git", "sparse-checkout", "init"], cwd=path)
-        run_command(["git", "sparse-checkout", "set", "add", sub_path], cwd=path)
+        run_command(["git", "sparse-checkout", "set", *sub_paths], cwd=path)
        run_command(["git", "checkout"], cwd=path)


@ -40,56 +40,45 @@ def append_intrinsic(array, intrinsic_name, translation):
    array.append((intrinsic_name, translation))


-def extract_instrinsics(intrinsics, file):
-    print("Extracting intrinsics from `{}`...".format(file))
-    with open(file, "r", encoding="utf8") as f:
-        content = f.read()
-
-    lines = content.splitlines()
-    pos = 0
-    current_arch = None
-    while pos < len(lines):
-        line = lines[pos].strip()
-        if line.startswith("let TargetPrefix ="):
-            current_arch = line.split('"')[1].strip()
-            if len(current_arch) == 0:
-                current_arch = None
-        elif current_arch is None:
-            pass
-        elif line == "}":
-            current_arch = None
-        elif line.startswith("def "):
-            content = ""
-            while not content.endswith(";") and not content.endswith("}") and pos < len(lines):
-                line = lines[pos].split(" // ")[0].strip()
-                content += line
-                pos += 1
-            entries = re.findall('GCCBuiltin<"(\\w+)">', content)
-            if len(entries) > 0:
-                intrinsic = content.split("def ")[1].strip().split(":")[0].strip()
-                intrinsic = intrinsic.split("_")
-                if len(intrinsic) < 2 or intrinsic[0] != "int":
-                    continue
-                intrinsic[0] = "llvm"
-                intrinsic = ".".join(intrinsic)
-                if current_arch not in intrinsics:
-                    intrinsics[current_arch] = []
-                for entry in entries:
-                    append_intrinsic(intrinsics[current_arch], intrinsic, entry)
-            continue
-        pos += 1
-        continue
-    print("Done!")
+def convert_to_string(content):  # Decode `bytes` as UTF-8; return anything else unchanged.
+    if content.__class__.__name__ == 'bytes':  # exact class-name match, so e.g. `bytearray` is not decoded
+        return content.decode('utf-8')
+    return content


 def extract_instrinsics_from_llvm(llvm_path, intrinsics):
-    files = []
-    intrinsics_path = os.path.join(llvm_path, "llvm/include/llvm/IR")
-    for (dirpath, dirnames, filenames) in walk(intrinsics_path):
-        files.extend([os.path.join(intrinsics_path, f) for f in filenames if f.endswith(".td")])
-
-    for file in files:
-        extract_instrinsics(intrinsics, file)
+    p = subprocess.Popen(  # run llvm-tblgen on Intrinsics.td and parse its stdout below
+        ["llvm-tblgen", "llvm/IR/Intrinsics.td"],  # `llvm-tblgen` must be found on PATH
+        cwd=os.path.join(llvm_path, "llvm/include"),
+        stdout=subprocess.PIPE)
+    output, err = p.communicate()  # stderr is not piped, so err is None; exit code is not checked
+    lines = convert_to_string(output).splitlines()
+    pos = 0
+    while pos < len(lines):
+        line = lines[pos]
+        if not line.startswith("def "):  # only records opened by a `def ` line are inspected
+            pos += 1
+            continue
+        intrinsic = line.split(" ")[1].strip()  # second space-separated field of the `def` line
+        content = line
+        while pos < len(lines):  # lines[pos] is still the `def` line, so it is appended once more
+            line = lines[pos].split(" // ")[0].strip()
+            content += line
+            pos += 1
+            if line == "}":  # stop after the first line that is exactly `}`
+                break
+        entries = re.findall('string ClangBuiltinName = "(\\w+)";', content)
+        current_arch = re.findall('string TargetPrefix = "(\\w+)";', content)
+        if len(entries) == 1 and len(current_arch) == 1:  # skip if either field is missing, empty or repeated
+            current_arch = current_arch[0]
+            intrinsic = intrinsic.split("_")
+            if len(intrinsic) < 2 or intrinsic[0] != "int":  # only `int_*` record names are kept
+                continue
+            intrinsic[0] = "llvm"  # together with the join below: `int_a_b` becomes `llvm.a.b`
+            intrinsic = ".".join(intrinsic)
+            if current_arch not in intrinsics:
+                intrinsics[current_arch] = []
+            append_intrinsic(intrinsics[current_arch], intrinsic, entries[0])


 def append_translation(json_data, p, array):
@ -193,6 +182,8 @@ def update_intrinsics(llvm_path, llvmint, llvmint2):
            for entry in intrinsics[arch]:
                if entry[2] == True: # if it is a duplicate
                    out.write('    // [DUPLICATE]: "{}" => "{}",\n'.format(entry[0], entry[1]))
+                elif "_round_mask" in entry[1]:
+                    out.write('    // [INVALID CONVERSION]: "{}" => "{}",\n'.format(entry[0], entry[1]))
                else:
                    out.write('    "{}" => "{}",\n'.format(entry[0], entry[1]))
        out.write('    _ => unimplemented!("***** unsupported LLVM intrinsic {}", name),\n')
@ -219,7 +210,7 @@ def main():
        "llvm-project",
        llvm_path,
        "https://github.com/llvm/llvm-project",
-        sub_path="llvm/include/llvm/IR",
+        sub_paths=["llvm/include/llvm/IR", "llvm/include/llvm/CodeGen/"],
    )
    clone_repository(
        "llvmint",