Auto merge of #89167 - workingjubilee:use-simd, r=MarkSimulacrum

pub use core::simd; A portable abstraction over SIMD has been a major pursuit in recent years for several programming languages. In Rust, `std::arch` offers explicit SIMD acceleration via compiler intrinsics, but it does so at the cost of having to individually maintain each and every single such API, and is almost completely `unsafe` to use. `core::simd` offers safe abstractions that are resolved to the appropriate SIMD instructions by LLVM during compilation, including scalar instructions if that is all that is available. `core::simd` is enabled by the `#![portable_simd]` nightly feature tracked in https://github.com/rust-lang/rust/issues/86656 and is introduced here by pulling in the https://github.com/rust-lang/portable-simd repository as a subtree. We built the repository out-of-tree to allow faster compilation and a stochastic test suite backed by the proptest crate to verify that different targets, features, and optimizations produce the same result, so that using this library does not introduce any surprises. As these tests are technically non-deterministic, and thus can introduce overly interesting Heisenbugs if included in the rustc CI, they are visible in the commit history of the subtree but do nothing here. Some tests **are** introduced via the documentation, but these use deterministic asserts. There are multiple unsolved problems with the library at the current moment, including a want for better documentation, technical issues with LLVM scalarizing and lowering to libm, room for improvement for the APIs, and so far I have not added the necessary plumbing for allowing the more experimental or libm-dependent APIs to be used. However, I thought it would be prudent to open this for review in its current condition, as it is both usable and it is likely I am going to learn something else needs to be fixed when bors tries this out. The major types are - `core::simd::Simd<T, N>` - `core::simd::Mask<T, N>` There is also the `LaneCount` struct, which, together with the SimdElement and SupportedLaneCount traits, limit the implementation's maximum support to vectors we know will actually compile and provide supporting logic for bitmasks. I'm hoping to simplify at least some of these out of the way as the compiler and library evolve.
2024-11-23 23:34:48 +00:00 · 2021-11-13 02:17:20 +00:00 · 2021-11-13 02:17:20 +00:00 · 032dfe4360
commit 032dfe4360
parent e90c5fbbc5 7c3d72d069
89 changed files with 7631 additions and 0 deletions
--- a/library/core/src/lib.rs
+++ b/library/core/src/lib.rs
@ -392,4 +392,25 @@ pub mod arch {
    }
 }

+// Pull in the `core_simd` crate directly into libcore. The contents of
+// `core_simd` are in a different repository: rust-lang/portable-simd.
+//
+// `core_simd` depends on libcore, but the contents of this module are
+// set up in such a way that directly pulling it here works such that the
+// crate uses this crate as its libcore.
+#[path = "../../portable-simd/crates/core_simd/src/mod.rs"]
+#[allow(missing_debug_implementations, dead_code, unsafe_op_in_unsafe_fn, unused_unsafe)]
+#[allow(rustdoc::bare_urls)]
+#[unstable(feature = "portable_simd", issue = "86656")]
+#[cfg(not(bootstrap))]
+mod core_simd;
+
+#[doc = include_str!("../../portable-simd/crates/core_simd/src/core_simd_docs.md")]
+#[unstable(feature = "portable_simd", issue = "86656")]
+#[cfg(not(bootstrap))]
+pub mod simd {
+    #[unstable(feature = "portable_simd", issue = "86656")]
+    pub use crate::core_simd::simd::*;
+}
+
 include!("primitive_docs.rs");
--- a/library/core/tests/lib.rs
+++ b/library/core/tests/lib.rs
@ -60,6 +60,7 @@
 #![feature(never_type)]
 #![feature(unwrap_infallible)]
 #![feature(result_into_ok_or_err)]
+#![cfg_attr(not(bootstrap), feature(portable_simd))]
 #![feature(ptr_metadata)]
 #![feature(once_cell)]
 #![feature(unsized_tuple_coercion)]
@ -105,6 +106,8 @@ mod pattern;
 mod pin;
 mod ptr;
 mod result;
+#[cfg(not(bootstrap))]
+mod simd;
 mod slice;
 mod str;
 mod str_lossy;
--- a/library/core/tests/simd.rs
+++ b/library/core/tests/simd.rs
@ -0,0 +1,13 @@
+use core::simd::f32x4;
+
+#[test]
+fn testing() {
+    let x = f32x4::from_array([1.0, 1.0, 1.0, 1.0]);
+    let y = -x;
+
+    let h = x * 0.5;
+
+    let r = y.abs();
+    assert_eq!(x, r);
+    assert_eq!(h, f32x4::splat(0.5));
+}
--- a/library/portable-simd/.github/ISSUE_TEMPLATE/blank_issue.md
+++ b/library/portable-simd/.github/ISSUE_TEMPLATE/blank_issue.md
@ -0,0 +1,4 @@
+---
+name: Blank Issue
+about: Create a blank issue.
+---
--- a/library/portable-simd/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/library/portable-simd/.github/ISSUE_TEMPLATE/bug_report.md
@ -0,0 +1,50 @@
+---
+name: Bug Report
+about: Create a bug report for Rust.
+labels: C-bug
+---
+<!--
+Thank you for filing a bug report! 🐛 Please provide a short summary of the bug,
+along with any information you feel relevant to replicating the bug.
+-->
+
+I tried this code:
+
+```rust
+<code>
+```
+
+I expected to see this happen: *explanation*
+
+Instead, this happened: *explanation*
+
+### Meta
+
+`rustc --version --verbose`:
+```
+<version>
+```
+
+
+`crate version in Cargo.toml`:
+```toml
+[dependencies]
+stdsimd = 
+```
+<!-- If this specifies the repo at HEAD, please include the latest commit. -->
+
+
+<!--
+If a backtrace is available, please include a backtrace in the code block by
+setting `RUST_BACKTRACE=1` in your environment. e.g.
+`RUST_BACKTRACE=1 cargo build`.
+-->
+<details><summary>Backtrace</summary>
+<p>
+
+```
+<backtrace>
+```
+
+</p>
+</details>
--- a/library/portable-simd/.github/ISSUE_TEMPLATE/config.yml
+++ b/library/portable-simd/.github/ISSUE_TEMPLATE/config.yml
@ -0,0 +1,10 @@
+# This only controls whether a tiny, hard-to-find "open a blank issue" link appears at the end of
+# the template list.
+blank_issues_enabled: true
+contact_links:
+  - name: Intrinsic Support
+    url: https://github.com/rust-lang/stdarch/issues
+    about: Please direct issues about Rust's support for vendor intrinsics to core::arch
+  - name: Internal Compiler Error
+    url: https://github.com/rust-lang/rust/issues
+    about: Please report ICEs to the rustc repository
--- a/library/portable-simd/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/library/portable-simd/.github/ISSUE_TEMPLATE/feature_request.md
@ -0,0 +1,14 @@
+---
+name: Feature Request
+about: Request an addition to the core::simd API
+labels: C-feature-request
+---
+<!--
+  Hello!
+
+  We are very interested in any feature requests you may have.
+
+  However, please be aware that core::simd exists to address concerns with creating a portable SIMD API for Rust.
+  Requests for extensions to compiler features, such as `target_feature`, binary versioning for SIMD APIs, or
+  improving specific compilation issues in general should be discussed at https://internals.rust-lang.org/
+-->
--- a/library/portable-simd/.github/PULL_REQUEST_TEMPLATE.md
+++ b/library/portable-simd/.github/PULL_REQUEST_TEMPLATE.md
@ -0,0 +1,18 @@
+Hello, welcome to `std::simd`!
+
+It seems this pull request template checklist was created while a lot of vector math ops were being implemented, and only really applies to ops. Feel free to delete everything here if it's not applicable, or ask for help if you're not sure what it means!
+
+For a given vector math operation on TxN, please add tests for interactions with:
+  - [ ] `T::MAX`
+  - [ ] `T::MIN`
+  - [ ] -1
+  - [ ] 1
+  - [ ] 0
+
+
+For a given vector math operation on TxN where T is a float, please add tests for test interactions with:
+  - [ ] a really large number, larger than the mantissa
+  - [ ] a really small "subnormal" number
+  - [ ] NaN
+  - [ ] Infinity
+  - [ ] Negative Infinity
--- a/library/portable-simd/.github/workflows/ci.yml
+++ b/library/portable-simd/.github/workflows/ci.yml
@ -0,0 +1,260 @@
+name: CI
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+env:
+  CARGO_NET_RETRY: 10
+  RUSTUP_MAX_RETRIES: 10
+
+jobs:
+  rustfmt:
+    name: "rustfmt"
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup Rust
+        run: |
+          rustup update nightly --no-self-update
+          rustup default nightly
+          rustup component add rustfmt
+      - name: Run rustfmt
+        run: cargo fmt --all -- --check
+
+  clippy:
+    name: "clippy on ${{ matrix.target }}"
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          # We shouldn't really have any OS-specific code, so think of this as a list of architectures
+          - x86_64-unknown-linux-gnu
+          - i686-unknown-linux-gnu
+          - i586-unknown-linux-gnu
+          - aarch64-unknown-linux-gnu
+          - armv7-unknown-linux-gnueabihf
+          - mips-unknown-linux-gnu
+          - mips64-unknown-linux-gnuabi64
+          - powerpc-unknown-linux-gnu
+          - powerpc64-unknown-linux-gnu
+          - riscv64gc-unknown-linux-gnu
+          - s390x-unknown-linux-gnu
+          - sparc64-unknown-linux-gnu
+          - wasm32-unknown-unknown
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup Rust
+        run: |
+          rustup update nightly --no-self-update
+          rustup default nightly
+          rustup target add ${{ matrix.target }}
+          rustup component add clippy
+      - name: Run Clippy
+        run: cargo clippy --all-targets --target ${{ matrix.target }}
+
+  x86-tests:
+    name: "${{ matrix.target_feature }} on ${{ matrix.target }}"
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        target: [x86_64-pc-windows-msvc, i686-pc-windows-msvc, i586-pc-windows-msvc, x86_64-unknown-linux-gnu, x86_64-apple-darwin]
+        # `default` means we use the default target config for the target,
+        # `native` means we run with `-Ctarget-cpu=native`, and anything else is
+        # an arg to `-Ctarget-feature`
+        target_feature: [default, native, +sse3, +ssse3, +sse4.1, +sse4.2, +avx, +avx2]
+
+        exclude:
+          # The macos runners seem to only reliably support up to `avx`.
+          - { target: x86_64-apple-darwin, target_feature: +avx2 }
+          # These features are statically known to be present for all 64 bit
+          # macs, and thus are covered by the `default` test
+          - { target: x86_64-apple-darwin, target_feature: +sse3 }
+          - { target: x86_64-apple-darwin, target_feature: +ssse3 }
+          # -Ctarget-cpu=native sounds like bad-news if target != host
+          - { target: i686-pc-windows-msvc, target_feature: native }
+          - { target: i586-pc-windows-msvc, target_feature: native }
+
+        include:
+          # Populate the `matrix.os` field
+          - { target: x86_64-apple-darwin,      os: macos-latest }
+          - { target: x86_64-unknown-linux-gnu, os: ubuntu-latest }
+          - { target: x86_64-pc-windows-msvc,   os: windows-latest }
+          - { target: i686-pc-windows-msvc,     os: windows-latest }
+          - { target: i586-pc-windows-msvc,     os: windows-latest }
+
+          # These are globally available on all the other targets.
+          - { target: i586-pc-windows-msvc, target_feature: +sse, os: windows-latest }
+          - { target: i586-pc-windows-msvc, target_feature: +sse2, os: windows-latest }
+
+          # Annoyingly, the x86_64-unknown-linux-gnu runner *almost* always has
+          # avx512vl, but occasionally doesn't.  Maybe one day we can enable it.
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup Rust
+        run: |
+          rustup update nightly --no-self-update
+          rustup default nightly
+          rustup target add ${{ matrix.target }}
+
+      - name: Configure RUSTFLAGS
+        shell: bash
+        run: |
+          case "${{ matrix.target_feature }}" in
+            default)
+              echo "RUSTFLAGS=-Dwarnings" >> $GITHUB_ENV;;
+            native)
+              echo "RUSTFLAGS=-Dwarnings -Ctarget-cpu=native" >> $GITHUB_ENV
+              ;;
+            *)
+              echo "RUSTFLAGS=-Dwarnings -Ctarget-feature=${{ matrix.target_feature }}" >> $GITHUB_ENV
+              ;;
+          esac
+
+      # Super useful for debugging why a SIGILL occurred.
+      - name: Dump target configuration and support
+        run: |
+          rustc -Vv
+
+          echo "Caveat: not all target features are expected to be logged"
+
+          echo "## Requested target configuration (RUSTFLAGS=$RUSTFLAGS)"
+          rustc --print=cfg --target=${{ matrix.target }} $RUSTFLAGS
+
+          echo "## Supported target configuration for --target=${{ matrix.target }}"
+          rustc --print=cfg --target=${{ matrix.target }} -Ctarget-cpu=native
+
+          echo "## Natively supported target configuration"
+          rustc --print=cfg -Ctarget-cpu=native
+
+      - name: Test (debug)
+        run: cargo test --verbose --target=${{ matrix.target }}
+
+      - name: Test (release)
+        run: cargo test --verbose --target=${{ matrix.target }} --release
+
+  wasm-tests:
+    name: "wasm (firefox, ${{ matrix.name }})"
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        include:
+          - { name: default, RUSTFLAGS: "" }
+          - { name: simd128, RUSTFLAGS: "-C target-feature=+simd128" }
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup Rust
+        run: |
+          rustup update nightly --no-self-update
+          rustup default nightly
+      - name: Install wasm-pack
+        run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
+      - name: Test (debug)
+        run: wasm-pack test --firefox --headless crates/core_simd
+        env:
+            RUSTFLAGS: ${{ matrix.rustflags }}
+      - name: Test (release)
+        run: wasm-pack test --firefox --headless crates/core_simd --release
+        env:
+            RUSTFLAGS: ${{ matrix.rustflags }}
+
+  cross-tests:
+    name: "${{ matrix.target }} (via cross)"
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      # TODO: Sadly, we cant configure target-feature in a meaningful way
+      # because `cross` doesn't tell qemu to enable any non-default cpu
+      # features, nor does it give us a way to do so.
+      #
+      # Ultimately, we'd like to do something like [rust-lang/stdarch][stdarch].
+      # This is a lot more complex... but in practice it's likely that we can just
+      # snarf the docker config from around [here][1000-dockerfiles].
+      #
+      # [stdarch]: https://github.com/rust-lang/stdarch/blob/a5db4eaf/.github/workflows/main.yml#L67
+      # [1000-dockerfiles]: https://github.com/rust-lang/stdarch/tree/a5db4eaf/ci/docker
+
+      matrix:
+        target:
+          - i586-unknown-linux-gnu
+          # 32-bit arm has a few idiosyncracies like having subnormal flushing
+          # to zero on by default. Ideally we'd set
+          - armv7-unknown-linux-gnueabihf
+          - aarch64-unknown-linux-gnu
+          # Note: The issue above means neither of these mips targets will use
+          # MSA (mips simd) but MIPS uses a nonstandard binary representation
+          # for NaNs which makes it worth testing on despite that.
+          - mips-unknown-linux-gnu
+          - mips64-unknown-linux-gnuabi64
+          - riscv64gc-unknown-linux-gnu
+          # TODO this test works, but it appears to time out
+          # - powerpc-unknown-linux-gnu
+          # TODO this test is broken, but it appears to be a problem with QEMU, not us.
+          # - powerpc64le-unknown-linux-gnu
+          # TODO enable this once a new version of cross is released
+          # - powerpc64-unknown-linux-gnu
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup Rust
+        run: |
+          rustup update nightly --no-self-update
+          rustup default nightly
+          rustup target add ${{ matrix.target }}
+          rustup component add rust-src
+
+      - name: Install Cross
+        # Equivalent to `cargo install cross`, but downloading a prebuilt
+        # binary. Ideally we wouldn't hardcode a version, but the version number
+        # being part of the tarball means we can't just use the download/latest
+        # URL :(
+        run: |
+          CROSS_URL=https://github.com/rust-embedded/cross/releases/download/v0.2.1/cross-v0.2.1-x86_64-unknown-linux-gnu.tar.gz
+          mkdir -p "$HOME/.bin"
+          curl -sfSL --retry-delay 10 --retry 5 "${CROSS_URL}" | tar zxf - -C "$HOME/.bin"
+          echo "$HOME/.bin" >> $GITHUB_PATH
+
+      - name: Test (debug)
+        run: cross test --verbose --target=${{ matrix.target }}
+
+      - name: Test (release)
+        run: cross test --verbose --target=${{ matrix.target }} --release
+
+  features:
+    name: "Check cargo features (${{ matrix.simd }} × ${{ matrix.features }})"
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        simd:
+          - ""
+          - "avx512"
+        features:
+          - ""
+          - "--features std"
+          - "--features generic_const_exprs"
+          - "--features std --features generic_const_exprs"
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup Rust
+        run: |
+          rustup update nightly --no-self-update
+          rustup default nightly
+      - name: Detect AVX512
+        run: echo "CPU_FEATURE=$(lscpu | grep -o avx512[a-z]* | sed s/avx/+avx/ | tr '\n' ',' )" >> $GITHUB_ENV
+      - name: Check build
+        if: ${{ matrix.simd == '' }}
+        run: RUSTFLAGS="-Dwarnings" cargo check --all-targets --no-default-features ${{ matrix.features }}
+      - name: Check AVX
+        if: ${{ matrix.simd == 'avx512' && contains(env.CPU_FEATURE, 'avx512') }}
+        run: |
+          echo "Found AVX features: $CPU_FEATURE"
+          RUSTFLAGS="-Dwarnings -Ctarget-feature=$CPU_FEATURE" cargo check --all-targets --no-default-features ${{ matrix.features }}
--- a/library/portable-simd/.github/workflows/doc.yml
+++ b/library/portable-simd/.github/workflows/doc.yml
@ -0,0 +1,30 @@
+name: Documentation
+
+on:
+  push:
+    branches:
+      - master
+
+jobs:
+  release:
+    name: Deploy Documentation
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v1
+
+      - name: Setup Rust
+        run: |
+          rustup update nightly --no-self-update
+          rustup default nightly
+
+      - name: Build Documentation
+        run: cargo doc --no-deps
+      
+      - name: Deploy Documentation
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_branch: gh-pages
+          publish_dir: ./target/doc
--- a/library/portable-simd/.gitignore
+++ b/library/portable-simd/.gitignore
@ -0,0 +1,2 @@
+/target
+Cargo.lock
--- a/library/portable-simd/CONTRIBUTING.md
+++ b/library/portable-simd/CONTRIBUTING.md
@ -0,0 +1,32 @@
+# Contributing to `std::simd`
+
+Simple version:
+1. Fork it and `git clone` it
+2. Create your feature branch: `git checkout -b my-branch`
+3. Write your changes.
+4. Test it: `cargo test`. Remember to enable whatever SIMD features you intend to test by setting `RUSTFLAGS`.
+5. Commit your changes: `git commit add ./path/to/changes && git commit -m 'Fix some bug'`
+6. Push the branch: `git push --set-upstream origin my-branch`
+7. Submit a pull request!
+
+## Taking on an Issue
+
+SIMD can be quite complex, and even a "simple" issue can be huge. If an issue is organized like a tracking issue, with an itemized list of items that don't necessarily have to be done in a specific order, please take the issue one item at a time. This will help by letting work proceed apace on the rest of the issue. If it's a (relatively) small issue, feel free to announce your intention to solve it on the issue tracker and take it in one go!
+
+## CI
+
+We currently have 2 CI matrices through Travis CI and GitHub Actions that will automatically build and test your change in order to verify that `std::simd`'s portable API is, in fact, portable. If your change builds locally, but does not build on either, this is likely due to a platform-specific concern that your code has not addressed. Please consult the build logs and address the error, or ask for help if you need it.
+
+## Beyond stdsimd
+
+A large amount of the core SIMD implementation is found in the rustc_codegen_* crates in the [main rustc repo](https://github.com/rust-lang/rust). In addition, actual platform-specific functions are implemented in [stdarch]. Not all changes to `std::simd` require interacting with either of these, but if you're wondering where something is and it doesn't seem to be in this repository, those might be where to start looking.
+
+## Questions? Concerns? Need Help?
+
+Please feel free to ask in the [#project-portable-simd][zulip-portable-simd] stream on the [rust-lang Zulip][zulip] for help with making changes to `std::simd`!
+If your changes include directly modifying the compiler, it might also be useful to ask in [#t-compiler/help][zulip-compiler-help].
+
+[zulip-portable-simd]: https://rust-lang.zulipchat.com/#narrow/stream/257879-project-portable-simd
+[zulip-compiler-help]: https://rust-lang.zulipchat.com/#narrow/stream/182449-t-compiler.2Fhelp
+[zulip]: https://rust-lang.zulipchat.com
+[stdarch]: https://github.com/rust-lang/stdarch
--- a/library/portable-simd/Cargo.toml
+++ b/library/portable-simd/Cargo.toml
@ -0,0 +1,6 @@
+[workspace]
+
+members = [
+    "crates/core_simd",
+    "crates/test_helpers",
+]
--- a/library/portable-simd/LICENSE-APACHE
+++ b/library/portable-simd/LICENSE-APACHE
@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/library/portable-simd/LICENSE-MIT
+++ b/library/portable-simd/LICENSE-MIT
@ -0,0 +1,19 @@
+Copyright (c) 2020 The Rust Project Developers
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/library/portable-simd/README.md
+++ b/library/portable-simd/README.md
@ -0,0 +1,69 @@
+# The Rust standard library's portable SIMD API
+[![Build Status](https://travis-ci.com/rust-lang/portable-simd.svg?branch=master)](https://travis-ci.com/rust-lang/portable-simd)
+
+Code repository for the [Portable SIMD Project Group](https://github.com/rust-lang/project-portable-simd).
+Please refer to [CONTRIBUTING.md](./CONTRIBUTING.md) for our contributing guidelines.
+
+The docs for this crate are published from the main branch.
+You can [read them here][docs].
+
+If you have questions about SIMD, we have begun writing a [guide][simd-guide].
+We can also be found on [Zulip][zulip-project-portable-simd].
+
+If you are interested in support for a specific architecture, you may want [stdarch] instead.
+
+## Hello World
+
+Now we're gonna dip our toes into this world with a small SIMD "Hello, World!" example. Make sure your compiler is up to date and using `nightly`. We can do that by running 
+
+```bash
+rustup update -- nightly
+```
+
+or by setting up `rustup default nightly` or else with `cargo +nightly {build,test,run}`. After updating, run 
+```bash
+cargo new hellosimd
+```
+to create a new crate. Edit `hellosimd/Cargo.toml` to be 
+```toml
+[package]
+name = "hellosimd"
+version = "0.1.0"
+edition = "2018"
+[dependencies]
+core_simd = { git = "https://github.com/rust-lang/portable-simd" }
+```
+
+and finally write this in `src/main.rs`:
+```rust
+use core_simd::*;
+fn main() {
+    let a = f32x4::splat(10.0);
+    let b = f32x4::from_array([1.0, 2.0, 3.0, 4.0]);
+    println!("{:?}", a + b);
+}
+```
+
+Explanation: We import all the bindings from the crate with the first line. Then, we construct our SIMD vectors with methods like `splat` or `from_array`. Finally, we can use operators on them like `+` and the appropriate SIMD instructions will be carried out. When we run `cargo run` you should get `[11.0, 12.0, 13.0, 14.0]`.
+
+## Code Organization
+
+Currently the crate is organized so that each element type is a file, and then the 64-bit, 128-bit, 256-bit, and 512-bit vectors using those types are contained in said file.
+
+All types are then exported as a single, flat module.
+
+Depending on the size of the primitive type, the number of lanes the vector will have varies. For example, 128-bit vectors have four `f32` lanes and two `f64` lanes.
+
+The supported element types are as follows:
+* **Floating Point:** `f32`, `f64`
+* **Signed Integers:** `i8`, `i16`, `i32`, `i64`, `i128`, `isize`
+* **Unsigned Integers:** `u8`, `u16`, `u32`, `u64`, `u128`, `usize`
+* **Masks:** `mask8`, `mask16`, `mask32`, `mask64`, `mask128`, `masksize`
+
+Floating point, signed integers, and unsigned integers are the [primitive types](https://doc.rust-lang.org/core/primitive/index.html) you're already used to.
+The `mask` types are "truthy" values, but they use the number of bits in their name instead of just 1 bit like a normal `bool` uses.
+
+[simd-guide]: ./beginners-guide.md
+[zulip-project-portable-simd]: https://rust-lang.zulipchat.com/#narrow/stream/257879-project-portable-simd
+[stdarch]: https://github.com/rust-lang/stdarch
+[docs]: https://rust-lang.github.io/portable-simd/core_simd
--- a/library/portable-simd/beginners-guide.md
+++ b/library/portable-simd/beginners-guide.md
@ -0,0 +1,86 @@
+
+# Beginner's Guide To SIMD
+
+Hello and welcome to our SIMD basics guide!
+
+Because SIMD is a subject that many programmers haven't worked with before, we thought that it's best to outline some terms and other basics for you to get started with.
+
+## Quick Background
+
+**SIMD** stands for *Single Instruction, Multiple Data*. In other words, SIMD is when the CPU performs a single action on more than one logical piece of data at the same time. Instead of adding two registers that each contain one `f32` value and getting an `f32` as the result, you might add two registers that each contain `f32x4` (128 bits of data) and then you get an `f32x4` as the output.
+
+This might seem a tiny bit weird at first, but there's a good reason for it. Back in the day, as CPUs got faster and faster, eventually they got so fast that the CPU would just melt itself. The heat management (heat sinks, fans, etc) simply couldn't keep up with how much electricity was going through the metal. Two main strategies were developed to help get around the limits of physics.
+* One of them you're probably familiar with: Multi-core processors. By giving a processor more than one core, each core can do its own work, and because they're physically distant (at least on the CPU's scale) the heat can still be managed. Unfortunately, not all tasks can just be split up across cores in an efficient way.
+* The second strategy is SIMD. If you can't make the register go any faster, you can still make the register *wider*. This lets you process more data at a time, which is *almost* as good as just having a faster CPU. As with multi-core programming, SIMD doesn't fit every kind of task, so you have to know when it will improve your program.
+
+## Terms
+
+SIMD has a few special vocabulary terms you should know:
+
+* **Vector:** A SIMD value is called a vector. This shouldn't be confused with the `Vec<T>` type. A SIMD vector has a fixed size, known at compile time. All of the elements within the vector are of the same type. This makes vectors *similar to* arrays. One difference is that a vector is generally aligned to its *entire* size (eg: 16 bytes, 32 bytes, etc), not just the size of an individual element. Sometimes vector data is called "packed" data.
+
+* **Vectorize**: An operation that uses SIMD instructions to operate over a vector is often referred to as "vectorized".
+
+* **Autovectorization**: Also known as _implicit vectorization_. This is when a compiler can automatically recognize a situation where scalar instructions may be replaced with SIMD instructions, and use those instead.
+
+* **Scalar:** "Scalar" in mathematical contexts refers to values that can be represented as a single element, mostly numbers like 6, 3.14, or -2. It can also be used to describe "scalar operations" that use strictly scalar values, like addition. This term is mostly used to differentiate between vectorized operations that use SIMD instructions and scalar operations that don't.
+
+* **Lane:** A single element position within a vector is called a lane. If you have `N` lanes available then they're numbered from `0` to `N-1` when referring to them, again like an array. The biggest difference between an array element and a vector lane is that in general is *relatively costly* to access an individual lane value. On most architectures, the vector has to be pushed out of the SIMD register onto the stack, then an individual lane is accessed while it's on the stack (and possibly the stack value is read back into a register). For this reason, when working with SIMD you should avoid reading or writing the value of an individual lane during hot loops.
+
+* **Bit Widths:** When talking about SIMD, the bit widths used are the bit size of the vectors involved, *not* the individual elements. So "128-bit SIMD" has 128-bit vectors, and that might be `f32x4`, `i32x4`, `i16x8`, or other variations. While 128-bit SIMD is the most common, there's also 64-bit, 256-bit, and even 512-bit on the newest CPUs.
+
+* **Vector Register:** The extra-wide registers that are used for SIMD operations are commonly called vector registers, though you may also see "SIMD registers", vendor names for specific features, or even "floating-point register" as it is common for the same registers to be used with both scalar and vectorized floating-point operations.
+
+* **Vertical:** When an operation is "vertical", each lane processes individually without regard to the other lanes in the same vector. For example, a "vertical add" between two vectors would add lane 0 in `a` with lane 0 in `b`, with the total in lane 0 of `out`, and then the same thing for lanes 1, 2, etc. Most SIMD operations are vertical operations, so if your problem is a vertical problem then you can probably solve it with SIMD.
+
+* **Horizontal:** When an operation is "horizontal", the lanes within a single vector interact in some way. A "horizontal add" might add up lane 0 of `a` with lane 1 of `a`, with the total in lane 0 of `out`.
+
+* **Target Feature:** Rust calls a CPU architecture extension a `target_feature`. Proper SIMD requires various CPU extensions to be enabled (details below). Don't confuse this with `feature`, which is a Cargo crate concept.
+
+## Target Features
+
+When using SIMD, you should be familiar with the CPU feature set that you're targeting.
+
+On `arm` and `aarch64` it's fairly simple. There's just one CPU feature that controls if SIMD is available: `neon` (or "NEON", all caps, as the ARM docs often put it). Neon registers can be used as 64-bit or 128-bit. When doing 128-bit operations it just uses two 64-bit registers as a single 128-bit register.
+
+> By default, the `aarch64`, `arm`, and `thumb` Rust targets generally do not enable `neon` unless it's in the target string.
+
+On `x86` and `x86_64` it's slightly more complicated. The SIMD support is split into many levels:
+* 128-bit: `sse`, `sse2`, `sse3`, `ssse3` (not a typo!), `sse4.1`, `sse4.2`, `sse4a` (AMD only)
+* 256-bit (mostly): `avx`, `avx2`, `fma`
+* 512-bit (mostly): a *wide* range of `avx512` variations
+
+The list notes the bit widths available at each feature level, though the operations of the more advanced features can generally be used with the smaller register sizes as well. For example, new operations introduced in `avx` generally have a 128-bit form as well as a 256-bit form. This means that even if you only do 128-bit work you can still benefit from the later feature levels.
+
+> By default, the `i686` and `x86_64` Rust targets enable `sse` and `sse2`.
+
+### Selecting Additional Target Features
+
+If you want to enable support for a target feature within your build, generally you should use a [target-feature](https://rust-lang.github.io/packed_simd/perf-guide/target-feature/rustflags.html#target-feature) setting within you `RUSTFLAGS` setting.
+
+If you know that you're targeting a specific CPU you can instead use the [target-cpu](https://rust-lang.github.io/packed_simd/perf-guide/target-feature/rustflags.html#target-cpu) flag and the compiler will enable the correct set of features for that CPU.
+
+The [Steam Hardware Survey](https://store.steampowered.com/hwsurvey/Steam-Hardware-Software-Survey-Welcome-to-Steam) is one of the few places with data on how common various CPU features are. The dataset is limited to "the kinds of computers owned by people who play computer games", so the info only covers `x86`/`x86_64`, and it also probably skews to slightly higher quality computers than average. Still, we can see that the `sse` levels have very high support, `avx` and `avx2` are quite common as well, and the `avx-512` family is still so early in adoption you can barely find it in consumer grade stuff.
+
+## Running a program compiled for a CPU feature level that the CPU doesn't support is automatic undefined behavior.
+
+This means that if you build your program with `avx` support enabled and run it on a CPU without `avx` support, it's **instantly** undefined behavior.
+
+Even without an `unsafe` block in sight.
+
+This is no bug in Rust, or soundness hole in the type system. You just plain can't make a CPU do what it doesn't know how to do.
+
+This is why the various Rust targets *don't* enable many CPU feature flags by default: requiring a more advanced CPU makes the final binary *less* portable.
+
+So please select an appropriate CPU feature level when building your programs.
+
+## Size, Alignment, and Unsafe Code
+
+Most of the portable SIMD API is designed to allow the user to gloss over the details of different architectures and avoid using unsafe code. However, there are plenty of reasons to want to use unsafe code with these SIMD types, such as using an intrinsic function from `core::arch` to further accelerate particularly specialized SIMD operations on a given platform, while still using the portable API elsewhere. For these cases, there are some rules to keep in mind.
+
+Fortunately, most SIMD types have a fairly predictable size. `i32x4` is bit-equivalent to `[i32; 4]` and so can be bitcast to it, e.g. using [`mem::transmute`], though the API usually offers a safe cast you can use instead.
+
+However, this is not the same as alignment. Computer architectures generally prefer aligned accesses, especially when moving data between memory and vector registers, and while some support specialized operations that can bend the rules to help with this, unaligned access is still typically slow, or even undefined behavior. In addition, different architectures can require different alignments when interacting with their native SIMD types. For this reason, any `#[repr(simd)]` type has a non-portable alignment. If it is necessary to directly interact with the alignment of these types, it should be via [`mem::align_of`].
+
+[`mem::transmute`]: https://doc.rust-lang.org/core/mem/fn.transmute.html
+[`mem::align_of`]: https://doc.rust-lang.org/core/mem/fn.align_of.html
--- a/library/portable-simd/crates/core_simd/Cargo.toml
+++ b/library/portable-simd/crates/core_simd/Cargo.toml
@ -0,0 +1,28 @@
+[package]
+name = "core_simd"
+version = "0.1.0"
+edition = "2021"
+homepage = "https://github.com/rust-lang/portable-simd"
+repository = "https://github.com/rust-lang/portable-simd"
+keywords = ["core", "simd", "intrinsics"]
+categories = ["hardware-support", "no-std"]
+license = "MIT OR Apache-2.0"
+
+[features]
+default = ["std", "generic_const_exprs"]
+std = []
+generic_const_exprs = []
+
+[target.'cfg(target_arch = "wasm32")'.dev-dependencies.wasm-bindgen]
+version = "0.2"
+
+[dev-dependencies.wasm-bindgen-test]
+version = "0.3"
+
+[dev-dependencies.proptest]
+version = "0.10"
+default-features = false
+features = ["alloc"]
+
+[dev-dependencies.test_helpers]
+path = "../test_helpers"
--- a/library/portable-simd/crates/core_simd/LICENSE-APACHE
+++ b/library/portable-simd/crates/core_simd/LICENSE-APACHE
@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/library/portable-simd/crates/core_simd/LICENSE-MIT
+++ b/library/portable-simd/crates/core_simd/LICENSE-MIT
@ -0,0 +1,19 @@
+Copyright (c) 2020 The Rust Project Developers
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/library/portable-simd/crates/core_simd/examples/matrix_inversion.rs
+++ b/library/portable-simd/crates/core_simd/examples/matrix_inversion.rs
@ -0,0 +1,316 @@
+//! 4x4 matrix inverse
+// Code ported from the `packed_simd` crate
+// Run this code with `cargo test --example matrix_inversion`
+#![feature(array_chunks, portable_simd)]
+use core_simd::simd::*;
+use Which::*;
+
+// Gotta define our own 4x4 matrix since Rust doesn't ship multidim arrays yet :^)
+#[derive(Copy, Clone, Debug, PartialEq, PartialOrd)]
+pub struct Matrix4x4([[f32; 4]; 4]);
+
+#[allow(clippy::too_many_lines)]
+pub fn scalar_inv4x4(m: Matrix4x4) -> Option<Matrix4x4> {
+    let m = m.0;
+
+    #[rustfmt::skip]
+    let mut inv = [
+        // row 0:
+        [
+            // 0,0:
+            m[1][1] * m[2][2] * m[3][3] -
+            m[1][1] * m[2][3] * m[3][2] -
+            m[2][1] * m[1][2] * m[3][3] +
+            m[2][1] * m[1][3] * m[3][2] +
+            m[3][1] * m[1][2] * m[2][3] -
+            m[3][1] * m[1][3] * m[2][2],
+            // 0,1:
+           -m[0][1] * m[2][2] * m[3][3] +
+            m[0][1] * m[2][3] * m[3][2] +
+            m[2][1] * m[0][2] * m[3][3] -
+            m[2][1] * m[0][3] * m[3][2] -
+            m[3][1] * m[0][2] * m[2][3] +
+            m[3][1] * m[0][3] * m[2][2],
+            // 0,2:
+            m[0][1] * m[1][2] * m[3][3] -
+            m[0][1] * m[1][3] * m[3][2] -
+            m[1][1] * m[0][2] * m[3][3] +
+            m[1][1] * m[0][3] * m[3][2] +
+            m[3][1] * m[0][2] * m[1][3] -
+            m[3][1] * m[0][3] * m[1][2],
+            // 0,3:
+           -m[0][1] * m[1][2] * m[2][3] +
+            m[0][1] * m[1][3] * m[2][2] +
+            m[1][1] * m[0][2] * m[2][3] -
+            m[1][1] * m[0][3] * m[2][2] -
+            m[2][1] * m[0][2] * m[1][3] +
+            m[2][1] * m[0][3] * m[1][2],
+        ],
+        // row 1
+        [
+            // 1,0:
+           -m[1][0] * m[2][2] * m[3][3] +
+            m[1][0] * m[2][3] * m[3][2] +
+            m[2][0] * m[1][2] * m[3][3] -
+            m[2][0] * m[1][3] * m[3][2] -
+            m[3][0] * m[1][2] * m[2][3] +
+            m[3][0] * m[1][3] * m[2][2],
+            // 1,1:
+            m[0][0] * m[2][2] * m[3][3] -
+            m[0][0] * m[2][3] * m[3][2] -
+            m[2][0] * m[0][2] * m[3][3] +
+            m[2][0] * m[0][3] * m[3][2] +
+            m[3][0] * m[0][2] * m[2][3] -
+            m[3][0] * m[0][3] * m[2][2],
+            // 1,2:
+           -m[0][0] * m[1][2] * m[3][3] +
+            m[0][0] * m[1][3] * m[3][2] +
+            m[1][0] * m[0][2] * m[3][3] -
+            m[1][0] * m[0][3] * m[3][2] -
+            m[3][0] * m[0][2] * m[1][3] +
+            m[3][0] * m[0][3] * m[1][2],
+            // 1,3:
+            m[0][0] * m[1][2] * m[2][3] -
+            m[0][0] * m[1][3] * m[2][2] -
+            m[1][0] * m[0][2] * m[2][3] +
+            m[1][0] * m[0][3] * m[2][2] +
+            m[2][0] * m[0][2] * m[1][3] -
+            m[2][0] * m[0][3] * m[1][2],
+        ],
+        // row 2
+        [
+            // 2,0:
+            m[1][0] * m[2][1] * m[3][3] -
+            m[1][0] * m[2][3] * m[3][1] -
+            m[2][0] * m[1][1] * m[3][3] +
+            m[2][0] * m[1][3] * m[3][1] +
+            m[3][0] * m[1][1] * m[2][3] -
+            m[3][0] * m[1][3] * m[2][1],
+            // 2,1:
+           -m[0][0] * m[2][1] * m[3][3] +
+            m[0][0] * m[2][3] * m[3][1] +
+            m[2][0] * m[0][1] * m[3][3] -
+            m[2][0] * m[0][3] * m[3][1] -
+            m[3][0] * m[0][1] * m[2][3] +
+            m[3][0] * m[0][3] * m[2][1],
+            // 2,2:
+            m[0][0] * m[1][1] * m[3][3] -
+            m[0][0] * m[1][3] * m[3][1] -
+            m[1][0] * m[0][1] * m[3][3] +
+            m[1][0] * m[0][3] * m[3][1] +
+            m[3][0] * m[0][1] * m[1][3] -
+            m[3][0] * m[0][3] * m[1][1],
+            // 2,3:
+           -m[0][0] * m[1][1] * m[2][3] +
+            m[0][0] * m[1][3] * m[2][1] +
+            m[1][0] * m[0][1] * m[2][3] -
+            m[1][0] * m[0][3] * m[2][1] -
+            m[2][0] * m[0][1] * m[1][3] +
+            m[2][0] * m[0][3] * m[1][1],
+        ],
+        // row 3
+        [
+            // 3,0:
+           -m[1][0] * m[2][1] * m[3][2] +
+            m[1][0] * m[2][2] * m[3][1] +
+            m[2][0] * m[1][1] * m[3][2] -
+            m[2][0] * m[1][2] * m[3][1] -
+            m[3][0] * m[1][1] * m[2][2] +
+            m[3][0] * m[1][2] * m[2][1],
+            // 3,1:
+            m[0][0] * m[2][1] * m[3][2] -
+            m[0][0] * m[2][2] * m[3][1] -
+            m[2][0] * m[0][1] * m[3][2] +
+            m[2][0] * m[0][2] * m[3][1] +
+            m[3][0] * m[0][1] * m[2][2] -
+            m[3][0] * m[0][2] * m[2][1],
+            // 3,2:
+           -m[0][0] * m[1][1] * m[3][2] +
+            m[0][0] * m[1][2] * m[3][1] +
+            m[1][0] * m[0][1] * m[3][2] -
+            m[1][0] * m[0][2] * m[3][1] -
+            m[3][0] * m[0][1] * m[1][2] +
+            m[3][0] * m[0][2] * m[1][1],
+            // 3,3:
+            m[0][0] * m[1][1] * m[2][2] -
+            m[0][0] * m[1][2] * m[2][1] -
+            m[1][0] * m[0][1] * m[2][2] +
+            m[1][0] * m[0][2] * m[2][1] +
+            m[2][0] * m[0][1] * m[1][2] -
+            m[2][0] * m[0][2] * m[1][1],
+        ],
+    ];
+
+    let det = m[0][0] * inv[0][0] + m[0][1] * inv[1][0] + m[0][2] * inv[2][0] + m[0][3] * inv[3][0];
+    if det == 0. {
+        return None;
+    }
+
+    let det_inv = 1. / det;
+
+    for row in &mut inv {
+        for elem in row.iter_mut() {
+            *elem *= det_inv;
+        }
+    }
+
+    Some(Matrix4x4(inv))
+}
+
+pub fn simd_inv4x4(m: Matrix4x4) -> Option<Matrix4x4> {
+    let m = m.0;
+    let m_0 = f32x4::from_array(m[0]);
+    let m_1 = f32x4::from_array(m[1]);
+    let m_2 = f32x4::from_array(m[2]);
+    let m_3 = f32x4::from_array(m[3]);
+
+    const SHUFFLE01: [Which; 4] = [First(0), First(1), Second(0), Second(1)];
+    const SHUFFLE02: [Which; 4] = [First(0), First(2), Second(0), Second(2)];
+    const SHUFFLE13: [Which; 4] = [First(1), First(3), Second(1), Second(3)];
+    const SHUFFLE23: [Which; 4] = [First(2), First(3), Second(2), Second(3)];
+
+    let tmp = simd_swizzle!(m_0, m_1, SHUFFLE01);
+    let row1 = simd_swizzle!(m_2, m_3, SHUFFLE01);
+
+    let row0 = simd_swizzle!(tmp, row1, SHUFFLE02);
+    let row1 = simd_swizzle!(row1, tmp, SHUFFLE13);
+
+    let tmp = simd_swizzle!(m_0, m_1, SHUFFLE23);
+    let row3 = simd_swizzle!(m_2, m_3, SHUFFLE23);
+    let row2 = simd_swizzle!(tmp, row3, SHUFFLE02);
+    let row3 = simd_swizzle!(row3, tmp, SHUFFLE13);
+
+    let tmp = (row2 * row3).reverse().rotate_lanes_right::<2>();
+    let minor0 = row1 * tmp;
+    let minor1 = row0 * tmp;
+    let tmp = tmp.rotate_lanes_right::<2>();
+    let minor0 = (row1 * tmp) - minor0;
+    let minor1 = (row0 * tmp) - minor1;
+    let minor1 = minor1.rotate_lanes_right::<2>();
+
+    let tmp = (row1 * row2).reverse().rotate_lanes_right::<2>();
+    let minor0 = (row3 * tmp) + minor0;
+    let minor3 = row0 * tmp;
+    let tmp = tmp.rotate_lanes_right::<2>();
+
+    let minor0 = minor0 - row3 * tmp;
+    let minor3 = row0 * tmp - minor3;
+    let minor3 = minor3.rotate_lanes_right::<2>();
+
+    let tmp = (row3 * row1.rotate_lanes_right::<2>())
+        .reverse()
+        .rotate_lanes_right::<2>();
+    let row2 = row2.rotate_lanes_right::<2>();
+    let minor0 = row2 * tmp + minor0;
+    let minor2 = row0 * tmp;
+    let tmp = tmp.rotate_lanes_right::<2>();
+    let minor0 = minor0 - row2 * tmp;
+    let minor2 = row0 * tmp - minor2;
+    let minor2 = minor2.rotate_lanes_right::<2>();
+
+    let tmp = (row0 * row1).reverse().rotate_lanes_right::<2>();
+    let minor2 = minor2 + row3 * tmp;
+    let minor3 = row2 * tmp - minor3;
+    let tmp = tmp.rotate_lanes_right::<2>();
+    let minor2 = row3 * tmp - minor2;
+    let minor3 = minor3 - row2 * tmp;
+
+    let tmp = (row0 * row3).reverse().rotate_lanes_right::<2>();
+    let minor1 = minor1 - row2 * tmp;
+    let minor2 = row1 * tmp + minor2;
+    let tmp = tmp.rotate_lanes_right::<2>();
+    let minor1 = row2 * tmp + minor1;
+    let minor2 = minor2 - row1 * tmp;
+
+    let tmp = (row0 * row2).reverse().rotate_lanes_right::<2>();
+    let minor1 = row3 * tmp + minor1;
+    let minor3 = minor3 - row1 * tmp;
+    let tmp = tmp.rotate_lanes_right::<2>();
+    let minor1 = minor1 - row3 * tmp;
+    let minor3 = row1 * tmp + minor3;
+
+    let det = row0 * minor0;
+    let det = det.rotate_lanes_right::<2>() + det;
+    let det = det.reverse().rotate_lanes_right::<2>() + det;
+
+    if det.horizontal_sum() == 0. {
+        return None;
+    }
+    // calculate the reciprocal
+    let tmp = f32x4::splat(1.0) / det;
+    let det = tmp + tmp - det * tmp * tmp;
+
+    let res0 = minor0 * det;
+    let res1 = minor1 * det;
+    let res2 = minor2 * det;
+    let res3 = minor3 * det;
+
+    let mut m = m;
+
+    m[0] = res0.to_array();
+    m[1] = res1.to_array();
+    m[2] = res2.to_array();
+    m[3] = res3.to_array();
+
+    Some(Matrix4x4(m))
+}
+
+#[cfg(test)]
+#[rustfmt::skip]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test() {
+    let tests: &[(Matrix4x4, Option<Matrix4x4>)] = &[
+        // Identity:
+        (Matrix4x4([
+            [1., 0., 0., 0.],
+            [0., 1., 0., 0.],
+            [0., 0., 1., 0.],
+            [0., 0., 0., 1.],
+         ]),
+         Some(Matrix4x4([
+             [1., 0., 0., 0.],
+             [0., 1., 0., 0.],
+             [0., 0., 1., 0.],
+             [0., 0., 0., 1.],
+         ]))
+        ),
+        // None:
+        (Matrix4x4([
+            [1., 2., 3., 4.],
+            [12., 11., 10., 9.],
+            [5., 6., 7., 8.],
+            [16., 15., 14., 13.],
+        ]),
+         None
+        ),
+        // Other:
+        (Matrix4x4([
+            [1., 1., 1., 0.],
+            [0., 3., 1., 2.],
+            [2., 3., 1., 0.],
+            [1., 0., 2., 1.],
+        ]),
+         Some(Matrix4x4([
+             [-3., -0.5,   1.5,  1.0],
+             [ 1., 0.25, -0.25, -0.5],
+             [ 3., 0.25, -1.25, -0.5],
+             [-3., 0.0,    1.0,  1.0],
+         ]))
+        ),
+
+
+    ];
+
+        for &(input, output) in tests {
+            assert_eq!(scalar_inv4x4(input), output);
+            assert_eq!(simd_inv4x4(input), output);
+        }
+    }
+}
+
+fn main() {
+    // Empty main to make cargo happy
+}
--- a/library/portable-simd/crates/core_simd/examples/nbody.rs
+++ b/library/portable-simd/crates/core_simd/examples/nbody.rs
@ -0,0 +1,193 @@
+#![cfg_attr(feature = "std", feature(portable_simd))]
+
+/// Benchmarks game nbody code
+/// Taken from the `packed_simd` crate
+/// Run this benchmark with `cargo test --example nbody`
+#[cfg(feature = "std")]
+mod nbody {
+    use core_simd::*;
+
+    use std::f64::consts::PI;
+    const SOLAR_MASS: f64 = 4.0 * PI * PI;
+    const DAYS_PER_YEAR: f64 = 365.24;
+
+    #[derive(Debug, Clone, Copy)]
+    struct Body {
+        pub x: f64x4,
+        pub v: f64x4,
+        pub mass: f64,
+    }
+
+    const N_BODIES: usize = 5;
+    const BODIES: [Body; N_BODIES] = [
+        // sun:
+        Body {
+            x: f64x4::from_array([0., 0., 0., 0.]),
+            v: f64x4::from_array([0., 0., 0., 0.]),
+            mass: SOLAR_MASS,
+        },
+        // jupiter:
+        Body {
+            x: f64x4::from_array([
+                4.84143144246472090e+00,
+                -1.16032004402742839e+00,
+                -1.03622044471123109e-01,
+                0.,
+            ]),
+            v: f64x4::from_array([
+                1.66007664274403694e-03 * DAYS_PER_YEAR,
+                7.69901118419740425e-03 * DAYS_PER_YEAR,
+                -6.90460016972063023e-05 * DAYS_PER_YEAR,
+                0.,
+            ]),
+            mass: 9.54791938424326609e-04 * SOLAR_MASS,
+        },
+        // saturn:
+        Body {
+            x: f64x4::from_array([
+                8.34336671824457987e+00,
+                4.12479856412430479e+00,
+                -4.03523417114321381e-01,
+                0.,
+            ]),
+            v: f64x4::from_array([
+                -2.76742510726862411e-03 * DAYS_PER_YEAR,
+                4.99852801234917238e-03 * DAYS_PER_YEAR,
+                2.30417297573763929e-05 * DAYS_PER_YEAR,
+                0.,
+            ]),
+            mass: 2.85885980666130812e-04 * SOLAR_MASS,
+        },
+        // uranus:
+        Body {
+            x: f64x4::from_array([
+                1.28943695621391310e+01,
+                -1.51111514016986312e+01,
+                -2.23307578892655734e-01,
+                0.,
+            ]),
+            v: f64x4::from_array([
+                2.96460137564761618e-03 * DAYS_PER_YEAR,
+                2.37847173959480950e-03 * DAYS_PER_YEAR,
+                -2.96589568540237556e-05 * DAYS_PER_YEAR,
+                0.,
+            ]),
+            mass: 4.36624404335156298e-05 * SOLAR_MASS,
+        },
+        // neptune:
+        Body {
+            x: f64x4::from_array([
+                1.53796971148509165e+01,
+                -2.59193146099879641e+01,
+                1.79258772950371181e-01,
+                0.,
+            ]),
+            v: f64x4::from_array([
+                2.68067772490389322e-03 * DAYS_PER_YEAR,
+                1.62824170038242295e-03 * DAYS_PER_YEAR,
+                -9.51592254519715870e-05 * DAYS_PER_YEAR,
+                0.,
+            ]),
+            mass: 5.15138902046611451e-05 * SOLAR_MASS,
+        },
+    ];
+
+    fn offset_momentum(bodies: &mut [Body; N_BODIES]) {
+        let (sun, rest) = bodies.split_at_mut(1);
+        let sun = &mut sun[0];
+        for body in rest {
+            let m_ratio = body.mass / SOLAR_MASS;
+            sun.v -= body.v * m_ratio;
+        }
+    }
+
+    fn energy(bodies: &[Body; N_BODIES]) -> f64 {
+        let mut e = 0.;
+        for i in 0..N_BODIES {
+            let bi = &bodies[i];
+            e += bi.mass * (bi.v * bi.v).horizontal_sum() * 0.5;
+            for bj in bodies.iter().take(N_BODIES).skip(i + 1) {
+                let dx = bi.x - bj.x;
+                e -= bi.mass * bj.mass / (dx * dx).horizontal_sum().sqrt()
+            }
+        }
+        e
+    }
+
+    fn advance(bodies: &mut [Body; N_BODIES], dt: f64) {
+        const N: usize = N_BODIES * (N_BODIES - 1) / 2;
+
+        // compute distance between bodies:
+        let mut r = [f64x4::splat(0.); N];
+        {
+            let mut i = 0;
+            for j in 0..N_BODIES {
+                for k in j + 1..N_BODIES {
+                    r[i] = bodies[j].x - bodies[k].x;
+                    i += 1;
+                }
+            }
+        }
+
+        let mut mag = [0.0; N];
+        for i in (0..N).step_by(2) {
+            let d2s = f64x2::from_array([
+                (r[i] * r[i]).horizontal_sum(),
+                (r[i + 1] * r[i + 1]).horizontal_sum(),
+            ]);
+            let dmags = f64x2::splat(dt) / (d2s * d2s.sqrt());
+            mag[i] = dmags[0];
+            mag[i + 1] = dmags[1];
+        }
+
+        let mut i = 0;
+        for j in 0..N_BODIES {
+            for k in j + 1..N_BODIES {
+                let f = r[i] * mag[i];
+                bodies[j].v -= f * bodies[k].mass;
+                bodies[k].v += f * bodies[j].mass;
+                i += 1
+            }
+        }
+        for body in bodies {
+            body.x += dt * body.v
+        }
+    }
+
+    pub fn run(n: usize) -> (f64, f64) {
+        let mut bodies = BODIES;
+        offset_momentum(&mut bodies);
+        let energy_before = energy(&bodies);
+        for _ in 0..n {
+            advance(&mut bodies, 0.01);
+        }
+        let energy_after = energy(&bodies);
+
+        (energy_before, energy_after)
+    }
+}
+
+#[cfg(feature = "std")]
+#[cfg(test)]
+mod tests {
+    // Good enough for demonstration purposes, not going for strictness here.
+    fn approx_eq_f64(a: f64, b: f64) -> bool {
+        (a - b).abs() < 0.00001
+    }
+    #[test]
+    fn test() {
+        const OUTPUT: [f64; 2] = [-0.169075164, -0.169087605];
+        let (energy_before, energy_after) = super::nbody::run(1000);
+        assert!(approx_eq_f64(energy_before, OUTPUT[0]));
+        assert!(approx_eq_f64(energy_after, OUTPUT[1]));
+    }
+}
+
+fn main() {
+    #[cfg(feature = "std")]
+    {
+        let (energy_before, energy_after) = nbody::run(1000);
+        println!("Energy before: {}", energy_before);
+        println!("Energy after:  {}", energy_after);
+    }
+}
--- a/library/portable-simd/crates/core_simd/src/comparisons.rs
+++ b/library/portable-simd/crates/core_simd/src/comparisons.rs
@ -0,0 +1,50 @@
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Mask, Simd, SimdElement, SupportedLaneCount};
+
+impl<T, const LANES: usize> Simd<T, LANES>
+where
+    T: SimdElement + PartialEq,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    /// Test if each lane is equal to the corresponding lane in `other`.
+    #[inline]
+    pub fn lanes_eq(self, other: Self) -> Mask<T::Mask, LANES> {
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) }
+    }
+
+    /// Test if each lane is not equal to the corresponding lane in `other`.
+    #[inline]
+    pub fn lanes_ne(self, other: Self) -> Mask<T::Mask, LANES> {
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) }
+    }
+}
+
+impl<T, const LANES: usize> Simd<T, LANES>
+where
+    T: SimdElement + PartialOrd,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    /// Test if each lane is less than the corresponding lane in `other`.
+    #[inline]
+    pub fn lanes_lt(self, other: Self) -> Mask<T::Mask, LANES> {
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) }
+    }
+
+    /// Test if each lane is greater than the corresponding lane in `other`.
+    #[inline]
+    pub fn lanes_gt(self, other: Self) -> Mask<T::Mask, LANES> {
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) }
+    }
+
+    /// Test if each lane is less than or equal to the corresponding lane in `other`.
+    #[inline]
+    pub fn lanes_le(self, other: Self) -> Mask<T::Mask, LANES> {
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) }
+    }
+
+    /// Test if each lane is greater than or equal to the corresponding lane in `other`.
+    #[inline]
+    pub fn lanes_ge(self, other: Self) -> Mask<T::Mask, LANES> {
+        unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) }
+    }
+}
--- a/library/portable-simd/crates/core_simd/src/core_simd_docs.md
+++ b/library/portable-simd/crates/core_simd/src/core_simd_docs.md
@ -0,0 +1,4 @@
+Portable SIMD module.
+
+This module offers a portable abstraction for SIMD operations
+that is not bound to any particular hardware architecture.
--- a/library/portable-simd/crates/core_simd/src/fmt.rs
+++ b/library/portable-simd/crates/core_simd/src/fmt.rs
@ -0,0 +1,39 @@
+use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+use core::fmt;
+
+macro_rules! impl_fmt_trait {
+    { $($trait:ident,)* } => {
+        $(
+            impl<T, const LANES: usize> fmt::$trait for Simd<T, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+                T: SimdElement + fmt::$trait,
+            {
+                fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                    #[repr(transparent)]
+                    struct Wrapper<'a, T: fmt::$trait>(&'a T);
+
+                    impl<T: fmt::$trait> fmt::Debug for Wrapper<'_, T> {
+                        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                            self.0.fmt(f)
+                        }
+                    }
+
+                    f.debug_list()
+                        .entries(self.as_array().iter().map(|x| Wrapper(x)))
+                        .finish()
+                }
+            }
+        )*
+    }
+}
+
+impl_fmt_trait! {
+    Debug,
+    Binary,
+    LowerExp,
+    UpperExp,
+    Octal,
+    LowerHex,
+    UpperHex,
+}
--- a/library/portable-simd/crates/core_simd/src/intrinsics.rs
+++ b/library/portable-simd/crates/core_simd/src/intrinsics.rs
@ -0,0 +1,115 @@
+//! This module contains the LLVM intrinsics bindings that provide the functionality for this
+//! crate.
+//!
+//! The LLVM assembly language is documented here: <https://llvm.org/docs/LangRef.html>
+
+/// These intrinsics aren't linked directly from LLVM and are mostly undocumented, however they are
+/// simply lowered to the matching LLVM instructions by the compiler.  The associated instruction
+/// is documented alongside each intrinsic.
+extern "platform-intrinsic" {
+    /// add/fadd
+    pub(crate) fn simd_add<T>(x: T, y: T) -> T;
+
+    /// sub/fsub
+    pub(crate) fn simd_sub<T>(x: T, y: T) -> T;
+
+    /// mul/fmul
+    pub(crate) fn simd_mul<T>(x: T, y: T) -> T;
+
+    /// udiv/sdiv/fdiv
+    pub(crate) fn simd_div<T>(x: T, y: T) -> T;
+
+    /// urem/srem/frem
+    pub(crate) fn simd_rem<T>(x: T, y: T) -> T;
+
+    /// shl
+    pub(crate) fn simd_shl<T>(x: T, y: T) -> T;
+
+    /// lshr/ashr
+    pub(crate) fn simd_shr<T>(x: T, y: T) -> T;
+
+    /// and
+    pub(crate) fn simd_and<T>(x: T, y: T) -> T;
+
+    /// or
+    pub(crate) fn simd_or<T>(x: T, y: T) -> T;
+
+    /// xor
+    pub(crate) fn simd_xor<T>(x: T, y: T) -> T;
+
+    /// fptoui/fptosi/uitofp/sitofp
+    pub(crate) fn simd_cast<T, U>(x: T) -> U;
+
+    /// neg/fneg
+    pub(crate) fn simd_neg<T>(x: T) -> T;
+
+    /// fabs
+    pub(crate) fn simd_fabs<T>(x: T) -> T;
+
+    pub(crate) fn simd_eq<T, U>(x: T, y: T) -> U;
+    pub(crate) fn simd_ne<T, U>(x: T, y: T) -> U;
+    pub(crate) fn simd_lt<T, U>(x: T, y: T) -> U;
+    pub(crate) fn simd_le<T, U>(x: T, y: T) -> U;
+    pub(crate) fn simd_gt<T, U>(x: T, y: T) -> U;
+    pub(crate) fn simd_ge<T, U>(x: T, y: T) -> U;
+
+    // shufflevector
+    pub(crate) fn simd_shuffle<T, U, V>(x: T, y: T, idx: U) -> V;
+
+    pub(crate) fn simd_gather<T, U, V>(val: T, ptr: U, mask: V) -> T;
+    pub(crate) fn simd_scatter<T, U, V>(val: T, ptr: U, mask: V);
+
+    // {s,u}add.sat
+    pub(crate) fn simd_saturating_add<T>(x: T, y: T) -> T;
+
+    // {s,u}sub.sat
+    pub(crate) fn simd_saturating_sub<T>(x: T, y: T) -> T;
+
+    // reductions
+    pub(crate) fn simd_reduce_add_ordered<T, U>(x: T, y: U) -> U;
+    pub(crate) fn simd_reduce_mul_ordered<T, U>(x: T, y: U) -> U;
+    #[allow(unused)]
+    pub(crate) fn simd_reduce_all<T>(x: T) -> bool;
+    #[allow(unused)]
+    pub(crate) fn simd_reduce_any<T>(x: T) -> bool;
+    pub(crate) fn simd_reduce_max<T, U>(x: T) -> U;
+    pub(crate) fn simd_reduce_min<T, U>(x: T) -> U;
+    pub(crate) fn simd_reduce_and<T, U>(x: T) -> U;
+    pub(crate) fn simd_reduce_or<T, U>(x: T) -> U;
+    pub(crate) fn simd_reduce_xor<T, U>(x: T) -> U;
+
+    // truncate integer vector to bitmask
+    #[allow(unused)]
+    pub(crate) fn simd_bitmask<T, U>(x: T) -> U;
+
+    // select
+    pub(crate) fn simd_select<M, T>(m: M, a: T, b: T) -> T;
+    #[allow(unused)]
+    pub(crate) fn simd_select_bitmask<M, T>(m: M, a: T, b: T) -> T;
+}
+
+#[cfg(feature = "std")]
+mod std {
+    extern "platform-intrinsic" {
+        // ceil
+        pub(crate) fn simd_ceil<T>(x: T) -> T;
+
+        // floor
+        pub(crate) fn simd_floor<T>(x: T) -> T;
+
+        // round
+        pub(crate) fn simd_round<T>(x: T) -> T;
+
+        // trunc
+        pub(crate) fn simd_trunc<T>(x: T) -> T;
+
+        // fsqrt
+        pub(crate) fn simd_fsqrt<T>(x: T) -> T;
+
+        // fma
+        pub(crate) fn simd_fma<T>(x: T, y: T, z: T) -> T;
+    }
+}
+
+#[cfg(feature = "std")]
+pub(crate) use crate::simd::intrinsics::std::*;
--- a/library/portable-simd/crates/core_simd/src/iter.rs
+++ b/library/portable-simd/crates/core_simd/src/iter.rs
@ -0,0 +1,58 @@
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+use core::{
+    iter::{Product, Sum},
+    ops::{Add, Mul},
+};
+
+macro_rules! impl_traits {
+    { $type:ty } => {
+        impl<const LANES: usize> Sum<Self> for Simd<$type, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
+                iter.fold(Simd::splat(0 as $type), Add::add)
+            }
+        }
+
+        impl<const LANES: usize> Product<Self> for Simd<$type, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            fn product<I: Iterator<Item = Self>>(iter: I) -> Self {
+                iter.fold(Simd::splat(1 as $type), Mul::mul)
+            }
+        }
+
+        impl<'a, const LANES: usize> Sum<&'a Self> for Simd<$type, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            fn sum<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
+                iter.fold(Simd::splat(0 as $type), Add::add)
+            }
+        }
+
+        impl<'a, const LANES: usize> Product<&'a Self> for Simd<$type, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            fn product<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
+                iter.fold(Simd::splat(1 as $type), Mul::mul)
+            }
+        }
+    }
+}
+
+impl_traits! { f32 }
+impl_traits! { f64 }
+impl_traits! { u8 }
+impl_traits! { u16 }
+impl_traits! { u32 }
+impl_traits! { u64 }
+impl_traits! { usize }
+impl_traits! { i8 }
+impl_traits! { i16 }
+impl_traits! { i32 }
+impl_traits! { i64 }
+impl_traits! { isize }
--- a/library/portable-simd/crates/core_simd/src/lane_count.rs
+++ b/library/portable-simd/crates/core_simd/src/lane_count.rs
@ -0,0 +1,48 @@
+mod sealed {
+    pub trait Sealed {}
+}
+use sealed::Sealed;
+
+/// A type representing a vector lane count.
+pub struct LaneCount<const LANES: usize>;
+
+impl<const LANES: usize> LaneCount<LANES> {
+    /// The number of bytes in a bitmask with this many lanes.
+    pub const BITMASK_LEN: usize = (LANES + 7) / 8;
+}
+
+/// Helper trait for vector lane counts.
+pub trait SupportedLaneCount: Sealed {
+    #[doc(hidden)]
+    type BitMask: Copy + Default + AsRef<[u8]> + AsMut<[u8]>;
+
+    #[doc(hidden)]
+    type IntBitMask;
+}
+
+impl<const LANES: usize> Sealed for LaneCount<LANES> {}
+
+impl SupportedLaneCount for LaneCount<1> {
+    type BitMask = [u8; 1];
+    type IntBitMask = u8;
+}
+impl SupportedLaneCount for LaneCount<2> {
+    type BitMask = [u8; 1];
+    type IntBitMask = u8;
+}
+impl SupportedLaneCount for LaneCount<4> {
+    type BitMask = [u8; 1];
+    type IntBitMask = u8;
+}
+impl SupportedLaneCount for LaneCount<8> {
+    type BitMask = [u8; 1];
+    type IntBitMask = u8;
+}
+impl SupportedLaneCount for LaneCount<16> {
+    type BitMask = [u8; 2];
+    type IntBitMask = u16;
+}
+impl SupportedLaneCount for LaneCount<32> {
+    type BitMask = [u8; 4];
+    type IntBitMask = u32;
+}
--- a/library/portable-simd/crates/core_simd/src/lib.rs
+++ b/library/portable-simd/crates/core_simd/src/lib.rs
@ -0,0 +1,21 @@
+#![cfg_attr(not(feature = "std"), no_std)]
+#![feature(
+    const_fn_trait_bound,
+    decl_macro,
+    platform_intrinsics,
+    repr_simd,
+    simd_ffi,
+    staged_api,
+    stdsimd
+)]
+#![cfg_attr(feature = "generic_const_exprs", feature(generic_const_exprs))]
+#![cfg_attr(feature = "generic_const_exprs", allow(incomplete_features))]
+#![warn(missing_docs)]
+#![deny(unsafe_op_in_unsafe_fn)]
+#![unstable(feature = "portable_simd", issue = "86656")]
+//! Portable SIMD module.
+
+#[path = "mod.rs"]
+mod core_simd;
+pub use self::core_simd::simd;
+pub use simd::*;
--- a/library/portable-simd/crates/core_simd/src/masks.rs
+++ b/library/portable-simd/crates/core_simd/src/masks.rs
@ -0,0 +1,545 @@
+//! Types and traits associated with masking lanes of vectors.
+//! Types representing
+#![allow(non_camel_case_types)]
+
+#[cfg_attr(
+    not(all(target_arch = "x86_64", target_feature = "avx512f")),
+    path = "masks/full_masks.rs"
+)]
+#[cfg_attr(
+    all(target_arch = "x86_64", target_feature = "avx512f"),
+    path = "masks/bitmask.rs"
+)]
+mod mask_impl;
+
+use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+use core::cmp::Ordering;
+use core::fmt;
+
+mod sealed {
+    use super::*;
+
+    /// Not only does this seal the `MaskElement` trait, but these functions prevent other traits
+    /// from bleeding into the parent bounds.
+    ///
+    /// For example, `eq` could be provided by requiring `MaskElement: PartialEq`, but that would
+    /// prevent us from ever removing that bound, or from implementing `MaskElement` on
+    /// non-`PartialEq` types in the future.
+    pub trait Sealed {
+        fn valid<const LANES: usize>(values: Simd<Self, LANES>) -> bool
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+            Self: SimdElement;
+
+        fn eq(self, other: Self) -> bool;
+
+        const TRUE: Self;
+
+        const FALSE: Self;
+    }
+}
+use sealed::Sealed;
+
+/// Marker trait for types that may be used as SIMD mask elements.
+pub unsafe trait MaskElement: SimdElement + Sealed {}
+
+macro_rules! impl_element {
+    { $ty:ty } => {
+        impl Sealed for $ty {
+            fn valid<const LANES: usize>(value: Simd<Self, LANES>) -> bool
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                (value.lanes_eq(Simd::splat(0)) | value.lanes_eq(Simd::splat(-1))).all()
+            }
+
+            fn eq(self, other: Self) -> bool { self == other }
+
+            const TRUE: Self = -1;
+            const FALSE: Self = 0;
+        }
+
+        unsafe impl MaskElement for $ty {}
+    }
+}
+
+impl_element! { i8 }
+impl_element! { i16 }
+impl_element! { i32 }
+impl_element! { i64 }
+impl_element! { isize }
+
+/// A SIMD vector mask for `LANES` elements of width specified by `Element`.
+///
+/// The layout of this type is unspecified.
+#[repr(transparent)]
+pub struct Mask<T, const LANES: usize>(mask_impl::Mask<T, LANES>)
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount;
+
+impl<T, const LANES: usize> Copy for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+}
+
+impl<T, const LANES: usize> Clone for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T, const LANES: usize> Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    /// Construct a mask by setting all lanes to the given value.
+    pub fn splat(value: bool) -> Self {
+        Self(mask_impl::Mask::splat(value))
+    }
+
+    /// Converts an array to a SIMD vector.
+    pub fn from_array(array: [bool; LANES]) -> Self {
+        let mut vector = Self::splat(false);
+        for (i, v) in array.iter().enumerate() {
+            vector.set(i, *v);
+        }
+        vector
+    }
+
+    /// Converts a SIMD vector to an array.
+    pub fn to_array(self) -> [bool; LANES] {
+        let mut array = [false; LANES];
+        for (i, v) in array.iter_mut().enumerate() {
+            *v = self.test(i);
+        }
+        array
+    }
+
+    /// Converts a vector of integers to a mask, where 0 represents `false` and -1
+    /// represents `true`.
+    ///
+    /// # Safety
+    /// All lanes must be either 0 or -1.
+    #[inline]
+    pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
+        unsafe { Self(mask_impl::Mask::from_int_unchecked(value)) }
+    }
+
+    /// Converts a vector of integers to a mask, where 0 represents `false` and -1
+    /// represents `true`.
+    ///
+    /// # Panics
+    /// Panics if any lane is not 0 or -1.
+    #[inline]
+    pub fn from_int(value: Simd<T, LANES>) -> Self {
+        assert!(T::valid(value), "all values must be either 0 or -1",);
+        unsafe { Self::from_int_unchecked(value) }
+    }
+
+    /// Converts the mask to a vector of integers, where 0 represents `false` and -1
+    /// represents `true`.
+    #[inline]
+    pub fn to_int(self) -> Simd<T, LANES> {
+        self.0.to_int()
+    }
+
+    /// Tests the value of the specified lane.
+    ///
+    /// # Safety
+    /// `lane` must be less than `LANES`.
+    #[inline]
+    pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
+        unsafe { self.0.test_unchecked(lane) }
+    }
+
+    /// Tests the value of the specified lane.
+    ///
+    /// # Panics
+    /// Panics if `lane` is greater than or equal to the number of lanes in the vector.
+    #[inline]
+    pub fn test(&self, lane: usize) -> bool {
+        assert!(lane < LANES, "lane index out of range");
+        unsafe { self.test_unchecked(lane) }
+    }
+
+    /// Sets the value of the specified lane.
+    ///
+    /// # Safety
+    /// `lane` must be less than `LANES`.
+    #[inline]
+    pub unsafe fn set_unchecked(&mut self, lane: usize, value: bool) {
+        unsafe {
+            self.0.set_unchecked(lane, value);
+        }
+    }
+
+    /// Sets the value of the specified lane.
+    ///
+    /// # Panics
+    /// Panics if `lane` is greater than or equal to the number of lanes in the vector.
+    #[inline]
+    pub fn set(&mut self, lane: usize, value: bool) {
+        assert!(lane < LANES, "lane index out of range");
+        unsafe {
+            self.set_unchecked(lane, value);
+        }
+    }
+
+    /// Convert this mask to a bitmask, with one bit set per lane.
+    #[cfg(feature = "generic_const_exprs")]
+    pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
+        self.0.to_bitmask()
+    }
+
+    /// Convert a bitmask to a mask.
+    #[cfg(feature = "generic_const_exprs")]
+    pub fn from_bitmask(bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN]) -> Self {
+        Self(mask_impl::Mask::from_bitmask(bitmask))
+    }
+
+    /// Returns true if any lane is set, or false otherwise.
+    #[inline]
+    pub fn any(self) -> bool {
+        self.0.any()
+    }
+
+    /// Returns true if all lanes are set, or false otherwise.
+    #[inline]
+    pub fn all(self) -> bool {
+        self.0.all()
+    }
+}
+
+// vector/array conversion
+impl<T, const LANES: usize> From<[bool; LANES]> for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn from(array: [bool; LANES]) -> Self {
+        Self::from_array(array)
+    }
+}
+
+impl<T, const LANES: usize> From<Mask<T, LANES>> for [bool; LANES]
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn from(vector: Mask<T, LANES>) -> Self {
+        vector.to_array()
+    }
+}
+
+impl<T, const LANES: usize> Default for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn default() -> Self {
+        Self::splat(false)
+    }
+}
+
+impl<T, const LANES: usize> PartialEq for Mask<T, LANES>
+where
+    T: MaskElement + PartialEq,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn eq(&self, other: &Self) -> bool {
+        self.0 == other.0
+    }
+}
+
+impl<T, const LANES: usize> PartialOrd for Mask<T, LANES>
+where
+    T: MaskElement + PartialOrd,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        self.0.partial_cmp(&other.0)
+    }
+}
+
+impl<T, const LANES: usize> fmt::Debug for Mask<T, LANES>
+where
+    T: MaskElement + fmt::Debug,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_list()
+            .entries((0..LANES).map(|lane| self.test(lane)))
+            .finish()
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitAnd for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitand(self, rhs: Self) -> Self {
+        Self(self.0 & rhs.0)
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitAnd<bool> for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitand(self, rhs: bool) -> Self {
+        self & Self::splat(rhs)
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitAnd<Mask<T, LANES>> for bool
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Mask<T, LANES>;
+    #[inline]
+    fn bitand(self, rhs: Mask<T, LANES>) -> Mask<T, LANES> {
+        Mask::splat(self) & rhs
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitOr for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitor(self, rhs: Self) -> Self {
+        Self(self.0 | rhs.0)
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitOr<bool> for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitor(self, rhs: bool) -> Self {
+        self | Self::splat(rhs)
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitOr<Mask<T, LANES>> for bool
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Mask<T, LANES>;
+    #[inline]
+    fn bitor(self, rhs: Mask<T, LANES>) -> Mask<T, LANES> {
+        Mask::splat(self) | rhs
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitXor for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitxor(self, rhs: Self) -> Self::Output {
+        Self(self.0 ^ rhs.0)
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitXor<bool> for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitxor(self, rhs: bool) -> Self::Output {
+        self ^ Self::splat(rhs)
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitXor<Mask<T, LANES>> for bool
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Mask<T, LANES>;
+    #[inline]
+    fn bitxor(self, rhs: Mask<T, LANES>) -> Self::Output {
+        Mask::splat(self) ^ rhs
+    }
+}
+
+impl<T, const LANES: usize> core::ops::Not for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Mask<T, LANES>;
+    #[inline]
+    fn not(self) -> Self::Output {
+        Self(!self.0)
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitAndAssign for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn bitand_assign(&mut self, rhs: Self) {
+        self.0 = self.0 & rhs.0;
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitAndAssign<bool> for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn bitand_assign(&mut self, rhs: bool) {
+        *self &= Self::splat(rhs);
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitOrAssign for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn bitor_assign(&mut self, rhs: Self) {
+        self.0 = self.0 | rhs.0;
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitOrAssign<bool> for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn bitor_assign(&mut self, rhs: bool) {
+        *self |= Self::splat(rhs);
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitXorAssign for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn bitxor_assign(&mut self, rhs: Self) {
+        self.0 = self.0 ^ rhs.0;
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitXorAssign<bool> for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn bitxor_assign(&mut self, rhs: bool) {
+        *self ^= Self::splat(rhs);
+    }
+}
+
+/// Vector of eight 8-bit masks
+pub type mask8x8 = Mask<i8, 8>;
+
+/// Vector of 16 8-bit masks
+pub type mask8x16 = Mask<i8, 16>;
+
+/// Vector of 32 8-bit masks
+pub type mask8x32 = Mask<i8, 32>;
+
+/// Vector of 16 8-bit masks
+pub type mask8x64 = Mask<i8, 64>;
+
+/// Vector of four 16-bit masks
+pub type mask16x4 = Mask<i16, 4>;
+
+/// Vector of eight 16-bit masks
+pub type mask16x8 = Mask<i16, 8>;
+
+/// Vector of 16 16-bit masks
+pub type mask16x16 = Mask<i16, 16>;
+
+/// Vector of 32 16-bit masks
+pub type mask16x32 = Mask<i32, 32>;
+
+/// Vector of two 32-bit masks
+pub type mask32x2 = Mask<i32, 2>;
+
+/// Vector of four 32-bit masks
+pub type mask32x4 = Mask<i32, 4>;
+
+/// Vector of eight 32-bit masks
+pub type mask32x8 = Mask<i32, 8>;
+
+/// Vector of 16 32-bit masks
+pub type mask32x16 = Mask<i32, 16>;
+
+/// Vector of two 64-bit masks
+pub type mask64x2 = Mask<i64, 2>;
+
+/// Vector of four 64-bit masks
+pub type mask64x4 = Mask<i64, 4>;
+
+/// Vector of eight 64-bit masks
+pub type mask64x8 = Mask<i64, 8>;
+
+/// Vector of two pointer-width masks
+pub type masksizex2 = Mask<isize, 2>;
+
+/// Vector of four pointer-width masks
+pub type masksizex4 = Mask<isize, 4>;
+
+/// Vector of eight pointer-width masks
+pub type masksizex8 = Mask<isize, 8>;
+
+macro_rules! impl_from {
+    { $from:ty  => $($to:ty),* } => {
+        $(
+        impl<const LANES: usize> From<Mask<$from, LANES>> for Mask<$to, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            fn from(value: Mask<$from, LANES>) -> Self {
+                Self(value.0.convert())
+            }
+        }
+        )*
+    }
+}
+impl_from! { i8 => i16, i32, i64, isize }
+impl_from! { i16 => i32, i64, isize, i8 }
+impl_from! { i32 => i64, isize, i8, i16 }
+impl_from! { i64 => isize, i8, i16, i32 }
+impl_from! { isize => i8, i16, i32, i64 }
--- a/library/portable-simd/crates/core_simd/src/masks/bitmask.rs
+++ b/library/portable-simd/crates/core_simd/src/masks/bitmask.rs
@ -0,0 +1,220 @@
+use super::MaskElement;
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+use core::marker::PhantomData;
+
+/// A mask where each lane is represented by a single bit.
+#[repr(transparent)]
+pub struct Mask<T, const LANES: usize>(
+    <LaneCount<LANES> as SupportedLaneCount>::BitMask,
+    PhantomData<T>,
+)
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount;
+
+impl<T, const LANES: usize> Copy for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+}
+
+impl<T, const LANES: usize> Clone for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T, const LANES: usize> PartialEq for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn eq(&self, other: &Self) -> bool {
+        self.0.as_ref() == other.0.as_ref()
+    }
+}
+
+impl<T, const LANES: usize> PartialOrd for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        self.0.as_ref().partial_cmp(other.0.as_ref())
+    }
+}
+
+impl<T, const LANES: usize> Eq for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+}
+
+impl<T, const LANES: usize> Ord for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        self.0.as_ref().cmp(other.0.as_ref())
+    }
+}
+
+impl<T, const LANES: usize> Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    pub fn splat(value: bool) -> Self {
+        let mut mask = <LaneCount<LANES> as SupportedLaneCount>::BitMask::default();
+        if value {
+            mask.as_mut().fill(u8::MAX)
+        } else {
+            mask.as_mut().fill(u8::MIN)
+        }
+        if LANES % 8 > 0 {
+            *mask.as_mut().last_mut().unwrap() &= u8::MAX >> (8 - LANES % 8);
+        }
+        Self(mask, PhantomData)
+    }
+
+    #[inline]
+    pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
+        (self.0.as_ref()[lane / 8] >> (lane % 8)) & 0x1 > 0
+    }
+
+    #[inline]
+    pub unsafe fn set_unchecked(&mut self, lane: usize, value: bool) {
+        unsafe {
+            self.0.as_mut()[lane / 8] ^= ((value ^ self.test_unchecked(lane)) as u8) << (lane % 8)
+        }
+    }
+
+    #[inline]
+    pub fn to_int(self) -> Simd<T, LANES> {
+        unsafe {
+            let mask: <LaneCount<LANES> as SupportedLaneCount>::IntBitMask =
+                core::mem::transmute_copy(&self);
+            intrinsics::simd_select_bitmask(mask, Simd::splat(T::TRUE), Simd::splat(T::FALSE))
+        }
+    }
+
+    #[inline]
+    pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
+        // TODO remove the transmute when rustc is more flexible
+        assert_eq!(
+            core::mem::size_of::<<LaneCount::<LANES> as SupportedLaneCount>::BitMask>(),
+            core::mem::size_of::<<LaneCount::<LANES> as SupportedLaneCount>::IntBitMask>(),
+        );
+        unsafe {
+            let mask: <LaneCount<LANES> as SupportedLaneCount>::IntBitMask =
+                intrinsics::simd_bitmask(value);
+            Self(core::mem::transmute_copy(&mask), PhantomData)
+        }
+    }
+
+    #[cfg(feature = "generic_const_exprs")]
+    #[inline]
+    pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
+        // Safety: these are the same type and we are laundering the generic
+        unsafe { core::mem::transmute_copy(&self.0) }
+    }
+
+    #[cfg(feature = "generic_const_exprs")]
+    #[inline]
+    pub fn from_bitmask(bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN]) -> Self {
+        // Safety: these are the same type and we are laundering the generic
+        Self(unsafe { core::mem::transmute_copy(&bitmask) }, PhantomData)
+    }
+
+    #[inline]
+    pub fn convert<U>(self) -> Mask<U, LANES>
+    where
+        U: MaskElement,
+    {
+        unsafe { core::mem::transmute_copy(&self) }
+    }
+
+    #[inline]
+    pub fn any(self) -> bool {
+        self != Self::splat(false)
+    }
+
+    #[inline]
+    pub fn all(self) -> bool {
+        self == Self::splat(true)
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitAnd for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+    <LaneCount<LANES> as SupportedLaneCount>::BitMask: AsRef<[u8]> + AsMut<[u8]>,
+{
+    type Output = Self;
+    #[inline]
+    fn bitand(mut self, rhs: Self) -> Self {
+        for (l, r) in self.0.as_mut().iter_mut().zip(rhs.0.as_ref().iter()) {
+            *l &= r;
+        }
+        self
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitOr for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+    <LaneCount<LANES> as SupportedLaneCount>::BitMask: AsRef<[u8]> + AsMut<[u8]>,
+{
+    type Output = Self;
+    #[inline]
+    fn bitor(mut self, rhs: Self) -> Self {
+        for (l, r) in self.0.as_mut().iter_mut().zip(rhs.0.as_ref().iter()) {
+            *l |= r;
+        }
+        self
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitXor for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitxor(mut self, rhs: Self) -> Self::Output {
+        for (l, r) in self.0.as_mut().iter_mut().zip(rhs.0.as_ref().iter()) {
+            *l ^= r;
+        }
+        self
+    }
+}
+
+impl<T, const LANES: usize> core::ops::Not for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn not(mut self) -> Self::Output {
+        for x in self.0.as_mut() {
+            *x = !*x;
+        }
+        if LANES % 8 > 0 {
+            *self.0.as_mut().last_mut().unwrap() &= u8::MAX >> (8 - LANES % 8);
+        }
+        self
+    }
+}
--- a/library/portable-simd/crates/core_simd/src/masks/full_masks.rs
+++ b/library/portable-simd/crates/core_simd/src/masks/full_masks.rs
@ -0,0 +1,228 @@
+//! Masks that take up full SIMD vector registers.
+
+use super::MaskElement;
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+
+#[repr(transparent)]
+pub struct Mask<T, const LANES: usize>(Simd<T, LANES>)
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount;
+
+impl<T, const LANES: usize> Copy for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+}
+
+impl<T, const LANES: usize> Clone for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T, const LANES: usize> PartialEq for Mask<T, LANES>
+where
+    T: MaskElement + PartialEq,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn eq(&self, other: &Self) -> bool {
+        self.0.eq(&other.0)
+    }
+}
+
+impl<T, const LANES: usize> PartialOrd for Mask<T, LANES>
+where
+    T: MaskElement + PartialOrd,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        self.0.partial_cmp(&other.0)
+    }
+}
+
+impl<T, const LANES: usize> Eq for Mask<T, LANES>
+where
+    T: MaskElement + Eq,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+}
+
+impl<T, const LANES: usize> Ord for Mask<T, LANES>
+where
+    T: MaskElement + Ord,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        self.0.cmp(&other.0)
+    }
+}
+
+impl<T, const LANES: usize> Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    pub fn splat(value: bool) -> Self {
+        Self(Simd::splat(if value { T::TRUE } else { T::FALSE }))
+    }
+
+    #[inline]
+    pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
+        T::eq(self.0[lane], T::TRUE)
+    }
+
+    #[inline]
+    pub unsafe fn set_unchecked(&mut self, lane: usize, value: bool) {
+        self.0[lane] = if value { T::TRUE } else { T::FALSE }
+    }
+
+    #[inline]
+    pub fn to_int(self) -> Simd<T, LANES> {
+        self.0
+    }
+
+    #[inline]
+    pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
+        Self(value)
+    }
+
+    #[inline]
+    pub fn convert<U>(self) -> Mask<U, LANES>
+    where
+        U: MaskElement,
+    {
+        unsafe { Mask(intrinsics::simd_cast(self.0)) }
+    }
+
+    #[cfg(feature = "generic_const_exprs")]
+    #[inline]
+    pub fn to_bitmask(self) -> [u8; LaneCount::<LANES>::BITMASK_LEN] {
+        unsafe {
+            // TODO remove the transmute when rustc can use arrays of u8 as bitmasks
+            assert_eq!(
+                core::mem::size_of::<<LaneCount::<LANES> as SupportedLaneCount>::IntBitMask>(),
+                LaneCount::<LANES>::BITMASK_LEN,
+            );
+            let bitmask: <LaneCount<LANES> as SupportedLaneCount>::IntBitMask =
+                intrinsics::simd_bitmask(self.0);
+            let mut bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN] =
+                core::mem::transmute_copy(&bitmask);
+
+            // There is a bug where LLVM appears to implement this operation with the wrong
+            // bit order.
+            // TODO fix this in a better way
+            if cfg!(target_endian = "big") {
+                for x in bitmask.as_mut() {
+                    *x = x.reverse_bits();
+                }
+            }
+
+            bitmask
+        }
+    }
+
+    #[cfg(feature = "generic_const_exprs")]
+    #[inline]
+    pub fn from_bitmask(mut bitmask: [u8; LaneCount::<LANES>::BITMASK_LEN]) -> Self {
+        unsafe {
+            // There is a bug where LLVM appears to implement this operation with the wrong
+            // bit order.
+            // TODO fix this in a better way
+            if cfg!(target_endian = "big") {
+                for x in bitmask.as_mut() {
+                    *x = x.reverse_bits();
+                }
+            }
+
+            // TODO remove the transmute when rustc can use arrays of u8 as bitmasks
+            assert_eq!(
+                core::mem::size_of::<<LaneCount::<LANES> as SupportedLaneCount>::IntBitMask>(),
+                LaneCount::<LANES>::BITMASK_LEN,
+            );
+            let bitmask: <LaneCount<LANES> as SupportedLaneCount>::IntBitMask =
+                core::mem::transmute_copy(&bitmask);
+
+            Self::from_int_unchecked(intrinsics::simd_select_bitmask(
+                bitmask,
+                Self::splat(true).to_int(),
+                Self::splat(false).to_int(),
+            ))
+        }
+    }
+
+    #[inline]
+    pub fn any(self) -> bool {
+        unsafe { intrinsics::simd_reduce_any(self.to_int()) }
+    }
+
+    #[inline]
+    pub fn all(self) -> bool {
+        unsafe { intrinsics::simd_reduce_all(self.to_int()) }
+    }
+}
+
+impl<T, const LANES: usize> core::convert::From<Mask<T, LANES>> for Simd<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn from(value: Mask<T, LANES>) -> Self {
+        value.0
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitAnd for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitand(self, rhs: Self) -> Self {
+        unsafe { Self(intrinsics::simd_and(self.0, rhs.0)) }
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitOr for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitor(self, rhs: Self) -> Self {
+        unsafe { Self(intrinsics::simd_or(self.0, rhs.0)) }
+    }
+}
+
+impl<T, const LANES: usize> core::ops::BitXor for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn bitxor(self, rhs: Self) -> Self {
+        unsafe { Self(intrinsics::simd_xor(self.0, rhs.0)) }
+    }
+}
+
+impl<T, const LANES: usize> core::ops::Not for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    type Output = Self;
+    #[inline]
+    fn not(self) -> Self::Output {
+        Self::splat(true) ^ self
+    }
+}
--- a/library/portable-simd/crates/core_simd/src/math.rs
+++ b/library/portable-simd/crates/core_simd/src/math.rs
@ -0,0 +1,159 @@
+use crate::simd::intrinsics::{simd_saturating_add, simd_saturating_sub};
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+
+macro_rules! impl_uint_arith {
+    ($($ty:ty),+) => {
+        $( impl<const LANES: usize> Simd<$ty, LANES> where LaneCount<LANES>: SupportedLaneCount {
+
+            /// Lanewise saturating add.
+            ///
+            /// # Examples
+            /// ```
+            /// # #![feature(portable_simd)]
+            /// # #[cfg(feature = "std")] use core_simd::Simd;
+            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+            #[doc = concat!("# use core::", stringify!($ty), "::MAX;")]
+            /// let x = Simd::from_array([2, 1, 0, MAX]);
+            /// let max = Simd::splat(MAX);
+            /// let unsat = x + max;
+            /// let sat = x.saturating_add(max);
+            /// assert_eq!(x - 1, unsat);
+            /// assert_eq!(sat, max);
+            /// ```
+            #[inline]
+            pub fn saturating_add(self, second: Self) -> Self {
+                unsafe { simd_saturating_add(self, second) }
+            }
+
+            /// Lanewise saturating subtract.
+            ///
+            /// # Examples
+            /// ```
+            /// # #![feature(portable_simd)]
+            /// # #[cfg(feature = "std")] use core_simd::Simd;
+            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+            #[doc = concat!("# use core::", stringify!($ty), "::MAX;")]
+            /// let x = Simd::from_array([2, 1, 0, MAX]);
+            /// let max = Simd::splat(MAX);
+            /// let unsat = x - max;
+            /// let sat = x.saturating_sub(max);
+            /// assert_eq!(unsat, x + 1);
+            /// assert_eq!(sat, Simd::splat(0));
+            #[inline]
+            pub fn saturating_sub(self, second: Self) -> Self {
+                unsafe { simd_saturating_sub(self, second) }
+            }
+        })+
+    }
+}
+
+macro_rules! impl_int_arith {
+    ($($ty:ty),+) => {
+        $( impl<const LANES: usize> Simd<$ty, LANES> where LaneCount<LANES>: SupportedLaneCount {
+
+            /// Lanewise saturating add.
+            ///
+            /// # Examples
+            /// ```
+            /// # #![feature(portable_simd)]
+            /// # #[cfg(feature = "std")] use core_simd::Simd;
+            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+            #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")]
+            /// let x = Simd::from_array([MIN, 0, 1, MAX]);
+            /// let max = Simd::splat(MAX);
+            /// let unsat = x + max;
+            /// let sat = x.saturating_add(max);
+            /// assert_eq!(unsat, Simd::from_array([-1, MAX, MIN, -2]));
+            /// assert_eq!(sat, Simd::from_array([-1, MAX, MAX, MAX]));
+            /// ```
+            #[inline]
+            pub fn saturating_add(self, second: Self) -> Self {
+                unsafe { simd_saturating_add(self, second) }
+            }
+
+            /// Lanewise saturating subtract.
+            ///
+            /// # Examples
+            /// ```
+            /// # #![feature(portable_simd)]
+            /// # #[cfg(feature = "std")] use core_simd::Simd;
+            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+            #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")]
+            /// let x = Simd::from_array([MIN, -2, -1, MAX]);
+            /// let max = Simd::splat(MAX);
+            /// let unsat = x - max;
+            /// let sat = x.saturating_sub(max);
+            /// assert_eq!(unsat, Simd::from_array([1, MAX, MIN, 0]));
+            /// assert_eq!(sat, Simd::from_array([MIN, MIN, MIN, 0]));
+            #[inline]
+            pub fn saturating_sub(self, second: Self) -> Self {
+                unsafe { simd_saturating_sub(self, second) }
+            }
+
+            /// Lanewise absolute value, implemented in Rust.
+            /// Every lane becomes its absolute value.
+            ///
+            /// # Examples
+            /// ```
+            /// # #![feature(portable_simd)]
+            /// # #[cfg(feature = "std")] use core_simd::Simd;
+            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+            #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")]
+            /// let xs = Simd::from_array([MIN, MIN +1, -5, 0]);
+            /// assert_eq!(xs.abs(), Simd::from_array([MIN, MAX, 5, 0]));
+            /// ```
+            #[inline]
+            pub fn abs(self) -> Self {
+                const SHR: $ty = <$ty>::BITS as $ty - 1;
+                let m = self >> SHR;
+                (self^m) - m
+            }
+
+            /// Lanewise saturating absolute value, implemented in Rust.
+            /// As abs(), except the MIN value becomes MAX instead of itself.
+            ///
+            /// # Examples
+            /// ```
+            /// # #![feature(portable_simd)]
+            /// # #[cfg(feature = "std")] use core_simd::Simd;
+            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+            #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")]
+            /// let xs = Simd::from_array([MIN, -2, 0, 3]);
+            /// let unsat = xs.abs();
+            /// let sat = xs.saturating_abs();
+            /// assert_eq!(unsat, Simd::from_array([MIN, 2, 0, 3]));
+            /// assert_eq!(sat, Simd::from_array([MAX, 2, 0, 3]));
+            /// ```
+            #[inline]
+            pub fn saturating_abs(self) -> Self {
+                // arith shift for -1 or 0 mask based on sign bit, giving 2s complement
+                const SHR: $ty = <$ty>::BITS as $ty - 1;
+                let m = self >> SHR;
+                (self^m).saturating_sub(m)
+            }
+
+            /// Lanewise saturating negation, implemented in Rust.
+            /// As neg(), except the MIN value becomes MAX instead of itself.
+            ///
+            /// # Examples
+            /// ```
+            /// # #![feature(portable_simd)]
+            /// # #[cfg(feature = "std")] use core_simd::Simd;
+            /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+            #[doc = concat!("# use core::", stringify!($ty), "::{MIN, MAX};")]
+            /// let x = Simd::from_array([MIN, -2, 3, MAX]);
+            /// let unsat = -x;
+            /// let sat = x.saturating_neg();
+            /// assert_eq!(unsat, Simd::from_array([MIN, 2, -3, MIN + 1]));
+            /// assert_eq!(sat, Simd::from_array([MAX, 2, -3, MIN + 1]));
+            /// ```
+            #[inline]
+            pub fn saturating_neg(self) -> Self {
+                Self::splat(0).saturating_sub(self)
+            }
+        })+
+    }
+}
+
+impl_uint_arith! { u8, u16, u32, u64, usize }
+impl_int_arith! { i8, i16, i32, i64, isize }
--- a/library/portable-simd/crates/core_simd/src/mod.rs
+++ b/library/portable-simd/crates/core_simd/src/mod.rs
@ -0,0 +1,33 @@
+#[macro_use]
+mod reduction;
+
+#[macro_use]
+mod swizzle;
+
+pub(crate) mod intrinsics;
+
+#[cfg(feature = "generic_const_exprs")]
+mod to_bytes;
+
+mod comparisons;
+mod fmt;
+mod iter;
+mod lane_count;
+mod masks;
+mod math;
+mod ops;
+mod round;
+mod select;
+mod vector;
+mod vendor;
+
+#[doc = include_str!("core_simd_docs.md")]
+pub mod simd {
+    pub(crate) use crate::core_simd::intrinsics;
+
+    pub use crate::core_simd::lane_count::{LaneCount, SupportedLaneCount};
+    pub use crate::core_simd::masks::*;
+    pub use crate::core_simd::select::Select;
+    pub use crate::core_simd::swizzle::*;
+    pub use crate::core_simd::vector::*;
+}
--- a/library/portable-simd/crates/core_simd/src/ops.rs
+++ b/library/portable-simd/crates/core_simd/src/ops.rs
@ -0,0 +1,644 @@
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+
+impl<I, T, const LANES: usize> core::ops::Index<I> for Simd<T, LANES>
+where
+    T: SimdElement,
+    LaneCount<LANES>: SupportedLaneCount,
+    I: core::slice::SliceIndex<[T]>,
+{
+    type Output = I::Output;
+    fn index(&self, index: I) -> &Self::Output {
+        &self.as_array()[index]
+    }
+}
+
+impl<I, T, const LANES: usize> core::ops::IndexMut<I> for Simd<T, LANES>
+where
+    T: SimdElement,
+    LaneCount<LANES>: SupportedLaneCount,
+    I: core::slice::SliceIndex<[T]>,
+{
+    fn index_mut(&mut self, index: I) -> &mut Self::Output {
+        &mut self.as_mut_array()[index]
+    }
+}
+
+/// Checks if the right-hand side argument of a left- or right-shift would cause overflow.
+fn invalid_shift_rhs<T>(rhs: T) -> bool
+where
+    T: Default + PartialOrd + core::convert::TryFrom<usize>,
+    <T as core::convert::TryFrom<usize>>::Error: core::fmt::Debug,
+{
+    let bits_in_type = T::try_from(8 * core::mem::size_of::<T>()).unwrap();
+    rhs < T::default() || rhs >= bits_in_type
+}
+
+/// Automatically implements operators over references in addition to the provided operator.
+macro_rules! impl_ref_ops {
+    // binary op
+    {
+        impl<const $lanes:ident: usize> core::ops::$trait:ident<$rhs:ty> for $type:ty
+        where
+            LaneCount<$lanes2:ident>: SupportedLaneCount,
+        {
+            type Output = $output:ty;
+
+            $(#[$attrs:meta])*
+            fn $fn:ident($self_tok:ident, $rhs_arg:ident: $rhs_arg_ty:ty) -> Self::Output $body:tt
+        }
+    } => {
+        impl<const $lanes: usize> core::ops::$trait<$rhs> for $type
+        where
+            LaneCount<$lanes2>: SupportedLaneCount,
+        {
+            type Output = $output;
+
+            $(#[$attrs])*
+            fn $fn($self_tok, $rhs_arg: $rhs_arg_ty) -> Self::Output $body
+        }
+
+        impl<const $lanes: usize> core::ops::$trait<&'_ $rhs> for $type
+        where
+            LaneCount<$lanes2>: SupportedLaneCount,
+        {
+            type Output = <$type as core::ops::$trait<$rhs>>::Output;
+
+            $(#[$attrs])*
+            fn $fn($self_tok, $rhs_arg: &$rhs) -> Self::Output {
+                core::ops::$trait::$fn($self_tok, *$rhs_arg)
+            }
+        }
+
+        impl<const $lanes: usize> core::ops::$trait<$rhs> for &'_ $type
+        where
+            LaneCount<$lanes2>: SupportedLaneCount,
+        {
+            type Output = <$type as core::ops::$trait<$rhs>>::Output;
+
+            $(#[$attrs])*
+            fn $fn($self_tok, $rhs_arg: $rhs) -> Self::Output {
+                core::ops::$trait::$fn(*$self_tok, $rhs_arg)
+            }
+        }
+
+        impl<const $lanes: usize> core::ops::$trait<&'_ $rhs> for &'_ $type
+        where
+            LaneCount<$lanes2>: SupportedLaneCount,
+        {
+            type Output = <$type as core::ops::$trait<$rhs>>::Output;
+
+            $(#[$attrs])*
+            fn $fn($self_tok, $rhs_arg: &$rhs) -> Self::Output {
+                core::ops::$trait::$fn(*$self_tok, *$rhs_arg)
+            }
+        }
+    };
+
+    // binary assignment op
+    {
+        impl<const $lanes:ident: usize> core::ops::$trait:ident<$rhs:ty> for $type:ty
+        where
+            LaneCount<$lanes2:ident>: SupportedLaneCount,
+        {
+            $(#[$attrs:meta])*
+            fn $fn:ident(&mut $self_tok:ident, $rhs_arg:ident: $rhs_arg_ty:ty) $body:tt
+        }
+    } => {
+        impl<const $lanes: usize> core::ops::$trait<$rhs> for $type
+        where
+            LaneCount<$lanes2>: SupportedLaneCount,
+        {
+            $(#[$attrs])*
+            fn $fn(&mut $self_tok, $rhs_arg: $rhs_arg_ty) $body
+        }
+
+        impl<const $lanes: usize> core::ops::$trait<&'_ $rhs> for $type
+        where
+            LaneCount<$lanes2>: SupportedLaneCount,
+        {
+            $(#[$attrs])*
+            fn $fn(&mut $self_tok, $rhs_arg: &$rhs_arg_ty) {
+                core::ops::$trait::$fn($self_tok, *$rhs_arg)
+            }
+        }
+    };
+
+    // unary op
+    {
+        impl<const $lanes:ident: usize> core::ops::$trait:ident for $type:ty
+        where
+            LaneCount<$lanes2:ident>: SupportedLaneCount,
+        {
+            type Output = $output:ty;
+            fn $fn:ident($self_tok:ident) -> Self::Output $body:tt
+        }
+    } => {
+        impl<const $lanes: usize> core::ops::$trait for $type
+        where
+            LaneCount<$lanes2>: SupportedLaneCount,
+        {
+            type Output = $output;
+            fn $fn($self_tok) -> Self::Output $body
+        }
+
+        impl<const $lanes: usize> core::ops::$trait for &'_ $type
+        where
+            LaneCount<$lanes2>: SupportedLaneCount,
+        {
+            type Output = <$type as core::ops::$trait>::Output;
+            fn $fn($self_tok) -> Self::Output {
+                core::ops::$trait::$fn(*$self_tok)
+            }
+        }
+    }
+}
+
+/// Automatically implements operators over vectors and scalars for a particular vector.
+macro_rules! impl_op {
+    { impl Add for $scalar:ty } => {
+        impl_op! { @binary $scalar, Add::add, AddAssign::add_assign, simd_add }
+    };
+    { impl Sub for $scalar:ty } => {
+        impl_op! { @binary $scalar, Sub::sub, SubAssign::sub_assign, simd_sub }
+    };
+    { impl Mul for $scalar:ty } => {
+        impl_op! { @binary $scalar, Mul::mul, MulAssign::mul_assign, simd_mul }
+    };
+    { impl Div for $scalar:ty } => {
+        impl_op! { @binary $scalar, Div::div, DivAssign::div_assign, simd_div }
+    };
+    { impl Rem for $scalar:ty } => {
+        impl_op! { @binary $scalar, Rem::rem, RemAssign::rem_assign, simd_rem }
+    };
+    { impl Shl for $scalar:ty } => {
+        impl_op! { @binary $scalar, Shl::shl, ShlAssign::shl_assign, simd_shl }
+    };
+    { impl Shr for $scalar:ty } => {
+        impl_op! { @binary $scalar, Shr::shr, ShrAssign::shr_assign, simd_shr }
+    };
+    { impl BitAnd for $scalar:ty } => {
+        impl_op! { @binary $scalar, BitAnd::bitand, BitAndAssign::bitand_assign, simd_and }
+    };
+    { impl BitOr for $scalar:ty } => {
+        impl_op! { @binary $scalar, BitOr::bitor, BitOrAssign::bitor_assign, simd_or }
+    };
+    { impl BitXor for $scalar:ty } => {
+        impl_op! { @binary $scalar, BitXor::bitxor, BitXorAssign::bitxor_assign, simd_xor }
+    };
+
+    { impl Not for $scalar:ty } => {
+        impl_ref_ops! {
+            impl<const LANES: usize> core::ops::Not for Simd<$scalar, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                type Output = Self;
+                fn not(self) -> Self::Output {
+                    self ^ Self::splat(!<$scalar>::default())
+                }
+            }
+        }
+    };
+
+    { impl Neg for $scalar:ty } => {
+        impl_ref_ops! {
+            impl<const LANES: usize> core::ops::Neg for Simd<$scalar, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                type Output = Self;
+                fn neg(self) -> Self::Output {
+                    unsafe { intrinsics::simd_neg(self) }
+                }
+            }
+        }
+    };
+
+    // generic binary op with assignment when output is `Self`
+    { @binary $scalar:ty, $trait:ident :: $trait_fn:ident, $assign_trait:ident :: $assign_trait_fn:ident, $intrinsic:ident } => {
+        impl_ref_ops! {
+            impl<const LANES: usize> core::ops::$trait<Self> for Simd<$scalar, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                type Output = Self;
+
+                #[inline]
+                fn $trait_fn(self, rhs: Self) -> Self::Output {
+                    unsafe {
+                        intrinsics::$intrinsic(self, rhs)
+                    }
+                }
+            }
+        }
+
+        impl_ref_ops! {
+            impl<const LANES: usize> core::ops::$trait<$scalar> for Simd<$scalar, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                type Output = Self;
+
+                #[inline]
+                fn $trait_fn(self, rhs: $scalar) -> Self::Output {
+                    core::ops::$trait::$trait_fn(self, Self::splat(rhs))
+                }
+            }
+        }
+
+        impl_ref_ops! {
+            impl<const LANES: usize> core::ops::$trait<Simd<$scalar, LANES>> for $scalar
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                type Output = Simd<$scalar, LANES>;
+
+                #[inline]
+                fn $trait_fn(self, rhs: Simd<$scalar, LANES>) -> Self::Output {
+                    core::ops::$trait::$trait_fn(Simd::splat(self), rhs)
+                }
+            }
+        }
+
+        impl_ref_ops! {
+            impl<const LANES: usize> core::ops::$assign_trait<Self> for Simd<$scalar, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                #[inline]
+                fn $assign_trait_fn(&mut self, rhs: Self) {
+                    unsafe {
+                        *self = intrinsics::$intrinsic(*self, rhs);
+                    }
+                }
+            }
+        }
+
+        impl_ref_ops! {
+            impl<const LANES: usize> core::ops::$assign_trait<$scalar> for Simd<$scalar, LANES>
+            where
+                LaneCount<LANES>: SupportedLaneCount,
+            {
+                #[inline]
+                fn $assign_trait_fn(&mut self, rhs: $scalar) {
+                    core::ops::$assign_trait::$assign_trait_fn(self, Self::splat(rhs));
+                }
+            }
+        }
+    };
+}
+
+/// Implements floating-point operators for the provided types.
+macro_rules! impl_float_ops {
+    { $($scalar:ty),* } => {
+        $(
+            impl_op! { impl Add for $scalar }
+            impl_op! { impl Sub for $scalar }
+            impl_op! { impl Mul for $scalar }
+            impl_op! { impl Div for $scalar }
+            impl_op! { impl Rem for $scalar }
+            impl_op! { impl Neg for $scalar }
+        )*
+    };
+}
+
+/// Implements unsigned integer operators for the provided types.
+macro_rules! impl_unsigned_int_ops {
+    { $($scalar:ty),* } => {
+        $(
+            impl_op! { impl Add for $scalar }
+            impl_op! { impl Sub for $scalar }
+            impl_op! { impl Mul for $scalar }
+            impl_op! { impl BitAnd for $scalar }
+            impl_op! { impl BitOr  for $scalar }
+            impl_op! { impl BitXor for $scalar }
+            impl_op! { impl Not for $scalar }
+
+            // Integers panic on divide by 0
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Div<Self> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Self;
+
+                    #[inline]
+                    fn div(self, rhs: Self) -> Self::Output {
+                        if rhs.as_array()
+                            .iter()
+                            .any(|x| *x == 0)
+                        {
+                            panic!("attempt to divide by zero");
+                        }
+
+                        // Guards for div(MIN, -1),
+                        // this check only applies to signed ints
+                        if <$scalar>::MIN != 0 && self.as_array().iter()
+                                .zip(rhs.as_array().iter())
+                                .any(|(x,y)| *x == <$scalar>::MIN && *y == -1 as _) {
+                            panic!("attempt to divide with overflow");
+                        }
+                        unsafe { intrinsics::simd_div(self, rhs) }
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Div<$scalar> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Self;
+
+                    #[inline]
+                    fn div(self, rhs: $scalar) -> Self::Output {
+                        if rhs == 0 {
+                            panic!("attempt to divide by zero");
+                        }
+                        if <$scalar>::MIN != 0 &&
+                            self.as_array().iter().any(|x| *x == <$scalar>::MIN) &&
+                            rhs == -1 as _ {
+                                panic!("attempt to divide with overflow");
+                        }
+                        let rhs = Self::splat(rhs);
+                        unsafe { intrinsics::simd_div(self, rhs) }
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Div<Simd<$scalar, LANES>> for $scalar
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Simd<$scalar, LANES>;
+
+                    #[inline]
+                    fn div(self, rhs: Simd<$scalar, LANES>) -> Self::Output {
+                        Simd::splat(self) / rhs
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::DivAssign<Self> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    #[inline]
+                    fn div_assign(&mut self, rhs: Self) {
+                        *self = *self / rhs;
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::DivAssign<$scalar> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    #[inline]
+                    fn div_assign(&mut self, rhs: $scalar) {
+                        *self = *self / rhs;
+                    }
+                }
+            }
+
+            // remainder panics on zero divisor
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Rem<Self> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Self;
+
+                    #[inline]
+                    fn rem(self, rhs: Self) -> Self::Output {
+                        if rhs.as_array()
+                            .iter()
+                            .any(|x| *x == 0)
+                        {
+                            panic!("attempt to calculate the remainder with a divisor of zero");
+                        }
+
+                        // Guards for rem(MIN, -1)
+                        // this branch applies the check only to signed ints
+                        if <$scalar>::MIN != 0 && self.as_array().iter()
+                                .zip(rhs.as_array().iter())
+                                .any(|(x,y)| *x == <$scalar>::MIN && *y == -1 as _) {
+                            panic!("attempt to calculate the remainder with overflow");
+                        }
+                        unsafe { intrinsics::simd_rem(self, rhs) }
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Rem<$scalar> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Self;
+
+                    #[inline]
+                    fn rem(self, rhs: $scalar) -> Self::Output {
+                        if rhs == 0 {
+                            panic!("attempt to calculate the remainder with a divisor of zero");
+                        }
+                        if <$scalar>::MIN != 0 &&
+                            self.as_array().iter().any(|x| *x == <$scalar>::MIN) &&
+                            rhs == -1 as _ {
+                                panic!("attempt to calculate the remainder with overflow");
+                        }
+                        let rhs = Self::splat(rhs);
+                        unsafe { intrinsics::simd_rem(self, rhs) }
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Rem<Simd<$scalar, LANES>> for $scalar
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Simd<$scalar, LANES>;
+
+                    #[inline]
+                    fn rem(self, rhs: Simd<$scalar, LANES>) -> Self::Output {
+                        Simd::splat(self) % rhs
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::RemAssign<Self> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    #[inline]
+                    fn rem_assign(&mut self, rhs: Self) {
+                        *self = *self % rhs;
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::RemAssign<$scalar> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    #[inline]
+                    fn rem_assign(&mut self, rhs: $scalar) {
+                        *self = *self % rhs;
+                    }
+                }
+            }
+
+            // shifts panic on overflow
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Shl<Self> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Self;
+
+                    #[inline]
+                    fn shl(self, rhs: Self) -> Self::Output {
+                        // TODO there is probably a better way of doing this
+                        if rhs.as_array()
+                            .iter()
+                            .copied()
+                            .any(invalid_shift_rhs)
+                        {
+                            panic!("attempt to shift left with overflow");
+                        }
+                        unsafe { intrinsics::simd_shl(self, rhs) }
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Shl<$scalar> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Self;
+
+                    #[inline]
+                    fn shl(self, rhs: $scalar) -> Self::Output {
+                        if invalid_shift_rhs(rhs) {
+                            panic!("attempt to shift left with overflow");
+                        }
+                        let rhs = Self::splat(rhs);
+                        unsafe { intrinsics::simd_shl(self, rhs) }
+                    }
+                }
+            }
+
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::ShlAssign<Self> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    #[inline]
+                    fn shl_assign(&mut self, rhs: Self) {
+                        *self = *self << rhs;
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::ShlAssign<$scalar> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    #[inline]
+                    fn shl_assign(&mut self, rhs: $scalar) {
+                        *self = *self << rhs;
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Shr<Self> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Self;
+
+                    #[inline]
+                    fn shr(self, rhs: Self) -> Self::Output {
+                        // TODO there is probably a better way of doing this
+                        if rhs.as_array()
+                            .iter()
+                            .copied()
+                            .any(invalid_shift_rhs)
+                        {
+                            panic!("attempt to shift with overflow");
+                        }
+                        unsafe { intrinsics::simd_shr(self, rhs) }
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::Shr<$scalar> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    type Output = Self;
+
+                    #[inline]
+                    fn shr(self, rhs: $scalar) -> Self::Output {
+                        if invalid_shift_rhs(rhs) {
+                            panic!("attempt to shift with overflow");
+                        }
+                        let rhs = Self::splat(rhs);
+                        unsafe { intrinsics::simd_shr(self, rhs) }
+                    }
+                }
+            }
+
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::ShrAssign<Self> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    #[inline]
+                    fn shr_assign(&mut self, rhs: Self) {
+                        *self = *self >> rhs;
+                    }
+                }
+            }
+
+            impl_ref_ops! {
+                impl<const LANES: usize> core::ops::ShrAssign<$scalar> for Simd<$scalar, LANES>
+                where
+                    LaneCount<LANES>: SupportedLaneCount,
+                {
+                    #[inline]
+                    fn shr_assign(&mut self, rhs: $scalar) {
+                        *self = *self >> rhs;
+                    }
+                }
+            }
+        )*
+    };
+}
+
+/// Implements unsigned integer operators for the provided types.
+macro_rules! impl_signed_int_ops {
+    { $($scalar:ty),* } => {
+        impl_unsigned_int_ops! { $($scalar),* }
+        $( // scalar
+            impl_op! { impl Neg for $scalar }
+        )*
+    };
+}
+
+impl_unsigned_int_ops! { u8, u16, u32, u64, usize }
+impl_signed_int_ops! { i8, i16, i32, i64, isize }
+impl_float_ops! { f32, f64 }
--- a/library/portable-simd/crates/core_simd/src/reduction.rs
+++ b/library/portable-simd/crates/core_simd/src/reduction.rs
@ -0,0 +1,123 @@
+use crate::simd::intrinsics::{
+    simd_reduce_add_ordered, simd_reduce_and, simd_reduce_max, simd_reduce_min,
+    simd_reduce_mul_ordered, simd_reduce_or, simd_reduce_xor,
+};
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+
+macro_rules! impl_integer_reductions {
+    { $scalar:ty } => {
+        impl<const LANES: usize> Simd<$scalar, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            /// Horizontal wrapping add.  Returns the sum of the lanes of the vector, with wrapping addition.
+            #[inline]
+            pub fn horizontal_sum(self) -> $scalar {
+                unsafe { simd_reduce_add_ordered(self, 0) }
+            }
+
+            /// Horizontal wrapping multiply.  Returns the product of the lanes of the vector, with wrapping multiplication.
+            #[inline]
+            pub fn horizontal_product(self) -> $scalar {
+                unsafe { simd_reduce_mul_ordered(self, 1) }
+            }
+
+            /// Horizontal bitwise "and".  Returns the cumulative bitwise "and" across the lanes of
+            /// the vector.
+            #[inline]
+            pub fn horizontal_and(self) -> $scalar {
+                unsafe { simd_reduce_and(self) }
+            }
+
+            /// Horizontal bitwise "or".  Returns the cumulative bitwise "or" across the lanes of
+            /// the vector.
+            #[inline]
+            pub fn horizontal_or(self) -> $scalar {
+                unsafe { simd_reduce_or(self) }
+            }
+
+            /// Horizontal bitwise "xor".  Returns the cumulative bitwise "xor" across the lanes of
+            /// the vector.
+            #[inline]
+            pub fn horizontal_xor(self) -> $scalar {
+                unsafe { simd_reduce_xor(self) }
+            }
+
+            /// Horizontal maximum.  Returns the maximum lane in the vector.
+            #[inline]
+            pub fn horizontal_max(self) -> $scalar {
+                unsafe { simd_reduce_max(self) }
+            }
+
+            /// Horizontal minimum.  Returns the minimum lane in the vector.
+            #[inline]
+            pub fn horizontal_min(self) -> $scalar {
+                unsafe { simd_reduce_min(self) }
+            }
+        }
+    }
+}
+
+impl_integer_reductions! { i8 }
+impl_integer_reductions! { i16 }
+impl_integer_reductions! { i32 }
+impl_integer_reductions! { i64 }
+impl_integer_reductions! { isize }
+impl_integer_reductions! { u8 }
+impl_integer_reductions! { u16 }
+impl_integer_reductions! { u32 }
+impl_integer_reductions! { u64 }
+impl_integer_reductions! { usize }
+
+macro_rules! impl_float_reductions {
+    { $scalar:ty } => {
+        impl<const LANES: usize> Simd<$scalar, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+
+            /// Horizontal add.  Returns the sum of the lanes of the vector.
+            #[inline]
+            pub fn horizontal_sum(self) -> $scalar {
+                // LLVM sum is inaccurate on i586
+                if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) {
+                    self.as_array().iter().sum()
+                } else {
+                    unsafe { simd_reduce_add_ordered(self, 0.) }
+                }
+            }
+
+            /// Horizontal multiply.  Returns the product of the lanes of the vector.
+            #[inline]
+            pub fn horizontal_product(self) -> $scalar {
+                // LLVM product is inaccurate on i586
+                if cfg!(all(target_arch = "x86", not(target_feature = "sse2"))) {
+                    self.as_array().iter().product()
+                } else {
+                    unsafe { simd_reduce_mul_ordered(self, 1.) }
+                }
+            }
+
+            /// Horizontal maximum.  Returns the maximum lane in the vector.
+            ///
+            /// Returns values based on equality, so a vector containing both `0.` and `-0.` may
+            /// return either.  This function will not return `NaN` unless all lanes are `NaN`.
+            #[inline]
+            pub fn horizontal_max(self) -> $scalar {
+                unsafe { simd_reduce_max(self) }
+            }
+
+            /// Horizontal minimum.  Returns the minimum lane in the vector.
+            ///
+            /// Returns values based on equality, so a vector containing both `0.` and `-0.` may
+            /// return either.  This function will not return `NaN` unless all lanes are `NaN`.
+            #[inline]
+            pub fn horizontal_min(self) -> $scalar {
+                unsafe { simd_reduce_min(self) }
+            }
+        }
+    }
+}
+
+impl_float_reductions! { f32 }
+impl_float_reductions! { f64 }
--- a/library/portable-simd/crates/core_simd/src/round.rs
+++ b/library/portable-simd/crates/core_simd/src/round.rs
@ -0,0 +1,78 @@
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+
+macro_rules! implement {
+    {
+        $type:ty, $int_type:ty
+    } => {
+        #[cfg(feature = "std")]
+        impl<const LANES: usize> Simd<$type, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            /// Returns the smallest integer greater than or equal to each lane.
+            #[must_use = "method returns a new vector and does not mutate the original value"]
+            #[inline]
+            pub fn ceil(self) -> Self {
+                unsafe { intrinsics::simd_ceil(self) }
+            }
+
+            /// Returns the largest integer value less than or equal to each lane.
+            #[must_use = "method returns a new vector and does not mutate the original value"]
+            #[inline]
+            pub fn floor(self) -> Self {
+                unsafe { intrinsics::simd_floor(self) }
+            }
+
+            /// Rounds to the nearest integer value. Ties round toward zero.
+            #[must_use = "method returns a new vector and does not mutate the original value"]
+            #[inline]
+            pub fn round(self) -> Self {
+                unsafe { intrinsics::simd_round(self) }
+            }
+
+            /// Returns the floating point's integer value, with its fractional part removed.
+            #[must_use = "method returns a new vector and does not mutate the original value"]
+            #[inline]
+            pub fn trunc(self) -> Self {
+                unsafe { intrinsics::simd_trunc(self) }
+            }
+
+            /// Returns the floating point's fractional value, with its integer part removed.
+            #[must_use = "method returns a new vector and does not mutate the original value"]
+            #[inline]
+            pub fn fract(self) -> Self {
+                self - self.trunc()
+            }
+        }
+
+        impl<const LANES: usize> Simd<$type, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            /// Rounds toward zero and converts to the same-width integer type, assuming that
+            /// the value is finite and fits in that type.
+            ///
+            /// # Safety
+            /// The value must:
+            ///
+            /// * Not be NaN
+            /// * Not be infinite
+            /// * Be representable in the return type, after truncating off its fractional part
+            #[inline]
+            pub unsafe fn to_int_unchecked(self) -> Simd<$int_type, LANES> {
+                unsafe { intrinsics::simd_cast(self) }
+            }
+
+            /// Creates a floating-point vector from an integer vector.  Rounds values that are
+            /// not exactly representable.
+            #[inline]
+            pub fn round_from_int(value: Simd<$int_type, LANES>) -> Self {
+                unsafe { intrinsics::simd_cast(value) }
+            }
+        }
+    }
+}
+
+implement! { f32, i32 }
+implement! { f64, i64 }
--- a/library/portable-simd/crates/core_simd/src/select.rs
+++ b/library/portable-simd/crates/core_simd/src/select.rs
@ -0,0 +1,86 @@
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Mask, MaskElement, Simd, SimdElement, SupportedLaneCount};
+
+mod sealed {
+    pub trait Sealed<Mask> {
+        fn select(mask: Mask, true_values: Self, false_values: Self) -> Self;
+    }
+}
+use sealed::Sealed;
+
+/// Supporting trait for vector `select` function
+pub trait Select<Mask>: Sealed<Mask> {}
+
+impl<T, const LANES: usize> Sealed<Mask<T::Mask, LANES>> for Simd<T, LANES>
+where
+    T: SimdElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn select(mask: Mask<T::Mask, LANES>, true_values: Self, false_values: Self) -> Self {
+        unsafe { intrinsics::simd_select(mask.to_int(), true_values, false_values) }
+    }
+}
+
+impl<T, const LANES: usize> Select<Mask<T::Mask, LANES>> for Simd<T, LANES>
+where
+    T: SimdElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+}
+
+impl<T, const LANES: usize> Sealed<Self> for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    #[inline]
+    fn select(mask: Self, true_values: Self, false_values: Self) -> Self {
+        mask & true_values | !mask & false_values
+    }
+}
+
+impl<T, const LANES: usize> Select<Self> for Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+}
+
+impl<T, const LANES: usize> Mask<T, LANES>
+where
+    T: MaskElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    /// Choose lanes from two vectors.
+    ///
+    /// For each lane in the mask, choose the corresponding lane from `true_values` if
+    /// that lane mask is true, and `false_values` if that lane mask is false.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask};
+    /// # #[cfg(not(feature = "std"))] use core::simd::{Simd, Mask};
+    /// let a = Simd::from_array([0, 1, 2, 3]);
+    /// let b = Simd::from_array([4, 5, 6, 7]);
+    /// let mask = Mask::from_array([true, false, false, true]);
+    /// let c = mask.select(a, b);
+    /// assert_eq!(c.to_array(), [0, 5, 6, 3]);
+    /// ```
+    ///
+    /// `select` can also be used on masks:
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::Mask;
+    /// # #[cfg(not(feature = "std"))] use core::simd::Mask;
+    /// let a = Mask::<i32, 4>::from_array([true, true, false, false]);
+    /// let b = Mask::<i32, 4>::from_array([false, false, true, true]);
+    /// let mask = Mask::<i32, 4>::from_array([true, false, false, true]);
+    /// let c = mask.select(a, b);
+    /// assert_eq!(c.to_array(), [true, false, true, false]);
+    /// ```
+    #[inline]
+    pub fn select<S: Select<Self>>(self, true_values: S, false_values: S) -> S {
+        S::select(self, true_values, false_values)
+    }
+}
--- a/library/portable-simd/crates/core_simd/src/swizzle.rs
+++ b/library/portable-simd/crates/core_simd/src/swizzle.rs
@ -0,0 +1,374 @@
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+
+/// Constructs a new vector by selecting values from the lanes of the source vector or vectors to use.
+///
+/// When swizzling one vector, the indices of the result vector are indicated by a `const` array
+/// of `usize`, like [`Swizzle`].
+/// When swizzling two vectors, the indices are indicated by a `const` array of [`Which`], like
+/// [`Swizzle2`].
+///
+/// # Examples
+/// ## One source vector
+/// ```
+/// # #![feature(portable_simd)]
+/// # #[cfg(feature = "std")] use core_simd::{Simd, simd_swizzle};
+/// # #[cfg(not(feature = "std"))] use core::simd::{Simd, simd_swizzle};
+/// let v = Simd::<f32, 4>::from_array([0., 1., 2., 3.]);
+///
+/// // Keeping the same size
+/// let r = simd_swizzle!(v, [3, 0, 1, 2]);
+/// assert_eq!(r.to_array(), [3., 0., 1., 2.]);
+///
+/// // Changing the number of lanes
+/// let r = simd_swizzle!(v, [3, 1]);
+/// assert_eq!(r.to_array(), [3., 1.]);
+/// ```
+///
+/// ## Two source vectors
+/// ```
+/// # #![feature(portable_simd)]
+/// # #[cfg(feature = "std")] use core_simd::{Simd, simd_swizzle, Which};
+/// # #[cfg(not(feature = "std"))] use core::simd::{Simd, simd_swizzle, Which};
+/// use Which::*;
+/// let a = Simd::<f32, 4>::from_array([0., 1., 2., 3.]);
+/// let b = Simd::<f32, 4>::from_array([4., 5., 6., 7.]);
+///
+/// // Keeping the same size
+/// let r = simd_swizzle!(a, b, [First(0), First(1), Second(2), Second(3)]);
+/// assert_eq!(r.to_array(), [0., 1., 6., 7.]);
+///
+/// // Changing the number of lanes
+/// let r = simd_swizzle!(a, b, [First(0), Second(0)]);
+/// assert_eq!(r.to_array(), [0., 4.]);
+/// ```
+#[allow(unused_macros)]
+pub macro simd_swizzle {
+    (
+        $vector:expr, $index:expr $(,)?
+    ) => {
+        {
+            use $crate::simd::Swizzle;
+            struct Impl;
+            impl<const LANES: usize> Swizzle<LANES, {$index.len()}> for Impl {
+                const INDEX: [usize; {$index.len()}] = $index;
+            }
+            Impl::swizzle($vector)
+        }
+    },
+    (
+        $first:expr, $second:expr, $index:expr $(,)?
+    ) => {
+        {
+            use $crate::simd::{Which, Swizzle2};
+            struct Impl;
+            impl<const LANES: usize> Swizzle2<LANES, {$index.len()}> for Impl {
+                const INDEX: [Which; {$index.len()}] = $index;
+            }
+            Impl::swizzle2($first, $second)
+        }
+    }
+}
+
+/// An index into one of two vectors.
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Which {
+    /// Indexes the first vector.
+    First(usize),
+    /// Indexes the second vector.
+    Second(usize),
+}
+
+/// Create a vector from the elements of another vector.
+pub trait Swizzle<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
+    /// Map from the lanes of the input vector to the output vector.
+    const INDEX: [usize; OUTPUT_LANES];
+
+    /// Create a new vector from the lanes of `vector`.
+    ///
+    /// Lane `i` of the output is `vector[Self::INDEX[i]]`.
+    fn swizzle<T>(vector: Simd<T, INPUT_LANES>) -> Simd<T, OUTPUT_LANES>
+    where
+        T: SimdElement,
+        LaneCount<INPUT_LANES>: SupportedLaneCount,
+        LaneCount<OUTPUT_LANES>: SupportedLaneCount,
+    {
+        unsafe { intrinsics::simd_shuffle(vector, vector, Self::INDEX_IMPL) }
+    }
+}
+
+/// Create a vector from the elements of two other vectors.
+pub trait Swizzle2<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
+    /// Map from the lanes of the input vectors to the output vector
+    const INDEX: [Which; OUTPUT_LANES];
+
+    /// Create a new vector from the lanes of `first` and `second`.
+    ///
+    /// Lane `i` is `first[j]` when `Self::INDEX[i]` is `First(j)`, or `second[j]` when it is
+    /// `Second(j)`.
+    fn swizzle2<T>(
+        first: Simd<T, INPUT_LANES>,
+        second: Simd<T, INPUT_LANES>,
+    ) -> Simd<T, OUTPUT_LANES>
+    where
+        T: SimdElement,
+        LaneCount<INPUT_LANES>: SupportedLaneCount,
+        LaneCount<OUTPUT_LANES>: SupportedLaneCount,
+    {
+        unsafe { intrinsics::simd_shuffle(first, second, Self::INDEX_IMPL) }
+    }
+}
+
+/// The `simd_shuffle` intrinsic expects `u32`, so do error checking and conversion here.
+/// This trait hides `INDEX_IMPL` from the public API.
+trait SwizzleImpl<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
+    const INDEX_IMPL: [u32; OUTPUT_LANES];
+}
+
+impl<T, const INPUT_LANES: usize, const OUTPUT_LANES: usize> SwizzleImpl<INPUT_LANES, OUTPUT_LANES>
+    for T
+where
+    T: Swizzle<INPUT_LANES, OUTPUT_LANES> + ?Sized,
+{
+    const INDEX_IMPL: [u32; OUTPUT_LANES] = {
+        let mut output = [0; OUTPUT_LANES];
+        let mut i = 0;
+        while i < OUTPUT_LANES {
+            let index = Self::INDEX[i];
+            assert!(index as u32 as usize == index);
+            assert!(index < INPUT_LANES, "source lane exceeds input lane count",);
+            output[i] = index as u32;
+            i += 1;
+        }
+        output
+    };
+}
+
+/// The `simd_shuffle` intrinsic expects `u32`, so do error checking and conversion here.
+/// This trait hides `INDEX_IMPL` from the public API.
+trait Swizzle2Impl<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
+    const INDEX_IMPL: [u32; OUTPUT_LANES];
+}
+
+impl<T, const INPUT_LANES: usize, const OUTPUT_LANES: usize> Swizzle2Impl<INPUT_LANES, OUTPUT_LANES>
+    for T
+where
+    T: Swizzle2<INPUT_LANES, OUTPUT_LANES> + ?Sized,
+{
+    const INDEX_IMPL: [u32; OUTPUT_LANES] = {
+        let mut output = [0; OUTPUT_LANES];
+        let mut i = 0;
+        while i < OUTPUT_LANES {
+            let (offset, index) = match Self::INDEX[i] {
+                Which::First(index) => (false, index),
+                Which::Second(index) => (true, index),
+            };
+            assert!(index < INPUT_LANES, "source lane exceeds input lane count",);
+
+            // lanes are indexed by the first vector, then second vector
+            let index = if offset { index + INPUT_LANES } else { index };
+            assert!(index as u32 as usize == index);
+            output[i] = index as u32;
+            i += 1;
+        }
+        output
+    };
+}
+
+impl<T, const LANES: usize> Simd<T, LANES>
+where
+    T: SimdElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    /// Reverse the order of the lanes in the vector.
+    #[inline]
+    pub fn reverse(self) -> Self {
+        const fn reverse_index<const LANES: usize>() -> [usize; LANES] {
+            let mut index = [0; LANES];
+            let mut i = 0;
+            while i < LANES {
+                index[i] = LANES - i - 1;
+                i += 1;
+            }
+            index
+        }
+
+        struct Reverse;
+
+        impl<const LANES: usize> Swizzle<LANES, LANES> for Reverse {
+            const INDEX: [usize; LANES] = reverse_index::<LANES>();
+        }
+
+        Reverse::swizzle(self)
+    }
+
+    /// Rotates the vector such that the first `OFFSET` elements of the slice move to the end
+    /// while the last `LANES - OFFSET` elements move to the front. After calling `rotate_lanes_left`,
+    /// the element previously in lane `OFFSET` will become the first element in the slice.
+    #[inline]
+    pub fn rotate_lanes_left<const OFFSET: usize>(self) -> Self {
+        const fn rotate_index<const OFFSET: usize, const LANES: usize>() -> [usize; LANES] {
+            let offset = OFFSET % LANES;
+            let mut index = [0; LANES];
+            let mut i = 0;
+            while i < LANES {
+                index[i] = (i + offset) % LANES;
+                i += 1;
+            }
+            index
+        }
+
+        struct Rotate<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const LANES: usize> Swizzle<LANES, LANES> for Rotate<OFFSET> {
+            const INDEX: [usize; LANES] = rotate_index::<OFFSET, LANES>();
+        }
+
+        Rotate::<OFFSET>::swizzle(self)
+    }
+
+    /// Rotates the vector such that the first `LANES - OFFSET` elements of the vector move to
+    /// the end while the last `OFFSET` elements move to the front. After calling `rotate_lanes_right`,
+    /// the element previously at index `LANES - OFFSET` will become the first element in the slice.
+    #[inline]
+    pub fn rotate_lanes_right<const OFFSET: usize>(self) -> Self {
+        const fn rotate_index<const OFFSET: usize, const LANES: usize>() -> [usize; LANES] {
+            let offset = LANES - OFFSET % LANES;
+            let mut index = [0; LANES];
+            let mut i = 0;
+            while i < LANES {
+                index[i] = (i + offset) % LANES;
+                i += 1;
+            }
+            index
+        }
+
+        struct Rotate<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const LANES: usize> Swizzle<LANES, LANES> for Rotate<OFFSET> {
+            const INDEX: [usize; LANES] = rotate_index::<OFFSET, LANES>();
+        }
+
+        Rotate::<OFFSET>::swizzle(self)
+    }
+
+    /// Interleave two vectors.
+    ///
+    /// Produces two vectors with lanes taken alternately from `self` and `other`.
+    ///
+    /// The first result contains the first `LANES / 2` lanes from `self` and `other`,
+    /// alternating, starting with the first lane of `self`.
+    ///
+    /// The second result contains the last `LANES / 2` lanes from `self` and `other`,
+    /// alternating, starting with the lane `LANES / 2` from the start of `self`.
+    ///
+    /// ```
+    /// #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::Simd;
+    /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+    /// let a = Simd::from_array([0, 1, 2, 3]);
+    /// let b = Simd::from_array([4, 5, 6, 7]);
+    /// let (x, y) = a.interleave(b);
+    /// assert_eq!(x.to_array(), [0, 4, 1, 5]);
+    /// assert_eq!(y.to_array(), [2, 6, 3, 7]);
+    /// ```
+    #[inline]
+    pub fn interleave(self, other: Self) -> (Self, Self) {
+        const fn lo<const LANES: usize>() -> [Which; LANES] {
+            let mut idx = [Which::First(0); LANES];
+            let mut i = 0;
+            while i < LANES {
+                let offset = i / 2;
+                idx[i] = if i % 2 == 0 {
+                    Which::First(offset)
+                } else {
+                    Which::Second(offset)
+                };
+                i += 1;
+            }
+            idx
+        }
+        const fn hi<const LANES: usize>() -> [Which; LANES] {
+            let mut idx = [Which::First(0); LANES];
+            let mut i = 0;
+            while i < LANES {
+                let offset = (LANES + i) / 2;
+                idx[i] = if i % 2 == 0 {
+                    Which::First(offset)
+                } else {
+                    Which::Second(offset)
+                };
+                i += 1;
+            }
+            idx
+        }
+
+        struct Lo;
+        struct Hi;
+
+        impl<const LANES: usize> Swizzle2<LANES, LANES> for Lo {
+            const INDEX: [Which; LANES] = lo::<LANES>();
+        }
+
+        impl<const LANES: usize> Swizzle2<LANES, LANES> for Hi {
+            const INDEX: [Which; LANES] = hi::<LANES>();
+        }
+
+        (Lo::swizzle2(self, other), Hi::swizzle2(self, other))
+    }
+
+    /// Deinterleave two vectors.
+    ///
+    /// The first result takes every other lane of `self` and then `other`, starting with
+    /// the first lane.
+    ///
+    /// The second result takes every other lane of `self` and then `other`, starting with
+    /// the second lane.
+    ///
+    /// ```
+    /// #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::Simd;
+    /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+    /// let a = Simd::from_array([0, 4, 1, 5]);
+    /// let b = Simd::from_array([2, 6, 3, 7]);
+    /// let (x, y) = a.deinterleave(b);
+    /// assert_eq!(x.to_array(), [0, 1, 2, 3]);
+    /// assert_eq!(y.to_array(), [4, 5, 6, 7]);
+    /// ```
+    #[inline]
+    pub fn deinterleave(self, other: Self) -> (Self, Self) {
+        const fn even<const LANES: usize>() -> [Which; LANES] {
+            let mut idx = [Which::First(0); LANES];
+            let mut i = 0;
+            while i < LANES / 2 {
+                idx[i] = Which::First(2 * i);
+                idx[i + LANES / 2] = Which::Second(2 * i);
+                i += 1;
+            }
+            idx
+        }
+        const fn odd<const LANES: usize>() -> [Which; LANES] {
+            let mut idx = [Which::First(0); LANES];
+            let mut i = 0;
+            while i < LANES / 2 {
+                idx[i] = Which::First(2 * i + 1);
+                idx[i + LANES / 2] = Which::Second(2 * i + 1);
+                i += 1;
+            }
+            idx
+        }
+
+        struct Even;
+        struct Odd;
+
+        impl<const LANES: usize> Swizzle2<LANES, LANES> for Even {
+            const INDEX: [Which; LANES] = even::<LANES>();
+        }
+
+        impl<const LANES: usize> Swizzle2<LANES, LANES> for Odd {
+            const INDEX: [Which; LANES] = odd::<LANES>();
+        }
+
+        (Even::swizzle2(self, other), Odd::swizzle2(self, other))
+    }
+}
--- a/library/portable-simd/crates/core_simd/src/to_bytes.rs
+++ b/library/portable-simd/crates/core_simd/src/to_bytes.rs
@ -0,0 +1,39 @@
+macro_rules! impl_to_bytes {
+    { $ty:ty, $size:literal } => {
+        impl<const LANES: usize> crate::simd::Simd<$ty, LANES>
+        where
+            crate::simd::LaneCount<LANES>: crate::simd::SupportedLaneCount,
+            crate::simd::LaneCount<{{ $size * LANES }}>: crate::simd::SupportedLaneCount,
+        {
+            /// Return the memory representation of this integer as a byte array in native byte
+            /// order.
+            pub fn to_ne_bytes(self) -> crate::simd::Simd<u8, {{ $size * LANES }}> {
+                unsafe { core::mem::transmute_copy(&self) }
+            }
+
+            /// Create a native endian integer value from its memory representation as a byte array
+            /// in native endianness.
+            pub fn from_ne_bytes(bytes: crate::simd::Simd<u8, {{ $size * LANES }}>) -> Self {
+                unsafe { core::mem::transmute_copy(&bytes) }
+            }
+        }
+    }
+}
+
+impl_to_bytes! { u8, 1 }
+impl_to_bytes! { u16, 2 }
+impl_to_bytes! { u32, 4 }
+impl_to_bytes! { u64, 8 }
+#[cfg(target_pointer_width = "32")]
+impl_to_bytes! { usize, 4 }
+#[cfg(target_pointer_width = "64")]
+impl_to_bytes! { usize, 8 }
+
+impl_to_bytes! { i8, 1 }
+impl_to_bytes! { i16, 2 }
+impl_to_bytes! { i32, 4 }
+impl_to_bytes! { i64, 8 }
+#[cfg(target_pointer_width = "32")]
+impl_to_bytes! { isize, 4 }
+#[cfg(target_pointer_width = "64")]
+impl_to_bytes! { isize, 8 }
--- a/library/portable-simd/crates/core_simd/src/vector.rs
+++ b/library/portable-simd/crates/core_simd/src/vector.rs
@ -0,0 +1,528 @@
+mod float;
+mod int;
+mod uint;
+
+pub use float::*;
+pub use int::*;
+pub use uint::*;
+
+// Vectors of pointers are not for public use at the current time.
+pub(crate) mod ptr;
+
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Mask, MaskElement, SupportedLaneCount};
+
+/// A SIMD vector of `LANES` elements of type `T`.
+#[repr(simd)]
+pub struct Simd<T, const LANES: usize>([T; LANES])
+where
+    T: SimdElement,
+    LaneCount<LANES>: SupportedLaneCount;
+
+impl<T, const LANES: usize> Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement,
+{
+    /// Number of lanes in this vector.
+    pub const LANES: usize = LANES;
+
+    /// Get the number of lanes in this vector.
+    pub const fn lanes(&self) -> usize {
+        LANES
+    }
+
+    /// Construct a SIMD vector by setting all lanes to the given value.
+    pub const fn splat(value: T) -> Self {
+        Self([value; LANES])
+    }
+
+    /// Returns an array reference containing the entire SIMD vector.
+    pub const fn as_array(&self) -> &[T; LANES] {
+        &self.0
+    }
+
+    /// Returns a mutable array reference containing the entire SIMD vector.
+    pub fn as_mut_array(&mut self) -> &mut [T; LANES] {
+        &mut self.0
+    }
+
+    /// Converts an array to a SIMD vector.
+    pub const fn from_array(array: [T; LANES]) -> Self {
+        Self(array)
+    }
+
+    /// Converts a SIMD vector to an array.
+    pub const fn to_array(self) -> [T; LANES] {
+        self.0
+    }
+
+    /// Converts a slice to a SIMD vector containing `slice[..LANES]`
+    /// # Panics
+    /// `from_slice` will panic if the slice's `len` is less than the vector's `Simd::LANES`.
+    #[must_use]
+    pub const fn from_slice(slice: &[T]) -> Self {
+        assert!(
+            slice.len() >= LANES,
+            "slice length must be at least the number of lanes"
+        );
+        let mut array = [slice[0]; LANES];
+        let mut i = 0;
+        while i < LANES {
+            array[i] = slice[i];
+            i += 1;
+        }
+        Self(array)
+    }
+
+    /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
+    /// If an index is out-of-bounds, the lane is instead selected from the `or` vector.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::Simd;
+    /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+    /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
+    /// let idxs = Simd::from_array([9, 3, 0, 5]);
+    /// let alt = Simd::from_array([-5, -4, -3, -2]);
+    ///
+    /// let result = Simd::gather_or(&vec, idxs, alt); // Note the lane that is out-of-bounds.
+    /// assert_eq!(result, Simd::from_array([-5, 13, 10, 15]));
+    /// ```
+    #[must_use]
+    #[inline]
+    pub fn gather_or(slice: &[T], idxs: Simd<usize, LANES>, or: Self) -> Self {
+        Self::gather_select(slice, Mask::splat(true), idxs, or)
+    }
+
+    /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
+    /// If an index is out-of-bounds, the lane is set to the default value for the type.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::Simd;
+    /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+    /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
+    /// let idxs = Simd::from_array([9, 3, 0, 5]);
+    ///
+    /// let result = Simd::gather_or_default(&vec, idxs); // Note the lane that is out-of-bounds.
+    /// assert_eq!(result, Simd::from_array([0, 13, 10, 15]));
+    /// ```
+    #[must_use]
+    #[inline]
+    pub fn gather_or_default(slice: &[T], idxs: Simd<usize, LANES>) -> Self
+    where
+        T: Default,
+    {
+        Self::gather_or(slice, idxs, Self::splat(T::default()))
+    }
+
+    /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
+    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
+    /// If an index is disabled or is out-of-bounds, the lane is selected from the `or` vector.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask};
+    /// # #[cfg(not(feature = "std"))] use core::simd::{Simd, Mask};
+    /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
+    /// let idxs = Simd::from_array([9, 3, 0, 5]);
+    /// let alt = Simd::from_array([-5, -4, -3, -2]);
+    /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane.
+    ///
+    /// let result = Simd::gather_select(&vec, enable, idxs, alt); // Note the lane that is out-of-bounds.
+    /// assert_eq!(result, Simd::from_array([-5, 13, 10, -2]));
+    /// ```
+    #[must_use]
+    #[inline]
+    pub fn gather_select(
+        slice: &[T],
+        enable: Mask<isize, LANES>,
+        idxs: Simd<usize, LANES>,
+        or: Self,
+    ) -> Self {
+        let enable: Mask<isize, LANES> = enable & idxs.lanes_lt(Simd::splat(slice.len()));
+        // SAFETY: We have masked-off out-of-bounds lanes.
+        unsafe { Self::gather_select_unchecked(slice, enable, idxs, or) }
+    }
+
+    /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
+    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
+    /// If an index is disabled, the lane is selected from the `or` vector.
+    ///
+    /// # Safety
+    ///
+    /// Calling this function with an `enable`d out-of-bounds index is *[undefined behavior]*
+    /// even if the resulting value is not used.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask};
+    /// # #[cfg(not(feature = "std"))] use core::simd::{Simd, Mask};
+    /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
+    /// let idxs = Simd::from_array([9, 3, 0, 5]);
+    /// let alt = Simd::from_array([-5, -4, -3, -2]);
+    /// let enable = Mask::from_array([true, true, true, false]); // Note the final mask lane.
+    /// // If this mask was used to gather, it would be unsound. Let's fix that.
+    /// let enable = enable & idxs.lanes_lt(Simd::splat(vec.len()));
+    ///
+    /// // We have masked the OOB lane, so it's safe to gather now.
+    /// let result = unsafe { Simd::gather_select_unchecked(&vec, enable, idxs, alt) };
+    /// assert_eq!(result, Simd::from_array([-5, 13, 10, -2]));
+    /// ```
+    /// [undefined behavior]: https://doc.rust-lang.org/reference/behavior-considered-undefined.html
+    #[must_use]
+    #[inline]
+    pub unsafe fn gather_select_unchecked(
+        slice: &[T],
+        enable: Mask<isize, LANES>,
+        idxs: Simd<usize, LANES>,
+        or: Self,
+    ) -> Self {
+        let base_ptr = crate::simd::ptr::SimdConstPtr::splat(slice.as_ptr());
+        // Ferris forgive me, I have done pointer arithmetic here.
+        let ptrs = base_ptr.wrapping_add(idxs);
+        // SAFETY: The ptrs have been bounds-masked to prevent memory-unsafe reads insha'allah
+        unsafe { intrinsics::simd_gather(or, ptrs, enable.to_int()) }
+    }
+
+    /// Writes the values in a SIMD vector to potentially discontiguous indices in `slice`.
+    /// If two lanes in the scattered vector would write to the same index
+    /// only the last lane is guaranteed to actually be written.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::Simd;
+    /// # #[cfg(not(feature = "std"))] use core::simd::Simd;
+    /// let mut vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
+    /// let idxs = Simd::from_array([9, 3, 0, 0]);
+    /// let vals = Simd::from_array([-27, 82, -41, 124]);
+    ///
+    /// vals.scatter(&mut vec, idxs); // index 0 receives two writes.
+    /// assert_eq!(vec, vec![124, 11, 12, 82, 14, 15, 16, 17, 18]);
+    /// ```
+    #[inline]
+    pub fn scatter(self, slice: &mut [T], idxs: Simd<usize, LANES>) {
+        self.scatter_select(slice, Mask::splat(true), idxs)
+    }
+
+    /// Writes the values in a SIMD vector to multiple potentially discontiguous indices in `slice`.
+    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
+    /// If an enabled index is out-of-bounds, the lane is not written.
+    /// If two enabled lanes in the scattered vector would write to the same index,
+    /// only the last lane is guaranteed to actually be written.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask};
+    /// # #[cfg(not(feature = "std"))] use core::simd::{Simd, Mask};
+    /// let mut vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
+    /// let idxs = Simd::from_array([9, 3, 0, 0]);
+    /// let vals = Simd::from_array([-27, 82, -41, 124]);
+    /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane.
+    ///
+    /// vals.scatter_select(&mut vec, enable, idxs); // index 0's second write is masked, thus omitted.
+    /// assert_eq!(vec, vec![-41, 11, 12, 82, 14, 15, 16, 17, 18]);
+    /// ```
+    #[inline]
+    pub fn scatter_select(
+        self,
+        slice: &mut [T],
+        enable: Mask<isize, LANES>,
+        idxs: Simd<usize, LANES>,
+    ) {
+        let enable: Mask<isize, LANES> = enable & idxs.lanes_lt(Simd::splat(slice.len()));
+        // SAFETY: We have masked-off out-of-bounds lanes.
+        unsafe { self.scatter_select_unchecked(slice, enable, idxs) }
+    }
+
+    /// Writes the values in a SIMD vector to multiple potentially discontiguous indices in `slice`.
+    /// The mask `enable`s all `true` lanes and disables all `false` lanes.
+    /// If two enabled lanes in the scattered vector would write to the same index,
+    /// only the last lane is guaranteed to actually be written.
+    ///
+    /// # Safety
+    ///
+    /// Calling this function with an enabled out-of-bounds index is *[undefined behavior]*,
+    /// and may lead to memory corruption.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "std")] use core_simd::{Simd, Mask};
+    /// # #[cfg(not(feature = "std"))] use core::simd::{Simd, Mask};
+    /// let mut vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
+    /// let idxs = Simd::from_array([9, 3, 0, 0]);
+    /// let vals = Simd::from_array([-27, 82, -41, 124]);
+    /// let enable = Mask::from_array([true, true, true, false]); // Note the mask of the last lane.
+    /// // If this mask was used to scatter, it would be unsound. Let's fix that.
+    /// let enable = enable & idxs.lanes_lt(Simd::splat(vec.len()));
+    ///
+    /// // We have masked the OOB lane, so it's safe to scatter now.
+    /// unsafe { vals.scatter_select_unchecked(&mut vec, enable, idxs); }
+    /// // index 0's second write is masked, thus was omitted.
+    /// assert_eq!(vec, vec![-41, 11, 12, 82, 14, 15, 16, 17, 18]);
+    /// ```
+    /// [undefined behavior]: https://doc.rust-lang.org/reference/behavior-considered-undefined.html
+    #[inline]
+    pub unsafe fn scatter_select_unchecked(
+        self,
+        slice: &mut [T],
+        enable: Mask<isize, LANES>,
+        idxs: Simd<usize, LANES>,
+    ) {
+        // SAFETY: This block works with *mut T derived from &mut 'a [T],
+        // which means it is delicate in Rust's borrowing model, circa 2021:
+        // &mut 'a [T] asserts uniqueness, so deriving &'a [T] invalidates live *mut Ts!
+        // Even though this block is largely safe methods, it must be exactly this way
+        // to prevent invalidating the raw ptrs while they're live.
+        // Thus, entering this block requires all values to use being already ready:
+        // 0. idxs we want to write to, which are used to construct the mask.
+        // 1. enable, which depends on an initial &'a [T] and the idxs.
+        // 2. actual values to scatter (self).
+        // 3. &mut [T] which will become our base ptr.
+        unsafe {
+            // Now Entering ☢️ *mut T Zone
+            let base_ptr = crate::simd::ptr::SimdMutPtr::splat(slice.as_mut_ptr());
+            // Ferris forgive me, I have done pointer arithmetic here.
+            let ptrs = base_ptr.wrapping_add(idxs);
+            // The ptrs have been bounds-masked to prevent memory-unsafe writes insha'allah
+            intrinsics::simd_scatter(self, ptrs, enable.to_int())
+            // Cleared ☢️ *mut T Zone
+        }
+    }
+}
+
+impl<T, const LANES: usize> Copy for Simd<T, LANES>
+where
+    T: SimdElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+}
+
+impl<T, const LANES: usize> Clone for Simd<T, LANES>
+where
+    T: SimdElement,
+    LaneCount<LANES>: SupportedLaneCount,
+{
+    fn clone(&self) -> Self {
+        *self
+    }
+}
+
+impl<T, const LANES: usize> Default for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement + Default,
+{
+    #[inline]
+    fn default() -> Self {
+        Self::splat(T::default())
+    }
+}
+
+impl<T, const LANES: usize> PartialEq for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement + PartialEq,
+{
+    #[inline]
+    fn eq(&self, other: &Self) -> bool {
+        // TODO use SIMD equality
+        self.to_array() == other.to_array()
+    }
+}
+
+impl<T, const LANES: usize> PartialOrd for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement + PartialOrd,
+{
+    #[inline]
+    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
+        // TODO use SIMD equality
+        self.to_array().partial_cmp(other.as_ref())
+    }
+}
+
+impl<T, const LANES: usize> Eq for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement + Eq,
+{
+}
+
+impl<T, const LANES: usize> Ord for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement + Ord,
+{
+    #[inline]
+    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
+        // TODO use SIMD equality
+        self.to_array().cmp(other.as_ref())
+    }
+}
+
+impl<T, const LANES: usize> core::hash::Hash for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement + core::hash::Hash,
+{
+    #[inline]
+    fn hash<H>(&self, state: &mut H)
+    where
+        H: core::hash::Hasher,
+    {
+        self.as_array().hash(state)
+    }
+}
+
+// array references
+impl<T, const LANES: usize> AsRef<[T; LANES]> for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement,
+{
+    #[inline]
+    fn as_ref(&self) -> &[T; LANES] {
+        &self.0
+    }
+}
+
+impl<T, const LANES: usize> AsMut<[T; LANES]> for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement,
+{
+    #[inline]
+    fn as_mut(&mut self) -> &mut [T; LANES] {
+        &mut self.0
+    }
+}
+
+// slice references
+impl<T, const LANES: usize> AsRef<[T]> for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement,
+{
+    #[inline]
+    fn as_ref(&self) -> &[T] {
+        &self.0
+    }
+}
+
+impl<T, const LANES: usize> AsMut<[T]> for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement,
+{
+    #[inline]
+    fn as_mut(&mut self) -> &mut [T] {
+        &mut self.0
+    }
+}
+
+// vector/array conversion
+impl<T, const LANES: usize> From<[T; LANES]> for Simd<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement,
+{
+    fn from(array: [T; LANES]) -> Self {
+        Self(array)
+    }
+}
+
+impl<T, const LANES: usize> From<Simd<T, LANES>> for [T; LANES]
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: SimdElement,
+{
+    fn from(vector: Simd<T, LANES>) -> Self {
+        vector.to_array()
+    }
+}
+
+mod sealed {
+    pub trait Sealed {}
+}
+use sealed::Sealed;
+
+/// Marker trait for types that may be used as SIMD vector elements.
+/// SAFETY: This trait, when implemented, asserts the compiler can monomorphize
+/// `#[repr(simd)]` structs with the marked type as an element.
+/// Strictly, it is valid to impl if the vector will not be miscompiled.
+/// Practically, it is user-unfriendly to impl it if the vector won't compile,
+/// even when no soundness guarantees are broken by allowing the user to try.
+pub unsafe trait SimdElement: Sealed + Copy {
+    /// The mask element type corresponding to this element type.
+    type Mask: MaskElement;
+}
+
+impl Sealed for u8 {}
+unsafe impl SimdElement for u8 {
+    type Mask = i8;
+}
+
+impl Sealed for u16 {}
+unsafe impl SimdElement for u16 {
+    type Mask = i16;
+}
+
+impl Sealed for u32 {}
+unsafe impl SimdElement for u32 {
+    type Mask = i32;
+}
+
+impl Sealed for u64 {}
+unsafe impl SimdElement for u64 {
+    type Mask = i64;
+}
+
+impl Sealed for usize {}
+unsafe impl SimdElement for usize {
+    type Mask = isize;
+}
+
+impl Sealed for i8 {}
+unsafe impl SimdElement for i8 {
+    type Mask = i8;
+}
+
+impl Sealed for i16 {}
+unsafe impl SimdElement for i16 {
+    type Mask = i16;
+}
+
+impl Sealed for i32 {}
+unsafe impl SimdElement for i32 {
+    type Mask = i32;
+}
+
+impl Sealed for i64 {}
+unsafe impl SimdElement for i64 {
+    type Mask = i64;
+}
+
+impl Sealed for isize {}
+unsafe impl SimdElement for isize {
+    type Mask = isize;
+}
+
+impl Sealed for f32 {}
+unsafe impl SimdElement for f32 {
+    type Mask = i32;
+}
+
+impl Sealed for f64 {}
+unsafe impl SimdElement for f64 {
+    type Mask = i64;
+}
--- a/library/portable-simd/crates/core_simd/src/vector/float.rs
+++ b/library/portable-simd/crates/core_simd/src/vector/float.rs
@ -0,0 +1,210 @@
+#![allow(non_camel_case_types)]
+
+use crate::simd::intrinsics;
+use crate::simd::{LaneCount, Mask, Simd, SupportedLaneCount};
+
+/// Implements inherent methods for a float vector containing multiple
+/// `$lanes` of float `$type`, which uses `$bits_ty` as its binary
+/// representation.
+macro_rules! impl_float_vector {
+    { $type:ty, $bits_ty:ty, $mask_ty:ty } => {
+        impl<const LANES: usize> Simd<$type, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            /// Raw transmutation to an unsigned integer vector type with the
+            /// same size and number of lanes.
+            #[inline]
+            pub fn to_bits(self) -> Simd<$bits_ty, LANES> {
+                assert_eq!(core::mem::size_of::<Self>(), core::mem::size_of::<Simd<$bits_ty, LANES>>());
+                unsafe { core::mem::transmute_copy(&self) }
+            }
+
+            /// Raw transmutation from an unsigned integer vector type with the
+            /// same size and number of lanes.
+            #[inline]
+            pub fn from_bits(bits: Simd<$bits_ty, LANES>) -> Self {
+                assert_eq!(core::mem::size_of::<Self>(), core::mem::size_of::<Simd<$bits_ty, LANES>>());
+                unsafe { core::mem::transmute_copy(&bits) }
+            }
+
+            /// Produces a vector where every lane has the absolute value of the
+            /// equivalently-indexed lane in `self`.
+            #[inline]
+            pub fn abs(self) -> Self {
+                unsafe { intrinsics::simd_fabs(self) }
+            }
+
+            /// Fused multiply-add.  Computes `(self * a) + b` with only one rounding error,
+            /// yielding a more accurate result than an unfused multiply-add.
+            ///
+            /// Using `mul_add` *may* be more performant than an unfused multiply-add if the target
+            /// architecture has a dedicated `fma` CPU instruction.  However, this is not always
+            /// true, and will be heavily dependent on designing algorithms with specific target
+            /// hardware in mind.
+            #[cfg(feature = "std")]
+            #[inline]
+            pub fn mul_add(self, a: Self, b: Self) -> Self {
+                unsafe { intrinsics::simd_fma(self, a, b) }
+            }
+
+            /// Produces a vector where every lane has the square root value
+            /// of the equivalently-indexed lane in `self`
+            #[inline]
+            #[cfg(feature = "std")]
+            pub fn sqrt(self) -> Self {
+                unsafe { intrinsics::simd_fsqrt(self) }
+            }
+
+            /// Takes the reciprocal (inverse) of each lane, `1/x`.
+            #[inline]
+            pub fn recip(self) -> Self {
+                Self::splat(1.0) / self
+            }
+
+            /// Converts each lane from radians to degrees.
+            #[inline]
+            pub fn to_degrees(self) -> Self {
+                // to_degrees uses a special constant for better precision, so extract that constant
+                self * Self::splat(<$type>::to_degrees(1.))
+            }
+
+            /// Converts each lane from degrees to radians.
+            #[inline]
+            pub fn to_radians(self) -> Self {
+                self * Self::splat(<$type>::to_radians(1.))
+            }
+
+            /// Returns true for each lane if it has a positive sign, including
+            /// `+0.0`, `NaN`s with positive sign bit and positive infinity.
+            #[inline]
+            pub fn is_sign_positive(self) -> Mask<$mask_ty, LANES> {
+                !self.is_sign_negative()
+            }
+
+            /// Returns true for each lane if it has a negative sign, including
+            /// `-0.0`, `NaN`s with negative sign bit and negative infinity.
+            #[inline]
+            pub fn is_sign_negative(self) -> Mask<$mask_ty, LANES> {
+                let sign_bits = self.to_bits() & Simd::splat((!0 >> 1) + 1);
+                sign_bits.lanes_gt(Simd::splat(0))
+            }
+
+            /// Returns true for each lane if its value is `NaN`.
+            #[inline]
+            pub fn is_nan(self) -> Mask<$mask_ty, LANES> {
+                self.lanes_ne(self)
+            }
+
+            /// Returns true for each lane if its value is positive infinity or negative infinity.
+            #[inline]
+            pub fn is_infinite(self) -> Mask<$mask_ty, LANES> {
+                self.abs().lanes_eq(Self::splat(<$type>::INFINITY))
+            }
+
+            /// Returns true for each lane if its value is neither infinite nor `NaN`.
+            #[inline]
+            pub fn is_finite(self) -> Mask<$mask_ty, LANES> {
+                self.abs().lanes_lt(Self::splat(<$type>::INFINITY))
+            }
+
+            /// Returns true for each lane if its value is subnormal.
+            #[inline]
+            pub fn is_subnormal(self) -> Mask<$mask_ty, LANES> {
+                self.abs().lanes_ne(Self::splat(0.0)) & (self.to_bits() & Self::splat(<$type>::INFINITY).to_bits()).lanes_eq(Simd::splat(0))
+            }
+
+            /// Returns true for each lane if its value is neither neither zero, infinite,
+            /// subnormal, or `NaN`.
+            #[inline]
+            pub fn is_normal(self) -> Mask<$mask_ty, LANES> {
+                !(self.abs().lanes_eq(Self::splat(0.0)) | self.is_nan() | self.is_subnormal() | self.is_infinite())
+            }
+
+            /// Replaces each lane with a number that represents its sign.
+            ///
+            /// * `1.0` if the number is positive, `+0.0`, or `INFINITY`
+            /// * `-1.0` if the number is negative, `-0.0`, or `NEG_INFINITY`
+            /// * `NAN` if the number is `NAN`
+            #[inline]
+            pub fn signum(self) -> Self {
+                self.is_nan().select(Self::splat(<$type>::NAN), Self::splat(1.0).copysign(self))
+            }
+
+            /// Returns each lane with the magnitude of `self` and the sign of `sign`.
+            ///
+            /// If any lane is a `NAN`, then a `NAN` with the sign of `sign` is returned.
+            #[inline]
+            pub fn copysign(self, sign: Self) -> Self {
+                let sign_bit = sign.to_bits() & Self::splat(-0.).to_bits();
+                let magnitude = self.to_bits() & !Self::splat(-0.).to_bits();
+                Self::from_bits(sign_bit | magnitude)
+            }
+
+            /// Returns the minimum of each lane.
+            ///
+            /// If one of the values is `NAN`, then the other value is returned.
+            #[inline]
+            pub fn min(self, other: Self) -> Self {
+                // TODO consider using an intrinsic
+                self.is_nan().select(
+                    other,
+                    self.lanes_ge(other).select(other, self)
+                )
+            }
+
+            /// Returns the maximum of each lane.
+            ///
+            /// If one of the values is `NAN`, then the other value is returned.
+            #[inline]
+            pub fn max(self, other: Self) -> Self {
+                // TODO consider using an intrinsic
+                self.is_nan().select(
+                    other,
+                    self.lanes_le(other).select(other, self)
+                )
+            }
+
+            /// Restrict each lane to a certain interval unless it is NaN.
+            ///
+            /// For each lane in `self`, returns the corresponding lane in `max` if the lane is
+            /// greater than `max`, and the corresponding lane in `min` if the lane is less
+            /// than `min`.  Otherwise returns the lane in `self`.
+            #[inline]
+            pub fn clamp(self, min: Self, max: Self) -> Self {
+                assert!(
+                    min.lanes_le(max).all(),
+                    "each lane in `min` must be less than or equal to the corresponding lane in `max`",
+                );
+                let mut x = self;
+                x = x.lanes_lt(min).select(min, x);
+                x = x.lanes_gt(max).select(max, x);
+                x
+            }
+        }
+    };
+}
+
+impl_float_vector! { f32, u32, i32 }
+impl_float_vector! { f64, u64, i64 }
+
+/// Vector of two `f32` values
+pub type f32x2 = Simd<f32, 2>;
+
+/// Vector of four `f32` values
+pub type f32x4 = Simd<f32, 4>;
+
+/// Vector of eight `f32` values
+pub type f32x8 = Simd<f32, 8>;
+
+/// Vector of 16 `f32` values
+pub type f32x16 = Simd<f32, 16>;
+
+/// Vector of two `f64` values
+pub type f64x2 = Simd<f64, 2>;
+
+/// Vector of four `f64` values
+pub type f64x4 = Simd<f64, 4>;
+
+/// Vector of eight `f64` values
+pub type f64x8 = Simd<f64, 8>;
--- a/library/portable-simd/crates/core_simd/src/vector/int.rs
+++ b/library/portable-simd/crates/core_simd/src/vector/int.rs
@ -0,0 +1,103 @@
+#![allow(non_camel_case_types)]
+
+use crate::simd::{LaneCount, Mask, Simd, SupportedLaneCount};
+
+/// Implements additional integer traits (Eq, Ord, Hash) on the specified vector `$name`, holding multiple `$lanes` of `$type`.
+macro_rules! impl_integer_vector {
+    { $type:ty } => {
+        impl<const LANES: usize> Simd<$type, LANES>
+        where
+            LaneCount<LANES>: SupportedLaneCount,
+        {
+            /// Returns true for each positive lane and false if it is zero or negative.
+            #[inline]
+            pub fn is_positive(self) -> Mask<$type, LANES> {
+                self.lanes_gt(Self::splat(0))
+            }
+
+            /// Returns true for each negative lane and false if it is zero or positive.
+            #[inline]
+            pub fn is_negative(self) -> Mask<$type, LANES> {
+                self.lanes_lt(Self::splat(0))
+            }
+
+            /// Returns numbers representing the sign of each lane.
+            /// * `0` if the number is zero
+            /// * `1` if the number is positive
+            /// * `-1` if the number is negative
+            #[inline]
+            pub fn signum(self) -> Self {
+                self.is_positive().select(
+                    Self::splat(1),
+                    self.is_negative().select(Self::splat(-1), Self::splat(0))
+                )
+            }
+        }
+    }
+}
+
+impl_integer_vector! { isize }
+impl_integer_vector! { i16 }
+impl_integer_vector! { i32 }
+impl_integer_vector! { i64 }
+impl_integer_vector! { i8 }
+
+/// Vector of two `isize` values
+pub type isizex2 = Simd<isize, 2>;
+
+/// Vector of four `isize` values
+pub type isizex4 = Simd<isize, 4>;
+
+/// Vector of eight `isize` values
+pub type isizex8 = Simd<isize, 8>;
+
+/// Vector of two `i16` values
+pub type i16x2 = Simd<i16, 2>;
+
+/// Vector of four `i16` values
+pub type i16x4 = Simd<i16, 4>;
+
+/// Vector of eight `i16` values
+pub type i16x8 = Simd<i16, 8>;
+
+/// Vector of 16 `i16` values
+pub type i16x16 = Simd<i16, 16>;
+
+/// Vector of 32 `i16` values
+pub type i16x32 = Simd<i16, 32>;
+
+/// Vector of two `i32` values
+pub type i32x2 = Simd<i32, 2>;
+
+/// Vector of four `i32` values
+pub type i32x4 = Simd<i32, 4>;
+
+/// Vector of eight `i32` values
+pub type i32x8 = Simd<i32, 8>;
+
+/// Vector of 16 `i32` values
+pub type i32x16 = Simd<i32, 16>;
+
+/// Vector of two `i64` values
+pub type i64x2 = Simd<i64, 2>;
+
+/// Vector of four `i64` values
+pub type i64x4 = Simd<i64, 4>;
+
+/// Vector of eight `i64` values
+pub type i64x8 = Simd<i64, 8>;
+
+/// Vector of four `i8` values
+pub type i8x4 = Simd<i8, 4>;
+
+/// Vector of eight `i8` values
+pub type i8x8 = Simd<i8, 8>;
+
+/// Vector of 16 `i8` values
+pub type i8x16 = Simd<i8, 16>;
+
+/// Vector of 32 `i8` values
+pub type i8x32 = Simd<i8, 32>;
+
+/// Vector of 64 `i8` values
+pub type i8x64 = Simd<i8, 64>;
--- a/library/portable-simd/crates/core_simd/src/vector/ptr.rs
+++ b/library/portable-simd/crates/core_simd/src/vector/ptr.rs
@ -0,0 +1,55 @@
+//! Private implementation details of public gather/scatter APIs.
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+use core::mem;
+
+/// A vector of *const T.
+#[derive(Debug, Copy, Clone)]
+#[repr(simd)]
+pub(crate) struct SimdConstPtr<T, const LANES: usize>([*const T; LANES]);
+
+impl<T, const LANES: usize> SimdConstPtr<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: Sized,
+{
+    #[inline]
+    #[must_use]
+    pub fn splat(ptr: *const T) -> Self {
+        Self([ptr; LANES])
+    }
+
+    #[inline]
+    #[must_use]
+    pub fn wrapping_add(self, addend: Simd<usize, LANES>) -> Self {
+        unsafe {
+            let x: Simd<usize, LANES> = mem::transmute_copy(&self);
+            mem::transmute_copy(&{ x + (addend * mem::size_of::<T>()) })
+        }
+    }
+}
+
+/// A vector of *mut T. Be very careful around potential aliasing.
+#[derive(Debug, Copy, Clone)]
+#[repr(simd)]
+pub(crate) struct SimdMutPtr<T, const LANES: usize>([*mut T; LANES]);
+
+impl<T, const LANES: usize> SimdMutPtr<T, LANES>
+where
+    LaneCount<LANES>: SupportedLaneCount,
+    T: Sized,
+{
+    #[inline]
+    #[must_use]
+    pub fn splat(ptr: *mut T) -> Self {
+        Self([ptr; LANES])
+    }
+
+    #[inline]
+    #[must_use]
+    pub fn wrapping_add(self, addend: Simd<usize, LANES>) -> Self {
+        unsafe {
+            let x: Simd<usize, LANES> = mem::transmute_copy(&self);
+            mem::transmute_copy(&{ x + (addend * mem::size_of::<T>()) })
+        }
+    }
+}
--- a/library/portable-simd/crates/core_simd/src/vector/uint.rs
+++ b/library/portable-simd/crates/core_simd/src/vector/uint.rs
@ -0,0 +1,63 @@
+#![allow(non_camel_case_types)]
+
+use crate::simd::Simd;
+
+/// Vector of two `usize` values
+pub type usizex2 = Simd<usize, 2>;
+
+/// Vector of four `usize` values
+pub type usizex4 = Simd<usize, 4>;
+
+/// Vector of eight `usize` values
+pub type usizex8 = Simd<usize, 8>;
+
+/// Vector of two `u16` values
+pub type u16x2 = Simd<u16, 2>;
+
+/// Vector of four `u16` values
+pub type u16x4 = Simd<u16, 4>;
+
+/// Vector of eight `u16` values
+pub type u16x8 = Simd<u16, 8>;
+
+/// Vector of 16 `u16` values
+pub type u16x16 = Simd<u16, 16>;
+
+/// Vector of 32 `u16` values
+pub type u16x32 = Simd<u16, 32>;
+
+/// Vector of two `u32` values
+pub type u32x2 = Simd<u32, 2>;
+
+/// Vector of four `u32` values
+pub type u32x4 = Simd<u32, 4>;
+
+/// Vector of eight `u32` values
+pub type u32x8 = Simd<u32, 8>;
+
+/// Vector of 16 `u32` values
+pub type u32x16 = Simd<u32, 16>;
+
+/// Vector of two `u64` values
+pub type u64x2 = Simd<u64, 2>;
+
+/// Vector of four `u64` values
+pub type u64x4 = Simd<u64, 4>;
+
+/// Vector of eight `u64` values
+pub type u64x8 = Simd<u64, 8>;
+
+/// Vector of four `u8` values
+pub type u8x4 = Simd<u8, 4>;
+
+/// Vector of eight `u8` values
+pub type u8x8 = Simd<u8, 8>;
+
+/// Vector of 16 `u8` values
+pub type u8x16 = Simd<u8, 16>;
+
+/// Vector of 32 `u8` values
+pub type u8x32 = Simd<u8, 32>;
+
+/// Vector of 64 `u8` values
+pub type u8x64 = Simd<u8, 64>;
--- a/library/portable-simd/crates/core_simd/src/vendor.rs
+++ b/library/portable-simd/crates/core_simd/src/vendor.rs
@ -0,0 +1,29 @@
+/// Provides implementations of `From<$a> for $b` and `From<$b> for $a` that transmutes the value.
+#[allow(unused)]
+macro_rules! from_transmute {
+    { unsafe $a:ty => $b:ty } => {
+        from_transmute!{ @impl $a => $b }
+        from_transmute!{ @impl $b => $a }
+    };
+    { @impl $from:ty => $to:ty } => {
+        impl core::convert::From<$from> for $to {
+            #[inline]
+            fn from(value: $from) -> $to {
+                unsafe { core::mem::transmute(value) }
+            }
+        }
+    };
+}
+
+/// Conversions to x86's SIMD types.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod x86;
+
+#[cfg(any(target_arch = "wasm32"))]
+mod wasm32;
+
+#[cfg(any(target_arch = "aarch64", target_arch = "arm",))]
+mod arm;
+
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+mod powerpc;
--- a/library/portable-simd/crates/core_simd/src/vendor/arm.rs
+++ b/library/portable-simd/crates/core_simd/src/vendor/arm.rs
@ -0,0 +1,76 @@
+#![allow(unused)]
+use crate::simd::*;
+
+#[cfg(target_arch = "arm")]
+use core::arch::arm::*;
+
+#[cfg(target_arch = "aarch64")]
+use core::arch::aarch64::*;
+
+#[cfg(any(
+    target_arch = "aarch64",
+    all(target_arch = "arm", target_feature = "v7"),
+))]
+mod neon {
+    use super::*;
+
+    from_transmute! { unsafe f32x2 => float32x2_t }
+    from_transmute! { unsafe f32x4 => float32x4_t }
+
+    from_transmute! { unsafe u8x8 => uint8x8_t }
+    from_transmute! { unsafe u8x16 => uint8x16_t }
+    from_transmute! { unsafe i8x8 => int8x8_t }
+    from_transmute! { unsafe i8x16 => int8x16_t }
+    from_transmute! { unsafe u8x8 => poly8x8_t }
+    from_transmute! { unsafe u8x16 => poly8x16_t }
+
+    from_transmute! { unsafe u16x4 => uint16x4_t }
+    from_transmute! { unsafe u16x8 => uint16x8_t }
+    from_transmute! { unsafe i16x4 => int16x4_t }
+    from_transmute! { unsafe i16x8 => int16x8_t }
+    from_transmute! { unsafe u16x4 => poly16x4_t }
+    from_transmute! { unsafe u16x8 => poly16x8_t }
+
+    from_transmute! { unsafe u32x2 => uint32x2_t }
+    from_transmute! { unsafe u32x4 => uint32x4_t }
+    from_transmute! { unsafe i32x2 => int32x2_t }
+    from_transmute! { unsafe i32x4 => int32x4_t }
+
+    from_transmute! { unsafe Simd<u64, 1> => uint64x1_t }
+    from_transmute! { unsafe u64x2 => uint64x2_t }
+    from_transmute! { unsafe Simd<i64, 1> => int64x1_t }
+    from_transmute! { unsafe i64x2 => int64x2_t }
+    from_transmute! { unsafe Simd<u64, 1> => poly64x1_t }
+    from_transmute! { unsafe u64x2 => poly64x2_t }
+}
+
+#[cfg(any(
+    all(target_feature = "v5te", not(target_feature = "mclass")),
+    all(target_feature = "mclass", target_feature = "dsp"),
+))]
+mod dsp {
+    use super::*;
+
+    from_transmute! { unsafe Simd<u16, 2> => uint16x2_t }
+    from_transmute! { unsafe Simd<i16, 2> => int16x2_t }
+}
+
+#[cfg(any(
+    all(target_feature = "v6", not(target_feature = "mclass")),
+    all(target_feature = "mclass", target_feature = "dsp"),
+))]
+mod simd32 {
+    use super::*;
+
+    from_transmute! { unsafe Simd<u8, 4> => uint8x4_t }
+    from_transmute! { unsafe Simd<i8, 4> => int8x4_t }
+}
+
+#[cfg(target_arch = "aarch64")]
+mod aarch64 {
+    use super::neon::*;
+    use super::*;
+
+    from_transmute! { unsafe Simd<f64, 1> => float64x1_t }
+    from_transmute! { unsafe f64x2 => float64x2_t }
+}
--- a/library/portable-simd/crates/core_simd/src/vendor/powerpc.rs
+++ b/library/portable-simd/crates/core_simd/src/vendor/powerpc.rs
@ -0,0 +1,11 @@
+use crate::simd::*;
+
+#[cfg(target_arch = "powerpc")]
+use core::arch::powerpc::*;
+
+#[cfg(target_arch = "powerpc64")]
+use core::arch::powerpc64::*;
+
+from_transmute! { unsafe f64x2 => vector_double }
+from_transmute! { unsafe i64x2 => vector_signed_long }
+from_transmute! { unsafe u64x2 => vector_unsigned_long }
--- a/library/portable-simd/crates/core_simd/src/vendor/wasm32.rs
+++ b/library/portable-simd/crates/core_simd/src/vendor/wasm32.rs
@ -0,0 +1,30 @@
+use crate::simd::*;
+use core::arch::wasm32::v128;
+
+from_transmute! { unsafe u8x16 => v128 }
+from_transmute! { unsafe i8x16 => v128 }
+
+from_transmute! { unsafe u16x8 => v128 }
+from_transmute! { unsafe i16x8 => v128 }
+
+from_transmute! { unsafe u32x4 => v128 }
+from_transmute! { unsafe i32x4 => v128 }
+from_transmute! { unsafe f32x4 => v128 }
+
+from_transmute! { unsafe u64x2 => v128 }
+from_transmute! { unsafe i64x2 => v128 }
+from_transmute! { unsafe f64x2 => v128 }
+
+#[cfg(target_pointer_width = "32")]
+mod p32 {
+    use super::*;
+    from_transmute! { unsafe usizex4 => v128 }
+    from_transmute! { unsafe isizex4 => v128 }
+}
+
+#[cfg(target_pointer_width = "64")]
+mod p64 {
+    use super::*;
+    from_transmute! { unsafe usizex2 => v128 }
+    from_transmute! { unsafe isizex2 => v128 }
+}
--- a/library/portable-simd/crates/core_simd/src/vendor/x86.rs
+++ b/library/portable-simd/crates/core_simd/src/vendor/x86.rs
@ -0,0 +1,63 @@
+use crate::simd::*;
+
+#[cfg(any(target_arch = "x86"))]
+use core::arch::x86::*;
+
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+from_transmute! { unsafe u8x16 => __m128i }
+from_transmute! { unsafe u8x32 => __m256i }
+//from_transmute! { unsafe u8x64 => __m512i }
+from_transmute! { unsafe i8x16 => __m128i }
+from_transmute! { unsafe i8x32 => __m256i }
+//from_transmute! { unsafe i8x64 => __m512i }
+
+from_transmute! { unsafe u16x8 => __m128i }
+from_transmute! { unsafe u16x16 => __m256i }
+from_transmute! { unsafe u16x32 => __m512i }
+from_transmute! { unsafe i16x8 => __m128i }
+from_transmute! { unsafe i16x16 => __m256i }
+from_transmute! { unsafe i16x32 => __m512i }
+
+from_transmute! { unsafe u32x4 => __m128i }
+from_transmute! { unsafe u32x8 => __m256i }
+from_transmute! { unsafe u32x16 => __m512i }
+from_transmute! { unsafe i32x4 => __m128i }
+from_transmute! { unsafe i32x8 => __m256i }
+from_transmute! { unsafe i32x16 => __m512i }
+from_transmute! { unsafe f32x4 => __m128 }
+from_transmute! { unsafe f32x8 => __m256 }
+from_transmute! { unsafe f32x16 => __m512 }
+
+from_transmute! { unsafe u64x2 => __m128i }
+from_transmute! { unsafe u64x4 => __m256i }
+from_transmute! { unsafe u64x8 => __m512i }
+from_transmute! { unsafe i64x2 => __m128i }
+from_transmute! { unsafe i64x4 => __m256i }
+from_transmute! { unsafe i64x8 => __m512i }
+from_transmute! { unsafe f64x2 => __m128d }
+from_transmute! { unsafe f64x4 => __m256d }
+from_transmute! { unsafe f64x8 => __m512d }
+
+#[cfg(target_pointer_width = "32")]
+mod p32 {
+    use super::*;
+    from_transmute! { unsafe usizex4 => __m128i }
+    from_transmute! { unsafe usizex8 => __m256i }
+    from_transmute! { unsafe Simd<usize, 16> => __m512i }
+    from_transmute! { unsafe isizex4 => __m128i }
+    from_transmute! { unsafe isizex8 => __m256i }
+    from_transmute! { unsafe Simd<isize, 16> => __m512i }
+}
+
+#[cfg(target_pointer_width = "64")]
+mod p64 {
+    use super::*;
+    from_transmute! { unsafe usizex2 => __m128i }
+    from_transmute! { unsafe usizex4 => __m256i }
+    from_transmute! { unsafe usizex8 => __m512i }
+    from_transmute! { unsafe isizex2 => __m128i }
+    from_transmute! { unsafe isizex4 => __m256i }
+    from_transmute! { unsafe isizex8 => __m512i }
+}
--- a/library/portable-simd/crates/core_simd/tests/f32_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/f32_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_float_tests! { f32, i32 }
--- a/library/portable-simd/crates/core_simd/tests/f64_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/f64_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_float_tests! { f64, i64 }
--- a/library/portable-simd/crates/core_simd/tests/i16_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/i16_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_signed_tests! { i16 }
--- a/library/portable-simd/crates/core_simd/tests/i32_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/i32_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_signed_tests! { i32 }
--- a/library/portable-simd/crates/core_simd/tests/i64_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/i64_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_signed_tests! { i64 }
--- a/library/portable-simd/crates/core_simd/tests/i8_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/i8_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_signed_tests! { i8 }
--- a/library/portable-simd/crates/core_simd/tests/isize_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/isize_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_signed_tests! { isize }
--- a/library/portable-simd/crates/core_simd/tests/mask_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/mask_ops.rs
@ -0,0 +1,3 @@
+#![feature(portable_simd)]
+
+mod mask_ops_impl;
--- a/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask16.rs
+++ b/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask16.rs
@ -0,0 +1,4 @@
+mask_tests! { mask16x4, 4 }
+mask_tests! { mask16x8, 8 }
+mask_tests! { mask16x16, 16 }
+mask_tests! { mask16x32, 32 }
--- a/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask32.rs
+++ b/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask32.rs
@ -0,0 +1,4 @@
+mask_tests! { mask32x2, 2 }
+mask_tests! { mask32x4, 4 }
+mask_tests! { mask32x8, 8 }
+mask_tests! { mask32x16, 16 }
--- a/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask64.rs
+++ b/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask64.rs
@ -0,0 +1,3 @@
+mask_tests! { mask64x2, 2 }
+mask_tests! { mask64x4, 4 }
+mask_tests! { mask64x8, 8 }
--- a/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask8.rs
+++ b/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask8.rs
@ -0,0 +1,3 @@
+mask_tests! { mask8x8, 8 }
+mask_tests! { mask8x16, 16 }
+mask_tests! { mask8x32, 32 }
--- a/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask_macros.rs
+++ b/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mask_macros.rs
@ -0,0 +1,225 @@
+macro_rules! mask_tests {
+    { $vector:ident, $lanes:literal } => {
+        #[cfg(test)]
+        mod $vector {
+            use core_simd::$vector as Vector;
+            const LANES: usize = $lanes;
+
+            #[cfg(target_arch = "wasm32")]
+            use wasm_bindgen_test::*;
+
+            #[cfg(target_arch = "wasm32")]
+            wasm_bindgen_test_configure!(run_in_browser);
+
+            fn from_slice(slice: &[bool]) -> Vector {
+                let mut value = Vector::default();
+                for (i, b) in slice.iter().take(LANES).enumerate() {
+                    value.set(i, *b);
+                }
+                value
+            }
+
+            fn apply_unary_lanewise(x: Vector, f: impl Fn(bool) -> bool) -> Vector {
+                let mut value = Vector::default();
+                for i in 0..LANES {
+                    value.set(i, f(x.test(i)));
+                }
+                value
+            }
+
+            fn apply_binary_lanewise(x: Vector, y: Vector, f: impl Fn(bool, bool) -> bool) -> Vector {
+                let mut value = Vector::default();
+                for i in 0..LANES {
+                    value.set(i, f(x.test(i), y.test(i)));
+                }
+                value
+            }
+
+            fn apply_binary_scalar_lhs_lanewise(x: bool, mut y: Vector, f: impl Fn(bool, bool) -> bool) -> Vector {
+                for i in 0..LANES {
+                    y.set(i, f(x, y.test(i)));
+                }
+                y
+            }
+
+            fn apply_binary_scalar_rhs_lanewise(mut x: Vector, y: bool, f: impl Fn(bool, bool) -> bool) -> Vector {
+                for i in 0..LANES {
+                    x.set(i, f(x.test(i), y));
+                }
+                x
+            }
+
+            const A: [bool; 64] = [
+                false, true, false, true, false, false, true, true,
+                false, true, false, true, false, false, true, true,
+                false, true, false, true, false, false, true, true,
+                false, true, false, true, false, false, true, true,
+                false, true, false, true, false, false, true, true,
+                false, true, false, true, false, false, true, true,
+                false, true, false, true, false, false, true, true,
+                false, true, false, true, false, false, true, true,
+            ];
+            const B: [bool; 64] = [
+                false, false, true, true, false, true, false, true,
+                false, false, true, true, false, true, false, true,
+                false, false, true, true, false, true, false, true,
+                false, false, true, true, false, true, false, true,
+                false, false, true, true, false, true, false, true,
+                false, false, true, true, false, true, false, true,
+                false, false, true, true, false, true, false, true,
+                false, false, true, true, false, true, false, true,
+            ];
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitand() {
+                let a = from_slice(&A);
+                let b = from_slice(&B);
+                let expected = apply_binary_lanewise(a, b, core::ops::BitAnd::bitand);
+                assert_eq!(a & b, expected);
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitand_assign() {
+                let mut a = from_slice(&A);
+                let b = from_slice(&B);
+                let expected = apply_binary_lanewise(a, b, core::ops::BitAnd::bitand);
+                a &= b;
+                assert_eq!(a, expected);
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitand_scalar_rhs() {
+                let a = from_slice(&A);
+                let expected = a;
+                assert_eq!(a & true, expected);
+                assert_eq!(a & false, Vector::splat(false));
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitand_scalar_lhs() {
+                let a = from_slice(&A);
+                let expected = a;
+                assert_eq!(true & a, expected);
+                assert_eq!(false & a, Vector::splat(false));
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitand_assign_scalar() {
+                let mut a = from_slice(&A);
+                let expected = a;
+                a &= true;
+                assert_eq!(a, expected);
+                a &= false;
+                assert_eq!(a, Vector::splat(false));
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitor() {
+                let a = from_slice(&A);
+                let b = from_slice(&B);
+                let expected = apply_binary_lanewise(a, b, core::ops::BitOr::bitor);
+                assert_eq!(a | b, expected);
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitor_assign() {
+                let mut a = from_slice(&A);
+                let b = from_slice(&B);
+                let expected = apply_binary_lanewise(a, b, core::ops::BitOr::bitor);
+                a |= b;
+                assert_eq!(a, expected);
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitor_scalar_rhs() {
+                let a = from_slice(&A);
+                assert_eq!(a | false, a);
+                assert_eq!(a | true, Vector::splat(true));
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitor_scalar_lhs() {
+                let a = from_slice(&A);
+                assert_eq!(false | a, a);
+                assert_eq!(true | a, Vector::splat(true));
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitor_assign_scalar() {
+                let mut a = from_slice(&A);
+                let expected = a;
+                a |= false;
+                assert_eq!(a, expected);
+                a |= true;
+                assert_eq!(a, Vector::splat(true));
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitxor() {
+                let a = from_slice(&A);
+                let b = from_slice(&B);
+                let expected = apply_binary_lanewise(a, b, core::ops::BitXor::bitxor);
+                assert_eq!(a ^ b, expected);
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitxor_assign() {
+                let mut a = from_slice(&A);
+                let b = from_slice(&B);
+                let expected = apply_binary_lanewise(a, b, core::ops::BitXor::bitxor);
+                a ^= b;
+                assert_eq!(a, expected);
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitxor_scalar_rhs() {
+                let a = from_slice(&A);
+                let expected = apply_binary_scalar_rhs_lanewise(a, true, core::ops::BitXor::bitxor);
+                assert_eq!(a ^ false, a);
+                assert_eq!(a ^ true, expected);
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitxor_scalar_lhs() {
+                let a = from_slice(&A);
+                let expected = apply_binary_scalar_lhs_lanewise(true, a, core::ops::BitXor::bitxor);
+                assert_eq!(false ^ a, a);
+                assert_eq!(true ^ a, expected);
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn bitxor_assign_scalar() {
+                let mut a = from_slice(&A);
+                let expected_unset = a;
+                let expected_set = apply_binary_scalar_rhs_lanewise(a, true, core::ops::BitXor::bitxor);
+                a ^= false;
+                assert_eq!(a, expected_unset);
+                a ^= true;
+                assert_eq!(a, expected_set);
+            }
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn not() {
+                let v = from_slice(&A);
+                let expected = apply_unary_lanewise(v, core::ops::Not::not);
+                assert_eq!(!v, expected);
+            }
+        }
+    }
+}
--- a/library/portable-simd/crates/core_simd/tests/mask_ops_impl/masksize.rs
+++ b/library/portable-simd/crates/core_simd/tests/mask_ops_impl/masksize.rs
@ -0,0 +1,3 @@
+mask_tests! { masksizex2, 2 }
+mask_tests! { masksizex4, 4 }
+mask_tests! { masksizex8, 8 }
--- a/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mod.rs
+++ b/library/portable-simd/crates/core_simd/tests/mask_ops_impl/mod.rs
@ -0,0 +1,9 @@
+#[macro_use]
+mod mask_macros;
+
+#[rustfmt::skip]
+mod mask8;
+mod mask16;
+mod mask32;
+mod mask64;
+mod masksize;
--- a/library/portable-simd/crates/core_simd/tests/masks.rs
+++ b/library/portable-simd/crates/core_simd/tests/masks.rs
@ -0,0 +1,102 @@
+#![feature(portable_simd)]
+
+#[cfg(target_arch = "wasm32")]
+use wasm_bindgen_test::*;
+
+#[cfg(target_arch = "wasm32")]
+wasm_bindgen_test_configure!(run_in_browser);
+
+macro_rules! test_mask_api {
+    { $type:ident } => {
+        #[allow(non_snake_case)]
+        mod $type {
+            #[cfg(target_arch = "wasm32")]
+            use wasm_bindgen_test::*;
+
+            #[test]
+            #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+            fn set_and_test() {
+                let values = [true, false, false, true, false, false, true, false];
+                let mut mask = core_simd::Mask::<$type, 8>::splat(false);
+                for (lane, value) in values.iter().copied().enumerate() {
+                    mask.set(lane, value);
+                }
+                for (lane, value) in values.iter().copied().enumerate() {
+                    assert_eq!(mask.test(lane), value);
+                }
+            }
+
+            #[test]
+            #[should_panic]
+            fn set_invalid_lane() {
+                let mut mask = core_simd::Mask::<$type, 8>::splat(false);
+                mask.set(8, true);
+                let _ = mask;
+            }
+
+            #[test]
+            #[should_panic]
+            fn test_invalid_lane() {
+                let mask = core_simd::Mask::<$type, 8>::splat(false);
+                let _ = mask.test(8);
+            }
+
+            #[test]
+            fn any() {
+                assert!(!core_simd::Mask::<$type, 8>::splat(false).any());
+                assert!(core_simd::Mask::<$type, 8>::splat(true).any());
+                let mut v = core_simd::Mask::<$type, 8>::splat(false);
+                v.set(2, true);
+                assert!(v.any());
+            }
+
+            #[test]
+            fn all() {
+                assert!(!core_simd::Mask::<$type, 8>::splat(false).all());
+                assert!(core_simd::Mask::<$type, 8>::splat(true).all());
+                let mut v = core_simd::Mask::<$type, 8>::splat(false);
+                v.set(2, true);
+                assert!(!v.all());
+            }
+
+            #[test]
+            fn roundtrip_int_conversion() {
+                let values = [true, false, false, true, false, false, true, false];
+                let mask = core_simd::Mask::<$type, 8>::from_array(values);
+                let int = mask.to_int();
+                assert_eq!(int.to_array(), [-1, 0, 0, -1, 0, 0, -1, 0]);
+                assert_eq!(core_simd::Mask::<$type, 8>::from_int(int), mask);
+            }
+
+            #[cfg(feature = "generic_const_exprs")]
+            #[test]
+            fn roundtrip_bitmask_conversion() {
+                let values = [
+                    true, false, false, true, false, false, true, false,
+                    true, true, false, false, false, false, false, true,
+                ];
+                let mask = core_simd::Mask::<$type, 16>::from_array(values);
+                let bitmask = mask.to_bitmask();
+                assert_eq!(bitmask, [0b01001001, 0b10000011]);
+                assert_eq!(core_simd::Mask::<$type, 16>::from_bitmask(bitmask), mask);
+            }
+        }
+    }
+}
+
+mod mask_api {
+    test_mask_api! { i8 }
+    test_mask_api! { i16 }
+    test_mask_api! { i32 }
+    test_mask_api! { i64 }
+    test_mask_api! { isize }
+}
+
+#[test]
+fn convert() {
+    let values = [true, false, false, true, false, false, true, false];
+    assert_eq!(
+        core_simd::Mask::<i8, 8>::from_array(values),
+        core_simd::Mask::<i32, 8>::from_array(values).into()
+    );
+}
--- a/library/portable-simd/crates/core_simd/tests/ops_macros.rs
+++ b/library/portable-simd/crates/core_simd/tests/ops_macros.rs
@ -0,0 +1,618 @@
+/// Implements a test on a unary operation using proptest.
+///
+/// Compares the vector operation to the equivalent scalar operation.
+#[macro_export]
+macro_rules! impl_unary_op_test {
+    { $scalar:ty, $trait:ident :: $fn:ident, $scalar_fn:expr } => {
+        test_helpers::test_lanes! {
+            fn $fn<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &<core_simd::Simd<$scalar, LANES> as core::ops::$trait>::$fn,
+                    &$scalar_fn,
+                    &|_| true,
+                );
+            }
+        }
+    };
+    { $scalar:ty, $trait:ident :: $fn:ident } => {
+        impl_unary_op_test! { $scalar, $trait::$fn, <$scalar as core::ops::$trait>::$fn }
+    };
+}
+
+/// Implements a test on a binary operation using proptest.
+///
+/// Compares the vector operation to the equivalent scalar operation.
+#[macro_export]
+macro_rules! impl_binary_op_test {
+    { $scalar:ty, $trait:ident :: $fn:ident, $trait_assign:ident :: $fn_assign:ident, $scalar_fn:expr } => {
+        mod $fn {
+            use super::*;
+            use core_simd::Simd;
+
+            test_helpers::test_lanes! {
+                fn normal<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &<Simd<$scalar, LANES> as core::ops::$trait>::$fn,
+                        &$scalar_fn,
+                        &|_, _| true,
+                    );
+                }
+
+                fn scalar_rhs<const LANES: usize>() {
+                    test_helpers::test_binary_scalar_rhs_elementwise(
+                        &<Simd<$scalar, LANES> as core::ops::$trait<$scalar>>::$fn,
+                        &$scalar_fn,
+                        &|_, _| true,
+                    );
+                }
+
+                fn scalar_lhs<const LANES: usize>() {
+                    test_helpers::test_binary_scalar_lhs_elementwise(
+                        &<$scalar as core::ops::$trait<Simd<$scalar, LANES>>>::$fn,
+                        &$scalar_fn,
+                        &|_, _| true,
+                    );
+                }
+
+                fn assign<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign>::$fn_assign(&mut a, b); a },
+                        &$scalar_fn,
+                        &|_, _| true,
+                    );
+                }
+
+                fn assign_scalar_rhs<const LANES: usize>() {
+                    test_helpers::test_binary_scalar_rhs_elementwise(
+                        &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign<$scalar>>::$fn_assign(&mut a, b); a },
+                        &$scalar_fn,
+                        &|_, _| true,
+                    );
+                }
+            }
+        }
+    };
+    { $scalar:ty, $trait:ident :: $fn:ident, $trait_assign:ident :: $fn_assign:ident } => {
+        impl_binary_op_test! { $scalar, $trait::$fn, $trait_assign::$fn_assign, <$scalar as core::ops::$trait>::$fn }
+    };
+}
+
+/// Implements a test on a binary operation using proptest.
+///
+/// Like `impl_binary_op_test`, but allows providing a function for rejecting particular inputs
+/// (like the `proptest_assume` macro).
+///
+/// Compares the vector operation to the equivalent scalar operation.
+#[macro_export]
+macro_rules! impl_binary_checked_op_test {
+    { $scalar:ty, $trait:ident :: $fn:ident, $trait_assign:ident :: $fn_assign:ident, $scalar_fn:expr, $check_fn:expr } => {
+        mod $fn {
+            use super::*;
+            use core_simd::Simd;
+
+            test_helpers::test_lanes! {
+                fn normal<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &<Simd<$scalar, LANES> as core::ops::$trait>::$fn,
+                        &$scalar_fn,
+                        &|x, y| x.iter().zip(y.iter()).all(|(x, y)| $check_fn(*x, *y)),
+                    );
+                }
+
+                fn scalar_rhs<const LANES: usize>() {
+                    test_helpers::test_binary_scalar_rhs_elementwise(
+                        &<Simd<$scalar, LANES> as core::ops::$trait<$scalar>>::$fn,
+                        &$scalar_fn,
+                        &|x, y| x.iter().all(|x| $check_fn(*x, y)),
+                    );
+                }
+
+                fn scalar_lhs<const LANES: usize>() {
+                    test_helpers::test_binary_scalar_lhs_elementwise(
+                        &<$scalar as core::ops::$trait<Simd<$scalar, LANES>>>::$fn,
+                        &$scalar_fn,
+                        &|x, y| y.iter().all(|y| $check_fn(x, *y)),
+                    );
+                }
+
+                fn assign<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign>::$fn_assign(&mut a, b); a },
+                        &$scalar_fn,
+                        &|x, y| x.iter().zip(y.iter()).all(|(x, y)| $check_fn(*x, *y)),
+                    )
+                }
+
+                fn assign_scalar_rhs<const LANES: usize>() {
+                    test_helpers::test_binary_scalar_rhs_elementwise(
+                        &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign<$scalar>>::$fn_assign(&mut a, b); a },
+                        &$scalar_fn,
+                        &|x, y| x.iter().all(|x| $check_fn(*x, y)),
+                    )
+                }
+            }
+        }
+    };
+    { $scalar:ty, $trait:ident :: $fn:ident, $trait_assign:ident :: $fn_assign:ident, $check_fn:expr } => {
+        impl_binary_checked_op_test! { $scalar, $trait::$fn, $trait_assign::$fn_assign, <$scalar as core::ops::$trait>::$fn, $check_fn }
+    };
+}
+
+#[macro_export]
+macro_rules! impl_common_integer_tests {
+    { $vector:ident, $scalar:ident } => {
+        test_helpers::test_lanes! {
+            fn horizontal_sum<const LANES: usize>() {
+                test_helpers::test_1(&|x| {
+                    test_helpers::prop_assert_biteq! (
+                        $vector::<LANES>::from_array(x).horizontal_sum(),
+                        x.iter().copied().fold(0 as $scalar, $scalar::wrapping_add),
+                    );
+                    Ok(())
+                });
+            }
+
+            fn horizontal_product<const LANES: usize>() {
+                test_helpers::test_1(&|x| {
+                    test_helpers::prop_assert_biteq! (
+                        $vector::<LANES>::from_array(x).horizontal_product(),
+                        x.iter().copied().fold(1 as $scalar, $scalar::wrapping_mul),
+                    );
+                    Ok(())
+                });
+            }
+
+            fn horizontal_and<const LANES: usize>() {
+                test_helpers::test_1(&|x| {
+                    test_helpers::prop_assert_biteq! (
+                        $vector::<LANES>::from_array(x).horizontal_and(),
+                        x.iter().copied().fold(-1i8 as $scalar, <$scalar as core::ops::BitAnd>::bitand),
+                    );
+                    Ok(())
+                });
+            }
+
+            fn horizontal_or<const LANES: usize>() {
+                test_helpers::test_1(&|x| {
+                    test_helpers::prop_assert_biteq! (
+                        $vector::<LANES>::from_array(x).horizontal_or(),
+                        x.iter().copied().fold(0 as $scalar, <$scalar as core::ops::BitOr>::bitor),
+                    );
+                    Ok(())
+                });
+            }
+
+            fn horizontal_xor<const LANES: usize>() {
+                test_helpers::test_1(&|x| {
+                    test_helpers::prop_assert_biteq! (
+                        $vector::<LANES>::from_array(x).horizontal_xor(),
+                        x.iter().copied().fold(0 as $scalar, <$scalar as core::ops::BitXor>::bitxor),
+                    );
+                    Ok(())
+                });
+            }
+
+            fn horizontal_max<const LANES: usize>() {
+                test_helpers::test_1(&|x| {
+                    test_helpers::prop_assert_biteq! (
+                        $vector::<LANES>::from_array(x).horizontal_max(),
+                        x.iter().copied().max().unwrap(),
+                    );
+                    Ok(())
+                });
+            }
+
+            fn horizontal_min<const LANES: usize>() {
+                test_helpers::test_1(&|x| {
+                    test_helpers::prop_assert_biteq! (
+                        $vector::<LANES>::from_array(x).horizontal_min(),
+                        x.iter().copied().min().unwrap(),
+                    );
+                    Ok(())
+                });
+            }
+        }
+    }
+}
+
+/// Implement tests for signed integers.
+#[macro_export]
+macro_rules! impl_signed_tests {
+    { $scalar:tt } => {
+        mod $scalar {
+            type Vector<const LANES: usize> = core_simd::Simd<Scalar, LANES>;
+            type Scalar = $scalar;
+
+            impl_common_integer_tests! { Vector, Scalar }
+
+            test_helpers::test_lanes! {
+                fn neg<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &<Vector::<LANES> as core::ops::Neg>::neg,
+                        &<Scalar as core::ops::Neg>::neg,
+                        &|x| !x.contains(&Scalar::MIN),
+                    );
+                }
+
+                fn is_positive<const LANES: usize>() {
+                    test_helpers::test_unary_mask_elementwise(
+                        &Vector::<LANES>::is_positive,
+                        &Scalar::is_positive,
+                        &|_| true,
+                    );
+                }
+
+                fn is_negative<const LANES: usize>() {
+                    test_helpers::test_unary_mask_elementwise(
+                        &Vector::<LANES>::is_negative,
+                        &Scalar::is_negative,
+                        &|_| true,
+                    );
+                }
+
+                fn signum<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::signum,
+                        &Scalar::signum,
+                        &|_| true,
+                    )
+                }
+
+            }
+
+            test_helpers::test_lanes_panic! {
+                fn div_min_overflow_panics<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(Scalar::MIN);
+                    let b = Vector::<LANES>::splat(-1);
+                    let _ = a / b;
+                }
+
+                fn div_by_all_zeros_panics<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(42);
+                    let b = Vector::<LANES>::splat(0);
+                    let _ = a / b;
+                }
+
+                fn div_by_one_zero_panics<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(42);
+                    let mut b = Vector::<LANES>::splat(21);
+                    b[0] = 0 as _;
+                    let _ = a / b;
+                }
+
+                fn rem_min_overflow_panic<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(Scalar::MIN);
+                    let b = Vector::<LANES>::splat(-1);
+                    let _ = a % b;
+                }
+
+                fn rem_zero_panic<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(42);
+                    let b = Vector::<LANES>::splat(0);
+                    let _ = a % b;
+                }
+            }
+
+            test_helpers::test_lanes! {
+                fn div_neg_one_no_panic<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(42);
+                    let b = Vector::<LANES>::splat(-1);
+                    let _ = a / b;
+                }
+
+                fn rem_neg_one_no_panic<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(42);
+                    let b = Vector::<LANES>::splat(-1);
+                    let _ = a % b;
+                }
+            }
+
+            impl_binary_op_test!(Scalar, Add::add, AddAssign::add_assign, Scalar::wrapping_add);
+            impl_binary_op_test!(Scalar, Sub::sub, SubAssign::sub_assign, Scalar::wrapping_sub);
+            impl_binary_op_test!(Scalar, Mul::mul, MulAssign::mul_assign, Scalar::wrapping_mul);
+
+            // Exclude Div and Rem panicking cases
+            impl_binary_checked_op_test!(Scalar, Div::div, DivAssign::div_assign, Scalar::wrapping_div, |x, y| y != 0 && !(x == Scalar::MIN && y == -1));
+            impl_binary_checked_op_test!(Scalar, Rem::rem, RemAssign::rem_assign, Scalar::wrapping_rem, |x, y| y != 0 && !(x == Scalar::MIN && y == -1));
+
+            impl_unary_op_test!(Scalar, Not::not);
+            impl_binary_op_test!(Scalar, BitAnd::bitand, BitAndAssign::bitand_assign);
+            impl_binary_op_test!(Scalar, BitOr::bitor, BitOrAssign::bitor_assign);
+            impl_binary_op_test!(Scalar, BitXor::bitxor, BitXorAssign::bitxor_assign);
+        }
+    }
+}
+
+/// Implement tests for unsigned integers.
+#[macro_export]
+macro_rules! impl_unsigned_tests {
+    { $scalar:tt } => {
+        mod $scalar {
+            type Vector<const LANES: usize> = core_simd::Simd<Scalar, LANES>;
+            type Scalar = $scalar;
+
+            impl_common_integer_tests! { Vector, Scalar }
+
+            test_helpers::test_lanes_panic! {
+                fn rem_zero_panic<const LANES: usize>() {
+                    let a = Vector::<LANES>::splat(42);
+                    let b = Vector::<LANES>::splat(0);
+                    let _ = a % b;
+                }
+            }
+
+            impl_binary_op_test!(Scalar, Add::add, AddAssign::add_assign, Scalar::wrapping_add);
+            impl_binary_op_test!(Scalar, Sub::sub, SubAssign::sub_assign, Scalar::wrapping_sub);
+            impl_binary_op_test!(Scalar, Mul::mul, MulAssign::mul_assign, Scalar::wrapping_mul);
+
+            // Exclude Div and Rem panicking cases
+            impl_binary_checked_op_test!(Scalar, Div::div, DivAssign::div_assign, Scalar::wrapping_div, |_, y| y != 0);
+            impl_binary_checked_op_test!(Scalar, Rem::rem, RemAssign::rem_assign, Scalar::wrapping_rem, |_, y| y != 0);
+
+            impl_unary_op_test!(Scalar, Not::not);
+            impl_binary_op_test!(Scalar, BitAnd::bitand, BitAndAssign::bitand_assign);
+            impl_binary_op_test!(Scalar, BitOr::bitor, BitOrAssign::bitor_assign);
+            impl_binary_op_test!(Scalar, BitXor::bitxor, BitXorAssign::bitxor_assign);
+        }
+    }
+}
+
+/// Implement tests for floating point numbers.
+#[macro_export]
+macro_rules! impl_float_tests {
+    { $scalar:tt, $int_scalar:tt } => {
+        mod $scalar {
+            type Vector<const LANES: usize> = core_simd::Simd<Scalar, LANES>;
+            type Scalar = $scalar;
+
+            impl_unary_op_test!(Scalar, Neg::neg);
+            impl_binary_op_test!(Scalar, Add::add, AddAssign::add_assign);
+            impl_binary_op_test!(Scalar, Sub::sub, SubAssign::sub_assign);
+            impl_binary_op_test!(Scalar, Mul::mul, MulAssign::mul_assign);
+            impl_binary_op_test!(Scalar, Div::div, DivAssign::div_assign);
+            impl_binary_op_test!(Scalar, Rem::rem, RemAssign::rem_assign);
+
+            test_helpers::test_lanes! {
+                fn is_sign_positive<const LANES: usize>() {
+                    test_helpers::test_unary_mask_elementwise(
+                        &Vector::<LANES>::is_sign_positive,
+                        &Scalar::is_sign_positive,
+                        &|_| true,
+                    );
+                }
+
+                fn is_sign_negative<const LANES: usize>() {
+                    test_helpers::test_unary_mask_elementwise(
+                        &Vector::<LANES>::is_sign_negative,
+                        &Scalar::is_sign_negative,
+                        &|_| true,
+                    );
+                }
+
+                fn is_finite<const LANES: usize>() {
+                    test_helpers::test_unary_mask_elementwise(
+                        &Vector::<LANES>::is_finite,
+                        &Scalar::is_finite,
+                        &|_| true,
+                    );
+                }
+
+                fn is_infinite<const LANES: usize>() {
+                    test_helpers::test_unary_mask_elementwise(
+                        &Vector::<LANES>::is_infinite,
+                        &Scalar::is_infinite,
+                        &|_| true,
+                    );
+                }
+
+                fn is_nan<const LANES: usize>() {
+                    test_helpers::test_unary_mask_elementwise(
+                        &Vector::<LANES>::is_nan,
+                        &Scalar::is_nan,
+                        &|_| true,
+                    );
+                }
+
+                fn is_normal<const LANES: usize>() {
+                    test_helpers::test_unary_mask_elementwise(
+                        &Vector::<LANES>::is_normal,
+                        &Scalar::is_normal,
+                        &|_| true,
+                    );
+                }
+
+                fn is_subnormal<const LANES: usize>() {
+                    test_helpers::test_unary_mask_elementwise(
+                        &Vector::<LANES>::is_subnormal,
+                        &Scalar::is_subnormal,
+                        &|_| true,
+                    );
+                }
+
+                fn abs<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::abs,
+                        &Scalar::abs,
+                        &|_| true,
+                    )
+                }
+
+                fn recip<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::recip,
+                        &Scalar::recip,
+                        &|_| true,
+                    )
+                }
+
+                fn to_degrees<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::to_degrees,
+                        &Scalar::to_degrees,
+                        &|_| true,
+                    )
+                }
+
+                fn to_radians<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::to_radians,
+                        &Scalar::to_radians,
+                        &|_| true,
+                    )
+                }
+
+                fn signum<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::signum,
+                        &Scalar::signum,
+                        &|_| true,
+                    )
+                }
+
+                fn copysign<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &Vector::<LANES>::copysign,
+                        &Scalar::copysign,
+                        &|_, _| true,
+                    )
+                }
+
+                fn min<const LANES: usize>() {
+                    // Regular conditions (both values aren't zero)
+                    test_helpers::test_binary_elementwise(
+                        &Vector::<LANES>::min,
+                        &Scalar::min,
+                        // Reject the case where both values are zero with different signs
+                        &|a, b| {
+                            for (a, b) in a.iter().zip(b.iter()) {
+                                if *a == 0. && *b == 0. && a.signum() != b.signum() {
+                                    return false;
+                                }
+                            }
+                            true
+                        }
+                    );
+
+                    // Special case where both values are zero
+                    let p_zero = Vector::<LANES>::splat(0.);
+                    let n_zero = Vector::<LANES>::splat(-0.);
+                    assert!(p_zero.min(n_zero).to_array().iter().all(|x| *x == 0.));
+                    assert!(n_zero.min(p_zero).to_array().iter().all(|x| *x == 0.));
+                }
+
+                fn max<const LANES: usize>() {
+                    // Regular conditions (both values aren't zero)
+                    test_helpers::test_binary_elementwise(
+                        &Vector::<LANES>::max,
+                        &Scalar::max,
+                        // Reject the case where both values are zero with different signs
+                        &|a, b| {
+                            for (a, b) in a.iter().zip(b.iter()) {
+                                if *a == 0. && *b == 0. && a.signum() != b.signum() {
+                                    return false;
+                                }
+                            }
+                            true
+                        }
+                    );
+
+                    // Special case where both values are zero
+                    let p_zero = Vector::<LANES>::splat(0.);
+                    let n_zero = Vector::<LANES>::splat(-0.);
+                    assert!(p_zero.max(n_zero).to_array().iter().all(|x| *x == 0.));
+                    assert!(n_zero.max(p_zero).to_array().iter().all(|x| *x == 0.));
+                }
+
+                fn clamp<const LANES: usize>() {
+                    test_helpers::test_3(&|value: [Scalar; LANES], mut min: [Scalar; LANES], mut max: [Scalar; LANES]| {
+                        for (min, max) in min.iter_mut().zip(max.iter_mut()) {
+                            if max < min {
+                                core::mem::swap(min, max);
+                            }
+                            if min.is_nan() {
+                                *min = Scalar::NEG_INFINITY;
+                            }
+                            if max.is_nan() {
+                                *max = Scalar::INFINITY;
+                            }
+                        }
+
+                        let mut result_scalar = [Scalar::default(); LANES];
+                        for i in 0..LANES {
+                            result_scalar[i] = value[i].clamp(min[i], max[i]);
+                        }
+                        let result_vector = Vector::from_array(value).clamp(min.into(), max.into()).to_array();
+                        test_helpers::prop_assert_biteq!(result_scalar, result_vector);
+                        Ok(())
+                    })
+                }
+
+                fn horizontal_sum<const LANES: usize>() {
+                    test_helpers::test_1(&|x| {
+                        test_helpers::prop_assert_biteq! (
+                            Vector::<LANES>::from_array(x).horizontal_sum(),
+                            x.iter().sum(),
+                        );
+                        Ok(())
+                    });
+                }
+
+                fn horizontal_product<const LANES: usize>() {
+                    test_helpers::test_1(&|x| {
+                        test_helpers::prop_assert_biteq! (
+                            Vector::<LANES>::from_array(x).horizontal_product(),
+                            x.iter().product(),
+                        );
+                        Ok(())
+                    });
+                }
+
+                fn horizontal_max<const LANES: usize>() {
+                    test_helpers::test_1(&|x| {
+                        let vmax = Vector::<LANES>::from_array(x).horizontal_max();
+                        let smax = x.iter().copied().fold(Scalar::NAN, Scalar::max);
+                        // 0 and -0 are treated the same
+                        if !(x.contains(&0.) && x.contains(&-0.) && vmax.abs() == 0. && smax.abs() == 0.) {
+                            test_helpers::prop_assert_biteq!(vmax, smax);
+                        }
+                        Ok(())
+                    });
+                }
+
+                fn horizontal_min<const LANES: usize>() {
+                    test_helpers::test_1(&|x| {
+                        let vmax = Vector::<LANES>::from_array(x).horizontal_min();
+                        let smax = x.iter().copied().fold(Scalar::NAN, Scalar::min);
+                        // 0 and -0 are treated the same
+                        if !(x.contains(&0.) && x.contains(&-0.) && vmax.abs() == 0. && smax.abs() == 0.) {
+                            test_helpers::prop_assert_biteq!(vmax, smax);
+                        }
+                        Ok(())
+                    });
+                }
+            }
+
+            #[cfg(feature = "std")]
+            mod std {
+                use super::*;
+                test_helpers::test_lanes! {
+                    fn sqrt<const LANES: usize>() {
+                        test_helpers::test_unary_elementwise(
+                            &Vector::<LANES>::sqrt,
+                            &Scalar::sqrt,
+                            &|_| true,
+                        )
+                    }
+
+                    fn mul_add<const LANES: usize>() {
+                        test_helpers::test_ternary_elementwise(
+                            &Vector::<LANES>::mul_add,
+                            &Scalar::mul_add,
+                            &|_, _, _| true,
+                        )
+                    }
+                }
+            }
+        }
+    }
+}
--- a/library/portable-simd/crates/core_simd/tests/round.rs
+++ b/library/portable-simd/crates/core_simd/tests/round.rs
@ -0,0 +1,92 @@
+#![feature(portable_simd)]
+
+macro_rules! float_rounding_test {
+    { $scalar:tt, $int_scalar:tt } => {
+        mod $scalar {
+            type Vector<const LANES: usize> = core_simd::Simd<$scalar, LANES>;
+            type Scalar = $scalar;
+            type IntScalar = $int_scalar;
+
+            #[cfg(feature = "std")]
+            test_helpers::test_lanes! {
+                fn ceil<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::ceil,
+                        &Scalar::ceil,
+                        &|_| true,
+                    )
+                }
+
+                fn floor<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::floor,
+                        &Scalar::floor,
+                        &|_| true,
+                    )
+                }
+
+                fn round<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::round,
+                        &Scalar::round,
+                        &|_| true,
+                    )
+                }
+
+                fn trunc<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::trunc,
+                        &Scalar::trunc,
+                        &|_| true,
+                    )
+                }
+
+                fn fract<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::fract,
+                        &Scalar::fract,
+                        &|_| true,
+                    )
+                }
+            }
+
+            test_helpers::test_lanes! {
+                fn from_int<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::round_from_int,
+                        &|x| x as Scalar,
+                        &|_| true,
+                    )
+                }
+
+                fn to_int_unchecked<const LANES: usize>() {
+                    // The maximum integer that can be represented by the equivalently sized float has
+                    // all of the mantissa digits set to 1, pushed up to the MSB.
+                    const ALL_MANTISSA_BITS: IntScalar = ((1 << <Scalar>::MANTISSA_DIGITS) - 1);
+                    const MAX_REPRESENTABLE_VALUE: Scalar =
+                        (ALL_MANTISSA_BITS << (core::mem::size_of::<Scalar>() * 8 - <Scalar>::MANTISSA_DIGITS as usize - 1)) as Scalar;
+
+                    let mut runner = proptest::test_runner::TestRunner::default();
+                    runner.run(
+                        &test_helpers::array::UniformArrayStrategy::new(-MAX_REPRESENTABLE_VALUE..MAX_REPRESENTABLE_VALUE),
+                        |x| {
+                            let result_1 = unsafe { Vector::from_array(x).to_int_unchecked().to_array() };
+                            let result_2 = {
+                                let mut result = [0; LANES];
+                                for (i, o) in x.iter().zip(result.iter_mut()) {
+                                    *o = unsafe { i.to_int_unchecked() };
+                                }
+                                result
+                            };
+                            test_helpers::prop_assert_biteq!(result_1, result_2);
+                            Ok(())
+                        },
+                    ).unwrap();
+                }
+            }
+        }
+    }
+}
+
+float_rounding_test! { f32, i32 }
+float_rounding_test! { f64, i64 }
--- a/library/portable-simd/crates/core_simd/tests/swizzle.rs
+++ b/library/portable-simd/crates/core_simd/tests/swizzle.rs
@ -0,0 +1,62 @@
+#![feature(portable_simd)]
+use core_simd::{Simd, Swizzle};
+
+#[cfg(target_arch = "wasm32")]
+use wasm_bindgen_test::*;
+
+#[cfg(target_arch = "wasm32")]
+wasm_bindgen_test_configure!(run_in_browser);
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn swizzle() {
+    struct Index;
+    impl Swizzle<4, 4> for Index {
+        const INDEX: [usize; 4] = [2, 1, 3, 0];
+    }
+    impl Swizzle<4, 2> for Index {
+        const INDEX: [usize; 2] = [1, 1];
+    }
+
+    let vector = Simd::from_array([2, 4, 1, 9]);
+    assert_eq!(Index::swizzle(vector).to_array(), [1, 4, 9, 2]);
+    assert_eq!(Index::swizzle(vector).to_array(), [4, 4]);
+}
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn reverse() {
+    let a = Simd::from_array([1, 2, 3, 4]);
+    assert_eq!(a.reverse().to_array(), [4, 3, 2, 1]);
+}
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn rotate() {
+    let a = Simd::from_array([1, 2, 3, 4]);
+    assert_eq!(a.rotate_lanes_left::<0>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_lanes_left::<1>().to_array(), [2, 3, 4, 1]);
+    assert_eq!(a.rotate_lanes_left::<2>().to_array(), [3, 4, 1, 2]);
+    assert_eq!(a.rotate_lanes_left::<3>().to_array(), [4, 1, 2, 3]);
+    assert_eq!(a.rotate_lanes_left::<4>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_lanes_left::<5>().to_array(), [2, 3, 4, 1]);
+    assert_eq!(a.rotate_lanes_right::<0>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_lanes_right::<1>().to_array(), [4, 1, 2, 3]);
+    assert_eq!(a.rotate_lanes_right::<2>().to_array(), [3, 4, 1, 2]);
+    assert_eq!(a.rotate_lanes_right::<3>().to_array(), [2, 3, 4, 1]);
+    assert_eq!(a.rotate_lanes_right::<4>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_lanes_right::<5>().to_array(), [4, 1, 2, 3]);
+}
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn interleave() {
+    let a = Simd::from_array([0, 1, 2, 3, 4, 5, 6, 7]);
+    let b = Simd::from_array([8, 9, 10, 11, 12, 13, 14, 15]);
+    let (lo, hi) = a.interleave(b);
+    assert_eq!(lo.to_array(), [0, 8, 1, 9, 2, 10, 3, 11]);
+    assert_eq!(hi.to_array(), [4, 12, 5, 13, 6, 14, 7, 15]);
+    let (even, odd) = lo.deinterleave(hi);
+    assert_eq!(even, a);
+    assert_eq!(odd, b);
+}
--- a/library/portable-simd/crates/core_simd/tests/to_bytes.rs
+++ b/library/portable-simd/crates/core_simd/tests/to_bytes.rs
@ -0,0 +1,14 @@
+#![feature(portable_simd, generic_const_exprs, adt_const_params)]
+#![allow(incomplete_features)]
+#![cfg(feature = "generic_const_exprs")]
+
+use core_simd::Simd;
+
+#[test]
+fn byte_convert() {
+    let int = Simd::<u32, 2>::from_array([0xdeadbeef, 0x8badf00d]);
+    let bytes = int.to_ne_bytes();
+    assert_eq!(int[0].to_ne_bytes(), bytes[..4]);
+    assert_eq!(int[1].to_ne_bytes(), bytes[4..]);
+    assert_eq!(Simd::<u32, 2>::from_ne_bytes(bytes), int);
+}
--- a/library/portable-simd/crates/core_simd/tests/u16_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/u16_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_unsigned_tests! { u16 }
--- a/library/portable-simd/crates/core_simd/tests/u32_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/u32_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_unsigned_tests! { u32 }
--- a/library/portable-simd/crates/core_simd/tests/u64_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/u64_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_unsigned_tests! { u64 }
--- a/library/portable-simd/crates/core_simd/tests/u8_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/u8_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_unsigned_tests! { u8 }
--- a/library/portable-simd/crates/core_simd/tests/usize_ops.rs
+++ b/library/portable-simd/crates/core_simd/tests/usize_ops.rs
@ -0,0 +1,5 @@
+#![feature(portable_simd)]
+
+#[macro_use]
+mod ops_macros;
+impl_unsigned_tests! { usize }
--- a/library/portable-simd/crates/core_simd/webdriver.json
+++ b/library/portable-simd/crates/core_simd/webdriver.json
@ -0,0 +1,7 @@
+{
+    "goog:chromeOptions": {
+        "args": [
+            "--enable-features=WebAssemblySimd"
+        ]
+    }
+}
--- a/library/portable-simd/crates/test_helpers/Cargo.toml
+++ b/library/portable-simd/crates/test_helpers/Cargo.toml
@ -0,0 +1,10 @@
+[package]
+name = "test_helpers"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[dependencies.proptest]
+version = "0.10"
+default-features = false
+features = ["alloc"]
--- a/library/portable-simd/crates/test_helpers/src/array.rs
+++ b/library/portable-simd/crates/test_helpers/src/array.rs
@ -0,0 +1,97 @@
+//! Generic-length array strategy.
+
+// Adapted from proptest's array code
+// Copyright 2017 Jason Lingle
+
+use core::{marker::PhantomData, mem::MaybeUninit};
+use proptest::{
+    strategy::{NewTree, Strategy, ValueTree},
+    test_runner::TestRunner,
+};
+
+#[must_use = "strategies do nothing unless used"]
+#[derive(Clone, Copy, Debug)]
+pub struct UniformArrayStrategy<S, T> {
+    strategy: S,
+    _marker: PhantomData<T>,
+}
+
+impl<S, T> UniformArrayStrategy<S, T> {
+    pub const fn new(strategy: S) -> Self {
+        Self {
+            strategy,
+            _marker: PhantomData,
+        }
+    }
+}
+
+pub struct ArrayValueTree<T> {
+    tree: T,
+    shrinker: usize,
+    last_shrinker: Option<usize>,
+}
+
+impl<T, S, const LANES: usize> Strategy for UniformArrayStrategy<S, [T; LANES]>
+where
+    T: core::fmt::Debug,
+    S: Strategy<Value = T>,
+{
+    type Tree = ArrayValueTree<[S::Tree; LANES]>;
+    type Value = [T; LANES];
+
+    fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> {
+        let tree: [S::Tree; LANES] = unsafe {
+            let mut tree: [MaybeUninit<S::Tree>; LANES] = MaybeUninit::uninit().assume_init();
+            for t in tree.iter_mut() {
+                *t = MaybeUninit::new(self.strategy.new_tree(runner)?)
+            }
+            core::mem::transmute_copy(&tree)
+        };
+        Ok(ArrayValueTree {
+            tree,
+            shrinker: 0,
+            last_shrinker: None,
+        })
+    }
+}
+
+impl<T: ValueTree, const LANES: usize> ValueTree for ArrayValueTree<[T; LANES]> {
+    type Value = [T::Value; LANES];
+
+    fn current(&self) -> Self::Value {
+        unsafe {
+            let mut value: [MaybeUninit<T::Value>; LANES] = MaybeUninit::uninit().assume_init();
+            for (tree_elem, value_elem) in self.tree.iter().zip(value.iter_mut()) {
+                *value_elem = MaybeUninit::new(tree_elem.current());
+            }
+            core::mem::transmute_copy(&value)
+        }
+    }
+
+    fn simplify(&mut self) -> bool {
+        while self.shrinker < LANES {
+            if self.tree[self.shrinker].simplify() {
+                self.last_shrinker = Some(self.shrinker);
+                return true;
+            } else {
+                self.shrinker += 1;
+            }
+        }
+
+        false
+    }
+
+    fn complicate(&mut self) -> bool {
+        if let Some(shrinker) = self.last_shrinker {
+            self.shrinker = shrinker;
+            if self.tree[shrinker].complicate() {
+                true
+            } else {
+                self.last_shrinker = None;
+                false
+            }
+        } else {
+            false
+        }
+    }
+}
--- a/library/portable-simd/crates/test_helpers/src/biteq.rs
+++ b/library/portable-simd/crates/test_helpers/src/biteq.rs
@ -0,0 +1,106 @@
+//! Compare numeric types by exact bit value.
+
+pub trait BitEq {
+    fn biteq(&self, other: &Self) -> bool;
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result;
+}
+
+impl BitEq for bool {
+    fn biteq(&self, other: &Self) -> bool {
+        self == other
+    }
+
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "{:?}", self)
+    }
+}
+
+macro_rules! impl_integer_biteq {
+    { $($type:ty),* } => {
+        $(
+        impl BitEq for $type {
+            fn biteq(&self, other: &Self) -> bool {
+                self == other
+            }
+
+            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+                write!(f, "{:?} ({:x})", self, self)
+            }
+        }
+        )*
+    };
+}
+
+impl_integer_biteq! { u8, u16, u32, u64, u128, usize, i8, i16, i32, i64, i128, isize }
+
+macro_rules! impl_float_biteq {
+    { $($type:ty),* } => {
+        $(
+        impl BitEq for $type {
+            fn biteq(&self, other: &Self) -> bool {
+                if self.is_nan() && other.is_nan() {
+                    true // exact nan bits don't matter
+                } else {
+                    self.to_bits() == other.to_bits()
+                }
+            }
+
+            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+                write!(f, "{:?} ({:x})", self, self.to_bits())
+            }
+        }
+        )*
+    };
+}
+
+impl_float_biteq! { f32, f64 }
+
+impl<T: BitEq, const N: usize> BitEq for [T; N] {
+    fn biteq(&self, other: &Self) -> bool {
+        self.iter()
+            .zip(other.iter())
+            .fold(true, |value, (left, right)| value && left.biteq(right))
+    }
+
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        #[repr(transparent)]
+        struct Wrapper<'a, T: BitEq>(&'a T);
+
+        impl<T: BitEq> core::fmt::Debug for Wrapper<'_, T> {
+            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+                self.0.fmt(f)
+            }
+        }
+
+        f.debug_list()
+            .entries(self.iter().map(|x| Wrapper(x)))
+            .finish()
+    }
+}
+
+#[doc(hidden)]
+pub struct BitEqWrapper<'a, T>(pub &'a T);
+
+impl<T: BitEq> PartialEq for BitEqWrapper<'_, T> {
+    fn eq(&self, other: &Self) -> bool {
+        self.0.biteq(other.0)
+    }
+}
+
+impl<T: BitEq> core::fmt::Debug for BitEqWrapper<'_, T> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
+#[macro_export]
+macro_rules! prop_assert_biteq {
+    { $a:expr, $b:expr $(,)? } => {
+        {
+            use $crate::biteq::BitEqWrapper;
+            let a = $a;
+            let b = $b;
+            proptest::prop_assert_eq!(BitEqWrapper(&a), BitEqWrapper(&b));
+        }
+    }
+}
--- a/library/portable-simd/crates/test_helpers/src/lib.rs
+++ b/library/portable-simd/crates/test_helpers/src/lib.rs
@ -0,0 +1,437 @@
+pub mod array;
+
+#[cfg(target_arch = "wasm32")]
+pub mod wasm;
+
+#[macro_use]
+pub mod biteq;
+
+/// Specifies the default strategy for testing a type.
+///
+/// This strategy should be what "makes sense" to test.
+pub trait DefaultStrategy {
+    type Strategy: proptest::strategy::Strategy<Value = Self>;
+    fn default_strategy() -> Self::Strategy;
+}
+
+macro_rules! impl_num {
+    { $type:tt } => {
+        impl DefaultStrategy for $type {
+            type Strategy = proptest::num::$type::Any;
+            fn default_strategy() -> Self::Strategy {
+                proptest::num::$type::ANY
+            }
+        }
+    }
+}
+
+impl_num! { i8 }
+impl_num! { i16 }
+impl_num! { i32 }
+impl_num! { i64 }
+impl_num! { isize }
+impl_num! { u8 }
+impl_num! { u16 }
+impl_num! { u32 }
+impl_num! { u64 }
+impl_num! { usize }
+impl_num! { f32 }
+impl_num! { f64 }
+
+#[cfg(not(target_arch = "wasm32"))]
+impl DefaultStrategy for u128 {
+    type Strategy = proptest::num::u128::Any;
+    fn default_strategy() -> Self::Strategy {
+        proptest::num::u128::ANY
+    }
+}
+
+#[cfg(not(target_arch = "wasm32"))]
+impl DefaultStrategy for i128 {
+    type Strategy = proptest::num::i128::Any;
+    fn default_strategy() -> Self::Strategy {
+        proptest::num::i128::ANY
+    }
+}
+
+#[cfg(target_arch = "wasm32")]
+impl DefaultStrategy for u128 {
+    type Strategy = crate::wasm::u128::Any;
+    fn default_strategy() -> Self::Strategy {
+        crate::wasm::u128::ANY
+    }
+}
+
+#[cfg(target_arch = "wasm32")]
+impl DefaultStrategy for i128 {
+    type Strategy = crate::wasm::i128::Any;
+    fn default_strategy() -> Self::Strategy {
+        crate::wasm::i128::ANY
+    }
+}
+
+impl<T: core::fmt::Debug + DefaultStrategy, const LANES: usize> DefaultStrategy for [T; LANES] {
+    type Strategy = crate::array::UniformArrayStrategy<T::Strategy, Self>;
+    fn default_strategy() -> Self::Strategy {
+        Self::Strategy::new(T::default_strategy())
+    }
+}
+
+/// Test a function that takes a single value.
+pub fn test_1<A: core::fmt::Debug + DefaultStrategy>(
+    f: &dyn Fn(A) -> proptest::test_runner::TestCaseResult,
+) {
+    let mut runner = proptest::test_runner::TestRunner::default();
+    runner.run(&A::default_strategy(), f).unwrap();
+}
+
+/// Test a function that takes two values.
+pub fn test_2<A: core::fmt::Debug + DefaultStrategy, B: core::fmt::Debug + DefaultStrategy>(
+    f: &dyn Fn(A, B) -> proptest::test_runner::TestCaseResult,
+) {
+    let mut runner = proptest::test_runner::TestRunner::default();
+    runner
+        .run(&(A::default_strategy(), B::default_strategy()), |(a, b)| {
+            f(a, b)
+        })
+        .unwrap();
+}
+
+/// Test a function that takes two values.
+pub fn test_3<
+    A: core::fmt::Debug + DefaultStrategy,
+    B: core::fmt::Debug + DefaultStrategy,
+    C: core::fmt::Debug + DefaultStrategy,
+>(
+    f: &dyn Fn(A, B, C) -> proptest::test_runner::TestCaseResult,
+) {
+    let mut runner = proptest::test_runner::TestRunner::default();
+    runner
+        .run(
+            &(
+                A::default_strategy(),
+                B::default_strategy(),
+                C::default_strategy(),
+            ),
+            |(a, b, c)| f(a, b, c),
+        )
+        .unwrap();
+}
+
+/// Test a unary vector function against a unary scalar function, applied elementwise.
+#[inline(never)]
+pub fn test_unary_elementwise<Scalar, ScalarResult, Vector, VectorResult, const LANES: usize>(
+    fv: &dyn Fn(Vector) -> VectorResult,
+    fs: &dyn Fn(Scalar) -> ScalarResult,
+    check: &dyn Fn([Scalar; LANES]) -> bool,
+) where
+    Scalar: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    ScalarResult: Copy + Default + biteq::BitEq + core::fmt::Debug + DefaultStrategy,
+    Vector: Into<[Scalar; LANES]> + From<[Scalar; LANES]> + Copy,
+    VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy,
+{
+    test_1(&|x: [Scalar; LANES]| {
+        proptest::prop_assume!(check(x));
+        let result_1: [ScalarResult; LANES] = fv(x.into()).into();
+        let result_2: [ScalarResult; LANES] = {
+            let mut result = [ScalarResult::default(); LANES];
+            for (i, o) in x.iter().zip(result.iter_mut()) {
+                *o = fs(*i);
+            }
+            result
+        };
+        crate::prop_assert_biteq!(result_1, result_2);
+        Ok(())
+    });
+}
+
+/// Test a unary vector function against a unary scalar function, applied elementwise.
+#[inline(never)]
+pub fn test_unary_mask_elementwise<Scalar, Vector, Mask, const LANES: usize>(
+    fv: &dyn Fn(Vector) -> Mask,
+    fs: &dyn Fn(Scalar) -> bool,
+    check: &dyn Fn([Scalar; LANES]) -> bool,
+) where
+    Scalar: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    Vector: Into<[Scalar; LANES]> + From<[Scalar; LANES]> + Copy,
+    Mask: Into<[bool; LANES]> + From<[bool; LANES]> + Copy,
+{
+    test_1(&|x: [Scalar; LANES]| {
+        proptest::prop_assume!(check(x));
+        let result_1: [bool; LANES] = fv(x.into()).into();
+        let result_2: [bool; LANES] = {
+            let mut result = [false; LANES];
+            for (i, o) in x.iter().zip(result.iter_mut()) {
+                *o = fs(*i);
+            }
+            result
+        };
+        crate::prop_assert_biteq!(result_1, result_2);
+        Ok(())
+    });
+}
+
+/// Test a binary vector function against a binary scalar function, applied elementwise.
+#[inline(never)]
+pub fn test_binary_elementwise<
+    Scalar1,
+    Scalar2,
+    ScalarResult,
+    Vector1,
+    Vector2,
+    VectorResult,
+    const LANES: usize,
+>(
+    fv: &dyn Fn(Vector1, Vector2) -> VectorResult,
+    fs: &dyn Fn(Scalar1, Scalar2) -> ScalarResult,
+    check: &dyn Fn([Scalar1; LANES], [Scalar2; LANES]) -> bool,
+) where
+    Scalar1: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    Scalar2: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    ScalarResult: Copy + Default + biteq::BitEq + core::fmt::Debug + DefaultStrategy,
+    Vector1: Into<[Scalar1; LANES]> + From<[Scalar1; LANES]> + Copy,
+    Vector2: Into<[Scalar2; LANES]> + From<[Scalar2; LANES]> + Copy,
+    VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy,
+{
+    test_2(&|x: [Scalar1; LANES], y: [Scalar2; LANES]| {
+        proptest::prop_assume!(check(x, y));
+        let result_1: [ScalarResult; LANES] = fv(x.into(), y.into()).into();
+        let result_2: [ScalarResult; LANES] = {
+            let mut result = [ScalarResult::default(); LANES];
+            for ((i1, i2), o) in x.iter().zip(y.iter()).zip(result.iter_mut()) {
+                *o = fs(*i1, *i2);
+            }
+            result
+        };
+        crate::prop_assert_biteq!(result_1, result_2);
+        Ok(())
+    });
+}
+
+/// Test a binary vector-scalar function against a binary scalar function, applied elementwise.
+#[inline(never)]
+pub fn test_binary_scalar_rhs_elementwise<
+    Scalar1,
+    Scalar2,
+    ScalarResult,
+    Vector,
+    VectorResult,
+    const LANES: usize,
+>(
+    fv: &dyn Fn(Vector, Scalar2) -> VectorResult,
+    fs: &dyn Fn(Scalar1, Scalar2) -> ScalarResult,
+    check: &dyn Fn([Scalar1; LANES], Scalar2) -> bool,
+) where
+    Scalar1: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    Scalar2: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    ScalarResult: Copy + Default + biteq::BitEq + core::fmt::Debug + DefaultStrategy,
+    Vector: Into<[Scalar1; LANES]> + From<[Scalar1; LANES]> + Copy,
+    VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy,
+{
+    test_2(&|x: [Scalar1; LANES], y: Scalar2| {
+        proptest::prop_assume!(check(x, y));
+        let result_1: [ScalarResult; LANES] = fv(x.into(), y).into();
+        let result_2: [ScalarResult; LANES] = {
+            let mut result = [ScalarResult::default(); LANES];
+            for (i, o) in x.iter().zip(result.iter_mut()) {
+                *o = fs(*i, y);
+            }
+            result
+        };
+        crate::prop_assert_biteq!(result_1, result_2);
+        Ok(())
+    });
+}
+
+/// Test a binary vector-scalar function against a binary scalar function, applied elementwise.
+#[inline(never)]
+pub fn test_binary_scalar_lhs_elementwise<
+    Scalar1,
+    Scalar2,
+    ScalarResult,
+    Vector,
+    VectorResult,
+    const LANES: usize,
+>(
+    fv: &dyn Fn(Scalar1, Vector) -> VectorResult,
+    fs: &dyn Fn(Scalar1, Scalar2) -> ScalarResult,
+    check: &dyn Fn(Scalar1, [Scalar2; LANES]) -> bool,
+) where
+    Scalar1: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    Scalar2: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    ScalarResult: Copy + Default + biteq::BitEq + core::fmt::Debug + DefaultStrategy,
+    Vector: Into<[Scalar2; LANES]> + From<[Scalar2; LANES]> + Copy,
+    VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy,
+{
+    test_2(&|x: Scalar1, y: [Scalar2; LANES]| {
+        proptest::prop_assume!(check(x, y));
+        let result_1: [ScalarResult; LANES] = fv(x, y.into()).into();
+        let result_2: [ScalarResult; LANES] = {
+            let mut result = [ScalarResult::default(); LANES];
+            for (i, o) in y.iter().zip(result.iter_mut()) {
+                *o = fs(x, *i);
+            }
+            result
+        };
+        crate::prop_assert_biteq!(result_1, result_2);
+        Ok(())
+    });
+}
+
+/// Test a ternary vector function against a ternary scalar function, applied elementwise.
+#[inline(never)]
+pub fn test_ternary_elementwise<
+    Scalar1,
+    Scalar2,
+    Scalar3,
+    ScalarResult,
+    Vector1,
+    Vector2,
+    Vector3,
+    VectorResult,
+    const LANES: usize,
+>(
+    fv: &dyn Fn(Vector1, Vector2, Vector3) -> VectorResult,
+    fs: &dyn Fn(Scalar1, Scalar2, Scalar3) -> ScalarResult,
+    check: &dyn Fn([Scalar1; LANES], [Scalar2; LANES], [Scalar3; LANES]) -> bool,
+) where
+    Scalar1: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    Scalar2: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    Scalar3: Copy + Default + core::fmt::Debug + DefaultStrategy,
+    ScalarResult: Copy + Default + biteq::BitEq + core::fmt::Debug + DefaultStrategy,
+    Vector1: Into<[Scalar1; LANES]> + From<[Scalar1; LANES]> + Copy,
+    Vector2: Into<[Scalar2; LANES]> + From<[Scalar2; LANES]> + Copy,
+    Vector3: Into<[Scalar3; LANES]> + From<[Scalar3; LANES]> + Copy,
+    VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy,
+{
+    test_3(
+        &|x: [Scalar1; LANES], y: [Scalar2; LANES], z: [Scalar3; LANES]| {
+            proptest::prop_assume!(check(x, y, z));
+            let result_1: [ScalarResult; LANES] = fv(x.into(), y.into(), z.into()).into();
+            let result_2: [ScalarResult; LANES] = {
+                let mut result = [ScalarResult::default(); LANES];
+                for ((i1, (i2, i3)), o) in
+                    x.iter().zip(y.iter().zip(z.iter())).zip(result.iter_mut())
+                {
+                    *o = fs(*i1, *i2, *i3);
+                }
+                result
+            };
+            crate::prop_assert_biteq!(result_1, result_2);
+            Ok(())
+        },
+    );
+}
+
+/// Expand a const-generic test into separate tests for each possible lane count.
+#[macro_export]
+macro_rules! test_lanes {
+    {
+        $(fn $test:ident<const $lanes:ident: usize>() $body:tt)*
+    } => {
+        $(
+            mod $test {
+                use super::*;
+
+                fn implementation<const $lanes: usize>()
+                where
+                    core_simd::LaneCount<$lanes>: core_simd::SupportedLaneCount,
+                $body
+
+                #[cfg(target_arch = "wasm32")]
+                wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);
+
+                #[test]
+                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+                fn lanes_1() {
+                    implementation::<1>();
+                }
+
+                #[test]
+                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+                fn lanes_2() {
+                    implementation::<2>();
+                }
+
+                #[test]
+                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+                fn lanes_4() {
+                    implementation::<4>();
+                }
+
+                #[test]
+                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+                fn lanes_8() {
+                    implementation::<8>();
+                }
+
+                #[test]
+                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+                fn lanes_16() {
+                    implementation::<16>();
+                }
+
+                #[test]
+                #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
+                fn lanes_32() {
+                    implementation::<32>();
+                }
+            }
+        )*
+    }
+}
+
+/// Expand a const-generic `#[should_panic]` test into separate tests for each possible lane count.
+#[macro_export]
+macro_rules! test_lanes_panic {
+    {
+        $(fn $test:ident<const $lanes:ident: usize>() $body:tt)*
+    } => {
+        $(
+            mod $test {
+                use super::*;
+
+                fn implementation<const $lanes: usize>()
+                where
+                    core_simd::LaneCount<$lanes>: core_simd::SupportedLaneCount,
+                $body
+
+                #[test]
+                #[should_panic]
+                fn lanes_1() {
+                    implementation::<1>();
+                }
+
+                #[test]
+                #[should_panic]
+                fn lanes_2() {
+                    implementation::<2>();
+                }
+
+                #[test]
+                #[should_panic]
+                fn lanes_4() {
+                    implementation::<4>();
+                }
+
+                #[test]
+                #[should_panic]
+                fn lanes_8() {
+                    implementation::<8>();
+                }
+
+                #[test]
+                #[should_panic]
+                fn lanes_16() {
+                    implementation::<16>();
+                }
+
+                #[test]
+                #[should_panic]
+                fn lanes_32() {
+                    implementation::<32>();
+                }
+            }
+        )*
+    }
+}
--- a/library/portable-simd/crates/test_helpers/src/wasm.rs
+++ b/library/portable-simd/crates/test_helpers/src/wasm.rs
@ -0,0 +1,51 @@
+//! Strategies for `u128` and `i128`, since proptest doesn't provide them for the wasm target.
+
+macro_rules! impl_num {
+    { $name:ident } => {
+        pub(crate) mod $name {
+            type InnerStrategy = crate::array::UniformArrayStrategy<proptest::num::u64::Any, [u64; 2]>;
+            use proptest::strategy::{Strategy, ValueTree, NewTree};
+
+
+            #[must_use = "strategies do nothing unless used"]
+            #[derive(Clone, Copy, Debug)]
+            pub struct Any {
+                strategy: InnerStrategy,
+            }
+
+            pub struct BinarySearch {
+                inner: <InnerStrategy as Strategy>::Tree,
+            }
+
+            impl ValueTree for BinarySearch {
+                type Value = $name;
+
+                fn current(&self) -> $name {
+                    unsafe { core::mem::transmute(self.inner.current()) }
+                }
+
+                fn simplify(&mut self) -> bool {
+                    self.inner.simplify()
+                }
+
+                fn complicate(&mut self) -> bool {
+                    self.inner.complicate()
+                }
+            }
+
+            impl Strategy for Any {
+                type Tree = BinarySearch;
+                type Value = $name;
+
+                fn new_tree(&self, runner: &mut proptest::test_runner::TestRunner) -> NewTree<Self> {
+                    Ok(BinarySearch { inner: self.strategy.new_tree(runner)? })
+                }
+            }
+
+            pub const ANY: Any = Any { strategy: InnerStrategy::new(proptest::num::u64::ANY) };
+        }
+    }
+}
+
+impl_num! { u128 }
+impl_num! { i128 }
--- a/library/std/src/lib.rs
+++ b/library/std/src/lib.rs
@ -320,6 +320,7 @@
 #![feature(panic_internals)]
 #![feature(panic_unwind)]
 #![feature(pin_static_ref)]
+#![cfg_attr(not(bootstrap), feature(portable_simd))]
 #![feature(prelude_import)]
 #![feature(ptr_internals)]
 #![feature(rustc_attrs)]
@ -471,6 +472,9 @@ pub use core::pin;
 pub use core::ptr;
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use core::result;
+#[unstable(feature = "portable_simd", issue = "86656")]
+#[cfg(not(bootstrap))]
+pub use core::simd;
 #[unstable(feature = "async_stream", issue = "79024")]
 pub use core::stream;
 #[stable(feature = "i128", since = "1.26.0")]
--- a/rustfmt.toml
+++ b/rustfmt.toml
@ -17,6 +17,7 @@ ignore = [

    # do not format submodules
    "library/backtrace",
+    "library/portable-simd",
    "library/stdarch",
    "compiler/rustc_codegen_cranelift",
    "compiler/rustc_codegen_gcc",
--- a/src/test/ui/simd/libm_no_std_cant_float.rs
+++ b/src/test/ui/simd/libm_no_std_cant_float.rs
@ -0,0 +1,21 @@
+#![crate_type = "rlib"]
+#![no_std]
+#![feature(portable_simd)]
+use core::simd::f32x4;
+
+// For SIMD float ops, the LLIR version which is used to implement the portable
+// forms of them may become calls to math.h AKA libm. So, we can't guarantee
+// we can compile them for #![no_std] crates.
+// Someday we may solve this.
+// Until then, this test at least guarantees these functions require std.
+fn guarantee_no_std_nolibm_calls() -> f32x4 {
+    let x = f32x4::from_array([0.1, 0.5, 0.6, -1.5]);
+    let x2 = x + x;
+    let _xc = x.ceil(); //~ ERROR E0599
+    let _xf = x.floor(); //~ ERROR E0599
+    let _xr = x.round(); //~ ERROR E0599
+    let _xt = x.trunc(); //~ ERROR E0599
+    let _xfma = x.mul_add(x, x); //~ ERROR E0599
+    let _xsqrt = x.sqrt(); //~ ERROR E0599
+    x2.abs() * x2
+}
--- a/src/test/ui/simd/libm_no_std_cant_float.stderr
+++ b/src/test/ui/simd/libm_no_std_cant_float.stderr
@ -0,0 +1,39 @@
+error[E0599]: no method named `ceil` found for struct `Simd` in the current scope
+  --> $DIR/libm_no_std_cant_float.rs:14:17
+   |
+LL |     let _xc = x.ceil();
+   |                 ^^^^ method not found in `Simd<f32, 4_usize>`
+
+error[E0599]: no method named `floor` found for struct `Simd` in the current scope
+  --> $DIR/libm_no_std_cant_float.rs:15:17
+   |
+LL |     let _xf = x.floor();
+   |                 ^^^^^ method not found in `Simd<f32, 4_usize>`
+
+error[E0599]: no method named `round` found for struct `Simd` in the current scope
+  --> $DIR/libm_no_std_cant_float.rs:16:17
+   |
+LL |     let _xr = x.round();
+   |                 ^^^^^ method not found in `Simd<f32, 4_usize>`
+
+error[E0599]: no method named `trunc` found for struct `Simd` in the current scope
+  --> $DIR/libm_no_std_cant_float.rs:17:17
+   |
+LL |     let _xt = x.trunc();
+   |                 ^^^^^ method not found in `Simd<f32, 4_usize>`
+
+error[E0599]: no method named `mul_add` found for struct `Simd` in the current scope
+  --> $DIR/libm_no_std_cant_float.rs:18:19
+   |
+LL |     let _xfma = x.mul_add(x, x);
+   |                   ^^^^^^^ method not found in `Simd<f32, 4_usize>`
+
+error[E0599]: no method named `sqrt` found for struct `Simd` in the current scope
+  --> $DIR/libm_no_std_cant_float.rs:19:20
+   |
+LL |     let _xsqrt = x.sqrt();
+   |                    ^^^^ method not found in `Simd<f32, 4_usize>`
+
+error: aborting due to 6 previous errors
+
+For more information about this error, try `rustc --explain E0599`.
--- a/src/test/ui/simd/portable-intrinsics-arent-exposed.rs
+++ b/src/test/ui/simd/portable-intrinsics-arent-exposed.rs
@ -0,0 +1,8 @@
+// May not matter, since people can use them with a nightly feature.
+// However this tests to guarantee they don't leak out via portable_simd,
+// and thus don't accidentally get stabilized.
+use std::simd::intrinsics; //~ERROR E0603
+
+fn main() {
+    ()
+}
--- a/src/test/ui/simd/portable-intrinsics-arent-exposed.stderr
+++ b/src/test/ui/simd/portable-intrinsics-arent-exposed.stderr
@ -0,0 +1,15 @@
+error[E0603]: module `intrinsics` is private
+  --> $DIR/portable-intrinsics-arent-exposed.rs:4:16
+   |
+LL | use std::simd::intrinsics;
+   |                ^^^^^^^^^^ private module
+   |
+note: the module `intrinsics` is defined here
+  --> $SRC_DIR/core/src/lib.rs:LL:COL
+   |
+LL |     pub use crate::core_simd::simd::*;
+   |             ^^^^^^^^^^^^^^^^^^^^^^^^^
+
+error: aborting due to previous error
+
+For more information about this error, try `rustc --explain E0603`.
--- a/src/test/ui/suggestions/issue-71394-no-from-impl.stderr
+++ b/src/test/ui/suggestions/issue-71394-no-from-impl.stderr
@ -4,6 +4,9 @@ error[E0277]: the trait bound `&[i8]: From<&[u8]>` is not satisfied
 LL |     let _: &[i8] = data.into();
   |                         ^^^^ the trait `From<&[u8]>` is not implemented for `&[i8]`
   |
+   = help: the following implementations were found:
+             <[T; LANES] as From<Simd<T, LANES>>>
+             <[bool; LANES] as From<Mask<T, LANES>>>
   = note: required because of the requirements on the impl of `Into<&[i8]>` for `&[u8]`

 error: aborting due to previous error
--- a/src/tools/tidy/src/lib.rs
+++ b/src/tools/tidy/src/lib.rs
@ -60,6 +60,7 @@ fn filter_dirs(path: &Path) -> bool {
        "compiler/rustc_codegen_gcc",
        "src/llvm-project",
        "library/backtrace",
+        "library/portable-simd",
        "library/stdarch",
        "src/tools/cargo",
        "src/tools/clippy",