mirror of
https://github.com/rust-lang/rust.git
synced 2025-01-12 07:43:31 +00:00
e244e840f2
[Arm64] use isb instruction instead of yield in spin loops On arm64 we have seen on several databases that ISB (instruction synchronization barrier) is better to use than yield in a spin loop. The yield instruction is a nop. The isb instruction puts the processor to sleep for some short time. isb is a good equivalent to the pause instruction on x86. Below is an experiment that shows the effects of yield and isb on Arm64 and the time of a pause instruction on x86 Intel processors. The micro-benchmarks use https://github.com/google/benchmark.git ``` $ cat a.cc static void BM_scalar_increment(benchmark::State& state) { int i = 0; for (auto _ : state) benchmark::DoNotOptimize(i++); } BENCHMARK(BM_scalar_increment); static void BM_yield(benchmark::State& state) { for (auto _ : state) asm volatile("yield"::); } BENCHMARK(BM_yield); static void BM_isb(benchmark::State& state) { for (auto _ : state) asm volatile("isb"::); } BENCHMARK(BM_isb); BENCHMARK_MAIN(); $ g++ -o run a.cc -O2 -lbenchmark -lpthread $ ./run -------------------------------------------------------------- Benchmark Time CPU Iterations -------------------------------------------------------------- AWS Graviton2 (Neoverse-N1) processor: BM_scalar_increment 0.485 ns 0.485 ns 1000000000 BM_yield 0.400 ns 0.400 ns 1000000000 BM_isb 13.2 ns 13.2 ns 52993304 AWS Graviton (A-72) processor: BM_scalar_increment 0.897 ns 0.874 ns 801558633 BM_yield 0.877 ns 0.875 ns 800002377 BM_isb 13.0 ns 12.7 ns 55169412 Apple Arm64 M1 processor: BM_scalar_increment 0.315 ns 0.315 ns 1000000000 BM_yield 0.313 ns 0.313 ns 1000000000 BM_isb 9.06 ns 9.06 ns 77259282 ``` ``` static void BM_pause(benchmark::State& state) { for (auto _ : state) asm volatile("pause"::); } BENCHMARK(BM_pause); Intel Skylake processor: BM_scalar_increment 0.295 ns 0.295 ns 1000000000 BM_pause 41.7 ns 41.7 ns 16780553 ``` Tested on Graviton2 aarch64-linux with `./x.py test`.
175 lines
6.7 KiB
Rust
175 lines
6.7 KiB
Rust
#![stable(feature = "core_hint", since = "1.27.0")]
|
|
|
|
//! Hints to the compiler that affect how code should be emitted or optimized.
|
|
//! Hints may be compile-time or runtime.
|
|
|
|
use crate::intrinsics;
|
|
|
|
/// Informs the compiler that this point in the code is not reachable, enabling
|
|
/// further optimizations.
|
|
///
|
|
/// # Safety
|
|
///
|
|
/// Reaching this function is completely *undefined behavior* (UB). In
|
|
/// particular, the compiler assumes that all UB must never happen, and
|
|
/// therefore will eliminate all branches that reach to a call to
|
|
/// `unreachable_unchecked()`.
|
|
///
|
|
/// Like all instances of UB, if this assumption turns out to be wrong, i.e., the
|
|
/// `unreachable_unchecked()` call is actually reachable among all possible
|
|
/// control flow, the compiler will apply the wrong optimization strategy, and
|
|
/// may sometimes even corrupt seemingly unrelated code, causing
|
|
/// difficult-to-debug problems.
|
|
///
|
|
/// Use this function only when you can prove that the code will never call it.
|
|
/// Otherwise, consider using the [`unreachable!`] macro, which does not allow
|
|
/// optimizations but will panic when executed.
|
|
///
|
|
/// # Example
|
|
///
|
|
/// ```
|
|
/// fn div_1(a: u32, b: u32) -> u32 {
|
|
/// use std::hint::unreachable_unchecked;
|
|
///
|
|
/// // `b.saturating_add(1)` is always positive (not zero),
|
|
/// // hence `checked_div` will never return `None`.
|
|
/// // Therefore, the else branch is unreachable.
|
|
/// a.checked_div(b.saturating_add(1))
|
|
/// .unwrap_or_else(|| unsafe { unreachable_unchecked() })
|
|
/// }
|
|
///
|
|
/// assert_eq!(div_1(7, 0), 7);
|
|
/// assert_eq!(div_1(9, 1), 4);
|
|
/// assert_eq!(div_1(11, u32::MAX), 0);
|
|
/// ```
|
|
#[inline]
|
|
#[stable(feature = "unreachable", since = "1.27.0")]
|
|
#[rustc_const_unstable(feature = "const_unreachable_unchecked", issue = "53188")]
|
|
pub const unsafe fn unreachable_unchecked() -> ! {
|
|
// SAFETY: the safety contract for `intrinsics::unreachable` must
|
|
// be upheld by the caller.
|
|
unsafe { intrinsics::unreachable() }
|
|
}
|
|
|
|
/// Emits a machine instruction to signal the processor that it is running in
|
|
/// a busy-wait spin-loop ("spin lock").
|
|
///
|
|
/// Upon receiving the spin-loop signal the processor can optimize its behavior by,
|
|
/// for example, saving power or switching hyper-threads.
|
|
///
|
|
/// This function is different from [`thread::yield_now`] which directly
|
|
/// yields to the system's scheduler, whereas `spin_loop` does not interact
|
|
/// with the operating system.
|
|
///
|
|
/// A common use case for `spin_loop` is implementing bounded optimistic
|
|
/// spinning in a CAS loop in synchronization primitives. To avoid problems
|
|
/// like priority inversion, it is strongly recommended that the spin loop is
|
|
/// terminated after a finite amount of iterations and an appropriate blocking
|
|
/// syscall is made.
|
|
///
|
|
/// **Note**: On platforms that do not support receiving spin-loop hints this
|
|
/// function does not do anything at all.
|
|
///
|
|
/// # Examples
|
|
///
|
|
/// ```
|
|
/// use std::sync::atomic::{AtomicBool, Ordering};
|
|
/// use std::sync::Arc;
|
|
/// use std::{hint, thread};
|
|
///
|
|
/// // A shared atomic value that threads will use to coordinate
|
|
/// let live = Arc::new(AtomicBool::new(false));
|
|
///
|
|
/// // In a background thread we'll eventually set the value
|
|
/// let bg_work = {
|
|
/// let live = live.clone();
|
|
/// thread::spawn(move || {
|
|
/// // Do some work, then make the value live
|
|
/// do_some_work();
|
|
/// live.store(true, Ordering::Release);
|
|
/// })
|
|
/// };
|
|
///
|
|
/// // Back on our current thread, we wait for the value to be set
|
|
/// while !live.load(Ordering::Acquire) {
|
|
/// // The spin loop is a hint to the CPU that we're waiting, but probably
|
|
/// // not for very long
|
|
/// hint::spin_loop();
|
|
/// }
|
|
///
|
|
/// // The value is now set
|
|
/// # fn do_some_work() {}
|
|
/// do_some_work();
|
|
/// bg_work.join()?;
|
|
/// # Ok::<(), Box<dyn core::any::Any + Send + 'static>>(())
|
|
/// ```
|
|
///
|
|
/// [`thread::yield_now`]: ../../std/thread/fn.yield_now.html
|
|
#[inline]
|
|
#[stable(feature = "renamed_spin_loop", since = "1.49.0")]
|
|
pub fn spin_loop() {
|
|
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))]
|
|
{
|
|
#[cfg(target_arch = "x86")]
|
|
{
|
|
// SAFETY: the `cfg` attr ensures that we only execute this on x86 targets.
|
|
unsafe { crate::arch::x86::_mm_pause() };
|
|
}
|
|
|
|
#[cfg(target_arch = "x86_64")]
|
|
{
|
|
// SAFETY: the `cfg` attr ensures that we only execute this on x86_64 targets.
|
|
unsafe { crate::arch::x86_64::_mm_pause() };
|
|
}
|
|
}
|
|
|
|
#[cfg(any(target_arch = "aarch64", all(target_arch = "arm", target_feature = "v6")))]
|
|
{
|
|
#[cfg(target_arch = "aarch64")]
|
|
{
|
|
// SAFETY: the `cfg` attr ensures that we only execute this on aarch64 targets.
|
|
unsafe { crate::arch::aarch64::__isb(crate::arch::aarch64::SY) };
|
|
}
|
|
#[cfg(target_arch = "arm")]
|
|
{
|
|
// SAFETY: the `cfg` attr ensures that we only execute this on arm targets
|
|
// with support for the v6 feature.
|
|
unsafe { crate::arch::arm::__yield() };
|
|
}
|
|
}
|
|
}
|
|
|
|
/// An identity function that *__hints__* to the compiler to be maximally pessimistic about what
/// `black_box` could do.
///
/// Unlike [`std::convert::identity`], a Rust compiler is encouraged to assume that `black_box` can
/// use `dummy` in any possible valid way that Rust code is allowed to without introducing undefined
/// behavior in the calling code. This property makes `black_box` useful for writing code in which
/// certain optimizations are not desired, such as benchmarks.
///
/// Note however, that `black_box` is only (and can only be) provided on a "best-effort" basis. The
/// extent to which it can block optimisations may vary depending upon the platform and code-gen
/// backend used. Programs cannot rely on `black_box` for *correctness* in any way.
///
/// [`std::convert::identity`]: crate::convert::identity
// Under Miri the asm block below is compiled out, so inlining behavior is
// flipped: NOTE(review) — presumably `inline(never)` keeps the call visible
// to Miri as an opaque boundary; confirm against Miri's handling.
#[cfg_attr(not(miri), inline)]
#[cfg_attr(miri, inline(never))]
#[unstable(feature = "bench_black_box", issue = "64102")]
// `dummy` is only mutated via the asm operand, which Miri never sees; this
// silences the resulting `unused_mut` warning under Miri.
#[cfg_attr(miri, allow(unused_mut))]
pub fn black_box<T>(mut dummy: T) -> T {
    // We need to "use" the argument in some way LLVM can't introspect, and on
    // targets that support it we can typically leverage inline assembly to do
    // this. LLVM's interpretation of inline assembly is that it's, well, a black
    // box. This isn't the greatest implementation since it probably deoptimizes
    // more than we want, but it's so far good enough.

    #[cfg(not(miri))] // This is just a hint, so it is fine to skip in Miri.
    // SAFETY: the inline assembly is a no-op.
    unsafe {
        // FIXME: Cannot use `asm!` because it doesn't support MIPS and other architectures.
        // Empty template, `&mut dummy` passed in a register, "memory" clobber +
        // "volatile" prevent the optimizer from caching or eliding the value.
        llvm_asm!("" : : "r"(&mut dummy) : "memory" : "volatile");
    }

    // The value is returned untouched; only the optimizer's view of it changed.
    dummy
}
|