mirror of
https://github.com/rust-lang/rust.git
synced 2025-01-13 00:04:12 +00:00
e244e840f2
[Arm64] use isb instruction instead of yield in spin loops. On Arm64 we have seen on several databases that ISB (instruction synchronization barrier) works better than yield in a spin loop. The yield instruction is a nop. The isb instruction puts the processor to sleep for a short time. isb is a good equivalent to the pause instruction on x86. Below is an experiment that shows the effects of yield and isb on Arm64 and the time of a pause instruction on Intel x86 processors. The micro-benchmarks use https://github.com/google/benchmark.git: ``` $ cat a.cc static void BM_scalar_increment(benchmark::State& state) { int i = 0; for (auto _ : state) benchmark::DoNotOptimize(i++); } BENCHMARK(BM_scalar_increment); static void BM_yield(benchmark::State& state) { for (auto _ : state) asm volatile("yield"::); } BENCHMARK(BM_yield); static void BM_isb(benchmark::State& state) { for (auto _ : state) asm volatile("isb"::); } BENCHMARK(BM_isb); BENCHMARK_MAIN(); $ g++ -o run a.cc -O2 -lbenchmark -lpthread $ ./run -------------------------------------------------------------- Benchmark Time CPU Iterations -------------------------------------------------------------- AWS Graviton2 (Neoverse-N1) processor: BM_scalar_increment 0.485 ns 0.485 ns 1000000000 BM_yield 0.400 ns 0.400 ns 1000000000 BM_isb 13.2 ns 13.2 ns 52993304 AWS Graviton (A-72) processor: BM_scalar_increment 0.897 ns 0.874 ns 801558633 BM_yield 0.877 ns 0.875 ns 800002377 BM_isb 13.0 ns 12.7 ns 55169412 Apple Arm64 M1 processor: BM_scalar_increment 0.315 ns 0.315 ns 1000000000 BM_yield 0.313 ns 0.313 ns 1000000000 BM_isb 9.06 ns 9.06 ns 77259282 ``` ``` static void BM_pause(benchmark::State& state) { for (auto _ : state) asm volatile("pause"::); } BENCHMARK(BM_pause); Intel Skylake processor: BM_scalar_increment 0.295 ns 0.295 ns 1000000000 BM_pause 41.7 ns 41.7 ns 16780553 ``` Tested on Graviton2 aarch64-linux with `./x.py test`. |
||
---|---|---|
.. | ||
alloc | ||
array | ||
char | ||
convert | ||
fmt | ||
future | ||
hash | ||
iter | ||
macros | ||
mem | ||
num | ||
ops | ||
prelude | ||
ptr | ||
slice | ||
str | ||
stream | ||
sync | ||
task | ||
unicode | ||
any.rs | ||
ascii.rs | ||
bool.rs | ||
borrow.rs | ||
cell.rs | ||
clone.rs | ||
cmp.rs | ||
default.rs | ||
ffi.rs | ||
hint.rs | ||
internal_macros.rs | ||
intrinsics.rs | ||
lazy.rs | ||
lib.rs | ||
marker.rs | ||
option.rs | ||
panic.rs | ||
panicking.rs | ||
pin.rs | ||
primitive.rs | ||
raw.rs | ||
result.rs | ||
time.rs | ||
tuple.rs | ||
unit.rs |