rust/core at bfe5e8cef698ccc4fca655b4cdbabf78fed43816 - rust

mirror of https://github.com/rust-lang/rust.git synced 2024-11-26 08:44:35 +00:00

History

bors bfe5e8cef6 Auto merge of #128204 - GuillaumeGomez:integers-opti, r=workingjubilee Small optimization for integers Display implementation This is a first pass to try to speed up a bit integers `Display` implementation. The idea behind this is to reduce the stack usage for the buffer storing the output (shouldn't be visible in bench normally) and some small specialization which benefits a lot to smaller integers like `u8` and `i8`. Here are the results of the benchmarks: \| bench name \| current std \| with this PR \| \|-\|-\|-\| \| bench_std_fmt::bench_i16_0 \| 16.45 ns/iter (+/- 0.25) \| 16.50 ns/iter (+/- 0.15) \| \| bench_std_fmt::bench_i16_max \| 17.83 ns/iter (+/- 0.66) \| 17.58 ns/iter (+/- 0.10) \| \| bench_std_fmt::bench_i16_min \| 20.97 ns/iter (+/- 0.49) \| 20.50 ns/iter (+/- 0.28) \| \| bench_std_fmt::bench_i32_0 \| 16.63 ns/iter (+/- 0.06) \| 16.62 ns/iter (+/- 0.07) \| \| bench_std_fmt::bench_i32_max \| 19.79 ns/iter (+/- 0.43) \| 19.55 ns/iter (+/- 0.14) \| \| bench_std_fmt::bench_i32_min \| 22.97 ns/iter (+/- 0.50) \| 22.08 ns/iter (+/- 0.08) \| \| bench_std_fmt::bench_i64_0 \| 16.63 ns/iter (+/- 0.39) \| 16.69 ns/iter (+/- 0.44) \| \| bench_std_fmt::bench_i64_half \| 19.60 ns/iter (+/- 0.05) \| 19.10 ns/iter (+/- 0.05) \| \| bench_std_fmt::bench_i64_max \| 25.22 ns/iter (+/- 0.34) \| 24.43 ns/iter (+/- 0.02) \| \| bench_std_fmt::bench_i8_0 \| 16.27 ns/iter (+/- 0.32) \| 15.80 ns/iter (+/- 0.17) \| \| bench_std_fmt::bench_i8_max \| 16.71 ns/iter (+/- 0.09) \| 16.25 ns/iter (+/- 0.01) \| \| bench_std_fmt::bench_i8_min \| 20.07 ns/iter (+/- 0.22) \| 19.80 ns/iter (+/- 0.30) \| \| bench_std_fmt::bench_u128_0 \| 21.37 ns/iter (+/- 0.24) \| 21.35 ns/iter (+/- 0.35) \| \| bench_std_fmt::bench_u128_max \| 48.13 ns/iter (+/- 0.20) \| 48.78 ns/iter (+/- 0.29) \| \| bench_std_fmt::bench_u16_0 \| 16.48 ns/iter (+/- 0.46) \| 16.03 ns/iter (+/- 0.39) \| \| bench_std_fmt::bench_u16_max \| 17.31 ns/iter (+/- 0.32) \| 17.41 ns/iter (+/- 0.32) \| \| 
bench_std_fmt::bench_u16_min \| 16.40 ns/iter (+/- 0.45) \| 16.02 ns/iter (+/- 0.39) \| \| bench_std_fmt::bench_u32_0 \| 16.17 ns/iter (+/- 0.04) \| 16.29 ns/iter (+/- 0.16) \| \| bench_std_fmt::bench_u32_max \| 19.00 ns/iter (+/- 0.10) \| 19.16 ns/iter (+/- 0.28) \| \| bench_std_fmt::bench_u32_min \| 16.16 ns/iter (+/- 0.09) \| 16.28 ns/iter (+/- 0.11) \| \| bench_std_fmt::bench_u64_0 \| 16.22 ns/iter (+/- 0.22) \| 16.14 ns/iter (+/- 0.18) \| \| bench_std_fmt::bench_u64_half \| 19.25 ns/iter (+/- 0.07) \| 18.95 ns/iter (+/- 0.05) \| \| bench_std_fmt::bench_u64_max \| 24.31 ns/iter (+/- 0.08) \| 24.18 ns/iter (+/- 0.08) \| \| bench_std_fmt::bench_u8_0 \| 15.76 ns/iter (+/- 0.08) \| 15.66 ns/iter (+/- 0.08) \| \| bench_std_fmt::bench_u8_max \| 16.53 ns/iter (+/- 0.03) \| 16.29 ns/iter (+/- 0.02) \| \| bench_std_fmt::bench_u8_min \| 15.77 ns/iter (+/- 0.06) \| 15.67 ns/iter (+/- 0.02) \| The source code is: <details> <summary>source code</summary> ```rust #![feature(test)] #![allow(non_snake_case)] #![allow(clippy::cast_lossless)] extern crate test; macro_rules! benches { ($($name:ident($value:expr))) => { mod bench_std_fmt { use std::io::Write; use test::{Bencher, black_box}; $( #[bench] fn $name(b: &mut Bencher) { let mut buf = Vec::with_capacity(40); b.iter(\|\| { buf.clear(); write!(&mut buf, "{}", black_box($value)).unwrap(); black_box(&buf); }); } ) } } } benches! 
{ bench_u64_0(0u64) bench_u64_half(u32::max_value() as u64) bench_u64_max(u64::max_value()) bench_i64_0(0i64) bench_i64_half(i32::max_value() as i64) bench_i64_max(i64::max_value()) bench_u16_0(0u16) bench_u16_min(u16::min_value()) bench_u16_max(u16::max_value()) bench_i16_0(0i16) bench_i16_min(i16::min_value()) bench_i16_max(i16::max_value()) bench_u128_0(0u128) bench_u128_max(u128::max_value()) bench_i8_0(0i8) bench_i8_min(i8::min_value()) bench_i8_max(i8::max_value()) bench_u8_0(0u8) bench_u8_min(u8::min_value()) bench_u8_max(u8::max_value()) bench_u32_0(0u32) bench_u32_min(u32::min_value()) bench_u32_max(u32::max_value()) bench_i32_0(0i32) bench_i32_min(i32::min_value()) bench_i32_max(i32::max_value()) } ``` </details> And then I ran the equivalent code (source code below) in callgrind with [callgrind_differ](https://github.com/Ethiraric/callgrind_differ) to generate a nice output and here's the result: ``` core::fmt::num::imp::<impl core::fmt::Display for i16>::fmt \| 1300000 \| - 70000 - 5.385% 1230000 core::fmt::num::imp::<impl core::fmt::Display for i32>::fmt \| 1910000 \| - 100000 - 5.236% 1810000 core::fmt::num::imp::<impl core::fmt::Display for i64>::fmt \| 2430000 \| - 110000 - 4.527% 2320000 core::fmt::num::imp::<impl core::fmt::Display for i8>::fmt \| 1080000 \| - 170000 - 15.741% 910000 core::fmt::num::imp::<impl core::fmt::Display for u16>::fmt \| 960000 \| + 10000 + 1.042% 970000 core::fmt::num::imp::<impl core::fmt::Display for u32>::fmt \| 1300000 \| + 30000 + 2.308% 1330000 core::fmt::num::imp::<impl core::fmt::Display for u8>::fmt \| 820000 \| - 30000 - 3.659% 790000 ``` <details> <summary>Source code</summary> ```rust #![feature(test)] extern crate test; use std::io::{stdout, Write}; use std::io::StdoutLock; use test::black_box; macro_rules! 
benches { ($handle:ident, $buf:ident, $($name:ident($value:expr))) => { $( fn $name(handle: &mut StdoutLock, buf: &mut Vec<u8>) { for _ in 0..10000 { buf.clear(); write!(buf, "{}", black_box($value)).unwrap(); handle.write_all(buf); } } $name(&mut $handle, &mut $buf); ) } } fn main() { let mut handle = stdout().lock(); let mut buf = Vec::with_capacity(40); benches! { handle, buf, bench_u64_0(0u64) bench_u64_half(u32::max_value() as u64) bench_u64_max(u64::max_value()) bench_i64_0(0i64) bench_i64_half(i32::max_value() as i64) bench_i64_max(i64::max_value()) bench_u16_0(0u16) bench_u16_min(u16::min_value()) bench_u16_max(u16::max_value()) bench_i16_0(0i16) bench_i16_min(i16::min_value()) bench_i16_max(i16::max_value()) bench_u128_0(0u128) bench_u128_max(u128::max_value()) bench_i8_0(0i8) bench_i8_min(i8::min_value()) bench_i8_max(i8::max_value()) bench_u8_0(0u8) bench_u8_min(u8::min_value()) bench_u8_max(u8::max_value()) bench_i32_0(0i32) bench_i32_min(i32::min_value()) bench_i32_max(i32::max_value()) bench_u32_0(0u32) bench_u32_min(u32::min_value()) bench_u32_max(u32::max_value()) } } ``` </details> The next step would be to specialize the `ToString` implementation so it doesn't go through the `Display` trait. I'm not sure if it will improve anything but I think it's worth a try. r? `@Amanieu`		2024-10-01 22:12:44 +00:00
..
benches	Reformat using the new identifier sorting from rustfmt	2024-09-22 19:11:29 -04:00
src	Auto merge of #128204 - GuillaumeGomez:integers-opti, r=workingjubilee	2024-10-01 22:12:44 +00:00
tests	stabilize const_cell_into_inner	2024-09-28 11:29:02 +02:00
Cargo.toml	Port std library to RTEMS	2024-09-03 09:19:29 +02:00