mirror of
https://github.com/rust-lang/rust.git
synced 2025-04-28 02:57:37 +00:00
Auto merge of #44051 - eddyb:apfloat-faster-div, r=nagisa
Speed up APFloat division by using short division for small divisors. Fixes #43828 (hopefully), by not doing long division bit-by-bit for small divisors. When parsing the ~200,000 decimal float literals in the `tuple-stress` benchmark, this change brings roughly a 5x speed increase (from `0.6s` to `0.12s`), and the hottest instructions are native `div`s.
This commit is contained in:
commit
c0771f2190
@ -460,18 +460,15 @@ impl<S: Semantics> fmt::Display for IeeeFloat<S> {
|
||||
// rem <- sig % 10
|
||||
// sig <- sig / 10
|
||||
let mut rem = 0;
|
||||
for limb in sig.iter_mut().rev() {
|
||||
// We don't have an integer doubly wide than Limb,
|
||||
// so we have to split the divrem on two halves.
|
||||
const HALF_BITS: usize = LIMB_BITS / 2;
|
||||
let mut halves = [*limb & ((1 << HALF_BITS) - 1), *limb >> HALF_BITS];
|
||||
for half in halves.iter_mut().rev() {
|
||||
*half |= rem << HALF_BITS;
|
||||
rem = *half % 10;
|
||||
*half /= 10;
|
||||
}
|
||||
*limb = halves[0] | (halves[1] << HALF_BITS);
|
||||
}
|
||||
|
||||
// Use 64-bit division and remainder, with 32-bit chunks from sig.
|
||||
sig::each_chunk(&mut sig, 32, |chunk| {
|
||||
let chunk = chunk as u32;
|
||||
let combined = ((rem as u64) << 32) | (chunk as u64);
|
||||
rem = (combined % 10) as u8;
|
||||
(combined / 10) as u32 as Limb
|
||||
});
|
||||
|
||||
// Reduce the sigificand to avoid wasting time dividing 0's.
|
||||
while sig.last() == Some(&0) {
|
||||
sig.pop();
|
||||
@ -491,7 +488,7 @@ impl<S: Semantics> fmt::Display for IeeeFloat<S> {
|
||||
exp += 1;
|
||||
} else {
|
||||
in_trail = false;
|
||||
buffer.push(b'0' + digit as u8);
|
||||
buffer.push(b'0' + digit);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2065,7 +2062,7 @@ impl<S: Semantics> IeeeFloat<S> {
|
||||
};
|
||||
|
||||
// Attempt dec_sig * 10^dec_exp with increasing precision.
|
||||
let mut attempt = 1;
|
||||
let mut attempt = 0;
|
||||
loop {
|
||||
let calc_precision = (LIMB_BITS << attempt) - 1;
|
||||
attempt += 1;
|
||||
@ -2310,6 +2307,17 @@ mod sig {
|
||||
limbs.iter().all(|&l| l == 0)
|
||||
}
|
||||
|
||||
/// One, not zero, based LSB. That is, returns 0 for a zeroed significand.
|
||||
pub(super) fn olsb(limbs: &[Limb]) -> usize {
|
||||
for i in 0..limbs.len() {
|
||||
if limbs[i] != 0 {
|
||||
return i * LIMB_BITS + limbs[i].trailing_zeros() as usize + 1;
|
||||
}
|
||||
}
|
||||
|
||||
0
|
||||
}
|
||||
|
||||
/// One, not zero, based MSB. That is, returns 0 for a zeroed significand.
|
||||
pub(super) fn omsb(limbs: &[Limb]) -> usize {
|
||||
for i in (0..limbs.len()).rev() {
|
||||
@ -2468,6 +2476,20 @@ mod sig {
|
||||
}
|
||||
}
|
||||
|
||||
/// For every consecutive chunk of `bits` bits from `limbs`,
|
||||
/// going from most significant to the least significant bits,
|
||||
/// call `f` to transform those bits and store the result back.
|
||||
pub(super) fn each_chunk<F: FnMut(Limb) -> Limb>(limbs: &mut [Limb], bits: usize, mut f: F) {
|
||||
assert_eq!(LIMB_BITS % bits, 0);
|
||||
for limb in limbs.iter_mut().rev() {
|
||||
let mut r = 0;
|
||||
for i in (0..LIMB_BITS / bits).rev() {
|
||||
r |= f((*limb >> (i * bits)) & ((1 << bits) - 1)) << (i * bits);
|
||||
}
|
||||
*limb = r;
|
||||
}
|
||||
}
|
||||
|
||||
/// Increment in-place, return the carry flag.
|
||||
pub(super) fn increment(dst: &mut [Limb]) -> Limb {
|
||||
for x in dst {
|
||||
@ -2686,10 +2708,6 @@ mod sig {
|
||||
divisor: &mut [Limb],
|
||||
precision: usize,
|
||||
) -> Loss {
|
||||
// Zero the quotient before setting bits in it.
|
||||
for x in &mut quotient[..limbs_for_bits(precision)] {
|
||||
*x = 0;
|
||||
}
|
||||
|
||||
// Normalize the divisor.
|
||||
let bits = precision - omsb(divisor);
|
||||
@ -2700,6 +2718,13 @@ mod sig {
|
||||
let bits = precision - omsb(dividend);
|
||||
shift_left(dividend, exp, bits);
|
||||
|
||||
// Division by 1.
|
||||
let olsb_divisor = olsb(divisor);
|
||||
if olsb_divisor == precision {
|
||||
quotient.copy_from_slice(dividend);
|
||||
return Loss::ExactlyZero;
|
||||
}
|
||||
|
||||
// Ensure the dividend >= divisor initially for the loop below.
|
||||
// Incidentally, this means that the division loop below is
|
||||
// guaranteed to set the integer bit to one.
|
||||
@ -2708,6 +2733,58 @@ mod sig {
|
||||
assert_ne!(cmp(dividend, divisor), Ordering::Less)
|
||||
}
|
||||
|
||||
// Helper for figuring out the lost fraction.
|
||||
let lost_fraction = |dividend: &[Limb], divisor: &[Limb]| {
|
||||
match cmp(dividend, divisor) {
|
||||
Ordering::Greater => Loss::MoreThanHalf,
|
||||
Ordering::Equal => Loss::ExactlyHalf,
|
||||
Ordering::Less => {
|
||||
if is_all_zeros(dividend) {
|
||||
Loss::ExactlyZero
|
||||
} else {
|
||||
Loss::LessThanHalf
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Try to perform a (much faster) short division for small divisors.
|
||||
let divisor_bits = precision - (olsb_divisor - 1);
|
||||
macro_rules! try_short_div {
|
||||
($W:ty, $H:ty, $half:expr) => {
|
||||
if divisor_bits * 2 <= $half {
|
||||
// Extract the small divisor.
|
||||
let _: Loss = shift_right(divisor, &mut 0, olsb_divisor - 1);
|
||||
let divisor = divisor[0] as $H as $W;
|
||||
|
||||
// Shift the dividend to produce a quotient with the unit bit set.
|
||||
let top_limb = *dividend.last().unwrap();
|
||||
let mut rem = (top_limb >> (LIMB_BITS - (divisor_bits - 1))) as $H;
|
||||
shift_left(dividend, &mut 0, divisor_bits - 1);
|
||||
|
||||
// Apply short division in place on $H (of $half bits) chunks.
|
||||
each_chunk(dividend, $half, |chunk| {
|
||||
let chunk = chunk as $H;
|
||||
let combined = ((rem as $W) << $half) | (chunk as $W);
|
||||
rem = (combined % divisor) as $H;
|
||||
(combined / divisor) as $H as Limb
|
||||
});
|
||||
quotient.copy_from_slice(dividend);
|
||||
|
||||
return lost_fraction(&[(rem as Limb) << 1], &[divisor as Limb]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try_short_div!(u32, u16, 16);
|
||||
try_short_div!(u64, u32, 32);
|
||||
try_short_div!(u128, u64, 64);
|
||||
|
||||
// Zero the quotient before setting bits in it.
|
||||
for x in &mut quotient[..limbs_for_bits(precision)] {
|
||||
*x = 0;
|
||||
}
|
||||
|
||||
// Long division.
|
||||
for bit in (0..precision).rev() {
|
||||
if cmp(dividend, divisor) != Ordering::Less {
|
||||
@ -2717,17 +2794,6 @@ mod sig {
|
||||
shift_left(dividend, &mut 0, 1);
|
||||
}
|
||||
|
||||
// Figure out the lost fraction.
|
||||
match cmp(dividend, divisor) {
|
||||
Ordering::Greater => Loss::MoreThanHalf,
|
||||
Ordering::Equal => Loss::ExactlyHalf,
|
||||
Ordering::Less => {
|
||||
if is_all_zeros(dividend) {
|
||||
Loss::ExactlyZero
|
||||
} else {
|
||||
Loss::LessThanHalf
|
||||
}
|
||||
}
|
||||
}
|
||||
lost_fraction(dividend, divisor)
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user