Speed up APFloat division by using short division for small divisors.

2025-06-05 03:38:29 +00:00 · 2017-08-23 02:44:41 +03:00 · 2017-08-23 02:44:41 +03:00 · b9c69ec3c3
commit b9c69ec3c3
parent 469a6f9bd9
1 changed files with 96 additions and 30 deletions
--- a/src/librustc_apfloat/ieee.rs
+++ b/src/librustc_apfloat/ieee.rs
@ -460,18 +460,15 @@ impl<S: Semantics> fmt::Display for IeeeFloat<S> {
            // rem <- sig % 10
            // sig <- sig / 10
            let mut rem = 0;
-            for limb in sig.iter_mut().rev() {
+
-                // We don't have an integer doubly wide than Limb,
+            // Use 64-bit division and remainder, with 32-bit chunks from sig.
-                // so we have to split the divrem on two halves.
+            sig::each_chunk(&mut sig, 32, |chunk| {
-                const HALF_BITS: usize = LIMB_BITS / 2;
+                let chunk = chunk as u32;
-                let mut halves = [*limb & ((1 << HALF_BITS) - 1), *limb >> HALF_BITS];
+                let combined = ((rem as u64) << 32) | (chunk as u64);
-                for half in halves.iter_mut().rev() {
+                rem = (combined % 10) as u8;
-                    *half |= rem << HALF_BITS;
+                (combined / 10) as u32 as Limb
-                    rem = *half % 10;
+            });
-                    *half /= 10;
+
                }
                *limb = halves[0] | (halves[1] << HALF_BITS);
            }
            // Reduce the sigificand to avoid wasting time dividing 0's.
            while sig.last() == Some(&0) {
                sig.pop();
@ -491,7 +488,7 @@ impl<S: Semantics> fmt::Display for IeeeFloat<S> {
                exp += 1;
            } else {
                in_trail = false;
-                buffer.push(b'0' + digit as u8);
+                buffer.push(b'0' + digit);
            }
        }
@ -2065,7 +2062,7 @@ impl<S: Semantics> IeeeFloat<S> {
        };
        // Attempt dec_sig * 10^dec_exp with increasing precision.
-        let mut attempt = 1;
+        let mut attempt = 0;
        loop {
            let calc_precision = (LIMB_BITS << attempt) - 1;
            attempt += 1;
@ -2310,6 +2307,17 @@ mod sig {
        limbs.iter().all(|&l| l == 0)
    }
    /// One, not zero, based LSB. That is, returns 0 for a zeroed significand.
    pub(super) fn olsb(limbs: &[Limb]) -> usize {
        for i in 0..limbs.len() {
            if limbs[i] != 0 {
                return i * LIMB_BITS + limbs[i].trailing_zeros() as usize + 1;
            }
        }
        0
    }
    /// One, not zero, based MSB. That is, returns 0 for a zeroed significand.
    pub(super) fn omsb(limbs: &[Limb]) -> usize {
        for i in (0..limbs.len()).rev() {
@ -2468,6 +2476,20 @@ mod sig {
        }
    }
    /// For every consecutive chunk of `bits` bits from `limbs`,
    /// going from most significant to the least significant bits,
    /// call `f` to transform those bits and store the result back.
    pub(super) fn each_chunk<F: FnMut(Limb) -> Limb>(limbs: &mut [Limb], bits: usize, mut f: F) {
        assert_eq!(LIMB_BITS % bits, 0);
        for limb in limbs.iter_mut().rev() {
            let mut r = 0;
            for i in (0..LIMB_BITS / bits).rev() {
                r |= f((*limb >> (i * bits)) & ((1 << bits) - 1)) << (i * bits);
            }
            *limb = r;
        }
    }
    /// Increment in-place, return the carry flag.
    pub(super) fn increment(dst: &mut [Limb]) -> Limb {
        for x in dst {
@ -2686,10 +2708,6 @@ mod sig {
        divisor: &mut [Limb],
        precision: usize,
    ) -> Loss {
        // Zero the quotient before setting bits in it.
        for x in &mut quotient[..limbs_for_bits(precision)] {
            *x = 0;
        }
        // Normalize the divisor.
        let bits = precision - omsb(divisor);
@ -2700,6 +2718,13 @@ mod sig {
        let bits = precision - omsb(dividend);
        shift_left(dividend, exp, bits);
        // Division by 1.
        let olsb_divisor = olsb(divisor);
        if olsb_divisor == precision {
            quotient.copy_from_slice(dividend);
            return Loss::ExactlyZero;
        }
        // Ensure the dividend >= divisor initially for the loop below.
        // Incidentally, this means that the division loop below is
        // guaranteed to set the integer bit to one.
@ -2708,6 +2733,58 @@ mod sig {
            assert_ne!(cmp(dividend, divisor), Ordering::Less)
        }
        // Helper for figuring out the lost fraction.
        let lost_fraction = |dividend: &[Limb], divisor: &[Limb]| {
            match cmp(dividend, divisor) {
                Ordering::Greater => Loss::MoreThanHalf,
                Ordering::Equal => Loss::ExactlyHalf,
                Ordering::Less => {
                    if is_all_zeros(dividend) {
                        Loss::ExactlyZero
                    } else {
                        Loss::LessThanHalf
                    }
                }
            }
        };
        // Try to perform a (much faster) short division for small divisors.
        let divisor_bits = precision - (olsb_divisor - 1);
        macro_rules! try_short_div {
            ($W:ty, $H:ty, $half:expr) => {
                if divisor_bits * 2 <= $half {
                    // Extract the small divisor.
                    let _: Loss = shift_right(divisor, &mut 0, olsb_divisor - 1);
                    let divisor = divisor[0] as $H as $W;
                    // Shift the dividend to produce a quotient with the unit bit set.
                    let top_limb = *dividend.last().unwrap();
                    let mut rem = (top_limb >> (LIMB_BITS - (divisor_bits - 1))) as $H;
                    shift_left(dividend, &mut 0, divisor_bits - 1);
                    // Apply short division in place on $H (of $half bits) chunks.
                    each_chunk(dividend, $half, |chunk| {
                        let chunk = chunk as $H;
                        let combined = ((rem as $W) << $half) | (chunk as $W);
                        rem = (combined % divisor) as $H;
                        (combined / divisor) as $H as Limb
                    });
                    quotient.copy_from_slice(dividend);
                    return lost_fraction(&[(rem as Limb) << 1], &[divisor as Limb]);
                }
            }
        }
        try_short_div!(u32, u16, 16);
        try_short_div!(u64, u32, 32);
        try_short_div!(u128, u64, 64);
        // Zero the quotient before setting bits in it.
        for x in &mut quotient[..limbs_for_bits(precision)] {
            *x = 0;
        }
        // Long division.
        for bit in (0..precision).rev() {
            if cmp(dividend, divisor) != Ordering::Less {
@ -2717,17 +2794,6 @@ mod sig {
            shift_left(dividend, &mut 0, 1);
        }
-        // Figure out the lost fraction.
+        lost_fraction(dividend, divisor)
        match cmp(dividend, divisor) {
            Ordering::Greater => Loss::MoreThanHalf,
            Ordering::Equal => Loss::ExactlyHalf,
            Ordering::Less => {
                if is_all_zeros(dividend) {
                    Loss::ExactlyZero
                } else {
                    Loss::LessThanHalf
                }
            }
        }
    }
 }