mirror of
https://github.com/rust-lang/rust.git
synced 2024-11-22 06:44:35 +00:00
Auto merge of #130002 - orlp:better-div-floor-ceil, r=thomcc
better implementation of signed div_floor/ceil

Tracking issue for signed `div_floor`/`div_ceil`: https://github.com/rust-lang/rust/issues/88581.

This PR improves the implementation of those two functions by adding a better branchless algorithm. Side-by-side comparison of `i32::div_floor` on x86-64:

```asm
div_floor_new:                          div_floor_old:
    push rax                                push rax
    test esi, esi                           test esi, esi
    je .LBB0_3                              je .LBB1_6
    mov eax, esi                            mov eax, esi
    not eax                                 not eax
    lea ecx, [rdi - 2147483648]             lea ecx, [rdi - 2147483648]
    or ecx, eax                             or ecx, eax
    je .LBB0_2                              je .LBB1_7
    mov eax, edi                            mov eax, edi
    cdq                                     cdq
    idiv esi                                idiv esi
    xor esi, edi                            test edx, edx
    sar esi, 31                             setg cl
    test edx, edx                           test esi, esi
    cmove esi, edx                          sets dil
    add eax, esi                            test dil, cl
    pop rcx                                 jne .LBB1_4
    ret                                     test edx, edx
.LBB0_3:                                    setns cl
    lea rdi, [rip + .L__unnamed_1]          test esi, esi
    call qword ptr [rip + panic...]         setle dl
.LBB0_2:                                    or dl, cl
    lea rdi, [rip + .L__unnamed_1]          jne .LBB1_5
    call qword ptr [rip + panic...]     .LBB1_4:
                                            dec eax
                                        .LBB1_5:
                                            pop rcx
                                            ret
                                        .LBB1_6:
                                            lea rdi, [rip + .L__unnamed_2]
                                            call qword ptr [rip + panic...]
                                        .LBB1_7:
                                            lea rdi, [rip + .L__unnamed_2]
                                            call qword ptr [rip + panic...]
```

And on Aarch64:

```asm
_div_floor_new:                             _div_floor_old:
    stp x29, x30, [sp, #-16]!                   stp x29, x30, [sp, #-16]!
    mov x29, sp                                 mov x29, sp
    cbz w1, LBB0_4                              cbz w1, LBB1_9
    mov w8, #-2147483648                        mov x8, x0
    cmp w0, w8                                  mov w9, #-2147483648
    b.ne LBB0_3                                 cmp w0, w9
    cmn w1, #1                                  b.ne LBB1_3
    b.eq LBB0_5                                 cmn w1, #1
LBB0_3:                                         b.eq LBB1_10
    sdiv w8, w0, w1                         LBB1_3:
    msub w9, w8, w1, w0                         sdiv w0, w8, w1
    eor w10, w1, w0                             msub w8, w0, w1, w8
    asr w10, w10, #31                           tbz w1, #31, LBB1_5
    cmp w9, #0                                  cmp w8, #0
    csel w9, wzr, w10, eq                       b.gt LBB1_7
    add w0, w9, w8                          LBB1_5:
    ldp x29, x30, [sp], #16                     cmp w1, #1
    ret                                         b.lt LBB1_8
LBB0_4:                                         tbz w8, #31, LBB1_8
    adrp x0, l___unnamed_1@PAGE             LBB1_7:
    add x0, x0, l___unnamed_1@PAGEOFF           sub w0, w0, #1
    bl panic...                             LBB1_8:
LBB0_5:                                         ldp x29, x30, [sp], #16
    adrp x0, l___unnamed_1@PAGE                 ret
    add x0, x0, l___unnamed_1@PAGEOFF       LBB1_9:
    bl panic...                                 adrp x0, l___unnamed_2@PAGE
                                                add x0, x0, l___unnamed_2@PAGEOFF
                                                bl panic...
                                            LBB1_10:
                                                adrp x0, l___unnamed_2@PAGE
                                                add x0, x0, l___unnamed_2@PAGEOFF
                                                bl panic...
```
This commit is contained in:
commit
adf8d168af
@ -3023,8 +3023,16 @@ macro_rules! int_impl {
|
||||
pub const fn div_floor(self, rhs: Self) -> Self {
|
||||
let d = self / rhs;
|
||||
let r = self % rhs;
|
||||
if (r > 0 && rhs < 0) || (r < 0 && rhs > 0) {
|
||||
d - 1
|
||||
|
||||
// If the remainder is non-zero, we need to subtract one if the
|
||||
// signs of self and rhs differ, as this means we rounded upwards
|
||||
// instead of downwards. We do this branchlessly by creating a mask
|
||||
// which is all-ones iff the signs differ, and 0 otherwise. Then by
|
||||
// adding this mask (which corresponds to the signed value -1), we
|
||||
// get our correction.
|
||||
let correction = (self ^ rhs) >> (Self::BITS - 1);
|
||||
if r != 0 {
|
||||
d + correction
|
||||
} else {
|
||||
d
|
||||
}
|
||||
@ -3059,8 +3067,12 @@ macro_rules! int_impl {
|
||||
pub const fn div_ceil(self, rhs: Self) -> Self {
|
||||
let d = self / rhs;
|
||||
let r = self % rhs;
|
||||
if (r > 0 && rhs > 0) || (r < 0 && rhs < 0) {
|
||||
d + 1
|
||||
|
||||
// When the remainder is non-zero, we have a.div_ceil(b) == 1 + a.div_floor(b),
|
||||
// so we can re-use the algorithm from div_floor, just adding 1.
|
||||
let correction = 1 + ((self ^ rhs) >> (Self::BITS - 1));
|
||||
if r != 0 {
|
||||
d + correction
|
||||
} else {
|
||||
d
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user