Aatch / ramp

RAMP - Rust Arithmetic in Multiple Precision

`let x = mem::replace(self, Int::zero()); *self = ...` pattern seems to result in very poor code-gen #40

Closed: huonw closed this issue 9 years ago

huonw commented 9 years ago

For instance, this is the contents of shl_assign:

impl ShlAssign<usize> for Int {
    #[inline]
    fn shl_assign(&mut self, other: usize) {
        // Move the old value out (leaving a zero in its place), shift it,
        // then store the result back into self.
        let this = std::mem::replace(self, Int::zero());
        *self = this << other;
    }
}

And this is its assembly:

_ZN3int13_$LT$impl$GT$10shl_assign20h9a00eefbb5e99e3aTafE:
    pushq   %rbx
    subq    $48, %rsp
    movq    %rsi, %rax
    movq    %rdi, %rbx
    movq    (%rbx), %rcx
    movq    8(%rbx), %rdx
    movq    %rcx, (%rsp)
    movl    %edx, 8(%rsp)
    shrq    $32, %rdx
    movb    16(%rbx), %cl
    movq    $1, (%rbx)
    movl    $0, 8(%rbx)
    movl    $0, 12(%rbx)
    movb    $-44, 16(%rbx)
    movl    %edx, 12(%rsp)
    movb    %cl, 16(%rsp)
    movb    23(%rbx), %cl
    movb    %cl, 23(%rsp)
    movw    21(%rbx), %cx
    movw    %cx, 21(%rsp)
    movl    17(%rbx), %ecx
    movl    %ecx, 17(%rsp)
    leaq    24(%rsp), %rdi
    leaq    (%rsp), %rsi
    movq    %rax, %rdx
    callq   _ZN3int13_$LT$impl$GT$3shl20hc829f8e5c8476956r6eE@PLT
    movq    40(%rsp), %rax
    movq    %rax, 16(%rbx)
    movups  24(%rsp), %xmm0
    movups  %xmm0, (%rbx)
    addq    $48, %rsp
    popq    %rbx
    retq

(Why are there `movb`s in there?)
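
For context, mem::replace boils down to copying the old value out and writing the new one in. A minimal sketch of its semantics (equivalent in effect to std's definition, if not its exact source):

use std::ptr;

// Copy the old value out of the slot, overwrite the slot with the new
// value, and hand the old value back to the caller.
pub fn replace<T>(dest: &mut T, src: T) -> T {
    unsafe {
        let old = ptr::read(dest); // byte-copy *dest into a local
        ptr::write(dest, src);     // byte-copy src over *dest
        old
    }
}

So every call copies all of Int's bytes out and writes Int::zero()'s bytes in, which is presumably where the piecewise byte moves above come from.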

The other way to share code is to write the non-Assign versions in terms of the Assign versions. This seems to generate better code, so it may be better to use this pattern where possible, e.g.

impl BitAnd<Limb> for Int {
    type Output = Int;

    fn bitand(mut self, other: Limb) -> Int {
        // Delegate to the in-place &= and return the modified value.
        self &= other;
        self
    }
}

And this is its assembly:

_ZN3int13_$LT$impl$GT$6bitand20hb4c960ade11a09ccRsfE:
    pushq   %r15
    pushq   %r14
    pushq   %rbx
    movq    %rdx, %rax
    movq    %rsi, %r15
    movq    %rdi, %rbx
    xorl    %edx, %edx
    xorl    %ecx, %ecx
    movq    %r15, %rdi
    movq    %rax, %rsi
    callq   _ZN3int10bitop_limb20h0a6a37ca86ca46f2AofE@PLT
    movq    16(%r15), %rax
    movq    %rax, 16(%rbx)
    movups  (%r15), %xmm0
    movups  %xmm0, (%rbx)
    movq    %rbx, %rax
    popq    %rbx
    popq    %r14
    popq    %r15
    retq

Aatch commented 9 years ago

Sounds good.

I think the weird codegen in the first case is LLVM trying to be too clever. It's interleaving the byte-level swap (due to the mem::replace) with the rest of the work. Why it's doing it in such a weird way is beyond me; my only guess is that it's trying to keep as many functional units busy as possible, and this is the schedule it settled on.
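
For reference, a minimal self-contained sketch of the suggested direction applied to the shift operators, using a toy single-limb Int rather than ramp's real representation:

use std::ops::{Shl, ShlAssign};

// Toy stand-in for ramp's Int, just to make the pattern concrete.
struct Int(u64);

impl ShlAssign<usize> for Int {
    fn shl_assign(&mut self, other: usize) {
        // Mutate in place: no mem::replace, so no byte-wise shuffle of self.
        self.0 <<= other;
    }
}

impl Shl<usize> for Int {
    type Output = Int;

    fn shl(mut self, other: usize) -> Int {
        // The by-value version delegates to the in-place one; only the move
        // of the return value is left for LLVM to optimize.
        self <<= other;
        self
    }
}

With this shape the Assign impl owns the real logic, and the by-value impl compiles down to a call plus a return-value copy, matching the bitand example above.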