llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.84k stars 11.91k forks source link

Suboptimal code for checking if nth bit is set #102694

Open Kmeakin opened 2 months ago

Kmeakin commented 2 months ago

https://godbolt.org/z/h54KPcvKh

bool contains_unchecked(u32 bits, u8 index) {
    return (bits & (1 << index)) != 0;
}

bool contains_checked(u32 bits, u8 index) {
    if (index >= 32) {
        return false;
    }
    return (bits & (1 << index)) != 0;
}

For contains_unchecked, LLVM generates optimal assembly. The naive implementation would be

contains_unchecked:
        mov     w8, #0x1
        lsl     w8, w8, w1
        ands    w8, w8, w0
        cset    w0, ne
        ret

But in this case, LLVM is smart enough to rewrite (bits & (1 << index)) != 0 to ((bits >> index) & 1) != 0 and so save 2 instructions:

contains_unchecked:
        lsr     w8, w0, w1
        and     w0, w8, #0x1
        ret

However, LLVM fails to make the same optimization when bounds checking is added to avoid UB:

contains_checked:
        mov     w8, #1
        and     w9, w1, #0xff
        lsl     w8, w8, w1
        tst     w8, w0
        mov     w8, #32
        ccmp    w9, w8, #2, ne
        cset    w0, lo
        ret

I believe the optimal assembly would be:

        lsr     w8, w0, w1
        and     w0, w8, #0x1
        and     w9, w1, #0xff
        cmp     w9, #32
        csel    w0, w0, wzr, lo
        ret
llvmbot commented 2 months ago

@llvm/issue-subscribers-backend-aarch64

Author: Karl Meakin (Kmeakin)

https://godbolt.org/z/h54KPcvKh ```c bool contains_unchecked(u32 bits, u8 index) { return (bits & (1 << index)) != 0; } bool contains_checked(u32 bits, u8 index) { if (index >= 32) { return false; } return (bits & (1 << index)) != 0; } ``` For `contains_unchecked`, LLVM generates optimal assembly. The naive implementation would be ```asm contains_unchecked: lsr w8, w0, w1 mov w8, #0x1 lsl w8, w8, w1 ands w8, #0x1 cset w0, ne ``` But in this case, LLVM is smart enough to rewrite `(bits & (1 << index)) != 0` to `(bits >> index) != 0` and so save 2 instructions: ```asm contains_unchecked: lsr w8, w0, w1 and w0, w8, #0x1 ret ``` However, LLVM fails to make the same optimization when bounds checking is added to avoid UB: ```asm contains_checked: mov w8, #1 and w9, w1, #0xff lsl w8, w8, w1 tst w8, w0 mov w8, #32 ccmp w9, w8, #2, ne cset w0, lo ret ``` I believe the optimal assembly would be: ```asm lsr w8, w0, w1 and w0, w8, #0x1 and w9, w1, #0xff cmp w9, #32 csel w0, w0, wzr, ls ret ```
Kmeakin commented 2 months ago

I am less familiar with x86, but I think the x86 assembly could also be optimized:

contains_checked:
        mov     ecx, esi
        cmp     cl, 32
        setb    dl
        mov     eax, 1
        shl     eax, cl
        test    eax, edi
        setne   al
        and     al, dl
        ret

to

contains_checked:
        bt      edi, esi
        setb    al
        cmp     sil, 32
        setb    dl
        and     al, dl
        ret
llvmbot commented 2 months ago

@llvm/issue-subscribers-backend-x86

Author: Karl Meakin (Kmeakin)

https://godbolt.org/z/h54KPcvKh ```c bool contains_unchecked(u32 bits, u8 index) { return (bits & (1 << index)) != 0; } bool contains_checked(u32 bits, u8 index) { if (index >= 32) { return false; } return (bits & (1 << index)) != 0; } ``` For `contains_unchecked`, LLVM generates optimal assembly. The naive implementation would be ```asm contains_unchecked: lsr w8, w0, w1 mov w8, #0x1 lsl w8, w8, w1 ands w8, #0x1 cset w0, ne ``` But in this case, LLVM is smart enough to rewrite `(bits & (1 << index)) != 0` to `(bits >> index) != 0` and so save 2 instructions: ```asm contains_unchecked: lsr w8, w0, w1 and w0, w8, #0x1 ret ``` However, LLVM fails to make the same optimization when bounds checking is added to avoid UB: ```asm contains_checked: mov w8, #1 and w9, w1, #0xff lsl w8, w8, w1 tst w8, w0 mov w8, #32 ccmp w9, w8, #2, ne cset w0, lo ret ``` I believe the optimal assembly would be: ```asm lsr w8, w0, w1 and w0, w8, #0x1 and w9, w1, #0xff cmp w9, #32 csel w0, w0, wzr, ls ret ```
Kmeakin commented 2 months ago

With AArch64 you could even fold the second and and the cmp together:

        lsr     w8, w0, w1
        and     w0, w8, #0x1
        tst     w1, #0xe0
        csel    w0, w0, wzr, eq
        ret
RKSimon commented 2 months ago

CC @RKSimon

RKSimon commented 2 months ago
define i1 @contains_unchecked(i32 %bits, i8 %index) {
  %conv = zext nneg i8 %index to i32
  %shl = shl nuw i32 1, %conv
  %and = and i32 %shl, %bits
  %cmp = icmp ne i32 %and, 0
  ret i1 %cmp
}

define i1 @contains_checked(i32 %bits, i8 %index) {
  %cmp = icmp ult i8 %index, 32
  %conv = zext nneg i8 %index to i32
  %shl = shl nuw i32 1, %conv
  %and = and i32 %shl, %bits
  %cmp2 = icmp ne i32 %and, 0
  %retval.0 = select i1 %cmp, i1 %cmp2, i1 false
  ret i1 %retval.0
}
Explorer09 commented 2 months ago

But in this case, LLVM is smart enough to rewrite (bits & (1 << index)) != 0 to (bits >> index) != 0 and so save 2 instructions

Is there a typo in the original post? I think the expression should be (bits >> index) & 1 and not (bits >> index) != 0.

Kmeakin commented 1 month ago

But in this case, LLVM is smart enough to rewrite (bits & (1 << index)) != 0 to (bits >> index) != 0 and so save 2 instructions

Is there a typo in the original post? I think the expression should be (bits >> index) & 1 and not (bits >> index) != 0.

Yes, that was a typo. Thanks