llvmbot opened this issue 11 years ago
That got reverted because it exposed existing logic bugs in LLVM: https://reviews.llvm.org/D72396
That one finally re-landed: https://reviews.llvm.org/rGc7da0c383a1b ...so at least the 3rd and 4th examples are the same now, but still not recognized as popcnt.
BTW, codegen regressed a bit
LLVM 12
count_ones: # @count_ones
xor eax, eax
test edi, edi
je .LBB0_2
.LBB0_1: # =>This Inner Loop Header: Depth=1
mov ecx, edi
and ecx, 1
add eax, ecx
shr edi
jne .LBB0_1
.LBB0_2:
ret
LLVM 13 trunk
count_ones: # @count_ones
xor eax, eax
test edi, edi
je .LBB0_3
mov ecx, edi
.LBB0_2: # =>This Inner Loop Header: Depth=1
mov edx, edi
and edx, 1
add eax, edx
shr ecx
cmp edi, 1
mov edi, ecx
ja .LBB0_2
.LBB0_3:
ret
See the extra cmp in the LLVM 13 version.
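The source for count_ones isn't included in that comment (the name suggests a hand-written popcount loop); a C loop along these lines corresponds to the shape of the LLVM 12 output above, which is a useful reference when reading the extra cmp in the LLVM 13 listing. This is a hypothetical reconstruction, not the actual source:

/* Hypothetical reconstruction; the real source is not part of the comment above. */
unsigned count_ones(unsigned v) {
    unsigned c = 0;
    while (v != 0) {   /* loop guard -> the test/je pair before the loop */
        c += v & 1;    /* and + add */
        v >>= 1;       /* shr; LLVM 13 additionally emits cmp/mov here */
    }
    return c;
}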
I found some more examples of code that have this missed optimization:
https://c.godbolt.org/z/MxQW34
Interestingly, while all four implementations should be equivalent, LLVM does not see in both cases that the loop can be terminated when the input is 0; this happens only in the while snippets. Maybe this is something worth looking at.
The 3rd and 4th variants should be identical (but still not recognized as popcnt) after: https://reviews.llvm.org/rGa041c4ec6f7a
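For readers without the godbolt link handy, the snippets being compared are presumably variations on loops like the following (a sketch with hypothetical names, not necessarily the exact four behind the link). The while form terminates as soon as the value reaches 0, which is the property LLVM is not exploiting uniformly:

/* Sketch only; the exact four snippets are behind the godbolt link above. */
unsigned pop_while_shift(unsigned v) {   /* while: exits once v reaches 0 */
    unsigned c = 0;
    while (v) {
        c += v & 1;
        v >>= 1;
    }
    return c;
}

unsigned pop_for_fixed(unsigned v) {     /* for: fixed 32-iteration trip count */
    unsigned c = 0;
    for (int i = 0; i < 32; ++i)
        c += (v >> i) & 1;
    return c;
}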
I don't think it is a good idea to link 1488 and 17087. They are related in name only. popcnt has many variants. When I fixed 1488, I intentionally divided all variants into two broad categories:
c1: non-countable loop based.
c2: countable loop based.
I only implemented one variant in the 1st category. One needs to recognize this pattern in the 2nd category. The code is organized in such a way that you can plug in one or two C++ classes to detect the pattern without lots of interaction with the existing code.
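A sketch of what the two categories could look like (my reading of the labels above, with hypothetical function names):

/* c1 (the implemented category): non-countable loop; the trip count depends
   on the value being counted. This is the form quoted later in this thread. */
unsigned pop_noncountable(unsigned x) {
    unsigned cnt = 0;
    while (x) {
        cnt++;
        x &= x - 1;
    }
    return cnt;
}

/* c2 (the category still to be handled): countable loop whose trip count is
   fixed at 32 regardless of the value. */
unsigned pop_countable(unsigned x) {
    unsigned cnt = 0;
    for (int i = 0; i < 32; ++i)
        cnt += (x >> i) & 1;
    return cnt;
}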
I just discovered bug 1488; this bug could be closed as a dup of it...but that bug is marked resolved for some reason (because the SPEC battle was won?).
In comment 4, I've also listed an implementation that isn't mentioned in bug 1488.
The loop idiom detection code only handles the exact code that Nick pasted.
Thanks, Nick. I wasn't sure if it was a loop recognition problem or an instruction selection problem. For reference, here's another loop implementation that I tried. This one just produces the straightforward codegen instead of anything fancy:
$ cat pop.c
unsigned int bitCount (unsigned int value) {
    unsigned int count = 0;
    while (value > 0) {           // until all bits are zero
        if ((value & 1) == 1)     // check lower bit
            count++;
        value >>= 1;              // shift bits, removing lower bit
    }
    return count;
}
$ ./clang -S -O3 -march=core-avx2 pop.c -o /dev/stdout
.section __TEXT,__text,regular,pure_instructions
.globl _bitCount
.align 4, 0x90
_bitCount: ## @bitCount
.cfi_startproc
## BB#0: ## %entry
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
xorl %eax, %eax
testl %edi, %edi
je LBB0_2
.align 4, 0x90
LBB0_1: ## %while.body
## =>This Inner Loop Header: Depth=1
movl %edi, %ecx
andl $1, %ecx
addl %ecx, %eax
shrl %edi
jne LBB0_1
LBB0_2: ## %while.end
popq %rbp
ret
This is a missed optimization in the loop idiom recognizer. It expects that a popcount loop will terminate when the value is equal to zero (i.e., the form "while(x) {cnt++; x &= x - 1;}").
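Given that, a source-level workaround is to rewrite the shift-and-test loop from pop.c above into exactly the recognized form; this should be picked up by the idiom recognizer today (a sketch; the function name is hypothetical):

/* Same result as bitCount() above, but written in the clear-lowest-set-bit
   form that the loop idiom recognizer is looking for. */
unsigned int bitCountRecognized(unsigned int value) {
    unsigned int count = 0;
    while (value) {
        count++;
        value &= value - 1;   /* clears the lowest set bit each iteration */
    }
    return count;
}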
And the current codegen for plain SSE (no AVX) via -march=corei7:
movd %edi, %xmm0
pshufd $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
movdqa LCPI0_0(%rip), %xmm1
pand %xmm0, %xmm1
pxor %xmm2, %xmm2
pcmpeqd %xmm2, %xmm1
movdqa LCPI0_1(%rip), %xmm3
pandn %xmm3, %xmm1
movdqa LCPI0_2(%rip), %xmm4
pand %xmm0, %xmm4
pcmpeqd %xmm2, %xmm4
pandn %xmm3, %xmm4
paddd %xmm1, %xmm4
movdqa LCPI0_3(%rip), %xmm1
pand %xmm0, %xmm1
pcmpeqd %xmm2, %xmm1
pandn %xmm3, %xmm1
paddd %xmm4, %xmm1
movdqa LCPI0_4(%rip), %xmm4
pand %xmm0, %xmm4
pcmpeqd %xmm2, %xmm4
pandn %xmm3, %xmm4
paddd %xmm1, %xmm4
movdqa LCPI0_5(%rip), %xmm1
pand %xmm0, %xmm1
pcmpeqd %xmm2, %xmm1
pandn %xmm3, %xmm1
paddd %xmm4, %xmm1
movdqa LCPI0_6(%rip), %xmm4
pand %xmm0, %xmm4
pcmpeqd %xmm2, %xmm4
pandn %xmm3, %xmm4
movdqa LCPI0_7(%rip), %xmm5
pand %xmm0, %xmm5
paddd %xmm1, %xmm4
pcmpeqd %xmm2, %xmm5
pand LCPI0_8(%rip), %xmm0
pcmpeqd %xmm2, %xmm0
pandn %xmm3, %xmm0
paddd %xmm4, %xmm0
pandn %xmm3, %xmm5
paddd %xmm0, %xmm5
movdqa %xmm5, %xmm0
movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
paddd %xmm5, %xmm0
phaddd %xmm0, %xmm0
movd %xmm0, %eax
popq %rbp
ret
For reference, here's the current codegen for an AVX-capable CPU (-march=corei7-avx):
vmovd %edi, %xmm0
vpshufd $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vandps LCPI0_0(%rip), %ymm0, %ymm1
vextractf128 $1, %ymm1, %xmm2
vpxor %xmm3, %xmm3, %xmm3
vmovaps LCPI0_1(%rip), %ymm4
vmovaps LCPI0_2(%rip), %ymm9
vpcmpeqd %xmm6, %xmm6, %xmm6
vmovaps LCPI0_3(%rip), %ymm8
vpcmpeqd %xmm3, %xmm2, %xmm2
vpxor %xmm6, %xmm2, %xmm2
vpcmpeqd %xmm3, %xmm1, %xmm1
vpxor %xmm6, %xmm1, %xmm1
vinsertf128 $1, %xmm2, %ymm1, %ymm1
vandps %ymm9, %ymm1, %ymm10
vandps %ymm4, %ymm0, %ymm2
vextractf128 $1, %ymm2, %xmm4
vpcmpeqd %xmm3, %xmm4, %xmm4
vextractf128 $1, %ymm10, %xmm7
vpxor %xmm6, %xmm4, %xmm4
vpcmpeqd %xmm3, %xmm2, %xmm2
vpxor %xmm6, %xmm2, %xmm2
vinsertf128 $1, %xmm4, %ymm2, %ymm2
vandps %ymm9, %ymm2, %ymm2
vandps %ymm8, %ymm0, %ymm4
vextractf128 $1, %ymm4, %xmm5
vpcmpeqd %xmm3, %xmm5, %xmm5
vextractf128 $1, %ymm2, %xmm1
vpaddd %xmm7, %xmm1, %xmm1
vpxor %xmm6, %xmm5, %xmm5
vpcmpeqd %xmm3, %xmm4, %xmm4
vpxor %xmm6, %xmm4, %xmm4
vinsertf128 $1, %xmm5, %ymm4, %ymm4
vandps %ymm9, %ymm4, %ymm4
vandps LCPI0_4(%rip), %ymm0, %ymm0
vextractf128 $1, %ymm0, %xmm5
vpcmpeqd %xmm3, %xmm5, %xmm5
vextractf128 $1, %ymm4, %xmm7
vpaddd %xmm1, %xmm7, %xmm1
vpxor %xmm6, %xmm5, %xmm5
vpcmpeqd %xmm3, %xmm0, %xmm0
vpxor %xmm6, %xmm0, %xmm0
vinsertf128 $1, %xmm5, %ymm0, %ymm0
vandps %ymm9, %ymm0, %ymm0
vextractf128 $1, %ymm0, %xmm3
vpaddd %xmm1, %xmm3, %xmm1
vpaddd %xmm10, %xmm2, %xmm2
vpaddd %xmm2, %xmm4, %xmm2
vpaddd %xmm2, %xmm0, %xmm0
vpaddd %xmm1, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm1
vshufps $14, %ymm0, %ymm1, %ymm1 ## ymm1 = ymm1[2,3],ymm0[0,0],ymm1[6,7],ymm0[4,4]
vpaddd %xmm1, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm1
vmovshdup %ymm1, %ymm1
vpaddd %xmm1, %xmm0, %xmm0
vmovd %xmm0, %eax
popq %rbp
vzeroupper
ret
Extended Description
It looks like clang/llvm did something heroic here to recognize that this is a popcount function and avoid the loop by using AVX2, but I was expecting this loop to generate a simple popcnt instruction. I think popcnt has been available since Nehalem for Intel and Family 10h for AMD.
Better codegen would be something like this:
popcnt %edi, %eax
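For comparison (not part of the original report), calling the compiler builtin sidesteps idiom recognition entirely; __builtin_popcount is a standard GCC/Clang builtin, and with a popcnt-capable target (e.g. -march=corei7 or -mpopcnt) clang lowers it to the single instruction shown above:

/* Workaround sketch: use the builtin directly instead of relying on
   loop idiom recognition. */
unsigned int bitCount(unsigned int value) {
    return (unsigned int)__builtin_popcount(value);
}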