llvmbot opened this issue 11 years ago
That got reverted because it exposed existing logic bugs in LLVM: https://reviews.llvm.org/D72396
That one finally re-landed: https://reviews.llvm.org/rGc7da0c383a1b ...so at least the 3rd and 4th examples are the same now, but still not recognized as popcnt.
BTW, codegen regressed a bit
LLVM 12
count_ones: # @count_ones
xor eax, eax
test edi, edi
je .LBB0_2
.LBB0_1: # =>This Inner Loop Header: Depth=1
mov ecx, edi
and ecx, 1
add eax, ecx
shr edi
jne .LBB0_1
.LBB0_2:
ret
LLVM 13 trunk
count_ones: # @count_ones
xor eax, eax
test edi, edi
je .LBB0_3
mov ecx, edi
.LBB0_2: # =>This Inner Loop Header: Depth=1
mov edx, edi
and edx, 1
add eax, edx
shr ecx
cmp edi, 1
mov edi, ecx
ja .LBB0_2
.LBB0_3:
ret
See the extra cmp in the LLVM 13 version.
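The source for count_ones isn't included in that comment (the name suggests a hand-written popcount loop); a C loop along these lines corresponds to the shape of the LLVM 12 output above, which is a useful reference when reading the extra cmp in the LLVM 13 listing. This is a hypothetical reconstruction, not the actual source:

/* Hypothetical reconstruction; the real source is not part of the comment above. */
unsigned count_ones(unsigned v) {
    unsigned c = 0;
    while (v != 0) {   /* loop guard -> the test/je pair before the loop */
        c += v & 1;    /* and + add */
        v >>= 1;       /* shr; LLVM 13 additionally emits cmp/mov here */
    }
    return c;
}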
I found some more examples of code that have this missed optimization:
https://c.godbolt.org/z/MxQW34
Interestingly, while all four implementations should be equivalent, LLVM does not see in both cases that the loop can be terminated when the input is 0; this happens only in the while snippets. Maybe this is something worth looking at.
The 3rd and 4th variants should be identical (but still not recognized as popcnt) after: https://reviews.llvm.org/rGa041c4ec6f7a
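For readers without the godbolt link handy, the snippets being compared are presumably variations on loops like the following (a sketch with hypothetical names, not necessarily the exact four behind the link). The while form terminates as soon as the value reaches 0, which is the property LLVM is not exploiting uniformly:

/* Sketch only; the exact four snippets are behind the godbolt link above. */
unsigned pop_while_shift(unsigned v) {   /* while: exits once v reaches 0 */
    unsigned c = 0;
    while (v) {
        c += v & 1;
        v >>= 1;
    }
    return c;
}

unsigned pop_for_fixed(unsigned v) {     /* for: fixed 32-iteration trip count */
    unsigned c = 0;
    for (int i = 0; i < 32; ++i)
        c += (v >> i) & 1;
    return c;
}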
I don't think it is a good idea to link 1488 and 17087. They are related in name only. popcnt has many variants. When I fixed 1488, I intentionally divided all variants into two broad categories:
c1: non-countable loop based.
c2: countable loop based.
I only implemented one variant in the 1st category. One needs to recognize this pattern in the 2nd category. The code is organized in such a way that you can plug in one or two C++ classes to detect the pattern without lots of interaction with the existing code.
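A sketch of what the two categories could look like (my reading of the labels above, with hypothetical function names):

/* c1 (the implemented category): non-countable loop; the trip count depends
   on the value being counted. This is the form quoted later in this thread. */
unsigned pop_noncountable(unsigned x) {
    unsigned cnt = 0;
    while (x) {
        cnt++;
        x &= x - 1;
    }
    return cnt;
}

/* c2 (the category still to be handled): countable loop whose trip count is
   fixed at 32 regardless of the value. */
unsigned pop_countable(unsigned x) {
    unsigned cnt = 0;
    for (int i = 0; i < 32; ++i)
        cnt += (x >> i) & 1;
    return cnt;
}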
I just discovered bug 1488; this bug could be closed as a dup of it...but that bug is marked resolved for some reason (because the SPEC battle was won?).
In comment 4, I've also listed an implementation that isn't mentioned in bug 1488.
The loop idiom detection code only handles the exact code that Nick pasted.
Thanks, Nick. I wasn't sure if it was a loop recognition problem or an instruction selection problem. For reference, here's another loop implementation that I tried. This one just produces the straightforward codegen instead of anything fancy:
$ cat pop.c
unsigned int bitCount (unsigned int value) {
    unsigned int count = 0;
    while (value > 0) {           // until all bits are zero
        if ((value & 1) == 1)     // check lower bit
            count++;
        value >>= 1;              // shift bits, removing lower bit
    }
    return count;
}
$ ./clang -S -O3 -march=core-avx2 pop.c -o /dev/stdout
.section __TEXT,__text,regular,pure_instructions
.globl _bitCount
.align 4, 0x90
_bitCount: ## @bitCount
.cfi_startproc
## BB#0: ## %entry
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
xorl %eax, %eax
testl %edi, %edi
je LBB0_2
.align 4, 0x90
LBB0_1: ## %while.body
## =>This Inner Loop Header: Depth=1
movl %edi, %ecx
andl $1, %ecx
addl %ecx, %eax
shrl %edi
jne LBB0_1
LBB0_2: ## %while.end
popq %rbp
ret
This is a missed optimization in the loop idiom recognizer. It expects that a popcount loop will terminate when the value is equal to zero (i.e., the form "while(x) {cnt++; x &= x - 1;}").
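Given that, a source-level workaround is to rewrite the shift-and-test loop from pop.c above into exactly the recognized form; this should be picked up by the idiom recognizer today (a sketch; the function name is hypothetical):

/* Same result as bitCount() above, but written in the clear-lowest-set-bit
   form that the loop idiom recognizer is looking for. */
unsigned int bitCountRecognized(unsigned int value) {
    unsigned int count = 0;
    while (value) {
        count++;
        value &= value - 1;   /* clears the lowest set bit each iteration */
    }
    return count;
}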
And the current codegen for plain SSE (no AVX) via -march=corei7:
movd %edi, %xmm0
pshufd $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
movdqa LCPI0_0(%rip), %xmm1
pand %xmm0, %xmm1
pxor %xmm2, %xmm2
pcmpeqd %xmm2, %xmm1
movdqa LCPI0_1(%rip), %xmm3
pandn %xmm3, %xmm1
movdqa LCPI0_2(%rip), %xmm4
pand %xmm0, %xmm4
pcmpeqd %xmm2, %xmm4
pandn %xmm3, %xmm4
paddd %xmm1, %xmm4
movdqa LCPI0_3(%rip), %xmm1
pand %xmm0, %xmm1
pcmpeqd %xmm2, %xmm1
pandn %xmm3, %xmm1
paddd %xmm4, %xmm1
movdqa LCPI0_4(%rip), %xmm4
pand %xmm0, %xmm4
pcmpeqd %xmm2, %xmm4
pandn %xmm3, %xmm4
paddd %xmm1, %xmm4
movdqa LCPI0_5(%rip), %xmm1
pand %xmm0, %xmm1
pcmpeqd %xmm2, %xmm1
pandn %xmm3, %xmm1
paddd %xmm4, %xmm1
movdqa LCPI0_6(%rip), %xmm4
pand %xmm0, %xmm4
pcmpeqd %xmm2, %xmm4
pandn %xmm3, %xmm4
movdqa LCPI0_7(%rip), %xmm5
pand %xmm0, %xmm5
paddd %xmm1, %xmm4
pcmpeqd %xmm2, %xmm5
pand LCPI0_8(%rip), %xmm0
pcmpeqd %xmm2, %xmm0
pandn %xmm3, %xmm0
paddd %xmm4, %xmm0
pandn %xmm3, %xmm5
paddd %xmm0, %xmm5
movdqa %xmm5, %xmm0
movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
paddd %xmm5, %xmm0
phaddd %xmm0, %xmm0
movd %xmm0, %eax
popq %rbp
ret
For reference, here's the current codegen for an AVX-capable CPU (-march=corei7-avx):
vmovd %edi, %xmm0
vpshufd $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vandps LCPI0_0(%rip), %ymm0, %ymm1
vextractf128 $1, %ymm1, %xmm2
vpxor %xmm3, %xmm3, %xmm3
vmovaps LCPI0_1(%rip), %ymm4
vmovaps LCPI0_2(%rip), %ymm9
vpcmpeqd %xmm6, %xmm6, %xmm6
vmovaps LCPI0_3(%rip), %ymm8
vpcmpeqd %xmm3, %xmm2, %xmm2
vpxor %xmm6, %xmm2, %xmm2
vpcmpeqd %xmm3, %xmm1, %xmm1
vpxor %xmm6, %xmm1, %xmm1
vinsertf128 $1, %xmm2, %ymm1, %ymm1
vandps %ymm9, %ymm1, %ymm10
vandps %ymm4, %ymm0, %ymm2
vextractf128 $1, %ymm2, %xmm4
vpcmpeqd %xmm3, %xmm4, %xmm4
vextractf128 $1, %ymm10, %xmm7
vpxor %xmm6, %xmm4, %xmm4
vpcmpeqd %xmm3, %xmm2, %xmm2
vpxor %xmm6, %xmm2, %xmm2
vinsertf128 $1, %xmm4, %ymm2, %ymm2
vandps %ymm9, %ymm2, %ymm2
vandps %ymm8, %ymm0, %ymm4
vextractf128 $1, %ymm4, %xmm5
vpcmpeqd %xmm3, %xmm5, %xmm5
vextractf128 $1, %ymm2, %xmm1
vpaddd %xmm7, %xmm1, %xmm1
vpxor %xmm6, %xmm5, %xmm5
vpcmpeqd %xmm3, %xmm4, %xmm4
vpxor %xmm6, %xmm4, %xmm4
vinsertf128 $1, %xmm5, %ymm4, %ymm4
vandps %ymm9, %ymm4, %ymm4
vandps LCPI0_4(%rip), %ymm0, %ymm0
vextractf128 $1, %ymm0, %xmm5
vpcmpeqd %xmm3, %xmm5, %xmm5
vextractf128 $1, %ymm4, %xmm7
vpaddd %xmm1, %xmm7, %xmm1
vpxor %xmm6, %xmm5, %xmm5
vpcmpeqd %xmm3, %xmm0, %xmm0
vpxor %xmm6, %xmm0, %xmm0
vinsertf128 $1, %xmm5, %ymm0, %ymm0
vandps %ymm9, %ymm0, %ymm0
vextractf128 $1, %ymm0, %xmm3
vpaddd %xmm1, %xmm3, %xmm1
vpaddd %xmm10, %xmm2, %xmm2
vpaddd %xmm2, %xmm4, %xmm2
vpaddd %xmm2, %xmm0, %xmm0
vpaddd %xmm1, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm1
vshufps $14, %ymm0, %ymm1, %ymm1 ## ymm1 = ymm1[2,3],ymm0[0,0],ymm1[6,7],ymm0[4,4]
vpaddd %xmm1, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm0, %ymm1
vmovshdup %ymm1, %ymm1
vpaddd %xmm1, %xmm0, %xmm0
vmovd %xmm0, %eax
popq %rbp
vzeroupper
ret
Extended Description
It looks like clang/llvm did something heroic here to recognize that this is a popcount function and avoid the loop by using AVX2, but I was expecting this loop to generate a simple popcnt instruction. I think popcnt has been available since Nehalem for Intel and Family 10h for AMD.
Better codegen would be something like this:
popcnt %edi, %eax
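For comparison (not part of the original report), calling the compiler builtin sidesteps idiom recognition entirely; __builtin_popcount is a standard GCC/Clang builtin, and with a popcnt-capable target (e.g. -march=corei7 or -mpopcnt) clang lowers it to the single instruction shown above:

/* Workaround sketch: use the builtin directly instead of relying on
   loop idiom recognition. */
unsigned int bitCount(unsigned int value) {
    return (unsigned int)__builtin_popcount(value);
}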