Quuxplusone / LLVMBugzillaTest

0 stars 0 forks source link

Multiple consecutive __builtin_popcount()'s may be combined to a bigger type #14287

Open Quuxplusone opened 11 years ago

Quuxplusone commented 11 years ago
Bugzilla Link PR14392
Status NEW
Importance P enhancement
Reported by Jung-uk Kim (jkim@FreeBSD.org)
Reported on 2012-11-20 13:31:49 -0800
Last modified on 2012-11-22 10:28:52 -0800
Version trunk
Hardware PC FreeBSD
CC clattner@nondot.org, craig.topper@gmail.com, llvm-bugs@lists.llvm.org, pawel@32bitmicro.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also
Please consider the following example:

% cat popcount.c
int
popcount(unsigned long x)
{
        unsigned *p;
        int c, i;

        /*
         * Count the set bits of x one 'unsigned' word at a time, summing
         * __builtin_popcount() over each word that makes up the long.
         *
         * NOTE(review): accessing the unsigned long through an unsigned *
         * is a strict-aliasing violation (C11 6.5p7); the (void *) cast
         * does not make it legal, it only silences the warning.  The
         * int-vs-size_t comparison (i < sizeof(...)) is a signed/unsigned
         * mismatch, harmless here since the bound is tiny.  Both are
         * incidental to the optimization this bug report is about.
         */
        for (c = i = 0, p = (void *)&x; i < sizeof(x) / sizeof(*p); i++)
                c += __builtin_popcount(*p++);
        return (c);
}
% clang -O2 -mpopcnt -S popcount.c
% cat popcount.s
        .file   "popcount.c"
        .text
        .globl  popcount
        .align  16, 0x90
        .type   popcount,@function
popcount:                               # @popcount
        .cfi_startproc
# BB#0:                                 # %entry
        popcntl %edi, %ecx
        shrq    $32, %rdi
        popcntl %edi, %eax
        addl    %ecx, %eax
        ret
.Ltmp0:
        .size   popcount, .Ltmp0-popcount
        .cfi_endproc

        .section        ".note.GNU-stack","",@progbits

Ideally, it should be folded into "return (__builtin_popcountl(x));" and
generate "popcntq %rdi, %rax; ret".
Quuxplusone commented 11 years ago

Do you have a test case that isn't contrived? Recognizing this situation doesn't seem generally useful.

Quuxplusone commented 11 years ago
One day, I wrote a standalone and (almost) portable function (i.e., no
memcpy(), etc.) to count bits from arbitrary length array.  A simplified
version goes something like this:

#ifdef USE_BUILTIN_POPCOUNT
/* Compiler builtins: count an 'unsigned' (32) or 'unsigned long' word. */
#define __BITCOUNT32(x) __builtin_popcount(x)
#define __BITCOUNT(x)   __builtin_popcountl(x)
#define __BITCOUNT_T    unsigned long
#else
/*
 * Portable fallback: classic parallel (SWAR) 32-bit popcount using a
 * GCC/Clang statement expression.  The word width is 'unsigned' here,
 * so __BITCOUNT is just the 32-bit variant.
 *
 * NOTE(review): identifiers starting with a double underscore
 * (__BITCOUNT*, __BITCOUNT_T) are reserved for the implementation
 * (C11 7.1.3) -- fine for a bug-report sketch, risky in real headers.
 */
#define __BITCOUNT32(x) __extension__                           \
({                                                              \
        unsigned _x = x;                                        \
                                                                \
        _x += ~((_x >> 1) & 0x55555555) + 1;                    \
        _x = (_x & 0x33333333) + ((_x >> 2) & 0x33333333);      \
        _x = (_x + (_x >> 4)) & 0x0f0f0f0f;                     \
        _x += _x >> 8;                                          \
        _x += _x >> 16;                                         \
        _x & 0x3f;                                              \
})
#define __BITCOUNT(x)   __BITCOUNT32(x)
#define __BITCOUNT_T    unsigned
#endif

/*
 * Count the set bits in the _l-byte buffer at _p.  Whole
 * __BITCOUNT_T-sized words are counted first, then any trailing bytes
 * individually.  Returns the total bit count, or -1 if _l is out of
 * range.
 */
static __inline int
bitcount(void *_p, int _l)
{
        __BITCOUNT_T *_s;
        int _c, _i;

        /*
         * _l is a byte count; it must be positive and below 0x10000000
         * so that the total number of bits (8 * _l) still fits in an
         * int (i.e. does not exceed INT_MAX).
         */
        if (_l <= 0 || _l >= 0x10000000)
                return (-1);

        _c = 0;
        _s = _p;
        /* Full words first... (int vs size_t comparison, harmless here) */
        for (_i = 0; _i < _l / sizeof(*_s); _i++)
                _c += __BITCOUNT(*_s++);
        /* ...then the leftover tail bytes, zero-extended one at a time. */
        for (_i = 0; _i < _l % sizeof(*_s); _i++)
                _c += __BITCOUNT32(((unsigned char *)_s)[_i]);
        return (_c);
}

Unfortunately, generated code wasn't very good, especially when _l is not
multiple of sizeof(__BITCOUNT_T).  In fact, it didn't do any optimization at
all, except for unrolling the loop.  Therefore, I had to use more complex code
to differentiate LP64/LP32 and other suboptimal cases (i.e., _l is not multiple
of sizeof(__BITCOUNT_T)) with lots of switch/cases, which isn't very useful for
an inline function.

This is yet another example from a different angle:

% cat bitcount.c
#include "bitcount.h" /* the above naive implementation */

int
popcount(void *p)
{

        return (bitcount(p, 5));
}
% clang -O2 -DUSE_BUILTIN_POPCOUNT -mpopcnt -S bitcount.c
% cat bitcount.s
        .file   "bitcount.c"
        .text
        .globl  popcount
        .align  16, 0x90
        .type   popcount,@function
popcount:                               # @popcount
        .cfi_startproc
# BB#0:                                 # %entry
        movzbl  (%rdi), %eax
        popcntl %eax, %ecx
        movzbl  1(%rdi), %eax
        popcntl %eax, %eax
        addl    %ecx, %eax
        movzbl  2(%rdi), %ecx
        popcntl %ecx, %ecx
        addl    %eax, %ecx
        movzbl  3(%rdi), %eax
        popcntl %eax, %edx
        addl    %ecx, %edx
        movzbl  4(%rdi), %eax
        popcntl %eax, %eax
        addl    %edx, %eax
        ret
.Ltmp0:
        .size   popcount, .Ltmp0-popcount
        .cfi_endproc

        .section        ".note.GNU-stack","",@progbits

I wish it generated two popcntl's and addl.  I wouldn't mind if it generated
"xorl %eax, %eax; (copy 5 bytes from (%rdi) to %rax); popcntq %rax, %rax;"
sequence.  If POPCNT instruction is not available, it is much much worse, of
course. :-/

I know it is hard to detect all of these cases.  Maybe it is just my pipe
dream...
Quuxplusone commented 11 years ago

What happens if you use "long long" and __builtin_popcountll on 32-bit and 64-bit systems?

Quuxplusone commented 11 years ago
(In reply to comment #3)
> What happens if you use "long long" and __builtin_popcountll on 32-bit and
> 64-bit systems?

Although long long is not portable, I tried that, too.  Unfortunately, the
generated code showed no difference.
Quuxplusone commented 11 years ago

I wouldn't have expected long long to split on 64-bit architecture. Can you post the llvm IR for the long long case.

Quuxplusone commented 11 years ago
(In reply to comment #5)
> I wouldn't have expected long long to split on 64-bit architecture. Can you
> post the llvm IR for the long long case.

I didn't say long long "splits".  In fact, I want smaller ones to "merge".
Maybe I am not understanding your question right. :-/
Quuxplusone commented 11 years ago

I missed that the last loop was actually calling popcount on individual zero-extended bytes.