__uint128_t is not portable to 32-bit

easyaspi314 commented 5 years ago

I noticed that you added __uint128_t to the xxh3. That is not portable to 32-bit; the type is only defined on 64-bit, and as you see below, for a pretty decent reason.

I suggest this:

static U64 mult128(U64 bot, U64 top) {
#if (SIZE_MAX > 0xFFFFFFFFULL) || defined(__SIZEOF_UINT128__) /* TODO: better detection */
    __uint128_t prod = (__uint128_t)bot * (__uint128_t)top;
    return prod + (prod >> 64);
#else 
    /* based off of LLVM's optimization of https://github.com/calccrypto/uint128_t's
     * operator* when both upper halves are zero.
     * There's probably a better way to do this, and if it weren't for the
     * mixing in the middle, it could be easily SIMD'd.  */
    U64 BD = (top & 0xFFFFFFFF) * (bot & 0xFFFFFFFF);
    U64 AC = (top >> 32)        * (bot >> 32);

    U64 BC = (top & 0xFFFFFFFF) * (bot >> 32);
    U64 AD = (top >> 32)        * (bot & 0xFFFFFFFF);

    U64 sum1 = (BC & 0xFFFFFFFF);
    U64 sum2 = (AD & 0xFFFFFFFF);

    sum1 += (BD >> 32);
    sum2 += sum1;

    sum1 = sum2 >> 32;
    sum2 <<= 32;

    sum1 += (BD & 0xFFFFFFFF);
    sum2 += (AC & 0xFFFFFFFF);

    sum1 += (BC >> 32);
    sum2 += (AD >> 32);

    sum1 += (AC & 0xFFFFFFFF00000000ULL);
    sum2 += sum1;

    printf("%llx\n", sum2);
    return sum2;
}
#endif 
}

However, I do want to mention that GCC doesn't really like this code, and it half vectorizes it. I'm trying out a potential SSE2/NEON32 version, though, which has a chance of being faster or slower. The first part I got from MSDN, but the second part confuses me.

    uint32_t A = top >> 32;
    uint32_t B = top & 0xFFFFFFFF;
    uint32_t C = bot >> 32;
    uint32_t D = bot & 0xFFFFFFFF;
    __m128i ba = _mm_set_epi32(0, B, 0, A); // { B, A }
    __m128i dc = _mm_set_epi32(0, D, 0, C); // { D, C }
    __m128i cd = _mm_shuffle_epi32(dc, _MM_SHUFFLE(1, 0, 3, 2)); // { C, D }

    __m128i bd_ac = _mm_mul_epu32(ba, dc); // { BD, AC }
    __m128i bc_ad = _mm_mul_epu32(ba, cd); // { BC, AD }

    // this could be improved probably
#ifdef __SSE4_1__
    __m128i zero = _mm_setzero_si128();
    __m128i bc_ad_lo = _mm_blend_epi16(bc_ad, zero, 0xCC); // bd_ac & 0xFFFFFFFF;
#else 
    __m128i ff = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF);
    __m128i bc_ad_lo = _mm_and_si128(bc_ad, ff);
#endif 
    __m128i bd_hi = _mm_srli_si128(bd_ac, 12); // {   0, BD >> 32 }
    __m128i sum = _mm_add_epi64(bc_ad_lo, bd_hi);  // { bc & 0xFFFFFFFF, ad & 0xFFFFFFFF + BD >> 32 }
    __m128i sumShuf = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(sum), _mm_setzero_ps(), _MM_SHUFFLE(0, 0, 3, 2)));
    sum = _mm_add_epi64(sum, sumShuf);
   // dunno

If I do it with vector extensions, Clang gives me this:

    U64x2 BA = { top >> 32, top & 0xFFFFFFFF };
    U64x2 DC = { bot >> 32, bot & 0xFFFFFFFF };
    U64x2 CD = { bot & 0xFFFFFFFF, bot >> 32 };
    U64x2 BD_AC = (BA & 0xFFFFFFFF) * (DC & 0xFFFFFFFF);
    U64x2 BC_AD = (BA & 0xFFFFFFFF) * (CD & 0xFFFFFFFF);
    U64x2 sum = BC_AD & 0xFFFFFFFF;
    sum[1] += BD_AC[0] >> 32;
    sum[0] += sum[1];
    U64x2 sumv2 = { sum[0] << 32, sum[0] >> 32 };
    sum = sumv2;
    sum += BD_AC & 0xFFFFFFFF;
    sum += BC_AD >> 32;
    sum[1] += (BD_AC[1] & 0xFFFFFFFF00000000);
    sum[0] += sum[1];
    return sum[0];

However, in the middle it switches to scalar. I kinda wish I had NEON's half registers…

_mull_u64_v2:                           ## @mull_u64_v2
## %bb.0:
        push    esi
        call    L1$pb
L1$pb:
        pop     eax
        movd    xmm0, dword ptr [esp + 16] ## xmm0 = mem[0],zero,zero,zero
        movd    xmm2, dword ptr [esp + 20] ## xmm2 = mem[0],zero,zero,zero
        punpcklqdq      xmm2, xmm0      ## xmm2 = xmm2[0],xmm0[0]
        movd    xmm0, dword ptr [esp + 8] ## xmm0 = mem[0],zero,zero,zero
        movd    xmm3, dword ptr [esp + 12] ## xmm3 = mem[0],zero,zero,zero
        movdqa  xmm1, xmm3
        punpcklqdq      xmm1, xmm0      ## xmm1 = xmm1[0],xmm0[0]
        punpcklqdq      xmm0, xmm3      ## xmm0 = xmm0[0],xmm3[0]
        pmuludq xmm1, xmm2
        pmuludq xmm0, xmm2
        pshufd  xmm2, xmm1, 229         ## xmm2 = xmm1[1,1,2,3]
        movd    edx, xmm2
        pshufd  xmm2, xmm0, 78          ## xmm2 = xmm0[2,3,0,1]
        movd    esi, xmm2
        xor     ecx, ecx
        add     esi, edx
        setb    cl
        movd    edx, xmm0
        add     edx, esi
        adc     ecx, 0
        movd    xmm2, ecx
        movd    xmm3, edx
        pshufd  xmm4, xmm1, 231         ## xmm4 = xmm1[3,1,2,3]
        pand    xmm1, xmmword ptr [eax + LCPI1_0-L1$pb]
        shufps  xmm3, xmm2, 65          ## xmm3 = xmm3[1,0],xmm2[0,1]
        psrlq   xmm0, 32
        paddq   xmm0, xmm1
        paddq   xmm0, xmm3
        movd    eax, xmm4
        pshufd  xmm1, xmm0, 78          ## xmm1 = xmm0[2,3,0,1]
        movd    ecx, xmm1
        pshufd  xmm1, xmm0, 231         ## xmm1 = xmm0[3,1,2,3]
        movd    esi, xmm1
        add     esi, eax
        pshufd  xmm1, xmm0, 229         ## xmm1 = xmm0[1,1,2,3]
        movd    edx, xmm1
        movd    eax, xmm0
        add     eax, ecx
        adc     edx, esi
        pop     esi
        ret

Cyan4973 commented 5 years ago

Yes, you are right @easyaspi314 , and this is a known issue. I plan to add a dedicate code to emulate that for 32-bit platforms (and non-gcc ones). There are already multiple implementation available, so I should be able to grab and plug one.

easyaspi314 commented 5 years ago

Actually, the scalar one doesn't look too bad.

GCC still hates the code and insists on using the stack (or an ugly partial vectorization for the first one), but Clang emits clean code for everyone. It emits the best code for ARMv7 though, but that is mainly because of the accumulate instructions.

umaal is a complete beast of an instruction added in ARMv6:

void umaal(uint32_t *RdLo, uint32_t *RdHi, uint32_t Rn, uint32_t Rm)
{
    uint64_t prod = (uint64_t) Rn * (uint64_t) Rm;
    prod += (uint64_t) *RdLo;
    prod += (uint64_t) *RdHi;
    *RdLo = prod & 0xFFFFFFFF;
    *RdHi = prod >> 32;
}

Here's a comparison of a few implementations I found online. https://gcc.godbolt.org/z/pWh9No

The second one might be better because GCC doesn't vectorize it and it isn't terrible on ARM (it only adds an extra instruction). On MSVC, we want to do that intrinsic instead of cast because MSVC is stupid and will try to do a full 64-bit multiply.

Cyan4973 commented 5 years ago

I've uploaded xxh3 branch with a new version of xxh3.h which contains an update for mul128 function on non-x64 platforms. I could check it compiles and run properly in 32-bits mode.

It also contains a dedicated path for ARM aarch, though I believe the existing gcc has higher priority and will likely produce the same result (haven't yet verified generated assembly). You might be able to understand this part better.

easyaspi314 commented 5 years ago

https://gcc.godbolt.org/z/PRtMJy

Boom. Money.

37-48 instructions on x86, 8-9 instructions on ARMv6+, and thanks to __attribute__((__target__("no-sse2"))), GCC doesn't partially vectorize it when tuning for Core 2.

I had to use inline assembly for ARM because I couldn't get Clang or GCC to figure out my request for umaal. Unfortunately, yeah, that mess of #if statements is kinda sucky.

        umull   r12, lr, r0, r2  @ {r12, lr} = (U64)r0 * (U64)r2
        mov     r5, #0           @ r5 = 0
        mov     r4, #0           @ r4 = 0
        umaal   r5, lr, r1, r2   @ {r5, lr} = ((U64)r1 * (U64)r2) + r5 + lr
        umaal   r5, r4, r0, r3   @ {r5, r4} = ((U64)r0 * (U64)r3) + r5 + r4
        umaal   lr, r4, r1, r3   @ {lr, r4} = ((U64)r1 * (U64)r3) + lr + r4
        adds    r0, lr, r12      @ <-.
                                 @    {r0, r1} = (U64){lr, r4} + (U64){r12, r5}
        adc     r1, r4, r5       @ <-'

I don't think I can optimize it more than that. That's only 8 instructions.

I added XXH_mult32to64 which expands to the __emulu intrinsic on MSVC which makes it so MSVC is far less likely to output a __allmul call (although to be fair it doesn't do it in this case).

I still think the x86 can be optimized more, although it might just be the fixed output registers that causes all that shuffling. It actually runs quite fast:

    uint32_t val32[4];
    uint64_t *val = val32;
    uint64_t sum = 0;
    srand(0);
    double start = (double)clock();
    for (int i = 0; i < 10000000; i++) {
        val32[0] = rand();
        val32[1] = rand();
        val32[2] = rand();
        val32[3] = rand();
        sum += XXH_mul128AndFold_32(val[0], val[1]);
    }
    double end = (double)clock();
    printf("%lld %lfs\n", sum, (end - start) / CLOCKS_PER_SEC);

7652620537862933594 0.454625s

For comparison, this is in 64-bit mode with __uint128_t:

7652620537862933594 0.336406

Only 35% slower considering how much more work it does.

Side note: Clang optimizes the __uint128_t version to this on aarch64, so that is probably best to use it. Fused multiply and add is always preferred.

        umulh   x8, x1, x0     // x8 = ((__uint128_t)x1 * x0) >> 64
        madd    x0, x1, x0, x8 // x0 = (x1 * x0) + x8;

Cyan4973 commented 5 years ago

Looks great !

easyaspi314 commented 5 years ago

It's definitely slower on ARM despite the significantly fewer instructions, but that is mainly due to the rand() call being pretty sophisticated on Bionic.

967456348838854209 5.572055s

Replace it with uninitialised data, force it in a register and use -fno-inline to keep it from cheating and it is very fast:

5764888493227865371 0.119198s

(without the rand calls on x86, it is 0.076690s my implementation vs 0.031167 native x86_64 which makes more sense. Still pretty fast though.)

Cyan4973 / xxHash

__uint128_t is not portable to 32-bit #170