chfast / intx

Extended precision integer C++ library
Apache License 2.0
129 stars 29 forks source link

Optimize comparison (less-than) with AVX2 #302

Open chfast opened 8 months ago

chfast commented 8 months ago

https://godbolt.org/z/xEcGzqKo9

/// Bit scan reverse: returns the zero-based index of the most significant set bit of `m`.
///
/// The index is derived as 31 minus the leading-zero count, so this assumes
/// a 32-bit `unsigned`. `m` must be non-zero: `__builtin_clz(0)` is undefined.
unsigned bsr(unsigned m)
{
    const int leading_zeros = __builtin_clz(m);
    return 31 - leading_zeros;
}

/// Less-than for two 256-bit values, using an AVX2 equality compare to find the deciding word.
///
/// Relies on `u256` exposing its 64-bit words as `w[0..3]` in the same order as
/// the 64-bit lanes of the bit-cast `__m256i`, with `w[3]` the most significant
/// word (NOTE(review): `u256` is declared elsewhere — confirm layout and word type).
///
/// @param x  Left-hand operand.
/// @param y  Right-hand operand.
/// @return   Result of `x.w[i] < y.w[i]` for the highest-indexed word `i` in which
///           the operands differ; when all words are equal, word 0 is compared,
///           which yields `false`.
auto lt_avx(const u256& x, const u256& y)
{
    // Per-lane equality: each 64-bit lane becomes all-ones where the words match.
    const auto eq_lanes =
        _mm256_cmpeq_epi64(std::bit_cast<__m256i>(x), std::bit_cast<__m256i>(y));

    // Collapse the lane sign bits into a 4-bit mask: bit k set <=> word k is equal.
    const unsigned eq_mask = _mm256_movemask_pd(std::bit_cast<__m256d>(eq_lanes));

    // Invert the low 4 bits so set bits mark the differing words, then force
    // bit 0 on. That keeps bsr() away from a zero argument and makes the
    // all-equal case fall through to comparing word 0.
    const auto diff_mask = (eq_mask ^ 0xf) | 1;

    // The highest differing word decides the ordering.
    const auto top = bsr(diff_mask);
    return x.w[top] < y.w[top];
}
chfast commented 8 months ago

New idea: https://godbolt.org/z/Ge7TanY3M

/// Less-than for two unsigned 256-bit values using byte-wise AVX2 comparisons.
///
/// Every byte pair is compared in both directions and the per-byte results are
/// packed into two 32-bit masks (bit k <=> byte k). The most significant byte
/// in which the operands differ sets its bit in exactly one of the masks, and
/// all higher bits are clear in both, so an unsigned comparison of the masks
/// gives the ordering of the whole values. Equal operands produce two zero
/// masks and therefore `false`.
///
/// Assumes byte 31 of the bit-cast vector is the most significant byte of
/// `u256` (NOTE(review): `u256` is declared elsewhere — confirm it is a
/// little-endian, 32-byte, unsigned value).
///
/// @param x  Left-hand operand.
/// @param y  Right-hand operand.
/// @return   `true` when `x < y`, `false` otherwise.
auto lt_avx_v2_8(const u256& x, const u256& y)
{
    // AVX2 only provides a *signed* byte compare (_mm256_cmpgt_epi8). Flipping
    // the top bit of every byte maps the unsigned range [0, 255] onto the
    // signed range [-128, 127] while preserving order; without this, bytes
    // >= 0x80 would be treated as negative and compare as smaller than 0x00..0x7f.
    const auto sign_bias = _mm256_set1_epi8(static_cast<char>(-128));
    const auto xv = _mm256_xor_si256(std::bit_cast<__m256i>(x), sign_bias);
    const auto yv = _mm256_xor_si256(std::bit_cast<__m256i>(y), sign_bias);

    // Per-byte "x > y" and "x < y" as all-ones / all-zeros bytes.
    const auto gtv = _mm256_cmpgt_epi8(xv, yv);
    const auto ltv = _mm256_cmpgt_epi8(yv, xv);

    // One bit per byte; stored as unsigned so that bit 31 (the top byte)
    // takes part in the comparison below as a magnitude bit, not a sign bit.
    const unsigned gt = _mm256_movemask_epi8(gtv);
    const unsigned lt = _mm256_movemask_epi8(ltv);

    return lt > gt;
}