vrsqrte_u32, vrsqrteq_u32, results are a bit rough sometimes

beru commented 5 years ago

I've noticed that the results of vrsqrte_u32 and vrsqrteq_u32 are sometimes incorrect. Here's a workaround I've found. However, It's probablly slower than current implementation.

// taken from ARM (c) Architecture Reference Manual ARMv7-A and ARMv7-R edition
static
double recip_sqrt_estimate(double a)
{
    int q0, q1, s;
    double r;
    if (a < 0.5) /* range 0.25 <= a < 0.5 */
    {
        q0 = (int)(a * 512.0); /* a in units of 1/512 rounded down */
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0); /* reciprocal root r */
    }
    else /* range 0.5 <= a < 1.0 */
    {
        q1 = (int)(a * 256.0); /* a in units of 1/256 rounded down */
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0); /* reciprocal root r */
    }
    s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
    return (double)s / 256.0;
}

static
uint32_t UnsignedRSqrtEstimate(uint32_t operand)
{
    if ((operand >> 30) == 0) // Operands <= 0x3FFFFFFF produce 0xFFFFFFFF
        return 0xFFFFFFFF;
    else {
        // Generate double-precision value = operand * 2^(-32). This has zero sign bit, with:
        //     exponent = 1022 or 1021 = double-precision representation of 2^(-1) or 2^(-2)
        //     fraction taken from operand, excluding its most significant one or two bits.
        uint64_t dp_operand;
        if (operand & 0x80000000) {
            dp_operand = (0x3feLL << 52) | (((uint64_t)operand & 0x7FFFFFFF) << 21) | 0;
        }else {
            dp_operand = (0x3fdLL << 52) | (((uint64_t)operand & 0x3FFFFFFF) << 22) | 0;
        }
        union {
            double d;
            uint64_t u64;
        } u;
        u.u64 = dp_operand;
        u.d = recip_sqrt_estimate(u.d);
        uint32_t ret = 0x80000000 | ((u.u64 >> 21) & 0x7FFFFFFF);
        return ret;
    }
}

_NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //Input is  fixed point number!!!
    //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
    uint32x2_t res;
    __m128 tmp;
    float r, resf, coeff;
    int i,q0, s;
#if 0
    for (i =0; i<2; i++){
        if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
            res.m64_u32[i] = 0xffffffff;
        }else{
            resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
            coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
            q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
            r = ((float)q0 + 0.5) / coeff;
            tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
            _mm_store_ss(&r, tmp);
            s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
            r = (float)(s / 256.0);
            res.m64_u32[i] = r * (((uint32_t)1) << 31);
        }
    }
#else
    for (i =0; i<2; i++){
        res.m64_u32[i] = UnsignedRSqrtEstimate(a.m64_u32[i]);
    }
#endif
    return res;
}

_NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //Input is  fixed point number!!!
    //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
   _NEON2SSE_ALIGN_16 uint32_t  atmp[4], res[4];
   _NEON2SSE_ALIGN_16 static const uint32_t c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
   __m128 tmp;
   __m128i res128, mask, zero;
    float r, resf, coeff;
    int i,q0, s;
    _mm_store_si128((__m128i*)atmp, a);
    zero = _mm_setzero_si128();
#if 0
    for (i =0; i<4; i++){
        resf =  (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
        coeff = (float)((resf < 0.5)? 512.0 : 256.0); /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
        q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
        r = ((float)q0 + 0.5) / coeff;
        tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
        _mm_store_ss(&r, tmp);
        s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
        r = (float)s / 256.0;
        res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
    }
    res128 = _mm_load_si128((__m128i*)res);
    mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
    mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x3fffffff
    return _mm_or_si128(res128, mask);
#else
    for (i =0; i<4; i++){
        res[i] = UnsignedRSqrtEstimate(atmp[i]);
    }
    res128 = _mm_load_si128((__m128i*)res);
    return res128;
#endif
}

Zvictoria commented 5 years ago

Hi, beru. Thanks for reporting. Could you please specify the input data for which an incorrect results are seen? You will help me there a lot! I will try to fix my implementation because it is definitely faster. All tests I have pass unfortunately. Thanks in advance!

beru commented 5 years ago

Hi, Zvictoria.

These are the fraction of the input data.

0xfeffffff, 0xfcffffff, 0xfaffffff, 0xf8ffffff, 0xf6ffffff, 0xf4ffffff, 0xf2ffffff, 0xf1ffffff, 0xefffffff, 0xedffffff, 0xebffffff, 0xe9ffffff, 0xe8ffffff, 0xe6ffffff, 0xe4ffffff, 0xe3ffffff, 0xe1ffffff, 0xdfffffff, 0xdeffffff, 0xdcffffff, 0xdaffffff, 0xd9ffffff, 0xd7ffffff, 0xd6ffffff, 0xd4ffffff, 0xd3ffffff, 0xd1ffffff, 0xd0ffffff, 0xceffffff, 0xcdffffff, 0xcbffffff, 0xcaffffff, 0xc9ffffff, 0xc7ffffff, 0xc6ffffff, 0xc4ffffff, 0xc3ffffff, 0xc2ffffff, 0xc0ffffff, 0xbfffffff, 0xbeffffff, 0xbdffffff, 0xbbffffff, 0xbaffffff, 0xb9ffffff, 0xb8ffffff, 0xb6ffffff, 0xb5ffffff, 0xb4ffffff, 0xb3ffffff, 0xb2ffffff, 0xb0ffffff, 0xafffffff, 0xaeffffff, 0xadffffff, 0xacffffff, 0xabffffff, 0xaaffffff, 0xa9ffffff, 0xa8ffffff, 0xa6ffffff, 0xa5ffffff, 0xa4ffffff, 0xa3ffffff, 0xa2ffffff, 0xa1ffffff, 0xa0ffffff, 0x9fffffff, 0x9effffff, 0x9dffffff, 0x9cffffff, 0x9bffffff, 0x9affffff, 0x9affff7f, 0x99ffffff, 0x98ffffff, 0x97ffffff, 0x96ffffff, 0x95ffffff, 0x94ffffff, 0x93ffffff, 0x92ffffff, 0x91ffffff, 0x90ffffff, 0x8fffffff, 0x8effffff, 0x8dffffff, 0x8cffffff, 0x8bffffff, 0x8affffff, 0x89ffffff, 0x88ffffff, 0x87ffffff, 0x86ffffff, 0x85ffffff, 0x85ffff7f, 0x84ffffff, 0x83ffffff, 0x82ffffff, 0x82ffff7f, 0x81ffffff, 0x80ffffff, 0x7fffffff, 0x7f7fffff, 0x7effffff, 0x7e7fffff, 0x7d7fffff, 0x7cffffff, 0x7bffffff, 0x7b7fffff, 0x7affffff, 0x79ffffff, 0x797fffff, 0x78ffffff, 0x787fffff, 0x777fffff, 0x76ffffff, 0x767fffff, 0x757fffff, 0x74ffffff, 0x747fffff, 0x73ffffff, 0x737fffff, 0x727fffff, 0x71ffffff, 0x717fffff, 0x70ffffff, 0x707fffff, 0x6f7fffff, 0x6effffff, 0x6e7fffff, 0x6dffffff, 0x6d7fffff, 0x6cffffff, 0x6c7fffff, 0x6bffffff, 0x6b7fffff, 0x6a7fffff, 0x69ffffff, 0x697fffff, 0x68ffffff, 0x687fffff, 0x67ffffff, 0x677fffff, 0x66ffffff, 0x667fffff, 0x65ffffff, 0x657fffff, 0x64ffffff, 0x647fffff, 0x63ffffff, 0x637fffff, 0x62ffffff, 0x627fffff, 0x61ffffff, 0x617fffff, 0x60ffffff, 0x607fffff, 0x607fffbf, 0x5fffffff, 0x5f7fffff, 0x5effffff, 0x5e7fffff, 0x5dffffff, 0x5d7fffff, 0x5cffffff, 0x5c7fffff, 0x5bffffff, 0x5b7fffff, 0x5affffff, 0x5a7fffff, 0x59ffffff, 0x597fffff, 0x58ffffff, 0x587fffff, 0x57ffffff, 0x577fffff, 0x577fffbf, 0x56ffffff, 0x567fffff, 0x55ffffff, 0x557fffff, 0x54ffffff, 0x547fffff, 0x53ffffff, 0x537fffff, 0x52ffffff, 0x527fffff, 0x51ffffff, 0x517fffff, 0x50ffffff, 0x507fffff, 0x4fffffff, 0x4f7fffff, 0x4effffff, 0x4e7fffff, 0x4dffffff, 0x4dffffbf, 0x4d7fffff, 0x4cffffff, 0x4cffffbf, 0x4c7fffff, 0x4bffffff, 0x4b7fffff, 0x4affffff, 0x4a7fffff, 0x49ffffff, 0x497fffff, 0x48ffffff, 0x487fffff, 0x47ffffff, 0x47ffffbf, 0x477fffff, 0x46ffffff, 0x467fffff, 0x45ffffff, 0x457fffff, 0x44ffffff, 0x447fffff, 0x43ffffff, 0x437fffff, 0x42ffffff, 0x427fffff, 0x41ffffff, 0x417fffff, 0x40ffffff, 0x407fffff,

I used below program to print out the numbers.

#include <stdio.h>
#include <stdint.h>
#include <math.h>
#define NEON2SSE_DISABLE_PERFORMANCE_WARNING
#include "NEON_2_SSE.h"

// taken from ARM (c) Architecture Reference Manual ARMv7-A and ARMv7-R edition
static
double recip_sqrt_estimate(double a)
{
    int q0, q1, s;
    double r;
    if (a < 0.5) /* range 0.25 <= a < 0.5 */
    {
        q0 = (int)(a * 512.0); /* a in units of 1/512 rounded down */
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0); /* reciprocal root r */
    }
    else /* range 0.5 <= a < 1.0 */
    {
        q1 = (int)(a * 256.0); /* a in units of 1/256 rounded down */
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0); /* reciprocal root r */
    }
    s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
    return (double)s / 256.0;
}

static
uint32_t UnsignedRSqrtEstimate(uint32_t operand)
{
    if ((operand >> 30) == 0) // Operands <= 0x3FFFFFFF produce 0xFFFFFFFF
        return 0xFFFFFFFF;
    else {
        // Generate double-precision value = operand * 2^(-32). This has zero sign bit, with:
        //     exponent = 1022 or 1021 = double-precision representation of 2^(-1) or 2^(-2)
        //     fraction taken from operand, excluding its most significant one or two bits.
        uint64_t dp_operand;
        if (operand & 0x80000000) {
            dp_operand = (0x3feLL << 52) | (((uint64_t)operand & 0x7FFFFFFF) << 21) | 0;
        }else {
            dp_operand = (0x3fdLL << 52) | (((uint64_t)operand & 0x3FFFFFFF) << 22) | 0;
        }
        union {
            double d;
            uint64_t u64;
        } u;
        u.u64 = dp_operand;
        u.d = recip_sqrt_estimate(u.d);
        uint32_t ret = 0x80000000 | ((u.u64 >> 21) & 0x7FFFFFFF);
        return ret;
    }
}

uint32x2_t vrsqrte_u32_alt(uint32x2_t a)
{
    //Input is  fixed point number!!!
    //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
    uint32x2_t res;
    __m128 tmp;
    float r, resf, coeff;
    int i,q0, s;
#if 0
    for (i =0; i<2; i++){
        if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
            res.m64_u32[i] = 0xffffffff;
        }else{
            resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
            coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
            q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
            r = ((float)q0 + 0.5) / coeff;
            tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
            _mm_store_ss(&r, tmp);
            s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
            r = (float)(s / 256.0);
            res.m64_u32[i] = r * (((uint32_t)1) << 31);
        }
    }
#else
    for (i =0; i<2; i++){
        res.m64_u32[i] = UnsignedRSqrtEstimate(a.m64_u32[i]);
    }
#endif
    return res;
}

uint32_t rsqrte_panda_dog(uint32_t v)
{
    return vrsqrte_u32(vdup_n_u32(v)).m64_u32[0];
}

uint32_t rsqrte_panda(uint32_t v)
{
    return vrsqrte_u32_alt(vdup_n_u32(v)).m64_u32[0];
}

int main(int argc, char* argv[])
{
    uint32_t pa = 0;
    uint32_t pb = 0;
    printf("arg, panda_dog, panda\n");
    for (uint32_t i = 0xFFFFFFFF; i >= 0x3FFFFFFF; i-=32) {
        uint32_t a = rsqrte_panda_dog(i);
        uint32_t b = rsqrte_panda(i);
        if (a != b && (a != pa || b != pb)) {
            printf("0x%08x, 0x%08x, 0x%08x\n", i, a, b);
//            printf("0x%08x, ", i);
        }
        pa = a;
        pb = b;
    }

    return 0;
}

Zvictoria commented 5 years ago

@beru thanks for sharing, sorry for my impudence but could you please share the ARM produced (correct) results for our input data as well? I don't have ARM device currently at hand and it will speed up the process significantly.

beru commented 5 years ago

@Zvictoria Here are the results.

arm_vrsqrte_u32.log

win_vrsqrte_u32.log

and the program that printed out the numbers.

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <arm_neon.h>

uint32_t because_its_not_there(uint32_t v)
{
    return vget_lane_u32(vrsqrte_u32(vdup_n_u32(v)), 0);
}

int main() {
    printf("\r\n");
    for (uint32_t i = 0xFFFFFFFF; i >= 0x3FFFFFFF; i-=(1u<<24)) {
        uint32_t b = because_its_not_there(i);
        printf("%08lx %08lx\r\n", i, b);
    }
    return 0;
}

The used processor is Renesas Electronics's RZ/A1H which has an ARM Cortex-A9 core.

Zvictoria commented 5 years ago

Hi, sorry for some delay with this issue. Spent some time investigating it. And found out it is not a perfect world :) As a result of it please see my today's patch that is a compromises of the precision and performance. It makes results more consistent with ARM (4 times less differences :) but not the same still. Are you ok with it?

intel / ARM_NEON_2_x86_SSE

vrsqrte_u32, vrsqrteq_u32, results are a bit rough sometimes #28