Closed beru closed 4 years ago
Hi, beru. Thanks for reporting. Could you please specify the input data for which incorrect results are seen? That would help me a lot! I will try to fix my implementation because it is definitely faster. Unfortunately, all the tests I have pass. Thanks in advance!
Hi, Zvictoria.
Here is a fraction of the input data.
0xfeffffff, 0xfcffffff, 0xfaffffff, 0xf8ffffff, 0xf6ffffff, 0xf4ffffff, 0xf2ffffff, 0xf1ffffff, 0xefffffff, 0xedffffff, 0xebffffff, 0xe9ffffff, 0xe8ffffff, 0xe6ffffff, 0xe4ffffff, 0xe3ffffff, 0xe1ffffff, 0xdfffffff, 0xdeffffff, 0xdcffffff, 0xdaffffff, 0xd9ffffff, 0xd7ffffff, 0xd6ffffff, 0xd4ffffff, 0xd3ffffff, 0xd1ffffff, 0xd0ffffff, 0xceffffff, 0xcdffffff, 0xcbffffff, 0xcaffffff, 0xc9ffffff, 0xc7ffffff, 0xc6ffffff, 0xc4ffffff, 0xc3ffffff, 0xc2ffffff, 0xc0ffffff, 0xbfffffff, 0xbeffffff, 0xbdffffff, 0xbbffffff, 0xbaffffff, 0xb9ffffff, 0xb8ffffff, 0xb6ffffff, 0xb5ffffff, 0xb4ffffff, 0xb3ffffff, 0xb2ffffff, 0xb0ffffff, 0xafffffff, 0xaeffffff, 0xadffffff, 0xacffffff, 0xabffffff, 0xaaffffff, 0xa9ffffff, 0xa8ffffff, 0xa6ffffff, 0xa5ffffff, 0xa4ffffff, 0xa3ffffff, 0xa2ffffff, 0xa1ffffff, 0xa0ffffff, 0x9fffffff, 0x9effffff, 0x9dffffff, 0x9cffffff, 0x9bffffff, 0x9affffff, 0x9affff7f, 0x99ffffff, 0x98ffffff, 0x97ffffff, 0x96ffffff, 0x95ffffff, 0x94ffffff, 0x93ffffff, 0x92ffffff, 0x91ffffff, 0x90ffffff, 0x8fffffff, 0x8effffff, 0x8dffffff, 0x8cffffff, 0x8bffffff, 0x8affffff, 0x89ffffff, 0x88ffffff, 0x87ffffff, 0x86ffffff, 0x85ffffff, 0x85ffff7f, 0x84ffffff, 0x83ffffff, 0x82ffffff, 0x82ffff7f, 0x81ffffff, 0x80ffffff, 0x7fffffff, 0x7f7fffff, 0x7effffff, 0x7e7fffff, 0x7d7fffff, 0x7cffffff, 0x7bffffff, 0x7b7fffff, 0x7affffff, 0x79ffffff, 0x797fffff, 0x78ffffff, 0x787fffff, 0x777fffff, 0x76ffffff, 0x767fffff, 0x757fffff, 0x74ffffff, 0x747fffff, 0x73ffffff, 0x737fffff, 0x727fffff, 0x71ffffff, 0x717fffff, 0x70ffffff, 0x707fffff, 0x6f7fffff, 0x6effffff, 0x6e7fffff, 0x6dffffff, 0x6d7fffff, 0x6cffffff, 0x6c7fffff, 0x6bffffff, 0x6b7fffff, 0x6a7fffff, 0x69ffffff, 0x697fffff, 0x68ffffff, 0x687fffff, 0x67ffffff, 0x677fffff, 0x66ffffff, 0x667fffff, 0x65ffffff, 0x657fffff, 0x64ffffff, 0x647fffff, 0x63ffffff, 0x637fffff, 0x62ffffff, 0x627fffff, 0x61ffffff, 0x617fffff, 0x60ffffff, 0x607fffff, 0x607fffbf, 0x5fffffff, 0x5f7fffff, 0x5effffff, 0x5e7fffff, 0x5dffffff, 0x5d7fffff, 0x5cffffff, 
0x5c7fffff, 0x5bffffff, 0x5b7fffff, 0x5affffff, 0x5a7fffff, 0x59ffffff, 0x597fffff, 0x58ffffff, 0x587fffff, 0x57ffffff, 0x577fffff, 0x577fffbf, 0x56ffffff, 0x567fffff, 0x55ffffff, 0x557fffff, 0x54ffffff, 0x547fffff, 0x53ffffff, 0x537fffff, 0x52ffffff, 0x527fffff, 0x51ffffff, 0x517fffff, 0x50ffffff, 0x507fffff, 0x4fffffff, 0x4f7fffff, 0x4effffff, 0x4e7fffff, 0x4dffffff, 0x4dffffbf, 0x4d7fffff, 0x4cffffff, 0x4cffffbf, 0x4c7fffff, 0x4bffffff, 0x4b7fffff, 0x4affffff, 0x4a7fffff, 0x49ffffff, 0x497fffff, 0x48ffffff, 0x487fffff, 0x47ffffff, 0x47ffffbf, 0x477fffff, 0x46ffffff, 0x467fffff, 0x45ffffff, 0x457fffff, 0x44ffffff, 0x447fffff, 0x43ffffff, 0x437fffff, 0x42ffffff, 0x427fffff, 0x41ffffff, 0x417fffff, 0x40ffffff, 0x407fffff,
I used the program below to print out the numbers.
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#define NEON2SSE_DISABLE_PERFORMANCE_WARNING
#include "NEON_2_SSE.h"
// taken from ARM (c) Architecture Reference Manual ARMv7-A and ARMv7-R edition
static
double recip_sqrt_estimate(double a)
{
int q0, q1, s;
double r;
if (a < 0.5) /* range 0.25 <= a < 0.5 */
{
q0 = (int)(a * 512.0); /* a in units of 1/512 rounded down */
r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0); /* reciprocal root r */
}
else /* range 0.5 <= a < 1.0 */
{
q1 = (int)(a * 256.0); /* a in units of 1/256 rounded down */
r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0); /* reciprocal root r */
}
s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
return (double)s / 256.0;
}
/*
 * Unsigned fixed-point reciprocal square-root estimate.
 *
 * operand: 32-bit value interpreted as operand * 2^(-32).
 * returns: 0xFFFFFFFF when the two top bits of operand are clear; otherwise
 *          bit 31 set, followed by the top 31 fraction bits of the
 *          double-precision estimate from recip_sqrt_estimate().
 */
static uint32_t UnsignedRSqrtEstimate(uint32_t operand)
{
    /* Reinterprets the same 64 bits as either an IEEE double or raw bits. */
    union {
        double d;
        uint64_t u64;
    } bits;
    uint64_t exponent;
    uint64_t fraction;

    /* Operands <= 0x3FFFFFFF produce 0xFFFFFFFF. */
    if ((operand & 0xC0000000) == 0) {
        return 0xFFFFFFFF;
    }

    /*
     * Build the double-precision value operand * 2^(-32): zero sign bit,
     * biased exponent 1022 (2^-1) or 1021 (2^-2), and a fraction taken from
     * the operand with its most significant one or two bits dropped.
     */
    if (operand & 0x80000000) {
        exponent = 0x3fe;
        fraction = ((uint64_t)operand & 0x7FFFFFFF) << 21;
    } else {
        exponent = 0x3fd;
        fraction = ((uint64_t)operand & 0x3FFFFFFF) << 22;
    }
    bits.u64 = (exponent << 52) | fraction;

    bits.d = recip_sqrt_estimate(bits.d);

    /* Result: a leading one bit followed by the top 31 fraction bits. */
    return 0x80000000 | (uint32_t)((bits.u64 >> 21) & 0x7FFFFFFF);
}
/*
 * Alternative to vrsqrte_u32: computes the unsigned reciprocal square-root
 * estimate of each of the two 32-bit lanes by passing it through
 * UnsignedRSqrtEstimate(), which follows the double-precision
 * recip_sqrt_estimate procedure from the ARMv7 reference manual
 * (VRSQRTE instruction).
 *
 * a:       two-lane vector; each lane is a fixed-point number.
 * returns: two-lane vector holding the per-lane estimates.
 */
uint32x2_t vrsqrte_u32_alt(uint32x2_t a)
{
    //Input is fixed point number!!!
    uint32x2_t res;
    int i;

    for (i = 0; i < 2; i++) {
        res.m64_u32[i] = UnsignedRSqrtEstimate(a.m64_u32[i]);
    }
    return res;
}
uint32_t rsqrte_panda_dog(uint32_t v)
{
return vrsqrte_u32(vdup_n_u32(v)).m64_u32[0];
}
uint32_t rsqrte_panda(uint32_t v)
{
return vrsqrte_u32_alt(vdup_n_u32(v)).m64_u32[0];
}
/*
 * Walks the operand range from 0xFFFFFFFF down to 0x3FFFFFFF in steps of 32
 * and compares rsqrte_panda_dog() against rsqrte_panda(). A CSV row is printed
 * for every operand where the two disagree, unless both outputs are the same
 * as for the previous operand (so each run of identical mismatches is
 * reported once).
 */
int main(int argc, char* argv[])
{
    uint32_t prev_dog = 0;
    uint32_t prev_alt = 0;
    uint32_t arg = 0xFFFFFFFF;

    printf("arg, panda_dog, panda\n");

    while (arg >= 0x3FFFFFFF) {
        const uint32_t dog = rsqrte_panda_dog(arg);
        const uint32_t alt = rsqrte_panda(arg);
        const int same_as_previous = (dog == prev_dog) && (alt == prev_alt);

        if (dog != alt && !same_as_previous) {
            /* Print `arg` alone here to get a bare list of failing operands. */
            printf("0x%08x, 0x%08x, 0x%08x\n", arg, dog, alt);
        }

        prev_dog = dog;
        prev_alt = alt;
        arg -= 32;
    }
    return 0;
}
@beru thanks for sharing, sorry for my impudence but could you please share the ARM produced (correct) results for our input data as well? I don't have an ARM device at hand currently and it will speed up the process significantly.
@Zvictoria Here are the results.
and the program that printed out the numbers.
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <arm_neon.h>
uint32_t because_its_not_there(uint32_t v)
{
return vget_lane_u32(vrsqrte_u32(vdup_n_u32(v)), 0);
}
/*
 * Prints the hardware vrsqrte_u32 result for operands from 0xFFFFFFFF down to
 * 0x3FFFFFFF in steps of 2^24, one "operand result" pair per line in hex.
 */
int main(void) {
    printf("\r\n");
    for (uint32_t i = 0xFFFFFFFF; i >= 0x3FFFFFFF; i -= (1u << 24)) {
        uint32_t b = because_its_not_there(i);
        /*
         * %lx requires unsigned long; uint32_t is unsigned int on many
         * toolchains, so cast explicitly to keep the call well-defined.
         */
        printf("%08lx %08lx\r\n", (unsigned long)i, (unsigned long)b);
    }
    return 0;
}
The used processor is Renesas Electronics's RZ/A1H which has an ARM Cortex-A9 core.
Hi, sorry for the delay with this issue. I spent some time investigating it and found out it is not a perfect world :) As a result, please see my patch from today, which is a compromise between precision and performance. It makes the results more consistent with ARM (4 times fewer differences :) but still not identical. Are you ok with it?
I've noticed that the results of
vrsqrte_u32
and vrsqrteq_u32
are sometimes incorrect. Here's a workaround I've found. However, it's probably slower than the current implementation.