Fix vcvt_n functions to handle 32 fraction bits

When a conversion with 32 fraction bits is requested, the left shift by 32 bits overflows and the result does not match the output of real ARM hardware.

before:

./vcvt_test 
Source: -0.000000 -0.429497 ->
VCVT N to 1 bit: 0 0
VCVT N to 31 bit: 0 3372630080
VCVT N to 32 bit: 0 0
Source: 0.000000 0.000000 ->
VCVT N to 1 bit: 0 0
VCVT N to 31 bit: 0 0
VCVT N to 32 bit: 0 0
Source: 0.214748 0.429497 ->
VCVT N to 1 bit: 0 0
VCVT N to 31 bit: 461168608 922337216
VCVT N to 32 bit: 0 0

after the fix (identical to the output on real ARM hardware):

Source: -0.000000 -0.429497 ->
VCVT N to 1 bit: 0 0
VCVT N to 31 bit: 0 0
VCVT N to 32 bit: 0 0
Source: 0.000000 0.000000 ->
VCVT N to 1 bit: 0 0
VCVT N to 31 bit: 0 0
VCVT N to 32 bit: 0 0
Source: 0.214748 0.429497 ->
VCVT N to 1 bit: 0 0
VCVT N to 31 bit: 461168608 922337216
VCVT N to 32 bit: 922337216 1844674432

Test app

#if defined(__ARM_NEON)
#define ANDROID_ARM_NEON
#include <arm_neon.h>
#else
#include <NEON_2_SSE.h>
#endif

#include <stdio.h>

/*
 * Lane-access helpers that let the same test code work with both vector
 * representations used in this file:
 *   - with __ARM_NEON defined, vectors are initialized with a brace list and
 *     their lanes are read by subscripting;
 *   - otherwise, lanes are read and written through the m64_f32 / m64_u32
 *     members of the vector type.
 *
 * __float32x2_declare(var, v0, v1) declares a float32x2_t named `var` holding
 *     lanes v0 and v1. In the non-NEON variant it expands to several
 *     statements, so use it only as a standalone statement inside a block.
 * __uint32x2_p(v) / __float32x2_p(v) expand to the two lanes of `v` as a
 *     comma-separated list, meant to be pasted into a printf argument list
 *     (which is why the expansion as a whole cannot be parenthesized).
 *     `v` is expanded twice, so it is evaluated twice.
 *
 * Every value argument is parenthesized so that an expression argument cannot
 * bind incorrectly against the cast, subscript or member access.
 */
#if defined(__ARM_NEON)
#define __float32x2_declare(var, v0, v1) float32x2_t var = {(v0), (v1)}
#define __uint32x2_p(v) (uint32_t)(v)[0], (uint32_t)(v)[1]
#define __float32x2_p(v) (v)[0], (v)[1]
#else
#define __float32x2_declare(var, v0, v1) float32x2_t var; var.m64_f32[0] = (v0); var.m64_f32[1] = (v1)
#define __uint32x2_p(v) (v).m64_u32[0], (v).m64_u32[1]
#define __float32x2_p(v) (v).m64_f32[0], (v).m64_f32[1]
#endif

/*
 * Prints one test case: both inputs are divided by 1E10, packed into a
 * two-lane float vector, and then converted with vcvt_n_u32_f32 using
 * 1, 31 and 32 fraction bits. The source lanes and each pair of converted
 * lanes are written to stdout.
 */
static void testme(float v0, float v1)
{
    /* Double-precision divisor, so the division itself is done in double. */
    const double divisor = 1E10;

    __float32x2_declare(a, v0 / divisor, v1 / divisor);

    printf("Source: %f %f ->\n",
           __float32x2_p(a));

    /* Smallest fraction-bit count exercised by this test. */
    printf("VCVT N to 1 bit: %u %u\n",
           __uint32x2_p(vcvt_n_u32_f32(a, 1)));

    /* One below the 32-bit case, for comparison. */
    printf("VCVT N to 31 bit: %u %u\n",
           __uint32x2_p(vcvt_n_u32_f32(a, 31)));

    /* The 32 fraction-bit case this test was written to check. */
    printf("VCVT N to 32 bit: %u %u\n",
           __uint32x2_p(vcvt_n_u32_f32(a, 32)));
}

/*
 * Test driver: feeds three input pairs (negative, zero/small, and large
 * positive values) to testme(), which scales and converts them.
 *
 * Returns 0 unconditionally.
 */
int main(void)
{
    /*
     * Negative inputs. The magnitude 2^32 is built from an unsigned shift and
     * negated afterwards: left-shifting a negative signed value
     * (-1LL << 32) is undefined behavior in C.
     */
    testme(-1.0f, -(float)(1ULL << 32));

    /* Zero and a value that becomes tiny after scaling. */
    testme(0.0f, 1.0f);

    /* 2^31 and 2^32, both built from unsigned shifts. */
    testme((float)(1U << 31), (float)(1ULL << 32));

    return 0;
}

intel / ARM_NEON_2_x86_SSE

Fix vcvt_n functions to handle 32 fraction bits #50