rookiepig / math-neon

Automatically exported from code.google.com/p/math-neon
1 stars 1 forks source link

Operands access #8

Open GoogleCodeExporter opened 8 years ago

GoogleCodeExporter commented 8 years ago
Architecture: Xilinx Zynq (ARM Cortex-A9)
Compiler: arm-xilinx-eabi-gcc (Sourcery CodeBench Lite 2012.09-105) 4.7.2
Arguments: -Wall -O0 -g3 -c -fmessage-length=0 
-I../../cpu0_bsp/ps7_cortexa9_0/include
3.

The following warnings appear:
math_sinf.c:123:1: warning: control reaches end of non-void function [-Wreturn-type]
math_sinf.c:111:1: warning: control reaches end of non-void function [-Wreturn-type]

Also, the function sinf_neon() does not return the correct value. The
following version, however, behaves correctly:

/*
 * Single-precision sine approximation implemented with ARM NEON inline
 * assembly (GCC extended asm).
 *
 * Outline, as established by the instruction sequence below:
 *   1. Range reduction: |x| is scaled by __sinf_rng[0], truncated to an
 *      unsigned integer n, and n * __sinf_rng[1] (plus one more multiple of
 *      __sinf_rng[1] when n is odd) is subtracted from |x|.
 *   2. Sign fix-up: bit 0 of n, bit 0 of n >> 1 and the (x < 0) mask are
 *      XOR-ed together, shifted into bit 31, and XOR-ed into the reduced
 *      argument to flip its sign.
 *   3. An odd polynomial p1*x + p3*x^3 + p5*x^5 + p7*x^7 is evaluated with
 *      the four coefficients loaded from __sinf_lut.
 *
 * x is passed to the asm in a core register (%3) and the result is moved
 * back out through a core register (%0), so the function does not depend on
 * which register the calling convention places the float argument in.
 *
 * NOTE(review): __sinf_rng and __sinf_lut are defined outside this block;
 * per the inline comments they are assumed to hold {invrange, range} and
 * {p7, p3, p5, p1} respectively -- confirm against their definitions.
 *
 * Requires a target with NEON; the asm touches d0-d7 and d16-d18.
 */
float sinf_neon_rms(float x)
{
    asm volatile (

        "vld1.32                d3, [%1]                                \n\t"   //d3 = {invrange, range}
        "vdup.f32               d0, %3                                  \n\t"   //d0 = {x, x}
        "vabs.f32               d1, d0                                  \n\t"   //d1 = {ax, ax}

        "vmul.f32               d2, d1, d3[0]                           \n\t"   //d2 = d1 * d3[0]
        "vcvt.u32.f32           d2, d2                                  \n\t"   //d2 = (int) d2
        "vmov.i32               d5, #1                                  \n\t"   //d5 = 1
        "vcvt.f32.u32           d4, d2                                  \n\t"   //d4 = (float) d2
        "vshr.u32               d7, d2, #1                              \n\t"   //d7 = d2 >> 1
        "vmls.f32               d1, d4, d3[1]                           \n\t"   //d1 = d1 - d4 * d3[1]

        "vand.i32               d5, d2, d5                              \n\t"   //d5 = d2 & d5
        "vclt.f32               d18, d0, #0                             \n\t"   //d18 = (d0 < 0.0)
        "vcvt.f32.u32           d6, d5                                  \n\t"   //d6 = (float) d5
        "vmls.f32               d1, d6, d3[1]                           \n\t"   //d1 = d1 - d6 * d3[1]
        "veor.i32               d5, d5, d7                              \n\t"   //d5 = d5 ^ d7
        "vmul.f32               d2, d1, d1                              \n\t"   //d2 = d1*d1 = {x^2, x^2}

        "vld1.32                {d16, d17}, [%2]                        \n\t"   //q8 = {p7, p3, p5, p1}
        "veor.i32               d5, d5, d18                             \n\t"   //d5 = d5 ^ d18
        "vshl.i32               d5, d5, #31                             \n\t"   //d5 = d5 << 31
        "veor.i32               d1, d1, d5                              \n\t"   //d1 = d1 ^ d5 (apply sign bit)

        "vmul.f32               d3, d2, d2                              \n\t"   //d3 = d2*d2 = {x^4, x^4}
        "vmul.f32               q0, q8, d1[0]                           \n\t"   //q0 = q8 * d1[0] = {p7x, p3x, p5x, p1x}
        "vmla.f32               d1, d0, d2[0]                           \n\t"   //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}
        "vmla.f32               d1, d3, d1[0]                           \n\t"   //d1 = d1 + d3*d1[0] = {...., p1x + p3x^3 + p5x^5 + p7x^7}

        "vmov.f32               %0, s3                                  \n\t"   //%0 = s3 (upper lane of d1 holds the result)
        : "=r"(x)
        : "r"(__sinf_rng), "r"(__sinf_lut), "r"(x)
        /* "memory": the asm loads through the %1 and %2 pointers, which the
         * operand list alone does not tell the compiler about. */
        : "q0", "q1", "q2", "q3", "q8", "q9", "memory"
        );

    return x;
}

Original issue reported on code.google.com by schwermi...@gmail.com on 27 Jun 2013 at 9:18