namhyung / uftrace

Function graph tracer for C/C++/Rust/Python
https://uftrace.github.io/slide/
GNU General Public License v2.0
3.05k stars 473 forks source link

arm floating point argument failures #1088

Closed honggyukim closed 4 years ago

honggyukim commented 4 years ago

Here is a test program.

$ cat float.c
#include <stdio.h>

float float_add(float a, float b)
{
        fprintf(stderr, "a = %f, b = %f\n", a, b);
        return a + b;
}

int main(int argc, char *argv[])
{
        double a;

        a = float_add(-0.1, 0.2);
        return a > 0;
}

It shows floating point numbers in normal execution.

$ gcc -pg -g float.c

$ file a.out
a.out: ELF 32-bit LSB executable, ARM, EABI5 version 1 (SYSV), ...

$ ./a.out
a = -0.100000, b = 0.200000

It also shows correct numbers with simple uftrace record execution.

$ uftrace record --no-libcall --no-event a.out
a = -0.100000, b = 0.200000

$ uftrace record --no-libcall --no-event --debug-domain uftrace:1 a.out
uftrace: checking binary a.out
uftrace: removing uftrace.data.old directory
uftrace: creating 1 thread(s) for recording
uftrace: using /home/honggyu/usr/lib/libmcount.so library for tracing
a = -0.100000, b = 0.200000
uftrace: child terminated with exit code: 1
uftrace: reading uftrace.data/task.txt file
uftrace: flushing /uftrace-3a8039083d891c45-17880-000

However, the floating point numbers are somehow printed incorrect values if uftrace tries to record an argument.

$ uftrace record -A float_add@fparg1 --no-libcall --no-event a.out
a = 0.000000, b = 0.000000

$ uftrace replay
# DURATION     TID     FUNCTION
            [ 17914] | main() {
  47.760 us [ 17914] |   float_add(0.000000);
  54.740 us [ 17914] | } /* main */

The result is same with --auto-args.

$ uftrace record -a --no-libcall --no-event a.out
a = 0.000000, b = 0.000000
honggyukim commented 4 years ago

Here is another problem in return value recording.

$ uftrace -R float_add@retval/f32 --no-libcall --no-event a.out
a = -0.100000, b = 0.200000
# DURATION     TID     FUNCTION
            [  1009] | main() {
   1.247 ms [  1009] |   float_add() = -nan;
   1.268 ms [  1009] | } /* main */
honggyukim commented 4 years ago

It seems that memset@plt makes s0 and s1 cleared to zero.

 1│ Dump of assembler code for function save_argument:
 2│ /home/honggyu/work/uftrace/libmcount/record.c:
 3│ 503     {
 4│    0x76f8a518 <+0>:     push    {r4, r11, lr}
 5│    0x76f8a51c <+4>:     add     r11, sp, #8
 6│    0x76f8a520 <+8>:     sub     sp, sp, #84     ; 0x54
 7│    0x76f8a524 <+12>:    str     r0, [r11, #-80] ; 0xffffffb0
 8│    0x76f8a528 <+16>:    str     r1, [r11, #-84] ; 0xffffffac
 9│    0x76f8a52c <+20>:    str     r2, [r11, #-88] ; 0xffffffa8
10│    0x76f8a530 <+24>:    str     r3, [r11, #-92] ; 0xffffffa4
11│    0x76f8a534 <+28>:    sub     r3, r11, #12
12│    0x76f8a538 <+32>:    sub     r3, r3, #64     ; 0x40
13│    0x76f8a53c <+36>:    add     r3, r3, #15
14│    0x76f8a540 <+40>:    lsr     r3, r3, #4
15│    0x76f8a544 <+44>:    lsl     r4, r3, #4
16│
17│ 504             void *argbuf = get_argbuf(mtdp, rstack);
18│    0x76f8a548 <+48>:    ldr     r1, [r11, #-84] ; 0xffffffac
19│    0x76f8a54c <+52>:    ldr     r0, [r11, #-80] ; 0xffffffb0
20│    0x76f8a550 <+56>:    bl      0x76f89ac8 <get_argbuf>
21│    0x76f8a554 <+60>:    str     r0, [r11, #-16]
22│
23│ 505             unsigned size;
24│ 506             struct mcount_arg_context ctx = {
25│    0x76f8a558 <+64>:    mov     r0, r4
26│    0x76f8a55c <+68>:    mov     r3, #48 ; 0x30
27│    0x76f8a560 <+72>:    mov     r2, r3
28│    0x76f8a564 <+76>:    mov     r1, #0
29│    0x76f8a568 <+80>:    bl      0x76f7e384 <memset@plt>
30├──> 0x76f8a56c <+84>:    ldr     r3, [r11, #-92] ; 0xffffffa4
31│    0x76f8a570 <+88>:    str     r3, [r4]
32│
33│ 507                     .regs = regs,
34│ 508                     .stack_base = rstack->parent_loc,
35│    0x76f8a574 <+922>:   ldr     r3, [r11, #-84] ; 0xffffffac
36│    0x76f8a578 <+966>:   ldr     r3, [r3]
37│
38│ 506             struct mcount_arg_context ctx = {
39│    0x76f8a57c <+100>:   str     r3, [r4, #4]
40│
41│ 509                     .regions = &mtdp->mem_regions,
42│    0x76f8a580 <+104>:   ldr     r3, [r11, #-80] ; 0xffffffb0
43│    0x76f8a584 <+108>:   add     r3, r3, #4160   ; 0x1040
44│    0x76f8a588 <+112>:   add     r3, r3, #36     ; 0x24

The original source code in C doesn't have such memset call, but it seems to be inserted for struct initialization by compiler implicitly.

 499│ void save_argument(struct mcount_thread_data *mtdp,
 500│                    struct mcount_ret_stack *rstack,
 501│                    struct list_head *args_spec,
 502│                    struct mcount_regs *regs)
 503│ {
 504│         void *argbuf = get_argbuf(mtdp, rstack);
 505│         unsigned size;
 506├───────> struct mcount_arg_context ctx = {
 507│                 .regs = regs,
 508│                 .stack_base = rstack->parent_loc,
 509│                 .regions = &mtdp->mem_regions,
 510│                 .arch = &mtdp->arch,
 511│         };
 512│
 513│         size = save_to_argbuf(argbuf, args_spec, &ctx);
 514│         if (size == -1U) {
 515│                 pr_warn("argument data is too big\n");
 516│                 return;
 517│         }
 518│
 519│         *(unsigned *)argbuf = size;
 520│         rstack->flags |= MCOUNT_FL_ARGUMENT;
 521│ }
honggyukim commented 4 years ago

Here is the memset code in /usr/lib/arm-linux-gnueabihf/libarmmem-v7l.so.

The instruction in line 3 overwrites s0 and s1 registers with vdup.8 q0, r1 instruction.

 1│ Dump of assembler code for function memset:
 2│    0x76f690f8 <+0>:     mov     r3, r0
 3├──> 0x76f690fc <+4>:     vdup.8  q0, r1
 4│    0x76f69100 <+8>:     cmp     r2, #79 ; 0x4f
 5│    0x76f69104 <+12>:    vdup.8  q1, r1
 6│    0x76f69108 <+16>:    bcc     0x76f69190 <memset+152>
 7│    0x76f6910c <+20>:    ands    r12, r0, #15
 8│    0x76f69110 <+24>:    beq     0x76f69140 <memset+72>
 9│    0x76f69114 <+28>:    rsb     r12, r12, #16
10│    0x76f69118 <+32>:    sub     r2, r2, r12
11│    0x76f6911c <+36>:                    ; <UNDEFINED> instruction: 0xe6ffcf3c
12│    0x76f69120 <+40>:    tst     r0, #1
13│    0x76f69124 <+44>:    strbne  r1, [r3], #1
14│    0x76f69128 <+48>:    lsls    r12, r12, #2
15│    0x76f6912c <+52>:    strbcs  r1, [r3, #1]
16│    0x76f69130 <+56>:    strbcs  r1, [r3], #2
17│    0x76f69134 <+60>:    vstmiami        r3!, {s0}
18│    0x76f69138 <+64>:    lsls    r12, r12, #2
19│    0x76f6913c <+68>:    vstmiacs        r3!, {d0}
20│    0x76f69140 <+72>:    mov     r12, #64        ; 0x40
21│    0x76f69144 <+76>:    sub     r2, r2, #64     ; 0x40
22│    0x76f69148 <+80>:    add     r1, r3, #32
23│    0x76f6914c <+84>:    vst1.8  {d0-d3}, [r3 :128], r12
24│    0x76f69150 <+88>:    subs    r2, r2, #64     ; 0x40
25│    0x76f69154 <+92>:    vst1.8  {d0-d3}, [r1 :128], r12
26│    0x76f69158 <+96>:    bcs     0x76f6914c <memset+84>
27│    0x76f6915c <+100>:   vmov.32 r1, d0[0]
28│    0x76f69160 <+104>:   lsls    r2, r2, #27
29│    0x76f69164 <+108>:   bcc     0x76f6916c <memset+116>
30│    0x76f69168 <+112>:   vst1.8  {d0-d3}, [r3]!
31│    0x76f6916c <+116>:   bpl     0x76f69174 <memset+124>
32│    0x76f69170 <+120>:   vst1.8  {d0-d1}, [r3]!
33│    0x76f69174 <+124>:   lsls    r2, r2, #2
34│    0x76f69178 <+128>:   vstmiacs        r3!, {d0}
35│    0x76f6917c <+132>:   strmi   r1, [r3], #4
36│    0x76f69180 <+136>:   lsls    r2, r2, #2
37│    0x76f69184 <+140>:   strhcs  r1, [r3], #2
38│    0x76f69188 <+144>:   strbmi  r1, [r3]
39│    0x76f6918c <+148>:   bx      lr
40│    0x76f69190 <+152>:   tst     r3, #3
41│    0x76f69194 <+156>:   beq     0x76f691ac <memset+180>
42│    0x76f69198 <+160>:   subs    r2, r2, #1
43│    0x76f6919c <+164>:   bcc     0x76f6918c <memset+148>
44│    0x76f691a0 <+168>:   strb    r1, [r3], #1
45│    0x76f691a4 <+172>:   tst     r3, #3
46│    0x76f691a8 <+176>:   bne     0x76f69198 <memset+160>
47│    0x76f691ac <+180>:   cmp     r2, #32
48│    0x76f691b0 <+184>:   bcc     0x76f6915c <memset+100>
49│    0x76f691b4 <+188>:   vst1.8  {d0-d3}, [r3]!
50│    0x76f691b8 <+192>:   sub     r2, r2, #32
51│    0x76f691bc <+196>:   b       0x76f6915c <memset+100>
52│ End of assembler dump.
honggyukim commented 4 years ago

I manually changed the code as follows:

diff --git a/libmcount/record.c b/libmcount/record.c
index 4bf383c5..bdf4711b 100644
--- a/libmcount/record.c
+++ b/libmcount/record.c
@@ -503,12 +503,21 @@ void save_argument(struct mcount_thread_data *mtdp,
 {
        void *argbuf = get_argbuf(mtdp, rstack);
        unsigned size;
+#if 0
        struct mcount_arg_context ctx = {
                .regs = regs,
                .stack_base = rstack->parent_loc,
                .regions = &mtdp->mem_regions,
                .arch = &mtdp->arch,
        };
+#else
+       struct mcount_arg_context ctx;
+       mcount_memset1(&ctx, 0, sizeof(ctx));
+       ctx.regs = regs;
+       ctx.stack_base = rstack->parent_loc;
+       ctx.regions = &mtdp->mem_regions;
+       ctx.arch = &mtdp->arch;
+#endif

        size = save_to_argbuf(argbuf, args_spec, &ctx);
        if (size == -1U) {

At least it doesn't change the original value printed by the program.

$ uftrace record -A float_add@fparg1 --no-libcall --no-event a.out
a = -0.100000, b = 0.200000

However, it still doesn't record the argument properly.

$ uftrace replay
# DURATION     TID     FUNCTION
            [  5371] | main() {
  85.364 us [  5371] |   float_add(0.000000);
 104.792 us [  5371] | } /* main */

There might be somewhere else to be fixed together.

$ uftrace -R float_add@retval/f32 --no-libcall --no-event a.out
a = -0.100000, b = 0.200000
# DURATION     TID     FUNCTION
            [  5580] | main() {
 169.479 us [  5580] |   float_add() = -nan;
 201.145 us [  5580] | } /* main */
honggyukim commented 4 years ago

I think the right fix might be possible if there is a way to disable such implicit usage of memset call for designated initialization in struct, but I don't know if there is such a way to do it.

honggyukim commented 4 years ago

The return value can be fixed with the following change.

diff --git a/arch/arm/mcount-support.c b/arch/arm/mcount-support.c
index e609c2f9..4335e854 100644
--- a/arch/arm/mcount-support.c
+++ b/arch/arm/mcount-support.c
@@ -488,8 +488,10 @@ void mcount_arch_get_retval(struct mcount_arg_context *ctx,
        /* type of return value cannot be FLOAT, so check format instead */
 #ifdef HAVE_ARM_HARDFP
        if (spec->fmt == ARG_FMT_FLOAT && use_hard_float) {
-               /* d0 register (64 bit) was saved below the r0 */
-               memcpy(ctx->val.v, ctx->retval - 2, spec->size);
+               /* d0, d1 registers (64 bit) were saved below the r0 */
+               long *float_retval = ctx->retval - 4;
+
+               memcpy(ctx->val.v, float_retval, spec->size);
        }
        else
 #endif /* HAVE_ARM_HARDFP */

The following commit should have also modified it as above.

The return value is correct now.

$ uftrace -R float_add@retval/f32 --no-libcall --no-event a.out
a = -0.100000, b = 0.200000
# DURATION     TID     FUNCTION
            [ 16135] | main() {
  68.176 us [ 16135] |   float_add() = 0.100000;
 178.802 us [ 16135] | } /* main */
honggyukim commented 4 years ago

The summarized change is as follows:

diff --git a/arch/arm/mcount-support.c b/arch/arm/mcount-support.c
index e609c2f9..4335e854 100644
--- a/arch/arm/mcount-support.c
+++ b/arch/arm/mcount-support.c
@@ -488,8 +488,10 @@ void mcount_arch_get_retval(struct mcount_arg_context *ctx,
        /* type of return value cannot be FLOAT, so check format instead */
 #ifdef HAVE_ARM_HARDFP
        if (spec->fmt == ARG_FMT_FLOAT && use_hard_float) {
-               /* d0 register (64 bit) was saved below the r0 */
-               memcpy(ctx->val.v, ctx->retval - 2, spec->size);
+               /* d0, d1 registers (64 bit) were saved below the r0 */
+               long *float_retval = ctx->retval - 4;
+
+               memcpy(ctx->val.v, float_retval, spec->size);
        }
        else
 #endif /* HAVE_ARM_HARDFP */
diff --git a/libmcount/record.c b/libmcount/record.c
index 4bf383c5..b3ea9dcd 100644
--- a/libmcount/record.c
+++ b/libmcount/record.c
@@ -503,12 +503,13 @@ void save_argument(struct mcount_thread_data *mtdp,
 {
        void *argbuf = get_argbuf(mtdp, rstack);
        unsigned size;
-       struct mcount_arg_context ctx = {
-               .regs = regs,
-               .stack_base = rstack->parent_loc,
-               .regions = &mtdp->mem_regions,
-               .arch = &mtdp->arch,
-       };
+       struct mcount_arg_context ctx;
+
+       mcount_memset1(&ctx, 0, sizeof(ctx));
+       ctx.regs = regs;
+       ctx.stack_base = rstack->parent_loc;
+       ctx.regions = &mtdp->mem_regions;
+       ctx.arch = &mtdp->arch;

        size = save_to_argbuf(argbuf, args_spec, &ctx);
        if (size == -1U) {
@@ -526,11 +527,12 @@ void save_retval(struct mcount_thread_data *mtdp,
        struct list_head *args_spec = rstack->pargs;
        void *argbuf = get_argbuf(mtdp, rstack);
        unsigned size;
-       struct mcount_arg_context ctx = {
-               .retval = retval,
-               .regions = &mtdp->mem_regions,
-               .arch = &mtdp->arch,
-       };
+       struct mcount_arg_context ctx;
+
+       mcount_memset1(&ctx, 0, sizeof(ctx));
+       ctx.retval = retval;
+       ctx.regions = &mtdp->mem_regions;
+       ctx.arch = &mtdp->arch;

        size = save_to_argbuf(argbuf, args_spec, &ctx);
        if (size == -1U) {
honggyukim commented 4 years ago

However, the above change doesn't fix the following example.

$ cat float2.c
#include <stdio.h>
#include <math.h>

int main(int argc, char *argv[])
{
        float a = 1.0f;
        float e = expf(a);
        fprintf(stderr, "expf(%f) = %f\n", a, e);
        double one = log(e);
        fprintf(stderr, "log(%f) = %lf\n", e, one);

        return one - 1;
}
$ gcc -pg -fno-builtin float2.c -lm

$ ./a.out
expf(1.000000) = 2.718282
log(2.718282) = 1.000000

$ uftrace -A expf@fparg1/32 -R expf@retval/f32 -A log@fparg1/64 -R log@retval/f64 -N fprintf a.out
expf(1.000000) = 1.000000
log(1.000000) = -inf
# DURATION     TID     FUNCTION
            [ 16572] | main() {
 126.302 us [ 16572] |   expf(0.000000) = 1.000000;
  55.104 us [ 16572] |   log(0.000000) = -inf;
 323.749 us [ 16572] | } /* main */

It's slightly changed example based on t198_lib_arg_float.py.

honggyukim commented 4 years ago

To fix the above problem, more change is needed in plthook routine as well.

diff --git a/libmcount/plthook.c b/libmcount/plthook.c
index 2ce62cce..160a0c3c 100644
--- a/libmcount/plthook.c
+++ b/libmcount/plthook.c
@@ -732,9 +732,6 @@ static unsigned long __plthook_entry(unsigned long *ret_addr,
        struct sym *sym;
        struct mcount_thread_data *mtdp = NULL;
        struct mcount_ret_stack *rstack;
-       struct uftrace_trigger tr = {
-               .flags = 0,
-       };
        bool skip = false;
        bool recursion = true;
        enum filter_result filtered;
@@ -742,6 +739,10 @@ static unsigned long __plthook_entry(unsigned long *ret_addr,
        struct plthook_special_func *func;
        unsigned long special_flag = 0;
        unsigned long real_addr = 0;
+       struct uftrace_trigger tr;
+
+       mcount_memset1(&tr, 0, sizeof(tr));
+       tr.flags = 0;

        // if necessary, implement it by architecture.
        child_idx = mcount_arch_child_idx(child_idx);

Now, the result is correct.

$ ./a.out
expf(1.000000) = 2.718282
log(2.718282) = 1.000000

$ uftrace -A expf@fparg1/32 -R expf@retval/f32 -A log@fparg1/64 -R log@retval/f64 -N fprintf a.out
expf(1.000000) = 2.718282
log(2.718282) = 1.000000
# DURATION     TID     FUNCTION
            [ 17613] | main() {
  88.697 us [ 17613] |   expf(1.000000) = 2.718282;
 120.780 us [ 17613] |   log(2.718282) = 1.000000;
 337.082 us [ 17613] | } /* main */
honggyukim commented 4 years ago

The above change fixes floating point related tests in ARM 32-bit.

$ ./runtest.py float
Start 2 tests with 2 worker
Test case                 pg             finstrument-fu
------------------------: O0 O1 O2 O3 Os O0 O1 O2 O3 Os
083 arg_float           : OK OK OK OK OK SK SK SK SK SK
198 lib_arg_float       : OK OK OK OK OK SK SK SK SK SK

But it requires the following test fix as well.

diff --git a/tests/t083_arg_float.py b/tests/t083_arg_float.py
index e4f2de8b..02410436 100644
--- a/tests/t083_arg_float.py
+++ b/tests/t083_arg_float.py
@@ -31,7 +31,5 @@ class TestCase(TestBase):
             # argument count follows the size of type
             argopt = argopt.replace('float_mul@fparg1/64,fparg2/32',
                                     'float_mul@fparg1/64,fparg3/32')
-            argopt = argopt.replace('float_div@fparg1,fparg2',
-                                    'float_div@fparg1,fparg3')

         return '%s %s %s' % (TestBase.uftrace_cmd, argopt, 't-' + self.name)

@namhyung What do you think about this approach?

honggyukim commented 4 years ago

@namhyung Could you please take a look at this?

namhyung commented 4 years ago

Sorry for the late reply. Both changes are look good.

Since ffb69ce2 offset from *retval and d0 register chagned. And I think we can use mcount_memset4() instead because it's gauranteed that both of address and size will be multiple of 4.