Closed honggyukim closed 4 years ago
Here is another problem in return value recording.
$ uftrace -R float_add@retval/f32 --no-libcall --no-event a.out
a = -0.100000, b = 0.200000
# DURATION TID FUNCTION
[ 1009] | main() {
1.247 ms [ 1009] | float_add() = -nan;
1.268 ms [ 1009] | } /* main */
It seems that memset@plt
makes s0
and s1
cleared to zero.
1│ Dump of assembler code for function save_argument:
2│ /home/honggyu/work/uftrace/libmcount/record.c:
3│ 503 {
4│ 0x76f8a518 <+0>: push {r4, r11, lr}
5│ 0x76f8a51c <+4>: add r11, sp, #8
6│ 0x76f8a520 <+8>: sub sp, sp, #84 ; 0x54
7│ 0x76f8a524 <+12>: str r0, [r11, #-80] ; 0xffffffb0
8│ 0x76f8a528 <+16>: str r1, [r11, #-84] ; 0xffffffac
9│ 0x76f8a52c <+20>: str r2, [r11, #-88] ; 0xffffffa8
10│ 0x76f8a530 <+24>: str r3, [r11, #-92] ; 0xffffffa4
11│ 0x76f8a534 <+28>: sub r3, r11, #12
12│ 0x76f8a538 <+32>: sub r3, r3, #64 ; 0x40
13│ 0x76f8a53c <+36>: add r3, r3, #15
14│ 0x76f8a540 <+40>: lsr r3, r3, #4
15│ 0x76f8a544 <+44>: lsl r4, r3, #4
16│
17│ 504 void *argbuf = get_argbuf(mtdp, rstack);
18│ 0x76f8a548 <+48>: ldr r1, [r11, #-84] ; 0xffffffac
19│ 0x76f8a54c <+52>: ldr r0, [r11, #-80] ; 0xffffffb0
20│ 0x76f8a550 <+56>: bl 0x76f89ac8 <get_argbuf>
21│ 0x76f8a554 <+60>: str r0, [r11, #-16]
22│
23│ 505 unsigned size;
24│ 506 struct mcount_arg_context ctx = {
25│ 0x76f8a558 <+64>: mov r0, r4
26│ 0x76f8a55c <+68>: mov r3, #48 ; 0x30
27│ 0x76f8a560 <+72>: mov r2, r3
28│ 0x76f8a564 <+76>: mov r1, #0
29│ 0x76f8a568 <+80>: bl 0x76f7e384 <memset@plt>
30├──> 0x76f8a56c <+84>: ldr r3, [r11, #-92] ; 0xffffffa4
31│ 0x76f8a570 <+88>: str r3, [r4]
32│
33│ 507 .regs = regs,
34│ 508 .stack_base = rstack->parent_loc,
35│ 0x76f8a574 <+922>: ldr r3, [r11, #-84] ; 0xffffffac
36│ 0x76f8a578 <+966>: ldr r3, [r3]
37│
38│ 506 struct mcount_arg_context ctx = {
39│ 0x76f8a57c <+100>: str r3, [r4, #4]
40│
41│ 509 .regions = &mtdp->mem_regions,
42│ 0x76f8a580 <+104>: ldr r3, [r11, #-80] ; 0xffffffb0
43│ 0x76f8a584 <+108>: add r3, r3, #4160 ; 0x1040
44│ 0x76f8a588 <+112>: add r3, r3, #36 ; 0x24
The original source code in C doesn't have such memset
call, but it seems to be inserted for struct initialization by compiler implicitly.
499│ void save_argument(struct mcount_thread_data *mtdp,
500│ struct mcount_ret_stack *rstack,
501│ struct list_head *args_spec,
502│ struct mcount_regs *regs)
503│ {
504│ void *argbuf = get_argbuf(mtdp, rstack);
505│ unsigned size;
506├───────> struct mcount_arg_context ctx = {
507│ .regs = regs,
508│ .stack_base = rstack->parent_loc,
509│ .regions = &mtdp->mem_regions,
510│ .arch = &mtdp->arch,
511│ };
512│
513│ size = save_to_argbuf(argbuf, args_spec, &ctx);
514│ if (size == -1U) {
515│ pr_warn("argument data is too big\n");
516│ return;
517│ }
518│
519│ *(unsigned *)argbuf = size;
520│ rstack->flags |= MCOUNT_FL_ARGUMENT;
521│ }
Here is the memset
code in /usr/lib/arm-linux-gnueabihf/libarmmem-v7l.so
.
The instruction in line 3 overwrites s0
and s1
registers with vdup.8 q0, r1
instruction.
1│ Dump of assembler code for function memset:
2│ 0x76f690f8 <+0>: mov r3, r0
3├──> 0x76f690fc <+4>: vdup.8 q0, r1
4│ 0x76f69100 <+8>: cmp r2, #79 ; 0x4f
5│ 0x76f69104 <+12>: vdup.8 q1, r1
6│ 0x76f69108 <+16>: bcc 0x76f69190 <memset+152>
7│ 0x76f6910c <+20>: ands r12, r0, #15
8│ 0x76f69110 <+24>: beq 0x76f69140 <memset+72>
9│ 0x76f69114 <+28>: rsb r12, r12, #16
10│ 0x76f69118 <+32>: sub r2, r2, r12
11│ 0x76f6911c <+36>: ; <UNDEFINED> instruction: 0xe6ffcf3c
12│ 0x76f69120 <+40>: tst r0, #1
13│ 0x76f69124 <+44>: strbne r1, [r3], #1
14│ 0x76f69128 <+48>: lsls r12, r12, #2
15│ 0x76f6912c <+52>: strbcs r1, [r3, #1]
16│ 0x76f69130 <+56>: strbcs r1, [r3], #2
17│ 0x76f69134 <+60>: vstmiami r3!, {s0}
18│ 0x76f69138 <+64>: lsls r12, r12, #2
19│ 0x76f6913c <+68>: vstmiacs r3!, {d0}
20│ 0x76f69140 <+72>: mov r12, #64 ; 0x40
21│ 0x76f69144 <+76>: sub r2, r2, #64 ; 0x40
22│ 0x76f69148 <+80>: add r1, r3, #32
23│ 0x76f6914c <+84>: vst1.8 {d0-d3}, [r3 :128], r12
24│ 0x76f69150 <+88>: subs r2, r2, #64 ; 0x40
25│ 0x76f69154 <+92>: vst1.8 {d0-d3}, [r1 :128], r12
26│ 0x76f69158 <+96>: bcs 0x76f6914c <memset+84>
27│ 0x76f6915c <+100>: vmov.32 r1, d0[0]
28│ 0x76f69160 <+104>: lsls r2, r2, #27
29│ 0x76f69164 <+108>: bcc 0x76f6916c <memset+116>
30│ 0x76f69168 <+112>: vst1.8 {d0-d3}, [r3]!
31│ 0x76f6916c <+116>: bpl 0x76f69174 <memset+124>
32│ 0x76f69170 <+120>: vst1.8 {d0-d1}, [r3]!
33│ 0x76f69174 <+124>: lsls r2, r2, #2
34│ 0x76f69178 <+128>: vstmiacs r3!, {d0}
35│ 0x76f6917c <+132>: strmi r1, [r3], #4
36│ 0x76f69180 <+136>: lsls r2, r2, #2
37│ 0x76f69184 <+140>: strhcs r1, [r3], #2
38│ 0x76f69188 <+144>: strbmi r1, [r3]
39│ 0x76f6918c <+148>: bx lr
40│ 0x76f69190 <+152>: tst r3, #3
41│ 0x76f69194 <+156>: beq 0x76f691ac <memset+180>
42│ 0x76f69198 <+160>: subs r2, r2, #1
43│ 0x76f6919c <+164>: bcc 0x76f6918c <memset+148>
44│ 0x76f691a0 <+168>: strb r1, [r3], #1
45│ 0x76f691a4 <+172>: tst r3, #3
46│ 0x76f691a8 <+176>: bne 0x76f69198 <memset+160>
47│ 0x76f691ac <+180>: cmp r2, #32
48│ 0x76f691b0 <+184>: bcc 0x76f6915c <memset+100>
49│ 0x76f691b4 <+188>: vst1.8 {d0-d3}, [r3]!
50│ 0x76f691b8 <+192>: sub r2, r2, #32
51│ 0x76f691bc <+196>: b 0x76f6915c <memset+100>
52│ End of assembler dump.
I manually changed the code as follows:
diff --git a/libmcount/record.c b/libmcount/record.c
index 4bf383c5..bdf4711b 100644
--- a/libmcount/record.c
+++ b/libmcount/record.c
@@ -503,12 +503,21 @@ void save_argument(struct mcount_thread_data *mtdp,
{
void *argbuf = get_argbuf(mtdp, rstack);
unsigned size;
+#if 0
struct mcount_arg_context ctx = {
.regs = regs,
.stack_base = rstack->parent_loc,
.regions = &mtdp->mem_regions,
.arch = &mtdp->arch,
};
+#else
+ struct mcount_arg_context ctx;
+ mcount_memset1(&ctx, 0, sizeof(ctx));
+ ctx.regs = regs;
+ ctx.stack_base = rstack->parent_loc;
+ ctx.regions = &mtdp->mem_regions;
+ ctx.arch = &mtdp->arch;
+#endif
size = save_to_argbuf(argbuf, args_spec, &ctx);
if (size == -1U) {
At least it doesn't change the original value printed by the program.
$ uftrace record -A float_add@fparg1 --no-libcall --no-event a.out
a = -0.100000, b = 0.200000
However, it still doesn't record the argument properly.
$ uftrace replay
# DURATION TID FUNCTION
[ 5371] | main() {
85.364 us [ 5371] | float_add(0.000000);
104.792 us [ 5371] | } /* main */
There might be somewhere else to be fixed together.
$ uftrace -R float_add@retval/f32 --no-libcall --no-event a.out
a = -0.100000, b = 0.200000
# DURATION TID FUNCTION
[ 5580] | main() {
169.479 us [ 5580] | float_add() = -nan;
201.145 us [ 5580] | } /* main */
I think the right fix might be possible if there is a way to disable such implicit usage of memset
call for designated initialization in struct, but I don't know if there is such a way to do it.
The return value can be fixed with the following change.
diff --git a/arch/arm/mcount-support.c b/arch/arm/mcount-support.c
index e609c2f9..4335e854 100644
--- a/arch/arm/mcount-support.c
+++ b/arch/arm/mcount-support.c
@@ -488,8 +488,10 @@ void mcount_arch_get_retval(struct mcount_arg_context *ctx,
/* type of return value cannot be FLOAT, so check format instead */
#ifdef HAVE_ARM_HARDFP
if (spec->fmt == ARG_FMT_FLOAT && use_hard_float) {
- /* d0 register (64 bit) was saved below the r0 */
- memcpy(ctx->val.v, ctx->retval - 2, spec->size);
+ /* d0, d1 registers (64 bit) were saved below the r0 */
+ long *float_retval = ctx->retval - 4;
+
+ memcpy(ctx->val.v, float_retval, spec->size);
}
else
#endif /* HAVE_ARM_HARDFP */
The following commit should have also modified it as above.
The return value is correct now.
$ uftrace -R float_add@retval/f32 --no-libcall --no-event a.out
a = -0.100000, b = 0.200000
# DURATION TID FUNCTION
[ 16135] | main() {
68.176 us [ 16135] | float_add() = 0.100000;
178.802 us [ 16135] | } /* main */
The summarized change is as follows:
diff --git a/arch/arm/mcount-support.c b/arch/arm/mcount-support.c
index e609c2f9..4335e854 100644
--- a/arch/arm/mcount-support.c
+++ b/arch/arm/mcount-support.c
@@ -488,8 +488,10 @@ void mcount_arch_get_retval(struct mcount_arg_context *ctx,
/* type of return value cannot be FLOAT, so check format instead */
#ifdef HAVE_ARM_HARDFP
if (spec->fmt == ARG_FMT_FLOAT && use_hard_float) {
- /* d0 register (64 bit) was saved below the r0 */
- memcpy(ctx->val.v, ctx->retval - 2, spec->size);
+ /* d0, d1 registers (64 bit) were saved below the r0 */
+ long *float_retval = ctx->retval - 4;
+
+ memcpy(ctx->val.v, float_retval, spec->size);
}
else
#endif /* HAVE_ARM_HARDFP */
diff --git a/libmcount/record.c b/libmcount/record.c
index 4bf383c5..b3ea9dcd 100644
--- a/libmcount/record.c
+++ b/libmcount/record.c
@@ -503,12 +503,13 @@ void save_argument(struct mcount_thread_data *mtdp,
{
void *argbuf = get_argbuf(mtdp, rstack);
unsigned size;
- struct mcount_arg_context ctx = {
- .regs = regs,
- .stack_base = rstack->parent_loc,
- .regions = &mtdp->mem_regions,
- .arch = &mtdp->arch,
- };
+ struct mcount_arg_context ctx;
+
+ mcount_memset1(&ctx, 0, sizeof(ctx));
+ ctx.regs = regs;
+ ctx.stack_base = rstack->parent_loc;
+ ctx.regions = &mtdp->mem_regions;
+ ctx.arch = &mtdp->arch;
size = save_to_argbuf(argbuf, args_spec, &ctx);
if (size == -1U) {
@@ -526,11 +527,12 @@ void save_retval(struct mcount_thread_data *mtdp,
struct list_head *args_spec = rstack->pargs;
void *argbuf = get_argbuf(mtdp, rstack);
unsigned size;
- struct mcount_arg_context ctx = {
- .retval = retval,
- .regions = &mtdp->mem_regions,
- .arch = &mtdp->arch,
- };
+ struct mcount_arg_context ctx;
+
+ mcount_memset1(&ctx, 0, sizeof(ctx));
+ ctx.retval = retval;
+ ctx.regions = &mtdp->mem_regions;
+ ctx.arch = &mtdp->arch;
size = save_to_argbuf(argbuf, args_spec, &ctx);
if (size == -1U) {
However, the above change doesn't fix the following example.
$ cat float2.c
#include <stdio.h>
#include <math.h>
int main(int argc, char *argv[])
{
float a = 1.0f;
float e = expf(a);
fprintf(stderr, "expf(%f) = %f\n", a, e);
double one = log(e);
fprintf(stderr, "log(%f) = %lf\n", e, one);
return one - 1;
}
$ gcc -pg -fno-builtin float2.c -lm
$ ./a.out
expf(1.000000) = 2.718282
log(2.718282) = 1.000000
$ uftrace -A expf@fparg1/32 -R expf@retval/f32 -A log@fparg1/64 -R log@retval/f64 -N fprintf a.out
expf(1.000000) = 1.000000
log(1.000000) = -inf
# DURATION TID FUNCTION
[ 16572] | main() {
126.302 us [ 16572] | expf(0.000000) = 1.000000;
55.104 us [ 16572] | log(0.000000) = -inf;
323.749 us [ 16572] | } /* main */
It's slightly changed example based on t198_lib_arg_float.py
.
To fix the above problem, more change is needed in plthook routine as well.
diff --git a/libmcount/plthook.c b/libmcount/plthook.c
index 2ce62cce..160a0c3c 100644
--- a/libmcount/plthook.c
+++ b/libmcount/plthook.c
@@ -732,9 +732,6 @@ static unsigned long __plthook_entry(unsigned long *ret_addr,
struct sym *sym;
struct mcount_thread_data *mtdp = NULL;
struct mcount_ret_stack *rstack;
- struct uftrace_trigger tr = {
- .flags = 0,
- };
bool skip = false;
bool recursion = true;
enum filter_result filtered;
@@ -742,6 +739,10 @@ static unsigned long __plthook_entry(unsigned long *ret_addr,
struct plthook_special_func *func;
unsigned long special_flag = 0;
unsigned long real_addr = 0;
+ struct uftrace_trigger tr;
+
+ mcount_memset1(&tr, 0, sizeof(tr));
+ tr.flags = 0;
// if necessary, implement it by architecture.
child_idx = mcount_arch_child_idx(child_idx);
Now, the result is correct.
$ ./a.out
expf(1.000000) = 2.718282
log(2.718282) = 1.000000
$ uftrace -A expf@fparg1/32 -R expf@retval/f32 -A log@fparg1/64 -R log@retval/f64 -N fprintf a.out
expf(1.000000) = 2.718282
log(2.718282) = 1.000000
# DURATION TID FUNCTION
[ 17613] | main() {
88.697 us [ 17613] | expf(1.000000) = 2.718282;
120.780 us [ 17613] | log(2.718282) = 1.000000;
337.082 us [ 17613] | } /* main */
The above change fixes floating point related tests in ARM 32-bit.
$ ./runtest.py float
Start 2 tests with 2 worker
Test case pg finstrument-fu
------------------------: O0 O1 O2 O3 Os O0 O1 O2 O3 Os
083 arg_float : OK OK OK OK OK SK SK SK SK SK
198 lib_arg_float : OK OK OK OK OK SK SK SK SK SK
But it requires the following test fix as well.
diff --git a/tests/t083_arg_float.py b/tests/t083_arg_float.py
index e4f2de8b..02410436 100644
--- a/tests/t083_arg_float.py
+++ b/tests/t083_arg_float.py
@@ -31,7 +31,5 @@ class TestCase(TestBase):
# argument count follows the size of type
argopt = argopt.replace('float_mul@fparg1/64,fparg2/32',
'float_mul@fparg1/64,fparg3/32')
- argopt = argopt.replace('float_div@fparg1,fparg2',
- 'float_div@fparg1,fparg3')
return '%s %s %s' % (TestBase.uftrace_cmd, argopt, 't-' + self.name)
@namhyung What do you think about this approach?
@namhyung Could you please take a look at this?
Sorry for the late reply. Both changes are look good.
Since ffb69ce2 offset from *retval
and d0
register chagned. And I think we can use mcount_memset4()
instead because it's gauranteed that both of address and size will be multiple of 4.
Here is a test program.
It shows floating point numbers in normal execution.
It also shows correct numbers with simple uftrace record execution.
However, the floating point numbers are somehow printed incorrect values if uftrace tries to record an argument.
The result is same with
--auto-args
.