Open honggyukim opened 5 years ago
Here is the debug output.
root@raspberrypi3:~# uftrace record --debug-domain mcount:3 a.out
mcount: initializing mcount library
mcount: mcount setup done
mcount: new session started: 943a81e175472a8f: a.out
mcount: preparing shmem buffers: tid = 7316
mcount: <0> enter 10c03
mcount: tr->flags: 0, filter mode, count: [0] 0/0
mcount: <1> enter 10bcd
...
mcount: <14> exit 10808
mcount: task 7316 recorded 32 bytes (record count = 2)
mcount: rstack[13] ENTRY 10808
mcount: rstack[13] EXIT 10808
mcount: <13> enter 10844
mcount: tr->flags: 0, filter mode, count: [0] 0/0
mcount: <14> exit 10844
mcount: task 7316 recorded 32 bytes (record count = 2)
mcount: rstack[13] ENTRY 10844
mcount: rstack[13] EXIT 10844
mcount: <13> enter 10814
mcount: tr->flags: 0, filter mode, count: [0] 0/0
#1 0x0000000076f2c2a4 sp=0x000000007ea13768 __gnu_mcount_nc + 0x34
mcount: <14> exit 10814
mcount: task 7316 recorded 32 bytes (record count = 2)
mcount: rstack[13] ENTRY 10814
mcount: rstack[13] EXIT 10814
mcount: <13> enter 10820
mcount: tr->flags: 0, filter mode, count: [0] 0/0
child terminated by signal: 11: Segmentation fault
After applying recover
trigger, it goes one more step, but also crashes. It needs to find where to apply recover
trigger more.
root@raspberrypi3:~# uftrace record -T _ULarm_step@recover ./backtrace
#1 0x0000000000010bd9 sp=0x000000007efab658 fac(int) + 0x18
#2 0x0000000076fbc2a4 sp=0x000000007efab668 __gnu_mcount_nc + 0x34
child terminated by signal: 11: Segmentation fault
root@raspberrypi3:~# uftrace replay
# DURATION TID FUNCTION
[ 9054] | main() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | fac() {
[ 9054] | backtrace() {
288.905 us [ 9054] | _ULarm_init_local();
484.998 us [ 9054] | _ULarm_step();
3.542 us [ 9054] | _ULarm_get_reg();
0.521 us [ 9054] | _ULarm_get_reg();
3.958 us [ 9054] | memset();
331.092 us [ 9054] | _ULarm_get_proc_name();
87.968 us [ 9054] | __cxa_demangle();
[ 9054] | printf() {
304.270 us [ 9054] | /* linux:schedule */
76.770 us [ 9054] | /* linux:schedule */
488.071 us [ 9054] | } /* printf */
5.677 us [ 9054] | free();
0.625 us [ 9054] | _ULarm_get_reg();
0.364 us [ 9054] | _ULarm_get_reg();
0.521 us [ 9054] | memset();
580.467 us [ 9054] | _ULarm_get_proc_name();
12.916 us [ 9054] | __cxa_demangle();
[ 9054] | printf() {
139.375 us [ 9054] | /* linux:schedule */
68.854 us [ 9054] | /* linux:schedule */
270.780 us [ 9054] | } /* printf */
75.208 us [ 9054] | /* linux:schedule */
323.957 us [ 9054] | /* linux:schedule */
8.463 ms [ 9054] | /* linux:schedule */
[ 9054] | /* linux:task-exit */
uftrace stopped tracing with remaining functions
================================================
...
In #721, the first prebuilt binary trace backtrace log was as follows:
task: 3632
[10] _Uarm_step
[9] blink::ThreadHeap::enterSafePoint
[8] blink::ThreadState::enterSafePoint
[7] blink::ThreadState::addInterruptor
[6] blink::Platform::initialize
[5] blink::initialize
[4] content::RenderThreadImpl::InitializeWebKit
[3] content::RenderThreadImpl::Init
[2] content::RenderThreadImpl::RenderThreadImpl
[1] content::RenderThreadImpl::Create
[0] content::ContentMain
It also shows _Uarm_step
and it seems to be related to fail when trying to get some backtrace.
void ThreadHeap::enterSafePoint(ThreadState* threadState)
{
m_safePointBarrier->enterSafePoint(threadState);
}
void SafePointBarrier::enterSafePoint(ThreadState* state)
{
ASSERT(!state->sweepForbidden());
pushAllRegisters(this, state, enterSafePointAfterPushRegisters);
}
$ cat third_party/WebKit/Source/platform/heap/asm/SaveRegisters_arm.S
/*
* typedef void (*PushAllRegistersCallback)(SafePointBarrier*, ThreadState*, intptr_t*);
* extern "C" void pushAllRegisters(SafePointBarrier*, ThreadState*,
* PushAllRegistersCallback)
*/
.type pushAllRegisters, %function
.global pushAllRegisters
.hidden pushAllRegisters
#ifdef __thumb__
/* In THUMB Mode jump to ARM stub via bx to ensure CPU mode switch.
* FIXME: This trampoline is provided to workaround bugs in
* the THUMB/ARM interworking that appear in the component build.
* When these issues are resolved this stub can be removed.
*/
.align 2
.code 16
.thumb_func
pushAllRegisters:
adr r3, pushAllRegistersARM
bx r3
.type pushAllRegistersARM, %function
.hidden pushAllRegistersARM
.align 4
.code 32
pushAllRegistersARM:
#else
/* ARM Mode */
.align 4
.code 32
pushAllRegisters:
#endif
/* Push all callee-saved registers and save return address. */
push {r4-r11, lr}
/* Pass the two first arguments unchanged (r0, r1)
* and pass the stack pointer after pushing callee-saved
* registers to the callback function.
*/
mov r3, r2
mov r2, sp . /* stackEnd */
blx r3
/* Discard all the registers, and pop lr into pc which returns
* and switches mode if needed.
*/
add sp, sp, #32
pop {pc}
class SafePointBarrier final {
...
static void enterSafePointAfterPushRegisters(SafePointBarrier* barrier,
ThreadState* state,
intptr_t* stackEnd) // r2: stackEnd
{
barrier->doEnterSafePoint(state, stackEnd);
}
...
};
void SafePointBarrier::doEnterSafePoint(ThreadState* state, intptr_t* stackEnd)
{
state->recordStackEnd(stackEnd);
state->copyStackUntilSafePointScope();
if (!atomicDecrement(&m_unparkedThreadCount)) {
MutexLocker locker(m_mutex);
m_parked.signal(); // Safe point reached.
}
}
Hi @namhyung, do you think it's tricky to apply auto recover to ARM?
After simply applying the following change, backtrace example works fine in ARM.
diff --git a/arch/arm/mcount-arch.h b/arch/arm/mcount-arch.h
index 3f48a89a..7269d962 100644
--- a/arch/arm/mcount-arch.h
+++ b/arch/arm/mcount-arch.h
@@ -32,4 +32,7 @@ unsigned long * mcount_arch_parent_location(struct symtabs *symtabs,
#define ARCH_PLT0_SIZE 20
#define ARCH_PLTHOOK_ADDR_OFFSET 0
+#define ARCH_SUPPORT_AUTO_RECOVER 1
+#define ARCH_CAN_RESTORE_PLTHOOK 1
+
#endif /* MCOUNT_ARCH_H */
root@raspberrypi3:~# uftrace record ./backtrace
#1 0x0000000000010bd9 sp=0x000000007e899678 fac(int) + 0x18
#2 0x0000000000010be7 sp=0x000000007e899688 fac(int) + 0x26
#3 0x0000000000010be7 sp=0x000000007e899698 fac(int) + 0x26
#4 0x0000000000010be7 sp=0x000000007e8996a8 fac(int) + 0x26
#5 0x0000000000010be7 sp=0x000000007e8996b8 fac(int) + 0x26
#6 0x0000000000010be7 sp=0x000000007e8996c8 fac(int) + 0x26
#7 0x0000000000010be7 sp=0x000000007e8996d8 fac(int) + 0x26
#8 0x0000000000010be7 sp=0x000000007e8996e8 fac(int) + 0x26
#9 0x0000000000010be7 sp=0x000000007e8996f8 fac(int) + 0x26
#10 0x0000000000010be7 sp=0x000000007e899708 fac(int) + 0x26
#11 0x0000000000010be7 sp=0x000000007e899718 fac(int) + 0x26
#12 0x0000000000010c09 sp=0x000000007e899728 main + 0x10
#13 0x0000000076cdf680 sp=0x000000007e899730 __libc_start_main + 0x118
#14 0x00000000000108cd sp=0x000000007e899878 _start + 0x34
root@raspberrypi3:~# uftrace replay
# DURATION TID FUNCTION
[ 9951] | main() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | fac() {
[ 9951] | backtrace() {
445.050 us [ 9951] | _ULarm_init_local();
835.829 us [ 9951] | _ULarm_step();
5.781 us [ 9951] | _ULarm_get_reg();
1.042 us [ 9951] | _ULarm_get_reg();
8.489 us [ 9951] | memset();
594.997 us [ 9951] | _ULarm_get_proc_name();
70.937 us [ 9951] | __cxa_demangle();
803.851 us [ 9951] | printf();
11.250 us [ 9951] | free();
36.770 us [ 9951] | _ULarm_step();
0.885 us [ 9951] | _ULarm_get_reg();
0.833 us [ 9951] | _ULarm_get_reg();
1.093 us [ 9951] | memset();
664.007 us [ 9951] | _ULarm_get_proc_name();
10.624 us [ 9951] | __cxa_demangle();
497.394 us [ 9951] | printf();
...
811.402 us [ 9951] | _ULarm_get_proc_name();
2.865 us [ 9951] | __cxa_demangle();
437.706 us [ 9951] | printf();
426.196 us [ 9951] | _ULarm_step();
1.094 us [ 9951] | _ULarm_get_reg();
0.989 us [ 9951] | _ULarm_get_reg();
1.771 us [ 9951] | memset();
667.914 us [ 9951] | _ULarm_get_proc_name();
3.125 us [ 9951] | __cxa_demangle();
528.748 us [ 9951] | printf();
32.344 us [ 9951] | _ULarm_step();
18.304 ms [ 9951] | } /* backtrace */
18.326 ms [ 9951] | } /* fac */
18.329 ms [ 9951] | } /* fac */
18.331 ms [ 9951] | } /* fac */
18.333 ms [ 9951] | } /* fac */
18.335 ms [ 9951] | } /* fac */
18.337 ms [ 9951] | } /* fac */
18.339 ms [ 9951] | } /* fac */
18.341 ms [ 9951] | } /* fac */
18.343 ms [ 9951] | } /* fac */
18.346 ms [ 9951] | } /* fac */
18.348 ms [ 9951] | } /* fac */
18.353 ms [ 9951] | } /* main */
However, it gets crashed when tracing other bigger programs.
@namhyung I don't know if you agree to put critical
tag to here but it's not possible to apply uftrace to chrome without fixing or avoiding this problem. The solution could be an auto recover or just a workaround. I have to find a way to avoid this to move forward.
Here is a related commit.
commit 0ba73d0b324d82946095d898c9fadb977eeb7853 Author: Namhyung Kim namhyung@gmail.com CommitDate: Wed Jan 16 19:54:44 2019 +0900
mcount: Add ARCH_CAN_RESTORE_PLTHOOK logic
Some arch like ARM/AArch64 call PLT functions before setting return
address in the stack as they have the linker register to save it. So
plt_hooker() sees an intermediate address it sets up during the hooking
and the address cannot be used to restore real return address after
plt_hooker() returns.
That means it cannot restore all return stacks (rstack) from the finish
trigger and needs to keep up the rstack for PLT functions to be returned
to the real callers after plthook_exit().
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
diff --git a/libmcount/misc.c b/libmcount/misc.c
--- a/libmcount/misc.c
+++ b/libmcount/misc.c
@@ -130,16 +141,21 @@
/* restore saved original return address */
void mcount_rstack_restore(struct mcount_thread_data *mtdp)
{
int idx;
+ struct mcount_ret_stack *rstack;
/* reverse order due to tail calls */
for (idx = mtdp->idx - 1; idx >= 0; idx--) {
- unsigned long parent_ip = mtdp->rstack[idx].parent_ip;
+ rstack = &mtdp->rstack[idx];
- if (parent_ip == (unsigned long)mcount_return ||
- parent_ip == (unsigned long)plthook_return)
+ if (rstack->parent_ip == (unsigned long)mcount_return ||
+ rstack->parent_ip == (unsigned long)plthook_return)
+ continue;
+
+ if (!ARCH_CAN_RESTORE_PLTHOOK &&
+ rstack->dyn_idx != MCOUNT_INVALID_DYNIDX)
continue;
- *mtdp->rstack[idx].parent_loc = parent_ip;
+ *rstack->parent_loc = rstack->parent_ip;
}
}
@namhyung According to the description above, we cannot restore return address only for PLT functions.
Is it possible to use auto recover when --no-libcall
is given in ARM/AArch64?
In addition to this, it seems that we can get the return address for known problematic functions such as backtrace
, and exception handling functions.
Is it possible to handle them in a limited way after getting the return address? It looks as follows:
diff --git a/libmcount/wrap.c b/libmcount/wrap.c
index 291c7a66..3ef62a44 100644
--- a/libmcount/wrap.c
+++ b/libmcount/wrap.c
@@ -260,6 +260,10 @@ __visible_default int backtrace(void **buffer, int sz)
{
int ret;
struct mcount_thread_data *mtdp;
+#if defined(__arm__) || defined(__thumb__)
+ register unsigned long *lr asm("lr");
+ /* "lr" register is to be used as a return address */
+#endif
if (unlikely(real_backtrace == NULL))
mcount_hook_functions();
Due to the link register in ARM it's a diiferent situation to x86 which always use stack for saving return address. I need to think about what we can do safely..
It seems that the case you're worrying about is only for library function calls because ARM pushes lr
register to stack before calling general mcount functions. Is it right?
I also worry about the mcount case but not sure.. Currently I don't have a box to test unfortunately.
It doesn't crash on AArch64 machine but it didn't work as expected.
$ file a.out
a.out: ELF 64-bit LSB shared object, ARM aarch64, version 1 (SYSV), dynamically linked, interpreter /lib/ld-linux-aarch64.so.1, for GNU/Linux 3.7.0, BuildID[sha1]=a09e61748782844a670e10df2d2497cbf7f8027c, with debug_info, not stripped
$ ./a.out
#1 0x0000aaaad098ddc0 sp=0x0000ffffd6c2aca0 fac(int) + 0x28
#2 0x0000aaaad098ddd4 sp=0x0000ffffd6c2acc0 fac(int) + 0x3c
#3 0x0000aaaad098ddd4 sp=0x0000ffffd6c2ace0 fac(int) + 0x3c
#4 0x0000aaaad098ddd4 sp=0x0000ffffd6c2ad00 fac(int) + 0x3c
#5 0x0000aaaad098ddd4 sp=0x0000ffffd6c2ad20 fac(int) + 0x3c
#6 0x0000aaaad098ddd4 sp=0x0000ffffd6c2ad40 fac(int) + 0x3c
#7 0x0000aaaad098ddd4 sp=0x0000ffffd6c2ad60 fac(int) + 0x3c
#8 0x0000aaaad098ddd4 sp=0x0000ffffd6c2ad80 fac(int) + 0x3c
#9 0x0000aaaad098ddd4 sp=0x0000ffffd6c2ada0 fac(int) + 0x3c
#10 0x0000aaaad098ddd4 sp=0x0000ffffd6c2adc0 fac(int) + 0x3c
#11 0x0000aaaad098ddd4 sp=0x0000ffffd6c2ade0 fac(int) + 0x3c
#12 0x0000aaaad098de00 sp=0x0000ffffd6c2ae00 main + 0x18
#13 0x0000ffff8a412d24 sp=0x0000ffffd6c2ae10 __libc_start_main + 0xe4
#14 0x0000aaaad098da74 sp=0x0000ffffd6c2af50 _start + 0x34
$ uftrace record a.out
#1 0x0000ffff805a3784 sp=0x0000ffffc548e6c0 _mcount + 0x44
$ uftrace replay
# DURATION TID FUNCTION
[ 28388] | main() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | fac() {
[ 28388] | backtrace() {
880.363 us [ 28388] | _ULaarch64_init_local();
802.081 us [ 28388] | _ULaarch64_step();
3.645 us [ 28388] | _ULaarch64_get_reg();
1.823 us [ 28388] | _ULaarch64_get_reg();
4.219 us [ 28388] | memset();
1.221 ms [ 28388] | _ULaarch64_get_proc_name();
67.708 us [ 28388] | __cxa_demangle();
79.791 us [ 28388] | printf();
129.375 us [ 28388] | _ULaarch64_step();
3.230 ms [ 28388] | } /* backtrace */
3.244 ms [ 28388] | } /* fac */
3.246 ms [ 28388] | } /* fac */
3.247 ms [ 28388] | } /* fac */
3.248 ms [ 28388] | } /* fac */
3.249 ms [ 28388] | } /* fac */
3.250 ms [ 28388] | } /* fac */
3.251 ms [ 28388] | } /* fac */
3.252 ms [ 28388] | } /* fac */
3.253 ms [ 28388] | } /* fac */
3.256 ms [ 28388] | } /* fac */
3.258 ms [ 28388] | } /* fac */
3.260 ms [ 28388] | } /* main */
Same program in C version.
#define UNW_LOCAL_ONLY
#include <libunwind.h>
#include <stdio.h>
#include <stdlib.h>
void backtrace()
{
unw_cursor_t cursor;
unw_context_t context;
unw_getcontext(&context);
unw_init_local(&cursor, &context);
int n = 0;
while ( unw_step(&cursor) ) {
unw_word_t ip, sp, off;
unw_get_reg(&cursor, UNW_REG_IP, &ip);
unw_get_reg(&cursor, UNW_REG_SP, &sp);
char symbol[256] = {"<unknown>"};
char *name = symbol;
if ( !unw_get_proc_name(&cursor, symbol, sizeof(symbol), &off) ) {
name = symbol;
}
printf("#%-2d 0x%016" PRIxPTR " sp=0x%016" PRIxPTR " %s + 0x%" PRIxPTR "\n",
++n,
(uintptr_t)(ip),
(uintptr_t)(sp),
name,
(uintptr_t)(off));
if ( name != symbol )
free(name);
}
}
int fac(int n)
{
if ( n == 0 ) {
backtrace();
return 1;
} else {
return n * fac(n - 1);
}
}
int main()
{
fac(10);
return 0;
}
How about using --estimate-return
?
Sorry, I don't have an ARM 32-bit board to test it.
Here is the code example.
This code can be traced properly as follows in x86_64:
But it gets crashed when tracing in ARM.