Open YangKeao opened 3 years ago
Here is a full backtrace from a libgcc compiled with -g -ggdb
(running the example flamegraph: gdb target/debug/examples/flamegraph
):
Program received signal SIGSEGV, Segmentation fault.
0x0000007fbf6b1a78 in aarch64_fallback_frame_state (context=0x7fffffbbf0,
context=0x7fffffbbf0, fs=0x7fffffbfb0) at ./md-unwind-support.h:74
74 ./md-unwind-support.h: No such file or directory.
(gdb) bt
#0 0x0000007fbf6b1a78 in aarch64_fallback_frame_state (context=0x7fffffbbf0,
context=0x7fffffbbf0, fs=0x7fffffbfb0) at ./md-unwind-support.h:74
#1 uw_frame_state_for (context=context@entry=0x7fffffbbf0, fs=fs@entry=0x7fffffbfb0)
at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1271
#2 0x0000007fbf6b31a0 in _Unwind_Backtrace (
trace=0x55556de09c <backtrace::backtrace::libunwind::trace::trace_fn>,
trace_argument=0x7fffffc650)
at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind.inc:302
#3 0x000000555557893c in backtrace::backtrace::libunwind::trace (cb=...)
at /data/gentoo64/home/yangkeao/.cargo/registry/src/github.com-1ecc6299db9ec823/backtrace-0.3.61/src/backtrace/libunwind.rs:90
#4 backtrace::backtrace::trace_unsynchronized<pprof::profiler::perf_signal_handler::{closure#0}> (cb=...)
at /data/gentoo64/home/yangkeao/.cargo/registry/src/github.com-1ecc6299db9ec823/backtrace-0.3.61/src/backtrace/mod.rs:66
#5 0x000000555557af20 in pprof::profiler::perf_signal_handler (_signal=27,
_siginfo=0x7fffffd3c0, ucontext=0x7fffffd440) at src/profiler.rs:248
#6 <signal handler called>
#7 0x0000005555680b8c in core::ptr::non_null::NonNull<usize>::as_ptr<usize> (self=...)
at /rustc/09c42c45858d5f3aedfa670698275303a3d19afa/library/core/src/ptr/non_null.rs:270
#8 0x00000055556812a4 in core::slice::iter::Iter<usize>::post_inc_start<usize> (
self=0x555582aec8, offset=246)
at /rustc/09c42c45858d5f3aedfa670698275303a3d19afa/library/core/src/slice/iter/macros.rs:93
#9 core::slice::iter::{impl#172}::next<usize> (self=0x7fffffe700)
at /rustc/09c42c45858d5f3aedfa670698275303a3d19afa/library/core/src/slice/iter/macros.rs:149
#10 0x00000000000004cd in ?? ()
The signal frame has been resolved successfully. But the following codes get wrong pc
value:
/* A signal frame will have a return address pointing to
__default_sa_restorer. This code is hardwired as:
0xd2801168 movz x8, #0x8b
0xd4000001 svc 0x0
*/
if (pc[0] != MOVZ_X8_8B || pc[1] != SVC_0)
{
return _URC_END_OF_STACK;
}
In this stack frame, the pc
value is 0x4cd
, which causes this bug.
It is only 64bit away from the correct parent frame.
The corrupted frame located at 0x12c524
in the file, the related FDE is :
000280a0 00000014 0002801c FDE cie=00000088 pc=0012c48c...0012c584
Format: DWARF32
DW_CFA_advance_loc: 8
DW_CFA_def_cfa_offset: +144
DW_CFA_offset: reg30 -16
DW_CFA_nop:
0x12c48c: CFA=reg31
0x12c494: CFA=reg31+144: reg30=[CFA-16]
Then we should calculate 0x0000007fffffe190 + 144 - 16 = 0x7fffffe210
, which is actually the correct address. The dwarf information is actually correct. It seems that the problem locates in the libgcc
?
Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256 in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93369 = (void *) 0x7fffffc2d0
$93370 = (_Unwind_Context_Reg_Val) 0x7fffffb7f8
Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256 in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93371 = (void *) 0x7fffffc310
$93372 = (_Unwind_Context_Reg_Val) 0x7fffffc300
Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256 in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93373 = (void *) 0x7fffffd060
$93374 = (_Unwind_Context_Reg_Val) 0x7fffffd058
Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256 in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93375 = (void *) 0x7fffffd190
$93376 = (_Unwind_Context_Reg_Val) 0x7fffffd288
Breakpoint 9, uw_frame_state_for (context=context@entry=0x7fffffb890, fs=fs@entry=0x7fffffbc50) at /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c:1256
1256 in /data/gentoo64/var/tmp/portage/sys-devel/gcc-11.2.0/work/gcc-11.2.0/libgcc/unwind-dw2.c
$93377 = (void *) 0x7fffffe350
$93378 = (_Unwind_Context_Reg_Val) 0x7fffffe340
Here is the last several CFA
and SP
.
0x0000007fffffc2f8│+0x0b68: 0x000000555581ab40 → 0x000000555556da18 → <core::ptr::drop_in_
place<pprof::profiler::perf_signal_handler::{{closure}}>+0> sub sp, sp, #0x10
0x0000007fffffc300│+0x0b70: 0x000000555557b1a0 → <perf_signal_handler+400> bl 0x5555568540 <pthread_self@plt>
0x0000007fffffd058│+0x18c8: 0x0000007fbf6fc668 → <__kernel_rt_sigreturn+0> mov x8, #0x8b
// #139
0x0000007fffffd288│+0x1af8: 0x000000555556b400 → <flamegraph::main+836> str x0, [sp, #48]
0x0000007fffffe350│+0x2bc0: 0x00000000000004cd ← $x6
Related dwarf information:
00000154 00000020 00000048 FDE cie=00000110 pc=000160bc...00016700
Format: DWARF32
LSDA Address: 00000000002a8c70
DW_CFA_advance_loc: 8
DW_CFA_def_cfa_offset: +1024
DW_CFA_offset: reg30 -8
DW_CFA_offset: reg29 -16
DW_CFA_nop:
DW_CFA_nop:
DW_CFA_nop:
0x160bc: CFA=reg31
0x160c4: CFA=reg31+1024: reg29=[CFA-16], reg30=[CFA-8]
0x000000555556b400
is 0x16400
.
Thanks for @SchrodingerZhu . I have tried to use nongnu libunwind and the unw_xxx
API to get the backtrace in arm. It works surprisingly well!
The modification on backtrace-rs
can be found in https://github.com/YangKeao/backtrace-rs/commit/ee71341ca6a1ea68e2c60677a4d9b31f0042378b
Thanks for @SchrodingerZhu . I have tried to use nongnu libunwind and the
unw_xxx
API to get the backtrace in arm. It works surprisingly well!The modification on
backtrace-rs
can be found in YangKeao/backtrace-rs@ee71341
Great progress! I will also continue to help test nongnu/linunwind
based backtrace in signal handler.
Hi @YangKeao, what is your test env? Have you tested it on darwin+aarch64 (m1)? If not, I will make up for this part.
glad to see this
@mornyx
No. I only test it on raspberry pi.
nongnu-libunwind seems to be a good solution then. it will also not break TiFlash (where rust and c++ transits), while llvm-libunwind has problem handling that solution.
cc @YangKeao
Cool. Then I could move on fixing this issue in these steps:
unw_xxx
APIs, and allow users to (dynamically) link with nongnu-libunwind.tikv-server
to distribute. In this case, the other functions in libgcc_s
will also be overridden (e.g. _Unwind_Backtrace
).BTW, some magic (like rewriting the symbol name with objcopy
) can also be tried, but I'm not sure it will work perfectly. As a .a
file has many .o
files, and they could call each other, it means we should also rewrite the calling symbol (relocation rules), not only the exposed symbol, which could be hard to do perfectly.
/cc @BusyJay @sticnarf
What's your opinion on linking nongu-libunwind with tikv-server
?
Why Rust choose llvm libuwind in the first place?
Why Rust choose llvm libuwind in the first place?
Default Rust unwind implementation is libgcc_s
. I think this is because the C runtime libgcc_s
is provided by almost all Linux distributions.
BTW, some magic (like rewriting the symbol name with
objcopy
) can also be tried, but I'm not sure it will work perfectly. As a.a
file has many.o
files, and they could call each other, it means we should also rewrite the calling symbol (relocation rules), not only the exposed symbol, which could be hard to do perfectly.
Is it possible to patch the nongnu libunwind code? For example, to avoid overriding _Unwind_Backtrace
, just remove the _Unwind_Backtrace
provided by nongnu libunwind. And the unw_
prefix might also be patched?
Is it possible to patch the nongnu libunwind code? For example, to avoid override
_Unwind_Backtrace
, just remove the_Unwind_Backtrace
provided by nongnu libunwind. And theunw_
prefix might be also be patched?
I could try.
@sticnarf @mornyx Building nongnu-libunwind
without enable-cxx-exceptions
will not build the function _Unwind_XXX
(e.g. _Unwind_Backtrace
), which is also the default behavior of the nongnu-libunwind shipped by ubuntu. I think it's fine to directly static-link with it.
The libunwind-sys needs to be modified to support static link. I will try to modify it and submit a PR.
nongnu/libunwind
does not support macOS. I implemented a custom unwind library on macOS using pure Rust. I avoid handling signal frame by using ucontext
in the perf_signal_handler
's param list (which contains the register context before the signal frame). It seems to work well in TiKV so far, I will clean up the code and submit a PR soon.
/cc @YangKeao @sticnarf
Additional info: I previously wrote a minimal Rust demo to test nongnu/libunwind
on AArch64+Linux, it still panics in debug mode, but works fine in release mode (seems).
So please help to confirm that nongnu/libunwind
works successfully in TiKV compiled in debug mode, maybe it is the problem of my demo. cc @YangKeao
Additional info: I previously wrote a minimal Rust demo to test
nongnu/libunwind
on AArch64+Linux, it still panics in debug mode, but works fine in release mode (seems). So please help to confirm thatnongnu/libunwind
works successfully in TiKV compiled in debug mode, maybe it is the problem of my demo. cc @YangKeao
The debug vs release mode difference may be related to inlining..
Additional info: I previously wrote a minimal Rust demo to test
nongnu/libunwind
on AArch64+Linux, it still panics in debug mode, but works fine in release mode (seems). So please help to confirm thatnongnu/libunwind
works successfully in TiKV compiled in debug mode, maybe it is the problem of my demo. cc @YangKeao
libunwind-rs inits its Cursor
with unw_init_local
:
But from the nongnu libunwind doc, we need to use unw_init_local2
and pass special flag if we are using it in a signal handler. I guess if the problem is related to this.
Additional info: I previously wrote a minimal Rust demo to test
nongnu/libunwind
on AArch64+Linux, it still panics in debug mode, but works fine in release mode (seems). So please help to confirm thatnongnu/libunwind
works successfully in TiKV compiled in debug mode, maybe it is the problem of my demo. cc @YangKeaolibunwind-rs inits its
Cursor
withunw_init_local
:But from the nongnu libunwind doc, we need to use
unw_init_local2
and pass special flag if we are using it in a signal handler. I guess if the problem is related to this.
I use nongnu/libunwind
bindings provided by Keao and it looks like UNW_INIT_SIGNAL_FRAME
is set correctly (https://github.com/YangKeao/backtrace-rs/blob/master/src/backtrace/libunwind.rs#L453).
Here is my demo:
use nix::libc;
use nix::sys::signal;
use rand::Rng;
use parking_lot::RwLock;
use smallvec::SmallVec;
// same as pprof-rs/src/timer.rs
mod timer;
const MAX_DEPTH: usize = 32;
lazy_static::lazy_static! {
static ref PROFILER: RwLock<()> = RwLock::new(());
}
fn main() {
// Register signal handler.
let h = signal::SigHandler::SigAction(perf_signal_handler);
let a = signal::SigAction::new(h, signal::SaFlags::SA_SIGINFO, signal::SigSet::empty());
unsafe {
signal::sigaction(signal::SIGPROF, &a).unwrap();
}
// Register SIGPROF.
let _t = timer::Timer::new(99);
// Run some workloads.
loop {
let mut rng = rand::thread_rng();
let mut vec: Vec<i32> = vec![];
for _ in 0..1000000 {
vec.push(rng.gen())
}
vec.sort();
}
}
#[no_mangle]
pub extern "C" fn perf_signal_handler(_: libc::c_int, _: *mut libc::siginfo_t, _: *mut libc::c_void) {
if let Some(mut guard) = PROFILER.try_write() {
let mut bt: SmallVec<[_; MAX_DEPTH]> = SmallVec::with_capacity(MAX_DEPTH);
let mut index = 0;
unsafe {
// in Cargo.toml:
// backtrace = { git = "https://github.com/YangKeao/backtrace-rs.git", features = ["nongnu-unwind"], branch = "master" }
backtrace::trace_unsynchronized_external_api(|frame| {
if index < MAX_DEPTH {
bt.push(frame.clone());
index += 1;
true
} else {
false
}
}, true); // signal_frame = true
}
}
}
Crash stack:
(gdb) r
Starting program: /home/ubuntu/demo/target/debug/demo
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/aarch64-linux-gnu/atomics/libthread_db.so.1".
Program received signal SIGSEGV, Segmentation fault.
0x0000fffff7f8a078 in access_mem (as=<optimized out>, addr=726228, val=0xffffffffa350, write=0, arg=<optimized out>) at aarch64/Ginit.c:337
337 *val = *(unw_word_t *) addr;
(gdb) bt
#0 0x0000fffff7f8a078 in access_mem (as=<optimized out>, addr=726228, val=0xffffffffa350, write=0, arg=<optimized out>) at aarch64/Ginit.c:337
#1 0x0000fffff7f8ae9c in _ULaarch64_is_signal_frame (cursor=cursor@entry=0xffffffffad48) at aarch64/Gis_signal_frame.c:51
#2 0x0000fffff7f8b8cc in _ULaarch64_step (cursor=0xffffffffad48) at aarch64/Gstep.c:147
#3 0x0000aaaaaaaaecdc in backtrace::backtrace::libunwind::external_unwind::UnwCursor::step (self=0xffffffffad48)
at /home/ubuntu/.cargo/git/checkouts/backtrace-rs-d2437bb348eacd45/ee71341/src/backtrace/libunwind.rs:470
#4 0x0000aaaaaaaa9c64 in backtrace::backtrace::libunwind::trace_external_api::{{closure}} (x=...)
at /home/ubuntu/.cargo/git/checkouts/backtrace-rs-d2437bb348eacd45/ee71341/src/backtrace/libunwind.rs:136
#5 0x0000aaaaaaaa8d38 in core::result::Result<T,E>::and_then (self=..., op=...) at /rustc/db9d1b20bba1968c1ec1fc49616d4742c1725b4b/library/core/src/result.rs:966
#6 0x0000aaaaaaaa9288 in backtrace::backtrace::libunwind::trace_external_api (f=0xffffffffbd80, signal_frame=true)
at /home/ubuntu/.cargo/git/checkouts/backtrace-rs-d2437bb348eacd45/ee71341/src/backtrace/libunwind.rs:121
#7 backtrace::backtrace::trace_unsynchronized_external_api (cb=..., signal_frame=true)
at /home/ubuntu/.cargo/git/checkouts/backtrace-rs-d2437bb348eacd45/ee71341/src/backtrace/mod.rs:72
#8 0x0000aaaaaaaad794 in demo::perf_signal_handler (_signal=27, _siginfo=0xffffffffdc50, _ucontext=0xffffffffdcd0) at src/main.rs:43
#9 <signal handler called>
#10 0x0000aaaaaaab0564 in core::cmp::impls::<impl core::cmp::PartialOrd for i32>::lt (self=0xfffffffff200, other=0x376726befffff1f8)
at /rustc/db9d1b20bba1968c1ec1fc49616d4742c1725b4b/library/core/src/cmp.rs:1327
#11 0x0000aaaaaaab0678 in <core::ops::range::Range<T> as core::iter::range::RangeIteratorImpl>::spec_next (
self=0xaaaaaaab062c <core::iter::range::<impl core::iter::traits::iterator::Iterator for core::ops::range::Range<A>>::next+20>)
at /rustc/db9d1b20bba1968c1ec1fc49616d4742c1725b4b/library/core/src/iter/range.rs:621
#12 0x00000000000b14d4 in ?? ()
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
uname -a
:
Linux ubuntu 5.11.0-1025-aws #27~20.04.1-Ubuntu SMP Fri Jan 7 13:14:39 UTC 2022 aarch64 aarch64 aarch64 GNU/Linux
nongnu/libunwind
does not support macOS.
@sticnarf Can you help me to check whether the released pprof-rs will cause the bug in macOS? It seems that the macos has its own unwind library, which "may" be better than libgcc
😢 . If the released pprof-rs and tikv-server can work properly on M1 Mac, then we only need to handle this issue on linux.
@siddontang told me he cannot reproduce this issue (with the examples in pprof-rs
) on M1 Mac.
@sticnarf Can you help me to check whether the released pprof-rs will cause the bug in macOS? It seems that the macos has its own unwind library, which "may" be better than
libgcc
. If the released pprof-rs and tikv-server can work properly on M1 Mac, then we only need to handle this issue on linux.
Actually https://github.com/tikv/tikv/issues/9957#issuecomment-813368686 still segfaults on latest macOS :(
@siddontang told me he cannot reproduce this issue (with the examples in pprof-rs) on M1 Mac.
Maybe he was using macOS 10.x. I remember the issue appears after macOS 11
nongnu/libunwind
does not support macOS.@sticnarf Can you help me to check whether the released pprof-rs will cause the bug in macOS? It seems that the macos has its own unwind library, which "may" be better than
libgcc
😢 . If the released pprof-rs and tikv-server can work properly on M1 Mac, then we only need to handle this issue on linux.@siddontang told me he cannot reproduce this issue (with the examples in
pprof-rs
) on M1 Mac.
This problem can be reproduced by executing the demo I posted above on my m1 mac.
(lldb) r
Process 41331 launched: '/Users/mornyx/Desktop/repos/segfault-demo/target/debug/segfault-demo' (arm64)
Process 41331 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BREAKPOINT (code=1, subcode=0x1b7dada18)
frame #0: 0x00000001b7dada18 libunwind.dylib`libunwind::UnwindCursor<libunwind::LocalAddressSpace, libunwind::Registers_arm64>::step() + 680
libunwind.dylib`libunwind::UnwindCursor<libunwind::LocalAddressSpace, libunwind::Registers_arm64>::step:
-> 0x1b7dada18 <+680>: brk #0xc471
0x1b7dada1c <+684>: pacib x16, x8
0x1b7dada20 <+688>: str x16, [x8]
0x1b7dada24 <+692>: ldr x16, [x19]
Target 0: (segfault-demo) stopped.
(lldb) bt
error: need to add support for DW_TAG_base_type '()' encoded with DW_ATE = 0x7, bit_size = 0
* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BREAKPOINT (code=1, subcode=0x1b7dada18)
* frame #0: 0x00000001b7dada18 libunwind.dylib`libunwind::UnwindCursor<libunwind::LocalAddressSpace, libunwind::Registers_arm64>::step() + 680
frame #1: 0x00000001b7db010c libunwind.dylib`_Unwind_Backtrace + 284
frame #2: 0x0000000100004470 segfault-demo`backtrace::backtrace::trace_unsynchronized::hc3e1b7c2884d399c [inlined] backtrace::backtrace::libunwind::trace::hf29b36d39a2f0171(cb=&mut dyn core::ops::function::FnMut<(&backtrace::backtrace::Frame), Output=bool> @ 0x000000016fdfe3a0) at libunwind.rs:93:5
frame #3: 0x0000000100004464 segfault-demo`backtrace::backtrace::trace_unsynchronized::hc3e1b7c2884d399c(cb={closure#0} @ 0x000000016fdfe388) at mod.rs:66:5
frame #4: 0x0000000100006158 segfault-demo`perf_signal_handler((null)=27, (null)=0x000000016fdfe450, ucontext=0x000000016fdfe4b8) at main.rs:43:13
frame #5: 0x00000001ad3984e4 libsystem_platform.dylib`_sigtramp + 56
frame #6: 0x00000001ad196594 libsystem_malloc.dylib`szone_malloc_should_clear + 92
frame #7: 0x00000001ad196594 libsystem_malloc.dylib`szone_malloc_should_clear + 92
frame #8: 0x00000001ad19876c libsystem_malloc.dylib`nanov2_realloc + 88
frame #9: 0x00000001ad198658 libsystem_malloc.dylib`malloc_zone_realloc + 140
frame #10: 0x00000001ad1973c8 libsystem_malloc.dylib`realloc + 328
frame #11: 0x0000000100034d18 segfault-demo`alloc::alloc::realloc::h74f14f2ce4ab84b0(ptr="@B\n", layout=Layout @ 0x000000016fdfeab0, new_size=512) at alloc.rs:124:14
frame #12: 0x0000000100034ab4 segfault-demo`alloc::alloc::Global::grow_impl::hb5673b9e9d83a1b7(self=0x000000016fdff2e8, ptr=(pointer = "@B\n"), old_layout=Layout @ 0x000000016fdfeb90, new_layout=Layout @ 0x000000016fdfeba0, zeroed=false) at alloc.rs:201:31
frame #13: 0x0000000100034dc8 segfault-demo`_$LT$alloc..alloc..Global$u20$as$u20$core..alloc..Allocator$GT$::grow::h9a808bc86915a00c(self=0x000000016fdff2e8, ptr=(pointer = "@B\n"), old_layout=Layout @ 0x000000016fdfecd0, new_layout=Layout @ 0x000000016fdfece0) at alloc.rs:254:18
frame #14: 0x00000001000344a0 segfault-demo`alloc::raw_vec::finish_grow::h4ae967a0979b21be(new_layout=Result<core::alloc::layout::Layout, core::alloc::layout::LayoutError> @ 0x000000016fdfedc8, current_memory=Option<(core::ptr::non_null::NonNull<u8>, core::alloc::layout::Layout)> @ 0x000000016fdfef20, alloc=0x000000016fdff2e8) at raw_vec.rs:457:13
frame #15: 0x00000001000319c4 segfault-demo`alloc::raw_vec::RawVec$LT$T$C$A$GT$::grow_amortized::hf32eb34c9a647fa0(self=0x000000016fdff2e8, len=16, additional=1) at raw_vec.rs:393:19
frame #16: 0x0000000100062874 segfault-demo`alloc::raw_vec::RawVec$LT$T$C$A$GT$::reserve::do_reserve_and_handle::hf1a1ca5b41c0d04a(slf=0x000000016fdff2e8, len=16, additional=1) at raw_vec.rs:284:28
frame #17: 0x0000000100031b40 segfault-demo`alloc::raw_vec::RawVec$LT$T$C$A$GT$::reserve::h31a07e947ecd1269(self=0x000000016fdff2e8, len=16, additional=1) at raw_vec.rs:288:13
frame #18: 0x0000000100033280 segfault-demo`alloc::vec::Vec$LT$T$C$A$GT$::reserve::h2fb49bd2d09baade(self=0x000000016fdff2e8, additional=1) at mod.rs:809:9
frame #19: 0x0000000100033068 segfault-demo`alloc::vec::Vec$LT$T$C$A$GT$::push::h7a8573a64b7fabd6(self=0x000000016fdff2e8, value=(start = 344640, len = 10)) at mod.rs:1729:13
frame #20: 0x0000000100004e20 segfault-demo`alloc::slice::merge_sort::h5090a8047527648e(v=&mut [i32] @ 0x000000016fdff3b8, is_less={closure#0} @ 0x000000016fdff297) at slice.rs:1131:9
frame #21: 0x0000000100005440 segfault-demo`alloc::slice::_$LT$impl$u20$$u5b$T$u5d$$GT$::sort::hab2ae5faf8805e55(self=&mut [i32] @ 0x000000016fdff430) at slice.rs:276:9
frame #22: 0x00000001000060c8 segfault-demo`segfault_demo::main::hdc6e6b0d97558958 at main.rs:34:9
frame #23: 0x0000000100004050 segfault-demo`core::ops::function::FnOnce::call_once::h769b14bc6391e915((null)=(segfault-demo`segfault_demo::main::hdc6e6b0d97558958 at main.rs:16), (null)=<unavailable>) at function.rs:227:5
frame #24: 0x00000001000059b0 segfault-demo`std::sys_common::backtrace::__rust_begin_short_backtrace::h212780ff714d2704(f=(segfault-demo`segfault_demo::main::hdc6e6b0d97558958 at main.rs:16)) at backtrace.rs:123:18
frame #25: 0x000000010000678c segfault-demo`std::rt::lang_start::_$u7b$$u7b$closure$u7d$$u7d$::he43552e542027ddc at rt.rs:145:18
frame #26: 0x000000010004bd88 segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] core::ops::function::impls::_$LT$impl$u20$core..ops..function..FnOnce$LT$A$GT$$u20$for$u20$$RF$F$GT$::call_once::h10f2582b16e2b13c at function.rs:259:13 [opt]
frame #27: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panicking::try::do_call::hd3dfc31f9ced2f42 at panicking.rs:406:40 [opt]
frame #28: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panicking::try::h584945b02ec0e15d at panicking.rs:370:19 [opt]
frame #29: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panic::catch_unwind::h1138cecd37279bb6 at panic.rs:133:14 [opt]
frame #30: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::rt::lang_start_internal::_$u7b$$u7b$closure$u7d$$u7d$::hf94f7401539e24a6 at rt.rs:128:48 [opt]
frame #31: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panicking::try::do_call::ha8b5def05088e3d3 at panicking.rs:406:40 [opt]
frame #32: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panicking::try::h3ce579dae5f3a6fb at panicking.rs:370:19 [opt]
frame #33: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 [inlined] std::panic::catch_unwind::h29ecbe0d385e9017 at panic.rs:133:14 [opt]
frame #34: 0x000000010004bd7c segfault-demo`std::rt::lang_start_internal::h35c587f98e9244f6 at rt.rs:128:20 [opt]
frame #35: 0x0000000100006758 segfault-demo`std::rt::lang_start::h969a743b4698eeaf(main=(segfault-demo`segfault_demo::main::hdc6e6b0d97558958 at main.rs:16), argc=1, argv=0x000000016fdff810) at rt.rs:144:17
frame #36: 0x0000000100006270 segfault-demo`main + 32
frame #37: 0x00000001001090f4 dyld`start + 520
On ARM+macOS, we can be confident that the frame-pointer exists and we can take advantage of this. And because our stacktrace scene only exists in signal handler, we can use the ucontext provided by kernel to skip the signal frame. Here is my implementation: https://github.com/mornyx/unwind
Then I introduced this implementation in pprof-rs
:
https://github.com/mornyx/pprof-rs/commit/a68772cb4093c8e31064f5516520afb1cb692aa3
And it worked fine in TiKV (based on sysbench stress):
In fact I also tried parsing compact unwind info
(the __unwind_info
section of the binary on macOS). It does not contain the function index in the darwin-xnu kernel (as discussed in https://github.com/tikv/tikv/issues/9957).
on macos, I think libunwind is referring to llvm's libunwind. As I early mentioned in the internal chat, at least on linux, llvm's libunwind would parse rust's debug CFI wrongly. This actually forbids the tiflash team from using the lib when migrating to LLVM toolchain.
Sent from ProtonMail mobile
-------- Original Message -------- On Feb 25, 2022, 1:28 PM, Yilin Chen < @.***> wrote:
@.***sticnarf Can you help me to check whether the released pprof-rs will cause the bug in macOS? It seems that the macos has its own unwind library, which "may" be better than
libgcc
. If the released pprof-rs and tikv-server can work properly on M1 Mac, then we only need to handle this issue on linux.Actually tikv/tikv#9957 (comment) still segfaults on latest macOS :(
— Reply to this email directly, view it on GitHub, or unsubscribe. You are receiving this because you were mentioned.Message ID: @.***>
On ARM+macOS, we can be confident that the frame-pointer exists and we can take advantage of this. And because our stacktrace scene only exists in signal handler, we can use the ucontext provided by kernel to skip the signal frame. Here is my implementation: https://github.com/mornyx/unwind
Then I introduced this implementation in
pprof-rs
: mornyx@a68772cAnd it worked fine in TiKV (based on sysbench stress):
From the graph you posted, the result looks distorted.
For example, I never see _get_attr_list
so frequently sampled. And a lot of stacks seem missing in the grpc part.
I guess many frame pointers are not actually emitted or they are not properly parsed.
I suppose frame pointers cannot be used for inlined functions while dwarf section can still be used locate inline functions.
I suppose frame pointers cannot be used for inlined functions while dwarf section can still be used locate inline functions.
I tested it on newer versions of macOS, and in fact the binary only contains the __unwind_info
segment, no longer the __eh_frame
segment. The point is that __unwind_info
does not contain inline information either. However backtrace::resolve()
can correctly call callback for inlined functions.
The demo below confirms this point:
use unwind::{init_unwind_context, UnwindContext};
fn main() {
func1_inlined();
}
#[inline(always)]
fn func1_inlined() {
func2();
}
fn func2() {
unsafe {
let mut context = UnwindContext::default();
init_unwind_context(&mut context as _);
show(context.pc);
// jump to next frame
context.pc = *std::mem::transmute::<_, *const u64>(context.fp + 8);
context.fp = *std::mem::transmute::<_, *const u64>(context.fp);
show(context.pc);
}
}
unsafe fn show(pc: u64) {
println!("{:#x}", pc);
backtrace::resolve(std::mem::transmute(pc), |s| {
println!("{:?}", s.name());
});
}
output:
0x100a82ffc
Some(demo::func2::ha91bc1d34ffee8cf)
0x100a82fd4
Some(demo::func1_inlined::h45123ba64dc5a176)
Some(demo::main::h9c57e81ec6be060b)
Looking at 0x100a82fd4
, we got two functions for one pointer, including the one which is inlined.
From the graph you posted, the result looks distorted.
For example, I never see
_get_attr_list
so frequently sampled. And a lot of stacks seem missing in the grpc part.I guess many frame pointers are not actually emitted or they are not properly parsed.
I am not familiar with the hotspot distribution of TiKV before, what about this:
I limited the sample pointer address to the address space of the tikv executable itself, without any dynamic link library. I guess the sample set from the previous image is the "real" one. pprof-rs
filters out many samples from the C world via blocklist()
. And the world of dynamic link libraries is very different on linux and macOS.
From the graph you posted, the result looks distorted. For example, I never see
_get_attr_list
so frequently sampled. And a lot of stacks seem missing in the grpc part. I guess many frame pointers are not actually emitted or they are not properly parsed.I am not familiar with the hotspot distribution of TiKV before, what about this:
I limited the sample pointer address to the address space of the tikv executable itself, without any dynamic link library. I guess the sample set from the previous image is the "real" one.
pprof-rs
filters out many samples from the C world viablocklist()
. And the world of dynamic link libraries is very different on linux and macOS.
Yep, this looks more reasonable.
From the graph you posted, the result looks distorted. For example, I never see
_get_attr_list
so frequently sampled. And a lot of stacks seem missing in the grpc part. I guess many frame pointers are not actually emitted or they are not properly parsed.I am not familiar with the hotspot distribution of TiKV before, what about this:
Maybe you can perform a standard workload (like sysbench or tpcc) for a TiKV on Linux. Then you can compare with the output on MacOS.
Update from TiFlash: after a very long run of continuous profiling, TiFlash's profiling also crashed. Notice that this is on normal x86_64 platform.
The scenario of unwinding across languages is a little bit complicated, but it shows that currently gcc_s
's implementation is not reliable in this situation.
Update from TiFlash: after a very long run of continuous profiling, TiFlash's profiling also crashed. Notice that this is on normal x86_64 platform.
The scenario of unwinding across languages is a little bit complicated, but it shows that currently
gcc_s
's implementation is not reliable in this situation.
After observing, I found that if we trace stack directly from the signal handler, there will always be strange problems. This is because there is always an opaque signal frame placed by the kernel between the signal handler's stack frame and the previous user mode stack frame.
My solution is to use the ucontext
provided by the kernel to the signal handler to find the previous user mode stack frame, so we will avoid handling any signal frames.
Also, for any non-legal range address taken from .eh_frame
or __unwind_info
, backtracking will abort rather than attempt to continue. This can effectively avoid segmentation faults. This is not rigorous for general unwind implementations, but it is sufficient for profiling.
I've added Linux + x86_64 support to my unwind implementation, and I'll make a PR to pprof-rs
after full testing.
@mornyx Sorry, I don't understand how you implement it. It sounds really magic. I don't think it's possible to do offline dwarf unwind without saving the stack (e.g. the perf
actually unwind offline, but it needs to copy and save the full current stack).
As I know, the dwarf can only help you to get the address of the callee-saved register on the stack, while running, the stack changed, and I don't know how you can find the remains of previous running.
Oops! I understand! You were using the ucontext
as the startpoint to only unwind the stack before the signal frame? Really cool 🍻 ! But I guess the problem isn't only because of the signal frame (actually, the x86_64 unwind implementation can all handle the signal frame). The bigger problem is that the unwind implementation may call some functions which are not allowed inside a signal handler, and will cause wired behavior.
@mornyx Sorry, I don't understand how you implement it. It sounds really magic. I don't think it's possible to do offline dwarf unwind without saving the stack (e.g. the
perf
actually unwind offline, but it needs to copy and save the full current stack).As I know, the dwarf can only help you to get the address of the callee-saved register on the stack, while running, the stack changed, and I don't know how you can find the remains of previous running.
Oops! I understand! You were using the
ucontext
as the startpoint to only unwind the stack before the signal frame? Really cool 🍻 ! But I guess the problem isn't only because of the signal frame (actually, the x86_64 unwind implementation can all handle the signal frame). The bigger problem is that the unwind implementation may call some functions which are not allowed inside a signal handler, and will cause wired behavior.
Right~ This also seems to be possible by modifying nongnu/libunwind
(by converting ucontext
to unw_context_t
instead of calling unw_getcontext()
in the signal handler).
But this only solved half the problem for us (as you said, and also doesn't support macOS). So I reimplemented a minimal subset of libunwind, to try to avoid the problems you said.
But this only solved half the problem for us (as you said, and also doesn't support macOS). So I reimplemented a minimal subset of libunwind, to try to avoid the problems you said.
Of course, it also needs to be fully tested to verify that it works correctly...
But this only solved half the problem for us (as you said, and also doesn't support macOS). So I reimplemented a minimal subset of libunwind, to try to avoid the problems you said.
Of course, it also needs to be fully tested to verify that it works correctly...
Don't worry. We have testing environments to run different complicated user payloads. It can be pretty helpful to discover problems and make it close to "battle-tested".
Right~ This also seems to be possible by modifying nongnu/libunwind (by converting ucontext to unw_context_t instead of calling unw_getcontext() in the signal handler).
The nongnu/libunwind
announces that it supports unwind from a signal handler (and accept an argument to tell it whether we are in a signal handler), though we haven't fully tested it yet.
So I reimplemented a minimal subset of libunwind, to try to avoid the problems you said.
It's always glad to see another implementation of libunwind
. Hope it can be better than any existing one (as we have suffered a lot with these 😭 )
I think we can mimic what is going on here but use ucontext to init the UnwindContext.
There is some confusion in various comments on this issue so I'll try to clear up some of it.
The resolution of inline functions happens during symbolication: A single address can resolve to one or more functions. If the address is inside code which the compiler inlined into another function, then you get both the inlined function name (or even multiple inline function names, if the compiler inlined multiple levels deep), and then the outer function name. Whether you get inline functions is completely independent from how you walk the stack.
If you want to do stack walking "offline", you need to capture the entire stack bytes. If you want to do inline function resolution offline, you only need to capture the code addresses on the stack (instruction pointer + return addresses) and enough information to be able to match an address to the library it was in.
A slightly confusing part here is that both stack walking and symbolication can make use of DWARF information. However, they're different subsets of DWARF information. DWARF stack walking information is stored in the eh_frame
or debug_frame
sections. DWARF symbolication information is stored in a other sections which start with debug_
, but not in debug_frame
.
On macOS x86_64 and arm64, all system libraries are compiled with frame pointers enabled. And having framepointers enabled is also the default for clang and Xcode, unless you manually set -fomit-frame-pointer. So frame pointer stack walking mostly works fine, unless you're profiling a program that has been compiled with -fomit-frame-pointer. There is one exception: On arm64, leaf functions don't have frame pointers even if you compile with frame pointers enabled. This means that on arm64, if you just use frame pointer unwinding, you will be missing the second frame in the stack if you're currently inside the leaf function: The first frame will be correct (it comes straight from the instruction pointer), the immediate caller is missing, and the frame pointer gives you the caller's caller, i.e. the third frame. From that point on the rest of stack unwinding works fine.
To unwind leaf functions correctly, you need to look at the compact unwind info in the __unwind_info
section.
On macOS, most binaries will have an __unwind_info
section, and some will also have an __eh_frame
section. For complete unwinding, you need both: __unwind_info
covers the simple cases, and __eh_frame
covers the hard cases.
Here's how it breaks down.
On x86_64:
__unwind_info
covers 99% of the functions.__eh_frame
is needed to correctly unwind the remaining 1% of cases.On arm64:
__unwind_info
lets you unwind leaf functions correctly, but still requires frame pointers for non-leaf functions.__eh_frame
lets you unwind if you don't have frame pointers.I've written a crate called framehop
which is a pure Rust implementation of everything you need to get reliable and correct unwinding on macOS. It doesn't have much documentation yet though. Edit: Documentation is in place now.
@mstange Thanks for the clarification :beers: . As I know, @mornyx is also creating a new dwarf unwind library. I also tried to unwind through frame pointer (for Linux) in https://github.com/tikv/pprof-rs/pull/116 . I still have some confusion:
This means that on arm64, if you just use frame pointer unwinding, you will be missing the second frame in the stack if you're currently inside the leaf function: The first frame will be correct (it comes straight from the instruction pointer), the immediate caller is missing, and the frame pointer gives you the caller's caller, i.e. the third frame.
For non-leaf frame pointer, will the frame pointer register (rbp
or x29
) be modified by the leaf function? If the leaf function stores 0x0
in the frame pointer register, we will have no way to get the former frame. Am I correct? (Or maybe I misunderstood). As shown in https://github.com/tikv/pprof-rs/pull/116#issuecomment-1097548889 , without the force-frame-pointer enabled, the stack is really short, which means we lost not only the caller's frame.
I will try https://github.com/mstange/framehop/ these days :smile: , as #116 has already gave us a chance to provide more options of unwinding method for the user.
This means that on arm64, if you just use frame pointer unwinding, you will be missing the second frame in the stack if you're currently inside the leaf function: The first frame will be correct (it comes straight from the instruction pointer), the immediate caller is missing, and the frame pointer gives you the caller's caller, i.e. the third frame.
For non-leaf frame pointer, will the frame pointer register (
rbp
orx29
) be modified by the leaf function?
Just to clarify: I am talking about the subset of functions which, on arm64, will not create a "frame record" for themselves even if you compile with frame pointers enabled. This happens for functions which do not call other functions (i.e. which are "leaf" functions) and which also don't need to save and restore any registers.
These functions leave x29 ("fp") unchanged. And because they don't call any other functions, the lr register also stays unchanged. So unwinding from these functions only means "get the return address from lr and leave all other registers unchanged".
If the leaf function stores
0x0
in the frame pointer register, we will have no way to get the former frame. Am I correct?
If it did that then yes, frame pointer unwinding would not work at all and there would be no way to get the rest of the stack. You would need to use some kind of unwind information to recover a usable frame pointer value. But luckily these functions leave the frame pointer from the parent frame intact.
(Or maybe I misunderstood). As shown in #116 (comment) , without the force-frame-pointer enabled, the stack is really short, which means we lost not only the caller's frame.
Yes, in code compiled with -fomit-frame-pointer, the easy solution fails. Luckily macOS has established a culture of always enabling frame pointers.
I will try https://github.com/mstange/framehop/ these days 😄 , as #116 has already gave us a chance to provide more options of unwinding method for the user.
Great, please file issues if you run into any trouble. Framehop only solves a subset of the problem; it's still up to the user to find where libraries are mapped in memory, to get their unwind section data, and to read the stack memory in a way that doesn't cause segfaults. Framehop is mostly about speed, caching, handling multiple types of unwind data, and supporting the offline use case on different machines and architectures.
These functions leave x29 ("fp") unchanged. And because they don't call any other functions, the lr register also stays unchanged. So unwinding from these functions only means "get the return address from lr and leave all other registers unchanged".
I did a test and it does exactly what @mstange says, here is a simple demo:
int func1() {
int x = 1;
int y = 2;
return x + y;
}
int func2() {
return func1() + 1;
}
int func3() {
return func2() + 1;
}
On my macOS, the ARM64 asm code generated by clang is as follows:
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 12, 0 sdk_version 12, 3
.globl _func1 ; -- Begin function func1
.p2align 2
_func1: ; @func1
.cfi_startproc
; %bb.0:
sub sp, sp, #16
.cfi_def_cfa_offset 16
mov w8, #1
str w8, [sp, #12]
mov w8, #2
str w8, [sp, #8]
ldr w8, [sp, #12]
ldr w9, [sp, #8]
add w0, w8, w9
add sp, sp, #16
ret
.cfi_endproc
; -- End function
.globl _func2 ; -- Begin function func2
.p2align 2
_func2: ; @func2
.cfi_startproc
; %bb.0:
stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
bl _func1
add w0, w0, #1
ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
ret
.cfi_endproc
; -- End function
.globl _func3 ; -- Begin function func3
.p2align 2
_func3: ; @func3
.cfi_startproc
; %bb.0:
stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
bl _func2
add w0, w0, #1
ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
ret
.cfi_endproc
; -- End function
.subsections_via_symbols
Assuming we are executing func1
at this time, when we start backtracking directly in func1
, the starting point is the IP register in the scope of func1
. But since func1
does not save the FP register (x29), the Frame Pointer still points to the frame start address of func2
, and the frame start address of func2
stores the return address within the range of func3
. So from func1
will directly backtrack to func3
.
But in a regular backtracking scenario, this problem can be easily avoided. We usually wrap the stack backtrace as a function like backtrace()
, and in the implementation of backtrace()
, we call a function like getcontext()
to initialize the register context. So with getcontext()
as a leaf function, we skip the backtrace()
function when backtracking, which is exactly what we want.
The only thing that needs to be done is to ensure that the backtrace()
and getcontext()
functions are not inlined.
The rust demo below proves this conclusion:
use unwind::{unwind_init_registers, Registers};
#[inline(never)]
fn main() {
func1();
}
#[inline(never)]
fn func1() -> i32 {
func2() + 1
}
#[inline(never)]
fn func2() -> i32 {
func3() + 1
}
#[inline(never)]
fn func3() -> i32 {
let x = 1;
let y = 2;
backtrace();
x + y
}
#[inline(never)]
fn backtrace() {
let mut registers = Registers::default();
unsafe {
// Similar to `unw_getcontext()`.
unwind_init_registers(&mut registers as _);
}
// Do stack backtrace.
while registers[29] != 0 {
let pc = load::<u64>(registers[29] + 8); // x29 + 8 points to `Return Address`
registers[29] = load::<u64>(registers[29]);
// Show function name.
println!("{:#x}", pc);
backtrace::resolve(pc as _, |s| {
println!(" {:?}", s.name());
});
}
}
#[inline]
fn load<T: Copy>(address: u64) -> T {
unsafe { *(address as *const T) }
}
The output is (on my ARM64 macOS):
0x100755b38
Some(demo::func3::h552eb43de9d1600a)
0x100755acc
Some(demo::func2::hee00c49c139ef337)
0x100755a70
Some(demo::func1::hc597aa0db27740e8)
0x100755a54
Some(demo::main::h60f61d40767d4a8f)
0x100756a84
Some(core::ops::function::FnOnce::call_once::h3e2c7a62c3d8b7b0)
0x100756db8
Some(std::sys_common::backtrace::__rust_begin_short_backtrace::h344b61623a7d58a2)
0x100756d70
Some(std::rt::lang_start::{{closure}}::h16ad8343d4bef89d)
0x10082466c
Some(core::ops::function::impls::<impl core::ops::function::FnOnce<A> for &F>::call_once::h8eb3ac20f80eabfa)
Some(std::panicking::try::do_call::ha6ddf2c638427188)
Some(std::panicking::try::hda8741de507c1ad0)
Some(std::panic::catch_unwind::h82424a01f258bd39)
Some(std::rt::lang_start_internal::{{closure}}::h67e296ed5b030b7b)
Some(std::panicking::try::do_call::hd3dd7e7e10f6424e)
Some(std::panicking::try::ha0a7bd8122e3fb7c)
Some(std::panic::catch_unwind::h809b0e1092e9475d)
Some(std::rt::lang_start_internal::h358b6d58e23c88c7)
0x100756d38
Some(std::rt::lang_start::h74b9170615de9a35)
0x100755d44
Some("_main")
0x100cb5088
0xb11b000000000000
This is exactly what we expected.
However, when the scene comes to CPU Profiling, if the leaf function is interrupted by the SIGPROF
, I think its parent function will indeed be skipped. This can be avoided by setting -mno-omit-leaf-frame-pointer
.
tikv/tikv#10658
TiKV on HUAWEI,Kunpeng 920 failed to profile and got an panic.