Open brendangregg opened 7 years ago
Kernel side supports "always skip first X frames", but not "only keep last Y frames". Is that OK?
Oh, I thought it did have a top-frames-only flag.
Maybe we can use sysctl_perf_event_max_stack instead for the duration of the trace?
We could always use the reserved flag bits for a new control parameter.
I'm also curious, do top frames normally tend to be more useful (as this issue suggested), or less useful (as the syscall suggested)? In my experience, for example when I'm tracing say sys_write
, the top few frames are normally always the same set of libc functions, and skipping them does make sense to me.
I often just want to see the first 5 frames.
Eg, I was tracing Xen hypercalls:
# ./stackcount.py 't:xen:xen_mc_issue'
Tracing 1 functions for "t:xen:xen_mc_issue"... Hit Ctrl-C to end.
^C
xen_load_sp0
__switch_to
__schedule
schedule
do_nanosleep
hrtimer_nanosleep
sys_nanosleep
entry_SYSCALL_64_fastpath
1
[...]
xen_write_cr3
leave_mm
xen_exit_mmap
exit_mmap
mmput
do_exit
do_group_exit
get_signal
do_signal
exit_to_usermode_loop
syscall_return_slowpath
entry_SYSCALL_64_fastpath
48
__xen_pgd_unpin
xen_exit_mmap
exit_mmap
mmput
do_exit
do_group_exit
get_signal
do_signal
exit_to_usermode_loop
syscall_return_slowpath
entry_SYSCALL_64_fastpath
48
xen_alloc_pmd
__pmd_alloc
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
clear_user
load_elf_binary
search_binary_handler
do_execveat_common.isra.37
sys_execve
do_syscall_64
return_from_SYSCALL_64
51
xen_load_sp0
__switch_to
__schedule
schedule
schedule_hrtimeout_range_clock
schedule_hrtimeout_range
poll_schedule_timeout
do_select
core_sys_select
SyS_pselect6
entry_SYSCALL_64_fastpath
56
xen_flush_tlb
flush_tlb_mm_range
tlb_flush_mmu_tlbonly
tlb_finish_mmu
unmap_region
do_munmap
SyS_munmap
entry_SYSCALL_64_fastpath
60
xen_flush_tlb_single
flush_tlb_mm_range
move_page_tables
shift_arg_pages
setup_arg_pages
load_elf_binary
search_binary_handler
do_execveat_common.isra.37
sys_execve
do_syscall_64
return_from_SYSCALL_64
70
xen_flush_tlb_single
flush_tlb_mm_range
change_protection
mprotect_fixup
setup_arg_pages
load_elf_binary
search_binary_handler
do_execveat_common.isra.37
sys_execve
do_syscall_64
return_from_SYSCALL_64
70
xen_load_sp0
__switch_to
__schedule
schedule
pipe_wait
pipe_write
__vfs_write
vfs_write
SyS_write
entry_SYSCALL_64_fastpath
72
xen_load_sp0
__switch_to
__schedule
schedule
schedule_hrtimeout_range_clock
schedule_hrtimeout_range
poll_schedule_timeout
do_sys_poll
sys_poll
entry_SYSCALL_64_fastpath
88
xen_load_sp0
__switch_to
__schedule
schedule
smpboot_thread_fn
kthread
ret_from_fork
93
xen_load_sp0
__switch_to
__schedule
schedule
schedule_timeout
wait_for_completion
flush_work
tty_buffer_flush_work
n_tty_poll
tty_poll
do_select
core_sys_select
sys_select
entry_SYSCALL_64_fastpath
96
xen_set_pmd_hyper
xen_set_pmd
__pte_alloc
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
105
xen_set_pmd_hyper
xen_set_pmd
free_pgd_range
free_pgtables
unmap_region
do_munmap
SyS_munmap
entry_SYSCALL_64_fastpath
126
xen_release_pte
___pte_free_tlb
free_pgd_range
free_pgtables
unmap_region
do_munmap
SyS_munmap
entry_SYSCALL_64_fastpath
129
xen_load_sp0
__switch_to
__schedule
schedule
futex_wait_queue_me
futex_wait
do_futex
SyS_futex
entry_SYSCALL_64_fastpath
139
xen_alloc_pte
__pte_alloc
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
140
xen_load_sp0
__switch_to
__schedule
_cond_resched
dput
__fput
____fput
task_work_run
exit_to_usermode_loop
syscall_return_slowpath
entry_SYSCALL_64_fastpath
165
xen_load_sp0
__switch_to
__schedule
schedule
schedule_hrtimeout_range_clock
schedule_hrtimeout_range
poll_schedule_timeout
do_select
core_sys_select
sys_select
entry_SYSCALL_64_fastpath
182
xen_flush_tlb_others
flush_tlb_page
ptep_clear_flush
wp_page_copy
do_wp_page
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
190
xen_load_sp0
__switch_to
__schedule
schedule
rcu_gp_kthread
kthread
ret_from_fork
208
xen_load_sp0
__switch_to
__schedule
schedule
do_wait
SyS_wait4
entry_SYSCALL_64_fastpath
234
xen_load_sp0
__switch_to
__schedule
schedule
schedule_hrtimeout_range_clock
schedule_hrtimeout_range
poll_schedule_timeout
do_select
core_sys_select
sys_select
entry_SYSCALL_64_fastpath
234
xen_load_sp0
__switch_to
__schedule
schedule
schedule_hrtimeout_range_clock
schedule_hrtimeout_range
poll_schedule_timeout
do_sys_poll
sys_poll
entry_SYSCALL_64_fastpath
246
xen_write_cr3
switch_mm_irqs_off
__schedule
schedule
schedule_preempt_disabled
cpu_startup_entry
rest_init
start_kernel
x86_64_start_reservations
xen_start_kernel
307
xen_load_sp0
__switch_to
__schedule
schedule
schedule_timeout
xfsaild
kthread
ret_from_fork
352
xen_flush_tlb_single
flush_tlb_page
ptep_clear_flush
wp_page_copy
do_wp_page
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
pipe_read
__vfs_read
vfs_read
sys_read
entry_SYSCALL_64_fastpath
355
xen_load_sp0
__switch_to
__schedule
schedule
schedule_hrtimeout_range_clock
schedule_hrtimeout_range
ep_poll
SyS_epoll_wait
entry_SYSCALL_64_fastpath
392
xen_flush_tlb_others
flush_tlb_page
ptep_clear_flush
wp_page_copy
do_wp_page
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
467
xen_ptep_modify_prot_commit
change_protection
mprotect_fixup
do_mprotect_pkey
sys_mprotect
entry_SYSCALL_64_fastpath
526
xen_flush_tlb_single
flush_tlb_mm_range
change_protection
mprotect_fixup
do_mprotect_pkey
sys_mprotect
entry_SYSCALL_64_fastpath
526
xen_flush_tlb_single
flush_tlb_mm_range
tlb_flush_mmu_tlbonly
tlb_finish_mmu
unmap_region
do_munmap
SyS_munmap
entry_SYSCALL_64_fastpath
622
xen_flush_tlb_single
flush_tlb_page
ptep_clear_flush
wp_page_copy
do_wp_page
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
769
__xen_pgd_unpin
xen_exit_mmap
exit_mmap
mmput
do_exit
do_group_exit
SyS_exit_group
entry_SYSCALL_64_fastpath
871
xen_write_cr3
leave_mm
xen_exit_mmap
exit_mmap
mmput
do_exit
do_group_exit
SyS_exit_group
entry_SYSCALL_64_fastpath
871
xen_load_sp0
__switch_to
__schedule
schedule
schedule_timeout
rcu_gp_kthread
kthread
ret_from_fork
905
xen_flush_tlb
flush_tlb_mm_range
copy_process.part.33
_do_fork
sys_clone
do_syscall_64
return_from_SYSCALL_64
919
__xen_pgd_pin
xen_dup_mmap
copy_process.part.33
_do_fork
sys_clone
do_syscall_64
return_from_SYSCALL_64
919
xen_load_sp0
__switch_to
ret_from_fork
919
xen_load_sp0
__switch_to
__schedule
schedule
pipe_wait
pipe_read
__vfs_read
vfs_read
sys_read
entry_SYSCALL_64_fastpath
1281
xen_write_cr3
switch_mm_irqs_off
__schedule
schedule
schedule_preempt_disabled
cpu_startup_entry
cpu_bringup_and_idle
1483
xen_load_sp0
__switch_to
__schedule
schedule
schedule_preempt_disabled
cpu_startup_entry
rest_init
start_kernel
x86_64_start_reservations
xen_start_kernel
1529
xen_load_sp0
__switch_to
__schedule
schedule
worker_thread
kthread
ret_from_fork
2540
xen_set_pmd_hyper
xen_set_pmd
alloc_set_pte
filemap_map_pages
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
3163
xen_alloc_pte
alloc_set_pte
filemap_map_pages
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
3163
xen_load_sp0
__switch_to
__schedule
schedule
schedule_preempt_disabled
cpu_startup_entry
cpu_bringup_and_idle
6629
xen_load_tls
16448
xen_flush_tlb_single
flush_tlb_page
ptep_clear_flush
wp_page_copy
do_wp_page
handle_mm_fault
__do_page_fault
do_page_fault
page_fault
46604
xen_set_pte_at
copy_page_range
copy_process.part.33
_do_fork
sys_clone
do_syscall_64
return_from_SYSCALL_64
565901
Detaching...
Notice how the calling function, or the first 2 or 3 frames, are pretty useful for identifying the hypercall?
Yes, we could use the reserved bits, but I'm wondering if Alexei was relying on a different way to do stack limits. See max_depth in https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d5a3b1f691865be576c2bffa708549b8cdccda19
And/or setting kernel.perf_event_max_stack. But when I try, I get errors.
# sysctl -w kernel.perf_event_max_stack=3
kernel.perf_event_max_stack = 3
# /mnt/src/bcc/tools/stackcount.py decay_load
/virtual/main.c:13:1: error: could not open bpf map: Invalid argument
is maps/stacktrace map type enabled in your kernel?
BPF_STACK_TRACE(stack_traces, 1024);
^
/virtual/include/bcc/helpers.h:199:3: note: expanded from macro 'BPF_STACK_TRACE'
BPF_TABLE("stacktrace", int, struct bpf_stacktrace, _name, roundup_pow_of_two(_max_entries))
^
/virtual/include/bcc/helpers.h:60:76: note: expanded from macro 'BPF_TABLE'
#define BPF_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries) \
^
/virtual/include/bcc/helpers.h:56:4: note: expanded from macro '\
BPF_F_TABLE'
}; \
^
/virtual/main.c:21:41: error: bpf_table stack_traces failed to open
key.user_stack_id = stack_traces.get_stackid(
^
2 errors generated.
Failed to compile BPF text
If that were to work, I could imagine setting and restoring kernel.perf_event_max_stack whenever running the stackcount tool.
Feature request for a stackcount option to limit the stack depth. Eg, "-l 5" for the top 5 frames only. get_stackid() should already support this, such that the filter and aggregation is done in kernel.