iovisor / bcc

BCC - Tools for BPF-based Linux IO analysis, networking, monitoring, and more
Apache License 2.0
20.43k stars 3.86k forks source link

stackcount option for stack depth limit #1103

Open brendangregg opened 7 years ago

brendangregg commented 7 years ago

Feature request for a stackcount option to limit the stack depth. Eg, "-l 5" for the top 5 frames only. get_stackid() should already support this, such that the filter and aggregation is done in kernel.

palmtenor commented 7 years ago

Kernel side supports "always skip first X frames", but not "only keep last Y frames". Is that OK?

brendangregg commented 7 years ago

Oh, I thought it did have a top-frames-only flag.

Maybe we can use sysctl_perf_event_max_stack instead for the duration of the trace?

palmtenor commented 7 years ago

We could always use the reserved flag bits for a new control parameter.

I'm also curious, do top frames normally tend to be more useful (as this issue suggested), or less useful (as the syscall suggested)? In my experience, for example when I'm tracing say sys_write, the top few frames are normally always the same set of libc functions, and skipping them does seem to make sense to me?

brendangregg commented 7 years ago

I often just want to see the first 5 frames.

Eg, I was tracing Xen hypercalls:

# ./stackcount.py 't:xen:xen_mc_issue'
Tracing 1 functions for "t:xen:xen_mc_issue"... Hit Ctrl-C to end.
^C
  xen_load_sp0
  __switch_to
  __schedule
  schedule
  do_nanosleep
  hrtimer_nanosleep
  sys_nanosleep
  entry_SYSCALL_64_fastpath
    1
[...]

  xen_write_cr3
  leave_mm
  xen_exit_mmap
  exit_mmap
  mmput
  do_exit
  do_group_exit
  get_signal
  do_signal
  exit_to_usermode_loop
  syscall_return_slowpath
  entry_SYSCALL_64_fastpath
    48

  __xen_pgd_unpin
  xen_exit_mmap
  exit_mmap
  mmput
  do_exit
  do_group_exit
  get_signal
  do_signal
  exit_to_usermode_loop
  syscall_return_slowpath
  entry_SYSCALL_64_fastpath
    48

  xen_alloc_pmd
  __pmd_alloc
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
  clear_user
  load_elf_binary
  search_binary_handler
  do_execveat_common.isra.37
  sys_execve
  do_syscall_64
  return_from_SYSCALL_64
    51

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_hrtimeout_range_clock
  schedule_hrtimeout_range
  poll_schedule_timeout
  do_select
  core_sys_select
  SyS_pselect6
  entry_SYSCALL_64_fastpath
    56

  xen_flush_tlb
  flush_tlb_mm_range
  tlb_flush_mmu_tlbonly
  tlb_finish_mmu
  unmap_region
  do_munmap
  SyS_munmap
  entry_SYSCALL_64_fastpath
    60

  xen_flush_tlb_single
  flush_tlb_mm_range
  move_page_tables
  shift_arg_pages
  setup_arg_pages
  load_elf_binary
  search_binary_handler
  do_execveat_common.isra.37
  sys_execve
  do_syscall_64
  return_from_SYSCALL_64
    70

  xen_flush_tlb_single
  flush_tlb_mm_range
  change_protection
  mprotect_fixup
  setup_arg_pages
  load_elf_binary
  search_binary_handler
  do_execveat_common.isra.37
  sys_execve
  do_syscall_64
  return_from_SYSCALL_64
    70

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  pipe_wait
  pipe_write
  __vfs_write
  vfs_write
  SyS_write
  entry_SYSCALL_64_fastpath
    72

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_hrtimeout_range_clock
  schedule_hrtimeout_range
  poll_schedule_timeout
  do_sys_poll
  sys_poll
  entry_SYSCALL_64_fastpath
    88

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  smpboot_thread_fn
  kthread
  ret_from_fork
    93

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_timeout
  wait_for_completion
  flush_work
  tty_buffer_flush_work
  n_tty_poll
  tty_poll
  do_select
  core_sys_select
  sys_select
  entry_SYSCALL_64_fastpath
    96

  xen_set_pmd_hyper
  xen_set_pmd
  __pte_alloc
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
    105

  xen_set_pmd_hyper
  xen_set_pmd
  free_pgd_range
  free_pgtables
  unmap_region
  do_munmap
  SyS_munmap
  entry_SYSCALL_64_fastpath
    126

  xen_release_pte
  ___pte_free_tlb
  free_pgd_range
  free_pgtables
  unmap_region
  do_munmap
  SyS_munmap
  entry_SYSCALL_64_fastpath
    129

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  futex_wait_queue_me
  futex_wait
  do_futex
  SyS_futex
  entry_SYSCALL_64_fastpath
    139

  xen_alloc_pte
  __pte_alloc
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
    140

  xen_load_sp0
  __switch_to
  __schedule
  _cond_resched
  dput
  __fput
  ____fput
  task_work_run
  exit_to_usermode_loop
  syscall_return_slowpath
  entry_SYSCALL_64_fastpath
    165

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_hrtimeout_range_clock
  schedule_hrtimeout_range
  poll_schedule_timeout
  do_select
  core_sys_select
  sys_select
  entry_SYSCALL_64_fastpath
    182

  xen_flush_tlb_others
  flush_tlb_page
  ptep_clear_flush
  wp_page_copy
  do_wp_page
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
    190

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  rcu_gp_kthread
  kthread
  ret_from_fork
    208

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  do_wait
  SyS_wait4
  entry_SYSCALL_64_fastpath
    234

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_hrtimeout_range_clock
  schedule_hrtimeout_range
  poll_schedule_timeout
  do_select
  core_sys_select
  sys_select
  entry_SYSCALL_64_fastpath
    234

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_hrtimeout_range_clock
  schedule_hrtimeout_range
  poll_schedule_timeout
  do_sys_poll
  sys_poll
  entry_SYSCALL_64_fastpath
    246

  xen_write_cr3
  switch_mm_irqs_off
  __schedule
  schedule
  schedule_preempt_disabled
  cpu_startup_entry
  rest_init
  start_kernel
  x86_64_start_reservations
  xen_start_kernel
    307

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_timeout
  xfsaild
  kthread
  ret_from_fork
    352

  xen_flush_tlb_single
  flush_tlb_page
  ptep_clear_flush
  wp_page_copy
  do_wp_page
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
  pipe_read
  __vfs_read
  vfs_read
  sys_read
  entry_SYSCALL_64_fastpath
    355

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_hrtimeout_range_clock
  schedule_hrtimeout_range
  ep_poll
  SyS_epoll_wait
  entry_SYSCALL_64_fastpath
    392

  xen_flush_tlb_others
  flush_tlb_page
  ptep_clear_flush
  wp_page_copy
  do_wp_page
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
    467

  xen_ptep_modify_prot_commit
  change_protection
  mprotect_fixup
  do_mprotect_pkey
  sys_mprotect
  entry_SYSCALL_64_fastpath
    526

  xen_flush_tlb_single
  flush_tlb_mm_range
  change_protection
  mprotect_fixup
  do_mprotect_pkey
  sys_mprotect
  entry_SYSCALL_64_fastpath
    526

  xen_flush_tlb_single
  flush_tlb_mm_range
  tlb_flush_mmu_tlbonly
  tlb_finish_mmu
  unmap_region
  do_munmap
  SyS_munmap
  entry_SYSCALL_64_fastpath
    622

  xen_flush_tlb_single
  flush_tlb_page
  ptep_clear_flush
  wp_page_copy
  do_wp_page
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
    769

  __xen_pgd_unpin
  xen_exit_mmap
  exit_mmap
  mmput
  do_exit
  do_group_exit
  SyS_exit_group
  entry_SYSCALL_64_fastpath
    871

  xen_write_cr3
  leave_mm
  xen_exit_mmap
  exit_mmap
  mmput
  do_exit
  do_group_exit
  SyS_exit_group
  entry_SYSCALL_64_fastpath
    871

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_timeout
  rcu_gp_kthread
  kthread
  ret_from_fork
    905

  xen_flush_tlb
  flush_tlb_mm_range
  copy_process.part.33
  _do_fork
  sys_clone
  do_syscall_64
  return_from_SYSCALL_64
    919

  __xen_pgd_pin
  xen_dup_mmap
  copy_process.part.33
  _do_fork
  sys_clone
  do_syscall_64
  return_from_SYSCALL_64
    919

  xen_load_sp0
  __switch_to
  ret_from_fork
    919

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  pipe_wait
  pipe_read
  __vfs_read
  vfs_read
  sys_read
  entry_SYSCALL_64_fastpath
    1281

  xen_write_cr3
  switch_mm_irqs_off
  __schedule
  schedule
  schedule_preempt_disabled
  cpu_startup_entry
  cpu_bringup_and_idle
    1483

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_preempt_disabled
  cpu_startup_entry
  rest_init
  start_kernel
  x86_64_start_reservations
  xen_start_kernel
    1529

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  worker_thread
  kthread
  ret_from_fork
    2540

  xen_set_pmd_hyper
  xen_set_pmd
  alloc_set_pte
  filemap_map_pages
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
    3163

  xen_alloc_pte
  alloc_set_pte
  filemap_map_pages
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
    3163

  xen_load_sp0
  __switch_to
  __schedule
  schedule
  schedule_preempt_disabled
  cpu_startup_entry
  cpu_bringup_and_idle
    6629

  xen_load_tls
    16448

  xen_flush_tlb_single
  flush_tlb_page
  ptep_clear_flush
  wp_page_copy
  do_wp_page
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault
    46604

  xen_set_pte_at
  copy_page_range
  copy_process.part.33
  _do_fork
  sys_clone
  do_syscall_64
  return_from_SYSCALL_64
    565901

Detaching...

Notice how the calling function, or the first 2 or 3 frames, are pretty useful for identifying the hypercall?

brendangregg commented 6 years ago

Yes, we could use the reserved bits, but I'm wondering if Alexei was relying on a different way to do stack limits. See max_depth in https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d5a3b1f691865be576c2bffa708549b8cdccda19

And/or setting kernel.perf_event_max_stack. But when I try, I get errors.

# sysctl -w kernel.perf_event_max_stack=3
kernel.perf_event_max_stack = 3

# /mnt/src/bcc/tools/stackcount.py decay_load
/virtual/main.c:13:1: error: could not open bpf map: Invalid argument
is maps/stacktrace map type enabled in your kernel?
BPF_STACK_TRACE(stack_traces, 1024);
^
/virtual/include/bcc/helpers.h:199:3: note: expanded from macro 'BPF_STACK_TRACE'
  BPF_TABLE("stacktrace", int, struct bpf_stacktrace, _name, roundup_pow_of_two(_max_entries))
  ^
/virtual/include/bcc/helpers.h:60:76: note: expanded from macro 'BPF_TABLE'
#define BPF_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries) \
                                                                           ^
/virtual/include/bcc/helpers.h:56:4: note: expanded from macro '\
BPF_F_TABLE'
}; \
   ^
/virtual/main.c:21:41: error: bpf_table stack_traces failed to open
                    key.user_stack_id = stack_traces.get_stackid(
                                        ^
2 errors generated.
Failed to compile BPF text

If that were to work, I could imagine setting and restoring kernel.perf_event_max_stack whenever running the stackcount tool.