iovisor / bcc

BCC - Tools for BPF-based Linux IO analysis, networking, monitoring, and more
Apache License 2.0

Kernel locks tracing only in process context #5114

Open · Anjali05 opened this issue 1 month ago

Anjali05 commented 1 month ago

Hi,

I am using an extended version of the klockstat tool to trace spin locks, but because of interrupts the stack traces I am getting are quite messy. Ideally, I would like to trace only in process context. Is there a way to do this in eBPF currently? I tried using in_interrupt but I do not think it's supported. Can anyone suggest a workaround? Thanks in advance!

yonghong-song commented 1 month ago

Currently BPF does not have helpers/kfuncs to identify the execution context, so I don't have a good way either. Is it possible to filter out stack traces whose pattern suggests they come from non-process context?
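
For example, one way to apply that suggestion is to post-filter on the tool's user-space side, dropping any sample whose resolved frame names contain an interrupt entry symbol. A minimal sketch in C; the symbolized frame strings and the marker list below are illustrative assumptions, not part of klockstat:

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

/* Illustrative, non-exhaustive markers for frames that indicate IRQ/softirq context. */
static const char *irq_markers[] = {
    "sysvec_",             /* x86 interrupt entry, e.g. asm_sysvec_apic_timer_interrupt */
    "hrtimer_interrupt",
    "__do_softirq",
};

/* Return true if any symbolized frame matches one of the markers. */
static bool stack_from_interrupt(const char **frames, size_t nframes)
{
    for (size_t i = 0; i < nframes; i++) {
        if (!frames[i])
            continue;
        for (size_t j = 0; j < sizeof(irq_markers) / sizeof(irq_markers[0]); j++) {
            if (strstr(frames[i], irq_markers[j]))
                return true;
        }
    }
    return false;
}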

Anjali05 commented 1 month ago

@yonghong-song There are patterns, but if I filter lock acquisitions like that I will lose data on some locks. For example, in stacks like the one below I do not get a separate stack trace giving me the name of the lock acquired by folio_lruvec_lock_irqsave:

   "locks": [
            "&head->lock"
        ],
        "kernel_stack": [
            "_raw_spin_lock+0x5 [kernel]",
            "__pcpu_freelist_pop+0xb3 [kernel]",
            "pcpu_freelist_pop+0x1a [kernel]",
            "__bpf_get_stackid+0x147 [kernel]",
            "bpf_get_stackid_raw_tp+0x56 [kernel]",
            "bpf_prog_6deef7357e7b4530+0x91a6 [bpf]",
            "ipmi_si_pci_init.cold+0x11741 [ipmi_si]",
            "_raw_spin_lock_nested+0x5 [kernel]",
            "raw_spin_rq_lock_nested+0x23 [kernel]",
            "try_to_wake_up+0x249 [kernel]",
            "hrtimer_wakeup+0x1e [kernel]",
            "__hrtimer_run_queues+0x137 [kernel]",
            "hrtimer_interrupt+0x118 [kernel]",
            "__sysvec_apic_timer_interrupt+0x97 [kernel]",
            "sysvec_apic_timer_interrupt+0x9d [kernel]",
            "asm_sysvec_apic_timer_interrupt+0x16 [kernel]",
            "unwind_next_frame+0x34 [kernel]",
            "__unwind_start+0x15b [kernel]",
            "perf_callchain_kernel+0x9d [kernel]",
            "get_perf_callchain+0xc5 [kernel]",
            "bpf_get_stackid+0x69 [kernel]",
            "bpf_get_stackid_raw_tp+0x56 [kernel]",
            "bpf_prog_6deef7357e7b4530+0x8b7e [bpf]",
            "ipmi_si_pci_init.cold+0xd73e [ipmi_si]",
            "_raw_spin_lock_irqsave+0x5 [kernel]",
            "folio_lruvec_lock_irqsave+0x5e [kernel]",
            "folio_batch_move_lru+0x58 [kernel]",
            "folio_add_lru+0x9c [kernel]",
            "shmem_get_folio_gfp+0x3b4 [kernel]",
            "shmem_fault+0x76 [kernel]",
            "__do_fault+0x31 [kernel]",
            "do_fault+0x21c [kernel]",
            "__handle_mm_fault+0x628 [kernel]",
            "handle_mm_fault+0x107 [kernel]",
            "__get_user_pages+0x231 [kernel]",
            "get_user_pages_unlocked+0xd3 [kernel]",
            "hva_to_pfn+0xfb [kvm]",
            "kvm_faultin_pfn+0x91 [kvm]",
            "direct_page_fault+0x22c [kvm]",
            "kvm_mmu_page_fault+0x10d [kvm]",
            "vmx_handle_exit+0x16b [kvm_intel]",
            "vcpu_enter_guest+0xa40 [kvm]",
            "kvm_arch_vcpu_ioctl_run+0x266 [kvm]",
            "kvm_vcpu_ioctl+0x27d [kvm]",
            "__x64_sys_ioctl+0x88 [kernel]",
            "do_syscall_64+0x38 [kernel]",
            "entry_SYSCALL_64_after_hwframe+0x63 [kernel]",
            ""
        ],
        "addresses": [
            "0xffffffff81cd0c35",
            "0xffffffff812559e3",
            "0xffffffff81255c2a",
            "0xffffffff812718c7",
            "0xffffffff81211ea6",
            "0xffffffffa0018192",
            "0xffffffffa09a0052",
            "0xffffffff81cd0cb5",
            "0xffffffff810e1013",
            "0xffffffff810ea3b9",
            "0xffffffff8116ad9e",
            "0xffffffff8116b057",
            "0xffffffff8116bc38",
            "0xffffffff8106eb67",
            "0xffffffff81cbb13d",
            "0xffffffff81e00e46",
            "0xffffffff81082c74",
            "0xffffffff810833eb",
            "0xffffffff810067dd",
            "0xffffffff81291115",
            "0xffffffff81271bf9",
            "0xffffffff81211ea6",
            "0xffffffffa0017b6a",
            "0xffffffffa099c04f",
            "0xffffffff81cd0e95",
            "0xffffffff8136847e",
            "0xffffffff812af818",
            "0xffffffff812af9cc",
            "0xffffffff812c07a4",
            "0xffffffff812c0b96",
            "0xffffffff812e3aa1",
            "0xffffffff812e4a1c",
            "0xffffffff812e8fe8",
            "0xffffffff812e9ac7",
            "0xffffffff812dbeb1",
            "0xffffffff812dc3d3",
            "0xffffffffa0e6196b",
            "0xffffffffa0eaf961",
            "0xffffffffa0eb631c",
            "0xffffffffa0eb6b1d",
            "0xffffffffa0df976b",
            "0xffffffffa0e843f0",
            "0xffffffffa0e863b6",
            "0xffffffffa0e5c11d",
            "0xffffffff813a0ea8",
            "0xffffffff81cb7148",
            "0xffffffff81e0009b"
        ]
    },

I did manually add tracepoint/kfunc probes for some interrupt handlers, like this:

BPF_PERCPU_ARRAY(irq_flags, u64, 1);

// Mark the start of hard-IRQ handling.
// Note: a BPF_PERCPU_ARRAY already keeps one copy of each element per CPU,
// so it is indexed by slot (0 here), not by CPU id.
TRACEPOINT_PROBE(irq, irq_handler_entry) {
    u32 key = 0;
    u64 val = 1; // Mark as in-interrupt
    irq_flags.update(&key, &val);
    return 0;
}

// Mark the end of hard-IRQ handling
TRACEPOINT_PROBE(irq, irq_handler_exit) {
    u32 key = 0;
    u64 val = 0; // Mark as not in-interrupt
    irq_flags.update(&key, &val);
    return 0;
}

// Mark the start of softirq handling
TRACEPOINT_PROBE(irq, softirq_entry) {
    u32 key = 0;
    u64 val = 1; // Mark as in-interrupt
    irq_flags.update(&key, &val);
    return 0;
}

// Mark the end of softirq handling
TRACEPOINT_PROBE(irq, softirq_exit) {
    u32 key = 0;
    u64 val = 0; // Mark as not in-interrupt
    irq_flags.update(&key, &val);
    return 0;
}

static void handle_hrtimer_interrupt_enter() {
    u32 key = 0;
    u64 val = 1; // Mark as in-interrupt
    irq_flags.update(&key, &val);
}

// The probe for hrtimer interrupt entry
KFUNC_PROBE(hrtimer_interrupt) {
    handle_hrtimer_interrupt_enter();
    return 0;
}

static void handle_hrtimer_interrupt_exit() {
    u32 key = 0;
    u64 val = 0; // Mark as not in-interrupt
    irq_flags.update(&key, &val);
}

// The probe for hrtimer interrupt exit
KRETFUNC_PROBE(hrtimer_interrupt) {
    handle_hrtimer_interrupt_exit();
    return 0;
}

static void handle_sysvec_apic_timer_interrupt_enter() {
    u32 key = 0;
    u64 val = 1; // Mark as in-interrupt
    irq_flags.update(&key, &val);
}

// The probe for APIC timer interrupt entry
KFUNC_PROBE(__sysvec_apic_timer_interrupt) {
    handle_sysvec_apic_timer_interrupt_enter();
    return 0;
}

static void handle_sysvec_apic_timer_interrupt_exit() {
    u32 key = 0;
    u64 val = 0; // Mark as not in-interrupt
    irq_flags.update(&key, &val);
}

// The probe for APIC timer interrupt exit
KRETFUNC_PROBE(__sysvec_apic_timer_interrupt) {
    handle_sysvec_apic_timer_interrupt_exit();
    return 0;
}

static int lock__NAME_(struct pt_regs *ctx, LOCK_TYPE *lock)
{
    u32 current_pid = bpf_get_current_pid_tgid(); // lower 32 bits: thread id

    // Skip the event if this CPU is currently flagged as handling an interrupt
    u32 key = 0;
    u64 *flag = irq_flags.lookup(&key);
    if (flag && *flag == 1) {
        //bpf_trace_printk("interrupt context\\n");
        return 0;  // In interrupt context, skip stack trace capture
    }

....
}

But I do not think it's doing anything. Do you have any other suggestions?

Anjali05 commented 1 month ago

@yonghong-song Can you point me to any useful guide to create a helper function to access context?

4ast commented 1 month ago

with libbpf the following works:

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>

/* Weak ksym: resolves to the kernel's per-CPU pcpu_hot variable when present */
extern const struct pcpu_hot pcpu_hot __ksym __weak;

SEC("syscall")
int pcpu_hot_test(void *ctx)
{
        struct pcpu_hot *p = bpf_this_cpu_ptr(&pcpu_hot);

        bpf_printk("pc %x\n", p->preempt_count);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";

Then add your own on top (these are the kernel's own macros from include/linux/preempt.h):

#define irq_count()     (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
#define in_interrupt()  (irq_count())
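
Putting the pieces together inside the lock probe, here is a hedged sketch. It assumes a recent x86 kernel that actually exposes struct pcpu_hot, copies the mask values from include/linux/preempt.h, and uses _raw_spin_lock plus the names in_task_context/lock_entry purely for illustration (they are not part of klockstat):

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* preempt_count bit masks, copied from include/linux/preempt.h */
#define SOFTIRQ_MASK    0x0000ff00
#define HARDIRQ_MASK    0x000f0000
#define NMI_MASK        0x00f00000

extern const struct pcpu_hot pcpu_hot __ksym __weak;

/* True when none of the NMI/hard-IRQ/softirq bits are set in this CPU's
 * preempt_count, i.e. we are running in plain process (task) context. */
static __always_inline bool in_task_context(void)
{
        struct pcpu_hot *p = bpf_this_cpu_ptr(&pcpu_hot);

        return (p->preempt_count & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK)) == 0;
}

SEC("kprobe/_raw_spin_lock")            /* illustrative attach point only */
int BPF_KPROBE(lock_entry, raw_spinlock_t *lock)
{
        if (!in_task_context())
                return 0;               /* drop events from IRQ/softirq/NMI context */

        /* ... record the lock and its stack trace as klockstat would ... */
        return 0;
}

char LICENSE[] SEC("license") = "GPL";

One caveat: irq_count() as quoted above also counts regions where softirqs are merely disabled via local_bh_disable(), which matches the behavior of the kernel's own in_interrupt().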