iovisor / bcc

BCC - Tools for BPF-based Linux IO analysis, networking, monitoring, and more
Apache License 2.0

CPU profiling RFC #617

Closed · brendangregg closed this 8 years ago

brendangregg commented 8 years ago

Here's a prototype CPU profiler:

# ./profile.py -s 11
Sampling at 49 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end.
^C
    ffffffff814ebf62 read_iter_zero
    ffffffff8120b7cd __vfs_read
    ffffffff8120bcb5 vfs_read
    ffffffff8120d0b6 sys_read
    ffffffff817cb636 entry_SYSCALL_64_fastpath
    --
    00007f317a59b9b0 read
    -                dd (6773)
        1

    --
    000000000040053e func_a
    0000000000400598 main
    00007f52e9455830 __libc_start_main
    083e258d4c544155 [unknown]
    -                func_ab (16829)
        1
[...]
    ffffffff8103663e default_idle
    ffffffff81036dbf arch_cpu_idle
    ffffffff810bb8ea default_idle_call
    ffffffff810bbb97 cpu_startup_entry
    ffffffff8104df85 start_secondary
    --
    -                swapper/2 (0)
        57

    ffffffff8103663e default_idle
    ffffffff81036dbf arch_cpu_idle
    ffffffff810bb8ea default_idle_call
    ffffffff810bbb97 cpu_startup_entry
    ffffffff8104df85 start_secondary
    --
    -                swapper/3 (0)
        57

Arguments taken from offcputime:

# ./profile.py -h
usage: profile.py [-h] [-p PID | -u | -k] [-U | -K] [-F FREQUENCY] [-d] [-f]
                  [--stack-storage-size STACK_STORAGE_SIZE] [-s KERNEL_SKIP]
                  [duration]

Profile CPU stack traces at a timed interval

positional arguments:
  duration              duration of trace, in seconds

optional arguments:
  -h, --help            show this help message and exit
  -p PID, --pid PID     profile this PID only
  -u, --user-threads-only
                        user threads only (no kernel threads)
  -k, --kernel-threads-only
                        kernel threads only (no user threads)
  -U, --user-stacks-only
                        show stacks from user space only (no kernel space
                        stacks)
  -K, --kernel-stacks-only
                        show stacks from kernel space only (no user space
                        stacks)
  -F FREQUENCY, --frequency FREQUENCY
                        sample frequency, Hertz (default 49)
  -d, --delimited       insert delimiter between kernel/user stacks
  -f, --folded          output folded format
  --stack-storage-size STACK_STORAGE_SIZE
                        the number of unique stack traces that can be stored
                        and displayed (default 2048)
  -s KERNEL_SKIP, --kernel-skip KERNEL_SKIP
                        skip this many kernel frames (default 3)

examples:
    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
    ./profile -F 99       # profile stack traces at 99 Hertz
    ./profile 5           # profile at 49 Hertz for 5 seconds only
    ./profile -f 5        # 5 seconds, and output in folded format
    ./profile -p 185      # only profile threads for PID 185
    ./profile -u          # only profile user threads (no kernel)
    ./profile -k          # only profile kernel threads (no user)
    ./profile -U          # only show user space stacks (no kernel)
    ./profile -K          # only show kernel space stacks (no user)
    ./profile -s 11       # skip 11 frames of kernel stack (default 3)

It works by running perf to initialize a software timer, and then using BPF to instrument the events.
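For reference, here's a minimal sketch of that arming step, with illustrative values (essentially the same perf invocation the script builds further below):

# arm a timed cpu-clock software event on all CPUs, sending perf's own
# sample records to /dev/null; the attached BPF program frequency-counts
# stack traces in kernel context while this runs
import os
os.popen("perf record -F 49 -e cpu-clock -a -N -o /dev/null -- "
         "sleep 5 2>&1").read()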

If a perf:perf_hrtimer tracepoint exists, it uses that; otherwise it falls back to a kprobe of perf_misc_flags(). Here's the code for that tracepoint:

commit 61441110c7b75d7499a7662c11dd2f92aa665524
Author: Brendan Gregg <bgregg@netflix.com>
Date:   Sat Jul 16 01:07:50 2016 +0000

    Add a tracepoint for perf sampling

    When perf is performing hrtimer-based sampling, this tracepoint can be used by
    BPF to run additional logic on each sample. For example, BPF can fetch stack
    traces and frequency count them in kernel context, for an efficient profiler.

    Signed-off-by: Brendan Gregg <bgregg@netflix.com>

diff --git a/include/trace/events/perf.h b/include/trace/events/perf.h
new file mode 100644
index 0000000..a5eb0ed
--- /dev/null
+++ b/include/trace/events/perf.h
@@ -0,0 +1,27 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM perf
+
+#if !defined(_TRACE_PERF_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PERF_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(perf_hrtimer,
+       TP_PROTO(struct pt_regs *regs),
+
+       TP_ARGS(regs),
+
+       TP_STRUCT__entry(
+        __field(struct pt_regs *, regs)
+       ),
+
+       TP_fast_assign(
+        __entry->regs = regs;
+       ),
+
+       TP_printk("regs=%p", __entry->regs)
+);
+#endif /* _TRACE_PERF_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 79dae18..13b0c45 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,6 +51,9 @@

 #include <asm/irq_regs.h>

+#define CREATE_TRACE_POINTS
+#include <trace/events/perf.h>
+
 typedef int (*remote_function_f)(void *);

 struct remote_function_call {
@@ -8036,6 +8039,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        perf_sample_data_init(&data, 0, event->hw.last_period);
        regs = get_irq_regs();

+    trace_perf_hrtimer(regs);
+
        if (regs && !perf_exclude_event(event, regs)) {
                if (!(event->attr.exclude_idle && is_idle_task(current)))
                        if (__perf_event_overflow(event, 1, &data, regs))

Ideally we'd have BPF sampling support, but I think this is a reasonable and useful workaround in the meantime. @ast, what do you think?

It could be made a little more efficient if perf had an option to not emit sample records but still fire its timers. perf does have --no-samples, but that avoids setting up the timer as well.

Here's the Python program for the profiler:

#!/usr/bin/python
#
# profile  Profile CPU usage by sampling stack traces at a timed interval.
#          For Linux, uses BCC, BPF. Embedded C.
#
# This current version (July 2016) employs a workaround involving executing
# a perf command to initiate the sampling, and then using BPF to summarize
# the samples in kernel context. A future version should be able to do this
# all directly, without invoking perf.
#
# NOTE: This current implementation likely includes between 1 and 15 extra
# kernel frames on the top of the kernel stacks that should be skipped. The
# actual number will depend on your system. Eg:
#
#   ffffffff81174e78 perf_swevent_hrtimer
#   ffffffff810e6984 __hrtimer_run_queues
#   ffffffff810e70f8 hrtimer_interrupt
#   ffffffff81022c69 xen_timer_interrupt
#   ffffffff810d2942 handle_irq_event_percpu
#   ffffffff810d62da handle_percpu_irq
#   ffffffff810d1f52 generic_handle_irq
#   ffffffff814a5137 evtchn_2l_handle_events
#   ffffffff814a2853 __xen_evtchn_do_upcall
#   ffffffff814a4740 xen_evtchn_do_upcall
#   ffffffff817cd50c xen_hvm_callback_vector
#   ffffffff8103663e default_idle
#   ffffffff81036dbf arch_cpu_idle
#   ffffffff810bb8ea default_idle_call
#   ffffffff810bbb97 cpu_startup_entry
#   ffffffff8104df85 start_secondary
#
# Use the -s option to skip the appropriate number of frames for your system.
# It defaults to 3. Eg, the above sample stack is really the idle thread, and
# so -s 11 can be used to skip the top 11 lines.
#
# REQUIRES: Linux 4.7+ (BPF_PROG_TYPE_TRACEPOINT support).
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# THANKS: Andrew Birchall and Evgeny Vereshchagin, who wrote much of the code
# here, borrowed from offcputime.py.
#
# 15-Jul-2016   Brendan Gregg   Created this.

from __future__ import print_function
from bcc import BPF
from sys import stderr
from time import sleep
import argparse
import signal
import os

# arg validation
def positive_int(val):
    try:
        ival = int(val)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")

    if ival < 0:
        raise argparse.ArgumentTypeError("must be positive")
    return ival

def positive_nonzero_int(val):
    ival = positive_int(val)
    if ival == 0:
        raise argparse.ArgumentTypeError("must be nonzero")
    return ival

# arguments
examples = """examples:
    ./profile             # profile stack traces at 49 Hertz until Ctrl-C
    ./profile -F 99       # profile stack traces at 99 Hertz
    ./profile 5           # profile at 49 Hertz for 5 seconds only
    ./profile -f 5        # 5 seconds, and output in folded format
    ./profile -p 185      # only profile threads for PID 185
    ./profile -u          # only profile user threads (no kernel)
    ./profile -k          # only profile kernel threads (no user)
    ./profile -U          # only show user space stacks (no kernel)
    ./profile -K          # only show kernel space stacks (no user)
    ./profile -s 11       # skip 11 frames of kernel stack (default 3)
"""
parser = argparse.ArgumentParser(
    description="Profile CPU stack traces at a timed interval",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", type=positive_int,
    help="profile this PID only")
thread_group.add_argument("-u", "--user-threads-only", action="store_true",
    help="user threads only (no kernel threads)")
thread_group.add_argument("-k", "--kernel-threads-only", action="store_true",
    help="kernel threads only (no user threads)")
stack_group = parser.add_mutually_exclusive_group()
stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
    help="show stacks from user space only (no kernel space stacks)")
stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
    help="show stacks from kernel space only (no user space stacks)")
parser.add_argument("-F", "--frequency", type=positive_int, default=49,
    help="sample frequency, Hertz (default 49)")
parser.add_argument("-d", "--delimited", action="store_true",
    help="insert delimiter between kernel/user stacks")
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format")
parser.add_argument("--stack-storage-size", default=2048,
    type=positive_nonzero_int,
    help="the number of unique stack traces that can be stored and " \
        "displayed (default 2048)")
parser.add_argument("-s", "--kernel-skip", type=positive_int, default=3,
    help="skip this many kernel frames (default 3)")
parser.add_argument("duration", nargs="?", default=99999999,
    type=positive_nonzero_int,
    help="duration of trace, in seconds")
args = parser.parse_args()
folded = args.folded
frequency = args.frequency
skip = args.kernel_skip
duration = int(args.duration)
debug = 0

# option logic
if args.kernel_threads_only and args.user_stacks_only:
    print("ERROR: Displaying user stacks for kernel threads " \
        "doesn't make sense.", file=stderr)
    exit(1)
need_delimiter = args.delimited and not (args.kernel_stacks_only or args.user_stacks_only)

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

struct key_t {
    u32 pid;
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

PERF_TRACE_EVENT {
    u32 pid = bpf_get_current_pid_tgid();
    if (!(THREAD_FILTER))
        return 0;

    // create map key
    u64 zero = 0, *val;
    struct key_t key = {};

    key.pid = pid;
    key.user_stack_id = USER_STACK_GET;
    key.kernel_stack_id = KERNEL_STACK_GET;
    bpf_get_current_comm(&key.name, sizeof(key.name));

    val = counts.lookup_or_init(&key, &zero);
    (*val)++;
    return 0;
}
"""

# set thread filter
perf_filter = "-a"
thread_context = ""
if args.pid is not None:
    perf_filter = '-p %s' % args.pid
    thread_context = "PID %s" % args.pid
    thread_filter = 'pid == %s' % args.pid
elif args.user_threads_only:
    thread_context = "user threads"
    # unlike offcputime there is no "prev" task in this context, so test the
    # current task's flags (bpf_get_current_task() requires Linux 4.8+)
    thread_filter = \
        '!(((struct task_struct *)bpf_get_current_task())->flags & PF_KTHREAD)'
elif args.kernel_threads_only:
    thread_context = "kernel threads"
    thread_filter = \
        '((struct task_struct *)bpf_get_current_task())->flags & PF_KTHREAD'
else:
    thread_context = "all threads"
    thread_filter = '1'
bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)

# set stack storage size
bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))

# handle stack args
kernel_stack_get = "stack_traces.get_stackid(args, %d | BPF_F_REUSE_STACKID)" % \
    skip
user_stack_get = \
    "stack_traces.get_stackid(args, BPF_F_REUSE_STACKID | BPF_F_USER_STACK)"
stack_context = ""
if args.user_stacks_only:
    stack_context = "user"
    kernel_stack_get = "-1"
elif args.kernel_stacks_only:
    stack_context = "kernel"
    user_stack_get = "-1"
else:
    stack_context = "user + kernel"
bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)

# use perf tracepoint if it exists, else kprobe
if os.path.exists("/sys/kernel/debug/tracing/events/perf/perf_hrtimer"):
    bpf_text = bpf_text.replace('PERF_TRACE_EVENT',
        'TRACEPOINT_PROBE(perf, perf_hrtimer)')
else:
    bpf_text = bpf_text.replace('PERF_TRACE_EVENT',
        'int kprobe__perf_misc_flags(struct pt_regs *args)')
if debug:
    print(bpf_text)

# initialize BPF
b = BPF(text=bpf_text)

# header
if not folded:
    print("Sampling at %d Hertz of %s by %s stack" %
        (frequency, thread_context, stack_context), end="")
    if duration < 99999999:
        print(" for %d secs." % duration)
    else:
        print("... Hit Ctrl-C to end.")

# signal handler
def signal_ignore(signal, frame):
    print()

try:
    # force timer-based sampling by using -F with -e cpu-clock
    perfcmd = "perf record -F %d -e cpu-clock -N %s -o /dev/null -- " \
        "sleep %d 2>&1" % (frequency, perf_filter, duration)
    if debug:
        print(perfcmd)
    perfout = os.popen(perfcmd).read()
    if perfout.find("Captured and wrote") < 0:
        print("ERROR: running the following command:\n    %s\n" \
            "Try running it and fixing the problem (eg, $PATH). Exiting." \
            % perfcmd, file=stderr)
        exit(1)
except KeyboardInterrupt:
    # as cleanup can take some time, trap Ctrl-C:
    signal.signal(signal.SIGINT, signal_ignore)

if not folded:
    print()

missing_stacks = 0
has_enomem = False
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
for k, v in sorted(counts.items(), key=lambda kv: kv[1].value):
    # handle get_stackid errors
    # (a negative stack ID is a -errno: -14 is -EFAULT, common when a stack
    # couldn't be walked, and -12 is -ENOMEM, meaning the stack map is full)
    if (not args.user_stacks_only and k.kernel_stack_id < 0 and \
            k.kernel_stack_id != -14) or \
            (not args.kernel_stacks_only and k.user_stack_id < 0 and \
            k.user_stack_id != -14):
        missing_stacks += 1
        # check for an ENOMEM error
        if k.kernel_stack_id == -12 or k.user_stack_id == -12:
            has_enomem = True

    user_stack = [] if k.user_stack_id < 0 else \
        stack_traces.walk(k.user_stack_id)
    kernel_stack = [] if k.kernel_stack_id < 0 else \
        stack_traces.walk(k.kernel_stack_id)

    if folded:
        # print folded stack output
        user_stack = list(user_stack)
        kernel_stack = list(kernel_stack)
        line = [k.name.decode()] + \
            [b.sym(addr, k.pid) for addr in reversed(user_stack)] + \
            (need_delimiter and ["-"] or []) + \
            [b.ksym(addr) for addr in reversed(kernel_stack)]
        print("%s %d" % (";".join(line), v.value))
    else:
        # print default multi-line stack output
        for addr in kernel_stack:
            print("    %016x %s" % (addr, b.ksym(addr)))
        if need_delimiter:
            print("    --")
        for addr in user_stack:
            print("    %016x %s" % (addr, b.sym(addr, k.pid)))
        print("    %-16s %s (%d)" % ("-", k.name, k.pid))
        print("        %d\n" % v.value)

if missing_stacks > 0:
    enomem_str = "" if not has_enomem else \
        " Consider increasing --stack-storage-size."
    print("WARNING: %d stack traces could not be displayed.%s" %
        (missing_stacks, enomem_str),
        file=stderr)

4ast commented 8 years ago

On Fri, Jul 15, 2016 at 6:33 PM, Brendan Gregg notifications@github.com wrote:

> Ideally we'd have BPF sampling support, but I think this is a reasonable and
> useful workaround in the meantime. @ast, what do you think?

makes sense to me. Please run 'checkpatch' on the patch before submitting to make sure there are no spaces instead of tabs. Probably cc Arnaldo and Wang. I think they will be interested in it as well.

> Use the -s option to skip the appropriate number of frames for your system.
> It defaults to 3. Eg, the above sample stack is really the idle thread, and
> so -s 11 can be used to skip the top 11 lines.

that kinda sux. how does perf deal with these extra frames? or doesn't it see them?

> # use perf tracepoint if it exists, else kprobe
> if os.path.exists("/sys/kernel/debug/tracing/events/perf/perf_hrtimer"):
>     bpf_text = bpf_text.replace('PERF_TRACE_EVENT',
>         'TRACEPOINT_PROBE(perf, perf_hrtimer)')
> else:
>     bpf_text = bpf_text.replace('PERF_TRACE_EVENT',
>         'int kprobe__perf_misc_flags(struct pt_regs *args)')

do you see the same extra frames with both the kprobe and tracepoint approaches? The kprobe will be firing a lot more often, including during bpf_perf_event_output; our recursion prevention check will do the right thing, but it's unnecessary overhead, so the tracepoint is definitely cleaner.

brendangregg commented 8 years ago

Thanks, I'll run checkpatch (I'm reading Documentation/SubmittingPatches). It's easy to mess up tabs when switching between Python and C.

I'll check how perf deals with it. So it's a different number of frames between the tracepoint (11) and kprobe (14), but I think this will also vary based on the system, whether it's a Xen guest (see that sample stack) or not, etc.

brendangregg commented 8 years ago

The code in check_stack() seems to scan the stack trace for an ip match:

        /* Skip over the overhead of the stack tracer itself */
        for (i = 0; i < stack_trace_max.nr_entries; i++) {
                if (stack_dump_trace[i] == ip)
                        break;
        }

However, that doesn't work. I fetched the ip using the regs argument of my perf:perf_hrtimer tracepoint, and printed it after the kernel stack. Some code, and example output:

    struct pt_regs regs = {};
    bpf_probe_read(&regs, sizeof(regs), args->regs);
    key.ip = PT_REGS_IP(&regs);

    ffffffff81175094 perf_swevent_hrtimer
    ffffffff810e6bc4 __hrtimer_run_queues
    ffffffff810e7338 hrtimer_interrupt
    ffffffff81022f29 xen_timer_interrupt
    ffffffff810d2b82 handle_irq_event_percpu
    ffffffff810d651a handle_percpu_irq
    ffffffff810d2192 generic_handle_irq
    ffffffff814a5797 evtchn_2l_handle_events
    ffffffff814a2eb3 __xen_evtchn_do_upcall
    ffffffff814a4da0 xen_evtchn_do_upcall
    ffffffff817cdf0c xen_hvm_callback_vector
    ffffffff8136d5f8 apparmor_file_permission
    ffffffff81330fdd security_file_permission
    ffffffff8120c25e rw_verify_area
    ffffffff8120c477 vfs_write
    ffffffff8120d7f6 sys_write
    ffffffff817cc036 entry_SYSCALL_64_fastpath

IP: ffffffff8136d4c6 common_file_perm

ip is common_file_perm(), but that isn't in the stack trace. Its parent, apparmor_file_permission(), is. I see the same pattern in many other examples.

It looks like the interrupt isn't pushing a stack frame. So how does perf even work by scanning for the ip in the stack, if it's not there? Maybe check_stack() is the wrong code to look at, or it has fixed up the stack before that point (using saved regs and a saved stack depth?).

If I instead read the return address from the stack (RBP+8), I do find it in the stack trace:

    struct pt_regs regs = {};
    bpf_probe_read(&regs, sizeof(regs), args->regs);
    // with frame pointers, the return address lives at rbp + 8
    bpf_probe_read(&key.ip, sizeof(key.ip), (void *)(regs.bp + 8));

    ffffffff81175094 perf_swevent_hrtimer
    ffffffff810e6bc4 __hrtimer_run_queues
    ffffffff810e7338 hrtimer_interrupt
    ffffffff81022f29 xen_timer_interrupt
    ffffffff810d2b82 handle_irq_event_percpu
    ffffffff810d651a handle_percpu_irq
    ffffffff810d2192 generic_handle_irq
    ffffffff814a5797 evtchn_2l_handle_events
    ffffffff814a2eb3 __xen_evtchn_do_upcall
    ffffffff814a4da0 xen_evtchn_do_upcall
    ffffffff817cdf0c xen_hvm_callback_vector
    ffffffff8136d5f8 apparmor_file_permission
    ffffffff81330fdd security_file_permission
    ffffffff8120c25e rw_verify_area
    ffffffff8120c477 vfs_write
    ffffffff8120d7f6 sys_write
    ffffffff817cc036 entry_SYSCALL_64_fastpath

IP: ffffffff8136d5f8 apparmor_file_permission

Ok, so that's in the stack, and this seems reliable. I suppose I should enhance it to also put both ip and rbp+8 in the map key, the former as the first frame, and the latter for skipping interrupt framework frames.
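As a rough sketch, that enhancement might look like this (field names are hypothetical, not final code):

# hypothetical sketch: widen the map key so the output can print the sampled
# ip as the top frame, and use the rbp+8 return address to find where the
# interrupt framework frames end
bpf_key_text = """
struct key_t {
    u32 pid;
    u64 ip;          // PT_REGS_IP(&regs): the interrupted instruction
    u64 ret_addr;    // *(u64 *)(regs.bp + 8): first real stack frame
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
"""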

So there'd be at least two ways forward: keep having the user skip the extra frames with -s, or record the ip and the rbp+8 return address in the map key and use them to skip the interrupt frames automatically.

Still would like to know why ip isn't in the stack...

goldshtn commented 8 years ago

Would it make sense to enable the perf event by calling perf_event_open instead of invoking perf directly? Could be a bit less brittle. I think I have an example of doing this from Python in tracepoint.py.
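(For illustration, a minimal ctypes sketch of that syscall follows; this is not tracepoint.py, and the constants are transcribed from linux/perf_event.h:)

import ctypes as ct

PERF_TYPE_SOFTWARE = 1
PERF_COUNT_SW_CPU_CLOCK = 0
ATTR_FREQ_BIT = 1 << 10        # 'freq': sample_period is a frequency
SYS_perf_event_open = 298      # x86_64

class perf_event_attr(ct.Structure):
    _fields_ = [
        ("type", ct.c_uint), ("size", ct.c_uint),
        ("config", ct.c_ulonglong),
        ("sample_period", ct.c_ulonglong),   # union with sample_freq
        ("sample_type", ct.c_ulonglong),
        ("read_format", ct.c_ulonglong),
        ("flags", ct.c_ulonglong),           # disabled/freq/... bitfields
        ("wakeup_events", ct.c_uint), ("bp_type", ct.c_uint),
        ("config1", ct.c_ulonglong),         # pads to PERF_ATTR_SIZE_VER0
    ]

attr = perf_event_attr()
attr.type = PERF_TYPE_SOFTWARE
attr.config = PERF_COUNT_SW_CPU_CLOCK
attr.sample_period = 49        # 49 Hertz, since the freq bit is set
attr.flags = ATTR_FREQ_BIT
attr.size = ct.sizeof(attr)    # 64 bytes == PERF_ATTR_SIZE_VER0

libc = ct.CDLL(None, use_errno=True)
# pid=-1, cpu=0: all tasks on CPU 0 (root, or perf_event_paranoid allowing);
# open one such event per online CPU for a system-wide timer
fd = libc.syscall(SYS_perf_event_open, ct.byref(attr), -1, 0, -1, 0)

Being a raw syscall, this has no dependency on the perf front-end at all.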

Also, what is the overhead of running perf in this mode with no output file? Are there still events written to a cyclic buffer and then flushed to /dev/null?

4ast commented 8 years ago

check_stack() and bits in trace_stack.c are used by ftrace infra to look for large stacks. It's gated by CONFIG_STACK_TRACER and can be enabled by /sys/kernel/debug/tracing/current_tracer. It's very specific to ftrace. perf_event logic doesn't call into it.
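(For reference, a sketch of poking that tracer, assuming CONFIG_STACK_TRACER and root; the sysctl knob is the usual way to turn it on:)

# enable the ftrace stack tracer, then read the deepest kernel stack seen
with open("/proc/sys/kernel/stack_tracer_enabled", "w") as f:
    f.write("1")
print(open("/sys/kernel/debug/tracing/stack_max_size").read())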

Also, I completely agree with @goldshtn. Doing a manual perf_event_open is much faster and more stable: perf flags and defaults may change, whereas perf_event_open is a fixed kernel ABI.

brendangregg commented 8 years ago

I've switched to perf_event_open (thanks @goldshtn for the tracepoint.py code), and it should be more efficient now. Since I'm not setting up perf ring buffers on the event file descriptors with mmap, nor reading from them, there's no longer any perf event data being read into user space or written to the file system, as there was with the perf command.

Here's before (perf cmd):

bcc/tools# ./funccount.py 'perf*'
Tracing 54 functions for "perf*"... Hit Ctrl-C to end.
^C
ADDR             FUNC                          COUNT
ffffffff81181870 perf_event_max_stack_handler        2
ffffffff811812f0 perf_mmap_to_page                 8
ffffffff81180790 perf_output_begin                12
ffffffff811801d0 perf_mmap_free_page            1041
ffffffff81180150 perf_mmap_alloc_page           1176
ffffffff811803d0 perf_output_begin_forward      3387
ffffffff81007ab0 perf_misc_flags                3391
ffffffff81007a40 perf_instruction_pointer       3392
ffffffff81180a30 perf_output_end                3397
ffffffff811800e0 perf_output_put_handle         3401
ffffffff81180340 perf_output_copy              20393
Detaching...

Here's after (perf_event_open):

bcc/tools# ./funccount.py 'perf*'
Tracing 54 functions for "perf*"... Hit Ctrl-C to end.
^C
ADDR             FUNC                          COUNT
ffffffff81180790 perf_output_begin                 5
ffffffff811801d0 perf_mmap_free_page               9
ffffffff81180150 perf_mmap_alloc_page            144
ffffffff81007ab0 perf_misc_flags                3587
ffffffff811803d0 perf_output_begin_forward      3587
Detaching...

(You can see why I was using perf_misc_flags as the fallback.)

I think this all works because perf_mmap() hasn't been called, so event->rb is not populated, and we exit early out of __perf_output_begin(). I'm glad that the PERF_EVENT_IOC_ENABLE call is successful, despite no ring buffer setup.
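(Continuing the hypothetical ctypes sketch from earlier, the enable ioctl is just:)

import fcntl
PERF_EVENT_IOC_ENABLE = 0x2400   # _IO('$', 0), from linux/perf_event.h
fcntl.ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)  # fd: the event fd opened earlier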

So this has ended up much more efficient than I was expecting. Great for us.