vislee / leevis.com

Blog
87 stars 13 forks source link

eBPF 学习 #188

Open vislee opened 2 years ago

vislee commented 2 years ago

概述

BPF是什么?

BPF全称是伯克利包过滤器(Berkeley Packet Filter),最早是伯克利大学发明用于内核实现网络数据包过滤的。 因设计理念新和性能高,发展到现在名称升级为eBPF(extended Berkeley Packet Filter),同时功能也支持的更多,不再仅仅是网络分析,可以基于eBPF开发性能分析、系统追踪、网络优化等。

eBPF 由 执行字节码指令、存储对象 和 辅助函数 组成。

BPF有什么作用?

有时候需要改动Linux内核实现一些功能,首先要考虑 “安全性”,不能允许不可信的代码运行在内核中,其次还要考虑“高性能”和“持续交付”。 我们可以通过开发Linux模块实现,但是需要有一定的Linux内核基础,并且随着Linux版本迭代,开发的Linux模块可能需要改动才能运行,搞不好就把Linux内核搞挂了。而BPF很好的解决了上述问题:

BPF是怎么工作的?

eBPF 程序在事件触发时由内核运行,所以可以被看作是一种函数挂钩或事件驱动的编程形式。 事件可由 kprobes/uprobes、tracepoints、dtrace probes、socket 等产生。 这允许在内核和用户进程的指令中钩住(hook)和检查任何函数的内存、拦截文件操作、检查特定的网络数据包等等。

运行 eBPF 程序的步骤:

eBPF 程序有哪些组件?

eBPF虚拟机

eBPF 是一个 RISC 寄存器机,共有 11 个 64 位寄存器、一个程序计数器和 512 字节的固定大小的栈。其中 10 个寄存器(R0–R9)是通用读写的,1 个(R10)是只读的栈帧指针,程序计数器是隐式的。eBPF 指令采用固定大小的 64 位编码,目前大约有 100 条指令,分为 8 类。eBPF 程序类型决定了哪些内核函数的子集可以被调用。

eBPF 字节码指令数组:

/*
 * One eBPF instruction. Going by the type names, the fields add up to
 * 64 bits (8 + 4 + 4 + 16 + 32), so every instruction has the same size.
 */
struct bpf_insn {
    __u8    code;       /* opcode */
    __u8    dst_reg:4;  /* dest register (4-bit bit-field) */
    __u8    src_reg:4;  /* source register (4-bit bit-field) */
    __s16   off;        /* signed offset */
    __s32   imm;        /* signed immediate constant */
};

msb                                                        lsb
+------------------------+----------------+----+----+--------+
|immediate               |offset          |src |dst |opcode  |
+------------------------+----------------+----+----+--------+

安装

参考: https://github.com/fbs/el7-bpf-specs/blob/master/README.md#repository

安装好以后,输出hello world 测试一下。

$ bpftrace -e 'BEGIN { printf("hello world\n");}'
/bpftrace/include/clang_workarounds.h:14:10: fatal error: 'linux/types.h' file not found

如果有上述报错,可能需要安装 kernel-headers , yum install kernel-headers

在ubuntu22.04安装

sudo apt update
sudo apt install snapd
sudo snap install bcc
$ sudo apt-get install -y bpftrace

$ sudo bpftrace -e 'BEGIN { printf("hello world\n");}'
Attaching 1 probe...
ERROR: Could not resolve symbol: /proc/self/exe:BEGIN_trigger

如果有报“ERROR: Could not resolve symbol: /proc/self/exe:BEGIN_trigger”则需要安装“bpftrace-dbgsym”:

echo "deb http://ddebs.ubuntu.com $(lsb_release -cs) main restricted universe multiverse
deb http://ddebs.ubuntu.com $(lsb_release -cs)-updates main restricted universe multiverse
deb http://ddebs.ubuntu.com $(lsb_release -cs)-proposed main restricted universe multiverse" | \
sudo tee -a /etc/apt/sources.list.d/ddebs.list
sudo apt install ubuntu-dbgsym-keyring
sudo apt update
sudo apt install bpftrace-dbgsym

正确安装后测试结果:

$ bpftrace -e 'BEGIN {printf("hello world!\n");}'
Attaching 1 probe...
hello world!
^C

// 输出当前系统调用跟踪点
bpftrace -l 'tracepoint:syscalls:*'

// 查看对应系统调用的参数
bpftrace -lv 'tracepoint:syscalls:sys_exit_open'

下面是个简单的例子:跟踪系统调用 accept*、connect、bind、socket* 和内核函数 sock_recvmsg、sock_sendmsg,每 3s 输出一次哪些命令调用了这些系统调用和内核函数,以及调用的次数。

$ cat sockstat.sh

#!/usr/bin/env bpftrace
/*
 * sockstat - count socket-related syscalls and kernel calls per command.
 *
 * Every 3 seconds this prints, for each (command name, probe) pair, how
 * many times the probe fired in that window, then resets the counters.
 */

BEGIN
{
    printf("Tracing sock statistics. Output every 3s.\n");
}

// Syscall entry tracepoints (accept*, connect, bind, socket*) plus the
// sock_recvmsg / sock_sendmsg kernel functions. "t:" and "k:" are the
// short forms of "tracepoint:" and "kprobe:".
tracepoint:syscalls:sys_enter_accept*,
t:syscalls:sys_enter_connect,
t:syscalls:sys_enter_bind,
t:syscalls:sys_enter_socket*,
kprobe:sock_recvmsg,
k:sock_sendmsg
{
    // Key by command name and by the probe that fired.
    @[comm, probe] = count();
}

interval:s:3
{
    time();     // timestamp for this window
    print(@);
    clear(@);   // start the next window from zero
}

下面的例子统计调用 connect、accept、accept4 的命令,以及各地址族对应的次数。

$ cat sofamily.sh
#!/usr/bin/env bpftrace
/*
 * sofamily - count connect() and accept()/accept4() calls, keyed by
 * command name and socket address family.
 */

#include <linux/socket.h>

BEGIN
{
    printf("Tracing socket connect/accepts. Ctrl-C to end.\n");
    // Address-family number -> printable name, used inside the map keys.
    @s[AF_UNSPEC] = "AF_UNSPEC";
    @s[AF_UNIX] = "AF_UNIX";
    @s[AF_INET] = "AF_INET";
    @s[AF_INET6] = "AF_INET6";
}

t:syscalls:sys_enter_connect
{
    @conn[comm, @s[args->uservaddr->sa_family]] = count();
}

// On entry only the caller's sockaddr pointer is known; stash it per
// thread so the exit probe can read the peer address.
tracepoint:syscalls:sys_enter_accept*
{
    @sock[tid] = args->upeer_sockaddr;
}

// The predicate skips calls with no stashed pointer, including callers
// that passed a NULL sockaddr.
tracepoint:syscalls:sys_exit_accept*
/@sock[tid]/
{
    // accept() returns a non-negative descriptor on success, and 0 is a
    // valid descriptor, so only negative return values are failures.
    if (args->ret >= 0) {
        $sa = (struct sockaddr *)@sock[tid];
        @accept[comm, @s[$sa->sa_family]] = count();
    }
    delete(@sock[tid]);
}

END
{
    // Empty the helper maps before the script exits.
    clear(@sock);
    clear(@s);
}

// 跟踪connect,本机主动连接 // tcpconnect.sh

#!/usr/bin/env bpftrace

// struct sock definition:
// https://elixir.bootlin.com/linux/latest/source/include/net/sock.h#L352
#include <net/sock.h>
#include <linux/socket.h>

BEGIN
{
    printf("Tracing connect ... Hit Ctrl-C to end\n");
    printf("%-8s %-8s %-16s ", "TIME", "PID", "COMM");
    printf("%-39s %-6s %-39s %-6s\n", "SADDR", "SPORT", "DADDR", "DPORT");
}

// tcp_connect kernel function:
// https://elixir.bootlin.com/linux/latest/source/include/net/tcp.h#L461
kprobe:tcp_connect
{
    $sk = (struct sock *)arg0; // first argument of tcp_connect
    $fa = $sk->__sk_common.skc_family;

    // Only report IPv4/IPv6 sockets; other families have no address to
    // print, and the address variables would otherwise be used unset.
    if ($fa == AF_INET || $fa == AF_INET6) {
        // Initialize variable type so both branches assign the same type.
        $daddr = ntop(0);
        $saddr = ntop(0);
        if ($fa == AF_INET) {
            $daddr = ntop($sk->__sk_common.skc_daddr);
            $saddr = ntop($sk->__sk_common.skc_rcv_saddr);
        } else {
            $daddr = ntop($sk->__sk_common.skc_v6_daddr.in6_u.u6_addr8);
            $saddr = ntop($sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr8);
        }
        $lport = $sk->__sk_common.skc_num;
        $dport = $sk->__sk_common.skc_dport;

        // skc_dport is in network byte order; swap its two bytes to get
        // the host-order value (little-endian host).
        $dport = ($dport >> 8) | (($dport << 8) & 0x00FF00);

        time("%H:%M:%S ");
        printf("%-8d %-16s ", pid, comm);
        printf("%-39s %-6d %-39s %-6d\n", $saddr, $lport, $daddr, $dport);
    }
}

跟踪syn重传

./tcp_retransmit_kprobe.bt
#!/usr/bin/env bpftrace
/*
 * Report TCP retransmits from two probes: a kprobe on
 * tcp_retransmit_skb() and the tcp:tcp_retransmit_skb tracepoint.
 * Each probe prints its own line per event.
 */

#include <linux/socket.h>
#include <net/sock.h>

BEGIN
{
    printf("Tracing tcp retransmits. Hit Ctrl-C to end.\n");
    printf("%-8s %-8s %-34s %20s %21s %6s\n",
        "TIME", "PID", "PROBE", "LADDR:LPORT", "RADDR:RPORT", "STATE");

    // Numeric TCP state -> name; see include/net/tcp_states.h.
    @tcp_states[1] = "ESTABLISHED";
    @tcp_states[2] = "SYN_SENT";
    @tcp_states[3] = "SYN_RECV";
    @tcp_states[4] = "FIN_WAIT1";
    @tcp_states[5] = "FIN_WAIT2";
    @tcp_states[6] = "TIME_WAIT";
    @tcp_states[7] = "CLOSE";
    @tcp_states[8] = "CLOSE_WAIT";
    @tcp_states[9] = "LAST_ACK";
    @tcp_states[10] = "LISTEN";
    @tcp_states[11] = "CLOSING";
    @tcp_states[12] = "NEW_SYN_RECV";
}

kprobe:tcp_retransmit_skb
{
    $sock = (struct sock *)arg0;
    $af = $sock->__sk_common.skc_family;

    // Only IPv4 and IPv6 sockets are reported.
    if ($af == AF_INET || $af == AF_INET6) {
        // Assign placeholders first so both variables have a fixed type.
        $raddr = ntop(0);
        $laddr = ntop(0);
        if ($af == AF_INET) {
            $raddr = ntop($sock->__sk_common.skc_daddr);
            $laddr = ntop($sock->__sk_common.skc_rcv_saddr);
        } else {
            $raddr = ntop($sock->__sk_common.skc_v6_daddr.in6_u.u6_addr8);
            $laddr = ntop($sock->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr8);
        }

        $lport = $sock->__sk_common.skc_num;
        $rport = $sock->__sk_common.skc_dport;

        // Destination port is big endian, it must be flipped
        $rport = ($rport >> 8) | (($rport << 8) & 0x00FF00);

        $state = $sock->__sk_common.skc_state;
        $state_name = @tcp_states[$state];

        time("%H:%M:%S ");
        printf("%-8d %-34s %14s:%-6d %14s:%-6d %6s\n",
            pid, probe, $laddr, $lport, $raddr, $rport, $state_name);
    }
}

// The tracepoint exposes addresses, ports and state directly in args.
tracepoint:tcp:tcp_retransmit_skb
{
    $state_name = @tcp_states[args->state];
    time("%H:%M:%S ");
    printf("%-8d %-34s %14s:%-6d %14s:%-6d %6s\n\n",
        pid, probe, ntop(args->saddr), args->sport,
        ntop(args->daddr), args->dport, $state_name);
}

END
{
    // Empty the lookup table before the script exits.
    clear(@tcp_states);
}

基础

BPF 程序类型

// include/uapi/linux/bpf.h

/* Note that tracing related programs such as
 * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT}
 * are not subject to a stable API since kernel internal data
 * structures can change from release to release and may
 * therefore break existing tracing BPF programs. Tracing BPF
 * programs correspond to /a/ specific kernel which is to be
 * analyzed, and not /a/ specific kernel /and/ all future ones.
 */
/*
 * BPF program types. No enumerator has an explicit value, so each one
 * is numbered by its position, starting from 0 at BPF_PROG_TYPE_UNSPEC.
 * Inserting or reordering entries would renumber everything after the
 * change.
 */
enum bpf_prog_type {
    BPF_PROG_TYPE_UNSPEC,
    BPF_PROG_TYPE_SOCKET_FILTER,
    BPF_PROG_TYPE_KPROBE,
    BPF_PROG_TYPE_SCHED_CLS,
    BPF_PROG_TYPE_SCHED_ACT,
    BPF_PROG_TYPE_TRACEPOINT,
    BPF_PROG_TYPE_XDP,
    BPF_PROG_TYPE_PERF_EVENT,
    BPF_PROG_TYPE_CGROUP_SKB,
    BPF_PROG_TYPE_CGROUP_SOCK,
    BPF_PROG_TYPE_LWT_IN,
    BPF_PROG_TYPE_LWT_OUT,
    BPF_PROG_TYPE_LWT_XMIT,
    BPF_PROG_TYPE_SOCK_OPS,
    BPF_PROG_TYPE_SK_SKB,
    BPF_PROG_TYPE_CGROUP_DEVICE,
    BPF_PROG_TYPE_SK_MSG,
    BPF_PROG_TYPE_RAW_TRACEPOINT,
    BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
    BPF_PROG_TYPE_LWT_SEG6LOCAL,
    BPF_PROG_TYPE_LIRC_MODE2,
    BPF_PROG_TYPE_SK_REUSEPORT,
    BPF_PROG_TYPE_FLOW_DISSECTOR,
    BPF_PROG_TYPE_CGROUP_SYSCTL,
    BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
    BPF_PROG_TYPE_CGROUP_SOCKOPT,
    BPF_PROG_TYPE_TRACING,
    BPF_PROG_TYPE_STRUCT_OPS,
    BPF_PROG_TYPE_EXT,
    BPF_PROG_TYPE_LSM,
    BPF_PROG_TYPE_SK_LOOKUP,
    BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
};

// linux/include/linux/bpf.h

/*
 * For one program type, declares the two extern symbols
 * <_name>_prog_ops and <_name>_verifier_ops, built by token pasting.
 * In this definition _id, prog_ctx_type and kern_ctx_type are accepted
 * but do not appear in the expansion.
 */
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
    extern const struct bpf_prog_ops _name ## _prog_ops; \
    extern const struct bpf_verifier_ops _name ## _verifier_ops;

BPF attach 类型

通过 bpf() 系统调用(例如 BPF_PROG_ATTACH 命令)将 BPF 程序 attach 到 hook 点时用到:

/*
 * BPF attach types. No enumerator has an explicit value, so each one is
 * numbered by its position, starting from 0 at BPF_CGROUP_INET_INGRESS.
 * __MAX_BPF_ATTACH_TYPE is last and therefore equals the number of
 * attach types listed before it.
 */
enum bpf_attach_type {
    BPF_CGROUP_INET_INGRESS,
    BPF_CGROUP_INET_EGRESS,
    BPF_CGROUP_INET_SOCK_CREATE,
    BPF_CGROUP_SOCK_OPS,
    BPF_SK_SKB_STREAM_PARSER,
    BPF_SK_SKB_STREAM_VERDICT,
    BPF_CGROUP_DEVICE,
    BPF_SK_MSG_VERDICT,
    BPF_CGROUP_INET4_BIND,
    BPF_CGROUP_INET6_BIND,
    BPF_CGROUP_INET4_CONNECT,
    BPF_CGROUP_INET6_CONNECT,
    BPF_CGROUP_INET4_POST_BIND,
    BPF_CGROUP_INET6_POST_BIND,
    BPF_CGROUP_UDP4_SENDMSG,
    BPF_CGROUP_UDP6_SENDMSG,
    BPF_LIRC_MODE2,
    BPF_FLOW_DISSECTOR,
    BPF_CGROUP_SYSCTL,
    BPF_CGROUP_UDP4_RECVMSG,
    BPF_CGROUP_UDP6_RECVMSG,
    BPF_CGROUP_GETSOCKOPT,
    BPF_CGROUP_SETSOCKOPT,
    BPF_TRACE_RAW_TP,
    BPF_TRACE_FENTRY,
    BPF_TRACE_FEXIT,
    BPF_MODIFY_RETURN,
    BPF_LSM_MAC,
    BPF_TRACE_ITER,
    BPF_CGROUP_INET4_GETPEERNAME,
    BPF_CGROUP_INET6_GETPEERNAME,
    BPF_CGROUP_INET4_GETSOCKNAME,
    BPF_CGROUP_INET6_GETSOCKNAME,
    BPF_XDP_DEVMAP,
    BPF_CGROUP_INET_SOCK_RELEASE,
    BPF_XDP_CPUMAP,
    BPF_SK_LOOKUP,
    BPF_XDP,
    __MAX_BPF_ATTACH_TYPE
};

Socket 相关类型

  1. BPF_PROG_TYPE_SOCKET_FILTER

    • 使用场景:
    • 流量过滤/复制(只读,相当于抓包)
    • 可观测性:流量统计
    • Hook 位置:sock_queue_rcv_skb() 在 sock_queue_rcv_skb() 中触发执行:
      
      /*
       * Pass skb through sk_filter() first. A non-zero result is returned
       * to the caller as-is and the skb is not queued; otherwise the skb
       * is handed to __sock_queue_rcv_skb().
       */
      int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
      {
          /* Per the surrounding notes, this is the socket-filter hook point. */
          int err = sk_filter(sk, skb);

          if (err)
              return err;

          return __sock_queue_rcv_skb(sk, skb);
      }

    
    + 加载方式:setsockopt()
    通过 setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) 系统调用加载,其中 sockfd 是目标 socket 的文件描述符,prog_fd 是 BPF 程序的文件描述符。

XDP(eXpress Data Path)程序

  1. BPF_PROG_TYPE_XDP 场景: ddos防御、4层负载均衡

BPF Map

// linux/include/linux/bpf.h

/*
 * For one map type, declares the extern symbol named by _ops as a
 * const struct bpf_map_ops. In this definition _id is accepted but does
 * not appear in the expansion.
 */
#define BPF_MAP_TYPE(_id, _ops) \
    extern const struct bpf_map_ops _ops;

linux/include/linux/bpf_types.h

参考

https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md
https://arthurchiao.art/articles-zh/
https://github.com/iovisor/bpf-docs/blob/master/eBPF.md