
An in-depth analysis of cilium #24

Open laik opened 2 years ago

laik commented 2 years ago

editing

laik commented 2 years ago

Cilium's container networking

Looking at cilium through the lens of a traditional network architecture can feel like peering through fog. Why? Because compared with a traditional setup, the interface configuration inside a container looks like this:

Enter the container's namespace from the host namespace
# ip netns exec k8s_POD_nginx-app-7f6fdf9556-gp2vg_default_d6462fb3-1b67-4b76-a058-f509e2e42e29_0 /bin/bash                

Inside the container namespace
# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
22: eth0@if23: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 3e:7c:ad:72:24:6d brd ff:ff:ff:ff:ff:ff
    inet 10.0.0.156/32 scope global eth0
       valid_lft forever preferred_lft forever

# ip route
default via 10.0.0.157 dev eth0
10.0.0.157 dev eth0 scope link

#route -n
Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         10.0.0.157      0.0.0.0         UG    0      0        0 eth0
10.0.0.157      0.0.0.0         255.255.255.255 UH    0      0        0 eth0

The container's eth0 carries 10.0.0.156/32, i.e. a 32-bit netmask, together with a default route that leaves via eth0 towards 10.0.0.157.

Doesn't that look like a point-to-point link? In traditional networking, a /32 netmask is essentially only used in point-to-point configurations, typically direct router-to-router links. Cilium uses exactly this pattern to wire the container's network interface to its own interfaces, so that cilium can reach the container's interface through its own devices.

So how does cilium achieve this?

laik commented 2 years ago

Analysis of cilium's network devices

In the host namespace, list the cilium-related network interfaces with ip a

2: ens160: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast state UP group default qlen 1000
    link/ether 00:0c:29:8e:1a:5f brd ff:ff:ff:ff:ff:ff
    altname enp2s0
    inet 192.168.2.134/24 brd 192.168.2.255 scope global dynamic ens160
       valid_lft 1604sec preferred_lft 1604sec
    inet6 fe80::20c:29ff:fe8e:1a5f/64 scope link
       valid_lft forever preferred_lft forever
6: cilium_net@cilium_host: <BROADCAST,MULTICAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 7a:c7:50:d0:f9:d9 brd ff:ff:ff:ff:ff:ff
    inet6 fe80::78c7:50ff:fed0:f9d9/64 scope link
       valid_lft forever preferred_lft forever
7: cilium_host@cilium_net: <BROADCAST,MULTICAST,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 76:6a:81:31:1a:c7 brd ff:ff:ff:ff:ff:ff
    inet 10.0.0.157/32 scope link cilium_host
       valid_lft forever preferred_lft forever
    inet6 fe80::746a:81ff:fe31:1ac7/64 scope link
       valid_lft forever preferred_lft forever
9: cilium_vxlan: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000
    link/ether 2e:90:71:22:6e:da brd ff:ff:ff:ff:ff:ff
    inet6 fe80::2c90:71ff:fe22:6eda/64 scope link
       valid_lft forever preferred_lft forever
23: lxce22e557aa23f@if22: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 42:01:ab:d9:26:6f brd ff:ff:ff:ff:ff:ff link-netns k8s_POD_nginx-app-7f6fdf9556-gp2vg_default_d6462fb3-1b67-4b76-a058-f509e2e42e29_0
    inet6 fe80::4001:abff:fed9:266f/64 scope link
       valid_lft forever preferred_lft forever

lxce22e557aa23f is the veth-pair peer for the k8s_POD_nginx-app-7f6fdf9556-gp2vg_default_d6462fb3-1b67-4b76-a058-f509e2e42e29_0 namespace we entered above; if veth pairs are unfamiliar, think of one as a network cable.

cilium_host (10.0.0.157/32) is a network device created by cilium, and it is what the default route of k8s_POD_nginx-app-7f6fdf9556-gp2vg_default_d6462fb3-1b67-4b76-a058-f509e2e42e29_0 points at. Notice anything odd yet? The whole structure seems to be floating in mid-air.

Because this /32-netmask style of connection resembles the point-to-point link below (image)

Whereas, seen from the devices on the host, cilium's connectivity looks like this (image)

And here is the diagram of cilium's network devices (image)

laik commented 2 years ago

Tracing packet transmit and receive

How does the container's eth0 (10.0.0.156/32) reach cilium_host (10.0.0.157/32)? It starts at layer 2 with Ethernet frames: when a request is initiated, the container needs the L2 neighbour information, i.e. the ARP table. So let's first dump lxce22e557aa23f and see whether any ARP requests appear there when 10.0.0.156/32 sends traffic.

Inside the container namespace
root@debian:~# ip a
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
22: eth0@if23: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 3e:7c:ad:72:24:6d brd ff:ff:ff:ff:ff:ff link-netns k8s_POD_kube-scheduler-debian_kube-system_813eb49f3a6b8d48aac527b845390dbb_1
    inet 10.0.0.156/32 scope global eth0
       valid_lft forever preferred_lft forever
root@debian:~# arp -a
root@debian:~# curl baidu.com
<html>
<meta http-equiv="refresh" content="0;url=http://www.baidu.com/">
</html>
root@debian:~#
In the host namespace
# tcpdump -i lxce22e557aa23f
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxce22e557aa23f, link-type EN10MB (Ethernet), snapshot length 262144 bytes
09:56:10.787365 ARP, Request who-has 10.0.0.157 tell 10.0.0.156, length 28               ## the ARP request
09:56:10.787374 ARP, Reply 10.0.0.157 is-at 42:01:ab:d9:26:6f (oui Unknown), length 28
09:56:10.787385 IP 10.0.0.156.33618 > 192.168.2.2.domain: 33962+ A? baidu.com. (27)
09:56:10.787594 IP 10.0.0.156.33618 > 192.168.2.2.domain: 50610+ AAAA? baidu.com. (27)
09:56:10.801016 IP 192.168.2.2.domain > 10.0.0.156.33618: 33962 2/0/0 A 220.181.38.148, A 220.181.38.251 (59)
09:56:10.803348 IP 192.168.2.2.domain > 10.0.0.156.33618: 50610 1/0/0 AAAA ::ffff:240.0.0.37 (55)
09:56:10.803877 IP 10.0.0.156.54446 > 220.181.38.148.http: Flags [S], seq 1602328105, win 64240, options [mss 1460,sackOK,TS val 3477992275 ecr 0,nop,wscale 7], length 0
09:56:10.804753 IP 220.181.38.148.http > 10.0.0.156.54446: Flags [S.], seq 332435032, ack 1602328106, win 64240, options [mss 1460], length 0
09:56:10.804829 IP 10.0.0.156.54446 > 220.181.38.148.http: Flags [.], ack 1, win 64240, length 0
09:56:10.804916 IP 10.0.0.156.54446 > 220.181.38.148.http: Flags [P.], seq 1:74, ack 1, win 64240, length 73: HTTP: GET / HTTP/1.1
09:56:10.805126 IP 220.181.38.148.http > 10.0.0.156.54446: Flags [.], ack 74, win 64240, length 0
09:56:10.912624 IP 220.181.38.148.http > 10.0.0.156.54446: Flags [P.], seq 1:387, ack 74, win 64240, length 386: HTTP: HTTP/1.1 200 OK
09:56:10.912677 IP 10.0.0.156.54446 > 220.181.38.148.http: Flags [.], ack 387, win 63854, length 0
09:56:10.912975 IP 10.0.0.156.54446 > 220.181.38.148.http: Flags [F.], seq 74, ack 387, win 63854, length 0
09:56:10.913399 IP 220.181.38.148.http > 10.0.0.156.54446: Flags [.], ack 75, win 64239, length 0

Somewhat surprising, isn't it: the ARP request actually gets answered.

The ARP table inside the container namespace
#  arp -a
? (10.0.0.157) at 42:01:ab:d9:26:6f [ether] on eth0

'42:01:ab:d9:26:6f' is the MAC address of the veth peer lxce22e557aa23f. That makes it even more puzzling: in a traditional network, the ARP reply for the gateway would pair the gateway's IP with the gateway's own MAC address.

At this point it should be clear that cilium should not be reasoned about through a traditional network architecture. These are not bridged devices, so L2 ARP requests and replies between them should normally not work at all, yet when the container sends a request, an ARP reply really does come back.

laik commented 2 years ago

eBPF program analysis

We saw above that lxce22e557aa23f receives the ARP request and produces an ARP reply. How does that work?

In the host namespace, inspect the tc ingress of lxce22e557aa23f
# tc filter show dev lxce22e557aa23f ingress
filter protocol all pref 1 bpf chain 0
filter protocol all pref 1 bpf chain 0 handle 0x1 bpf_lxc.o:[from-container] direct-action not_in_hw id 843 tag 005af7fd5971686b jited

The output shows that a BPF program is attached on the ingress side of this veth peer. Think about it for a moment: why ingress? (image)

Structurally a veth pair behaves like a cable: data sent from the container's eth0 arrives at the lxce22e557aa23f end, and from the kernel stack's point of view that end is ingress. So the host-side end of the "cable" is hooked with an eBPF program, namely bpf_lxc.o:[from-container].

// From bpf/bpf_lxc.c in the cilium project, cilium 1.11.x (20220111)
__section("from-container")
int handle_xgress(struct __ctx_buff *ctx)
{
    __u16 proto;
    int ret;

    bpf_clear_meta(ctx);
    reset_queue_mapping(ctx);

    send_trace_notify(ctx, TRACE_FROM_LXC, SECLABEL, 0, 0, 0, 0,
              TRACE_PAYLOAD_LEN);

    if (!validate_ethertype(ctx, &proto)) {
        ret = DROP_UNSUPPORTED_L2;
        goto out;
    }

    switch (proto) {
#ifdef ENABLE_IPV6
    case bpf_htons(ETH_P_IPV6):
        edt_set_aggregate(ctx, LXC_ID);
        invoke_tailcall_if(__or3(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6),
                     is_defined(DEBUG)),
                   CILIUM_CALL_IPV6_FROM_LXC, tail_handle_ipv6);
        break;
#endif /* ENABLE_IPV6 */
#ifdef ENABLE_IPV4
    case bpf_htons(ETH_P_IP):              ---> tail call for IP traffic
        edt_set_aggregate(ctx, LXC_ID);
        invoke_tailcall_if(__or3(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6),
                     is_defined(DEBUG)),
                   CILIUM_CALL_IPV4_FROM_LXC, tail_handle_ipv4);
        break;
#ifdef ENABLE_ARP_PASSTHROUGH
    case bpf_htons(ETH_P_ARP):
        ret = CTX_ACT_OK;
        break;
#elif defined(ENABLE_ARP_RESPONDER)
    case bpf_htons(ETH_P_ARP):
        ep_tail_call(ctx, CILIUM_CALL_ARP);           ---> tail call for the ARP request
        ret = DROP_MISSED_TAIL_CALL;
        break;
#endif /* ENABLE_ARP_RESPONDER */
#endif /* ENABLE_IPV4 */
    default:
        ret = DROP_UNKNOWN_L3;
    }

out:
    if (IS_ERR(ret))
        return send_drop_notify(ctx, SECLABEL, 0, 0, ret, CTX_ACT_DROP,
                    METRIC_EGRESS);
    return ret;
}

From this code, the ARP request is indeed answered at the other end of the veth pair. In a traditional network you would instead attach the interface to a bridge device and enable Linux proxy ARP. At a glance, cilium's simple, brute-force construction is easier to control and more reliable than the kernel's (ebtables) ARP proxy.

laik commented 2 years ago

eBPF program analysis, part 2

Now let's go device by device and check which eBPF programs are attached at each hook point. (image)

As mentioned above, eBPF programs can be hooked at a device's ingress and egress (tc ingress/egress). Given the program analysis above plus a single host route to 10.0.0.157/32, you can already see how a pod (netns) reaches its gateway node. Compared with the traditional layout gw-[ [ br ] ns veth pair ] <-> ns, doesn't this feel leaner? (A minimal tc program sketch follows.)
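To make the hook point concrete, here is a minimal, standalone tc classifier sketch in C. It is not cilium code; the program name and printed fields are placeholders. A real program such as bpf_lxc.o:[from-container] would parse the frame, consult its maps and then redirect, drop or pass the packet.

// Minimal tc (clsact) classifier sketch, illustration only -- not cilium code.
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int trace_pkt(struct __sk_buff *ctx)
{
    /* A real datapath program would parse headers here, look up eBPF maps
     * and return a verdict or a redirect instead of just logging. */
    bpf_printk("pkt on ifindex %u, len %u", ctx->ifindex, ctx->len);
    return TC_ACT_OK;   /* hand the packet back to the kernel stack */
}

char _license[] SEC("license") = "GPL";

Such an object would typically be attached with tc (tc qdisc add dev <dev> clsact; tc filter add dev <dev> ingress bpf da obj <obj> sec tc), which is where the direct-action flag seen in the tc filter output above comes from.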

Inspect with bpftool

# bpftool net
xdp:

tc:
cilium_net(6) clsact/ingress bpf_host_cilium_net.o:[to-host] id 745
cilium_host(7) clsact/ingress bpf_host.o:[to-host] id 731
cilium_host(7) clsact/egress bpf_host.o:[from-host] id 738
cilium_vxlan(9) clsact/ingress bpf_overlay.o:[from-overlay] id 714
cilium_vxlan(9) clsact/egress bpf_overlay.o:[to-overlay] id 720
lxc_health(11) clsact/ingress bpf_lxc.o:[from-container] id 764
lxc04b6b383c793(13) clsact/ingress bpf_lxc.o:[from-container] id 767
lxcb7401eba32e3(15) clsact/ingress bpf_lxc.o:[from-container] id 770
lxce22e557aa23f(23) clsact/ingress bpf_lxc.o:[from-container] id 843

flow_dissector:
laik commented 2 years ago

Call-chain analysis of handle_xgress in cilium's bpf_lxc.c

__section("from-container")
handle_xgress                                                                   // bpf/bpf_lxc.c
  |-validate_ethertype(skb, &proto)
  |-switch (proto) {
      case ETH_P_IP:                                                           // invoke_tailcall_if -> /lib/tailcall.h CILIUM_CALL_IPV4_FROM_LXC
          tail_handle_ipv4                                                     // bpf/bpf_lxc.c
            |-handle_ipv4_from_lxc                                             // bpf/bpf_lxc.c
                |-if dst is k8s Service
                |    lb4_local()
                |      |-ct_create4
                |      |-lb4_lookup_backend
                |      |-lb4_xlate
                |
                |-policy_can_egress4()
                |
                |-if tunnel
                |     encap vxlan
                | else // direct routing, pass to kernel stack (continue normal routing)
                |     ipv4_l3()                       // dec TTL, set src/dst MAC
                |     asm_set_seclabel_identity(skb); // set identity to skb
                |-return TC_ACT_OK;
        case ETH_P_ARP: 
            tail_handle_arp -> bpf_lxc.c(941)
    }

The call-tree analysis above is full of tail_call: these are eBPF tail calls, where program IDs are stored in an eBPF prog map (program array) and the corresponding program is invoked by its ID via a helper. Why it is designed this way comes down to eBPF's architectural design (the problem of passing context between programs); a minimal sketch follows.
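As a minimal illustration of the mechanism (the names and slot numbers are assumptions, not cilium's real layout): program IDs live in a BPF_MAP_TYPE_PROG_ARRAY, bpf_tail_call() jumps to the selected slot without returning, and because the callee receives the very same ctx, state can be handed over through ctx->cb[], which is what cilium's ep_tail_call()/ctx_store_meta() wrappers build on.

// Sketch of a tail call plus state hand-off via ctx->cb[] -- illustration only.
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define SLOT_HANDLE_IPV4 0          /* hypothetical slot index */

struct {
    __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
    __uint(key_size, sizeof(__u32));
    __uint(value_size, sizeof(__u32));
    __uint(max_entries, 16);
} calls_map SEC(".maps");           /* user space (or libbpf) populates the slots */

SEC("tc")
int tail_ipv4(struct __sk_buff *ctx)
{
    /* The callee runs on the same ctx, so it can read what the caller stored. */
    return ctx->cb[0] ? TC_ACT_OK : TC_ACT_SHOT;
}

SEC("tc")
int entry(struct __sk_buff *ctx)
{
    ctx->cb[0] = 42;                                 /* hand state to the callee */

    /* Jump to the program stored in the slot; on success this never returns,
     * and falling through means the slot was empty. */
    bpf_tail_call(ctx, &calls_map, SLOT_HANDLE_IPV4);
    return TC_ACT_SHOT;             /* cilium returns DROP_MISSED_TAIL_CALL here */
}

char _license[] SEC("license") = "GPL";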

A closer look at tail_handle_arp

CILIUM_CALL_ARP points to the tail_handle_arp function:

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_ARP)
int tail_handle_arp(struct __ctx_buff *ctx)
{
    union macaddr mac = NODE_MAC;
    union macaddr smac;
    __be32 sip;
    __be32 tip;

    /* Pass any unknown ARP requests to the Linux stack */
    if (!arp_validate(ctx, &mac, &smac, &sip, &tip))
        return CTX_ACT_OK;

    /*
     * The endpoint is expected to make ARP requests for its gateway IP.
     * Most of the time, the gateway IP configured on the endpoint is
     * IPV4_GATEWAY but it may not be the case if after cilium agent reload
     * a different gateway is chosen. In such a case, existing endpoints
     * will have an old gateway configured. Since we don't know the IP of
     * previous gateways, we answer requests for all IPs with the exception
     * of the LXC IP (to avoid specific problems, like IP duplicate address
     * detection checks that might run within the container).
     */
    if (tip == LXC_IPV4)
        return CTX_ACT_OK;

    return arp_respond(ctx, &mac, tip, &smac, sip, 0);     ---> the reply to the ARP request coming from the container
}
#endif /* ENABLE_ARP_RESPONDER */
#endif /* ENABLE_IPV4 */

A first partial glimpse

This should start to make sense, but how does the response actually get back, and how does data get answered out of a tail call? Let's first take a small detour into an eBPF detail and cilium's wrappers around the basic eBPF helper functions. Using arp_respond from the code above as the example, we will dig into the logic, which makes it easier to appreciate the philosophy behind this concise abstraction.

# In cilium's bpf/lib/arp.h
static __always_inline int
arp_respond(struct __ctx_buff *ctx, union macaddr *smac, __be32 sip,
        union macaddr *dmac, __be32 tip, int direction)
{
    int ret = arp_prepare_response(ctx, smac, sip, dmac, tip);

    if (unlikely(ret != 0))
        goto error;

    cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY,
               ctx_get_ifindex(ctx));
    return ctx_redirect(ctx, ctx_get_ifindex(ctx), direction);

error:
    return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);
}

As you can see, arp_prepare_response rewrites the data in ctx, and then ctx_redirect is executed (a BPF redirect forwards the data to a given device port). The ctx_redirect function is worth showing here, because it is used in many places across cilium's eBPF code:

# bpf/include/bpf/ctx/skb.h

static __always_inline __maybe_unused int
ctx_redirect(const struct __sk_buff *ctx __maybe_unused, int ifindex, __u32 flags)
{
    return redirect(ifindex, flags);
}

# Definitions in bpf/include/bpf/helpers_skb.h. All three helpers are used for redirection; if they look unfamiliar, the small xdp-redirect test in https://github.com/laik/ebpf-app makes them clear.
/* Packet redirection */
static int BPF_FUNC(redirect, int ifindex, __u32 flags);
static int BPF_FUNC(redirect_neigh, int ifindex, struct bpf_redir_neigh *params,
            int plen, __u32 flags);
static int BPF_FUNC(redirect_peer, int ifindex, __u32 flags);

The implementation of arp_prepare_response essentially rewrites the frame in ctx, turning the request that arrived on the interface into a reply carrying the veth pair's addresses.

static __always_inline int
arp_prepare_response(struct __ctx_buff *ctx, union macaddr *smac, __be32 sip,
             union macaddr *dmac, __be32 tip)
{
    __be16 arpop = bpf_htons(ARPOP_REPLY);

    if (eth_store_saddr(ctx, smac->addr, 0) < 0 ||
        eth_store_daddr(ctx, dmac->addr, 0) < 0 ||
        ctx_store_bytes(ctx, 20, &arpop, sizeof(arpop), 0) < 0 ||
        /* sizeof(macadrr)=8 because of padding, use ETH_ALEN instead */
        ctx_store_bytes(ctx, 22, smac, ETH_ALEN, 0) < 0 ||
        ctx_store_bytes(ctx, 28, &sip, sizeof(sip), 0) < 0 ||
        ctx_store_bytes(ctx, 32, dmac, ETH_ALEN, 0) < 0 ||
        ctx_store_bytes(ctx, 38, &tip, sizeof(tip), 0) < 0)
        return DROP_WRITE_ERROR;

    return 0;
}

After arp_prepare_response finishes, the frame is redirected back out of the same interface, and the container then receives the ARP reply.

A closer look at tail_handle_ipv4

Before analyzing this traffic, some homework is needed: the datapath that follows implements the Kubernetes CNI requirements and also replaces what kube-proxy would otherwise do.

First, a rough breakdown of the traffic flows that must work:

1. A pod can talk directly to other pods, including pods on other nodes.
2. A pod can talk directly to nodes, including other nodes.
3. A pod can reach a Service via its ClusterIP (or LB IP).
4. A node can reach pods via a Service ClusterIP (or LB IP).
5. A node can reach pods directly (health probes and the like).
6. Kubernetes traffic-access policies, and so on.

The Service CIDR has to be chosen when Kubernetes is initialized (10.96.0.0/12 by default). When a Service is declared, Kubernetes creates Endpoints tying it to the matching pods, which is what makes pods reachable through the virtual IP.

The devices on the two machines and the allocated Service IPs / NodePorts, etc. (image)

With that in place, let's analyze the handle_ipv4 path.

# bpf_lxc.c
declare_tailcall_if(__or3(is_defined(ENABLE_IPV4), is_defined(ENABLE_IPV6),
              is_defined(DEBUG)), CILIUM_CALL_IPV4_FROM_LXC)
int tail_handle_ipv4(struct __ctx_buff *ctx)
{
    __u32 dst_id = 0;
    int ret = handle_ipv4_from_lxc(ctx, &dst_id);

    if (IS_ERR(ret))
        return send_drop_notify(ctx, SECLABEL, dst_id, 0, ret,
                    CTX_ACT_DROP, METRIC_EGRESS);

#ifdef ENABLE_CUSTOM_CALLS
    if (!encode_custom_prog_meta(ctx, ret, dst_id)) {
        tail_call_static(ctx, &CUSTOM_CALLS_MAP,
                 CUSTOM_CALLS_IDX_IPV4_EGRESS);
        update_metrics(ctx_full_len(ctx), METRIC_EGRESS,
                   REASON_MISSED_CUSTOM_CALL);
    }
#endif

    return ret;
}

handle_ipv4_from_lxc, as the name suggests, handles IPv4 packets coming from the pod's container.

# bpf_lxc.c
#ifdef ENABLE_IPV4
static __always_inline int handle_ipv4_from_lxc(struct __ctx_buff *ctx,
                        __u32 *dst_id)
{
    struct ipv4_ct_tuple tuple = {};
#ifdef ENABLE_ROUTING
    union macaddr router_mac = NODE_MAC;
#endif
    void *data, *data_end;
    struct iphdr *ip4;
    int ret, verdict = 0, l3_off = ETH_HLEN, l4_off;
    struct csum_offset csum_off = {};
    struct ct_state ct_state_new = {};
    struct ct_state ct_state = {};
    __be32 orig_dip;
    __u32 __maybe_unused tunnel_endpoint = 0;
    __u8 __maybe_unused encrypt_key = 0;
    __u32 monitor = 0;
    __u8 ct_ret;
    bool hairpin_flow = false; /* endpoint wants to access itself via service IP */
    __u8 policy_match_type = POLICY_MATCH_NONE;
    __u8 audited = 0;
    bool has_l4_header = false;
    bool __maybe_unused dst_remote_ep = false;

    if (!revalidate_data(ctx, &data, &data_end, &ip4))
        return DROP_INVALID;

/* If IPv4 fragmentation is disabled
 * AND a IPv4 fragmented packet is received,
 * then drop the packet.
 */
#ifndef ENABLE_IPV4_FRAGMENTS
    if (ipv4_is_fragment(ip4))
        return DROP_FRAG_NOSUPPORT;
#endif

    has_l4_header = ipv4_has_l4_header(ip4);

    tuple.nexthdr = ip4->protocol;

    if (unlikely(!is_valid_lxc_src_ipv4(ip4)))    // ---> validate that the container's source IPv4 is legitimate
        return DROP_INVALID_SIP;

    tuple.daddr = ip4->daddr;
    tuple.saddr = ip4->saddr;

    l4_off = l3_off + ipv4_hdrlen(ip4);

    // The logic below essentially revolves around conntrack (CT) state (the role nf_conntrack plays in k8s)

#ifdef ENABLE_PER_PACKET_LB      // is the destination a k8s Service?
    {
        struct lb4_service *svc;
        struct lb4_key key = {};

        ret = lb4_extract_key(ctx, ip4, l4_off, &key, &csum_off,
                      CT_EGRESS);
        if (IS_ERR(ret)) {
            if (ret == DROP_NO_SERVICE || ret == DROP_UNKNOWN_L4)
                goto skip_service_lookup;
            else
                return ret;
        }

        svc = lb4_lookup_service(&key, is_defined(ENABLE_NODEPORT));
        if (svc) {
            ret = lb4_local(get_ct_map4(&tuple), ctx, l3_off, l4_off,
                    &csum_off, &key, &tuple, svc, &ct_state_new,
                    ip4->saddr, has_l4_header, false);
            if (IS_ERR(ret))
                return ret;
            hairpin_flow |= ct_state_new.loopback;
        }
    }

skip_service_lookup:
#endif /* ENABLE_PER_PACKET_LB */

    /* The verifier wants to see this assignment here in case the above goto
     * skip_service_lookup is hit. However, in the case the packet
     * is _not_ TCP or UDP we should not be using proxy logic anyways. For
     * correctness it must be below the service handler in case the service
     * logic re-writes the tuple daddr. In "theory" however the assignment
     * should be OK to move above goto label.
     */
    orig_dip = tuple.daddr;

    /* WARNING: ip4 offset check invalidated, revalidate before use */

    /* Pass all outgoing packets through conntrack. This will create an
     * entry to allow reverse packets and return set cb[CB_POLICY] to
     * POLICY_SKIP if the packet is a reply packet to an existing incoming
     * connection.
     */
    ct_ret = ct_lookup4(get_ct_map4(&tuple), &tuple, ctx, l4_off, CT_EGRESS,
                &ct_state, &monitor);
    if (ct_ret < 0)
        return ct_ret;

    /* Check it this is return traffic to an ingress proxy. */
    if ((ct_ret == CT_REPLY || ct_ret == CT_RELATED) && ct_state.proxy_redirect) {
        /* Stack will do a socket match and deliver locally. */
        return ctx_redirect_to_proxy4(ctx, &tuple, 0, false);
    }

    /* Determine the destination category for policy fallback. */
    if (1) {
        struct remote_endpoint_info *info;

        info = lookup_ip4_remote_endpoint(orig_dip);
        if (info != NULL && info->sec_label) {
            *dst_id = info->sec_label;
            tunnel_endpoint = info->tunnel_endpoint;
            encrypt_key = get_min_encrypt_key(info->key);
#ifdef ENABLE_WIREGUARD
            /* If we detect that the dst is a remote endpoint, we
             * need to mark the packet. The ip rule which matches
             * on the MARK_MAGIC_ENCRYPT mark will steer the packet
             * to the Wireguard tunnel. The marking happens lower
             * in the code in the same place where we handle IPSec.
             */
            if (info->tunnel_endpoint != 0 &&
                !identity_is_node(info->sec_label))
                dst_remote_ep = true;
#endif /* ENABLE_WIREGUARD */
        } else {
            *dst_id = WORLD_ID;
        }

        cilium_dbg(ctx, info ? DBG_IP_ID_MAP_SUCCEED4 : DBG_IP_ID_MAP_FAILED4,
               orig_dip, *dst_id);
    }

    /* When an endpoint connects to itself via service clusterIP, we need
     * to skip the policy enforcement. If we didn't, the user would have to
     * define policy rules to allow pods to talk to themselves. We still
     * want to execute the conntrack logic so that replies can be correctly
     * matched.
     */
    if (hairpin_flow)
        goto skip_policy_enforcement;

    /* If the packet is in the establishing direction and it's destined
     * within the cluster, it must match policy or be dropped. If it's
     * bound for the host/outside, perform the CIDR policy check.
     */
    verdict = policy_can_egress4(ctx, &tuple, SECLABEL, *dst_id,
                     &policy_match_type, &audited);

    if (ct_ret != CT_REPLY && ct_ret != CT_RELATED && verdict < 0) {
        send_policy_verdict_notify(ctx, *dst_id, tuple.dport,
                       tuple.nexthdr, POLICY_EGRESS, 0,
                       verdict, policy_match_type, audited);
        return verdict;
    }

// CT state check and update
skip_policy_enforcement:
    switch (ct_ret) {
    case CT_NEW:
        if (!hairpin_flow)
            send_policy_verdict_notify(ctx, *dst_id, tuple.dport,
                           tuple.nexthdr, POLICY_EGRESS, 0,
                           verdict, policy_match_type, audited);
ct_recreate4:
        /* New connection implies that rev_nat_index remains untouched
         * to the index provided by the loadbalancer (if it applied).
         * Create a CT entry which allows to track replies and to
         * reverse NAT.
         */
        ct_state_new.src_sec_id = SECLABEL;
        /* We could avoid creating related entries for legacy ClusterIP
         * handling here, but turns out that verifier cannot handle it.
         */
        ret = ct_create4(get_ct_map4(&tuple), &CT_MAP_ANY4, &tuple, ctx,
                 CT_EGRESS, &ct_state_new, verdict > 0);
        if (IS_ERR(ret))
            return ret;
        break;

    case CT_REOPENED:
        if (!hairpin_flow)
            send_policy_verdict_notify(ctx, *dst_id, tuple.dport,
                           tuple.nexthdr, POLICY_EGRESS, 0,
                           verdict, policy_match_type, audited);
    case CT_ESTABLISHED:
        /* Did we end up at a stale non-service entry? Recreate if so. */
        if (unlikely(ct_state.rev_nat_index != ct_state_new.rev_nat_index))
            goto ct_recreate4;
        break;

    case CT_RELATED:
    case CT_REPLY:
        policy_mark_skip(ctx);

// NodePort access: as the comment below explains, if the nodeport endpoint is local to this node, reply traffic is redirected to bpf_host to perform the reverse DNAT.
#ifdef ENABLE_NODEPORT
        /* This handles reply traffic for the case where the nodeport EP
         * is local to the node. We'll redirect to bpf_host egress to
         * perform the reverse DNAT.
         */
        if (ct_state.node_port) {
            ctx->tc_index |= TC_INDEX_F_SKIP_RECIRCULATION;
            ep_tail_call(ctx, CILIUM_CALL_IPV4_NODEPORT_REVNAT);
            return DROP_MISSED_TAIL_CALL;
        }
# ifdef ENABLE_DSR
        if (ct_state.dsr) {
            ret = xlate_dsr_v4(ctx, &tuple, l4_off, has_l4_header);
            if (ret != 0)
                return ret;
        }
# endif /* ENABLE_DSR */
#endif /* ENABLE_NODEPORT */

        if (ct_state.rev_nat_index) {
            ret = lb4_rev_nat(ctx, l3_off, l4_off, &csum_off,
                      &ct_state, &tuple, 0, has_l4_header);
            if (IS_ERR(ret))
                return ret;
        }
        break;

    default:
        return DROP_UNKNOWN_CT;
    }

    hairpin_flow |= ct_state.loopback;

    if (redirect_to_proxy(verdict, ct_ret)) {
        /* Trace the packet before it is forwarded to proxy */
        send_trace_notify(ctx, TRACE_TO_PROXY, SECLABEL, 0,
                  bpf_ntohs(verdict), 0, ct_ret, monitor);
        return ctx_redirect_to_proxy4(ctx, &tuple, verdict, false);
    }

    /* After L4 write in port mapping: revalidate for direct packet access */
    if (!revalidate_data(ctx, &data, &data_end, &ip4))
        return DROP_INVALID;

    orig_dip = ip4->daddr;

    /* Allow a hairpin packet to be redirected even if ENABLE_ROUTING is
     * disabled. Otherwise, the packet will be dropped by the kernel if
     * it is going to be routed via an interface it came from after it has
     * been passed to the stack.
     */
    if (is_defined(ENABLE_ROUTING) || hairpin_flow) {
        struct endpoint_info *ep;

        /* Lookup IPv4 address, this will return a match if:
         *  - The destination IP address belongs to a local endpoint
         *    managed by cilium
         *  - The destination IP address is an IP address associated with the
         *    host itself
         *  - The destination IP address belongs to endpoint itself.
         */
        ep = lookup_ip4_endpoint(ip4);
        if (ep) {
#ifdef ENABLE_ROUTING
            if (ep->flags & ENDPOINT_F_HOST) {
#ifdef HOST_IFINDEX
                goto to_host;
#else
                return DROP_HOST_UNREACHABLE;
#endif
            }
#endif /* ENABLE_ROUTING */
            policy_clear_mark(ctx);
            return ipv4_local_delivery(ctx, l3_off, SECLABEL, ip4,
                           ep, METRIC_EGRESS, false);
        }
    }

#if defined(ENABLE_HOST_FIREWALL) && !defined(ENABLE_ROUTING)
    /* If the destination is the local host and per-endpoint routes are
     * enabled, jump to the bpf_host program to enforce ingress host policies.
     */
    if (*dst_id == HOST_ID) {
        ctx_store_meta(ctx, CB_FROM_HOST, 0);
        tail_call_static(ctx, &POLICY_CALL_MAP, HOST_EP_ID);
        return DROP_MISSED_TAIL_CALL;
    }
#endif /* ENABLE_HOST_FIREWALL && !ENABLE_ROUTING */

#ifdef ENABLE_EGRESS_GATEWAY
    {
        struct egress_gw_policy_entry *egress_gw_policy;
        struct endpoint_key key = {};

        /* If the packet is destined to an entity inside the cluster,
         * either EP or node, it should not be forwarded to an egress
         * gateway since only traffic leaving the cluster is supposed to
         * be masqueraded with an egress IP.
         */
        if (is_cluster_destination(ip4, *dst_id, tunnel_endpoint))
            goto skip_egress_gateway;

        /* If the packet is a reply or is related, it means that outside
         * has initiated the connection, and so we should skip egress
         * gateway, since an egress policy is only matching connections
         * originating from a pod.
         */
        if (ct_ret == CT_REPLY || ct_ret == CT_RELATED)
            goto skip_egress_gateway;

        egress_gw_policy = lookup_ip4_egress_gw_policy(ip4->saddr, ip4->daddr);
        if (!egress_gw_policy)
            goto skip_egress_gateway;

        /* Encap and redirect the packet to egress gateway node through a tunnel.
         * Even if the tunnel endpoint is on the same host, follow the same data
         * path to be consistent. In future, it can be optimized by directly
         * direct to external interface.
         */
        ret = encap_and_redirect_lxc(ctx, egress_gw_policy->gateway_ip, encrypt_key,
                         &key, SECLABEL, monitor);
        if (ret == IPSEC_ENDPOINT)
            goto encrypt_to_stack;
        else
            return ret;
    }
skip_egress_gateway:
#endif

#ifdef TUNNEL_MODE
# ifdef ENABLE_WIREGUARD
    /* In the tunnel mode we encapsulate pod2pod traffic only via Wireguard
     * device, i.e. we do not encapsulate twice.
     */
    if (!dst_remote_ep)
# endif /* ENABLE_WIREGUARD */
    {
        struct endpoint_key key = {};

        key.ip4 = orig_dip & IPV4_MASK;
        key.family = ENDPOINT_KEY_IPV4;

        ret = encap_and_redirect_lxc(ctx, tunnel_endpoint, encrypt_key,
                         &key, SECLABEL, monitor);
        if (ret == DROP_NO_TUNNEL_ENDPOINT)
            goto pass_to_stack;
        /* If not redirected noteably due to IPSEC then pass up to stack
         * for further processing.
         */
        else if (ret == IPSEC_ENDPOINT)
            goto encrypt_to_stack;
        /* This is either redirect by encap code or an error has
         * occurred either way return and stack will consume ctx.
         */
        else
            return ret;
    }
#endif /* TUNNEL_MODE */
    if (is_defined(ENABLE_REDIRECT_FAST))
        return redirect_direct_v4(ctx, l3_off, ip4);

    goto pass_to_stack;

#ifdef ENABLE_ROUTING
to_host:
    if (is_defined(ENABLE_HOST_FIREWALL) && *dst_id == HOST_ID) {
        send_trace_notify(ctx, TRACE_TO_HOST, SECLABEL, HOST_ID, 0,
                  HOST_IFINDEX, ct_ret, monitor);
        return ctx_redirect(ctx, HOST_IFINDEX, BPF_F_INGRESS);
    }
#endif

pass_to_stack:
#ifdef ENABLE_ROUTING
    ret = ipv4_l3(ctx, l3_off, NULL, (__u8 *) &router_mac.addr, ip4);
    if (unlikely(ret != CTX_ACT_OK))
        return ret;
#endif

#ifdef ENABLE_WIREGUARD
    if (dst_remote_ep)
        set_encrypt_mark(ctx);
    else /* Wireguard and identity mark are mutually exclusive */
#elif !defined(TUNNEL_MODE)
# ifdef ENABLE_IPSEC
    if (encrypt_key && tunnel_endpoint) {
        set_encrypt_key_mark(ctx, encrypt_key);
#  ifdef IP_POOLS
        set_encrypt_dip(ctx, tunnel_endpoint);
#  endif /* IP_POOLS */
    } else
# endif /* ENABLE_IPSEC */
#endif /* ENABLE_WIREGUARD */
    {
#ifdef ENABLE_IDENTITY_MARK
        /* Always encode the source identity when passing to the stack.
         * If the stack hairpins the packet back to a local endpoint the
         * source identity can still be derived even if SNAT is
         * performed by a component such as portmap.
         */
        ctx->mark |= MARK_MAGIC_IDENTITY;
        set_identity_mark(ctx, SECLABEL);
#endif
    }

#if defined(TUNNEL_MODE) || defined(ENABLE_EGRESS_GATEWAY)
encrypt_to_stack:
#endif
    send_trace_notify(ctx, TRACE_TO_STACK, SECLABEL, *dst_id, 0, 0,
              ct_ret, monitor);
    cilium_dbg_capture(ctx, DBG_CAPTURE_DELIVERY, 0);
    return CTX_ACT_OK;
}

That is a large chunk of code, but the declarations, variable names and comments already sketch the overall logic. Note the skip labels in the code, with direct gotos to the next step: in design-pattern terms this is essentially a chain of responsibility working through the packet. The call relationships can be read off the code, but it may still feel foggy: how do parameters get passed in, and how are decisions made from the data? For that you need to know about eBPF's storage, the eBPF maps.

      case ETH_P_IP:                                                           // invoke_tailcall_if -> /lib/tailcall.h CILIUM_CALL_IPV4_FROM_LXC
          tail_handle_ipv4                                                     // bpf/bpf_lxc.c
            |-handle_ipv4_from_lxc                                             // bpf/bpf_lxc.c
                |-if dst is k8s Service
                |    lb4_local()
                |      |-ct_create4
                |      |-lb4_lookup_backend
                |      |-lb4_xlate
                |
                |-policy_can_egress4()
                |
                |-if tunnel
                |     encap vxlan
                | else // direct routing, pass to kernel stack (continue normal routing)
                |     ipv4_l3()                       // dec TTL, set src/dst MAC
                |     asm_set_seclabel_identity(skb); // set identity to skb
                |-return TC_ACT_OK;
        case ETH_P_ARP: 
laik commented 2 years ago

A deeper look at the handle_ipv4_from_lxc logic, part 1

The tail_handle_ipv4 -> handle_ipv4_from_lxc call chain above is still incomplete; more is filled in below. Before reading on, it is worth looking at cilium's feature documentation: it supports L7 and L4 policy enforcement (Kubernetes network policies).

kubectl get crd | grep .cilium.io

NAME                                         CREATED AT
ciliumclusterwidenetworkpolicies.cilium.io   2022-01-08T15:09:29Z
ciliumendpoints.cilium.io                    2022-01-08T15:09:29Z
ciliumexternalworkloads.cilium.io            2022-01-08T15:09:29Z
ciliumidentities.cilium.io                   2022-01-08T15:09:29Z
ciliumnetworkpolicies.cilium.io              2022-01-08T15:09:30Z
ciliumnodes.cilium.io                        2022-01-08T15:09:29Z

Related link: https://docs.cilium.io/en/v1.8/gettingstarted/http/

Example

apiVersion: "cilium.io/v2"
kind: CiliumNetworkPolicy
description: "L3-L4 policy to restrict deathstar access to empire ships only"
metadata:
  name: "rule1"
spec:
  endpointSelector:
    matchLabels:
      org: empire
      class: deathstar
  ingress:
  - fromEndpoints:
    - matchLabels:
        org: empire
    toPorts:
    - ports:
      - port: "80"
        protocol: TCP
laik commented 2 years ago

A deeper look at the handle_ipv4_from_lxc logic, part 2

Starting from a single pod, here is everything it needs to be able to reach in order to satisfy the CNI requirements (image)

The diagram is crude, but it shows that the Kubernetes network model is actually fairly complex. The first thing is the Service address design: if multiple clusters will ever need to interconnect and reach each other, plan the pod and Service subnets ahead of time.

laik commented 2 years ago

Supplement: the eBPF maps cilium uses

Query the maps with the cilium CLI; pay attention to the ones with cache enabled = true

#cilium map list
Name                     Num entries   Num errors   Cache enabled
cilium_policy_01995      0             0            false
cilium_policy_03222      0             0            false
cilium_lxc               6             0            true
cilium_lb4_backends_v2   7             0            true
cilium_policy_00068      0             0            false
cilium_policy_02062      0             0            false
cilium_ipcache           11            0            true
cilium_tunnel_map        1             0            true
cilium_policy_01611      0             0            false
cilium_lb4_services_v2   4             0            true
cilium_lb4_reverse_nat   4             0            true
cilium_metrics           0             0            false

The cache-enabled ones again, with annotations:

cilium_lxc               6             0            true        -> the local container endpoints
cilium_lb4_backends_v2   7             0            true        -> the backends traffic is steered to (endpoints)
cilium_ipcache           11            0            true        -> the IP/identity cache (IP pool)
cilium_lb4_services_v2   4             0            true        -> services
cilium_lb4_reverse_nat   4             0            true        -> ClusterIP reverse NAT
cilium_tunnel_map        1             0            true        -> vxlan tunnels; with only two machines, a single tunnel is seen locally

If you are wondering how these annotations were worked out, the analysis steps that follow will make it clear.

laik commented 2 years ago

Supplement: the eBPF maps cilium uses, part 2

# cilium bpf endpoint list                                                                                                                                                                         
IP ADDRESS        LOCAL ENDPOINT INFO
10.0.0.82:0       id=1611  flags=0x0000 ifindex=13  mac=3A:DC:14:F0:92:FF nodemac=72:18:18:3D:4C:30
192.168.2.134:0   (localhost)
10.0.0.156:0      id=3222  flags=0x0000 ifindex=23  mac=3E:7C:AD:72:24:6D nodemac=42:01:AB:D9:26:6F
10.0.0.56:0       id=1995  flags=0x0000 ifindex=15  mac=22:2D:24:47:C1:FC nodemac=7E:CA:DD:8E:56:5A
10.0.0.157:0      (localhost)
10.0.0.163:0      id=2062  flags=0x0000 ifindex=11  mac=5E:E5:A7:FE:C8:92 nodemac=E6:5D:6A:0C:6B:C8

# cilium bpf ipcache list
IP PREFIX/ADDRESS   IDENTITY
10.0.1.25/32        4 0 192.168.2.136
10.0.1.147/32       31114 0 192.168.2.136
192.168.2.134/32    1 0 0.0.0.0
0.0.0.0/0           2 0 0.0.0.0
192.168.2.136/32    6 0 0.0.0.0
10.0.0.56/32        5212 0 0.0.0.0
10.0.0.82/32        5212 0 0.0.0.0
10.0.0.156/32       31114 0 0.0.0.0
10.0.0.157/32       1 0 0.0.0.0
10.0.0.163/32       4 0 0.0.0.0
10.0.1.126/32       6 0 192.168.2.136

# cilium bpf tunnel list
TUNNEL       VALUE
10.0.1.0:0   192.168.2.136:0

# cilium bpf lb list
SERVICE ADDRESS     BACKEND ADDRESS
10.106.116.145:80   0.0.0.0:0 (4) [ClusterIP, non-routable]
                    10.0.1.147:80 (4)
                    10.0.0.156:80 (4)
10.96.0.1:443       192.168.2.134:6443 (1)
                    0.0.0.0:0 (1) [ClusterIP, non-routable]
10.96.0.10:9153     10.0.0.82:9153 (3)
                    0.0.0.0:0 (3) [ClusterIP, non-routable]
                    10.0.0.56:9153 (3)
10.96.0.10:53       10.0.0.82:53 (2)
                    0.0.0.0:0 (2) [ClusterIP, non-routable]
                    10.0.0.56:53 (2)

From the output above you can see that these maps store the concrete pods, the endpoints, the per-CIDR tunnels, and the LB data (services and service NAT). With this in mind, let's continue with handle_ipv4_from_lxc.

laik commented 2 years ago

A deeper look at the handle_ipv4_from_lxc logic, part 3

For traffic leaving a pod, the overall decision logic looks roughly like the following; naturally there are many details inside, such as traffic encryption and policy enforcement.

__section("from-container")
handle_xgress                                                                   // bpf/bpf_lxc.c
  |-validate_ethertype(skb, &proto)
  |-switch (proto) {
      case ETH_P_IP:                                                           // invoke_tailcall_if -> /lib/tailcall.h CILIUM_CALL_IPV4_FROM_LXC
          tail_handle_ipv4                                                     // bpf/bpf_lxc.c
            |-handle_ipv4_from_lxc                                             // bpf/bpf_lxc.c
                |-ct_tuple create                                              // ct
                |-if dst is k8s Service
                |    lb4_local()
                |      |-ct_create4
                |      |-lb4_lookup_backend
                |      |-lb_skip_l4_dnat ? CTX_ACT_OK : lb4_xlate
                |- else
                |      |-ct_lookup()
                |      |-Check it this is return traffic to an ingress proxy?
                |         |-Stack will do a socket match and deliver locally.   ctx_redirect_to_proxy4
                |      |-lookup_ip4_remote_endpoint                            // ipcache_lookup4 --> map ipcache look
                |         |- get_min_encrypt_key                               // look ENCRYPT_MAP
                |         |- if ENABLE_WIREGUARD ...
                |
                |-if !hairpin_flow
                |   policy_can_egress4()
                |     |- __policy_can_access...
                |
                |- ct check ...
                |- if [ct_state.node_port] // nodeport: tail_call CILIUM_CALL_IPV4_NODEPORT_REVNAT; reply traffic for a nodeport EP local to this node is redirected to bpf_host egress to perform the reverse DNAT
                |
                |- if [ct_state.dsr] // DSR: xlate_dsr_v4 looks up SNAT_MAPPING_IPV4 and calls snat_v4_rewrite_egress
                |
                |- if [ct_state.rev_nat_index] // look LB4_REVERSE_NAT_MAP and  call __lb4_rev_nat
                |
                |- if lookup_ip4_endpoint(ip4).flag == ENDPOINT_F_HOST goto to_host // look ENDPOINTS_MAP
                |
                |- to_host: ctx_redirect(ctx, HOST_IFINDEX, BPF_F_INGRESS); // redirect to the local host
                |- pass_to_stack: ....
                |- if tunnel
                |     encap vxlan
                | else // direct routing, pass to kernel stack (continue normal routing)
                |     ipv4_l3()                       // dec TTL, set src/dst MAC
                |     asm_set_seclabel_identity(skb); // set identity to skb
                |-return TC_ACT_OK;
        case ETH_P_ARP: 
            tail_handle_arp -> bpf_lxc.c(941)
    }

IPCACHE_MAP

struct {
    __uint(type, LPM_MAP_TYPE);
    __type(key, struct ipcache_key);
    __type(value, struct remote_endpoint_info);
    __uint(pinning, LIBBPF_PIN_BY_NAME);
    __uint(max_entries, IPCACHE_MAP_SIZE);
    __uint(map_flags, BPF_F_NO_PREALLOC);
} IPCACHE_MAP __section_maps_btf;

// The lookup key as it is filled in for ipcache_lookup4():
struct ipcache_key key = {
        .lpm_key = { IPCACHE_PREFIX_LEN(prefix), {} },
        .family = ENDPOINT_KEY_IPV4,
        .ip4 = addr,
};

struct remote_endpoint_info {
    __u32       sec_label;
    __u32       tunnel_endpoint;
    __u8        key;                ----> encrypt_key 
};

# cilium bpf ipcache list
IP PREFIX/ADDRESS   IDENTITY
10.0.0.82/32        5212 0 0.0.0.0
10.0.0.157/32       1 0 0.0.0.0
10.0.0.163/32       4 0 0.0.0.0
10.0.1.25/32        4 0 192.168.2.136
0.0.0.0/0           2 0 0.0.0.0
10.0.0.56/32        5212 0 0.0.0.0
10.0.0.156/32       31114 0 0.0.0.0
10.0.1.126/32       6 0 192.168.2.136
10.0.1.147/32       31114 0 192.168.2.136
192.168.2.134/32    1 0 0.0.0.0
192.168.2.136/32    6 0 0.0.0.0
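Since IPCACHE_MAP is an LPM (longest-prefix-match) map, a lookup key carries a prefix length plus the address, and the kernel returns the most specific entry covering it, which is how a /32 pod entry wins over the 0.0.0.0/0 "world" entry in the listing above. A standalone sketch of such a lookup (key/value layouts here are simplified assumptions, not cilium's exact ones):

// Sketch of an LPM_TRIE lookup, the mechanism behind ipcache_lookup4().
// Key/value layouts are illustrative only.
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

struct lpm_v4_key {
    __u32 prefixlen;        /* number of significant bits, required by LPM maps */
    __u32 ip4;              /* the address being looked up, network byte order */
};

struct lpm_v4_val {
    __u32 sec_label;        /* e.g. an identity */
    __u32 tunnel_endpoint;  /* e.g. the remote node IP, 0 if local */
};

struct {
    __uint(type, BPF_MAP_TYPE_LPM_TRIE);
    __type(key, struct lpm_v4_key);
    __type(value, struct lpm_v4_val);
    __uint(max_entries, 1024);
    __uint(map_flags, BPF_F_NO_PREALLOC);
} demo_ipcache SEC(".maps");

SEC("tc")
int demo_lookup(struct __sk_buff *ctx)
{
    struct lpm_v4_key key = {
        .prefixlen = 32,                    /* ask for the most specific match */
        .ip4       = bpf_htonl(0x0a00009c), /* 10.0.0.156, as an example */
    };
    struct lpm_v4_val *info = bpf_map_lookup_elem(&demo_ipcache, &key);

    if (info && info->tunnel_endpoint)
        bpf_printk("destination lives behind remote node %x", info->tunnel_endpoint);
    return TC_ACT_OK;
}

char _license[] SEC("license") = "GPL";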

CILIUM_CALL_IPV4_NODEPORT_REVNAT tail_rev_nodeport_lb4

__section_tail(CILIUM_MAP_CALLS, CILIUM_CALL_IPV4_NODEPORT_REVNAT)
int tail_rev_nodeport_lb4(struct __ctx_buff *ctx)
{
    int ifindex = 0;
    int ret = 0;
#if defined(ENABLE_HOST_FIREWALL) && defined(IS_BPF_HOST)
    /* We only enforce the host policies if nodeport.h is included from
     * bpf_host.
     */
    __u32 src_id = 0;

    ret = ipv4_host_policy_ingress(ctx, &src_id);
    if (IS_ERR(ret))
        return send_drop_notify_error(ctx, src_id, ret, CTX_ACT_DROP,
                          METRIC_INGRESS);
    /* We don't want to enforce host policies a second time if we jump back to
     * bpf_host's handle_ipv6.
     */
    ctx_skip_host_fw_set(ctx);
#endif
    ret = rev_nodeport_lb4(ctx, &ifindex);
    if (IS_ERR(ret))
        return send_drop_notify_error(ctx, 0, ret, CTX_ACT_DROP, METRIC_EGRESS);

    edt_set_aggregate(ctx, 0);
    cilium_capture_out(ctx);

    return ctx_redirect(ctx, ifindex, 0);
}

ENDPOINTS_MAP

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __type(key, struct endpoint_key);
    __type(value, struct endpoint_info);
    __uint(pinning, LIBBPF_PIN_BY_NAME);
    __uint(max_entries, ENDPOINTS_MAP_SIZE);
    __uint(map_flags, CONDITIONAL_PREALLOC);
} ENDPOINTS_MAP __section_maps_btf;

struct endpoint_key key = {};

key.ip4 = ip;
key.family = ENDPOINT_KEY_IPV4;

struct endpoint_info {
    __u32       ifindex;
    __u16       unused; /* used to be sec_label, no longer used */
    __u16           lxc_id;
    __u32       flags;
    mac_t       mac;
    mac_t       node_mac;
    __u32       pad[4];
};

# cilium bpf endpoint list
IP ADDRESS        LOCAL ENDPOINT INFO
10.0.0.157:0      (localhost)
10.0.0.163:0      id=2062  flags=0x0000 ifindex=11  mac=5E:E5:A7:FE:C8:92 nodemac=E6:5D:6A:0C:6B:C8
10.0.0.82:0       id=1611  flags=0x0000 ifindex=13  mac=3A:DC:14:F0:92:FF nodemac=72:18:18:3D:4C:30
192.168.2.134:0   (localhost)
10.0.0.156:0      id=3222  flags=0x0000 ifindex=23  mac=3E:7C:AD:72:24:6D nodemac=42:01:AB:D9:26:6F
10.0.0.56:0       id=1995  flags=0x0000 ifindex=15  mac=22:2D:24:47:C1:FC nodemac=7E:CA:DD:8E:56:5A
Host-namespace routes
10.0.0.0        10.0.0.157      255.255.255.0   UG    0      0        0 cilium_host
10.0.0.157      0.0.0.0         255.255.255.255 UH    0      0        0 cilium_host
10.0.1.0        10.0.0.157      255.255.255.0   UG    0      0        0 cilium_host

Pulling the logic together at the code level, you can see that every datapath decision is driven by eBPF maps: their contents are the conditions that determine where traffic flows (see the sketch below).
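As a toy illustration of that point (simplified structures; cilium's real ENDPOINTS_MAP and lookup_ip4_endpoint() differ), a single hash-map lookup on the destination IP is enough to decide whether a packet is short-circuited to a local device or left to the kernel stack:

// Toy sketch: an eBPF map lookup deciding the forwarding verdict.
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

struct demo_ep_key  { __u32 ip4; };       /* destination IP, network byte order */
struct demo_ep_info { __u32 ifindex; };   /* device of the locally managed endpoint */

struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __type(key, struct demo_ep_key);
    __type(value, struct demo_ep_info);
    __uint(max_entries, 512);
} demo_endpoints SEC(".maps");

SEC("tc")
int demo_forward(struct __sk_buff *ctx)
{
    void *data = (void *)(long)ctx->data;
    void *data_end = (void *)(long)ctx->data_end;
    struct ethhdr *eth = data;
    struct iphdr *ip;
    struct demo_ep_key key = {};
    struct demo_ep_info *ep;

    if ((void *)(eth + 1) > data_end || eth->h_proto != bpf_htons(ETH_P_IP))
        return TC_ACT_OK;
    ip = (void *)(eth + 1);
    if ((void *)(ip + 1) > data_end)
        return TC_ACT_OK;

    key.ip4 = ip->daddr;
    ep = bpf_map_lookup_elem(&demo_endpoints, &key);
    if (ep)
        /* destination is a locally managed endpoint: bypass the host stack */
        return bpf_redirect(ep->ifindex, 0);

    return TC_ACT_OK;   /* no entry: let the kernel stack route it */
}

char _license[] SEC("license") = "GPL";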

laik commented 2 years ago

Some analysis from other articles

Translation: https://arthurchiao.art/blog/understanding-ebpf-datapath-in-cilium-zh/

image

NetEase: https://www.infoq.cn/article/p9vg2g9t49kpvhrckfwu

image

image

laik commented 2 years ago

cilium and iptables

When cilium-agent initializes, it installs a few iptables rules by default

# iptables -L
# Warning: iptables-legacy tables present, use iptables-legacy to see them
Chain INPUT (policy ACCEPT)
target     prot opt source               destination
CILIUM_INPUT  all  --  anywhere             anywhere             /* cilium-feeder: CILIUM_INPUT */
KUBE-NODEPORTS  all  --  anywhere             anywhere             /* kubernetes health check service ports */
KUBE-EXTERNAL-SERVICES  all  --  anywhere             anywhere             ctstate NEW /* kubernetes externally-visible service portals */
KUBE-FIREWALL  all  --  anywhere             anywhere

Chain FORWARD (policy ACCEPT)
target     prot opt source               destination
CILIUM_FORWARD  all  --  anywhere             anywhere             /* cilium-feeder: CILIUM_FORWARD */
KUBE-FORWARD  all  --  anywhere             anywhere             /* kubernetes forwarding rules */
KUBE-SERVICES  all  --  anywhere             anywhere             ctstate NEW /* kubernetes service portals */
KUBE-EXTERNAL-SERVICES  all  --  anywhere             anywhere             ctstate NEW /* kubernetes externally-visible service portals */
DOCKER-USER  all  --  anywhere             anywhere
DOCKER-ISOLATION-STAGE-1  all  --  anywhere             anywhere
ACCEPT     all  --  anywhere             anywhere             ctstate RELATED,ESTABLISHED
DOCKER     all  --  anywhere             anywhere
ACCEPT     all  --  anywhere             anywhere
ACCEPT     all  --  anywhere             anywhere

Chain OUTPUT (policy ACCEPT)
target     prot opt source               destination
CILIUM_OUTPUT  all  --  anywhere             anywhere             /* cilium-feeder: CILIUM_OUTPUT */
KUBE-SERVICES  all  --  anywhere             anywhere             ctstate NEW /* kubernetes service portals */
KUBE-FIREWALL  all  --  anywhere             anywhere

Chain CILIUM_FORWARD (1 references)
target     prot opt source               destination
ACCEPT     all  --  anywhere             anywhere             /* cilium: any->cluster on cilium_host forward accept */
ACCEPT     all  --  anywhere             anywhere             /* cilium: cluster->any on cilium_host forward accept (nodeport) */
ACCEPT     all  --  anywhere             anywhere             /* cilium: cluster->any on lxc+ forward accept */
ACCEPT     all  --  anywhere             anywhere             /* cilium: cluster->any on cilium_net forward accept (nodeport) */

Chain CILIUM_INPUT (1 references)
target     prot opt source               destination
ACCEPT     all  --  anywhere             anywhere             mark match 0x200/0xf00 /* cilium: ACCEPT for proxy traffic */

Chain CILIUM_OUTPUT (1 references)
target     prot opt source               destination
ACCEPT     all  --  anywhere             anywhere             mark match 0xa00/0xfffffeff /* cilium: ACCEPT for proxy return traffic */
MARK       all  --  anywhere             anywhere             mark match ! 0xe00/0xf00 mark match ! 0xd00/0xf00 mark match ! 0xa00/0xe00 /* cilium: host->any mark as from host */ MARK xset 0xc00/0xf00
The cilium feeder chains hooked from the built-in chains:
  1. CILIUM_INPUT
  2. CILIUM_FORWARD
  3. CILIUM_OUTPUT
laik commented 2 years ago

cilium ebpf cgroup sockops

image

Output from bpftool

# bpftool cgroup tree -p
....
{
        "cgroup": "/sys/fs/cgroup/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-podd6462fb3_1b67_4b76_a058_f509e2e42e29.slice/docker-3b850f777dbd5bc4d40cec0cd074b51ad70328ddef23c030457efeb99fb1c61d.scope",
        "programs": [{
                "id": 836,
                "attach_type": "device",
                "attach_flags": "multi",
                "name": ""
            },{
                "id": 11507,
                "attach_type": "device",
                "attach_flags": "multi",
                "name": ""
            }
        ]
}

# bpftool prog -p 
{
        "id": 836,
        "type": "cgroup_device",
        "tag": "ab4bc4523b7fe6b4",
        "gpl_compatible": false,
        "loaded_at": 1642120449,
        "uid": 0,
        "bytes_xlated": 552,
        "jited": true,
        "bytes_jited": 488,
        "bytes_memlock": 4096
}
{
        "id": 11507,
        "type": "cgroup_device",
        "tag": "5b66259bfca5c6d7",
        "gpl_compatible": true,
        "loaded_at": 1642191122,
        "uid": 0,
        "bytes_xlated": 472,
        "jited": true,
        "bytes_jited": 448,
        "bytes_memlock": 4096
}
laik commented 2 years ago

Analysis of cilium's bpf_sock.o

My usual approach is to look first at the skeleton of the code and what things are named, then at what they implement, and from there infer the architecture. Code first.

__section("cgroup/connect4")
int sock4_connect(struct bpf_sock_addr *ctx)
{
    if (sock_is_health_check(ctx))
        return __sock4_health_fwd(ctx);

    __sock4_xlate_fwd(ctx, ctx, false);
    return SYS_PROCEED;
}

__section("cgroup/post_bind4")
int sock4_post_bind(struct bpf_sock *ctx)
{
    if (__sock4_post_bind(ctx, ctx) < 0)
        return SYS_REJECT;

    return SYS_PROCEED;
}

__section("cgroup/sendmsg4")
int sock4_sendmsg(struct bpf_sock_addr *ctx)
{
    __sock4_xlate_fwd(ctx, ctx, true);
    return SYS_PROCEED;
}

__section("cgroup/recvmsg4")
int sock4_recvmsg(struct bpf_sock_addr *ctx)
{
    __sock4_xlate_rev(ctx, ctx);
    return SYS_PROCEED;
}

__section("cgroup/getpeername4")
int sock4_getpeername(struct bpf_sock_addr *ctx)
{
    __sock4_xlate_rev(ctx, ctx);
    return SYS_PROCEED;
}
__section("cgroup/bind4")
int sock4_pre_bind(struct bpf_sock_addr *ctx)
{
    int ret = SYS_PROCEED;

    if (!sock_proto_enabled(ctx->protocol) ||
        !ctx_in_hostns(ctx, NULL))
        return ret;
    if (sock_is_health_check(ctx) &&
        __sock4_pre_bind(ctx, ctx))
        ret = SYS_REJECT;
    return ret;
}

These programs are loaded by cilium-agent, so let's first look at what the relevant options do. cilium docs: https://docs.cilium.io/en/v1.10/cmdref/cilium-agent/

bpf/init.sh

function bpf_load_cgroups()
{
    OPTS=$1
    IN=$2
    OUT=$3
    PROG_TYPE=$4
    WHERE=$5
    CALLS_MAP=$6
    CGRP=$7
    BPFMNT=$8

    OPTS="${OPTS} -DCALLS_MAP=${CALLS_MAP}"
    bpf_compile $IN $OUT obj "$OPTS"

    TMP_FILE="$BPFMNT/tc/globals/cilium_cgroups_$WHERE"
    rm -f $TMP_FILE

    cilium bpf migrate-maps -s $OUT
    set +e
    tc exec bpf pin $TMP_FILE obj $OUT type $PROG_TYPE attach_type $WHERE sec "cgroup/$WHERE"
    RETCODE=$?
    set -e
    cilium bpf migrate-maps -e $OUT -r $RETCODE

    if [ "$RETCODE" -eq "0" ]; then
        set +e
        bpftool cgroup attach $CGRP $WHERE pinned $TMP_FILE
        RETCODE=$?
        set -e
        rm -f $TMP_FILE
    fi
    return $RETCODE
}

if [ "$HOSTLB" = "true" ]; then
        # IPv6 handling
    if [ "$IP6_HOST" != "<nil>" ]; then
        echo 1 > /proc/sys/net/ipv6/conf/all/forwarding
    fi

    # IPv4 handling
    if [ "$IP4_HOST" != "<nil>" ]; then
        bpf_load_cgroups "$COPTS" bpf_sock.c bpf_sock.o sockaddr connect4 $CALLS_MAP $CGROUP_ROOT $BPFFS_ROOT
        if [ "$HOSTLB_PEER" = "true" ]; then
            bpf_load_cgroups "$COPTS" bpf_sock.c bpf_sock.o sockaddr getpeername4 $CALLS_MAP $CGROUP_ROOT $BPFFS_ROOT
        fi
        if [ "$NODE_PORT" = "true" ] && [ "$NODE_PORT_BIND" = "true" ]; then
            bpf_load_cgroups "$COPTS" bpf_sock.c bpf_sock.o sock post_bind4 $CALLS_MAP $CGROUP_ROOT $BPFFS_ROOT
        else
            bpf_clear_cgroups $CGROUP_ROOT post_bind4
        fi
        if [ "$MODE" = "ipip" ]; then
            bpf_load_cgroups "$COPTS" bpf_sock.c bpf_sock.o sockaddr bind4 $CALLS_MAP $CGROUP_ROOT $BPFFS_ROOT
        else
            bpf_clear_cgroups $CGROUP_ROOT bind4
        fi
        if [ "$HOSTLB_UDP" = "true" ]; then
            bpf_load_cgroups "$COPTS" bpf_sock.c bpf_sock.o sockaddr sendmsg4 $CALLS_MAP $CGROUP_ROOT $BPFFS_ROOT
            bpf_load_cgroups "$COPTS" bpf_sock.c bpf_sock.o sockaddr recvmsg4 $CALLS_MAP $CGROUP_ROOT $BPFFS_ROOT
        else
            bpf_clear_cgroups $CGROUP_ROOT sendmsg4
            bpf_clear_cgroups $CGROUP_ROOT recvmsg4
        fi
    fi

Since cilium-agent runs as a DaemonSet, the call chain that reaches init.sh is:

cilium
  |- daemon/cmd/daemon.go
  |- Daemon struct{ rec              *recorder.Recorder} 
      |- pkg/recorder/recorder.go
           |- datapath/loader/Loader.go -> Reinitialize

func (l *Loader) {
....
    prog := filepath.Join(option.Config.BpfDir, "init.sh")
    cmd := exec.CommandContext(ctx, prog, args...)
    cmd.Env = bpf.Environment()
    if _, err := cmd.CombinedOutput(log, true); err != nil {
        return err
    }

    if l.canDisableDwarfRelocations {
        // Validate alignments of C and Go equivalent structs
        if err := alignchecker.CheckStructAlignments(defaults.AlignCheckerName); err != nil {
            log.WithError(err).Fatal("C and Go structs alignment check failed")
        }
    } else {
        log.Warning("Cannot check matching of C and Go common struct alignments due to old LLVM/clang version")
    }
.....
}
laik commented 2 years ago

Analysis of cilium's bpf_sock.o, continued

From the script analysis above:

  1. connect, sendmsg --- forward translation, __sock4_xlate_fwd
  2. post_bind --- NodePort
  3. bind --- ipip mode
  4. getpeername, recvmsg4 --- reverse translation, __sock4_xlate_rev

DockOne also has a write-up of the corresponding service implementation: http://dockone.io/article/220800

The cgroup programs on the host

{
        "cgroup": "/sys/fs/cgroup/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-podd6462fb3_1b67_4b76_a058_f509e2e42e29.slice/docker-2dc04627b41553b8662f3cdfe9bd23089482f34adf09545796e4ad6a3dd5531d.scope",
        "programs": [{
                "id": 801,
                "attach_type": "device",
                "attach_flags": "multi",
                "name": ""
            }
        ]
    },{
        "cgroup": "/sys/fs/cgroup/kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-podd6462fb3_1b67_4b76_a058_f509e2e42e29.slice/docker-0655a28aad7016f55739e192a768a11cd15daf351141a0a6e0ba466dc24eb8d7.scope",
        "programs": [{
                "id": 749,
                "attach_type": "device",
                "attach_flags": "multi",
                "name": ""
            }
        ]

Before going further, if cgroup attach is unfamiliar, have a look at https://github.com/laik/ebpf-app/tree/main/ebpf/sockops

Given what the programs above are attached for, let's look at how the forward translation (xlate) is actually implemented.

The cgroup/connect4 program calls one key function, __sock4_xlate_fwd

static __always_inline int __sock4_xlate_fwd(struct bpf_sock_addr *ctx,
                         struct bpf_sock_addr *ctx_full,
                         const bool udp_only)
{
    union lb4_affinity_client_id id;
    const bool in_hostns = ctx_in_hostns(ctx_full, &id.client_cookie);
    struct lb4_backend *backend;
    struct lb4_service *svc;
    struct lb4_key key = {
        .address    = ctx->user_ip4,
        .dport      = ctx_dst_port(ctx),
    }, orig_key = key;
    struct lb4_service *backend_slot;
    bool backend_from_affinity = false;
    __u32 backend_id = 0;

    if (is_defined(ENABLE_SOCKET_LB_HOST_ONLY) && !in_hostns)
        return -ENXIO;

    if (!udp_only && !sock_proto_enabled(ctx->protocol))
        return -ENOTSUP;

    /* In case a direct match fails, we try to look-up surrogate
     * service entries via wildcarded lookup for NodePort and
     * HostPort services.
     */
    svc = lb4_lookup_service(&key, true);              --- look up the service/backends in the lb map
    if (!svc)
        svc = sock4_wildcard_lookup_full(&key, in_hostns);  --- wildcard lookup
    if (!svc)
        return -ENXIO;

    /* Do not perform service translation for external IPs
     * that are not a local address because we don't want
     * a k8s service to easily do MITM attacks for a public
     * IP address. But do the service translation if the IP
     * is from the host.
     */
    if (sock4_skip_xlate(svc, orig_key.address))
        return -EPERM;

    if (lb4_svc_is_affinity(svc)) {                   --- k8s service session affinity
        /* Note, for newly created affinity entries there is a
         * small race window. Two processes on two different
         * CPUs but the same netns may select different backends
         * for the same service:port. lb4_update_affinity_by_netns()
         * below would then override the first created one if it
         * didn't make it into the lookup yet for the other CPU.
         */
        backend_id = lb4_affinity_backend_id_by_netns(svc, &id);
        backend_from_affinity = true;

        if (backend_id != 0) {
            backend = __lb4_lookup_backend(backend_id);
            if (!backend)
                /* Backend from the session affinity no longer
                 * exists, thus select a new one. Also, remove
                 * the affinity, so that if the svc doesn't have
                 * any backend, a subsequent request to the svc
                 * doesn't hit the reselection again.
                 */
                backend_id = 0;
        }
    }

    if (backend_id == 0) {
        backend_from_affinity = false;

        key.backend_slot = (sock_select_slot(ctx_full) % svc->count) + 1;
        backend_slot = __lb4_lookup_backend_slot(&key);
        if (!backend_slot) {
            update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND_SLOT);
            return -ENOENT;
        }

        backend_id = backend_slot->backend_id;
        backend = __lb4_lookup_backend(backend_id);
    }

    if (!backend) {
        update_metrics(0, METRIC_EGRESS, REASON_LB_NO_BACKEND);
        return -ENOENT;
    }

    if (lb4_svc_is_localredirect(svc) &&                         --- check whether the svc should be served locally, i.e. by the caller's own netns, and skip translation if so
        sock4_skip_xlate_if_same_netns(ctx_full, backend))
        return -ENXIO;

    if (lb4_svc_is_affinity(svc) && !backend_from_affinity)
        lb4_update_affinity_by_netns(svc, &id, backend_id);

    if (sock4_update_revnat(ctx_full, backend, &orig_key,
                svc->rev_nat_index) < 0) {
        update_metrics(0, METRIC_EGRESS, REASON_LB_REVNAT_UPDATE);
        return -ENOMEM;
    }

    ctx->user_ip4 = backend->address;
    ctx_set_port(ctx, backend->port);                           --- rewrite the destination address and port in ctx

    return 0;
}

The code above rewrites the destination address according to the lb maps. And that is all it does: there is no ctx_redirect here, because the redirect is handled later by the bpf_lxc program.

Now look at the reverse translation, __sock4_xlate_rev, below; the principle is the same.

static __always_inline int __sock4_xlate_rev(struct bpf_sock_addr *ctx,
                         struct bpf_sock_addr *ctx_full)
{
    struct ipv4_revnat_entry *val;
    struct ipv4_revnat_tuple key = {
        .cookie     = sock_local_cookie(ctx_full),
        .address    = ctx->user_ip4,
        .port       = ctx_dst_port(ctx),
    };

    val = map_lookup_elem(&LB4_REVERSE_NAT_SK_MAP, &key);
    if (val) {
        struct lb4_service *svc;
        struct lb4_key svc_key = {
            .address    = val->address,
            .dport      = val->port,
        };

        svc = lb4_lookup_service(&svc_key, true);
        if (!svc)
            svc = sock4_wildcard_lookup_full(&svc_key,
                        ctx_in_hostns(ctx_full, NULL));
        if (!svc || svc->rev_nat_index != val->rev_nat_index) {
            map_delete_elem(&LB4_REVERSE_NAT_SK_MAP, &key);
            update_metrics(0, METRIC_INGRESS, REASON_LB_REVNAT_STALE);
            return -ENOENT;
        }

        ctx->user_ip4 = val->address;
        ctx_set_port(ctx, val->port);
        return 0;
    }

    return -ENXIO;
}

One thing worth spelling out: when a service address is accessed, the NAT already happens when userspace calls connect()/sendmsg() (the destination is simply rewritten in place), and bpf_lxc (egress) then redirects the packet straight onto the corresponding datapath [another netns on the same host, this netns itself, or a netns on another host (SNAT towards that host)].
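The reverse path relies on a per-socket map keyed by the socket cookie (LB4_REVERSE_NAT_SK_MAP above), which is what lets hooks such as getpeername4 hand the original service address back to the application. A rough sketch of that idea follows; the map name, sizes and exact field layout here are chosen for illustration (they mirror the key/value fields visible in the snippet above, not Cilium's exact definitions).

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Sketch only: a per-socket reverse-NAT map plus a cgroup/getpeername4
 * hook that reports the original service address back to userspace.
 */
struct revnat_key {
    __u64 cookie;   /* socket cookie, see bpf_get_socket_cookie() */
    __u32 address;  /* backend address the socket really talks to */
    __u16 port;
    __u16 pad;
};

struct revnat_val {
    __u32 address;  /* original service VIP */
    __u16 port;     /* original service port */
    __u16 rev_nat_index;
};

struct {
    __uint(type, BPF_MAP_TYPE_LRU_HASH);
    __uint(max_entries, 65536);
    __type(key, struct revnat_key);
    __type(value, struct revnat_val);
} demo_revnat_sk_map SEC(".maps");

SEC("cgroup/getpeername4")
int demo_getpeername4(struct bpf_sock_addr *ctx)
{
    struct revnat_key key = {
        .cookie  = bpf_get_socket_cookie(ctx),
        .address = ctx->user_ip4,
        .port    = (__u16)ctx->user_port,
    };
    struct revnat_val *val = bpf_map_lookup_elem(&demo_revnat_sk_map, &key);

    if (val) {
        /* Report the service VIP, not the backend, to the caller. */
        ctx->user_ip4  = val->address;
        ctx->user_port = val->port;
    }
    return 1;
}

char _license[] SEC("license") = "GPL";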

A NetEase article makes the same point; borrowing their diagram: image

laik commented 2 years ago

Having read through everything from top to bottom, the remaining programs — to_overlay/from_overlay, to_host and so on — work on much the same principles. The eBPF approach makes it possible to implement an SDN datapath simply and efficiently; Google Cloud VMs also use a 32-bit subnet mask, and I suspect their implementation is similar.

Suppose we wanted to build a custom overlay network on top of cilium: replacing to_overlay/from_overlay would be enough (see the sketch below). Spreading subnet CIDRs across different hosts, or even more elaborate features such as VPCs, could likewise be built on this foundation without much trouble.
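To make the "replace to_overlay" idea a bit more concrete: cilium's overlay path essentially attaches tunnel metadata to the packet and redirects it into the collect_md vxlan device (cilium_vxlan). A minimal sketch of that pattern follows; the ifindex, VNI and remote node IP are placeholders, and a real datapath would look them up from its own maps rather than hard-coding them.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

/* Sketch of the encap-and-redirect pattern behind an overlay datapath:
 * set tunnel metadata, then hand the packet to a vxlan device created
 * in external (collect_md) mode, which performs the encapsulation.
 */
#define TUNNEL_DEV_IFINDEX 9        /* placeholder: the collect_md vxlan device */

SEC("tc")
int demo_to_overlay(struct __sk_buff *skb)
{
    struct bpf_tunnel_key key = {};

    key.tunnel_id   = 2;            /* VNI; cilium carries the security identity here */
    key.remote_ipv4 = 0xc0a80287;   /* 192.168.2.135 (host byte order), remote node */
    key.tunnel_ttl  = 64;

    if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
                               BPF_F_ZERO_CSUM_TX) < 0)
        return TC_ACT_SHOT;

    /* Hand the packet to the tunnel device for encapsulation. */
    return bpf_redirect(TUNNEL_DEV_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";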

laik commented 2 years ago

A deeper look at cilium's sock layer

To understand how the programs in bpf_sock.c hook into the cgroup, some Linux kernel version history helps. In the NetEase diagrams above, one is for the 5.4 kernel and the other for 5.10, and in their diagram pod-to-pod-on-another-node traffic leaves the host directly. Note, though, that newer cilium releases default to vxlan mode, which differs substantially from the host-routing mode shown in the NetEase analysis. As for why cilium 1.11 went with vxlan/geneve, the author's stated reason is that it is simpler. Early versions of cilium also used an ipvlan mode.

First, let's get acquainted with the two helpers that are new in 5.10: bpf_redirect_peer and bpf_redirect_neigh.

The commit and design notes for bpf_redirect_peer:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9aa1206e8f48

image

The commit and design notes for bpf_redirect_neigh:

https://lore.kernel.org/bpf/f207de81629e1724899b73b8112e0013be782d35.1601477936.git.daniel@iogearbox.net/

image

One way to think about it: one of them eliminates the extra forwarding hop between network namespaces, the other eliminates the extra hop between a namespace and the host stack.
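To make the two helpers concrete, here is a minimal sketch that uses both from tc programs; the ifindex values are placeholders, whereas the real datapath shown further below derives them from its endpoint and FIB lookups.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

#define LOCAL_POD_VETH_IFINDEX 23   /* placeholder: host side of the target pod's veth pair */
#define EXTERNAL_DEV_IFINDEX   2    /* placeholder: the node's external NIC */

/* Pod-to-local-pod: jump straight onto the ingress of the peer device
 * inside the destination netns, skipping the host backlog queue.
 */
SEC("tc")
int demo_redirect_peer(struct __sk_buff *skb)
{
    return bpf_redirect_peer(LOCAL_POD_VETH_IFINDEX, 0);
}

/* Pod-to-outside: push the packet out of the NIC; with no explicit
 * nexthop passed, the kernel resolves the gateway's L2 address itself.
 */
SEC("tc")
int demo_redirect_neigh(struct __sk_buff *skb)
{
    return bpf_redirect_neigh(EXTERNAL_DEV_IFINDEX, NULL, 0, 0);
}

char _license[] SEC("license") = "GPL";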

The diagram below shows the paths such redirects take. image

Since traffic ultimately lands in the from-container program in bpf_lxc.c, where exactly do we see that traffic to a local pod uses bpf_redirect_peer?

In bpf_lxc.c's from-container path, when the destination endpoint turns out to be a local netns:
Snippet 1:
        ep = lookup_ip4_endpoint(ip4);
        if (ep) {
#ifdef ENABLE_ROUTING
            if (ep->flags & ENDPOINT_F_HOST) {
#ifdef HOST_IFINDEX
                goto to_host;
#else
                return DROP_HOST_UNREACHABLE;
#endif
            }
#endif /* ENABLE_ROUTING */
            policy_clear_mark(ctx);
            return ipv4_local_delivery(ctx, l3_off, SECLABEL, ip4,          ---> deliver to the local pod
                           ep, METRIC_EGRESS, false);
        }

Snippet 2: the definition of ipv4_local_delivery

/* Performs IPv4 L2/L3 handling and delivers the packet to the destination pod
 * on the same node, either via the stack or via a redirect call.
 * Depending on the configuration, it may also enforce ingress policies for the
 * destination pod via a tail call.
 */
static __always_inline int ipv4_local_delivery(struct __ctx_buff *ctx, int l3_off,
                           __u32 seclabel, struct iphdr *ip4,
                           const struct endpoint_info *ep,
                           __u8 direction __maybe_unused,
                           bool from_host __maybe_unused)
{
    mac_t router_mac = ep->node_mac;
    mac_t lxc_mac = ep->mac;
    int ret;

    cilium_dbg(ctx, DBG_LOCAL_DELIVERY, ep->lxc_id, seclabel);

    ret = ipv4_l3(ctx, l3_off, (__u8 *) &router_mac, (__u8 *) &lxc_mac, ip4);
    if (ret != CTX_ACT_OK)
        return ret;

#ifdef LOCAL_DELIVERY_METRICS
    /*
     * Special LXC case for updating egress forwarding metrics.
     * Note that the packet could still be dropped but it would show up
     * as an ingress drop counter in metrics.
     */
    update_metrics(ctx_full_len(ctx), direction, REASON_FORWARDED);
#endif

#if defined(USE_BPF_PROG_FOR_INGRESS_POLICY) && \
    !defined(FORCE_LOCAL_POLICY_EVAL_AT_SOURCE)
    ctx->mark |= MARK_MAGIC_IDENTITY;
    set_identity_mark(ctx, seclabel);

    return redirect_ep(ctx, ep->ifindex, from_host);       ---> note this function
#else
    /* Jumps to destination pod's BPF program to enforce ingress policies. */
    ctx_store_meta(ctx, CB_SRC_LABEL, seclabel);
    ctx_store_meta(ctx, CB_IFINDEX, ep->ifindex);
    ctx_store_meta(ctx, CB_FROM_HOST, from_host ? 1 : 0);

    tail_call_dynamic(ctx, &POLICY_CALL_MAP, ep->lxc_id);
    return DROP_MISSED_TAIL_CALL;
#endif
}

The definition of redirect_ep (the "ep" here stands for endpoint):

static __always_inline int redirect_ep(struct __ctx_buff *ctx __maybe_unused,
                       int ifindex __maybe_unused,
                       bool needs_backlog __maybe_unused)
{
    // Note the original comment below.
    /* If our datapath has proper redirect support, we make use
     * of it here, otherwise we terminate tc processing by letting
     * stack handle forwarding e.g. in ipvlan case.
     *
     * Going via CPU backlog queue (aka needs_backlog) is required
     * whenever we cannot do a fast ingress -> ingress switch but
     * instead need an ingress -> egress netns traversal or vice
     * versa.
     */
#ifdef ENABLE_HOST_REDIRECT
    if (needs_backlog || !is_defined(ENABLE_REDIRECT_FAST)) {
        return ctx_redirect(ctx, ifindex, 0);
    } else {
# ifdef ENCAP_IFINDEX
        /* When coming from overlay, we need to set packet type
         * to HOST as otherwise we might get dropped in IP layer.
         */
        ctx_change_type(ctx, PACKET_HOST);
# endif /* ENCAP_IFINDEX */
        return ctx_redirect_peer(ctx, ifindex, 0);       ----> and here it is: the call, matching the comment above
    }
#else
    return CTX_ACT_OK;
#endif /* ENABLE_HOST_REDIRECT */
}

Still in the from-container program of bpf_lxc.c: where do we see that traffic leaving the node uses bpf_redirect_neigh?

Inside the handle_ipv4_from_lxc() function:

    if (is_defined(ENABLE_REDIRECT_FAST))
        return redirect_direct_v4(ctx, l3_off, ip4);

    goto pass_to_stack; --------------> a delightfully self-explanatory label; the name tells you exactly what it does.

The redirect_direct_v4 function:
static __always_inline int
redirect_direct_v4(struct __ctx_buff *ctx __maybe_unused,
           int l3_off __maybe_unused,
           struct iphdr *ip4 __maybe_unused)
{
    /* For deployments with just single external dev, redirect_neigh()
     * will resolve the GW and do L2 resolution for us. For multi-device
     * deployments we perform a FIB lookup prior to the redirect. If the
     * neigh entry cannot be resolved, we ask redirect_neigh() to do it,
     * otherwise we can directly call redirect().
     */
    bool no_neigh = is_defined(ENABLE_SKIP_FIB);
    int ret, oif = DIRECT_ROUTING_DEV_IFINDEX;
    struct bpf_redir_neigh *nh = NULL;
# ifndef ENABLE_SKIP_FIB
    struct bpf_redir_neigh nh_params;
    struct bpf_fib_lookup fib_params = {
        .family     = AF_INET,
        .ifindex    = ctx->ingress_ifindex,
        .ipv4_src   = ip4->saddr,
        .ipv4_dst   = ip4->daddr,
    };

    ret = fib_lookup(ctx, &fib_params, sizeof(fib_params),
             BPF_FIB_LOOKUP_DIRECT);
    switch (ret) {
    case BPF_FIB_LKUP_RET_SUCCESS:
        break;
    case BPF_FIB_LKUP_RET_NO_NEIGH:
        /* GW could also be v6, so copy union. */
        nh_params.nh_family = fib_params.family;
        __bpf_memcpy_builtin(&nh_params.ipv6_nh, &fib_params.ipv6_dst,
                     sizeof(nh_params.ipv6_nh));
        no_neigh = true;
        nh = &nh_params;
        break;
    default:
        return CTX_ACT_DROP;
    }

    oif = fib_params.ifindex;
# endif /* ENABLE_SKIP_FIB */

    ret = ipv4_l3(ctx, l3_off, NULL, NULL, ip4);
    if (unlikely(ret != CTX_ACT_OK))
        return ret;
    if (no_neigh)
        return redirect_neigh(oif, nh, nh ? sizeof(*nh) : 0, 0); -----> redirect_neigh, discussed above; cilium uses only these three redirect helpers in total.
# ifndef ENABLE_SKIP_FIB
    if (eth_store_daddr(ctx, fib_params.dmac, 0) < 0)
        return CTX_ACT_DROP;
    if (eth_store_saddr(ctx, fib_params.smac, 0) < 0)
        return CTX_ACT_DROP;
    return ctx_redirect(ctx, oif, 0);
# endif /* ENABLE_SKIP_FIB */
    return CTX_ACT_DROP;
}