iovisor / bcc

BCC - Tools for BPF-based Linux IO analysis, networking, monitoring, and more
Apache License 2.0
19.88k stars 3.8k forks source link

verifier failure for a xdp code computing udp checksum #2463

Open yonghong-song opened 4 years ago

yonghong-song commented 4 years ago

I filed this issue based on some old discussions in iovisor-dev mailing list. https://lists.iovisor.org/g/iovisor-dev/topic/30285987

I finally got some time to explore the solution for this and created a standalone example. Just to recap, the following is a simplified program.

-bash-4.4$ cat xdp_simple.c
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <linux/icmp.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/perf_event.h>

#include "bpf_helpers.h"
#include "bpf_endian.h"

/* 0x3FFF mask to check for fragment offset field */
#define IP_FRAGMENTED 65343

// MAC address
typedef unsigned char mac[6];

// Real Server structure (MAC address + IP address)
struct server {
    __be32 ipAddr;
    unsigned char macAddr[ETH_ALEN];
};

// packet structure to log load balancing
struct packet {
    unsigned char dmac[ETH_ALEN];
    unsigned char smac[ETH_ALEN];
    __be32 daddr;
    __be32 saddr;
};

struct {
        __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
        __uint(key_size, sizeof(int));
        __uint(value_size, sizeof(int));
} events SEC(".maps");

// A map which contains port to redirect
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 10);
        __type(key, __be16);
        __type(value, int);
} ports SEC(".maps");

// A map which contains real server 
struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 10);
        __type(key, int);
        __type(value, struct server);
} realServers SEC(".maps");
// Virtual IP is accessible via the '0x5' constant

SEC("xdpclient")
int xdp_prog(struct xdp_md *ctx) {
    void *data_end = (void *)(long)ctx->data_end;
    void *data = (void *)(long)ctx->data;

    struct ethhdr * eth = data;
    if (eth + 1 > data_end)
        return XDP_DROP;

    if (eth->h_proto != bpf_htons(ETH_P_IP)){
        return XDP_PASS;
    }

    struct iphdr *iph;
    iph = eth + 1;
    if (iph + 1 > data_end)
        return XDP_DROP;

    if (iph->ihl < 5)
        return XDP_DROP;
    if (iph->ihl != 5) 
        return XDP_PASS;

    if (iph->protocol != IPPROTO_UDP) {
        return XDP_PASS;
    }

    struct udphdr *udp;
    udp = iph + 1;
    if (udp + 1 > data_end)
        return XDP_DROP;
    __u16 udp_len = bpf_ntohs(udp->len);
    if (udp_len < 8 || udp_len >= 512)
        return XDP_DROP;
    udp_len &= 0x1ff;
    if ((void *) udp + udp_len > data_end)
        return XDP_DROP;

    // Update UDP checksum
    __u64 cs = 0;
    udp->check = 0;
    cs = bpf_csum_diff(0, 0, udp, udp_len, cs);
    udp->check = cs;

    // Log packet after
    struct packet pkt = {};
    memcpy(&pkt, data, sizeof(pkt)); // crappy
    pkt.daddr = iph->daddr;
    pkt.saddr = iph->saddr;
    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &pkt,sizeof(pkt));

    return XDP_TX;
}

char _license[] SEC("license") = "GPL";

The compilation command line looks like

clang -target bpf -O2 -c -I/home/yhs/work/linux/tools/include/uapi -I. -g xdp_simple.c

where the above include path pointing to linux repo for two helper header files.

Using bpftool (linux/tools/bpf/bpftool) to load and can reproduce the verifier issue:

./bpftool -d prog load ./xdp_simple.o /sys/fs/bpf/xdp_example type xdp
45: R0_w=inv1 R1_w=inv0 R2_w=pkt(id=1,off=34,r=34,umax_value=511,var_off=(0x0; 0x1ff)) R3=pkt(id=0,off=34,r=42,imm=0) R4_w=invP(id=0,umax_value=511,var_off=(0x0; 0x1ff)
) R6=ctx(id=0,off=0,imm=0) R7=pkt(id=0,off=0,r=42,imm=0) R10=fp0
; udp->check = 0;
45: (6b) *(u16 *)(r7 +40) = r1
46: R0_w=inv1 R1_w=inv0 R2_w=pkt(id=1,off=34,r=34,umax_value=511,var_off=(0x0; 0x1ff)) R3=pkt(id=0,off=34,r=42,imm=0) R4_w=invP(id=0,umax_value=511,var_off=(0x0; 0x1ff)
) R6=ctx(id=0,off=0,imm=0) R7=pkt(id=0,off=0,r=42,imm=0) R10=fp0
; cs = bpf_csum_diff(0, 0, udp, udp_len, cs);
46: (b7) r1 = 0
47: R0_w=inv1 R1_w=inv0 R2_w=pkt(id=1,off=34,r=34,umax_value=511,var_off=(0x0; 0x1ff)) R3=pkt(id=0,off=34,r=42,imm=0) R4_w=invP(id=0,umax_value=511,var_off=(0x0; 0x1ff)
) R6=ctx(id=0,off=0,imm=0) R7=pkt(id=0,off=0,r=42,imm=0) R10=fp0
47: (b7) r2 = 0
48: R0_w=inv1 R1_w=inv0 R2_w=inv0 R3=pkt(id=0,off=34,r=42,imm=0) R4_w=invP(id=0,umax_value=511,var_off=(0x0; 0x1ff)) R6=ctx(id=0,off=0,imm=0) R7=pkt(id=0,off=0,r=42,imm
=0) R10=fp0
48: (b7) r5 = 0
49: R0_w=inv1 R1_w=inv0 R2_w=inv0 R3=pkt(id=0,off=34,r=42,imm=0) R4_w=invP(id=0,umax_value=511,var_off=(0x0; 0x1ff)) R5_w=inv0 R6=ctx(id=0,off=0,imm=0) R7=pkt(id=0,off=
0,r=42,imm=0) R10=fp0
49: (85) call bpf_csum_diff#28
last_idx 49 first_idx 37
regs=4 stack=0 before 48: (b7) r5 = 0
regs=4 stack=0 before 47: (b7) r2 = 0
invalid access to packet, off=34 size=511, R3(id=0,off=34,r=42)
R3 offset is outside of the packet
yonghong-song commented 4 years ago

Note that I am using latest clang and latest bpf-next repo for the above experiments.

In the original code, there are three verifier related issues:

  1. scalar with a variable range spilled to stack and when restored, the range is gone
  2. packet pointer is refined later with a different id=1 due to variable udp length, but due to compiler optimization, id=0 packet pointer is still used for later bpf_csum_diff. The id=0 packet pointer is less precise than id=1 one.
  3. fundamentally, for bpf_csum_diff(), the verifier needs to verify the udp + udp_size. but udp has definite range (42, up to end of udp header, and optional max range to 511) and udp_size has maximum length to 511. But all these variable length won't play well in verifier, it is possible actual udp_len = 100 and packet range is 50. This will cause issues.

Improving verifier, esp. for 2 and 3 will be complex.

yonghong-song commented 4 years ago

So, considering all such complexity comes from calculating checksum for udp payload. After discussing with @4ast, one idea is to just implement a new helper to calculate checksum for the packet in xdp environment, similar to bpf_l4_csum_replace() but in xdp environment.

sbernard31 commented 4 years ago

@yonghong-song, as you explained I faced those issue when I tried to calculate UDP checksum.

My first idea was to just recalculate all checksum from scratch because this seemed more straightforward. But handling "dynamic payload size" triggered all this "bpf verifier vs clang compilation" issues.

As I didn't find any good solution/workaround to do that, I finally explored a more efficient way to calculate checksum which didn't need to handle "dynamic payload size" : Computation of the Internet Checksum via Incremental Update (RFC 1624). This way I only manage data with "static size". Here is my code which is not finished but functional for now. You can have a look at the checksum calculation part

About bpf_l4_csum_replace(), I looked at the documentation and finally it seems it does something like my update_csum function but more generic :

static inline void update_csum(__u64 *csum, __be32 old_addr,__be32 new_addr ) {

As calculating checksum seems to be a common use case for XDP. I think it makes totally sense to provide a function like this!

About the real issues you exposed in https://github.com/iovisor/bcc/issues/2463#issuecomment-512503958, finally the solution is maybe more at compilation time, maybe a specific "compilation profile" should be created to produce more "verifier compliant" code ? (I didn't know so much about that so maybe my remark is not so pertinent :grin:)

Last point, my feedback about bcc, I know that bcc was not really advised(https://github.com/facebookincubator/katran/issues/29#issuecomment-475114910) for creating a loadbalancer but currently I really appreciate to use it :+1: ! The main limitation for my use case is https://github.com/iovisor/bcc/issues/2223 which limit me to handle all my need in only 1 script but this is clearly out of topic.

yonghong-song commented 4 years ago

@sbernard31 Thanks for the feedback. Great to see you already got a workaround and it works. Regarding to

As calculating checksum seems to be a common use case for XDP. I think it makes totally sense to provide a function like this!

My original thinking is a helper to do a whole checksum calculation for a section of code, e.g., udp payload.

Since in xdp there is no skb involvement, for incremental update (no touching udp payload), yes bpf program can implement it easily. We can have a helper, but less value. So I will not pursue it now unless there is a strong need and additional requirement where doing in bpf program becomes hard.

sbernard31 commented 4 years ago

My original thinking is a helper to do a whole checksum calculation for a section of code, e.g., udp payload.

I don't know if there is pertinent use case where you can not use incremental checksum update. (I guess not so much :thinking:)

The problem is maybe more a documentation one, as "noob" like me will maybe take time before to understand they should use incremental checksum update instead of a full checksum calculation.

yonghong-song commented 4 years ago

True. I probably should first point out this also instead of 100% focusing on solving verifier complains :-)

sbernard31 commented 4 years ago

@yonghong-song, maybe this new patches in bfp would help to resolve both issues I opened :

yonghong-song commented 4 years ago

@sbernard31 bounded loop won't help here. The two issues you filed are straight line codes, no loop involved.

sbernard31 commented 4 years ago

To support loop it seems they enhanced verifier :

The compiler often puts variables on the stack (or "spills" them) when it needs to free up some registers. The verifier should be able to track such variables, since it may need to make decisions based on their contents. Until now it did not do so, resulting in certain programs being incorrectly rejected.

and

Comparisons involving variable values are clearly harder, which is why, until now, the verifier supported only comparisons of a register with a constant. In this patch set, Starovoitov added support for tests comparing two registers as well. This enhancement was necessary to be able to simulate the execution of loop tests.

yonghong-song commented 4 years ago

I see. Currently verifier only tracks spilled register with 'constant' values. In the udp load balancer case, the variable udp_len has a variable range, 0 <= udp_len <= 511. So the current verifier did not track that.

On the other hand, it is not too hard to add tracking for registers with non-constant range. I actually have tried to add this support to verifier. But this alone won't solve the ulb issue. The big issue is, as I stated in one of my earlier comment, with

fundamentally, for bpf_csum_diff(), the verifier needs to verify the udp + udp_size. but
udp has definite range (42, up to end of udp header, and optional max range to 511)
and udp_size has maximum length to 511. But all these variable length won't play well
in verifier, it is possible actual udp_len = 100 and packet range is 50. This will cause
issues.

So I did not even try to upstream the patch to check register with non-constant range. Note that it may have some side effects on verifier performance, which needs to be measured.

rayoluo commented 3 years ago

https://lists.iovisor.org/g/iovisor-dev/topic/30285987

Have you solved the problem on this mailing list so far?I recently try to recalculate the checksum based on an incorrect checksum value, in which I can't calculate the Internet Checksum via incremental update.

sbernard31 commented 3 years ago

Nope, I didn't try more since incremental update fits my needs. Do not hesitate to share here if you find a solution. (maybe here you can find some hint around Katran or polycube : https://github.com/AirVantage/sbulb#xdpbpf)

sbernard31 commented 3 years ago

By the way, I'm not sure you should fix an incoming checksum. If checksum is invalid this means that maybe the packet was altered and you probably don't want to fix it.

FedeParola commented 3 years ago

Hi @Rayoluo, if you need to compute the checksum from scratch you can use a bounded loop:

#define MAX_UDP_LENGTH 1480

[...]

u32 csum_buffer = 0;
u16 *buf = (void *)udp;

// Compute pseudo-header checksum
csum_buffer += (u16)ip->saddr;
csum_buffer += (u16)(ip->saddr >> 16);
csum_buffer += (u16)ip->daddr;
csum_buffer += (u16)(ip->daddr >> 16);
csum_buffer += (u16)ip->protocol << 8;
csum_buffer += udp->len;

// Compute checksum on udp header + payload
for (int i = 0; i < MAX_UDP_LENGTH; i += 2) {
  if ((void *)(buf + 1) > data_end) {
    break;
  }
  csum_buffer += *buf;
  buf++;
}
if ((void *)buf + 1 <= data_end) {
  // In case payload is not 2 bytes aligned
  csum_buffer += *(u8 *)buf;
}

u16 csum = (u16)csum_buffer + (u16)(csum_buffer >> 16);
csum = ~csum;

Edit: I just figured out the algorithm wasn't correct, I was performing a simple sum on the packet instead of the one's complement sum, fixed it.

rayoluo commented 3 years ago

Hi @FedeParola, thanks a million!! I'll fix my code in this way. 👍

gamemann commented 3 years ago

@FedeParola Thank you for this! I found this issue earlier today and have been trying to get things to work with the original code you provided. I then came back to this issue again and saw you just updated the code five hours ago. Great timing, haha!

The new code you provided works for me when calculating the UDP checksum plus payload from scratch and I made it into a function for anybody interested. I also believe it should work with the TCP checksums as well if you change struct udphdr *udph to struct tcphdr *tcph for example and modify the rest of the function.

#define MAX_UDP_SIZE 1480

...

/* All credit goes to FedeParola from https://github.com/iovisor/bcc/issues/2463 */
__attribute__((__always_inline__))
static inline __u16 caludpcsum(struct iphdr *iph, struct udphdr *udph, void *data_end)
{
    __u32 csum_buffer = 0;
    __u16 *buf = (void *)udph;

    // Compute pseudo-header checksum
    csum_buffer += (__u16)iph->saddr;
    csum_buffer += (__u16)(iph->saddr >> 16);
    csum_buffer += (__u16)iph->daddr;
    csum_buffer += (__u16)(iph->daddr >> 16);
    csum_buffer += (__u16)iph->protocol << 8;
    csum_buffer += udph->len;

    // Compute checksum on udp header + payload
    for (int i = 0; i < MAX_UDP_SIZE; i += 2) 
    {
        if ((void *)(buf + 1) > data_end) 
        {
            break;
        }

        csum_buffer += *buf;
        buf++;
    }

    if ((void *)buf + 1 <= data_end) 
    {
        // In case payload is not 2 bytes aligned
        csum_buffer += *(__u8 *)buf;
    }

    __u16 csum = (__u16)csum_buffer + (__u16)(csum_buffer >> 16);
    csum = ~csum;

    return csum;
}

...

udph->check = 0;
udph->check = caludpcsum(iph, udph, data_end);
luokaink commented 1 year ago

@FedeParola Thank you for this! I found this issue earlier today and have been trying to get things to work with the original code you provided. I then came back to this issue again and saw you just updated the code five hours ago. Great timing, haha!

The new code you provided works for me when calculating the UDP checksum plus payload from scratch and I made it into a function for anybody interested. I also believe it should work with the TCP checksums as well if you change struct udphdr *udph to struct tcphdr *tcph for example and modify the rest of the function.

#define MAX_UDP_SIZE 1480

...

/* All credit goes to FedeParola from https://github.com/iovisor/bcc/issues/2463 */
__attribute__((__always_inline__))
static inline __u16 caludpcsum(struct iphdr *iph, struct udphdr *udph, void *data_end)
{
    __u32 csum_buffer = 0;
    __u16 *buf = (void *)udph;

    // Compute pseudo-header checksum
    csum_buffer += (__u16)iph->saddr;
    csum_buffer += (__u16)(iph->saddr >> 16);
    csum_buffer += (__u16)iph->daddr;
    csum_buffer += (__u16)(iph->daddr >> 16);
    csum_buffer += (__u16)iph->protocol << 8;
    csum_buffer += udph->len;

    // Compute checksum on udp header + payload
    for (int i = 0; i < MAX_UDP_SIZE; i += 2) 
    {
        if ((void *)(buf + 1) > data_end) 
        {
            break;
        }

        csum_buffer += *buf;
        buf++;
    }

    if ((void *)buf + 1 <= data_end) 
    {
        // In case payload is not 2 bytes aligned
        csum_buffer += *(__u8 *)buf;
    }

    __u16 csum = (__u16)csum_buffer + (__u16)(csum_buffer >> 16);
    csum = ~csum;

    return csum;
}

...

udph->check = 0;
udph->check = caludpcsum(iph, udph, data_end);

This code does not work in ubutun

Hi @rayoluo, if you need to compute the checksum from scratch you can use a bounded loop:

#define MAX_UDP_LENGTH 1480

[...]

u32 csum_buffer = 0;
u16 *buf = (void *)udp;

// Compute pseudo-header checksum
csum_buffer += (u16)ip->saddr;
csum_buffer += (u16)(ip->saddr >> 16);
csum_buffer += (u16)ip->daddr;
csum_buffer += (u16)(ip->daddr >> 16);
csum_buffer += (u16)ip->protocol << 8;
csum_buffer += udp->len;

// Compute checksum on udp header + payload
for (int i = 0; i < MAX_UDP_LENGTH; i += 2) {
  if ((void *)(buf + 1) > data_end) {
    break;
  }
  csum_buffer += *buf;
  buf++;
}
if ((void *)buf + 1 <= data_end) {
  // In case payload is not 2 bytes aligned
  csum_buffer += *(u8 *)buf;
}

u16 csum = (u16)csum_buffer + (u16)(csum_buffer >> 16);
csum = ~csum;

Edit: I just figured out the algorithm wasn't correct, I was performing a simple sum on the packet instead of the one's complement sum, fixed it.

This code does not work for unbutu: 22.04, kernel:5.19.0 and llvm-14, but it works for unbutu: 22.04, kernel:5.19.0 and llvm-12. I think there is a bug in bbc with llvm-14

FedeParola commented 1 year ago

See #4612 for a solution.