polycube-network / polycube

eBPF/XDP-based software framework for fast network services running in the Linux kernel.
Apache License 2.0
504 stars 102 forks source link

IP routing with tc #412

Closed RuchiSaluja8 closed 2 years ago

RuchiSaluja8 commented 2 years ago

Hi all, this issue is not exactly about Polycube, but rather about eBPF/bcc. I am also researching these topics and thought I would ask about the following issue here. It would be really helpful if the team could go through the code and make some suggestions. I am trying to perform IP routing using tc. I am taking guidance from two repos: lb-from-scratch and ebpf-networking. The client and backend are running in containers on my system, and the load balancer (LB) is receiving packets on the docker0 interface. However, when a connection is initiated from the client to the LB, the client continuously retries sending the first SYN packet. Below is the BPF kernel code:

#include <bcc/proto.h>
#include <linux/bpf.h>
#include <uapi/linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/pkt_cls.h>
#include <linux/icmp.h>

/*
 * Pack the dotted-quad x.y.z.w into a 32-bit value with the first octet in
 * the least-significant byte (so IP_ADDRESS(172,17,0,2) == 0x020011ac).
 *
 * Every argument is parenthesized and widened to uint32_t before shifting:
 * this keeps expression arguments (e.g. `a|b`) from being re-associated by
 * operator precedence and avoids shifting a signed int into its sign bit
 * when the last octet is >= 128.
 */
#define IP_ADDRESS(x,y,z,w) ((uint32_t)((uint32_t)(x) + ((uint32_t)(y) << 8) + ((uint32_t)(z) << 16) + ((uint32_t)(w) << 24)))
/* Flag OR-ed into the bpf_l4_csum_replace() flags argument by the code below. */
#define IS_PSEUDO 0x10

/*
 * One endpoint (client or backend) that a packet can be rewritten towards.
 */
struct address{
    unsigned char mac[6]; /* Ethernet address, copied byte-by-byte into eth->h_dest */
    uint32_t ip;          /* IPv4 address as built by IP_ADDRESS(); stored into ip->daddr */
    u16 port;             /* TCP port; currently not written into the packet (port rewrite is commented out) */
};

/*
 * Offsets of the checksum fields measured from the start of the packet,
 * which is what bpf_l3_csum_replace() / bpf_l4_csum_replace() expect.
 * The packet begins with the Ethernet header, and the TCP header is taken
 * to follow an option-less IPv4 header (sizeof(struct iphdr) bytes).
 */
#define IP_CSUM_OFF (sizeof(struct ethhdr) + offsetof(struct iphdr, check))
#define TCP_CSUM_OFF (sizeof(struct ethhdr) + sizeof(struct iphdr) + offsetof(struct tcphdr, check))

/*
 * Rewrite the L2/L3 addresses of a TCP packet so that it appears to come
 * from its original destination and is addressed to `addr`, then patch the
 * IPv4 and TCP checksums accordingly.
 *
 *   skb  - packet context, used by the checksum helpers
 *   addr - new destination (MAC and IPv4 address are used; port is not)
 *   eth  - bounds-checked pointer to the Ethernet header inside the packet
 *   ip   - bounds-checked pointer to the IPv4 header inside the packet
 *   tcp  - bounds-checked pointer to the TCP header (only read for tracing)
 *
 * After the rewrite:
 *   source MAC/IP      = previous destination MAC/IP
 *   destination MAC/IP = addr->mac / addr->ip
 * TCP ports are left untouched.
 *
 * The packet pointers must not be used by the caller after this function
 * returns, because the checksum helpers may invalidate them.
 */
static __always_inline void change_address_and_update_crc(
    struct __sk_buff * skb,
    struct address * addr,
    struct ethhdr * eth,
    struct iphdr * ip,
    struct tcphdr * tcp){

    /* Keep the values needed for the incremental checksum update. */
    uint32_t orig_src_ip = ip->saddr;
    uint32_t dst_ip = addr->ip;

    /* The previous destination becomes the new source. */
    eth->h_source[0] = eth->h_dest[0];
    eth->h_source[1] = eth->h_dest[1];
    eth->h_source[2] = eth->h_dest[2];
    eth->h_source[3] = eth->h_dest[3];
    eth->h_source[4] = eth->h_dest[4];
    eth->h_source[5] = eth->h_dest[5];
    ip->saddr = ip->daddr;

    /* The new destination is taken from `addr`. */
    eth->h_dest[0] = addr->mac[0];
    eth->h_dest[1] = addr->mac[1];
    eth->h_dest[2] = addr->mac[2];
    eth->h_dest[3] = addr->mac[3];
    eth->h_dest[4] = addr->mac[4];
    eth->h_dest[5] = addr->mac[5];
    ip->daddr = dst_ip;

    bpf_trace_printk("AFTER CHANGING\n");
    bpf_trace_printk("src mac address [0-2] : %u %u %u\n", eth->h_source[0], eth->h_source[1], eth->h_source[2]);
    bpf_trace_printk("src mac address [3-5] : %u %u %u\n", eth->h_source[3], eth->h_source[4], eth->h_source[5]);
    bpf_trace_printk("dst mac address [0-2] : %u %u %u\n", eth->h_dest[0], eth->h_dest[1], eth->h_dest[2]);
    bpf_trace_printk("dst mac address [3-5] : %u %u %u\n", eth->h_dest[3], eth->h_dest[4], eth->h_dest[5]);
    bpf_trace_printk("ip addresses : %u %u\n", ip->saddr, ip->daddr);
    bpf_trace_printk("ports : %u %u\n", tcp->source, tcp->dest);

    /*
     * The address pair changed from {orig_src, orig_dst} to
     * {orig_dst, dst_ip}. Because the Internet checksum is a commutative
     * sum, that is equivalent to replacing orig_src with dst_ip once, so a
     * single replace per checksum is enough.
     *
     * The addresses are passed as the raw 32-bit values read from / written
     * to the packet; wrapping them in htons() would truncate them to 16 bits
     * and corrupt the IP header checksum.
     */
    bpf_l4_csum_replace(skb, TCP_CSUM_OFF, orig_src_ip, dst_ip, IS_PSEUDO | sizeof(dst_ip));
    bpf_l3_csum_replace(skb, IP_CSUM_OFF, orig_src_ip, dst_ip, sizeof(dst_ip));
}

int tc(struct __sk_buff * skb){
    int key=0;
    struct address client_addr, server_addr;

    server_addr.ip = IP_ADDRESS(172,17,0,2);
    server_addr.port = 8000;

    server_addr.mac[5] = 0x02;
    server_addr.mac[4] = 0x00;
    server_addr.mac[3] = 0x11;
    server_addr.mac[2] = 0xac;
    server_addr.mac[1] = 0x42;
    server_addr.mac[0] = 0x02;

    client_addr.ip = IP_ADDRESS(172,17,0,3);
    client_addr.port = 8000;

    client_addr.mac[5] = 0x03;
    client_addr.mac[4] = 0x00;
    client_addr.mac[3] = 0x11;
    client_addr.mac[2] = 0xac;
    client_addr.mac[1] = 0x42;
    client_addr.mac[0] = 0x02;

    bpf_trace_printk("pckt rcvd\n");
    void *data = (void *)(long)skb->data;
    void *data_end = (void *)(long)skb->data_end;
    struct ethhdr *eth = data;
    if ((void *)eth + sizeof(*eth) <= data_end)
    {
        struct iphdr *ip = data + sizeof(*eth);
        if ((void *)ip + sizeof(*ip) <= data_end)
        {
            if (ip->protocol == IPPROTO_TCP)
            {
                struct tcphdr *tcp = (void *)ip + sizeof(*ip);
                if ((void *)tcp + sizeof(*tcp) <= data_end)
                {
                    if(tcp->dest==ntohs(8000)){
                        uint64_t src_mac, dst_mac;
                        uint32_t src_ip, dst_ip;
                        u16 src_port, dst_port;
                        bpf_trace_printk("BEFORE CHANGING\n");
                        bpf_trace_printk("src mac address [0-2] : %u %u %u\n", eth->h_source[0], eth->h_source[1], eth->h_source[2]);
                        bpf_trace_printk("src mac address [3-5] : %u %u %u\n", eth->h_source[3], eth->h_source[4], eth->h_source[5]);
                        bpf_trace_printk("dst mac address [0-2] : %u %u %u\n", eth->h_dest[0], eth->h_dest[1], eth->h_dest[2]);
                        bpf_trace_printk("dst mac address [3-5] : %u %u %u\n", eth->h_dest[3], eth->h_dest[4], eth->h_dest[5]);
                        bpf_trace_printk("ip addresses : %u %u\n", ip->saddr, ip->daddr);
                        bpf_trace_printk("ports : %u %u\n", tcp->source, tcp->dest);

                        if(eth->h_source[5]==0x03){ //packet from client to server
                            bpf_trace_printk("CLIENT TO LB TO BACKEND\n");
                            change_address_and_update_crc(skb, &server_addr, eth, ip, tcp);
                            //bpf_redirect(6, 0);
                            return TC_ACT_REDIRECT;
                        }
                        else{
                            bpf_trace_printk("BACKEND TO LB TO CLIENT\n");
                            change_address_and_update_crc(skb, &client_addr, eth, ip, tcp);
                           // bpf_redirect(8, 0);
                           return TC_ACT_REDIRECT;
                        }
                    }   
                }
            }
        }
    }
    return TC_ACT_OK;
}

I'm not sure what I'm missing. For the checksum, I feel using the bpf helper functions bpf_l4_csum_replace and bpf_l3_csum_replace should be sufficient. In another article, I have read not to change the checksum at all.

frisso commented 2 years ago

Closing, as this issue is not related to Polycube. Better to post to the bcc mlist instead.