F-Stack / f-stack

F-Stack is a high-performance user-space network development kit based on DPDK, the FreeBSD TCP/IP stack, and a coroutine API.
http://www.f-stack.org
Other
3.87k stars 899 forks source link

Extremely Bad Latency on TCP Connection for receiving Data #842

Open winstonzhao opened 2 months ago

winstonzhao commented 2 months ago

Hello, my use case for F-Stack is optimizing TCP packet read speed.

I've set up a simple test in which a Python server sends out timestamp packets:

#!/usr/bin/env python3
# A simple TCP server: after a client sends its first message, stream
# 1e6 messages to it, each made of the current time.time_ns() value
# repeated 200 times. time.time_ns() requires Python 3.7 or newer.
import socket,os
import time

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# SO_REUSEADDR only influences bind() when it is set before bind() is called.
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, b"ens7")
sock.bind(('0.0.0.0', 12373))
sock.listen(5)

while True:
    connection, address = sock.accept()
    # The context manager closes the connection even if a send fails.
    with connection:
        buf = connection.recv(1024)
        print(buf)
        for i in range(int(1e6)):
            # sendall() keeps writing until the whole message is queued;
            # a bare send() may stop after a partial write and drop the
            # tail of a timestamp from the stream.
            connection.sendall((str(time.time_ns()) * 200).encode('utf-8'))

Then I set up a simple receiver, once using native Linux sockets and once using F-Stack, to compare how long it takes to process each of 5M messages.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <arpa/inet.h>
#include <errno.h>
#include <assert.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>

/* Neither of these two values is referenced by the code shown in this program. */
#define SOCK_FSTACK 0x01000000
#define SOCK_KERNEL 0x02000000

/* Capacity of the event array handed to epoll_wait(). */
#define MAX_EVENTS 10
/* Size in bytes of the stack buffer used for each recv() call. */
#define BUFFER_SIZE 4096 

/*
 * Parse the decimal nanosecond timestamp that ends a received chunk and
 * return how far it lies behind the local CLOCK_REALTIME clock.
 *
 * buffer     - NUL-terminated receive buffer.
 * bytes_read - length of the chunk counting the terminating NUL (the
 *              receive loop in this program adds one to the recv() result
 *              before calling).
 *
 * Returns (now - timestamp) in nanoseconds, or -1 when the chunk is too
 * short to contain a full timestamp field.
 */
int64_t get_time_difference(const char *buffer, size_t bytes_read) {
    /*
     * Number of trailing bytes that are parsed, terminating NUL included.
     * Presumably sized for a 19-digit time_ns() value -- verify against the
     * sender.
     */
    const size_t timestamp_field_len = 20;

    /*
     * The parse starts timestamp_field_len bytes before the end of the
     * chunk, so anything shorter would make the unsigned subtraction wrap
     * and read outside the buffer.
     */
    if (bytes_read < timestamp_field_len) {
        return -1;
    }

    int64_t sent_ns = strtoll(buffer + (bytes_read - timestamp_field_len), NULL, 10);

    struct timespec now;
    clock_gettime(CLOCK_REALTIME, &now);

    /* Widen tv_sec first so the multiplication is always done in 64 bits. */
    int64_t now_ns = (int64_t)now.tv_sec * 1000000000 + now.tv_nsec;

    return now_ns - sent_ns;
}

/*
 * Switch a descriptor to non-blocking mode with the FIONBIO ioctl.
 *
 * Returns 0 on success. On failure the error is reported through perror()
 * and -1 is returned.
 */
int set_nonblocking(int sockfd) {
    int enable = 1;
    int rc = ioctl(sockfd, FIONBIO, &enable);

    if (rc == 0) {
        return 0;
    }

    perror("ioctl FIONBIO");
    return -1;
}

/*
 * Connect to the benchmark server, send one HTTP-style request, then drain
 * the socket until the peer closes it, tracking how far the timestamps in
 * the received stream lag behind the local clock.
 *
 * Every setup failure is fatal: the error is reported and the process exits
 * with EXIT_FAILURE. Returns 0 after the statistics have been printed.
 */
int do_req(void) {
    const char *hostname = "10.0.3.122";
    const char *port = "12373";
    const char *path = "/";

    // Create a socket
    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    // Resolve hostname
    struct addrinfo hints, *res;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    // getaddrinfo() reports failures through its return value, not errno,
    // so perror() would print an unrelated message here.
    int gai_rc = getaddrinfo(hostname, port, &hints, &res);
    if (gai_rc != 0) {
        fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(gai_rc));
        close(sockfd);
        exit(EXIT_FAILURE);
    }

    // Set the socket to non-blocking mode
    if (set_nonblocking(sockfd) == -1) {
        close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Start connecting; EINPROGRESS is the normal result on a non-blocking socket
    int connect_status = connect(sockfd, res->ai_addr, res->ai_addrlen);
    if (connect_status == -1 && errno != EINPROGRESS) {
        perror("connect");
        close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Set up epoll
    int epoll_fd = epoll_create1(0);
    if (epoll_fd == -1) {
        perror("epoll_create1");
        close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    struct epoll_event ev, events[MAX_EVENTS];
    ev.events = EPOLLOUT | EPOLLIN | EPOLLET;  // Wait for the socket to be writable (connect completion) and readable
    ev.data.fd = sockfd;

    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sockfd, &ev) == -1) {
        perror("epoll_ctl");
        close(sockfd);
        close(epoll_fd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // HTTP GET request to send
    char request[512];
    snprintf(request, sizeof(request), "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: keep-alive\r\n\r\n", path, hostname);
    const size_t request_len = strlen(request);

    int64_t br = 0;            // total payload bytes received
    int64_t buffer_reads = 0;  // number of successful recv() calls
    // Nanosecond deltas easily exceed the range of int (about 2.1 s), so the
    // latency values are kept in 64 bits.
    int64_t last_latency = 0;
    int64_t start_lat = 0;

    int done = 0;
    while (!done) {
        int n = epoll_wait(epoll_fd, events, MAX_EVENTS, -1);
        if (n == -1) {
            if (errno == EINTR) {
                continue;  // interrupted by a signal: just wait again
            }
            // Any other failure would repeat forever, so stop the loop.
            perror("epoll_wait");
            break;
        }

        for (int i = 0; i < n; i++) {
            if (events[i].events & EPOLLOUT) {
                // Ready to write (send request)
                ssize_t sent = send(sockfd, request, request_len, 0);
                if (sent == -1) {
                    perror("send");
                    done = 1;
                    break;
                }
                ev.events = EPOLLIN | EPOLLET;  // Now, wait for the response
                if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, sockfd, &ev) == -1) {
                    perror("epoll_ctl");
                    done = 1;
                    break;
                }
            } else if (events[i].events & EPOLLIN) {
                // Ready to read. The registration is edge-triggered, so keep
                // reading until recv() stops returning data.
                char buffer[BUFFER_SIZE];
                ssize_t bytes_read;
                while ((bytes_read = recv(sockfd, buffer, sizeof(buffer) - 1, 0)) > 0) {
                    br += bytes_read;
                    ++buffer_reads;
                    // One byte of the buffer is reserved for this terminator.
                    buffer[bytes_read] = '\0';
                    last_latency = get_time_difference(buffer, (size_t)bytes_read + 1);
                    // start_lat keeps being replaced until it holds a value
                    // inside [10, 10000000].
                    if (start_lat < 10 || start_lat > 10000000)
                        start_lat = last_latency;
                }

                if (bytes_read == 0) {
                    // Connection closed by the server
                    done = 1;
                } else if (bytes_read == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
                    perror("recv");
                    done = 1;
                }
            }
        }
    }

    // Avoid 0/0 when nothing was received.
    float avg_bytes = buffer_reads > 0 ? (float)br / (float)buffer_reads : 0.0f;
    printf("avg bytes: %.6f\n", avg_bytes);
    printf("buffer_reads: %lld\n", (long long)buffer_reads);
    printf("bytes_read: %lld\n", (long long)br);
    printf("last_latency: %lld\n", (long long)last_latency);
    printf("first_latency: %lld\n", (long long)start_lat);
    printf("lat_diff: %lld\n", (long long)(last_latency - start_lat));

    // Clean up
    close(sockfd);
    close(epoll_fd);
    freeaddrinfo(res);

    return 0;
}

/* Entry point: runs the request/receive benchmark a single time. */
int main() {
    int remaining_runs = 1;

    while (remaining_runs > 0) {
        do_req();
        --remaining_runs;
    }

    return 0;
}

Then, I write something similar using F-stack:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/epoll.h>
#include <arpa/inet.h>
#include <errno.h>
#include <assert.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <pthread.h>
#include <signal.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <netdb.h>

#include "ff_config.h"
#include "ff_api.h"
#include "ff_epoll.h"

/* Neither of these two values is referenced by the code shown in this program. */
#define SOCK_FSTACK 0x01000000
#define SOCK_KERNEL 0x02000000

/* Capacity of the epoll event array and size in bytes of the receive buffer. */
#define MAX_EVENTS 10
#define BUFFER_SIZE 4096

/* Descriptors shared between do_req() and loop(). */
int epfd;
int sockfd;

/* Event registration and the array filled in by ff_epoll_wait(). */
struct epoll_event ev;
struct epoll_event events[MAX_EVENTS];

/* HTTP GET request to send. */
char request[512];

/* Receive statistics accumulated by loop(). */
int64_t br = 0;
int64_t buffer_reads = 0;
int last_latency = 0;
int start_lat = 0;

/*
 * Parse the decimal nanosecond timestamp that ends a received chunk and
 * return how far it lies behind the local CLOCK_REALTIME clock.
 *
 * buffer     - NUL-terminated receive buffer.
 * bytes_read - length of the chunk counting the terminating NUL (the
 *              receive loop in this program adds one to the ff_recv()
 *              result before calling).
 *
 * Returns (now - timestamp) in nanoseconds, or -1 when the chunk is too
 * short to contain a full timestamp field.
 */
int64_t get_time_difference(const char *buffer, size_t bytes_read) {
    /*
     * Number of trailing bytes that are parsed, terminating NUL included.
     * Presumably sized for a 19-digit time_ns() value -- verify against the
     * sender.
     */
    const size_t timestamp_field_len = 20;

    /*
     * The parse starts timestamp_field_len bytes before the end of the
     * chunk, so anything shorter would make the unsigned subtraction wrap
     * and read outside the buffer.
     */
    if (bytes_read < timestamp_field_len) {
        return -1;
    }

    int64_t sent_ns = strtoll(buffer + (bytes_read - timestamp_field_len), NULL, 10);

    struct timespec now;
    clock_gettime(CLOCK_REALTIME, &now);

    /* Widen tv_sec first so the multiplication is always done in 64 bits. */
    int64_t now_ns = (int64_t)now.tv_sec * 1000000000 + now.tv_nsec;

    return now_ns - sent_ns;
}

/*
 * Switch an F-Stack descriptor to non-blocking mode with the FIONBIO
 * request of ff_ioctl().
 *
 * Returns 0 on success. On failure the error is reported through perror()
 * and -1 is returned.
 */
int set_nonblocking(int sockfd) {
    int enable = 1;
    int rc = ff_ioctl(sockfd, FIONBIO, &enable);

    if (rc == 0) {
        return 0;
    }

    perror("ioctl FIONBIO");
    return -1;
}

/*
 * Callback handed to ff_run(): polls the epoll descriptor once and services
 * whatever events are ready on the benchmark socket.
 *
 * When the socket becomes writable the HTTP request is sent and the
 * registration is switched to read-only interest. When it becomes readable
 * the socket is drained and the receive statistics are updated. The process
 * exits from inside this function: with status 0 after printing the
 * statistics once the peer closes the connection, or with EXIT_FAILURE on a
 * send/receive error.
 *
 * arg - unused.
 *
 * Returns 0 after each poll that does not terminate the process.
 */
int loop(void* arg) {
    /*
     * The file-scope last_latency/start_lat are plain int, which cannot
     * represent a nanosecond delta above roughly 2.1 s. This function keeps
     * its own 64-bit values instead of narrowing into them.
     */
    static int64_t last_latency_ns = 0;
    static int64_t first_latency_ns = 0;

    (void)arg;

    int n = ff_epoll_wait(epfd, events, MAX_EVENTS, -1);
    for (int i = 0; i < n; i++) {
        if (events[i].events & EPOLLOUT) {
            // Ready to write (send request)
            ssize_t sent = ff_send(sockfd, request, strlen(request), 0);
            if (sent == -1) {
                // Without the request the benchmark cannot make progress,
                // so stop instead of polling an idle connection.
                perror("send");
                exit(EXIT_FAILURE);
            }
            // Fill in both fields here so the registration does not depend
            // on what the caller stored in the file-scope ev.
            ev.events = EPOLLIN | EPOLLET;  // Now, wait for the response
            ev.data.fd = sockfd;
            if (ff_epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev) == -1) {
                perror("epoll_ctl");
                exit(EXIT_FAILURE);
            }
        } else if (events[i].events & EPOLLIN) {
            // Ready to read. The registration is edge-triggered, so keep
            // reading until ff_recv() stops returning data.
            char buffer[BUFFER_SIZE];
            ssize_t bytes_read;
            while ((bytes_read = ff_recv(sockfd, buffer, sizeof(buffer) - 1, 0)) > 0) {
                br += bytes_read;
                ++buffer_reads;
                // One byte of the buffer is reserved for this terminator.
                buffer[bytes_read] = '\0';
                last_latency_ns = get_time_difference(buffer, (size_t)bytes_read + 1);
                // The first-latency value keeps being replaced until it
                // holds a value inside [10, 10000000].
                if (first_latency_ns < 10 || first_latency_ns > 10000000)
                    first_latency_ns = last_latency_ns;
            }

            if (bytes_read == 0) {
                // Connection closed by the server
                // Avoid 0/0 when nothing was received.
                float avg_bytes = buffer_reads > 0 ? (float)br / (float)buffer_reads : 0.0f;
                printf("avg bytes: %.6f\n", avg_bytes);
                printf("buffer_reads: %lld\n", (long long)buffer_reads);
                printf("bytes_read: %lld\n", (long long)br);
                printf("last_latency: %lld\n", (long long)last_latency_ns);
                printf("first_latency: %lld\n", (long long)first_latency_ns);
                printf("lat_diff: %lld\n", (long long)(last_latency_ns - first_latency_ns));
                exit(0);
            } else if (bytes_read == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
                perror("recv");
                exit(EXIT_FAILURE);
            }
        }
    }

    return 0;
}

int do_req() {
    const char *hostname = "10.0.3.122";
    const char *port = "12373";
    const char *path = "/";

    // Create a socket
    sockfd = ff_socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    // Resolve hostname
    struct addrinfo hints, *res;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    if (getaddrinfo(hostname, port, &hints, &res) != 0) {
        perror("getaddrinfo");
        ff_close(sockfd);
        exit(EXIT_FAILURE);
    }

    // Set the socket to non-blocking mode
    if (set_nonblocking(sockfd) == -1) {
        ff_close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Start connecting
    int connect_status = ff_connect(sockfd, (struct linux_sockaddr *)res->ai_addr, res->ai_addrlen);
    if (connect_status == -1 && errno != EINPROGRESS) {
        perror("connect");  
        ff_close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Set up epoll
    epfd = ff_epoll_create(0);
    if (epfd == -1) {
        perror("epoll_create1");
        ff_close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    struct epoll_event ev, events[MAX_EVENTS];
    ev.events = EPOLLOUT | EPOLLIN | EPOLLET;  // Wait for the socket to be writable (connect completion) and readable
    ev.data.fd = sockfd;

    if (ff_epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) == -1) {
        perror("epoll_ctl");
        ff_close(sockfd);
        ff_close(epfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    snprintf(request, sizeof(request), "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: keep-alive\r\n\r\n", path, hostname);
    ff_run(loop, NULL);

    // Clean up
    ff_close(sockfd);
    ff_close(epfd);
    freeaddrinfo(res);

    return 0;
}

/*
 * Entry point: initialise F-Stack from the command-line arguments, then run
 * the request/receive benchmark.
 *
 * Returns 0 after do_req() returns, or EXIT_FAILURE when ff_init() reports
 * an error.
 */
int main(int argc, char * argv[])
{
    /* Do not touch any other ff_* call if initialisation failed. */
    if (ff_init(argc, argv) < 0) {
        fprintf(stderr, "ff_init failed\n");
        return EXIT_FAILURE;
    }

    do_req();
    return 0;
}

My config is as follows:

[dpdk]
# Hexadecimal bitmask of cores to run on.
lcore_mask=2

# Number of memory channels.
channel=10

# Specify base virtual address to map.
#base_virtaddr=0x7f0000000000

# Promiscuous mode of the NIC, default: enabled.
promiscuous=1
numa_on=1

# TX checksum offload skip, default: disabled.
# We need this switch enabled in the following cases:
# -> The application want to enforce wrong checksum for testing purposes
# -> Some cards advertise the offload capability but do not actually calculate the checksum.
tx_csum_offoad_skip=0

# TCP segment offload, default: disabled.
tso=0

# HW vlan strip, default: enabled.
vlan_strip=1

# Sleep when no packets are incoming.
# unit: microseconds
idle_sleep=0

# Delay (0-100) before sending packets when fewer than 32 packets are queued.
# Default: 100 us.
# If set to 0, packets are sent immediately.
# If set above 100, the delay is capped at 100 us.
# Unit: microseconds.
pkt_tx_delay=0

# use symmetric Receive-side Scaling(RSS) key, default: disabled.
symmetric_rss=0

# PCI device enable list.
# And driver options
#allow=02:00.0
# for multiple PCI devices
#allow=02:00.0,03:00.0

# enabled port list
#
# EBNF grammar:
#
#    exp      ::= num_list {"," num_list}
#    num_list ::= <num> | <range>
#    range    ::= <num>"-"<num>
#    num      ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
#
# examples
#    0-3       ports 0, 1,2,3 are enabled
#    1-3,4,7   ports 1,2,3,4,7 are enabled
#
# If bonding is used, configure the bonding port id in port_list
# and do not configure the slave port ids in port_list.
# For example, if port 0 and port 1 are trunked into bonding port 2,
# set `port_list=2` and configure the `[port2]` section.

port_list=0

# Number of vdev.
nb_vdev=0

# Number of bond.
nb_bond=0

# log level for dpdk, optional
# log_level=0

# Each core writes to its own pcap file, which is opened once and closed once it is large enough.
# Supports dumping the first snaplen bytes of each packet.
# If a pcap file grows larger than savelen bytes, it is closed and dumping continues in the next file.
[pcap]
enable=0
snaplen=96
savelen=16777216
savepath=.

# Port config section
# Correspond to dpdk.port_list's index: port0, port1...
[port0]
addr=10.0.3.234
netmask=255.255.255.0
broadcast=10.0.3.255
gateway=10.0.3.1
# set interface name, Optional parameter.
if_name=ens6

# IPv6 net addr, Optional parameters.
#addr6=ff::02
#prefix_len=64
#gateway6=ff::01

# Multi virtual IPv4/IPv6 net addr, Optional parameters.
#   `vip_ifname`: default `f-stack-x`
#   `vip_addr`: Separated by semicolons, MAX number 64;
#           Only support netmask 255.255.255.255, broadcast x.x.x.255 now, hard code in `ff_veth_setvaddr`.
#   `vip_addr6`: Separated by semicolons, MAX number 64.
#   `vip_prefix_len`: All addr6 use the same prefix now, default 64.
#vip_ifname=lo0
#vip_addr=192.168.1.3;192.168.1.4;192.168.1.5;192.168.1.6
#vip_addr6=ff::03;ff::04;ff::05;ff::06;ff::07
#vip_prefix_len=64

# lcore list used to handle this port
# the format is same as port_list
#lcore_list=0

# bonding slave port list used to handle this port
# need to config while this port is a bonding port
# the format is same as port_list
#slave_port_list=0,1

# Vdev config section
# Corresponds to dpdk.nb_vdev's index: vdev0, vdev1...
#    iface : Shouldn't set always.
#    path : The vuser device path in container. Required.
#    queues : The max queues of vuser. Optional, default 1, greater or equal to the number of processes.
#    queue_size : Queue size.Optional, default 256.
#    mac : The mac address of vuser. Optional, default random, if vhost use phy NIC, it should be set to the phy NIC's mac.
#    cq : Optional, if queues = 1, default 0; if queues > 1 default 1.
#[vdev0]
##iface=/usr/local/var/run/openvswitch/vhost-user0
#path=/var/run/openvswitch/vhost-user0
#queues=1
#queue_size=256
#mac=00:00:00:00:00:01
#cq=0

# bond config section
# See http://doc.dpdk.org/guides/prog_guide/link_bonding_poll_mode_drv_lib.html
#[bond0]
#mode=4
#slave=0000:0a:00.0,slave=0000:0a:00.1
#primary=0000:0a:00.0
#mac=f0:98:38:xx:xx:xx
## opt argument
#socket_id=0
#xmit_policy=l23
#lsc_poll_period_ms=0
#up_delay=0
#down_delay=0

# Kni config: if enabled and method=reject,
# all packets that do not belong to the following tcp_port and udp_port
# will transmit to kernel; if method=accept, all packets that belong to
# the following tcp_port and udp_port will transmit to kernel.
#[kni]
#enable=1
#method=reject
## The format is same as port_list
#tcp_port=80,443
#udp_port=53

# FreeBSD network performance tuning configurations.
# Most native FreeBSD configurations are supported.
[freebsd.boot]
# If use rack/bbr which depend HPTS, you should set a greater value of hz, such as 1000000 means a tick is 1us.
hz=100

# Block out a range of descriptors to avoid overlap
# with the kernel's descriptor space.
# You can increase this value according to your app.
fd_reserve=1024

kern.ipc.maxsockets=262144

net.inet.tcp.syncache.hashsize=4096
net.inet.tcp.syncache.bucketlimit=100

net.inet.tcp.tcbhashsize=65536

kern.ncallout=262144

kern.features.inet6=1

[freebsd.sysctl]
kern.ipc.somaxconn=32768
kern.ipc.maxsockbuf=16777216

net.link.ether.inet.maxhold=5

net.inet.tcp.fast_finwait2_recycle=1
net.inet.tcp.sendspace=1677721
net.inet.tcp.recvspace=1677721
net.inet.tcp.nolocaltimewait=1
net.inet.tcp.cc.algorithm=bbr
net.inet.tcp.sendbuf_max=16777216
net.inet.tcp.recvbuf_max=16777216
net.inet.tcp.sendbuf_auto=1
net.inet.tcp.recvbuf_auto=1
net.inet.tcp.sendbuf_inc=16384
net.inet.tcp.recvbuf_inc=524288
net.inet.tcp.sack.enable=1
net.inet.tcp.blackhole=1
net.inet.tcp.msl=2000
net.inet.tcp.delayed_ack=0
net.inet.tcp.rfc1323=1

net.inet.udp.blackhole=1
net.inet.ip.redirect=0
net.inet.ip.forwarding=0

net.inet6.ip6.auto_linklocal=1
net.inet6.ip6.accept_rtadv=2
net.inet6.icmp6.rediraccept=1
net.inet6.ip6.forwarding=0

# Set the default TCP stack: freebsd, rack or bbr. You may need to increase the value of 'freebsd.boot.hz' when using rack or bbr.
net.inet.tcp.functions_default=freebsd
# need by bbr, should enable it.
net.inet.tcp.hpts.skip_swi=1
# Interval between calls to hpts_timeout_dir. default min 250us, max 256-512ms, default 512ms.
net.inet.tcp.hpts.minsleep=0
# [25600-51200]
net.inet.tcp.hpts.maxsleep=51200

The native linux approach takes 2s, however the F-stack approach takes around 7.5s consistently.

Can anyone provide some advice on why F-stack is so much slower here?