Hello, my use case for F-Stack is that I want to optimize TCP packet-read speed.
I've set up a simple test with a Python server that sends out timestamp packets:
#! /usr/bin/python
# A simple TCP server that streams timestamp messages.
import socket, time

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# Socket options must be set before bind() to take effect.
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, b"ens7")
sock.bind(('0.0.0.0', 12373))
sock.listen(5)
while True:
    connection, address = sock.accept()
    buf = connection.recv(1024)
    print(buf)
    start = time.time()
    for i in range(int(1e6)):
        # Each message: the 19-digit time_ns string repeated 200 times (~3800 bytes).
        connection.send((str(time.time_ns()) * 200).encode('utf-8'))
    connection.close()
I then set up two simple receivers, one using native Linux sockets and one using F-Stack, to compare how long each takes to process the 5M messages.
#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <netdb.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <time.h>   /* clock_gettime */
#include <unistd.h>

#define SOCK_FSTACK 0x01000000
#define SOCK_KERNEL 0x02000000
#define MAX_EVENTS 10
#define BUFFER_SIZE 4096

// Parse the trailing 19-digit time_ns timestamp out of the buffer and return
// the difference between the current wall clock and that timestamp, in ns.
int64_t get_time_difference(const char *buffer, size_t bytes_read) {
    if (bytes_read < 20) {
        // Buffer too small to contain a full timestamp.
        return -1;
    }
    int64_t d = strtoll(buffer + (bytes_read - 20), NULL, 10);
    struct timespec current_time;
    clock_gettime(CLOCK_REALTIME, &current_time);
    return (int64_t)current_time.tv_sec * 1000000000 + current_time.tv_nsec - d;
}

int set_nonblocking(int sockfd) {
    int opt = 1;
    if (ioctl(sockfd, FIONBIO, &opt)) {
        perror("ioctl FIONBIO");
        return -1;
    }
    return 0;
}

int do_req() {
    const char *hostname = "10.0.3.122";
    const char *port = "12373";
    const char *path = "/";

    // Create a socket
    int sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    // Resolve hostname
    struct addrinfo hints, *res;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    if (getaddrinfo(hostname, port, &hints, &res) != 0) {
        perror("getaddrinfo");
        close(sockfd);
        exit(EXIT_FAILURE);
    }

    // Set the socket to non-blocking mode
    if (set_nonblocking(sockfd) == -1) {
        close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Start connecting
    int connect_status = connect(sockfd, res->ai_addr, res->ai_addrlen);
    if (connect_status == -1 && errno != EINPROGRESS) {
        perror("connect");
        close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Set up epoll
    int epoll_fd = epoll_create1(0);
    if (epoll_fd == -1) {
        perror("epoll_create1");
        close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }
    struct epoll_event ev, events[MAX_EVENTS];
    ev.events = EPOLLOUT | EPOLLIN | EPOLLET; // Wait for writability (connect completion) and readability
    ev.data.fd = sockfd;
    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sockfd, &ev) == -1) {
        perror("epoll_ctl");
        close(sockfd);
        close(epoll_fd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // HTTP GET request to send
    char request[512];
    snprintf(request, sizeof(request), "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: keep-alive\r\n\r\n", path, hostname);

    int64_t br = 0;
    int64_t buffer_reads = 0;
    int64_t last_latency = 0;
    int64_t start_lat = 0;
    int done = 0;
    while (!done) {
        int n = epoll_wait(epoll_fd, events, MAX_EVENTS, -1);
        for (int i = 0; i < n; i++) {
            if (events[i].events & EPOLLOUT) {
                // Ready to write (send request)
                int sent = send(sockfd, request, strlen(request), 0);
                if (sent == -1) {
                    perror("send");
                    done = 1;
                    break;
                }
                ev.events = EPOLLIN | EPOLLET; // Now wait for the response
                epoll_ctl(epoll_fd, EPOLL_CTL_MOD, sockfd, &ev);
            } else if (events[i].events & EPOLLIN) {
                // Ready to read (response received)
                char buffer[BUFFER_SIZE];
                int bytes_read;
                while ((bytes_read = recv(sockfd, buffer, sizeof(buffer) - 1, 0)) > 0) {
                    br += bytes_read;
                    ++buffer_reads;
                    buffer[bytes_read] = '\0'; // NUL-terminate so strtoll sees a bounded string
                    last_latency = get_time_difference(buffer, bytes_read + 1);
                    if (start_lat < 10 || start_lat > 10000000)
                        start_lat = last_latency;
                }
                if (bytes_read == 0) {
                    // Connection closed by the server
                    done = 1;
                } else if (bytes_read == -1 && errno != EAGAIN) {
                    perror("recv");
                    done = 1;
                }
            }
        }
    }

    float avg_bytes = (float)br / (float)buffer_reads;
    printf("avg bytes: %.6f\n", avg_bytes);
    printf("buffer_reads: %ld\n", buffer_reads);
    printf("bytes_read: %ld\n", br);
    printf("last_latency: %ld\n", last_latency);
    printf("first_latency: %ld\n", start_lat);
    printf("lat_diff: %ld\n", last_latency - start_lat);

    // Clean up
    close(sockfd);
    close(epoll_fd);
    freeaddrinfo(res);
    return 0;
}

int main() {
    for (int i = 0; i < 1; ++i)
        do_req();
    return 0;
}
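The run times quoted at the end are wall-clock for the whole receive loop; a minimal sketch of how they can be measured (wrapping do_req() with CLOCK_MONOTONIC, in place of the main() above):

// Sketch: wall-clock timing around do_req(); replaces main() above.
int main() {
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    do_req();
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    printf("elapsed: %.3f s\n", elapsed);
    return 0;
}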
Then I wrote something similar using F-Stack:
#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <netdb.h>
#include <pthread.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/epoll.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <time.h>   /* clock_gettime */
#include <unistd.h>

#include "ff_config.h"
#include "ff_api.h"
#include "ff_epoll.h"

#define SOCK_FSTACK 0x01000000
#define SOCK_KERNEL 0x02000000
#define MAX_EVENTS 10
#define BUFFER_SIZE 4096

struct epoll_event ev;
struct epoll_event events[MAX_EVENTS];
int epfd;
int sockfd;
// HTTP GET request to send
char request[512];
int64_t br = 0;
int64_t buffer_reads = 0;
int64_t last_latency = 0;
int64_t start_lat = 0;

// Parse the trailing 19-digit time_ns timestamp out of the buffer and return
// the difference between the current wall clock and that timestamp, in ns.
int64_t get_time_difference(const char *buffer, size_t bytes_read) {
    if (bytes_read < 20) {
        // Buffer too small to contain a full timestamp.
        return -1;
    }
    int64_t d = strtoll(buffer + (bytes_read - 20), NULL, 10);
    struct timespec current_time;
    clock_gettime(CLOCK_REALTIME, &current_time);
    return (int64_t)current_time.tv_sec * 1000000000 + current_time.tv_nsec - d;
}

int set_nonblocking(int sockfd) {
    int opt = 1;
    if (ff_ioctl(sockfd, FIONBIO, &opt)) {
        perror("ioctl FIONBIO");
        return -1;
    }
    return 0;
}

int loop(void *arg) {
    int n = ff_epoll_wait(epfd, events, MAX_EVENTS, -1);
    for (int i = 0; i < n; i++) {
        if (events[i].events & EPOLLOUT) {
            // Ready to write (send request)
            int sent = ff_send(sockfd, request, strlen(request), 0);
            if (sent == -1) {
                perror("send");
                break;
            }
            ev.events = EPOLLIN | EPOLLET; // Now wait for the response
            ff_epoll_ctl(epfd, EPOLL_CTL_MOD, sockfd, &ev);
        } else if (events[i].events & EPOLLIN) {
            // Ready to read (response received)
            char buffer[BUFFER_SIZE];
            int bytes_read;
            while ((bytes_read = ff_recv(sockfd, buffer, sizeof(buffer) - 1, 0)) > 0) {
                br += bytes_read;
                ++buffer_reads;
                buffer[bytes_read] = '\0'; // NUL-terminate so strtoll sees a bounded string
                last_latency = get_time_difference(buffer, bytes_read + 1);
                if (start_lat < 10 || start_lat > 10000000)
                    start_lat = last_latency;
            }
            if (bytes_read == 0) {
                // Connection closed by the server
                float avg_bytes = (float)br / (float)buffer_reads;
                printf("avg bytes: %.6f\n", avg_bytes);
                printf("buffer_reads: %ld\n", buffer_reads);
                printf("bytes_read: %ld\n", br);
                printf("last_latency: %ld\n", last_latency);
                printf("first_latency: %ld\n", start_lat);
                printf("lat_diff: %ld\n", last_latency - start_lat);
                exit(0);
            } else if (bytes_read == -1 && errno != EAGAIN) {
                perror("recv");
                exit(0);
            }
        }
    }
    return 0;
}

int do_req() {
    const char *hostname = "10.0.3.122";
    const char *port = "12373";
    const char *path = "/";

    // Create a socket
    sockfd = ff_socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    // Resolve hostname
    struct addrinfo hints, *res;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    if (getaddrinfo(hostname, port, &hints, &res) != 0) {
        perror("getaddrinfo");
        ff_close(sockfd);
        exit(EXIT_FAILURE);
    }

    // Set the socket to non-blocking mode
    if (set_nonblocking(sockfd) == -1) {
        ff_close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Start connecting
    int connect_status = ff_connect(sockfd, (struct linux_sockaddr *)res->ai_addr, res->ai_addrlen);
    if (connect_status == -1 && errno != EINPROGRESS) {
        perror("connect");
        ff_close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    // Set up epoll
    epfd = ff_epoll_create(0);
    if (epfd == -1) {
        perror("ff_epoll_create");
        ff_close(sockfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }
    // Use the global ev/events here; a local declaration would shadow the
    // globals that loop() updates with EPOLL_CTL_MOD.
    ev.events = EPOLLOUT | EPOLLIN | EPOLLET; // Wait for writability (connect completion) and readability
    ev.data.fd = sockfd;
    if (ff_epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) == -1) {
        perror("epoll_ctl");
        ff_close(sockfd);
        ff_close(epfd);
        freeaddrinfo(res);
        exit(EXIT_FAILURE);
    }

    snprintf(request, sizeof(request), "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: keep-alive\r\n\r\n", path, hostname);
    ff_run(loop, NULL);

    // Clean up (not reached; loop() exits the process)
    ff_close(sockfd);
    ff_close(epfd);
    freeaddrinfo(res);
    return 0;
}

int main(int argc, char *argv[])
{
    ff_init(argc, argv);
    do_req();
    return 0;
}
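For reference, F-Stack's bundled epoll example structures the loop callback as a non-blocking poll, since ff_run() re-invokes the callback on every cycle of its event loop, whereas my loop() above waits with a timeout of -1. A minimal sketch of the example's pattern (loop_sketch is a placeholder name; epfd and events are the globals above):

// Sketch of the loop-callback pattern from F-Stack's epoll example:
// ff_run() drives the callback once per poll cycle, so the wait uses a
// zero timeout rather than blocking.
int loop_sketch(void *arg) {
    (void)arg;
    int n = ff_epoll_wait(epfd, events, MAX_EVENTS, 0); /* poll, don't block */
    for (int i = 0; i < n; i++) {
        /* handle events[i] exactly as in loop() above */
    }
    return 0;
}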
My config is as follows:
[dpdk]
# Hexadecimal bitmask of cores to run on.
lcore_mask=2
# Number of memory channels.
channel=10
# Specify base virtual address to map.
#base_virtaddr=0x7f0000000000
# Promiscuous mode of nic, default: enabled.
promiscuous=1
numa_on=1
# TX checksum offload skip, default: disabled.
# We need this switch enabled in the following cases:
# -> The application wants to enforce a wrong checksum for testing purposes.
# -> Some cards advertise the offload capability but don't actually calculate the checksum.
tx_csum_offoad_skip=0
# TCP segment offload, default: disabled.
tso=0
# HW vlan strip, default: enabled.
vlan_strip=1
# Sleep when no packets are incoming.
# unit: microseconds
idle_sleep=0
# TX delay (0-100) applied while fewer than 32 packets are queued to send.
# default: 100 us.
# if set to 0, packets are sent immediately.
# if set >100, the delay is capped at 100 us.
# unit: microseconds
pkt_tx_delay=0
# use symmetric Receive-side Scaling (RSS) key, default: disabled.
symmetric_rss=0
# PCI device enable list.
# And driver options
#allow=02:00.0
# for multiple PCI devices
#allow=02:00.0,03:00.0
# enabled port list
#
# EBNF grammar:
#
# exp ::= num_list {"," num_list}
# num_list ::= <num> | <range>
# range ::= <num>"-"<num>
# num ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
#
# examples
# 0-3 ports 0, 1,2,3 are enabled
# 1-3,4,7 ports 1,2,3,4,7 are enabled
#
# If using bonding, configure the bonding port id in port_list
# and do not list the slave port ids there.
# For example, if ports 0 and 1 are trunked into bonding port 2,
# set `port_list=2` and configure a `[port2]` section.
port_list=0
# Number of vdev.
nb_vdev=0
# Number of bond.
nb_bond=0
# log level for dpdk, optional
# log_level=0
# Each core writes into its own pcap file, which is opened once and closed when full.
# Supports dumping the first snaplen bytes of each packet.
# If a pcap file grows larger than savelen bytes, it is closed and dumping continues into the next file.
[pcap]
enable=0
snaplen=96
savelen=16777216
savepath=.
# Port config section
# Correspond to dpdk.port_list's index: port0, port1...
[port0]
addr=10.0.3.234
netmask=255.255.255.0
broadcast=10.0.3.255
gateway=10.0.3.1
# Set the interface name. Optional parameter.
if_name=ens6
# IPv6 net addr, Optional parameters.
#addr6=ff::02
#prefix_len=64
#gateway6=ff::01
# Multi virtual IPv4/IPv6 net addr, Optional parameters.
# `vip_ifname`: default `f-stack-x`
# `vip_addr`: Separated by semicolons, MAX number 64;
# Only netmask 255.255.255.255 and broadcast x.x.x.255 are supported now, hard-coded in `ff_veth_setvaddr`.
# `vip_addr6`: Separated by semicolons, MAX number 64.
# `vip_prefix_len`: All addr6 use the same prefix now, default 64.
#vip_ifname=lo0
#vip_addr=192.168.1.3;192.168.1.4;192.168.1.5;192.168.1.6
#vip_addr6=ff::03;ff::04;ff::05;ff::06;ff::07
#vip_prefix_len=64
# lcore list used to handle this port
# the format is same as port_list
#lcore_list=0
# bonding slave port list used to handle this port
# need to config while this port is a bonding port
# the format is same as port_list
#slave_port_list=0,1
# Vdev config section
# Correspond to dpdk.nb_vdev's index: vdev0, vdev1...
# iface : Usually should not be set.
# path : The vuser device path in the container. Required.
# queues : The max queues of vuser. Optional, default 1; must be greater than or equal to the number of processes.
# queue_size : Queue size. Optional, default 256.
# mac : The mac address of vuser. Optional, default random; if vhost uses a phy NIC, it should be set to the phy NIC's mac.
# cq : Optional; if queues = 1, default 0; if queues > 1, default 1.
#[vdev0]
##iface=/usr/local/var/run/openvswitch/vhost-user0
#path=/var/run/openvswitch/vhost-user0
#queues=1
#queue_size=256
#mac=00:00:00:00:00:01
#cq=0
# bond config section
# See http://doc.dpdk.org/guides/prog_guide/link_bonding_poll_mode_drv_lib.html
#[bond0]
#mode=4
#slave=0000:0a:00.0,slave=0000:0a:00.1
#primary=0000:0a:00.0
#mac=f0:98:38:xx:xx:xx
## opt argument
#socket_id=0
#xmit_policy=l23
#lsc_poll_period_ms=0
#up_delay=0
#down_delay=0
# KNI config: if enabled and method=reject,
# all packets that do not match the following tcp_port and udp_port lists
# are forwarded to the kernel; if method=accept, all packets that do match
# are forwarded to the kernel.
#[kni]
#enable=1
#method=reject
## The format is same as port_list
#tcp_port=80,443
#udp_port=53
# FreeBSD network performance tuning configurations.
# Most native FreeBSD configurations are supported.
[freebsd.boot]
# If you use rack or bbr, which depend on HPTS, you should set a greater value of hz; e.g. 1000000 means a tick is 1 us.
hz=100
# Block out a range of descriptors to avoid overlap
# with the kernel's descriptor space.
# You can increase this value according to your app.
fd_reserve=1024
kern.ipc.maxsockets=262144
net.inet.tcp.syncache.hashsize=4096
net.inet.tcp.syncache.bucketlimit=100
net.inet.tcp.tcbhashsize=65536
kern.ncallout=262144
kern.features.inet6=1
[freebsd.sysctl]
kern.ipc.somaxconn=32768
kern.ipc.maxsockbuf=16777216
net.link.ether.inet.maxhold=5
net.inet.tcp.fast_finwait2_recycle=1
net.inet.tcp.sendspace=1677721
net.inet.tcp.recvspace=1677721
net.inet.tcp.nolocaltimewait=1
net.inet.tcp.cc.algorithm=bbr
net.inet.tcp.sendbuf_max=16777216
net.inet.tcp.recvbuf_max=16777216
net.inet.tcp.sendbuf_auto=1
net.inet.tcp.recvbuf_auto=1
net.inet.tcp.sendbuf_inc=16384
net.inet.tcp.recvbuf_inc=524288
net.inet.tcp.sack.enable=1
net.inet.tcp.blackhole=1
net.inet.tcp.msl=2000
net.inet.tcp.delayed_ack=0
net.inet.tcp.rfc1323=1
net.inet.udp.blackhole=1
net.inet.ip.redirect=0
net.inet.ip.forwarding=0
net.inet6.ip6.auto_linklocal=1
net.inet6.ip6.accept_rtadv=2
net.inet6.icmp6.rediraccept=1
net.inet6.ip6.forwarding=0
# Set the default stack: freebsd, rack or bbr. You may need to increase 'freebsd.boot.hz' when using rack or bbr.
net.inet.tcp.functions_default=freebsd
# Needed by bbr; should be enabled.
net.inet.tcp.hpts.skip_swi=1
# Interval between calls to hpts_timeout_dir: min 250 us, max 256-512 ms, default 512 ms.
net.inet.tcp.hpts.minsleep=0
# [25600-51200]
net.inet.tcp.hpts.maxsleep=51200
The native Linux approach takes about 2 s, whereas the F-Stack approach consistently takes around 7.5 s.
Can anyone offer advice on why F-Stack is so much slower here?