iqiyi / dpvs

DPVS is a high performance Layer-4 load balancer based on DPDK.
Other
3.03k stars 728 forks source link

KNI内存泄露:kni out of memory #508

Open Gongwenn opened 4 years ago

Gongwenn commented 4 years ago

测试环境中,我们用双网卡做bond,kni口会存在内存泄露问题,跑一段时间后会报错:KNI: Out of memory,即使kni口只跑BGP路由公告流量,也会造成泄露,只是泄露时间比较长。如果健康检查走kni口的话,泄露速度比较快,一两个小时就会报内存不足错误。

dpvs.log一直打印 KNI: Out of memory,之后kni口的IP就无法通讯了,原因是kni口的所有报文rx/tx全部丢包。 image image

dpdk版本:stable-17.11.2,dpvs版本:1.7.8,单臂模式,双网卡bond image

ywc689 commented 4 years ago

Could you please give the outputs of the following commands for the problem?

Gongwenn commented 4 years ago

Could you please give the outputs of the following commands for the problem?

  • ip link show image

  • ip addr show image

  • ip route show image

  • dpip link show image

  • dpip addr show image

  • dpip route show image

Gongwenn commented 4 years ago

+问题环境dpvs配置: ` !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! This is dpvs default configuration file. ! ! The attribute "<init>" denotes the configuration item at initialization stage. Item of ! this type is configured oneshoot and not reloadable. If invalid value configured in the ! file, dpvs would use its default value. ! ! Note that dpvs configuration file supports the following comment type: ! line comment: using '#' or '!' ! inline range comment: using '<' and '>', put comment in between !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! ! global config global_defs { log_level DEBUG log_file /data/var/log/dpvs.log } ! netif config netif_defs {

pktpool_size 4194296 pktpool_cache 512 device dpdk0 { rx { queue_number 16 descriptor_number 2048 rss all } tx { queue_number 16 descriptor_number 2048 } fdir { mode perfect pballoc 128k status matched } ! promisc_mode ! kni_name dpdk0.kni } device dpdk1 { rx { queue_number 16 descriptor_number 2048 rss all } tx { queue_number 16 descriptor_number 2048 } fdir { mode perfect pballoc 128k status matched } ! promisc_mode ! kni_name dpdk1.kni } bonding bond0 { mode 4 slave dpdk0 slave dpdk1 primary dpdk0 kni_name bond0.kni } } ! worker config (lcores) worker_defs { worker cpu0 { type master cpu_id 0 } worker cpu1 { type slave cpu_id 1 port bond0 { rx_queue_ids 0 tx_queue_ids 0 ! isol_rx_cpu_ids 9 ! isol_rxq_ring_sz 1048576 } } worker cpu2 { type slave cpu_id 2 port bond0 { rx_queue_ids 1 tx_queue_ids 1 ! isol_rx_cpu_ids 10 ! isol_rxq_ring_sz 1048576 } } worker cpu3 { type slave cpu_id 3 port bond0 { rx_queue_ids 2 tx_queue_ids 2 ! isol_rx_cpu_ids 11 ! isol_rxq_ring_sz 1048576 } } worker cpu4 { type slave cpu_id 4 port bond0 { rx_queue_ids 3 tx_queue_ids 3 ! isol_rx_cpu_ids 12 ! isol_rxq_ring_sz 1048576 } } worker cpu5 { type slave cpu_id 5 port bond0 { rx_queue_ids 4 tx_queue_ids 4 ! isol_rx_cpu_ids 13 ! isol_rxq_ring_sz 1048576 } } worker cpu6 { type slave cpu_id 6 port bond0 { rx_queue_ids 5 tx_queue_ids 5 ! isol_rx_cpu_ids 14 ! isol_rxq_ring_sz 1048576 } } worker cpu7 { type slave cpu_id 7 port bond0 { rx_queue_ids 6 tx_queue_ids 6 ! isol_rx_cpu_ids 15 ! isol_rxq_ring_sz 1048576 } } worker cpu8 { type slave cpu_id 8 port bond0 { rx_queue_ids 7 tx_queue_ids 7 ! isol_rx_cpu_ids 16 ! isol_rxq_ring_sz 1048576 } } worker cpu9 { type slave cpu_id 9 port bond0 { rx_queue_ids 8 tx_queue_ids 8 ! isol_rx_cpu_ids 16 ! isol_rxq_ring_sz 1048576 } } worker cpu10 { type slave cpu_id 10 port bond0 { rx_queue_ids 9 tx_queue_ids 9 ! isol_rx_cpu_ids 16 ! isol_rxq_ring_sz 1048576 } } worker cpu11 { type slave cpu_id 11 port bond0 { rx_queue_ids 10 tx_queue_ids 10 ! isol_rx_cpu_ids 16 ! 
isol_rxq_ring_sz 1048576 } } worker cpu12 { type slave cpu_id 12 port bond0 { rx_queue_ids 11 tx_queue_ids 11 ! isol_rx_cpu_ids 16 ! isol_rxq_ring_sz 1048576 } } worker cpu13 { type slave cpu_id 13 port bond0 { rx_queue_ids 12 tx_queue_ids 12 ! isol_rx_cpu_ids 16 ! isol_rxq_ring_sz 1048576 } } worker cpu14 { type slave cpu_id 14 port bond0 { rx_queue_ids 13 tx_queue_ids 13 ! isol_rx_cpu_ids 16 ! isol_rxq_ring_sz 1048576 } } worker cpu15 { type slave cpu_id 15 port bond0 { rx_queue_ids 14 tx_queue_ids 14 ! isol_rx_cpu_ids 16 ! isol_rxq_ring_sz 1048576 } } worker cpu16 { type slave cpu_id 16 port bond0 { rx_queue_ids 15 tx_queue_ids 15 ! isol_rx_cpu_ids 16 ! isol_rxq_ring_sz 1048576 } } } ! timer config timer_defs { # cpu job loops to schedule dpdk timer management schedule_interval 500 } ! dpvs neighbor config neigh_defs { unres_queue_length 128 timeout 60 } ! dpvs ipv4 config ipv4_defs { forwarding off default_ttl 64 fragment { bucket_number 4096 bucket_entries 16 max_entries 4096 ttl 1 } } ! dpvs ipv6 config ipv6_defs { disable off forwarding off route6 { method "hlist" recycle_time 10 } } ! control plane config ctrl_defs { lcore_msg { ring_size 4096 multicast_queue_length 256 sync_msg_timeout_us 20000 } ipc_msg { unix_domain /var/run/dpvs_ctrl } } ! ipvs config ipvs_defs { conn { conn_pool_size 33554432 conn_pool_cache 512 conn_init_timeout 3 ! expire_quiescent_template ! fast_xmit_close ! redirect off } udp { ! defence_udp_drop uoa_mode ipo uoa_max_trail 0 timeout { normal 300 last 3 } } tcp { ! defence_tcp_drop timeout { none 2 established 7200 syn_sent 3 syn_recv 30 fin_wait 7 time_wait 7 close 3 close_wait 7 last_ack 7 listen 120 synack 30 last 2 } synproxy { synack_options { mss 1380 ttl 63 sack ! wscale ! timestamp } ! defer_rs_syn rs_syn_max_retry 3 ack_storm_thresh 10 max_ack_saved 3 conn_reuse_state { close time_wait ! fin_wait ! close_wait ! last_ack } } } } ! sa_pool config sa_pool { pool_hash_size 8 `
haidfs commented 4 years ago

请教下这问题解决了吗?我也遇到了类似的问题

AssassinOdyssey commented 4 years ago

请问,使用的是什么版本DPDK出现的KNI OOM?问题是否已经解决?

sjaliang commented 3 years ago

可以看下这个帖子 https://bugs.dpdk.org/show_bug.cgi?id=213,还有需要调大 kni memory pool 的大小