iqiyi / qnsm

QNSM is a network security monitoring framework based on DPDK.
Other
519 stars 188 forks source link

Segmentation fault (core dumped) #39

Closed xin053 closed 3 years ago

xin053 commented 3 years ago

运行命令:./qnsm-inspect -f qnsm_inspect1.cfg -c . -p 1

环境: CentOS 7.6。DPDK 16.11.2 编译时 kni 报错,所以改用了 DPDK 16.11.11 版本,编译正常。共两个网卡:管理网卡(id 2)和使用了 DPDK 驱动的网卡(id 6)。

配置文件:

[EAL]
log_level = 8
n = 1
socket_mem = 2048,2048
master_lcore = 1

[IDPS]
conf_file = ./suricata.yaml

;mbuf mempool cfg
;add mbuf private size para
[MEMPOOL0]
buffer_size = 2304
pool_size = 131072
cache_size = 256
cpu = 0;socket_id
private_size = 64 ;sizeof(QNSM_PACKET_INFO)

;for dump
[MEMPOOL1]
buffer_size = 2304
pool_size = 131072
cache_size = 256
cpu = 1;socket_id
private_size = 64 ;sizeof(QNSM_PACKET_INFO)

;link cfg
[LINK0]
rss_qs = 0 1 2 3
rss_proto_ipv4 = TCP UDP
rss_proto_ipv6 = TCP TCP_EX UDP UDP_EX
symmetrical_rss = yes
;ip_local_q = 7  reserved for future proto stack app
;arp_q = 8

;rx queue cfg
;http://dpdk.org/doc/guides/nics/ixgbe.html
[RXQ0.0]
size = 512
burst = 32

[RXQ0.1]
size = 512
burst = 32

[RXQ0.2]
size = 512
burst = 32

[RXQ0.3]
size = 512
burst = 32

[SWQ1]
cpu = 0
mempool = MEMPOOL0
dump = yes

[SWQ2]
cpu = 0
mempool = MEMPOOL0
dump = yes

[SWQ3]
cpu = 0
mempool = MEMPOOL0
dump = yes

[SWQ4]
cpu = 0
mempool = MEMPOOL0
dump = yes

[SWQ5]
size = 2048
cpu = 1
mempool = MEMPOOL1
dup = yes

[SWQ6]
size = 2048
cpu = 1
mempool = MEMPOOL1
dup = yes

[SWQ7]
size = 2048
cpu = 1
mempool = MEMPOOL1
dup = yes

[SWQ8]
size = 2048
cpu = 1
mempool = MEMPOOL1
dup = yes

;app cfg
[PIPELINE0]
type = MASTER
core = s0c0;lcore24

[PIPELINE1]
type = SESSM
core = s0c1
pktq_in = RXQ0.0
pktq_out = SWQ1 SWQ5 SWQ9
timer_period = 10

[PIPELINE2]
type = SESSM
core = s0c2
pktq_in = RXQ0.1
pktq_out = SWQ2 SWQ6 SWQ10
timer_period = 10

[PIPELINE3]
type = SESSM
core = s0c3
pktq_in = RXQ0.2
pktq_out = SWQ3 SWQ7 SWQ11
timer_period = 10

[PIPELINE4]
type = SESSM
core = s0c4
pktq_in = RXQ0.3
pktq_out = SWQ4 SWQ8 SWQ12
timer_period = 10

[PIPELINE5]
type = DUMP
core = s0c5
pktq_in = SWQ1 SWQ2 SWQ3 SWQ4
timer_period = 10

[PIPELINE6]
type = SIP_IN_AGG
core = s0c6

[PIPELINE7]
type = VIP_AGG
core = s0c7
pktq_in = SWQ9 SWQ10 SWQ11 SWQ12
timer_period = 10

[PIPELINE8]
type = EDGE
core = s0c6

;IPS BEGIN
[PIPELINE9]
type = DETECT
core = s1c1
pktq_in = SWQ5

[PIPELINE10]
type = DETECT
core = s1c2
pktq_in = SWQ6

[PIPELINE11]
type = DETECT
core = s1c3
pktq_in = SWQ7

[PIPELINE12]
type = DETECT
core = s1c4
pktq_in = SWQ8

;IPS END

报错

[root@cpe-172-100-20-248 qnsm]# ./qnsm-inspect -f qnsm_inspect.cfg -c . -p 1
[APP] Initializing CPU core map ...
[APP] CPU core mask = 0x00000000000000000000000000001eff
[APP] Initializing EAL ...
EAL: Detected 32 lcore(s)
EAL: No free hugepages reported in hugepages-1048576kB
EAL: Probing VFIO support...
EAL: PCI device 0000:01:00.0 on NUMA socket 0
EAL:   probe driver: 8086:1521 net_e1000_igb
EAL: PCI device 0000:01:00.1 on NUMA socket 0
EAL:   probe driver: 8086:1521 net_e1000_igb
EAL: PCI device 0000:81:00.0 on NUMA socket 1
EAL:   probe driver: 8086:10fb net_ixgbe
EAL: PCI device 0000:81:00.1 on NUMA socket 1
EAL:   probe driver: 8086:10fb net_ixgbe
EAL: PCI device 0000:84:00.0 on NUMA socket 1
EAL:   probe driver: 8086:10fb net_ixgbe
EAL: PCI device 0000:84:00.1 on NUMA socket 1
EAL:   probe driver: 8086:10fb net_ixgbe
[APP] Initializing MEMPOOL0 ...
[APP] Initializing MEMPOOL1 ...
[APP] Initializing LINK0 (0) (4 RXQ, 0 TXQ) ...
[APP] LINK0 (0) (10 Gbps) UP
[APP] Initializing SWQ1...
[APP] Initializing SWQ2...
[APP] Initializing SWQ3...
[APP] Initializing SWQ4...
[APP] Initializing SWQ5...
[APP] Initializing SWQ6...
[APP] Initializing SWQ7...
[APP] Initializing SWQ8...
[APP] Initializing SWQ9...
[APP] Initializing SWQ10...
[APP] Initializing SWQ11...
[APP] Initializing SWQ12...
[APP] Initializing MSGQ-REQ-PIPELINE0 ...
[APP] Initializing MSGQ-RSP-PIPELINE0 ...
[APP] Initializing MSGQ-REQ-CORE-s0c0 ...
[APP] Initializing MSGQ-RSP-CORE-s0c0 ...
[APP] Initializing MSGQ-REQ-PIPELINE1 ...
[APP] Initializing MSGQ-RSP-PIPELINE1 ...
[APP] Initializing MSGQ-REQ-CORE-s0c1 ...
[APP] Initializing MSGQ-RSP-CORE-s0c1 ...
[APP] Initializing MSGQ-REQ-PIPELINE2 ...
[APP] Initializing MSGQ-RSP-PIPELINE2 ...
[APP] Initializing MSGQ-REQ-CORE-s0c2 ...
[APP] Initializing MSGQ-RSP-CORE-s0c2 ...
[APP] Initializing MSGQ-REQ-PIPELINE3 ...
[APP] Initializing MSGQ-RSP-PIPELINE3 ...
[APP] Initializing MSGQ-REQ-CORE-s0c3 ...
[APP] Initializing MSGQ-RSP-CORE-s0c3 ...
[APP] Initializing MSGQ-REQ-PIPELINE4 ...
[APP] Initializing MSGQ-RSP-PIPELINE4 ...
[APP] Initializing MSGQ-REQ-CORE-s0c4 ...
[APP] Initializing MSGQ-RSP-CORE-s0c4 ...
[APP] Initializing MSGQ-REQ-PIPELINE5 ...
[APP] Initializing MSGQ-RSP-PIPELINE5 ...
[APP] Initializing MSGQ-REQ-CORE-s0c5 ...
[APP] Initializing MSGQ-RSP-CORE-s0c5 ...
[APP] Initializing MSGQ-REQ-PIPELINE6 ...
[APP] Initializing MSGQ-RSP-PIPELINE6 ...
[APP] Initializing MSGQ-REQ-CORE-s0c6 ...
[APP] Initializing MSGQ-RSP-CORE-s0c6 ...
[APP] Initializing MSGQ-REQ-PIPELINE7 ...
[APP] Initializing MSGQ-RSP-PIPELINE7 ...
[APP] Initializing MSGQ-REQ-CORE-s0c7 ...
[APP] Initializing MSGQ-RSP-CORE-s0c7 ...
[APP] Initializing MSGQ-REQ-PIPELINE8 ...
[APP] Initializing MSGQ-RSP-PIPELINE8 ...
[APP] Initializing MSGQ-REQ-PIPELINE9 ...
[APP] Initializing MSGQ-RSP-PIPELINE9 ...
[APP] Initializing MSGQ-REQ-CORE-s1c1 ...
[APP] Initializing MSGQ-RSP-CORE-s1c1 ...
[APP] Initializing MSGQ-REQ-PIPELINE10 ...
[APP] Initializing MSGQ-RSP-PIPELINE10 ...
[APP] Initializing MSGQ-REQ-CORE-s1c2 ...
[APP] Initializing MSGQ-RSP-CORE-s1c2 ...
[APP] Initializing MSGQ-REQ-PIPELINE11 ...
[APP] Initializing MSGQ-RSP-PIPELINE11 ...
[APP] Initializing MSGQ-REQ-CORE-s1c3 ...
[APP] Initializing MSGQ-RSP-CORE-s1c3 ...
[APP] Initializing MSGQ-REQ-PIPELINE12 ...
[APP] Initializing MSGQ-RSP-PIPELINE12 ...
[APP] Initializing MSGQ-REQ-CORE-s1c4 ...
[APP] Initializing MSGQ-RSP-CORE-s1c4 ...
Segmentation fault (core dumped)

能帮忙看看嘛,感激不尽

CosmosSun commented 3 years ago

是不是分配的内存不足?把 socket_mem = 2048,2048 改成 socket_mem = 10240,2048 试试呢。另外可以生成 coredump 文件看一下。

xin053 commented 3 years ago

设置成 socket_mem = 10240,2048

会报错:

[APP] Initializing CPU core map ...
[APP] CPU core mask = 0x00000000000000000000000000001eff
[APP] Initializing EAL ...
EAL: Detected 32 lcore(s)
EAL: No free hugepages reported in hugepages-1048576kB
EAL: Probing VFIO support...
EAL: Not enough memory available on socket 0! Requested: 10240MB, available: 2048MB
PANIC in rte_eal_init():
Cannot init memory

所以还是使用 socket_mem = 2048,2048。coredump 文件的信息如下:

[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `./qnsm-inspect -f qnsm_inspect.cfg -c . -p 1'.
Program terminated with signal 11, Segmentation fault.
#0  0x0000000000557e05 in qnsm_crm_msg_recv ()
Missing separate debuginfos, use: debuginfo-install cyrus-sasl-lib-2.1.26-23.el7.x86_64 file-libs-5.11-37.el7.x86_64 glibc-2.17-323.el7_9.x86_64 jansson-2.10-1.el7.x86_64 keyutils-libs-1.5.8-3.el7.x86_64 krb5-libs-1.15.1-34.el7.x86_64 libcap-ng-0.7.5-4.el7.x86_64 libcom_err-1.42.9-13.el7.x86_64 libpcap-1.5.3-12.el7.x86_64 librdkafka-0.11.5-1.el7.x86_64 libselinux-2.5-14.1.el7.x86_64 libxml2-2.9.1-6.el7.5.x86_64 libyaml-0.1.4-11.el7_0.x86_64 lz4-1.7.5-2.el7.x86_64 nspr-4.25.0-2.el7_9.x86_64 nss-3.53.1-3.el7_9.x86_64 nss-softokn-freebl-3.53.1-6.el7_9.x86_64 nss-util-3.53.1-1.el7_9.x86_64 openssl-libs-1.0.2k-16.el7.x86_64 pcre-8.32-17.el7.x86_64 xz-libs-5.2.2-1.el7.x86_64 zlib-1.2.7-19.el7_9.x86_64
(gdb) bt
#0  0x0000000000557e05 in qnsm_crm_msg_recv ()
#1  0x000000000056dca9 in qnsm_crm_msg_req_handle ()
#2  0x00000000004e9d07 in eal_alarm_callback ()
#3  0x00000000004e869d in eal_intr_thread_main ()
#4  0x00007f76f0687ea5 in start_thread () from /lib64/libpthread.so.0
#5  0x00007f76efb149fd in clone () from /lib64/libc.so.6
CosmosSun commented 3 years ago

设置成 socket_mem = 10240,2048 的同时,需要将 socket 0 对应的 hugepage 数量调整到相应的大小,再试一下呢。

xin053 commented 3 years ago

执行了

echo 20480 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
echo 1024 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages

改成 socket_mem = 40960,2048,以及

echo 5120 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages
echo 1024 > /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages

改成 socket_mem = 10240,2048 都试了下,还是报错 Segmentation fault

测试物理机,cpu 32核,内存 128G

CosmosSun commented 3 years ago

SIP_IN_AGG 和 EDGE 都使用了 s0c6,而目前不支持多个组件实例绑定同一个 core。把它们改到不同的 core 上再试试呢。

xin053 commented 3 years ago

使用了不同的核,还是相同报错

[EAL]
log_level = 8
n = 4
socket_mem = 40960,2048
master_lcore = 0

[IDPS]
conf_file = ./suricata.yaml

;mbuf mempool cfg
;add mbuf private size para
[MEMPOOL0]
buffer_size = 2304
pool_size = 131072
cache_size = 256
cpu = 0;socket_id
private_size = 64 ;sizeof(QNSM_PACKET_INFO)

;for dump
[MEMPOOL1]
buffer_size = 2304
pool_size = 131072
cache_size = 256
cpu = 1;socket_id
private_size = 64 ;sizeof(QNSM_PACKET_INFO)

;link cfg
[LINK0]
rss_qs = 0 1 2 3
rss_proto_ipv4 = TCP UDP
rss_proto_ipv6 = TCP TCP_EX UDP UDP_EX
symmetrical_rss = yes
;ip_local_q = 7  reserved for future proto stack app
;arp_q = 8

;rx queue cfg
;http://dpdk.org/doc/guides/nics/ixgbe.html
[RXQ0.0]
size = 2048
burst = 32

[RXQ0.1]
size = 2048
burst = 32

[RXQ0.2]
size = 2048
burst = 32

[RXQ0.3]
size = 2048
burst = 32

;[SWQ1]
;cpu = 0
;mempool = MEMPOOL0
;dump = yes

;[SWQ2]
;cpu = 0
;mempool = MEMPOOL0
;dump = yes

;[SWQ3]
;cpu = 0
;mempool = MEMPOOL0
;dump = yes

;[SWQ4]
;cpu = 0
;mempool = MEMPOOL0
;dump = yes

[SWQ5]
size = 2048
cpu = 1
mempool = MEMPOOL1
dup = yes

[SWQ6]
size = 2048
cpu = 1
mempool = MEMPOOL1
dup = yes

[SWQ7]
size = 2048
cpu = 1
mempool = MEMPOOL1
dup = yes

[SWQ8]
size = 2048
cpu = 1
mempool = MEMPOOL1
dup = yes

;app cfg
[PIPELINE0]
type = MASTER
core = s0c0;lcore24

[PIPELINE1]
type = SESSM
core = s0c1
pktq_in = RXQ0.0
pktq_out = SWQ5 SWQ9
timer_period = 10

[PIPELINE2]
type = SESSM
core = s0c2
pktq_in = RXQ0.1
pktq_out = SWQ6 SWQ10
timer_period = 10

[PIPELINE3]
type = SESSM
core = s0c3
pktq_in = RXQ0.2
pktq_out = SWQ7 SWQ11
timer_period = 10

[PIPELINE4]
type = SESSM
core = s0c4
pktq_in = RXQ0.3
pktq_out = SWQ8 SWQ12
timer_period = 10

;[PIPELINE5]
;type = DUMP
;core = s0c5
;pktq_in = SWQ1 SWQ2 SWQ3 SWQ4
;timer_period = 10

[PIPELINE6]
type = SIP_IN_AGG
core = s0c5

[PIPELINE7]
type = VIP_AGG
core = s0c6
pktq_in = SWQ9 SWQ10 SWQ11 SWQ12
timer_period = 10

[PIPELINE8]
type = EDGE
core = s0c7

;IPS BEGIN
[PIPELINE9]
type = DETECT
core = s1c1
pktq_in = SWQ5

[PIPELINE10]
type = DETECT
core = s1c2
pktq_in = SWQ6

[PIPELINE11]
type = DETECT
core = s1c3
pktq_in = SWQ7

[PIPELINE12]
type = DETECT
core = s1c4
pktq_in = SWQ8

;IPS END
xin053 commented 3 years ago

换了台服务器重新部署了一下,可以正常启动了。应该是之前部署时某个地方操作有问题。