Closed lguohan closed 4 years ago
log in from the console, system appears normal
top - 23:35:26 up 7:33, 1 user, load average: 17.07, 16.90, 16.00
Tasks: 179 total, 1 running, 178 sleeping, 0 stopped, 0 zombie
%Cpu0 : 1.3 us, 1.0 sy, 0.0 ni, 97.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu1 : 1.0 us, 1.0 sy, 0.0 ni, 98.0 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
%Cpu2 : 0.7 us, 2.6 sy, 0.0 ni, 96.4 id, 0.0 wa, 0.0 hi, 0.3 si, 0.0 st
%Cpu3 : 1.0 us, 0.3 sy, 0.0 ni, 98.7 id, 0.0 wa, 0.0 hi, 0.0 si, 0.0 st
MiB Mem : 2866.4 total, 686.7 free, 1071.7 used, 1108.0 buff/cache
MiB Swap: 0.0 total, 0.0 free, 0.0 used. 1510.0 avail Mem
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
31527 admin 20 0 11708 3412 3000 R 2.6 0.1 0:00.23 top
4440 root 20 0 1462716 49048 14600 S 1.9 1.7 6:51.44 dialout_c+
4411 root 20 0 1389736 53412 15448 S 1.6 1.8 7:11.22 telemetry
27596 root 20 0 58932 21088 7232 S 0.6 0.7 0:13.15 superviso+
1 root 20 0 171044 10832 7792 S 0.3 0.4 1:22.99 systemd
830 root 20 0 0 0 0 I 0.3 0.0 0:08.16 kworker/1+
911 root 20 0 98884 15140 4412 S 0.3 0.5 18:04.21 redis-ser+
2021 root 20 0 26080 19848 7276 S 0.3 0.7 0:33.18 superviso+
4311 root 20 0 27740 21744 7396 S 0.3 0.7 0:26.70 superviso+
15910 root 20 0 0 0 0 I 0.3 0.0 0:14.15 kworker/3+
25510 root 20 0 0 0 0 D 0.3 0.0 0:49.82 kworker/3+
27244 root 20 0 27604 21372 7348 S 0.3 0.7 0:10.11 superviso+
27469 root 20 0 58940 21116 7212 S 0.3 0.7 0:13.90 superviso+
30897 root 20 0 0 0 0 I 0.3 0.0 0:06.49 kworker/2+
the first kworker to be blocked is appear to be following.
admin@vlab-01:~$ ps aux | grep kwork
root 6 0.0 0.0 0 0 ? I< 16:01 0:00 [kworker/0:0H-kblockd]
root 19 0.0 0.0 0 0 ? I< 16:01 0:00 [kworker/1:0H-kblockd]
root 24 0.0 0.0 0 0 ? I< 16:01 0:00 [kworker/2:0H-kblockd]
root 29 0.0 0.0 0 0 ? I< 16:01 0:00 [kworker/3:0H-kblockd]
root 148 0.0 0.0 0 0 ? I< 16:01 0:00 [kworker/0:1H-kblockd]
root 192 0.0 0.0 0 0 ? I< 16:02 0:00 [kworker/u9:0]
root 202 0.0 0.0 0 0 ? I< 16:02 0:00 [kworker/3:1H-kblockd]
root 203 0.0 0.0 0 0 ? I< 16:02 0:00 [kworker/2:1H-kblockd]
root 204 0.0 0.0 0 0 ? I< 16:02 0:00 [kworker/1:1H-kblockd]
root 830 0.2 0.0 0 0 ? I 22:35 0:08 [kworker/1:0-events_power_efficient]
root 1370 0.1 0.0 0 0 ? D 22:26 0:06 [kworker/3:1+events]
root 5734 0.0 0.0 0 0 ? I 22:08 0:03 [kworker/3:2-cgroup_destroy]
root 6664 0.1 0.0 0 0 ? I 22:36 0:07 [kworker/0:1-events]
root 10762 0.0 0.0 0 0 ? D 21:29 0:05 [kworker/1:2+ipv6_addrconf]
root 15910 0.4 0.0 0 0 ? I 22:39 0:14 [kworker/3:3-events]
root 21160 0.0 0.0 0 0 ? I 22:40 0:01 [kworker/2:1-cgroup_destroy]
root 25510 0.6 0.0 0 0 ? D 21:33 0:50 [kworker/3:0+events]
root 30880 0.0 0.0 0 0 ? I 22:47 0:00 [kworker/1:3-cgroup_destroy]
root 30897 0.2 0.0 0 0 ? I 22:47 0:06 [kworker/2:0-events]
root 30942 0.0 0.0 0 0 ? I 22:48 0:00 [kworker/0:2-events]
root 31176 0.0 0.0 0 0 ? I 22:55 0:00 [kworker/u8:1-flush-7:1]
root 31449 0.0 0.0 0 0 ? I 23:27 0:00 [kworker/u8:0-flush-254:0]
admin 31546 0.0 0.0 6824 828 ttyS0 S+ 23:36 0:00 grep kwork
another case
admin@vlab-01:~$ ps axu | grep kwor
root 6 0.0 0.0 0 0 ? I< Jun21 0:00 [kworker/0:0H-kblockd]
root 19 0.0 0.0 0 0 ? I< Jun21 0:00 [kworker/1:0H-kblockd]
root 24 0.0 0.0 0 0 ? I< Jun21 0:00 [kworker/2:0H-kblockd]
root 29 0.0 0.0 0 0 ? I< Jun21 0:00 [kworker/3:0H-kblockd]
root 147 0.0 0.0 0 0 ? I< Jun21 0:02 [kworker/3:1H-kblockd]
root 191 0.0 0.0 0 0 ? I< Jun21 0:00 [kworker/u9:0]
root 193 0.0 0.0 0 0 ? I< Jun21 0:03 [kworker/1:1H-kblockd]
root 194 0.0 0.0 0 0 ? I< Jun21 0:03 [kworker/2:1H-kblockd]
root 199 0.0 0.0 0 0 ? I< Jun21 0:02 [kworker/0:1H-kblockd]
root 2937 0.0 0.0 0 0 ? I 11:15 0:00 [kworker/0:4-events]
root 3006 0.0 0.0 0 0 ? I 11:22 0:00 [kworker/u8:1-flush-254:0]
root 3007 0.0 0.0 0 0 ? I 11:23 0:00 [kworker/0:1-cgroup_destroy]
root 3093 0.0 0.0 0 0 ? I 11:33 0:00 [kworker/u8:2-events_unbound]
root 3208 0.0 0.0 0 0 ? I 11:48 0:00 [kworker/u8:0-flush-254:0]
admin 3259 0.0 0.0 6692 892 ttyS0 S+ 11:51 0:00 grep kwor
root 7302 0.5 0.0 0 0 ? I 10:50 0:21 [kworker/3:0-mm_percpu_wq]
root 15512 0.2 0.0 0 0 ? I 10:34 0:10 [kworker/2:1-cgroup_destroy]
root 17552 0.0 0.0 0 0 ? D 10:56 0:00 [kworker/0:3+ipv6_addrconf]
root 22045 0.0 0.0 0 0 ? I 10:59 0:01 [kworker/1:0-events]
root 26404 0.0 0.0 0 0 ? D 11:02 0:00 [kworker/2:2+events]
root 26440 0.0 0.0 0 0 ? I 11:02 0:00 [kworker/3:1-mm_percpu_wq]
root 28332 0.4 0.0 0 0 ? D 11:03 0:12 [kworker/0:2+events]
root 31331 0.0 0.0 0 0 ? I 11:06 0:00 [kworker/1:2-mm_percpu_wq]
root 32718 0.0 0.0 0 0 ? I 11:06 0:01 [kworker/2:3-events_power_efficient]
admin@vlab-01:~$ pstree
[213967.219393] e1000 0000:03:02.0 eth31: Reset adapter
[214115.443195] INFO: task kworker/2:2:26404 blocked for more than 120 seconds.
[214115.447303] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.451979] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[214115.457982] INFO: task ntpd:772 blocked for more than 120 seconds.
[214115.462522] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.470280] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[214115.475385] INFO: task snmpd:1765 blocked for more than 120 seconds.
[214115.481206] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.486604] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[214115.491254] INFO: task zebra:2022 blocked for more than 120 seconds.
[214115.494523] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.499105] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[214115.505757] INFO: task teamd:2157 blocked for more than 120 seconds.
[214115.510606] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.515195] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[214115.519273] INFO: task teamd:2191 blocked for more than 120 seconds.
[214115.522426] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.526544] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[214115.532451] INFO: task teamd:2201 blocked for more than 120 seconds.
[214115.537019] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.541644] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[214115.546335] INFO: task teamd:2212 blocked for more than 120 seconds.
[214115.550756] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.555396] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[214115.559744] INFO: task lldpd:2264 blocked for more than 120 seconds.
[214115.567931] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.573853] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[214115.579788] INFO: task teamsyncd:2275 blocked for more than 120 seconds.
[214115.586768] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[214115.592859] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
Kernel patch in later version available now, https://github.com/torvalds/linux/commit/e64e469b9a2c22d41b3dd7172118760cec22d473
Need to verify if it fixes this.
torvalds/linux@e64e469 already in 4.19 kernel, should be something else.
another repo.
[26825.855448] INFO: task kworker/2:1:5747 blocked for more than 120 seconds.
[26825.860651] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26825.866477] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[26825.873379] INFO: task ntpd:11794 blocked for more than 120 seconds.
[26825.876949] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26825.882849] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[26825.888053] INFO: task snmpd:12767 blocked for more than 120 seconds.
[26825.894316] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26825.899711] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[26825.904202] INFO: task zebra:13083 blocked for more than 120 seconds.
[26825.909076] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26825.913699] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[26825.918528] INFO: task fpmsyncd:13094 blocked for more than 120 seconds.
[26825.922279] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26825.926638] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[26825.932217] INFO: task teamd:13177 blocked for more than 120 seconds.
[26825.936019] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26825.941478] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[26825.947704] INFO: task teamd:13211 blocked for more than 120 seconds.
[26825.952621] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26825.957684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[26825.962553] INFO: task teamd:13221 blocked for more than 120 seconds.
[26825.966648] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26825.973184] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[26825.978071] INFO: task teamd:13230 blocked for more than 120 seconds.
[26825.982120] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26825.988493] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[26825.992946] INFO: task lldpd:13307 blocked for more than 120 seconds.
[26825.996550] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[26826.001602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
admin@vlab-01:~$ ps aux | grep kwork
root 6 0.0 0.0 0 0 ? I< Jun24 0:00 [kworker/0:0H-kblockd]
root 19 0.0 0.0 0 0 ? I< Jun24 0:00 [kworker/1:0H-kblockd]
root 24 0.0 0.0 0 0 ? I< Jun24 0:00 [kworker/2:0H-kblockd]
root 29 0.0 0.0 0 0 ? I< Jun24 0:00 [kworker/3:0H-kblockd]
root 147 0.0 0.0 0 0 ? I< Jun24 0:01 [kworker/2:1H-kblockd]
root 191 0.0 0.0 0 0 ? I< Jun24 0:00 [kworker/u9:0]
root 193 0.0 0.0 0 0 ? I< Jun24 0:01 [kworker/0:1H-kblockd]
root 196 0.0 0.0 0 0 ? I< Jun24 0:01 [kworker/3:1H-kblockd]
root 203 0.0 0.0 0 0 ? I< Jun24 0:01 [kworker/1:1H-kblockd]
root 5747 0.0 0.0 0 0 ? D 00:28 0:01 [kworker/2:1+events]
root 6876 0.4 0.0 0 0 ? D 00:29 0:12 [kworker/3:2+events]
root 7461 0.0 0.0 0 0 ? I 00:29 0:00 [kworker/2:3-events]
root 9907 1.1 0.0 0 0 ? I 00:31 0:30 [kworker/0:0-events]
root 13651 1.0 0.0 0 0 ? I 00:35 0:24 [kworker/3:0-events]
root 13652 0.0 0.0 0 0 ? I 00:35 0:00 [kworker/1:0-events_power_efficient]
root 13804 0.0 0.0 0 0 ? I 00:53 0:00 [kworker/u8:2-flush-7:1]
root 13917 0.0 0.0 0 0 ? I 01:06 0:00 [kworker/u8:1-events_unbound]
root 13977 0.0 0.0 0 0 ? I 01:13 0:00 [kworker/u8:0-flush-7:1]
admin 13995 0.0 0.0 6824 892 ttyS0 R+ 01:14 0:00 grep kwork
root 14604 0.2 0.0 0 0 ? I 00:10 0:08 [kworker/3:3-events]
root 16933 0.5 0.0 0 0 ? I 00:13 0:21 [kworker/2:0-events]
root 17283 0.0 0.0 0 0 ? I 00:13 0:02 [kworker/1:3-events_power_efficient]
root 21077 0.3 0.0 0 0 ? D 00:16 0:10 [kworker/3:1+ipv6_addrconf]
root 28065 0.1 0.0 0 0 ? I Jun24 0:08 [kworker/0:3-events]
further debug
[10392.714321] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.718983] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10392.725674] INFO: task ntpd:6056 blocked for more than 120 seconds.
[10392.732181] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.738684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10392.745177] INFO: task snmpd:6988 blocked for more than 120 seconds.
[10392.750342] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.758235] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10392.762821] INFO: task zebra:7346 blocked for more than 120 seconds.
[10392.766575] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.771307] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10392.776866] INFO: task teamd:7446 blocked for more than 120 seconds.
[10392.782028] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.789553] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10392.794919] INFO: task teamd:7467 blocked for more than 120 seconds.
[10392.798667] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.805170] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10392.812838] INFO: task teamd:7486 blocked for more than 120 seconds.
[10392.817007] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.821739] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10392.827909] INFO: task teamd:7497 blocked for more than 120 seconds.
[10392.831154] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.837983] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10392.846949] INFO: task lldpd:7597 blocked for more than 120 seconds.
[10392.852424] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.859290] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[10392.866490] INFO: task teamsyncd:7613 blocked for more than 120 seconds.
[10392.874310] Tainted: G W E 4.19.0-6-amd64 Azure/sonic-buildimage#1 Debian 4.19.67-2+deb10u2
[10392.881400] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
root 3923 0.3 0.0 0 0 ? D 05:42 0:07 [kworker/3:1+events]
root 11412 0.6 0.0 0 0 ? D 05:48 0:10 [kworker/0:1+events]
root 32361 0.3 0.0 0 0 ? D 06:03 0:02 [kworker/0:2+ipv6_addrconf]
root@vlab-01:/proc/32361# cat stack
[<0>] addrconf_verify_work+0xa/0x20
[<0>] process_one_work+0x1a7/0x3a0
[<0>] worker_thread+0x30/0x390
[<0>] kthread+0x112/0x130
[<0>] ret_from_fork+0x35/0x40
[<0>] 0xffffffffffffffff
root@vlab-01:/proc/3923# cat stack
[<0>] linkwatch_event+0xa/0x30
[<0>] process_one_work+0x1a7/0x3a0
[<0>] worker_thread+0x30/0x390
[<0>] kthread+0x112/0x130
[<0>] ret_from_fork+0x35/0x40
[<0>] 0xffffffffffffffff
root@vlab-01:/proc/11412# cat stack
[<0>] msleep+0x29/0x30
[<0>] napi_disable+0x2b/0x70
[<0>] e1000_down+0xfe/0x1c0 [e1000]
[<0>] e1000_reinit_locked+0x49/0x60 [e1000]
[<0>] process_one_work+0x1a7/0x3a0
[<0>] worker_thread+0x30/0x390
[<0>] kthread+0x112/0x130
[<0>] ret_from_fork+0x35/0x40
[<0>] 0xffffffffffffffff
root@vlab-01:/proc/6056# cat stack
[<0>] __netlink_dump_start+0x56/0x1e0
[<0>] rtnetlink_rcv_msg+0x22c/0x360
[<0>] netlink_rcv_skb+0x4c/0x120
[<0>] netlink_unicast+0x181/0x210
[<0>] netlink_sendmsg+0x204/0x3d0
[<0>] sock_sendmsg+0x36/0x40
[<0>] __sys_sendto+0xee/0x160
[<0>] __x64_sys_sendto+0x24/0x30
[<0>] do_syscall_64+0x53/0x110
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[<0>] 0xffffffffffffffff
root@vlab-01:/proc/7346# cat stack
[<0>] dev_ioctl+0xaa/0x560
[<0>] sock_do_ioctl+0xa0/0x140
[<0>] sock_ioctl+0x19e/0x320
[<0>] do_vfs_ioctl+0xa4/0x630
[<0>] ksys_ioctl+0x60/0x90
[<0>] __x64_sys_ioctl+0x16/0x20
[<0>] do_syscall_64+0x53/0x110
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[<0>] 0xffffffffffffffff
root@vlab-01:/proc/6988# cat stack
[<0>] sock_do_ioctl+0xe2/0x140
[<0>] sock_ioctl+0x19e/0x320
[<0>] do_vfs_ioctl+0xa4/0x630
[<0>] ksys_ioctl+0x60/0x90
[<0>] __x64_sys_ioctl+0x16/0x20
[<0>] do_syscall_64+0x53/0x110
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[<0>] 0xffffffffffffffff
Description
Steps to reproduce the issue:
Describe the results you received:
kernel stucked. you can still log from the console.
Describe the results you expected:
Additional information you deem important (e.g. issue happens only occasionally):