Open liyi-ibm opened 5 years ago
On one dubai:
Oops#1 Part1 <4>[362311.252595] CPU: 172 PID: 158155 Comm: java Kdump: loaded Tainted: G W ------------ 4.14.0-115.2.2.el7a.1.ppc64le #1 <4>[362311.252596] task: c0002012d26c0000 task.stack: c000201233a80000 <4>[362311.252597] NIP: c000000000695114 LR: c000000000695110 CTR: 0000000000000000 <4>[362311.252598] REGS: c000201233a83940 TRAP: 0700 Tainted: G W ------------ (4.14.0-115.2.2.el7a.1.ppc64le) <4>[362311.252599] MSR: 9000000000029033 <SF,HV,EE,ME,IR,DR,RI,LE> CR: 42262428 XER: 20040000 <4>[362311.252605] CFAR: c0000000001cdf74 SOFTE: 0 <4>[362311.252605] GPR00: c000000000695110 c000201233a83bc0 c0000000014ce100 0000000000000075 <4>[362311.252605] GPR04: 0000000000000000 0000000000000001 0000000000040308 0000000000000000 <4>[362311.252605] GPR08: 0000000000000001 0000000000000007 0000000000000006 0000000000000002 <4>[362311.252605] GPR12: 0000000022262422 c000000007af6400 00000000cd58be01 c000200de9271e00 <4>[362311.252605] GPR16: 00007fff5c909000 0000000400000000 c000200de9271e40 c000200de9271e58 <4>[362311.252605] GPR20: c000200de9271e18 0000000000000000 0000000000000000 c000201ca7ad09c8 <4>[362311.252605] GPR24: c000201ca7ad09c0 00000000606dc601 0000000000000001 c0000008d5fff008 <4>[362311.252605] GPR28: c000201c94599f18 c000201c94599f18 c0000008d5fff000 c000201c94599f00 <4>[362311.252629] NIP [c000000000695114] __list_add_valid+0x94/0xc0 <4>[362311.252631] LR [c000000000695110] __list_add_valid+0x90/0xc0 <4>[362311.252632] Call Trace: <4>[362311.252634] [c000201233a83bc0] [c000000000695110] __list_add_valid+0x90/0xc0 (unreliable) <4>[362311.252636] [c000201233a83c20] [c0000000001ae280] add_wait_queue+0x70/0xd0 <4>[362311.252639] [c000201233a83c70] [c0000000004c5b64] ep_ptable_queue_proc+0x134/0x140 <4>[362311.252641] [c000201233a83cb0] [c000000000adf4ac] tcp_poll+0x5c/0x380 <4>[362311.252644] [c000201233a83cf0] [c000000000a0c5f4] sock_poll+0xb4/0xf0 <4>[362311.252647] [c000201233a83d30] [c0000000004c8194] SyS_epoll_ctl+0x924/0xf10 <4>[362311.252650] 
[c000201233a83e30] [c00000000000b284] system_call+0x58/0x6c <4>[362311.252651] Instruction dump: <4>[362311.252652] 7d254b78 4bb38e45 60000000 0fe00000 38600000 4bffffd0 3c62ff9c 7cc43378 <4>[362311.252657] 38635010 7d264b78 4bb38e21 60000000 <0fe00000> 38600000 4bffffac 3c62ff9c <4>[362311.252662] ---[ end trace 06be13b799e17299 ]--- <1>[362311.259459] Unable to handle kernel paging request for data at address 0x5deadbeef0000100 <1>[362311.259594] Faulting instruction address: 0xc0000000001ae7f8 <4>[362311.259670] Oops: Kernel access of bad area, sig: 11 [#1] <4>[362311.259713] LE SMP NR_CPUS=2048 NUMA PowerNV <4>[362311.259758] Modules linked in: fuse vfat msdos fat xfs libcrc32c dm_mod bridge stp llc iptable_filter i2c_dev sr_mod cdrom sg shpchp ofpart ipmi_powernv at24 powernv_flash ipmi_devintf ibmpowernv mtd uio_pdrv_genirq uio ipmi_msghandler opal_prd nfsd auth_rpcgss nfs_acl lockd grace sunrpc binfmt_misc ip_tables ext4 mbcache jbd2 sd_mod uas usb_storage ast i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm ixgbe drm mpt3sas raid_class mdio scsi_transport_sas ptp pps_core <4>[362311.260178] CPU: 63 PID: 0 Comm: swapper/63 Kdump: loaded Tainted: G W ------------ 4.14.0-115.2.2.el7a.1.ppc64le #1 <4>[362311.260279] task: c000001fee6eb200 task.stack: c000001fee8a0000 <4>[362311.260374] NIP: c0000000001ae7f8 LR: c0000000001ae838 CTR: c0000000004c61c0 <4>[362311.260435] REGS: c000201fff4b7190 TRAP: 0380 Tainted: G W ------------ (4.14.0-115.2.2.el7a.1.ppc64le) <4>[362311.260513] MSR: 9000000000009033 <SF,HV,EE,ME,IR,DR,RI,LE> CR: 24888884 XER: 20040000 <4>[362311.260583] CFAR: c0000000001ae86c SOFTE: 0 <4>[362311.260583] GPR00: c0000000001ae838 c000201fff4b7410 c0000000014ce100 0000000000000001 <4>[362311.260583] GPR04: 0000000000000000 0000000000000001 00000000000000c3 00000000000000c3 <4>[362311.260583] GPR08: c000201fff4b74a0 5deadbeef00000e8 0000000000000000 c008000013964998 <4>[362311.260583] GPR12: 0000000000008000 
c000000007aab500 c00800001a241340 c000000cd895b980 <4>[362311.260583] GPR16: c000001f60b00000 c000000000000000 0000000000000056 00000000000009a0 <4>[362311.260583] GPR20: c000201cab7c0900 0000000000000000 c000201fff4b74a0 0000000000000001 <4>[362311.260583] GPR24: 00000000000000c3 0000000000000001 0000000000000001 0000000000000001 <4>[362311.260583] GPR28: c0000008d5fff008 0000000000000000 5deadbeef00000e8 5deadbeef00000e8 <4>[362311.261215] NIP [c0000000001ae7f8] __wake_up_common+0xb8/0x2d0 <4>[362311.261268] LR [c0000000001ae838] __wake_up_common+0xf8/0x2d0 <4>[362311.261318] Call Trace: <4>[362311.261345] [c000201fff4b7410] [c0000000001ae838] __wake_up_common+0xf8/0x2d0 (unreliable) <4>[362311.261413] [c000201fff4b7480] [c0000000001af4ac] __wake_up_sync_key+0xcc/0x120 <4>[362311.261530] [c000201fff4b7510] [c000000000a178e4] sock_def_readable+0x54/0x90 <4>[362311.261594] [c000201fff4b7540] [c000000000af3514] tcp_data_queue+0x724/0x14f0 <4>[362311.261656] [c000201fff4b7630] [c000000000af452c] tcp_rcv_established+0x24c/0x7d0 <4>[362311.261725] [c000201fff4b7690] [c000000000b05f60] tcp_v4_do_rcv+0x1a0/0x2b0 <4>[362311.261788] [c000201fff4b76d0] [c000000000b08890] tcp_v4_rcv+0xc40/0xe90 <4>[362311.261898] [c000201fff4b77c0] [c000000000acc080] ip_local_deliver_finish+0x170/0x350 <4>[362311.261965] [c000201fff4b7810] [c000000000acc950] ip_local_deliver+0x60/0x130 <4>[362311.262029] [c000201fff4b7870] [c000000000acc468] ip_rcv_finish+0x208/0x4b0 <4>[362311.262091] [c000201fff4b7900] [c000000000accd18] ip_rcv+0x2f8/0x470 <4>[362311.262147] [c000201fff4b7980] [c000000000a3e73c] __netif_receive_skb_core+0x73c/0x1010 <4>[362311.262260] [c000201fff4b7a40] [c000000000a461e8] netif_receive_skb_internal+0x58/0x160 <4>[362311.262322] [c000201fff4b7a80] [c000000000a49738] napi_gro_receive+0x1c8/0x2f0 <4>[362311.262402] [c000201fff4b7ac0] [c00800001260818c] ixgbe_clean_rx_irq+0x46c/0xf50 [ixgbe] <4>[362311.262477] [c000201fff4b7bd0] [c008000012609194] ixgbe_poll+0x484/0xac0 
[ixgbe] <4>[362311.262550] [c000201fff4b7cf0] [c000000000a493ec] net_rx_action+0x3cc/0x550 <4>[362311.262657] [c000201fff4b7e00] [c000000000c44c3c] __do_softirq+0x14c/0x3dc <4>[362311.262713] [c000201fff4b7ef0] [c00000000013f154] irq_exit+0x1e4/0x1f0 <4>[362311.262770] [c000201fff4b7f20] [c000000000016590] __do_irq+0xa0/0x200 <4>[362311.262824] [c000201fff4b7f90] [c00000000002c3c0] call_do_irq+0x14/0x24 <4>[362311.262876] [c000001fee8a3a50] [c00000000001678c] do_IRQ+0x9c/0x110 <4>[362311.262983] [c000001fee8a3aa0] [c000000000009bf4] h_virt_irq_common+0x114/0x120 <4>[362311.263049] --- interrupt: ea1 at replay_interrupt_return+0x0/0x4 <4>[362311.263049] LR = arch_local_irq_restore+0xf0/0x170 <4>[362311.263139] [c000001fee8a3d90] [c000001fee8a0080] 0xc000001fee8a0080 (unreliable) <4>[362311.263205] [c000001fee8a3db0] [c0000000009bc924] cpuidle_enter_state+0x114/0x460 <4>[362311.263321] [c000001fee8a3e10] [c0000000001b06a0] do_idle+0x340/0x3e0 <4>[362311.263375] [c000001fee8a3ea0] [c0000000001b092c] cpu_startup_entry+0x3c/0x50 <4>[362311.263440] [c000001fee8a3ed0] [c00000000005684c] start_secondary+0x7fc/0x890 <4>[362311.263503] [c000001fee8a3f90] [c00000000000ab6c] start_secondary_prolog+0x10/0x14 <4>[362311.263563] Instruction dump: <4>[362311.263597] 7fbc4840 419e00b0 ebfe0018 3bffffe8 41920150 3b600000 48000028 60000000 <4>[362311.263718] 60000000 60420000 7fe9fb78 7ffefb78 <ebe90019> 7fbc4840 3bffffe8 41de0078 <4>[362311.263788] ---[ end trace 06be13b799e1729a ]---
This machine also reports EEH (Enhanced Error Handling) errors.
On one dubai: