SoftRoCE / rxe-dev

Development Repository for RXE
Other
128 stars 55 forks source link

iSCSI login via iSER causes kernel panic #32

Closed Kamalheib closed 9 years ago

Kamalheib commented 9 years ago

An iSCSI login via iSER causes a kernel panic. To reproduce, please follow these steps:

1- start target: # tgtd 2- setup target device: # tgt-setup-lun -n tgt0 -d /tmp/null -b null -t iser

Output from Server:
[root@reg-l-vrt-036-005 ~]# tgtd
[root@reg-l-vrt-036-005 ~]# tgt-setup-lun -n tgt0 -d /tmp/null -b null -t iser
Using transport: iser
Creating new target (name=iqn.2001-04.com.reg-l-vrt-036-005-tgt0, tid=1)
Adding a logical unit (/tmp/null) to target, tid=1
Setting backing store type: null
Accepting connections from all initiators

1- check if target is up before login: # iscsiadm -m node 2- login to target: # iscsiadm -m node -l

Output from Client:
[root@reg-l-vrt-037-005 ~]# iscsiadm -m node
11.135.196.5:3260,1 iqn.2001-04.com.reg-l-vrt-036-005-tgt0
[root@reg-l-vrt-037-005 ~]# iscsiadm -m node -l
Logging in to [iface: default, target: iqn.2001-04.com.reg-l-vrt-036-005-tgt0, portal:         11.135.196.5,3260] (multiple)

Kernel Panic: [ 325.870106] BUG: unable to handle kernel NULL pointer dereference at (null) [ 325.870854] IP: [] memcpy+0xd/0x110 [ 325.966869] PGD 0 [ 325.966869] Oops: 0002 [#1] SMP DEBUG_PAGEALLOC [ 325.966869] Modules linked in: ib_iser libiscsi scsi_transport_iscsi ib_rxe rdma_ucm rdma_cm iw_cm ib_cm ib_uverbs ib_sa ib_mad ib_core mlx4_en ib_addr mlx4_core vxlan ip6_udp_tunnel udp_tunnel ptp pps_core nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache ppdev cfg80211 rfkill sg snd_hda_codec_generic snd_hda_intel snd_hda_controller snd_hda_codec snd_hwdep kvm_intel snd_seq kvm snd_seq_device snd_pcm serio_raw pcspkr virtio_balloon parport_pc parport snd_timer snd acpi_cpufreq i2c_piix4 soundcore dm_mirror dm_region_hash dm_log dm_mod nfsd auth_rpcgss nfs_acl lockd grace sunrpc uinput ext4 mbcache jbd2 cirrus ata_generic syscopyarea pata_acpi sysfillrect sysimgblt drm_kms_helper virtio_blk virtio_net ttm drm ata_piix libata virtio_pci i2c_core virtio_ring virtio floppy [last unloaded: mlx4_ib] [ 325.966869] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.0.0-rc4+ #13 [ 325.966869] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 [ 325.966869] task: ffff8807e8739a30 ti: ffff8807e8744000 task.ti: ffff8807e8744000 [ 325.966869] RIP: 0010:[] [] memcpy+0xd/0x110 [ 325.966869] RSP: 0018:ffff8807ec003810 EFLAGS: 00010202 [ 325.966869] RAX: 0000000000000000 RBX: ffff8807e2d572c6 RCX: 0000000000000004 [ 325.966869] RDX: 0000000000000004 RSI: ffff8807e2d572c6 RDI: 0000000000000000 [ 325.966869] RBP: ffff8807ec003898 R08: 0000000000000000 R09: 0000000000000000 [ 325.966869] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 [ 325.966869] R13: 0000000000000024 R14: 0000000000000024 R15: ffff8807e0d0fa28 [ 325.966869] FS: 0000000000000000(0000) GS:ffff8807ec000000(0000) knlGS:0000000000000000 [ 325.966869] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 325.966869] CR2: 0000000000000000 CR3: 0000000001aa2000 CR4: 00000000000007e0 [ 
325.966869] Stack: [ 325.966869] ffffffffa06deb5a ffff8807ce39b0d0 0000000000000004 0000000000000000 [ 325.966869] ffffffffa06d8b16 ffff8807e8739a30 0000000000000000 ffffffff8278fe50 [ 325.966869] ffff8807ce33b8a0 0000000000000411 5076829536caba05 ffff8807ec003898 [ 325.966869] Call Trace: [ 325.966869] [ 325.966869] [] ? rxe_mem_copy+0x1ea/0x220 [ib_rxe] [ 325.966869] [] ? rxe_pool_get_index+0x96/0xc0 [ib_rxe] [ 325.966869] [] rxe_responder+0x10bc/0x1c60 [ib_rxe] [ 325.966869] [] ? sched_clock_local+0x25/0x90 [ 325.966869] [] ? local_clock+0x15/0x30 [ 325.966869] [] ? _raw_spin_unlock_irqrestore+0x36/0x70 [ 325.966869] [] rxe_do_task+0x9d/0x110 [ib_rxe] [ 325.966869] [] rxe_run_task+0x12/0x30 [ib_rxe] [ 325.966869] [] rxe_resp_queue_pkt+0x50/0x60 [ib_rxe] [ 325.966869] [] rxe_rcv+0x195/0x3d0 [ib_rxe] [ 325.966869] [] ? pskb_pull_tail+0x2a0/0x370 [ 325.966869] [] ? dma_device+0x40/0x40 [ib_rxe] [ 325.966869] [] rxe_udp_encap_recv+0xa8/0xe0 [ib_rxe] [ 325.966869] [] udp_queue_rcv_skb+0x309/0x590 [ 325.966869] [] udp4_lib_rcv+0x4fb/0xad0 [ 325.966869] [] udp_rcv+0x1a/0x20 [ 325.966869] [] ip_local_deliver_finish+0x133/0x3e0 [ 325.966869] [] ? ip_local_deliver_finish+0x4c/0x3e0 [ 325.966869] [] ip_local_deliver+0x48/0x80 [ 325.966869] [] ip_rcv_finish+0x168/0x5c0 [ 325.966869] [] ip_rcv+0x2a2/0x3f0 [ 325.966869] [] netif_receive_skb_core+0x7e6/0xed0 [ 325.966869] [] ? netif_receive_skb_core+0x91/0xed0 [ 325.966869] [] netif_receive_skb+0x18/0x60 [ 325.966869] [] netif_receive_skb_internal+0x40/0x220 [ 325.966869] [] napi_gro_frags+0x249/0x380 [ 325.966869] [] mlx4_en_process_rx_cq+0x76f/0xcc0 [mlx4_en] [ 325.966869] [] mlx4_en_poll_rx_cq+0x9f/0x170 [mlx4_en] [ 325.966869] [] net_rx_action+0x16b/0x550 [ 325.966869] [] do_softirq+0xf6/0x670 [ 325.966869] [] irq_exit+0x125/0x130 [ 325.966869] [] do_IRQ+0x5a/0xf0 [ 325.966869] [] common_interrupt+0x72/0x72 [ 325.966869] [ 325.966869] [] ? native_safe_halt+0x6/0x10 [ 325.966869] [] ? 
trace_hardirqs_on+0xd/0x10 [ 325.966869] [] default_idle+0x23/0x240 [ 325.966869] [] arch_cpu_idle+0xf/0x20 [ 325.966869] [] cpu_idle_loop+0x415/0x5e0 [ 325.966869] [] cpu_startup_entry+0x69/0x70 [ 325.966869] [] start_secondary+0x193/0x200 [ 325.966869] Code: 2b 43 50 88 43 4e 48 83 c4 08 5b 5d c3 90 e8 6b fc ff ff eb e6 90 90 90 90 90 90 90 90 90 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 48 a5 89 d1 f3 a4 c3 20 4c 8b 06 4c 8b 4e 08 4c 8b 56 10 4c [ 325.966869] RIP [] memcpy+0xd/0x110 [ 325.966869] RSP [ 325.966869] CR2: 0000000000000000 [ 325.966869] ---[ end trace 074e350ea19548c2 ]--- [ 325.966869] Kernel panic - not syncing: Fatal exception in interrupt [ 325.966869] Kernel Offset: 0x0 from 0xffffffff81000000 (relocation range: 0xffffffff80000000- 0xffffffff9fffffff) [ 325.966869] drm_kms_helper: panic occurred, switching back to text console [ 325.966869] ---[ end Kernel panic - not syncing: Fatal exception in interrupt [ 326.276636] ------------[ cut here ]------------ [ 326.277633] WARNING: CPU: 2 PID: 0 at arch/x86/kernel/smp.c:124 native_smp_send_reschedule+0x5d/0x60() [ 326.277633] Modules linked in: ib_iser libiscsi scsi_transport_iscsi ib_rxe rdma_ucm rdma_cm iw_cm ib_cm ib_uverbs ib_sa ib_mad ib_core mlx4_en ib_addr mlx4_core vxlan ip6_udp_tunnel udp_tunnel ptp pps_core nfsv3 rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache ppdev cfg80211 rfkill sg snd_hda_codec_generic snd_hda_intel snd_hda_controller snd_hda_codec snd_hwdep kvm_intel snd_seq kvm snd_seq_device snd_pcm serio_raw pcspkr virtio_balloon parport_pc parport snd_timer snd acpi_cpufreq i2c_piix4 soundcore dm_mirror dm_region_hash dm_log dm_mod nfsd auth_rpcgss nfs_acl lockd grace sunrpc uinput ext4 mbcache jbd2 cirrus ata_generic syscopyarea pata_acpi sysfillrect sysimgblt drm_kms_helper virtio_blk virtio_net ttm drm ata_piix libata virtio_pci i2c_core virtio_ring virtio floppy [last unloaded: mlx4_ib] [ 326.277633] CPU: 2 PID: 0 Comm: swapper/2 Tainted: G D 4.0.0-rc4+ #13 [ 326.277633] Hardware 
name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 [ 326.277633] 0000000000000000 5076829536caba05 ffff8807ec003230 ffffffff81743f13 [ 326.277633] 0000000000000000 0000000000000000 ffff8807ec003270 ffffffff81086e1a [ 326.277633] ffff880700000000 0000000000000000 0000000000000002 000000000000e928 [ 326.277633] Call Trace: [ 326.277633] [] dump_stack+0x4c/0x65 [ 326.277633] [] warn_slowpath_common+0x8a/0xc0 [ 326.277633] [] warn_slowpath_null+0x1a/0x20 [ 326.277633] [] native_smp_send_reschedule+0x5d/0x60 [ 326.277633] [] trigger_load_balance+0x33d/0x470 [ 326.277633] [] ? trigger_load_balance+0x74/0x470 [ 326.277633] [] scheduler_tick+0x9a/0xe0 [ 326.277633] [] update_process_times+0x51/0x60 [ 326.277633] [] tick_sched_handle.isra.20+0x25/0x60 [ 326.277633] [] tick_sched_timer+0x44/0x80 [ 326.277633] [] run_hrtimer+0xcd/0x6d0 [ 326.277633] [] ? hrtimer_interrupt+0x92/0x240 [ 326.277633] [] ? tick_sched_do_timer+0x50/0x50 [ 326.277633] [] hrtimer_interrupt+0x103/0x240 [ 326.277633] [] local_apic_timer_interrupt+0x39/0x60 [ 326.277633] [] smp_apic_timer_interrupt+0x45/0x60 [ 326.277633] [] apic_timer_interrupt+0x72/0x80 [ 326.277633] [] ? retint_restore_args+0x13/0x13 [ 326.277633] [] ? panic+0x1c6/0x20e [ 326.277633] [] ? panic+0x1cd/0x20e [ 326.277633] [] ? panic+0x1c6/0x20e [ 326.277633] [] oops_end+0x117/0x120 [ 326.277633] [] no_context+0x2f3/0x370 [ 326.277633] [] bad_area_nosemaphore+0x7e/0x1d7 [ 326.277633] [] bad_area_nosemaphore+0x13/0x15 [ 326.277633] [] do_page_fault+0x86/0x470 [ 326.277633] [] ? lock_is_held+0x5f/0x90 [ 326.277633] [] trace_do_page_fault+0x70/0x440 [ 326.277633] [] ? trace_hardirqs_off_thunk+0x3a/0x3c [ 326.277633] [] do_async_page_fault+0x1e/0xd0 [ 326.277633] [] async_page_fault+0x28/0x30 [ 326.277633] [] ? memcpy+0xd/0x110 [ 326.277633] [] ? rxe_mem_copy+0x1ea/0x220 [ib_rxe] [ 326.277633] [] ? rxe_pool_get_index+0x96/0xc0 [ib_rxe] [ 326.277633] [] rxe_responder+0x10bc/0x1c60 [ib_rxe] [ 326.277633] [] ? 
sched_clock_local+0x25/0x90 [ 326.277633] [] ? local_clock+0x15/0x30 [ 326.277633] [] ? _raw_spin_unlock_irqrestore+0x36/0x70 [ 326.277633] [] rxe_do_task+0x9d/0x110 [ib_rxe] [ 326.277633] [] rxe_run_task+0x12/0x30 [ib_rxe] [ 326.277633] [] rxe_resp_queue_pkt+0x50/0x60 [ib_rxe] [ 326.277633] [] rxe_rcv+0x195/0x3d0 [ib_rxe] [ 326.277633] [] ? pskb_pull_tail+0x2a0/0x370 [ 326.277633] [] ? dma_device+0x40/0x40 [ib_rxe] [ 326.277633] [] rxe_udp_encap_recv+0xa8/0xe0 [ib_rxe] [ 326.277633] [] udp_queue_rcv_skb+0x309/0x590 [ 326.277633] [] udp4_lib_rcv+0x4fb/0xad0 [ 326.277633] [] udp_rcv+0x1a/0x20 [ 326.277633] [] ip_local_deliver_finish+0x133/0x3e0 [ 326.277633] [] ? ip_local_deliver_finish+0x4c/0x3e0 [ 326.277633] [] ip_local_deliver+0x48/0x80 [ 326.277633] [] ip_rcv_finish+0x168/0x5c0 [ 326.277633] [] ip_rcv+0x2a2/0x3f0 [ 326.277633] [] netif_receive_skb_core+0x7e6/0xed0 [ 326.277633] [] ? __netif_receive_skb_core+0x91/0xed0 [ 326.277633] [] netif_receive_skb+0x18/0x60 [ 326.277633] [] netif_receive_skb_internal+0x40/0x220 [ 326.277633] [] napi_gro_frags+0x249/0x380 [ 326.277633] [] mlx4_en_process_rx_cq+0x76f/0xcc0 [mlx4_en] [ 326.277633] [] mlx4_en_poll_rx_cq+0x9f/0x170 [mlx4_en] [ 326.277633] [] net_rx_action+0x16b/0x550 [ 326.277633] [] __do_softirq+0xf6/0x670 [ 326.277633] [] irq_exit+0x125/0x130 [ 326.277633] [] do_IRQ+0x5a/0xf0 [ 326.277633] [] common_interrupt+0x72/0x72 [ 326.277633] [] ? native_safe_halt+0x6/0x10 [ 326.277633] [] ? trace_hardirqs_on+0xd/0x10 [ 326.277633] [] default_idle+0x23/0x240 [ 326.277633] [] arch_cpu_idle+0xf/0x20 [ 326.277633] [] cpu_idle_loop+0x415/0x5e0 [ 326.277633] [] cpu_startup_entry+0x69/0x70 [ 326.277633] [] start_secondary+0x193/0x200 [ 326.277633] ---[ end trace 074e350ea19548c3 ]---

Kamalheib commented 9 years ago

I found the bug: it is in the rxe_map_sg function. Both dma_length and dma_address of each SG element are never updated and stay zero. This causes the initiator to think that it received a virtual address of zero, so it tries to copy data to that address, which causes the panic.

I'll push a fix to this issue soon.