Open iocellnetworks opened 11 years ago
According to the functions exported in this bug, and what they use as the parameter,
ndas_core_main.c (153-183)
/**
 * register_network - register every eligible network interface with NDAS
 *
 * Walks the net device list under dev_base_lock. The loopback interface "lo"
 * is skipped, and when the ndas_dev string is set and non-empty every
 * interface whose name differs from it is skipped as well. Each remaining
 * interface name is passed to ndas_register_network_interface().
 *
 * Return: NDAS_OK when at least one interface was attempted,
 * NDAS_ERROR_NO_DEVICE when none was, or NDAS_ERROR_OUT_OF_MEMORY after
 * rolling back through unregister_network(). Any other registration failure
 * is logged and ignored; the interface is still counted.
 */
static ndas_error_t register_network(void)
{
    int c = 0;
    ndas_error_t err;
    struct net_device *dev;

    read_lock(&dev_base_lock);
    FOR_EACH_NETDEV(dev)
    {
        if (strcmp(dev->name, "lo") == 0)
            continue;
        if (ndas_dev != NULL && ndas_dev[0] != 0 && strcmp(dev->name, ndas_dev) != 0)
            continue;
        printk("ndas: registering network interface %s\n", dev->name);
        /* Reporter's note: this call is suspected of passing a null pointer. */
        err = ndas_register_network_interface(dev->name);
        if (err == NDAS_ERROR_OUT_OF_MEMORY) {
            goto out;
        } else if (!NDAS_SUCCESS(err)) {
            printk("ndas: fail to register network interface %s (%d): ignored\n", dev->name, err);
        }
        c++;
    }
    read_unlock(&dev_base_lock);
    if (c == 0) {
        return NDAS_ERROR_NO_DEVICE;
    }
    return NDAS_OK;
out:
    /*
     * We jumped here from inside the locked loop, so release dev_base_lock
     * before rolling back. The previous error path returned with the read
     * lock still held.
     */
    read_unlock(&dev_base_lock);
    unregister_network();
    return err;
}
ndasuser.c gets the device (106-110)
/*
 * Register the network interface named devname.
 * Forwards devname unchanged to lpx_register_dev() and returns its result.
 */
NDASUSER_API ndas_error_t
ndas_register_network_interface(const char* devname)
{
return lpx_register_dev(devname);
}
lpxproto.c takes the information and tries to create the device (6257-6260)
/*
 * Register devname with the LPX layer.
 * Forwards devname unchanged to lpxitf_create() and returns its result.
 */
ndas_error_t lpx_register_dev(const char* devname)
{
return lpxitf_create(devname);
}
This is the function that creates the device lpxproto.c (476-530)
/*
 * Create an LPX interface object for the network device named dev_name.
 *
 * Returns NDAS_OK on success, NDAS_ERROR_EXIST when an interface for this
 * name is already known, NDAS_ERROR_NO_DEVICE when the device cannot be
 * opened, NDAS_ERROR_OUT_OF_MEMORY when the interface object or its lock
 * cannot be created, and NDAS_ERROR_PROTO_REGISTRATION_FAIL when the LPX
 * protocol handler cannot be registered on the device.
 *
 * On failure the out3/out2/out1 labels undo the steps in reverse order:
 * remove from the interface list, free the object, close the device.
 */
LOCAL ndas_error_t lpxitf_create(const char *dev_name)
{
lpx_interface *itf;
sal_netdev_desc nd;
int ret;
debug_lpx(3, "ing dev_name=%s", dev_name);
/* Refuse to create a second interface for a name that is already present. */
if(lpxitf_find_using_devname(dev_name) != NULL)
return NDAS_ERROR_EXIST;
---> nd = sal_netdev_open(dev_name); <--- Last call listed in the bug trace
if (nd == SAL_INVALID_NET_DESC) {
/* sal_debug_print("Failed open device %s\r\n", dev_name); */
return NDAS_ERROR_NO_DEVICE;
}
itf = (lpx_interface *)sal_malloc(sizeof(lpx_interface));
if(itf == NULL) {
ret = NDAS_ERROR_OUT_OF_MEMORY;
goto out1;
}
itf->nd = nd;
itf->itf_sklist = NULL;
/* A zero return from sal_spinlock_create() is treated as failure here. */
ret = sal_spinlock_create("itf-sk-lock", &itf->itf_sklock);
if (!ret) {
ret = NDAS_ERROR_OUT_OF_MEMORY;
goto out2;
}
/* NOTE(review): unbounded copy; assumes itf->dev can hold dev_name - TODO confirm. */
sal_strcpy(itf->dev, dev_name);
sal_netdev_get_address(nd, itf->itf_node);
/* Payload size per packet: device MTU minus the LPX header. */
itf->mss = sal_netdev_get_mtu(nd) - sizeof(struct lpxhdr);
sal_assert(itf->mss + sizeof(struct lpxhdr)>= MIN_MTU_SIZE); /* We assume MTU is large enough to hold two disk sector */
itf->hard_header_len = SAL_ETHER_HEADER_SIZE;
/* The interface is inserted before the protocol handler is registered; out3 removes it again on failure. */
lpxitf_insert(itf);
ret = sal_netdev_register_proto(nd, g_htons(ETH_P_LPX), lpx_net_rx_handler);
if (ret != 0) {
sal_debug_print("sal_netdev_register_proto failed");
ret = NDAS_ERROR_PROTO_REGISTRATION_FAIL;
goto out3;
}
return NDAS_OK;
out3:
lpxitf_remove(itf);
/* NOTE(review): nothing visible here releases itf->itf_sklock before the free - confirm whether a destroy call is needed. */
out2:
sal_free(itf);
out1:
sal_netdev_close(nd);
return ret;
}
So the device open happens in sal_net.c (589 - 660 )
I loaded this function with printk statements to see what I can know more about the crash.
/**
 * sal_netdev_open - open the network device by the given name
 * @devname - the name of the network device to open
 *
 * Rejects "lo", looks the device up with DEV_GET_BY_NAME(), refuses a name
 * that is already present in v_sal_net_global.net_dev_list, then allocates a
 * zeroed _network_device_t, fills in its name, device pointer and MAC
 * address, and links it into the global list.
 *
 * Returns the new entry cast to sal_netdev_desc, or SAL_INVALID_NET_DESC on
 * any failure.
 *
 * This copy is instrumented with printk() calls before nearly every
 * statement so that the last message in the kernel log identifies the
 * statement that faulted.
 */
NDAS_SAL_API sal_netdev_desc sal_netdev_open(const char* devname)
{
unsigned long flags;
int ret;
_network_device_t* psn = NULL;
struct net_device *dev = NULL;
struct list_head* i;
printk("sal_netdev_open devname=%s", devname);
// dbgl_salnet(1, "ing name=%s", devname);
if ( strcmp(devname, "lo") == 0 ) {
goto errout;
}
/* Check that the device exists */
printk("sal_netdev_open DEV_GET_BY_NAME(devname)");
dev = DEV_GET_BY_NAME(devname);
if (unlikely(dev == NULL)) {
dbgl_salnet(1, "no such device %s:",devname);
goto errout;
}
printk("sal_netdev_open spin_lock_irqsave");
spin_lock_irqsave(&v_sal_net_global.lock, flags);
printk("sal_netdev_open list_for_each");
/*
 * Scan the already-opened devices for a matching name.
 * NOTE(review): if net_dev_list was never initialized (next == NULL), the
 * loop body would still run and list_entry() would yield a NULL psn when
 * link is the first member of _network_device_t - verify the list head is
 * initialized before this function can be called.
 */
list_for_each(i, &v_sal_net_global.net_dev_list)
{
printk("sal_netdev_open list_entry");
psn = list_entry(i, _network_device_t, link);
printk("sal_netdev_open sal_assert(psn ...");
sal_assert(psn != NULL);
printk("sal_netdev_open if !strncomp(psn->devname ");
if ( !strncmp(psn->devname,devname,IFNAMSIZ) )
{
/* Already opened device */
printk("sal_netdev_open Already opened device");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
/* psn points at a live list entry here; clear it so errout does not free it. */
psn = NULL;
goto errout;
}
}
printk("sal_netdev_open spin_unlock_irqrestore");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
/* Now everything OK,
create place holder for new net interface */
/*
 * NOTE(review): psn still points at the last list entry examined (if any)
 * until the assignment below overwrites it. The duplicate check above and
 * the list_add() further down run in separate lock sections.
 */
printk("sal_netdev_open create place holder for new net interface");
psn = (_network_device_t*) sal_malloc(sizeof(_network_device_t));
if ( !psn ) {
goto errout;
}
printk("sal_netdev_open memset size of network device");
memset(psn, 0, sizeof(_network_device_t));
printk("sal_netdev_open strncpy devname ... ");
strncpy(psn->devname, devname, IFNAMSIZ); // TODO: strncpy
printk("sal_netdev_open psn->if_dev");
psn->if_dev = dev;
printk("sal_netdev_open ret = sal_netdev_get_addresss");
ret = sal_netdev_get_address((sal_netdev_desc)psn, psn->mac);
if (ret!=0) {
printk("sal_netdev_open go to errout (1)");
goto errout;
}
/* Link to net list */
printk("sal_netdev_open spin_lock_irqsave");
spin_lock_irqsave(&v_sal_net_global.lock, flags);
printk("sal_netdev_open list_add ");
list_add(&psn->link, &v_sal_net_global.net_dev_list);
printk("sal_netdev_open unlock_irqrestore ");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
printk("sal_netdev_open dev_put");
/* NOTE(review): dev_put() is called although psn->if_dev keeps the pointer - confirm the intended device lifetime. */
dev_put(dev);
printk("sal_netdev_open sal_netdev_opened : %s", devname);
dbgl_salnet(5, "sal_netdev_opened : %s", devname);
printk("sal_netdev_open return sal_netdev_desc)psn");
return (sal_netdev_desc)psn;
/* Common failure exit: drop the device if it was looked up, free psn if it was allocated. */
errout:
printk("sal_netdev_open errout: ");
dbgl_salnet(1, "sal_netdev_open : can\'t open %s", devname);
printk("sal_netdev_open ");
if (dev) {
printk("sal_netdev_open if(Dev) dev_put");
dev_put(dev);
}
if (psn) {
printk("sal_netdev_open if(psn) sal_free");
sal_free(psn); // psn is not NULL
}
printk("sal_netdev_open return SAL invalid net desc");
return SAL_INVALID_NET_DESC;
}
EXPORT_SYMBOL(sal_netdev_open);
and I get the following oops.
Feb 16 09:20:16 rawhide kernel: [41881.740523] ndas: sal_thread_create: sal_thread
Feb 16 09:20:16 rawhide kernel: [41881.740822] ndas: sal_thread_create: set tid; tsk=-131939323490912
Feb 16 09:20:16 rawhide kernel: [41881.740825] ndas: sal_thread_created: tid=-1605139104
Feb 16 09:20:16 rawhide kernel: [41881.741385] ndas: sal_thread_create: sal_thread
Feb 16 09:20:16 rawhide kernel: [41881.746951] ndas: sal_thread_create: set tid; tsk=-131939194175488
Feb 16 09:20:16 rawhide kernel: [41881.746955] ndas: sal_thread_created: tid=-1605139328
Feb 16 09:20:16 rawhide kernel: [41881.749033] ndas: registering network interface enp6s1
Feb 16 09:20:16 rawhide kernel: [41881.749038] sal_netdev_open devname=enp6s1
Feb 16 09:20:16 rawhide kernel: [41881.749038] sal_netdev_open DEV_GET_BY_NAME(devname)
Feb 16 09:20:16 rawhide kernel: [41881.749038] sal_netdev_open spin_lock_irqsave
Feb 16 09:20:16 rawhide kernel: [41881.749038] sal_netdev_open list_for_each
Feb 16 09:20:16 rawhide kernel: [41881.749038] sal_netdev_open list_entry
Feb 16 09:20:16 rawhide kernel: [41881.749038] sal_netdev_open sal_assert(psn ...
Feb 16 09:20:16 rawhide kernel: [41881.749038] sal_netdev_open if !strncomp(psn->devname
Feb 16 09:20:16 rawhide kernel: [41881.749060] BUG: unable to handle kernel NULL pointer dereference at 0000000000000024
Feb 16 09:20:16 rawhide kernel: [41881.749124] IP: [<ffffffff8135b119>] strncmp+0x9/0x60
Feb 16 09:20:16 rawhide kernel: [41881.749161] PGD 973eb067 PUD 627d5067 PMD 0
Feb 16 09:20:16 rawhide kernel: [41881.749198] Oops: 0000 [#1] SMP
Feb 16 09:20:16 rawhide kernel: [41881.749227] Modules linked in: ndas_core(OF+) ndas_block(OF) ndas_sal(OF) rfcomm ipt_MASQUERADE nf_conntrack_netbios_ns nf_conntrack_broadcast ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables bnep bluetooth rfkill ip6table_filter ip6_tables ppdev iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm microcode snd_hda_codec_idt snd_hda_intel usblp snd_hda_codec snd_hwdep snd_seq parport_pc snd_seq_device parport snd_pcm snd_page_alloc snd_timer snd soundcore serio_raw lpc_ich mei mfd_core i2c_i801 uinput ata_generic pata_acpi firewire_ohci r8169 firewire_core crc_itu_t mii i915 sata_sil24 pata_marvell video i2c_algo_bit drm_kms_helper drm e1000e i2c_core sunrpc
Feb 16 09:20:16 rawhiderawhide kernel: [41881.749841] CPU 0
Feb 16 09:20:16 rawhide kernel: [41881.749859] Pid: 3472, comm: insmod Tainted: GF O 3.8.0-0.rc7.git2.1.fc19.x86_64 #1 /DG965OT
Feb 16 09:20:16 rawhide kernel: [41881.749921] RIP: 0010:[<ffffffff8135b119>] [<ffffffff8135b119>] strncmp+0x9/0x60
Feb 16 09:20:16 rawhide kernel: [41881.749968] RSP: 0018:ffff880093c77d70 EFLAGS: 00010002
Feb 16 09:20:16 rawhide kernel: [41881.750004] RAX: 000000000000002b RBX: 0000000000000000 RCX: 0000000000000001
Feb 16 09:20:16 rawhide kernel: [41881.750012] RDX: 0000000000000010 RSI: ffff880035c62160 RDI: 0000000000000024
Feb 16 09:20:16 rawhide kernel: [41881.750012] RBP: ffff880093c77d70 R08: 0000000000000002 R09: 0000000000000000
Feb 16 09:20:16 rawhide kernel: [41881.750012] R10: 0000000000000001 R11: 0000000000000000 R12: ffff880035c62160
Feb 16 09:20:16 rawhide kernel: [41881.750012] R13: ffff880035c62160 R14: 0000000000000286 R15: ffffffffa0537ac0
Feb 16 09:20:16 rawhide kernel: [41881.750012] FS: 00007f6ce8d85740(0000) GS:ffff8800bdc00000(0000) knlGS:0000000000000000
Feb 16 09:20:16 rawhide kernel: [41881.750012] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
Feb 16 09:20:16 rawhide kernel: [41881.750012] CR2: 0000000000000024 CR3: 000000007b679000 CR4: 00000000000007f0
Feb 16 09:20:16 rawhide kernel: [41881.750012] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
Feb 16 09:20:16 rawhide kernel: [41881.750012] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Feb 16 09:20:16 rawhide kernel: [41881.750012] Process insmod (pid: 3472, threadinfo ffff880093c76000, task ffff880035d24da0)
Feb 16 09:20:16 rawhide kernel: [41881.750012] Stack:
Feb 16 09:20:16 rawhide kernel: [41881.750012] ffff880093c77da0 ffffffffa04bfc6b ffff880035c62160 0000000000000003
Feb 16 09:20:16 rawhide kernel: [41881.750012] ffffffffa0532e39 0000000000000000 ffff880093c77dc8 ffffffffa051948a
Feb 16 09:20:16 rawhide kernel: [41881.750012] ffff880035c62160 0000000000000003 ffffffffa0532e39 ffff880093c77dd8
Feb 16 09:20:16 rawhide kernel: [41881.750012] Call Trace:
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffffa04bfc6b>] sal_netdev_open+0xdb/0x3e0 [ndas_sal]
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffffa051948a>] lpx_register_dev+0xda/0x260 [ndas_core]
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffffa050f0fe>] ndas_register_network_interface+0xe/0x10 [ndas_core]
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffffa052fc85>] init_module+0x105/0x150 [ndas_core]
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffffa052fb80>] ? ndas_change_handler_func+0x70/0x70 [ndas_core]
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffff8100210a>] do_one_initcall+0x10a/0x160
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffff810e9e8e>] load_module+0x1a4e/0x2310
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffff81374f30>] ? ddebug_proc_write+0xf0/0xf0
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffff810ea811>] sys_init_module+0xc1/0x110
Feb 16 09:20:16 rawhide kernel: [41881.750012] [<ffffffff8170eb19>] system_call_fastpath+0x16/0x1b
Feb 16 09:20:16 rawhide kernel: [41881.750012] Code: c6 01 3a 46 ff 74 eb 19 c0 83 c8 01 5d c3 0f 1f 40 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 85 d2 48 89 e5 74 3c <0f> b6 0f 44 0f b6 06 44 38 c1 75 3b 84 c9 74 2c 48 83 ea 01 31
Feb 16 09:20:16 rawhide kernel: [41881.750012] RIP [<ffffffff8135b119>] strncmp+0x9/0x60
Feb 16 09:20:16 rawhide kernel: [41881.750012] RSP <ffff880093c77d70>
Feb 16 09:20:16 rawhide kernel: [41881.750012] CR2: 0000000000000024
Feb 16 09:20:16 rawhide kernel: [41881.852936] ---[ end trace 0027d2d008b9dc49 ]---
Feb 16 09:20:16 rawhide kernel: [41881.852943] BUG: sleeping function called from invalid context at kernel/rwsem.c:20
Feb 16 09:20:16 rawhide kernel: [41881.852946] in_atomic(): 1, irqs_disabled(): 1, pid: 3472, name: insmod
Feb 16 09:20:16 rawhide kernel: [41881.852948] INFO: lockdep is turned off.
Feb 16 09:20:16 rawhide kernel: [41881.852950] irq event stamp: 0
Feb 16 09:20:16 rawhide kernel: [41881.852952] hardirqs last enabled at (0): [< (null)>] (null)
Feb 16 09:20:16 rawhide kernel: [41881.852957] hardirqs last disabled at (0): [<ffffffff81066548>] copy_process.part.21+0x5d8/0x1650
Feb 16 09:20:16 rawhide kernel: [41881.852965] softirqs last enabled at (0): [<ffffffff81066548>] copy_process.part.21+0x5d8/0x1650
Feb 16 09:20:16 rawhide kernel: [41881.852969] softirqs last disabled at (0): [< (null)>] (null)
So. oops appears in here: (sal_net.c)
printk("ndas: sal_netdev_open list_for_each\n");
list_for_each(i, &v_sal_net_global.net_dev_list) <-- this net_dev_list is referenced.
{
printk("ndas: sal_netdev_open list_entry\n");
psn = list_entry(i, _network_device_t, link); <--- psn is possibly a null reference.
printk("ndas: sal_netdev_open sal_assert(psn ...\n");
sal_assert(psn != NULL);
printk("ndas: sal_netdev_open if !strncomp(psn->devname \n");
if ( !strncmp(psn->devname,devname,IFNAMSIZ) ) <-- it fails to compare devname with psn
{
Next I turned up the printk just a bit for finer detail and oopsed again:
9707:Feb 16 10:10:20 rawhide kernel: [ 2882.471733] ndas: registering network interface enp6s1
9708:Feb 16 10:10:20 rawhide kernel: [ 2882.471737] ndas: sal_netdev_open devname=enp6s1
9709:Feb 16 10:10:20 rawhide kernel: [ 2882.471740] ndas: sal_netdev_open DEV_GET_BY_NAME(devname)ndas: sal_netdev_open spin_lock_irqsave
9710:Feb 16 10:10:20 rawhide kernel: [ 2882.471744] ndas: sal_netdev_open list_for_each
9711:Feb 16 10:10:20 rawhide kernel: [ 2882.471746] ndas: sal_netdev_open list_entry
9712:Feb 16 10:10:20 rawhide kernel: [ 2882.471748] ndas: sal_netdev_open sal_assert(psn ...
9713:Feb 16 10:10:20 rawhide kernel: [ 2882.471750] ndas: sal_netdev_open if !strncomp(psn->devname,devname
-->>9714:Feb 16 10:10:20 rawhide kernel: [ 2882.471752] ndas: sal_netdev_open actual if !strncomp((null),enp6s1,IFNAMISIZ)
>> line above show the values tested at this point.
>> the &v_sal_net_global.net_dev_list entry has some problem)
9715:Feb 16 10:10:20 rawhide kernel: [ 2882.471767] BUG: unable to handle kernel NULL pointer dereference at 0000000000000024
9716:Feb 16 10:10:20 rawhide kernel: [ 2882.471829] IP: [<ffffffff8135b119>] strncmp+0x9/0x60
So, I guess one thing to do is see how the net_dev_list is created. because after comparing, the function uses "psn" to create the actual device.
Here I did some modifications to sal_net.c. I jumped out in the case of NULL references.
printk("ndas: sal_netdev_open sal_assert(psn ...\n");
sal_assert(psn != NULL);
if(psn != NULL) {
printk("ndas: sal_netdev_ psn is not NULL so check to see if this adapter is already in use by NDAS\n");
printk("ndas: sal_netdev_ if (psn->devname != NULL \n");
/* Hack. just assuming the list is empty, so the name sure won't be
* the same either. Thus it will not test a null pointer
*/
if (psn->devname != NULL){
printk("ndas: sal_netdev_open actual if !strncomp(%s,%s,IFNAMISIZ)\n",psn->devname,devname);
if ( !strncmp(psn->devname,devname,IFNAMSIZ) )
{
/* Already opened device */
printk("ndas: sal_netdev_open Already opened device\n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
}
} else {
printk("ndas: sal_netdev_ psn-devname is NULL DOH! \n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
}
} else {
printk("ndas: sal_netdev_ psn is NULL DOH! \n");
printk("ndas: sal_netdev_ we will try to assign it any way then \n");
}
}
printk("ndas: sal_netdev_open spin_unlock_irqrestore\n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
/* Now everything OK,
create place holder for new net interface */
printk("ndas: sal_netdev_open create place holder for new net interface\n");
psn = (_network_device_t*) sal_malloc(sizeof(_network_device_t));
if ( !psn ) {
goto errout;
}
It still oopsed though.
12112:Feb 16 17:20:57 rawhide kernel: [ 2187.856741] ndas: sal_netdev_open devname=enp6s1
12113:Feb 16 17:20:57 rawhide kernel: [ 2187.856744] ndas: sal_netdev_open DEV_GET_BY_NAME(devname)ndas: sal_netdev_open spin_lock_irqsave
12114:Feb 16 17:20:57 rawhide kernel: [ 2187.856748] ndas: sal_netdev_open list_for_each
12115:Feb 16 17:20:57 rawhide kernel: [ 2187.856750] ndas: sal_netdev_open list_entry
12116:Feb 16 17:20:57 rawhide kernel: [ 2187.856752] ndas: sal_netdev_open sal_assert(psn ...
12117:Feb 16 17:20:57 rawhide kernel: [ 2187.856753] ndas: sal_netdev_ psn is NULL DOH!
12118:Feb 16 17:20:57 rawhide kernel: [ 2187.856755] ndas: sal_netdev_ we will try to assign it any way then
12119:Feb 16 17:20:57 rawhide kernel: [ 2187.856772] BUG: unable to handle kernel NULL pointer dereference at (null)
12120:Feb 16 17:20:57 rawhide kernel: [ 2187.856831] IP: [<ffffffffa04ccca9>] sal_netdev_open+0x119/0x430 [ndas_sal]
12121:Feb 16 17:20:57 rawhide kernel: [ 2187.856882] PGD 82fa0067 PUD 7c703067 PMD 0
12122:Feb 16 17:20:57 rawhide kernel: [ 2187.856920] Oops: 0000 [#1] SMP
12123:Feb 16 17:20:57 rawhide kernel: [ 2187.856950] Modules linked in: ndas_core(OF+) ndas_block(OF) ndas_sal(OF) rfcomm ipt_MASQUERADE nf_conntrack_netbios_ns nf_conntrack_broadcast ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables bnep bluetooth rfkill ip6table_filter ip6_tables coretemp iTCO_wdt ppdev kvm_intel iTCO_vendor_support kvm microcode serio_raw snd_hda_codec_idt snd_hda_intel snd_hda_codec i2c_i801 usblp lpc_ich snd_hwdep mfd_core snd_seq snd_seq_device parport_pc snd_pcm parport snd_page_alloc snd_timer snd soundcore mei uinput ata_generic pata_acpi firewire_ohci r8169 firewire_core crc_itu_t mii sata_sil24 i915 pata_marvell video i2c_algo_bit drm_kms_helper drm e1000e i2c_core sunrpc [last unloaded: ndas_sal]
Now I get the feeling that "_network_device_t" maybe is null or something.
I imagined that psn should be assigned by
printk("ndas: sal_netdev_open create place holder for new net interface\n");
psn = (_network_device_t*) sal_malloc(sizeof(_network_device_t));
even if it was null to begin with.
closed by accident.
I guess not _network_device_t == null.
I made it jump out, skipping the registration if there was a null psn or psn->devname and the modules loaded, with the log output below.
Feb 16 20:25:16 rawhide kernel: [10975.698499] ndas: registering network interface enp6s1
Feb 16 20:25:16 rawhide kernel: [10975.698504] ndas: sal_netdev_open devname=enp6s1
Feb 16 20:25:16 rawhide kernel: [10975.698507] ndas: sal_netdev_open DEV_GET_BY_NAME(devname)ndas: sal_netdev_open spin_lock_irqsave
Feb 16 20:25:16 rawhide kernel: [10975.698511] ndas: sal_netdev_open list_for_each
Feb 16 20:25:16 rawhide kernel: [10975.698513] ndas: sal_netdev_open list_entry
Feb 16 20:25:16 rawhide kernel: [10975.698515] ndas: sal_netdev_open sal_assert(psn ...
Feb 16 20:25:16 rawhide kernel: [10975.698517] ndas: sal_netdev_ psn is NULL!
Feb 16 20:25:16 rawhide kernel: [10975.698519] ndas: sal_netdev_ we are jumping out then
Feb 16 20:25:16 rawhide kernel: [10975.698521] ndas: sal_netdev_ BTW size of _network_device_t is 64
Feb 16 20:25:16 rawhide kernel: [10975.698523] ndas: sal_netdev_open errout: ndas: sal_netdev_open ndas: sal_netdev_open if(Dev) dev_put
_network_device_t has a size to it. So, I guess it exists.
Today I put this check in sal_net.c:
if (&v_sal_net_global.net_dev_list == NULL){
printk("ndas: sal_netdev_open v_sal_net_global.netlist is NULL\n");
psn = NULL;
goto errout;
} else {
list_for_each(i, &v_sal_net_global.net_dev_list){
printk("ndas: global.netdevlist => %d\n", i);
}
}
Then it oopsed at this moment. So, my guess is that &v_sal_net_global.net_dev_list is the null pointer.
Is it possible this post is related? http://stackoverflow.com/questions/10262017/linux-kernel-list-list-head-init-vs-init-list-head
next I inserted the following:
ret = &v_sal_net_global.notifier->priority;
if (ret == NULL){
printk("ndas: sal_netdev_open v_sal_net_global.notifier->priority is NULL\n");
psn = NULL;
goto errout;
} else {
printk("ndas: sal_netdev_open v_sal_net_global.notifier->priority is %ld\n",ret);
}
It is to check the presence of another property in the device v_sal_net_global. This turns up a property correctly.
ndas: sal_netdev_open v_sal_net_global.notifier->priority is 16
So, I wonder if net_dev_list is not getting initialized.
net_dev_list should be setup in line 470:
INIT_LIST_HEAD(&v_sal_net_global.net_dev_list);
and notifier-> priority is setup on line 533
v_sal_net_global.notifier->priority = 0;
Below is the very printk-heavy version of the function from sal_net.c:
/**
 * sal_netdev_open - open the network device by the given name
 * @devname - the name of the network device to open
 *
 * Experimental, heavily instrumented variant used while chasing the NULL
 * pointer oops. Returns a sal_netdev_desc on success or SAL_INVALID_NET_DESC
 * on failure. As written, every path through the function ends at errout
 * (see the unconditional "jumping out" goto below), so it never returns a
 * valid descriptor.
 */
NDAS_SAL_API sal_netdev_desc sal_netdev_open(const char* devname)
{
unsigned long flags;
int ret;
_network_device_t *psn = NULL;
struct net_device *dev = NULL;
struct list_head* i;
printk("ndas: sal_netdev_open devname=%s\n", devname);
// dbgl_salnet(1, "ing name=%s", devname);
if ( strcmp(devname, "lo") == 0 ) {
goto errout;
}
/* Check that the device exists */
printk("ndas: sal_netdev_open DEV_GET_BY_NAME(devname)\n");
dev = DEV_GET_BY_NAME(devname);
if (unlikely(dev == NULL)) {
dbgl_salnet(1, "no such device %s:\n",devname);
goto errout;
}
/*
 * An empty global list is treated as an error here, so with this check in
 * place the first device can never be opened. This test also runs before
 * the lock is taken.
 */
if (list_empty(&v_sal_net_global.net_dev_list) ){
printk("ndas: sal_netdev_open v_sal_net_global.netdevlist exits but is empty\n");
goto errout;
} else {
printk("ndas: sal_netdev_open v_sal_net_global.netdevlist is not empty.\n");
printk("ndas: sal_netdev_open Test to see if NDAS is on list device(s).\n");
printk("ndas: sal_netdev_open spin_lock_irqsave\n");
spin_lock_irqsave(&v_sal_net_global.lock, flags);
/* ret is reused as a counter of list entries visited. */
ret = 0;
printk("ndas: sal_netdev_open list_for_each\n");
list_for_each(i, &v_sal_net_global.net_dev_list)
{
ret ++;
printk("ndas: sal_netdev_open netdevlist list_entry %d \n", ret);
psn = list_entry(i, _network_device_t, link);
/* printk("ndas: sal_netdev_open sal_assert(psn ...\n");
sal_assert(psn != NULL);
*/
if(psn != NULL) {
printk("ndas: sal_netdev_ psn is not NULL. Check adapter is already in use by NDAS\n");
/* Hack. just assuming the list is empty, so the name sure won't be
* the same either. Thus it will not test a null pointer
*/
printk("ndas: sal_netdev_ if (psn->devname != NULL \n");
/* NOTE(review): devname is declared as a char array in _network_device_t, so this test looks always true - confirm. */
if (psn->devname != NULL){
printk("ndas: sal_netdev_open test psn->devname: if !strncomp(%s,%s,IFNAMISIZ)\n",psn->devname,devname);
if ( !strncmp(psn->devname,devname,IFNAMSIZ) ) {
/* Already opened device */
printk("ndas: sal_netdev_open Already opened device\n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
/* psn is a live list entry; clear it so errout does not free it. */
psn = NULL;
goto errout;
} else {
printk("ndas: sal_netdev_ strncmp(psn-devname,devname)=%d\n",strncmp(psn->devname,devname,IFNAMSIZ));
printk("ndas: gonna fill up this psn with a new device.");
}
} else {
printk("ndas: sal_netdev_ psn-devname is NULL! No need to check if in use. \n");
/* This branch unlocks and then breaks to code that unlocks again after the loop. */
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
break;
}
}
/* Reached for every entry that did not match, whether or not psn was NULL; the break ends the scan after the first entry. */
printk("ndas: sal_netdev_ psn is NULL! No need to check if in use. \n");
break;
}
printk("ndas: tested %d devices in v_sal_net_global.net_dev_list\n",ret);
printk("ndas: sal_netdev_open spin_unlock_irqrestore\n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
/* Now everything OK,
* create place holder for new net interface */
printk("ndas: sal_netdev_open create place holder for new net interface\n");
psn = (_network_device_t*) sal_malloc(sizeof(_network_device_t));
if ( !psn ) {
goto errout;
}
printk("ndas: sal_netdev_open memset size of network device\n");
memset(psn, 0, sizeof(_network_device_t));
printk("ndas: sal_netdev_open strncpy devname ... \n");
strncpy(psn->devname, devname, IFNAMSIZ); // TODO: strncpy
printk("ndas: sal_netdev_open psn->if_dev\n");
psn->if_dev = dev;
printk("ndas: sal_netdev_open ret = sal_netdev_get_addresss\n");
ret = sal_netdev_get_address((sal_netdev_desc)psn, psn->mac);
if (ret!=0) {
printk("ndas: sal_netdev_open sal_netdev_get_address != 0 \n");
goto errout;
}
/* Link to net list */
printk("ndas: sal_netdev_open Link to net list, spin_lock_irqsave\n");
spin_lock_irqsave(&v_sal_net_global.lock, flags);
/*
 * NOTE(review): the four checks below compare list_empty()'s result with
 * NULL or test it directly. list_empty() yields a truth value, not a
 * pointer, so "== NULL" holds when the list is reported as NOT empty; none
 * of these checks tests a pointer for NULL - verify against the kernel
 * list API. Each failing branch also sets psn = NULL before errout, so the
 * allocation made above is not freed on those paths.
 */
printk("ndas: see if &psn->link is NULL. \n");
if(list_empty(&psn->link) == NULL) {
printk("ndas: &psn->link is NULL. \n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
}
printk("ndas: see if &psn->link is empty. \n");
if(list_empty(&psn->link)) {
printk("ndas: &psn->link is empty. \n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
}
printk("ndas: see if &sal_net_global.net_dev_list is NULL. \n");
if(list_empty(&v_sal_net_global.net_dev_list) == NULL) {
printk("ndas: &sal_net_global.net_dev_list is NULL. \n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
}
printk("ndas: see if &sal_net_global.net_dev_list is empty. \n");
if(list_empty(&v_sal_net_global.net_dev_list) ) {
printk("ndas: &sal_net_global.net_dev_list is empty. \n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
}
/* Deliberate early exit for the experiment: everything from here to the closing brace of the else block is unreachable. */
printk("ndas: jumping out \n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
printk("ndas: sal_netdev_open list_add ");
list_add(&psn->link, &v_sal_net_global.net_dev_list);
printk("ndas: sal_netdev_open unlock_irqrestore \n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
printk("ndas: sal_netdev_open dev_put");
dev_put(dev);
printk("ndas: sal_netdev_open sal_netdev_opened : %s\n", devname);
dbgl_salnet(5, "sal_netdev_opened : %s", devname);
printk("ndas: sal_netdev_open return sal_netdev_desc)psn\n");
return (sal_netdev_desc)psn;
}
/* Common failure exit: drop the device if it was looked up, free psn if it is still set. */
errout:
printk("ndas: sal_netdev_open errout: ");
dbgl_salnet(1, "sal_netdev_open : can\'t open %s\n", devname);
printk("ndas: sal_netdev_open ");
if (dev) {
printk("ndas: sal_netdev_open if(Dev) dev_put\n");
dev_put(dev);
}
if (psn) {
printk("ndas: sal_netdev_open if(psn) sal_free\n");
sal_free(psn); // psn is not NULL
}
printk("ndas: sal_netdev_open return SAL invalid net desc\n");
return SAL_INVALID_NET_DESC;
}
EXPORT_SYMBOL(sal_netdev_open);
So in this case, I find the null pointer indicated here:
printk("ndas: see if &psn->link is NULL. \n");
if(list_empty(&psn->link) == NULL) {
I am confused then to know what is broken.
I was led to believe that list_add will automatically initialize a list_head in a structure. Could it be that this is not working?
This is the structure:
/**
 * _network_device_t - one entry in the list of network devices opened by NDAS
 * @link - circular linked-list node; first member of the structure
 * @if_dev - the struct net_device this entry refers to
 * @rx_handler - forward call when ptype.rx_handler is called.
 * @use_virtual_mac - non-zero to present virtual_mac instead of the real MAC
 * @devname - the name of network device (IFNAMSIZ bytes)
 * @mac - the MAC address of this interface (IFHWADDRLEN bytes)
 * @virtual_mac - the fake MAC address used when use_virtual_mac is set
 */
typedef struct _network_device {
struct list_head link;
struct net_device *if_dev;
sal_net_rx_proc rx_handler;
int use_virtual_mac; /* Used to change my MAC address to fake MAC. Used for netdisk emulation.*/
char devname[IFNAMSIZ];
char mac[IFHWADDRLEN]; /* MAC address of this interface */
char virtual_mac[IFHWADDRLEN];
} _network_device_t;
Should this code do some INIT_LIST_HEAD((&psn->link) somewhere along the line?
Well maybe.
After modding a bit more, I am able to load all three modules. This section re-initializes the empty lists.
/* Link to net list */
printk("ndas: sal_netdev_open Link to net list, spin_lock_irqsave\n");
spin_lock_irqsave(&v_sal_net_global.lock, flags);
printk("ndas: see if &psn->link is NULL. \n");
if(list_empty(&psn->link) == NULL) {
printk("ndas: &psn->link is NULL. INIT it and go on. \n");
INIT_LIST_HEAD(&psn->link);
/* spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
*/
}
printk("ndas: see if &psn->link is empty. \n");
if(list_empty(&psn->link)) {
printk("ndas: &psn->link is empty. Continuing. \n");
/* spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
*/
}
printk("ndas: see if &sal_net_global.net_dev_list is NULL. \n");
if(list_empty(&v_sal_net_global.net_dev_list) == NULL) {
printk("ndas: &sal_net_global.net_dev_list is NULL. re-INIT and go on\n");
INIT_LIST_HEAD(&v_sal_net_global.net_dev_list);
/* spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
*/
}
printk("ndas: see if &sal_net_global.net_dev_list is empty. \n");
if(list_empty(&v_sal_net_global.net_dev_list) ) {
printk("ndas: &sal_net_global.net_dev_list is empty. Let's keep going.\n");
/* spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
*/
}
/*
printk("ndas: jumping out \n");
spin_unlock_irqrestore(&v_sal_net_global.lock, flags);
psn = NULL;
goto errout;
*/
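Probably pretty dangerous. Note that list_empty() returns a truth value rather than a pointer, so `list_empty(x) == NULL` is true exactly when list_empty() reports the list as NOT empty; the INIT_LIST_HEAD(&v_sal_net_global.net_dev_list) above therefore resets the global list whenever it already holds entries.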
the module use count looks wrong when compared to a running computer.
On rawhide:
lsmod | grep ndas
ndas_block 222440 0
ndas_core 199569 0
ndas_sal 34317 2 ndas_core,ndas_block
On an earlier machine:
lsmod | grep ndas
ndas_block 40593 0
ndas_core 194673 1 ndas_block
ndas_sal 41904 6 ndas_block,ndas_core
Here is a bug out that happened when I unloaded the modules.
------------[ cut here ]------------
rawhide kernel: [ 4802.863404] WARNING: at lib/list_debug.c:59 __list_del_entry+0xa1/0xd0()
rawhide kernel: [ 4802.863406] Hardware name:
rawhide kernel: [ 4802.863409] list_del corruption. prev->next should be ffff8800b12c07a8, but was ffff8800b12c0310
rawhide kernel: [ 4802.863411] Modules linked in: ndas_core(OF-) ndas_block(OF) ndas_sal(OF) rfcomm ipt_MASQUERADE nf_conntrack_netbios_ns nf_conntrack_broadcast ip6table_mangle ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 iptable_nat nf_nat_ipv4 nf_nat iptable_mangle nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ebtable_filter ebtables bnep bluetooth rfkill ip6table_filter ip6_tables coretemp ppdev kvm_intel iTCO_wdt iTCO_vendor_support kvm microcode snd_hda_codec_idt snd_hda_intel snd_hda_codec usblp snd_hwdep snd_seq parport_pc parport snd_seq_device snd_pcm snd_page_alloc snd_timer snd soundcore i2c_i801 mei lpc_ich mfd_core serio_raw uinput ata_generic pata_acpi firewire_ohci r8169 firewire_core mii crc_itu_t sata_sil24 i915 pata_marvell video i2c_algo_bit drm_kms_helper e1000e drm i2c_core sunrpc [last unloaded: ndas_sal]
rawhideFeb 26 13:01:26 david-PC kernel: [ 4802.863490] Pid: 4555, comm: rmmod Tainted: GF O 3.8.0-0.rc7.git2.1.fc19.x86_64 #1
rawhide kernel: [ 4802.863493] Call Trace:
rawhide kernel: [ 4802.863500] [<ffffffff81068600>] warn_slowpath_common+0x70/0xa0
rawhide kernel: [ 4802.863504] [<ffffffff8106867c>] warn_slowpath_fmt+0x4c/0x50
rawhide kernel: [ 4802.863509] [<ffffffff813686c1>] __list_del_entry+0xa1/0xd0
rawhide kernel: [ 4802.863513] [<ffffffff813686fd>] list_del+0xd/0x30
rawhide kernel: [ 4802.863520] [<ffffffffa04ba54b>] sal_netdev_close+0x2b/0x50 [ndas_sal]
rawhide kernel: [ 4802.863530] [<ffffffffa0504efd>] lpxitf_destroy+0xcd/0x110 [ndas_core]
rawhide kernel: [ 4802.863538] [<ffffffffa050b6dd>] lpx_unregister_dev+0xcd/0xe0 [ndas_core]
rawhide kernel: [ 4802.863544] [<ffffffffa050110e>] ndas_unregister_network_interface+0xe/0x10 [ndas_core]
rawhide kernel: [ 4802.863552] [<ffffffffa0521ae5>] unregister_network+0x65/0x90 [ndas_core]
rawhide kernel: [ 4802.863560] [<ffffffffa0521cde>] cleanup_module+0xe/0x20 [ndas_core]
rawhide kernel: [ 4802.863564] [<ffffffff810e819b>] sys_delete_module+0x16b/0x300
rawhide kernel: [ 4802.863569] [<ffffffff8136091e>] ? trace_hardirqs_on_thunk+0x3a/0x3f
rawhide kernel: [ 4802.863575] [<ffffffff8170eb19>] system_call_fastpath+0x16/0x1b
rawhide kernel: [ 4802.863578] ---[ end trace 581498be290318eb ]---
rawhide kernel: [ 4802.863600] =============================================================================
rawhide kernel: [ 4802.863604] BUG kmalloc-128 (Tainted: GF W O): Redzone overwritten
rawhide kernel: [ 4802.863606] -----------------------------------------------------------------------------
It did not crash the computer though.
What's going on with this code? Should I just delete it and move on, or is it something that should be looked at and worked on?
Thanks for your help thus far.
What's going on with this code?
I did not get beyond the last bug. So, it is only working on the previous kernels.
Should I just delete it and move on,
This is your call i think.
or is it something that should be looked at and worked on?
Looks like none of your team has time. We hoped that the driver project might be able to tweak it together, but it seems like there is no interest at all. I thought for sure on this GitHub page, some developers could check into it.
If you can help me past the null pointer, or to "port" one of the existing network block device driver in Linux to use this, I keep going and pick up the maintenance.
NDAS is good to keep data away from IP addresses so spies do not know the location and fine for network monitoring sensors too.
However, with the explosion of cloud offerings and HD size increasing and speed of Internet access abounding, folks don't seem to need the large private storage as much.
On Thu, Dec 05, 2013 at 04:38:27PM -0800, IOCELL Networks wrote:
Thanks for your help thus far.
What's going on with this code?
I did get beyond the last bug. So, it is only working on the previous kernels.
Should I just delete it and move on,
This is your call i think.
or is it something that should be looked at and worked on?
Looks like none of your team has time. We hoped that the driver project might be able to tweak it together, but it seems like there is no interest at all. I thought for sure on this GitHub page, some developers could check into it.
Sorry, no, I was waiting for you, for some reason I had stopped because I didn't think the code was working. That's why I never pointed anyone else at the code. If I misunderstood this, very sorry, that's my fault.
If you can help me past the null pointer, or to "port" one of the existing network block device driver in Linux to use this, I keep going and pick up the maintenance.
I don't know what is wrong here, sorry. I can't see any differences in the code, but I must be missing something.
thanks,
greg k-h
This proto version is surely oopsing when ndas_core connects.
Seems like as the network card is registered it makes a null pointer. The only thing close that I saw in the past was this bug. http://ndas4linux.iocellnetworks.com/trac/index.cgi/ticket/1
But I undid the commented-out line from that bug to test it, and there was still the same crash.
I will try to understand a little better tomorrow. Thanks again.
Here is a basic oops.