Open youzhongyang opened 5 years ago
More info: our network cards:
# prtconf -Dd | grep ixgbe
pci15d9,1528 (pciex8086,1528) [Intel Corporation Ethernet Controller 10-Gigabit X540-AT2], instance #4 (driver name: ixgbe)
pci15d9,1528 (pciex8086,1528) [Intel Corporation Ethernet Controller 10-Gigabit X540-AT2], instance #5 (driver name: ixgbe)
pci8086,3 (pciex8086,10fb) [Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection], instance #0 (driver name: ixgbe)
pci8086,3 (pciex8086,10fb) [Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection], instance #1 (driver name: ixgbe)
pci8086,3 (pciex8086,10fb) [Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection], instance #2 (driver name: ixgbe)
pci8086,3 (pciex8086,10fb) [Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection], instance #3 (driver name: ixgbe)
Our /usbkey/config:
# cat /usbkey/config
admin_nic=aggr0
admin_ip=172.30.192.25
admin_netmask=255.255.255.0
admin_network=
admin_gateway=172.30.192.25
headnode_default_gateway=172.30.192.1
dns_resolvers=***
dns_domain=***
ntp_hosts=***
compute_node_ntp_hosts=172.30.192.25
hostname=batfs1099
aggr0_aggr=90:e2:ba:eb:6:d8,90:e2:ba:eb:6:d9,90:e2:ba:eb:6:8,90:e2:ba:eb:6:9
aggr0_lacp_mode=active
admin0_nic=aggr0
admin0_ip=172.30.192.26
admin0_netmask=255.255.255.0
admin0_gateway=172.30.192.26
admin1_nic=aggr0
admin1_ip=172.30.192.27
admin1_netmask=255.255.255.0
admin1_gateway=172.30.192.27
MAC clients, their SRS count and rx_ring_group:
> ::walk mac_client_impl_cache | ::printf "%p %-16s %4d %20p\n" mac_client_impl_t . mci_name mci_flent->fe_rx_srs_cnt mci_flent->fe_rx_ring_group
ffffd0c5939d0008 z1_net0 17 ffffd0c7babe0280
ffffd0c5939d1478 admin1 17 ffffd0c7babe01e0
ffffd0c5939d28e8 admin0 17 ffffd0c7babe0140
ffffd0c5939d3d58 aggr0 17 ffffd0c7babe00a0
ffffd0c5939d51c8 aggr0-ixgbe3 0 ffffd0c571794000
ffffd0c5939d6638 aggr0-ixgbe2 0 ffffd0c572e2e000
ffffd0c5939d7aa8 ixgbe2 0 0
ffffd0c904a598f8 z6_net0 17 ffffd0c7babe05a0
ffffd0c5939d9010 aggr0-ixgbe1 0 ffffd0c50fdca000
ffffd0c904a5ad68 z5_net0 17 ffffd0c7babe0500
ffffd0c5939da480 aggr0-ixgbe0 0 ffffd0c570fd1000
ffffd0c5939db8f0 ixgbe5 5 ffffd0c5713110a0
ffffd0c904a5c1d8 z4_net0 17 ffffd0c7babe0460
ffffd0c5939dcd60 ixgbe4 5 ffffd0c572e1c0a0
ffffd0c904a5d648 z3_net0 17 ffffd0c7babe03c0
ffffd0c904a5eab8 z2_net0 17 ffffd0c7babe0320
ffffd0c5939de1d0 ixgbe3 0 0
ffffd0c5939df640 ixgbe1 0 0
ffffd0c5939e0ab0 ixgbe0 0 0
aggr0 SRSes:
> ffffd0c5939d3d58::print mac_client_impl_t mci_flent | ::print flow_entry_t fe_rx_srs_cnt fe_rx_srs
fe_rx_srs_cnt = 0x11
fe_rx_srs = [ 0xffffd0c7bb110300, 0xffffd0c7bb111040, 0xffffd0c7bb111d00, 0xffffd0c7bb114340, 0xffffd0c7bb116980, 0xffffd0c7bb117640, 0xffffd0c7bb118300, 0xffffd0c6b8797040, 0xffffd0c7bb10dcc0, 0xffffd0c7bb10d000, 0xffffd0c7bb304340, 0xffffd0c7bb303680, 0xffffd0c7bb3029c0, 0xffffd0c7bb301d00, 0xffffd0c7bb301040, 0xffffd0c7bb2fc300, 0xffffd0c7bb2fb640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... ]
aggr0 rx_ring_group:
> ffffd0c5939d3d58::print mac_client_impl_t mci_flent->fe_rx_ring_group | ::print -at mac_group_t
ffffd0c7babe00a0 mac_group_t {
ffffd0c7babe00a0 int mrg_index = 0x1
ffffd0c7babe00a4 mac_ring_type_t mrg_type = 0x1 (MAC_RING_TYPE_RX)
ffffd0c7babe00a8 mac_group_state_t mrg_state = 2 (MAC_GROUP_STATE_RESERVED)
ffffd0c7babe00b0 mac_group_t *mrg_next = 0xffffd0c7babe0140
ffffd0c7babe00b8 mac_handle_t mrg_mh = 0xffffd0c593966b80
ffffd0c7babe00c0 mac_ring_t *mrg_rings = 0xffffd0c7bac4f228
ffffd0c7babe00c8 uint_t mrg_cur_count = 0x10
ffffd0c7babe00d0 mac_grp_client_t *mrg_clients = 0xffffd0c5960afde0
ffffd0c7babe00d8 mac_group_info_t mrg_info = {
ffffd0c7babe00d8 mac_group_driver_t mgi_driver = 0xffffd0c5ab02ad58
ffffd0c7babe00e0 mac_group_start_t mgi_start = 0
ffffd0c7babe00e8 mac_group_stop_t mgi_stop = 0
ffffd0c7babe00f0 uint_t mgi_count = 0
ffffd0c7babe00f8 mac_intr_t mgi_intr = {
ffffd0c7babe00f8 mac_intr_handle_t mi_handle = 0
ffffd0c7babe0100 mac_intr_enable_t mi_enable = 0
ffffd0c7babe0108 mac_intr_disable_t mi_disable = 0
ffffd0c7babe0110 ddi_intr_handle_t mi_ddi_handle = 0
ffffd0c7babe0118 boolean_t mi_ddi_shared = 0 (0)
}
ffffd0c7babe0120 mac_add_mac_addr_t mgi_addmac = aggr_addmac
ffffd0c7babe0128 mac_rem_mac_addr_t mgi_remmac = aggr_remmac
ffffd0c7babe0130 mac_add_vlan_filter_t mgi_addvlan = aggr_addvlan
ffffd0c7babe0138 mac_rem_vlan_filter_t mgi_remvlan = aggr_remvlan
}
}
aggr0 SRSes' intr bytes and state:
> ffffd0c5939d3d58::print mac_client_impl_t mci_flent | ::print flow_entry_t fe_rx_srs | ::array void* 11 | ::print void* | ::printf "%16x\t%x\n" mac_soft_ring_set_t srs_rx.sr_stat.mrs_intrbytes srs_state
2a87622 24000002
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
0 2400000e
admin1 SRSes' intr bytes and state:
> ffffd0c5939d1478::print mac_client_impl_t mci_flent | ::print flow_entry_t fe_rx_srs | ::array void* 11 | ::print void* | ::printf "%16x\t%x\n" mac_soft_ring_set_t srs_rx.sr_stat.mrs_intrbytes srs_state
33b029 24000002
6e63356d5 2400000e
ba586f87a 2400000e
71b123047 2400000e
5713e601f 2400000e
23314db060 2400000e
fd423a50e 2400000e
27a5d7a646 2400081e
d5556b8ee 2400000e
2448 2400000e
87babe76b 2400007e
607168f21 2400000e
8baa0cade 2400004e
ebf14734d 2400000e
b8eeb9659 2400000e
2a7662f67c 2400000e
23634602d4 2400000e
z1_net0(a VNIC for a zone) SRSes' intr bytes and state:
> ffffd0c5939d0008::print mac_client_impl_t mci_flent | ::print flow_entry_t fe_rx_srs | ::array void* 11 | ::print void* | ::printf "%16x\t%x\n" mac_soft_ring_set_t srs_rx.sr_stat.mrs_intrbytes srs_state
342672 24000002
1c3c85 2400000e
13f9b1 2400000e
21fda0 2400000e
13396e 2400000e
11fd0 2400000e
2cddd 2400000e
c3fd 2400000e
1780a 2400000e
14c975 2400000e
da4e0 2400000e
11669f 2400000e
150fa5 2400000e
81f6 2400000e
18cb5 2400000e
1bfb2 2400000e
f3a0 2400000e
When the kernel has elected to use the software lanes, then we always stand a chance of hitting the high watermark that it has for backpressure, which results in the rxsdrops. I think what we need to understand is why the kernel opted to do so.
What does ring gen_num of 1 mean? I can see the gen_num of rings for aggr0 is 1, which appears to be abnormal.
> ::walk mac_client_impl_cache | ::printf "%p %-16s %4d %20p\n" mac_client_impl_t . mci_name mci_flent->fe_rx_srs_cnt mci_flent->fe_rx_ring_group
ffffd0c8cbbdb8f8 z6_net0 17 ffffd0c7ba9e95a0
ffffd0c8cbbdcd68 z4_net0 17 ffffd0c7ba9e9500
ffffd0c8cbbde1d8 z5_net0 17 ffffd0c7ba9e9460
ffffd0c8cbbdf648 z3_net0 17 ffffd0c7ba9e93c0
ffffd0c8cbbe0ab8 z2_net0 17 ffffd0c7ba9e9320
ffffd0c593966008 z1_net0 17 ffffd0c7ba9e9280
ffffd0c593967478 admin1 17 ffffd0c7ba9e91e0
ffffd0c5939688e8 admin0 17 ffffd0c7ba9e9140
ffffd0c593969d58 aggr0 17 ffffd0c7ba9e90a0
ffffd0c59396b1c8 aggr0-ixgbe3 0 ffffd0c5734d0000
ffffd0c59396c638 aggr0-ixgbe2 0 ffffd0c571420000
ffffd0c59396daa8 ixgbe0 0 0
ffffd0c59396f010 aggr0-ixgbe1 0 ffffd0c572e5e000
ffffd0c593970480 aggr0-ixgbe0 0 ffffd0c571637000
ffffd0c5939718f0 ixgbe5 5 ffffd0c594f440a0
ffffd0c593972d60 ixgbe4 5 ffffd0c594f4a0a0
ffffd0c5939741d0 ixgbe1 0 0
ffffd0c593975640 ixgbe3 0 0
ffffd0c593976ab0 ixgbe2 0 0
aggr0:
> ffffd0c7ba9e90a0::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_gen_num
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
> ffffd0c7ba9e90a0::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_info.mri_driver | ::print aggr_pseudo_rx_ring_t arr_hw_rh | ::print mac_ring_t mr_info.mri_driver | ::printf "%-16d %d\n" ixgbe_rx_ring_t stat_ipackets ring_gen_num
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 1
0 1
admin0:
> ffffd0c7ba9e9140::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_gen_num
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
> ffffd0c7ba9e9140::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_info.mri_driver | ::print aggr_pseudo_rx_ring_t arr_hw_rh | ::print mac_ring_t mr_info.mri_driver | ::printf "%-16d %d\n" ixgbe_rx_ring_t stat_ipackets ring_gen_num
355395455 0
227104173 0
237647385 0
199660433 0
231530313 0
180110079 0
202886421 0
227397803 0
181857652 0
218078952 0
233618762 0
207876151 0
272891844 0
416724862 0
216742891 0
212029696 0
z1_net0:
> ffffd0c7ba9e9280::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_gen_num
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
> ffffd0c7ba9e9280::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_info.mri_driver | ::print aggr_pseudo_rx_ring_t arr_hw_rh | ::print mac_ring_t mr_info.mri_driver | ::printf "%-16d %d\n" ixgbe_rx_ring_t stat_ipackets ring_gen_num
4753997 0
5308389 0
3458119 0
4483390 0
5750086 0
3497963 0
3365930 0
5886896 0
538679 0
6597434 0
2291767 0
5950493 0
2378832 0
6698355 0
3419547 0
2184117 0
Figured out why aggr0 does not use hw lanes - when aggr_addmac() tries to register mac address for aggr0 interface, it does nothing and immediately returns.
Here are more info:
Script to stop aggr:
# cat /var/tmp/stop-aggr
#!/bin/bash
ifconfig aggr0 down
ifconfig aggr0 unplumb
dladm delete-aggr aggr0
Script to start aggr:
# cat /var/tmp/start-aggr
#!/bin/bash
dladm create-aggr -L active -l ixgbe0 -l ixgbe1 aggr0
ifconfig aggr0 plumb
ifconfig aggr0 inet 172.31.16.19 netmask 255.255.255.0 up
route add default 172.31.16.1
dtrace script:
# cat /var/tmp/dtrace-unicast_add
#!/bin/bash
dtrace -n 'aggr_addmac:entry {stack();printf("aggr_pseudo_rx_group_t * = %p\n", args[0]); printf("arg_index = %d\n", ((aggr_pseudo_rx_group_t*)args[0])->arg_index); tracemem(args[1], 6); tracemem( ((aggr_pseudo_rx_group_t*)args[0])->arg_grp->lg_addr, 6 ); }' -n 'i_mac_unicast_add:entry {stack(); ch = (mac_client_impl_t*)args[0]; printf("mac_client_impl_t %p %s mac_impl_t %p mi_name %s mac_addr %p\n", ch, ch->mci_name, ch->mci_mip, ch->mci_mip->mi_name, args[1]); tracemem(ch->mci_mip->mi_addr, 6); }' -o /var/tmp/out-aggr_addmac-unicast_add
Steps to get dtrace output:
# /var/tmp/stop-aggr
# /var/tmp/dtrace-unicast_add &
# /var/tmp/start-aggr
dtrace output:
# cat /var/tmp/out-aggr_addmac-unicast_add
CPU ID FUNCTION:NAME
0 28335 i_mac_unicast_add:entry
mac`mac_unicast_add+0x70
aggr`aggr_port_create+0x205
aggr`aggr_grp_add_port+0x8d
aggr`aggr_grp_create+0x2b1
aggr`aggr_ioc_create+0x132
dld`drv_ioctl+0x1e4
genunix`cdev_ioctl+0x39
specfs`spec_ioctl+0x60
genunix`fop_ioctl+0x55
genunix`ioctl+0x9b
unix`_sys_sysenter_post_swapgs+0x153
mac_client_impl_t ffffd0c600884018 aggr0-ixgbe0 mac_impl_t ffffd0c5932a1130 mi_name ixgbe0 mac_addr 0
0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
0: 90 e2 ba b3 55 9c ....U.
0 28335 i_mac_unicast_add:entry
mac`mac_unicast_add+0x70
aggr`aggr_port_create+0x205
aggr`aggr_grp_add_port+0x8d
aggr`aggr_grp_create+0x2b1
aggr`aggr_ioc_create+0x132
dld`drv_ioctl+0x1e4
genunix`cdev_ioctl+0x39
specfs`spec_ioctl+0x60
genunix`fop_ioctl+0x55
genunix`ioctl+0x9b
unix`_sys_sysenter_post_swapgs+0x153
mac_client_impl_t ffffd0c600885488 aggr0-ixgbe1 mac_impl_t ffffd0c593297008 mi_name ixgbe1 mac_addr 0
0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
0: 90 e2 ba b3 55 9d ....U.
19 28335 i_mac_unicast_add:entry
mac`mac_unicast_add+0x70
dls`dls_mac_active_set+0x5e
dls`dls_active_set+0x75
dld`proto_bind_req+0xc1
dld`dld_proto+0x255
dld`dld_wput_nondata_task+0x80
genunix`taskq_d_thread+0xb7
unix`thread_start+0x8
mac_client_impl_t ffffd0c6232c2ac0 aggr0 mac_impl_t ffffd0c6232da6f0 mi_name aggr1032 mac_addr 0
0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
0: 90 e2 ba b3 55 9c ....U.
19 72304 aggr_addmac:entry
mac`mac_group_addmac+0x8c
mac`mac_add_macaddr_vlan+0x19b
mac`mac_datapath_setup+0x570
mac`mac_client_datapath_setup+0x22e
mac`i_mac_unicast_add+0x615
mac`mac_unicast_add+0x70
dls`dls_mac_active_set+0x5e
dls`dls_active_set+0x75
dld`proto_bind_req+0xc1
dld`dld_proto+0x255
dld`dld_wput_nondata_task+0x80
genunix`taskq_d_thread+0xb7
unix`thread_start+0x8
aggr_pseudo_rx_group_t * = ffffd0c5a418ac98
arg_index = 1
0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
0: 90 e2 ba b3 55 9c ....U.
0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
0: 90 e2 ba b3 55 9c ....U.
37 28335 i_mac_unicast_add:entry
mac`mac_unicast_add+0x70
dls`dls_mac_active_set+0x5e
dls`dls_active_set+0x75
dld`proto_bind_req+0xc1
dld`dld_proto+0x255
dld`dld_wput_nondata_task+0x80
genunix`taskq_d_thread+0xb7
unix`thread_start+0x8
mac_client_impl_t ffffd0c6232c2ac0 aggr0 mac_impl_t ffffd0c6232da6f0 mi_name aggr1032 mac_addr 0
0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
0: 90 e2 ba b3 55 9c ....U.
37 72304 aggr_addmac:entry
mac`mac_group_addmac+0x8c
mac`mac_add_macaddr_vlan+0x19b
mac`mac_datapath_setup+0x570
mac`mac_client_datapath_setup+0x22e
mac`i_mac_unicast_add+0x615
mac`mac_unicast_add+0x70
dls`dls_mac_active_set+0x5e
dls`dls_active_set+0x75
dld`proto_bind_req+0xc1
dld`dld_proto+0x255
dld`dld_wput_nondata_task+0x80
genunix`taskq_d_thread+0xb7
unix`thread_start+0x8
aggr_pseudo_rx_group_t * = ffffd0c5a418ac98
arg_index = 1
0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
0: 90 e2 ba b3 55 9c ....U.
0 1 2 3 4 5 6 7 8 9 a b c d e f 0123456789abcdef
0: 90 e2 ba b3 55 9c ....U.
I have no idea why the exactly same thing (i_mac_unicast_add() with same input args) was run twice.
So I attempted the following fix, it works:
http://src.illumos.org/source/xref/illumos-gate/usr/src/uts/common/io/aggr/aggr_grp.c#2351
static int
aggr_addmac(void *arg, const uint8_t *mac_addr)
{
aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
aggr_unicst_addr_t *addr, **pprev;
aggr_grp_t *grp = rx_group->arg_grp;
aggr_port_t *port, *p;
mac_perim_handle_t mph;
int err = 0;
mac_perim_enter_by_mh(grp->lg_mh, &mph);
/* remove the following if statement */
/*
if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
mac_perim_exit(mph);
return (0);
}
*/
/*
* Insert this mac address into the list of mac addresses owned by
* the aggregation pseudo group.
*/
pprev = &rx_group->arg_macaddr;
while ((addr = *pprev) != NULL) {
if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
mac_perim_exit(mph);
return (EEXIST); ===> return (0);
}
pprev = &addr->aua_next;
}
...
}
I am pretty sure it is hackish, and doesn't seem to be the right way to fix the issue.
Please advise.
@youzhongyang That check is for the aggr's MAC address. It should have already been set on all ports during creation. You don't want to set it again on a different group because it's already being filtered to another group. I forget why this case is being handled slightly differently than the EEXIST case. I'd need more time to boot my brain back into this code. I'm just a little too busy with some other stuff at the moment.
@rzezeski You're right, the mac address of the aggr interface is filtered on rx group 0 for all ports, but mac client aggr0 tries to use rx group 1 for hw lanes, if we don't register the mac address for rx group 1 by ixgbe_set_rar(), then there will no packets coming in through rx group 1.
This is what unicst_addr[1] looks like without the attempted fix, as you can see, it is empty:
> ffffd0c4f9ddc2b0::print struct dev_info devi_driver_data | ::print ixgbe_t unicst_addr[0] unicst_addr[1]
unicst_addr[0] = {
unicst_addr[0].reg = {
high = 0xe2900001
low = 0x9c55b3ba
}
unicst_addr[0].mac = {
set = 0x1
group_index = 0
addr = [ 0x90, 0xe2, 0xba, 0xb3, 0x55, 0x9c ]
}
}
unicst_addr[1] = {
unicst_addr[1].reg = {
high = 0
low = 0
}
unicst_addr[1].mac = {
set = 0
group_index = 0
addr = [ 0, 0, 0, 0, 0, 0 ]
}
}
With the fix, it went through rar registration and has data in unicst_addr[1]:
> ffffd0c4f9bd3010::print struct dev_info devi_driver_data | ::print -at ixgbe_t unicst_addr[0] unicst_addr[1]
ffffd0c573136bec ixgbe_ether_addr_t unicst_addr[0] = {
ffffd0c573136bec struct unicst_addr[0].reg = {
ffffd0c573136bec uint32_t high = 0xe2900001
ffffd0c573136bf0 uint32_t low = 0xd806ebba
}
ffffd0c573136bec struct unicst_addr[0].mac = {
ffffd0c573136bec uint8_t set = 0x1
ffffd0c573136bed uint8_t group_index = 0
ffffd0c573136bee uint8_t [6] addr = [ 0x90, 0xe2, 0xba, 0xeb, 0x6, 0xd8 ]
}
}
ffffd0c573136bf4 ixgbe_ether_addr_t unicst_addr[1] = {
ffffd0c573136bf4 struct unicst_addr[1].reg = {
ffffd0c573136bf4 uint32_t high = 0xe2900101
ffffd0c573136bf8 uint32_t low = 0xd806ebba
}
ffffd0c573136bf4 struct unicst_addr[1].mac = {
ffffd0c573136bf4 uint8_t set = 0x1
ffffd0c573136bf5 uint8_t group_index = 0x1
ffffd0c573136bf6 uint8_t [6] addr = [ 0x90, 0xe2, 0xba, 0xeb, 0x6, 0xd8 ]
}
}
I've been doing stress testing and our functional testing since last night, so far so good.
The proposed 'fix' works for ixgbe, but not for i40e (with multi-group support). On aggregated i40e interface, once VSI 0 and VSI 1 are filtered with the same aggr MAC address, incoming packets go through VSI 0 and VSI 1 'equally', thus causing unexpected behavior.
A better 'fix'. I have tested on ixgbe cards and i40e cards, no issue found.
diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c
index 4784095..9944541 100644
--- a/usr/src/uts/common/io/aggr/aggr_grp.c
+++ b/usr/src/uts/common/io/aggr/aggr_grp.c
@@ -2529,11 +2529,6 @@ aggr_addmac(void *arg, const uint8_t *mac_addr)
mac_perim_enter_by_mh(grp->lg_mh, &mph);
- if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
- mac_perim_exit(mph);
- return (0);
- }
-
/*
* Insert this mac address into the list of mac addresses owned by
* the aggregation pseudo group.
@@ -2579,11 +2574,6 @@ aggr_remmac(void *arg, const uint8_t *mac_addr)
mac_perim_enter_by_mh(grp->lg_mh, &mph);
- if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
- mac_perim_exit(mph);
- return (0);
- }
-
/*
* Insert this mac address into the list of mac addresses owned by
* the aggregation pseudo group.
diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c
index e1feaea..d340d38 100644
--- a/usr/src/uts/common/io/i40e/i40e_gld.c
+++ b/usr/src/uts/common/io/i40e/i40e_gld.c
@@ -133,8 +133,7 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr)
for (i = 0; i < i40e->i40e_resources.ifr_nmacfilt_used; i++) {
if (bcmp(mac_addr, i40e->i40e_uaddrs[i].iua_mac,
ETHERADDRL) == 0) {
- ret = EEXIST;
- goto done;
+ ret = 0;
}
}
diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index 6eea8b0..9ce06df 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -3070,6 +3070,18 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0)
goto setup_failed;
+ /* Remove mac address filtering on default rx group,
+ * so that it is only for receving multicast and broadcast
+ * packets. This will allow hardware classification on the
+ * non-default group "rgroup", especially when it is the aggr
+ * interface itself.
+ */
+ if (rgroup != NULL && rgroup != default_rgroup &&
+ default_rgroup->mrg_state == MAC_GROUP_STATE_RESERVED &&
+ (mip->mi_state_flags & MIS_IS_AGGR)) {
+ mac_group_remmac(default_rgroup, mip->mi_info.mi_unicst_addr);
+ }
+
/* Program hardware classification. */
vid = i_mac_flow_vid(flent);
use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;
It's easy to reproduce using iperf3. Here is how we did it:
On the host, run a few iperf3 server processes:
where 172.30.192.25 is the IP bound to aggr0. Then on a few other hosts on the network, run iperf3 clients to talk to 172.30.192.25:
On the server host, run "kstat -p aggr0:0:mac_rx_swlane0:rxsdrops 1" to see if the counter changes.
The issue was reproduced using latest smartos image platform-20180830T001556Z.