TritonDataCenter / illumos-joyent

Community developed and maintained version of the OS/Net consolidation
http://www.illumos.org/projects/illumos-gate
265 stars 111 forks source link

rxsdrops on aggr0 of ixgbe cards with IP plumbed #179

Open youzhongyang opened 5 years ago

youzhongyang commented 5 years ago

It's easy to reproduce using iperf3. Here is how we did it:

On the host, run a few iperf3 server processes:

iperf3 -B 172.30.192.25 -s -p 5701 > /dev/null 2>&1 < /dev/null &
iperf3 -B 172.30.192.25 -s -p 5702 > /dev/null 2>&1 < /dev/null &
iperf3 -B 172.30.192.25 -s -p 5703 > /dev/null 2>&1 < /dev/null &
iperf3 -B 172.30.192.25 -s -p 5704 > /dev/null 2>&1 < /dev/null &
iperf3 -B 172.30.192.25 -s -p 5705 > /dev/null 2>&1 < /dev/null &
iperf3 -B 172.30.192.25 -s -p 5706 > /dev/null 2>&1 < /dev/null &

where 172.30.192.25 is the IP bound to aggr0. Then on a few other hosts on the network, run iperf3 clients to talk to 172.30.192.25:

iperf3 -c 172.30.192.25 -t 3600 -P 10 -p 5701 -N -b 0 > /dev/null 2>&1 < /dev/null &
iperf3 -c 172.30.192.25 -t 3600 -P 10 -p 5702 -N -b 0 > /dev/null 2>&1 < /dev/null &
iperf3 -c 172.30.192.25 -t 3600 -P 10 -p 5703 -N -b 0 > /dev/null 2>&1 < /dev/null &
iperf3 -c 172.30.192.25 -t 3600 -P 10 -p 5704 -N -b 0 > /dev/null 2>&1 < /dev/null &
iperf3 -c 172.30.192.25 -t 3600 -P 10 -p 5705 -N -b 0 > /dev/null 2>&1 < /dev/null &
iperf3 -c 172.30.192.25 -t 3600 -P 10 -p 5706 -N -b 0 > /dev/null 2>&1 < /dev/null &

On the server host, run "kstat -p aggr0:0:mac_rx_swlane0:rxsdrops 1" to see if the counter changes.

The issue was reproduced using latest smartos image platform-20180830T001556Z.

youzhongyang commented 5 years ago

More info: our network cards:

# prtconf -Dd | grep ixgbe
            pci15d9,1528 (pciex8086,1528) [Intel Corporation Ethernet Controller 10-Gigabit X540-AT2], instance #4 (driver name: ixgbe)
            pci15d9,1528 (pciex8086,1528) [Intel Corporation Ethernet Controller 10-Gigabit X540-AT2], instance #5 (driver name: ixgbe)
            pci8086,3 (pciex8086,10fb) [Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection], instance #0 (driver name: ixgbe)
            pci8086,3 (pciex8086,10fb) [Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection], instance #1 (driver name: ixgbe)
            pci8086,3 (pciex8086,10fb) [Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection], instance #2 (driver name: ixgbe)
            pci8086,3 (pciex8086,10fb) [Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection], instance #3 (driver name: ixgbe)

Our /usbkey/config:

# cat /usbkey/config
admin_nic=aggr0
admin_ip=172.30.192.25
admin_netmask=255.255.255.0
admin_network=
admin_gateway=172.30.192.25

headnode_default_gateway=172.30.192.1

dns_resolvers=***
dns_domain=***

ntp_hosts=***
compute_node_ntp_hosts=172.30.192.25

hostname=batfs1099

aggr0_aggr=90:e2:ba:eb:6:d8,90:e2:ba:eb:6:d9,90:e2:ba:eb:6:8,90:e2:ba:eb:6:9
aggr0_lacp_mode=active

admin0_nic=aggr0
admin0_ip=172.30.192.26
admin0_netmask=255.255.255.0
admin0_gateway=172.30.192.26

admin1_nic=aggr0
admin1_ip=172.30.192.27
admin1_netmask=255.255.255.0
admin1_gateway=172.30.192.27

MAC clients, their SRS count and rx_ring_group:

> ::walk mac_client_impl_cache | ::printf "%p %-16s %4d %20p\n" mac_client_impl_t . mci_name mci_flent->fe_rx_srs_cnt mci_flent->fe_rx_ring_group
ffffd0c5939d0008 z1_net0            17     ffffd0c7babe0280
ffffd0c5939d1478 admin1             17     ffffd0c7babe01e0
ffffd0c5939d28e8 admin0             17     ffffd0c7babe0140
ffffd0c5939d3d58 aggr0              17     ffffd0c7babe00a0
ffffd0c5939d51c8 aggr0-ixgbe3        0     ffffd0c571794000
ffffd0c5939d6638 aggr0-ixgbe2        0     ffffd0c572e2e000
ffffd0c5939d7aa8 ixgbe2              0                    0
ffffd0c904a598f8 z6_net0            17     ffffd0c7babe05a0
ffffd0c5939d9010 aggr0-ixgbe1        0     ffffd0c50fdca000
ffffd0c904a5ad68 z5_net0            17     ffffd0c7babe0500
ffffd0c5939da480 aggr0-ixgbe0        0     ffffd0c570fd1000
ffffd0c5939db8f0 ixgbe5              5     ffffd0c5713110a0
ffffd0c904a5c1d8 z4_net0            17     ffffd0c7babe0460
ffffd0c5939dcd60 ixgbe4              5     ffffd0c572e1c0a0
ffffd0c904a5d648 z3_net0            17     ffffd0c7babe03c0
ffffd0c904a5eab8 z2_net0            17     ffffd0c7babe0320
ffffd0c5939de1d0 ixgbe3              0                    0
ffffd0c5939df640 ixgbe1              0                    0
ffffd0c5939e0ab0 ixgbe0              0                    0

aggr0 SRSes:

> ffffd0c5939d3d58::print mac_client_impl_t mci_flent | ::print flow_entry_t fe_rx_srs_cnt fe_rx_srs
fe_rx_srs_cnt = 0x11
fe_rx_srs = [ 0xffffd0c7bb110300, 0xffffd0c7bb111040, 0xffffd0c7bb111d00, 0xffffd0c7bb114340, 0xffffd0c7bb116980, 0xffffd0c7bb117640, 0xffffd0c7bb118300, 0xffffd0c6b8797040, 0xffffd0c7bb10dcc0, 0xffffd0c7bb10d000, 0xffffd0c7bb304340, 0xffffd0c7bb303680, 0xffffd0c7bb3029c0, 0xffffd0c7bb301d00, 0xffffd0c7bb301040, 0xffffd0c7bb2fc300, 0xffffd0c7bb2fb640, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... ]

aggr0 rx_ring_group:

> ffffd0c5939d3d58::print mac_client_impl_t mci_flent->fe_rx_ring_group | ::print -at mac_group_t
ffffd0c7babe00a0 mac_group_t {
    ffffd0c7babe00a0 int mrg_index = 0x1
    ffffd0c7babe00a4 mac_ring_type_t mrg_type = 0x1 (MAC_RING_TYPE_RX)
    ffffd0c7babe00a8 mac_group_state_t mrg_state = 2 (MAC_GROUP_STATE_RESERVED)
    ffffd0c7babe00b0 mac_group_t *mrg_next = 0xffffd0c7babe0140
    ffffd0c7babe00b8 mac_handle_t mrg_mh = 0xffffd0c593966b80
    ffffd0c7babe00c0 mac_ring_t *mrg_rings = 0xffffd0c7bac4f228
    ffffd0c7babe00c8 uint_t mrg_cur_count = 0x10
    ffffd0c7babe00d0 mac_grp_client_t *mrg_clients = 0xffffd0c5960afde0
    ffffd0c7babe00d8 mac_group_info_t mrg_info = {
        ffffd0c7babe00d8 mac_group_driver_t mgi_driver = 0xffffd0c5ab02ad58
        ffffd0c7babe00e0 mac_group_start_t mgi_start = 0
        ffffd0c7babe00e8 mac_group_stop_t mgi_stop = 0
        ffffd0c7babe00f0 uint_t mgi_count = 0
        ffffd0c7babe00f8 mac_intr_t mgi_intr = {
            ffffd0c7babe00f8 mac_intr_handle_t mi_handle = 0
            ffffd0c7babe0100 mac_intr_enable_t mi_enable = 0
            ffffd0c7babe0108 mac_intr_disable_t mi_disable = 0
            ffffd0c7babe0110 ddi_intr_handle_t mi_ddi_handle = 0
            ffffd0c7babe0118 boolean_t mi_ddi_shared = 0 (0)
        }
        ffffd0c7babe0120 mac_add_mac_addr_t mgi_addmac = aggr_addmac
        ffffd0c7babe0128 mac_rem_mac_addr_t mgi_remmac = aggr_remmac
        ffffd0c7babe0130 mac_add_vlan_filter_t mgi_addvlan = aggr_addvlan
        ffffd0c7babe0138 mac_rem_vlan_filter_t mgi_remvlan = aggr_remvlan
    }
}

aggr0 SRSes' intr bytes and state:

> ffffd0c5939d3d58::print mac_client_impl_t mci_flent | ::print flow_entry_t fe_rx_srs | ::array void* 11 | ::print void* | ::printf "%16x\t%x\n" mac_soft_ring_set_t srs_rx.sr_stat.mrs_intrbytes srs_state
         2a87622        24000002
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e
               0        2400000e

admin1 SRSes' intr bytes and state:

> ffffd0c5939d1478::print mac_client_impl_t mci_flent | ::print flow_entry_t fe_rx_srs | ::array void* 11 | ::print void* | ::printf "%16x\t%x\n" mac_soft_ring_set_t srs_rx.sr_stat.mrs_intrbytes srs_state
          33b029        24000002
       6e63356d5        2400000e
       ba586f87a        2400000e
       71b123047        2400000e
       5713e601f        2400000e
      23314db060        2400000e
       fd423a50e        2400000e
      27a5d7a646        2400081e
       d5556b8ee        2400000e
            2448        2400000e
       87babe76b        2400007e
       607168f21        2400000e
       8baa0cade        2400004e
       ebf14734d        2400000e
       b8eeb9659        2400000e
      2a7662f67c        2400000e
      23634602d4        2400000e

z1_net0(a VNIC for a zone) SRSes' intr bytes and state:

> ffffd0c5939d0008::print mac_client_impl_t mci_flent | ::print flow_entry_t fe_rx_srs | ::array void* 11 | ::print void* | ::printf "%16x\t%x\n" mac_soft_ring_set_t srs_rx.sr_stat.mrs_intrbytes srs_state
          342672        24000002
          1c3c85        2400000e
          13f9b1        2400000e
          21fda0        2400000e
          13396e        2400000e
           11fd0        2400000e
           2cddd        2400000e
            c3fd        2400000e
           1780a        2400000e
          14c975        2400000e
           da4e0        2400000e
          11669f        2400000e
          150fa5        2400000e
            81f6        2400000e
           18cb5        2400000e
           1bfb2        2400000e
            f3a0        2400000e
rmustacc commented 5 years ago

When the kernel has elected to use the software lanes, then we always stand a chance of hitting the high watermark that it has for backpressure, which results in the rxsdrops. I think what we need to understand is why the kernel opted to do so.

youzhongyang commented 5 years ago

What does ring gen_num of 1 mean? I can see the gen_num of rings for aggr0 is 1, which appears to be abnormal.

> ::walk mac_client_impl_cache | ::printf "%p %-16s %4d %20p\n" mac_client_impl_t . mci_name mci_flent->fe_rx_srs_cnt mci_flent->fe_rx_ring_group
ffffd0c8cbbdb8f8 z6_net0            17     ffffd0c7ba9e95a0
ffffd0c8cbbdcd68 z4_net0            17     ffffd0c7ba9e9500
ffffd0c8cbbde1d8 z5_net0            17     ffffd0c7ba9e9460
ffffd0c8cbbdf648 z3_net0            17     ffffd0c7ba9e93c0
ffffd0c8cbbe0ab8 z2_net0            17     ffffd0c7ba9e9320
ffffd0c593966008 z1_net0            17     ffffd0c7ba9e9280
ffffd0c593967478 admin1             17     ffffd0c7ba9e91e0
ffffd0c5939688e8 admin0             17     ffffd0c7ba9e9140
ffffd0c593969d58 aggr0              17     ffffd0c7ba9e90a0
ffffd0c59396b1c8 aggr0-ixgbe3        0     ffffd0c5734d0000
ffffd0c59396c638 aggr0-ixgbe2        0     ffffd0c571420000
ffffd0c59396daa8 ixgbe0              0                    0
ffffd0c59396f010 aggr0-ixgbe1        0     ffffd0c572e5e000
ffffd0c593970480 aggr0-ixgbe0        0     ffffd0c571637000
ffffd0c5939718f0 ixgbe5              5     ffffd0c594f440a0
ffffd0c593972d60 ixgbe4              5     ffffd0c594f4a0a0
ffffd0c5939741d0 ixgbe1              0                    0
ffffd0c593975640 ixgbe3              0                    0
ffffd0c593976ab0 ixgbe2              0                    0

aggr0:

> ffffd0c7ba9e90a0::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_gen_num
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1
mr_gen_num = 0x1

> ffffd0c7ba9e90a0::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_info.mri_driver | ::print aggr_pseudo_rx_ring_t arr_hw_rh | ::print mac_ring_t mr_info.mri_driver | ::printf "%-16d %d\n" ixgbe_rx_ring_t stat_ipackets ring_gen_num
0                1
0                1
0                1
0                1
0                1
0                1
0                1
0                1
0                1
0                1
0                1
0                1
0                1
0                1
0                1
0                1

admin0:

> ffffd0c7ba9e9140::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_gen_num
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0

> ffffd0c7ba9e9140::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_info.mri_driver | ::print aggr_pseudo_rx_ring_t arr_hw_rh | ::print mac_ring_t mr_info.mri_driver | ::printf "%-16d %d\n" ixgbe_rx_ring_t stat_ipackets ring_gen_num
355395455        0
227104173        0
237647385        0
199660433        0
231530313        0
180110079        0
202886421        0
227397803        0
181857652        0
218078952        0
233618762        0
207876151        0
272891844        0
416724862        0
216742891        0
212029696        0

z1_net0:

> ffffd0c7ba9e9280::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_gen_num
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0
mr_gen_num = 0

> ffffd0c7ba9e9280::print mac_group_t mrg_rings | ::list mac_ring_t mr_next | ::print mac_ring_t mr_info.mri_driver | ::print aggr_pseudo_rx_ring_t arr_hw_rh | ::print mac_ring_t mr_info.mri_driver | ::printf "%-16d %d\n" ixgbe_rx_ring_t stat_ipackets ring_gen_num
4753997          0
5308389          0
3458119          0
4483390          0
5750086          0
3497963          0
3365930          0
5886896          0
538679           0
6597434          0
2291767          0
5950493          0
2378832          0
6698355          0
3419547          0
2184117          0
youzhongyang commented 5 years ago

Figured out why aggr0 does not use hw lanes - when aggr_addmac() tries to register mac address for aggr0 interface, it does nothing and immediately returns.

Here are more info:

Script to stop aggr:

# cat /var/tmp/stop-aggr
#!/bin/bash
ifconfig aggr0 down
ifconfig aggr0 unplumb
dladm delete-aggr aggr0

Script to start aggr:

# cat /var/tmp/start-aggr
#!/bin/bash
dladm create-aggr -L active -l ixgbe0 -l ixgbe1 aggr0
ifconfig aggr0 plumb
ifconfig aggr0 inet 172.31.16.19 netmask 255.255.255.0 up
route add default 172.31.16.1

dtrace script:

# cat /var/tmp/dtrace-unicast_add
#!/bin/bash
dtrace -n 'aggr_addmac:entry {stack();printf("aggr_pseudo_rx_group_t * = %p\n", args[0]); printf("arg_index = %d\n", ((aggr_pseudo_rx_group_t*)args[0])->arg_index); tracemem(args[1], 6); tracemem( ((aggr_pseudo_rx_group_t*)args[0])->arg_grp->lg_addr, 6 ); }' -n 'i_mac_unicast_add:entry {stack(); ch = (mac_client_impl_t*)args[0]; printf("mac_client_impl_t %p %s mac_impl_t %p mi_name %s mac_addr %p\n", ch, ch->mci_name, ch->mci_mip, ch->mci_mip->mi_name, args[1]); tracemem(ch->mci_mip->mi_addr, 6); }' -o /var/tmp/out-aggr_addmac-unicast_add

Steps to get dtrace output:

# /var/tmp/stop-aggr
# /var/tmp/dtrace-unicast_add &
# /var/tmp/start-aggr

dtrace output:

# cat /var/tmp/out-aggr_addmac-unicast_add
CPU     ID                    FUNCTION:NAME
  0  28335          i_mac_unicast_add:entry
              mac`mac_unicast_add+0x70
              aggr`aggr_port_create+0x205
              aggr`aggr_grp_add_port+0x8d
              aggr`aggr_grp_create+0x2b1
              aggr`aggr_ioc_create+0x132
              dld`drv_ioctl+0x1e4
              genunix`cdev_ioctl+0x39
              specfs`spec_ioctl+0x60
              genunix`fop_ioctl+0x55
              genunix`ioctl+0x9b
              unix`_sys_sysenter_post_swapgs+0x153
mac_client_impl_t ffffd0c600884018 aggr0-ixgbe0 mac_impl_t ffffd0c5932a1130 mi_name ixgbe0 mac_addr 0

             0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0123456789abcdef
         0: 90 e2 ba b3 55 9c                                ....U.

  0  28335          i_mac_unicast_add:entry
              mac`mac_unicast_add+0x70
              aggr`aggr_port_create+0x205
              aggr`aggr_grp_add_port+0x8d
              aggr`aggr_grp_create+0x2b1
              aggr`aggr_ioc_create+0x132
              dld`drv_ioctl+0x1e4
              genunix`cdev_ioctl+0x39
              specfs`spec_ioctl+0x60
              genunix`fop_ioctl+0x55
              genunix`ioctl+0x9b
              unix`_sys_sysenter_post_swapgs+0x153
mac_client_impl_t ffffd0c600885488 aggr0-ixgbe1 mac_impl_t ffffd0c593297008 mi_name ixgbe1 mac_addr 0

             0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0123456789abcdef
         0: 90 e2 ba b3 55 9d                                ....U.

 19  28335          i_mac_unicast_add:entry
              mac`mac_unicast_add+0x70
              dls`dls_mac_active_set+0x5e
              dls`dls_active_set+0x75
              dld`proto_bind_req+0xc1
              dld`dld_proto+0x255
              dld`dld_wput_nondata_task+0x80
              genunix`taskq_d_thread+0xb7
              unix`thread_start+0x8
mac_client_impl_t ffffd0c6232c2ac0 aggr0 mac_impl_t ffffd0c6232da6f0 mi_name aggr1032 mac_addr 0

             0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0123456789abcdef
         0: 90 e2 ba b3 55 9c                                ....U.

 19  72304                aggr_addmac:entry
              mac`mac_group_addmac+0x8c
              mac`mac_add_macaddr_vlan+0x19b
              mac`mac_datapath_setup+0x570
              mac`mac_client_datapath_setup+0x22e
              mac`i_mac_unicast_add+0x615
              mac`mac_unicast_add+0x70
              dls`dls_mac_active_set+0x5e
              dls`dls_active_set+0x75
              dld`proto_bind_req+0xc1
              dld`dld_proto+0x255
              dld`dld_wput_nondata_task+0x80
              genunix`taskq_d_thread+0xb7
              unix`thread_start+0x8
aggr_pseudo_rx_group_t * = ffffd0c5a418ac98
arg_index = 1

             0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0123456789abcdef
         0: 90 e2 ba b3 55 9c                                ....U.

             0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0123456789abcdef
         0: 90 e2 ba b3 55 9c                                ....U.

 37  28335          i_mac_unicast_add:entry
              mac`mac_unicast_add+0x70
              dls`dls_mac_active_set+0x5e
              dls`dls_active_set+0x75
              dld`proto_bind_req+0xc1
              dld`dld_proto+0x255
              dld`dld_wput_nondata_task+0x80
              genunix`taskq_d_thread+0xb7
              unix`thread_start+0x8
mac_client_impl_t ffffd0c6232c2ac0 aggr0 mac_impl_t ffffd0c6232da6f0 mi_name aggr1032 mac_addr 0

             0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0123456789abcdef
         0: 90 e2 ba b3 55 9c                                ....U.

 37  72304                aggr_addmac:entry
              mac`mac_group_addmac+0x8c
              mac`mac_add_macaddr_vlan+0x19b
              mac`mac_datapath_setup+0x570
              mac`mac_client_datapath_setup+0x22e
              mac`i_mac_unicast_add+0x615
              mac`mac_unicast_add+0x70
              dls`dls_mac_active_set+0x5e
              dls`dls_active_set+0x75
              dld`proto_bind_req+0xc1
              dld`dld_proto+0x255
              dld`dld_wput_nondata_task+0x80
              genunix`taskq_d_thread+0xb7
              unix`thread_start+0x8
aggr_pseudo_rx_group_t * = ffffd0c5a418ac98
arg_index = 1

             0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0123456789abcdef
         0: 90 e2 ba b3 55 9c                                ....U.

             0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f  0123456789abcdef
         0: 90 e2 ba b3 55 9c                                ....U.

I have no idea why the exactly same thing (i_mac_unicast_add() with same input args) was run twice.

So I attempted the following fix, it works:

http://src.illumos.org/source/xref/illumos-gate/usr/src/uts/common/io/aggr/aggr_grp.c#2351

static int
aggr_addmac(void *arg, const uint8_t *mac_addr)
{
    aggr_pseudo_rx_group_t  *rx_group = (aggr_pseudo_rx_group_t *)arg;
    aggr_unicst_addr_t  *addr, **pprev;
    aggr_grp_t      *grp = rx_group->arg_grp;
    aggr_port_t     *port, *p;
    mac_perim_handle_t  mph;
    int         err = 0;

    mac_perim_enter_by_mh(grp->lg_mh, &mph);

    /* remove the following if statement */
    /*
    if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
        mac_perim_exit(mph);
        return (0);
    }
    */

    /*
     * Insert this mac address into the list of mac addresses owned by
     * the aggregation pseudo group.
     */
    pprev = &rx_group->arg_macaddr;
    while ((addr = *pprev) != NULL) {
        if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
            mac_perim_exit(mph);
            return (EEXIST);   ===> return (0);
        }
        pprev = &addr->aua_next;
    }
    ...
}

I am pretty sure it is hackish, and doesn't seem to be the right way to fix the issue.

Please advise.

rzezeski commented 5 years ago

@youzhongyang That check is for the aggr's MAC address. It should have already been set on all ports during creation. You don't want to set it again on a different group because it's already being filtered to another group. I forget why this case is being handled slightly differently than the EEXIST case. I'd need more time to boot my brain back into this code. I'm just a little too busy with some other stuff at the moment.

youzhongyang commented 5 years ago

@rzezeski You're right, the mac address of the aggr interface is filtered on rx group 0 for all ports, but mac client aggr0 tries to use rx group 1 for hw lanes, if we don't register the mac address for rx group 1 by ixgbe_set_rar(), then there will no packets coming in through rx group 1.

This is what unicst_addr[1] looks like without the attempted fix, as you can see, it is empty:

> ffffd0c4f9ddc2b0::print struct dev_info devi_driver_data | ::print ixgbe_t unicst_addr[0] unicst_addr[1]
unicst_addr[0] = {
    unicst_addr[0].reg = {
        high = 0xe2900001
        low = 0x9c55b3ba
    }
    unicst_addr[0].mac = {
        set = 0x1
        group_index = 0
        addr = [ 0x90, 0xe2, 0xba, 0xb3, 0x55, 0x9c ]
    }
}
unicst_addr[1] = {
    unicst_addr[1].reg = {
        high = 0
        low = 0
    }
    unicst_addr[1].mac = {
        set = 0
        group_index = 0
        addr = [ 0, 0, 0, 0, 0, 0 ]
    }
}

With the fix, it went through rar registration and has data in unicst_addr[1]:

> ffffd0c4f9bd3010::print struct dev_info devi_driver_data | ::print -at ixgbe_t unicst_addr[0] unicst_addr[1]
ffffd0c573136bec ixgbe_ether_addr_t unicst_addr[0] = {
    ffffd0c573136bec struct  unicst_addr[0].reg = {
        ffffd0c573136bec uint32_t high = 0xe2900001
        ffffd0c573136bf0 uint32_t low = 0xd806ebba
    }
    ffffd0c573136bec struct  unicst_addr[0].mac = {
        ffffd0c573136bec uint8_t set = 0x1
        ffffd0c573136bed uint8_t group_index = 0
        ffffd0c573136bee uint8_t [6] addr = [ 0x90, 0xe2, 0xba, 0xeb, 0x6, 0xd8 ]
    }
}
ffffd0c573136bf4 ixgbe_ether_addr_t unicst_addr[1] = {
    ffffd0c573136bf4 struct  unicst_addr[1].reg = {
        ffffd0c573136bf4 uint32_t high = 0xe2900101
        ffffd0c573136bf8 uint32_t low = 0xd806ebba
    }
    ffffd0c573136bf4 struct  unicst_addr[1].mac = {
        ffffd0c573136bf4 uint8_t set = 0x1
        ffffd0c573136bf5 uint8_t group_index = 0x1
        ffffd0c573136bf6 uint8_t [6] addr = [ 0x90, 0xe2, 0xba, 0xeb, 0x6, 0xd8 ]
    }
}

I've been doing stress testing and our functional testing since last night, so far so good.

youzhongyang commented 5 years ago

The proposed 'fix' works for ixgbe, but not for i40e (with multi-group support). On aggregated i40e interface, once VSI 0 and VSI 1 are filtered with the same aggr MAC address, incoming packets go through VSI 0 and VSI 1 'equally', thus causing unexpected behavior.

youzhongyang commented 5 years ago

A better 'fix'. I have tested on ixgbe cards and i40e cards, no issue found.

diff --git a/usr/src/uts/common/io/aggr/aggr_grp.c b/usr/src/uts/common/io/aggr/aggr_grp.c
index 4784095..9944541 100644
--- a/usr/src/uts/common/io/aggr/aggr_grp.c
+++ b/usr/src/uts/common/io/aggr/aggr_grp.c
@@ -2529,11 +2529,6 @@ aggr_addmac(void *arg, const uint8_t *mac_addr)

    mac_perim_enter_by_mh(grp->lg_mh, &mph);

-   if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
-       mac_perim_exit(mph);
-       return (0);
-   }
-
    /*
     * Insert this mac address into the list of mac addresses owned by
     * the aggregation pseudo group.
@@ -2579,11 +2574,6 @@ aggr_remmac(void *arg, const uint8_t *mac_addr)

    mac_perim_enter_by_mh(grp->lg_mh, &mph);

-   if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
-       mac_perim_exit(mph);
-       return (0);
-   }
-
    /*
     * Insert this mac address into the list of mac addresses owned by
     * the aggregation pseudo group.
diff --git a/usr/src/uts/common/io/i40e/i40e_gld.c b/usr/src/uts/common/io/i40e/i40e_gld.c
index e1feaea..d340d38 100644
--- a/usr/src/uts/common/io/i40e/i40e_gld.c
+++ b/usr/src/uts/common/io/i40e/i40e_gld.c
@@ -133,8 +133,7 @@ i40e_group_add_mac(void *arg, const uint8_t *mac_addr)
    for (i = 0; i < i40e->i40e_resources.ifr_nmacfilt_used; i++) {
        if (bcmp(mac_addr, i40e->i40e_uaddrs[i].iua_mac,
            ETHERADDRL) == 0) {
-           ret = EEXIST;
-           goto done;
+           ret = 0;
        }
    }

diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c
index 6eea8b0..9ce06df 100644
--- a/usr/src/uts/common/io/mac/mac_datapath_setup.c
+++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c
@@ -3070,6 +3070,18 @@ mac_datapath_setup(mac_client_impl_t *mcip, flow_entry_t *flent,
        if ((err = mac_flow_add(mip->mi_flow_tab, flent)) != 0)
            goto setup_failed;

+       /* Remove mac address filtering on default rx group,
+        * so that it is only for receving multicast and broadcast
+        * packets. This will allow hardware classification on the
+        * non-default group "rgroup", especially when it is the aggr
+        * interface itself.
+        */
+       if (rgroup != NULL && rgroup != default_rgroup &&
+           default_rgroup->mrg_state == MAC_GROUP_STATE_RESERVED &&
+           (mip->mi_state_flags & MIS_IS_AGGR)) {
+           mac_group_remmac(default_rgroup, mip->mi_info.mi_unicst_addr);
+       }
+
        /* Program hardware classification. */
        vid = i_mac_flow_vid(flent);
        use_hw = (mcip->mci_state_flags & MCIS_UNICAST_HW) != 0;