openwrt / mt76

mac80211 driver for MediaTek MT76x0e, MT76x2e, MT7603, MT7615, MT7628 and MT7688
738 stars 343 forks source link

MT798X crash in mtk_wed_setup_tc_block_cb when switching channels multiple times #873

Open zhaojh329 opened 4 months ago

zhaojh329 commented 4 months ago

OpenWrt commit 66019e456f60f5ef71a9af0d73502e20b874d114

root@GL-MT6000:~# [ 2692.420930] mt798x-wmac 18000000.wifi wlan1: left allmulticast mode
[ 2692.427219] mt798x-wmac 18000000.wifi wlan1: left promiscuous mode
[ 2692.433468] br-lan: port 7(wlan1) entered disabled state
[ 2692.951367] Unable to handle kernel paging request at virtual address 48eb49654894f2e2
[ 2692.959280] Mem abort info:
[ 2692.962108]   ESR = 0x0000000096000004
[ 2692.965840]   EC = 0x25: DABT (current EL), IL = 32 bits
[ 2692.971133]   SET = 0, FnV = 0
[ 2692.974179]   EA = 0, S1PTW = 0
[ 2692.977304]   FSC = 0x04: level 0 translation fault
[ 2692.982168] Data abort info:
[ 2692.985033]   ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000
[ 2692.990497]   CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[ 2692.995534]   GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[ 2693.000826] [48eb49654894f2e2] address between user and kernel address ranges
[ 2693.007942] Internal error: Oops: 0000000096000004 [#1] SMP
[ 2693.013495] Modules linked in: pppoe ppp_async option wireguard usb_wwan rndis_host qmi_wwan pppox ppp_generic nft_fib_inet nf_flow_table_inet mt7915e(O) mt76_connac_lib(O) mt76(O) mac80211(O) libchacha20poly1305 ipt_REJECT huawei_cdc_ncm chacha_neon cfg80211(O) cdc_ncm cdc_ether xt_time xt_tcpudp xt_state xt_recent xt_quota xt_pkttype xt_owner xt_nat xt_multiport xt_mark xt_mac xt_limit xt_helper xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_cgroup xt_addrtype xt_TCPMSS xt_REDIRECT xt_MASQUERADE xt_LOG usbserial usbnet ts_fsm ts_bm slhc poly1305_neon nft_reject_ipv6 nft_reject_ipv4 nft_reject_inet nft_reject nft_redir nft_quota nft_numgen nft_nat nft_masq nft_log nft_limit nft_hash nft_flow_offload nft_fib_ipv6 nft_fib_ipv4 nft_fib nft_ct nft_compat nft_chain_nat nf_tables nf_reject_ipv4 nf_nat_tftp nf_nat_snmp_basic nf_nat_sip nf_nat_pptp nf_nat_irc nf_nat_h323 nf_nat_amanda nf_log_syslog nf_flow_table nf_conntrack_tftp nf_conntrack_snmp nf_conntrack_sip nf_conntrack_pptp nf_conntrack_netlink
[ 2693.013666]  nf_conntrack_irc nf_conntrack_h323 nf_conntrack_broadcast ts_kmp nf_conntrack_amanda nf_conncount mdio_netlink(O) libcurve25519_generic libcrc32c libchacha iptable_nat iptable_mangle iptable_filter ipheth ip_tables compat(O) cls_flower cdc_wdm cdc_acm br_netfilter asn1_decoder arptable_filter arpt_mangle arp_tables act_vlan crypto_safexcel fuse cls_bpf act_bpf sch_tbf sch_ingress sch_htb sch_hfsc em_u32 cls_u32 cls_route cls_matchall cls_fw cls_flow cls_basic act_skbedit act_mirred act_gact xt_set ip_set_list_set ip_set_hash_netportnet ip_set_hash_netport ip_set_hash_netnet ip_set_hash_netiface ip_set_hash_net ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ipmac ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6table_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip6t_NPT ip6table_mangle ip6table_filter ip6_tables ip6t_REJECT x_tables nf_reject_ipv6 ifb ip6_udp_tunnel udp_tunnel tun ntfs nls_utf8
[ 2693.103336]  nls_iso8859_1 nls_cp437 sha512_arm64 sha1_ce sha1_generic seqiv md5 geniv des_generic libdes authencesn authenc uas usb_storage leds_gpio xhci_plat_hcd xhci_pci xhci_mtk_hcd xhci_hcd uhci_hcd ohci_platform ohci_hcd fsl_mph_dr_of ehci_platform ehci_fsl kmwan(O) ehci_hcd gpio_button_hotplug(O) gl_sdk4_tertf(O) gl_sdk4_black_white_list(O) vfat fat exfat usbcore usb_common aquantia mii gl_sdk4_hw_info(O)
[ 2693.229763] CPU: 2 PID: 2813 Comm: hostapd Tainted: G           O       6.6.27 #0
[ 2693.237223] Hardware name: GL.iNet GL-MT6000 (DT)
[ 2693.241908] pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 2693.248848] pc : mtk_wed_setup_tc_block_cb+0x4/0x38
[ 2693.253716] lr : tc_setup_cb_reoffload+0x30/0x134
[ 2693.258407] sp : ffffffc082293360
[ 2693.261705] x29: ffffffc082293360 x28: ffffffc080b58000 x27: 0000000000000000
[ 2693.268821] x26: ffffff800d593800 x25: 0000000000000000 x24: 0000000000000000
[ 2693.275936] x23: ffffff800a1882a0 x22: ffffff800d593800 x21: ffffff800a853c2c
[ 2693.283051] x20: 0000000000000000 x19: ffffff800d498800 x18: ffffffc0816d3cb0
[ 2693.290167] x17: 0000000000000000 x16: ffffffc080927090 x15: 0000000000000000
[ 2693.297283] x14: 0000000000000000 x13: 0000000000000030 x12: 0000000000000002
[ 2693.304398] x11: 00000000000000a8 x10: 00000000000008a0 x9 : ffffffc0822935f0
[ 2693.311514] x8 : 0000000000000001 x7 : ffffff800a853c2c x6 : ffffff800a1882a0
[ 2693.318629] x5 : ffffffc082293408 x4 : 0000000000000005 x3 : 48eb49654894f1f2
[ 2693.325745] x2 : ffffff8001289000 x1 : ffffffc082293408 x0 : 0000000000000005
[ 2693.332861] Call trace:
[ 2693.335294]  mtk_wed_setup_tc_block_cb+0x4/0x38
[ 2693.339808]  0xffffffc0790d84bc
[ 2693.342996]  tcf_block_playback_offloads+0x70/0x1e8
[ 2693.347856]  tcf_block_unbind+0x6c/0xc8
[ 2693.351675]  tcf_block_setup+0x38/0x1e8
[ 2693.355494]  tcf_block_offload_cmd.isra.0+0xdc/0x128
[ 2693.360440]  tcf_block_offload_unbind+0x50/0x8c
[ 2693.364952]  __tcf_block_put+0x88/0x178
[ 2693.368771]  tcf_block_put_ext+0x4c/0x60
[ 2693.372677]  0xffffffc0790b79ac
[ 2693.375821]  __qdisc_destroy+0x40/0xa0
[ 2693.379554]  qdisc_put+0x54/0x6c
[ 2693.382767]  dev_shutdown+0x90/0x108
[ 2693.386327]  unregister_netdevice_many_notify+0x2ec/0x77c
[ 2693.391708]  unregister_netdevice_queue+0xa4/0xb0
[ 2693.396395]  cfg80211_shutdown_all_interfaces+0x32c/0x37c [cfg80211]
[ 2693.402745]  cfg80211_unregister_wdev+0x10/0x18 [cfg80211]
[ 2693.408223]  ieee80211_if_remove+0x6c/0x110 [mac80211]
[ 2693.413374]  ieee80211_channel_switch_disconnect+0x1cfc/0x1d08 [mac80211]
[ 2693.420161]  cfg80211_remove_virtual_intf+0x5c/0x68 [cfg80211]
[ 2693.425987]  cfg80211_check_station_change+0x3194/0x32ac [cfg80211]
[ 2693.432244]  genl_family_rcv_msg_doit+0xa8/0x108
[ 2693.436847]  genl_rcv_msg+0x1b0/0x244
[ 2693.440495]  netlink_rcv_skb+0x58/0x120
[ 2693.444316]  genl_rcv+0x34/0x48
[ 2693.447443]  netlink_unicast+0x1e0/0x2c8
[ 2693.451350]  netlink_sendmsg+0x198/0x3c8
[ 2693.455256]  ____sys_sendmsg+0x1bc/0x26c
[ 2693.459165]  ___sys_sendmsg+0x78/0xb8
[ 2693.462811]  __sys_sendmsg+0x44/0x98
[ 2693.466372]  __arm64_sys_sendmsg+0x20/0x28
[ 2693.470452]  invoke_syscall.constprop.0+0x4c/0xe0
[ 2693.475141]  do_el0_svc+0x3c/0xb8
[ 2693.478441]  el0_svc+0x18/0x4c
[ 2693.481484]  el0t_64_sync_handler+0x118/0x124
[ 2693.485825]  el0t_64_sync+0x150/0x154
[ 2693.489475] Code: b9401fe0 a8c27bfd d65f03c0 a9400c42 (f9407863) 
[ 2693.495548] ---[ end trace 0000000000000000 ]---
[ 2693.506534] pstore: backend (ramoops) writing error (-28)
[ 2693.511918] Kernel panic - not syncing: Oops: Fatal exception
[ 2693.517643] SMP: stopping secondary CPUs
[ 2693.521551] Kernel Offset: disabled
[ 2693.525022] CPU features: 0x0,00000000,00000000,1000400b
[ 2693.530316] Memory Limit: none
[ 2693.539576] Rebooting in 3 seconds..
zhaojh329 commented 4 months ago

It doesn't crash anymore after removing bridger.

everything411 commented 1 month ago

Same problem. Furthermore, for multiple-SSID setups, this issue always occurs each time Wi-Fi settings are applied in LuCI or the router is rebooted. A simple `wifi down` over SSH also crashes the router; it is 100% reproducible. config and log.zip

everything411 commented 1 month ago

crash at mtk_wed_setup_tc_block_cb+0x4/0x38

/*
 * Report whether @dev has the NETIF_F_HW_TC bit set in dev->features.
 * @dev is dereferenced unconditionally, so it must be a valid pointer.
 */
static inline bool tc_can_offload(const struct net_device *dev)
{
    return dev->features & NETIF_F_HW_TC;
}
/*
 * Flow block callback: hands flower classifier requests to
 * mtk_flow_offload_cmd() for the WED hardware referenced by @cb_priv.
 *
 * @type:      tc setup type; anything other than TC_SETUP_CLSFLOWER is rejected.
 * @type_data: passed on as a struct flow_cls_offload.
 * @cb_priv:   treated as a struct mtk_wed_flow_block_priv and dereferenced
 *             without any NULL or validity check.
 *
 * Returns -EOPNOTSUPP when priv->dev cannot offload or @type is not
 * TC_SETUP_CLSFLOWER; otherwise returns the result of mtk_flow_offload_cmd().
 */
static int
mtk_wed_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
{
    struct mtk_wed_flow_block_priv *priv = cb_priv;
    struct flow_cls_offload *cls = type_data;
    /* priv is read here, before any check on it. */
    struct mtk_wed_hw *hw = priv->hw;

    /* Reads priv->dev->features; a stale priv or priv->dev faults here. */
    if (!tc_can_offload(priv->dev))
        return -EOPNOTSUPP;

    if (type != TC_SETUP_CLSFLOWER)
        return -EOPNOTSUPP;

    return mtk_flow_offload_cmd(hw->eth, cls, hw->index);
}

the assembly code is

.text:0000000000000040 ; int __fastcall mtk_wed_setup_tc_block_cb(tc_setup_type type, void *type_data, void *cb_priv)
.text:0000000000000040 mtk_wed_setup_tc_block_cb               ; DATA XREF: mtk_wed_setup_tc+78↓o
.text:0000000000000040                                         ; mtk_wed_setup_tc+C0↓o
.text:0000000000000040
.text:0000000000000040 var_10          = -0x10
.text:0000000000000040
.text:0000000000000040 type = X0                               ; tc_setup_type
.text:0000000000000040 type_data = X1                          ; void *
.text:0000000000000040 hw = X2                                 ; mtk_wed_hw *
.text:0000000000000040                 LDP             hw, X3, [hw] ; X2 holds cb_priv on entry; loads X2 = priv->hw, X3 = priv->dev
.text:0000000000000044                 LDR             X3, [X3,#0xF0] ; X3 = priv->dev->features; offset +0x4 is the pc reported in the oops
.text:0000000000000048                 TST             X3, #0x2000000000000 ; test the NETIF_F_HW_TC bit (tc_can_offload)
.text:000000000000004C                 CCMP            W0, #3, #0, NE ; if the bit is set, compare type with 3 (TC_SETUP_CLSFLOWER per the C source); otherwise force "not equal"
.text:0000000000000050                 B.NE            loc_70 ; either check failed -> return -EOPNOTSUPP
.text:0000000000000054                 STP             X29, X30, [SP,#var_10]!
.text:0000000000000058                 MOV             X29, SP
.text:000000000000005C                 LDR             type, [hw,#0x10] ; X0 = hw->eth (first argument)
.text:0000000000000060                 LDR             W2, [hw,#0x80] ; W2 = hw->index (third argument); X1 still holds type_data
.text:0000000000000064                 BL              mtk_flow_offload_cmd
.text:0000000000000068                 LDP             X29, X30, [SP+0x10+var_10],#0x10
.text:000000000000006C                 RET
.text:0000000000000070 ; ---------------------------------------------------------------------------
.text:0000000000000070
.text:0000000000000070 loc_70                                  ; CODE XREF: mtk_wed_setup_tc_block_cb+10↑j
.text:0000000000000070                 MOV             W0, #0xFFFFFFA1 ; 0xFFFFFFA1 = -95 = -EOPNOTSUPP
.text:0000000000000074                 RET
.text:0000000000000074 ; End of function mtk_wed_setup_tc_block_cb

The crash is at `LDR X3, [X3,#0xF0]`, which means that priv->dev is a bad pointer and the page fault occurs when accessing priv->dev->features.

everything411 commented 1 month ago

I'm not familiar with this code and don't know why priv->dev could contain garbage data. Is there anyone who can help with that?

rx78gp01 commented 1 month ago

You can try patching mtk_wed.c:

@@ -2686,7 +2686,7 @@ mtk_wed_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_pri
    struct flow_cls_offload *cls = type_data;
    struct mtk_wed_hw *hw = priv->hw;

-   if (!tc_can_offload(priv->dev))
+   if (!priv || !tc_can_offload(priv->dev))
        return -EOPNOTSUPP;

    if (type != TC_SETUP_CLSFLOWER)
@@ -2747,6 +2747,7 @@ mtk_wed_setup_tc_block(struct mtk_wed_hw *hw, struct net_device *dev,
            flow_block_cb_remove(block_cb, f);
            list_del(&block_cb->driver_list);
            kfree(block_cb->cb_priv);
+           block_cb->cb_priv = NULL;
        }
        return 0;
    default:
everything411 commented 1 month ago

@rx78gp01 thanks a lot and with this patch my multiple ssid setup works again now! could you please submit this patch to upstream?

everything411 commented 1 month ago

cc @dangowrt @nbd168

everything411 commented 1 month ago

the right patch should be

--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -2686,7 +2686,12 @@ mtk_wed_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_pri
        struct flow_cls_offload *cls = type_data;
-       struct mtk_wed_hw *hw = priv->hw;
+       struct mtk_wed_hw *hw = NULL;
+
+       if(priv)
+               hw = priv->hw;
+       else
+               return -EOPNOTSUPP;

        if (!tc_can_offload(priv->dev))
                return -EOPNOTSUPP;

        if (type != TC_SETUP_CLSFLOWER)
@@ -2747,6 +2747,7 @@ mtk_wed_setup_tc_block(struct mtk_wed_hw *hw, struct net_device *dev,
                        flow_block_cb_remove(block_cb, f);
                        list_del(&block_cb->driver_list);
                        kfree(block_cb->cb_priv);
+                       block_cb->cb_priv = NULL;
                }
                return 0;
        default:

rx78gp01's original version causes a NULL pointer dereference at priv->hw, because priv->hw is still read before the new !priv check.

devianceluka commented 1 month ago

the right patch should be

--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -2686,7 +2686,12 @@ mtk_wed_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_pri
        struct flow_cls_offload *cls = type_data;
-       struct mtk_wed_hw *hw = priv->hw;
+       struct mtk_wed_hw *hw = NULL;
+
+       if(priv)
+               hw = priv->hw;
+       else
+               return -EOPNOTSUPP;

        if (!tc_can_offload(priv->dev))
                return -EOPNOTSUPP;

        if (type != TC_SETUP_CLSFLOWER)
@@ -2747,6 +2747,7 @@ mtk_wed_setup_tc_block(struct mtk_wed_hw *hw, struct net_device *dev,
                        flow_block_cb_remove(block_cb, f);
                        list_del(&block_cb->driver_list);
                        kfree(block_cb->cb_priv);
+                       block_cb->cb_priv = NULL;
                }
                return 0;
        default:

rx78gp01's original version causes a NULL pointer dereference at priv->hw, because priv->hw is still read before the new !priv check.

I can confirm it doesn't crash anymore with multiple SSIDs with this patch on a Redmi AX6000. Thanks @everything411!