openwrt / mt76

mac80211 driver for MediaTek MT76x0e, MT76x2e, MT7603, MT7615, MT7628 and MT7688
737 stars 343 forks source link

[mt7613/mt7615] client connected but ping fail #518

Open ptpt52 opened 3 years ago

ptpt52 commented 3 years ago

On the MT7613 chip, `iwinfo wlan1 assoclist` shows the STA as connected, but ping fails and there is no network connectivity. `tcpdump -i wlan1` shows the STA client (my PC) sending ARP requests and the AP answering with ARP replies, which means my PC never receives the ARP reply packets. In other words, the AP thinks it is sending them out, but is the transmission actually failing? In this situation the driver is also eating memory: memory is quickly exhausted and the OOM killer kicks in.

But if I run the following:

ifconfig wlan1 down
ifconfig wlan1 up

the lost memory comes back again.

ryderlee1110 commented 3 years ago

7615 or 7613?

ptpt52 commented 3 years ago

it is 7613

ptpt52 commented 3 years ago
453 static int
454 mt76_txq_schedule_list(struct mt76_phy *phy, enum mt76_txq_id qid)
455 {
456         struct mt76_queue *q = phy->q_tx[qid];
457         struct mt76_dev *dev = phy->dev;
458         struct ieee80211_txq *txq;
459         struct mt76_txq *mtxq;
460         struct mt76_wcid *wcid;
461         int ret = 0;
462 
463         while (1) { 
464                 int n_frames = 0;
465                 
466                 if (test_bit(MT76_STATE_PM, &phy->state) ||
467                     test_bit(MT76_RESET, &phy->state))
468                         return -EBUSY;
469                 
470                 if (dev->queue_ops->tx_cleanup &&
471                     q->queued + 2 * MT_TXQ_FREE_THR >= q->ndesc) {
472                         dev->queue_ops->tx_cleanup(dev, q, false);
473                 }
474                 
475                 txq = ieee80211_next_txq(phy->hw, qid);
476                 if (!txq)
477                         break;
478                 
479                 mtxq = (struct mt76_txq *)txq->drv_priv;
480                 wcid = mtxq->wcid;
481                 if (wcid && test_bit(MT_WCID_FLAG_PS, &wcid->flags))
482                         continue;
483                 
484                 spin_lock_bh(&q->lock);
485                 
486                 if (mtxq->send_bar && mtxq->aggr) { 
487                         struct ieee80211_txq *txq = mtxq_to_txq(mtxq);
488                         struct ieee80211_sta *sta = txq->sta;
489                         struct ieee80211_vif *vif = txq->vif;
490                         u16 agg_ssn = mtxq->agg_ssn;
491                         u8 tid = txq->tid;
492                         
493                         mtxq->send_bar = false;
494                         spin_unlock_bh(&q->lock);
495                         ieee80211_send_bar(vif, sta->addr, tid, agg_ssn);
496                         spin_lock_bh(&q->lock);
497                 }
498 
499                 if (!mt76_txq_stopped(q))
500                         n_frames = mt76_txq_send_burst(phy, q, mtxq);
501 
502                 spin_unlock_bh(&q->lock);
503 
504                 ieee80211_return_txq(phy->hw, txq, false);
505 
506                 if (unlikely(n_frames < 0))
507                         return n_frames;
508 
509                 ret += n_frames;
510         }
511 
512         return ret;
513 }

What if mt76_txq_send_burst() returns 0? Or what if something is wrong in the hardware (e.g. the TX buffer is full) so that this routine cannot send any packets? Would packets then be queued endlessly, eating memory?

ptpt52 commented 3 years ago

What if the tx_q is stopped but the kernel still transmits and enqueues?

ptpt52 commented 3 years ago
1589 static bool ieee80211_queue_skb(struct ieee80211_local *local,
1590                                 struct ieee80211_sub_if_data *sdata,
1591                                 struct sta_info *sta,
1592                                 struct sk_buff *skb)
1593 {
1594         struct ieee80211_vif *vif;
1595         struct txq_info *txqi;
1596 
1597         if (!local->ops->wake_tx_queue ||
1598             sdata->vif.type == NL80211_IFTYPE_MONITOR)
1599                 return false;
1600 
1601         if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
1602                 sdata = container_of(sdata->bss,
1603                                      struct ieee80211_sub_if_data, u.ap);
1604 
1605         vif = &sdata->vif;
1606         txqi = ieee80211_get_txq(local, vif, sta, skb);
1607 
1608         if (!txqi)
1609                 return false;
1610 
1611         ieee80211_txq_enqueue(local, txqi, skb);
1612 
1613         schedule_and_wake_txq(local, txqi);
1614 
1615         return true;
1616 }

In some cases, does ieee80211_queue_skb() just keep pushing skbs into the queue endlessly, eating memory?

ptpt52 commented 3 years ago

@ryderlee1110 @nbd168 ping

ptpt52 commented 3 years ago

it seems I can set the memory_limit https://lists.bufferbloat.net/pipermail/make-wifi-fast/2018-February/001724.html

ptpt52 commented 3 years ago

when this issue happens I notice this:

cat sys/kernel/debug/ieee80211/phy1/mt76/xmit-queues
0:      queued=0 head=142 tail=142
1:      queued=0 head=449 tail=449
2:      queued=480 head=288 tail=320
3:      queued=0 head=2 tail=2
4:      queued=0 head=97 tail=97

Why did queue 2 stop working?

ptpt52 commented 3 years ago

Should the driver perhaps detect the TX hang and reset the DMA?

ptpt52 commented 3 years ago

What if the STA enters the PS (power-save) state?

ptpt52 commented 3 years ago

How should the MCU_EXT_EVENT_PS_SYNC event be handled? Should I call queue_work(phy->dev->wq, &pm->wake_work);?

I notice that in some cases the STA enters the PS state, TX to the STA stops, and it never becomes active again.

ptpt52 commented 3 years ago

I noticed these two changes: https://github.com/LorenzoBianconi/wireless-drivers-next/commit/9f8ee2b95f9cf53925a58a6a616940dd01dff320 https://github.com/LorenzoBianconi/wireless-drivers-next/commit/9f81246e3d7524aabc7eb9770999768323a6e42e

They seem somewhat related. I will try them out.

ptpt52 commented 3 years ago
iwinfo wlan1 assoclist
XX:XX:XX:AB:68:5B  -33 dBm / -91 dBm (SNR 58)  100 ms ago
    RX: 780.0 MBit/s, VHT-MCS 9, 80MHz, VHT-NSS 2   2391366 Pkts.
    TX: 866.7 MBit/s, VHT-MCS 9, 80MHz, VHT-NSS 2   1842816 Pkts.
    expected throughput: 467.3 MBit/s

`iw phy phy1 get txq` gives this result:

Packet limit:       256 pkts
Memory limit:       2097152 bytes
Quantum:        300 bytes
Number of queues:   4096
Backlog:        245 pkts
Memory usage:       494464 bytes
Packet limit overflows: 29
Memory limit overflows: 0
Hash collisions:    1838

It shows that packets are buffered in the txq backlog.

Why is the txq blocked? At the same time I notice:

cat /sys/kernel/debug/ieee80211/phy1/mt76/xmit-queues 
0:  queued=0 head=5 tail=5
1:  queued=0 head=0 tail=0
2:  queued=0 head=54 tail=54
3:  queued=0 head=0 tail=0
4:  queued=0 head=67 tail=67
tcpdump -i wlan1 -nevv
tcpdump: listening on wlan1, link-type EN10MB (Ethernet), capture size 262144 bytes
01:04:31.729596 xx:xx:xx:ab:68:5b > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.18.1 tell 192.168.18.10, length 28
01:04:31.729892 xx:xx:xx:ab:68:5b > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Ethernet (len 6), IPv4 (len 4), Request who-has 192.168.18.1 tell 192.168.18.10, length 28
01:04:31.731334 xx:xx:xx:0c:d8:18 > xx:xx:xx:ab:68:5b, ethertype ARP (0x0806), length 60: Ethernet (len 6), IPv4 (len 4), Reply 192.168.18.1 is-at xx:xx:xx:0c:d8:18, length 46
ptpt52 commented 3 years ago

I notice that the txq stops working after mt7615_mac_reset_work is called. @ryderlee1110 @LorenzoBianconi

ptpt52 commented 3 years ago

Should I call mt76_txq_schedule_all(&dev->mphy); again?

ptpt52 commented 3 years ago

`txq = ieee80211_next_txq(phy->hw, qid);` in mt76_txq_schedule_list() never returns the txq of the blocked STA, and `iw phy phy1 get txq` shows that the backlog is full.

ptpt52 commented 3 years ago

But if a new client connects to the AP, it works, while the blocked STA still does not.

ptpt52 commented 3 years ago

And `ifconfig wlan1 down; ifconfig wlan1 up` makes everything work again.

ptpt52 commented 3 years ago

Two cases cause the TX hang:

  1. mt7615_mac_reset_work is triggered and the txq of one particular client is blocked forever
  2. DMA TX is blocked forever in mt76
ptpt52 commented 3 years ago

Could it be that, when reset_work is called, mt7615_mac_sta_poll reports the wrong airtime, so that the STA gets blocked?

ptpt52 commented 3 years ago

@ryderlee1110 @nbd168

After further debugging: once reset_work has been called, ieee80211_next_txq() never returns the txq, and it turns out that ieee80211_txq_airtime_check() always returns false.

ptpt52 commented 3 years ago

Does that mean aql_tx_pending is over the limit?

ryderlee1110 commented 3 years ago

Yes. It looks like we have to clear tx_pending during reset. Could you try ieee80211_clear_tx_pending()?

ptpt52 commented 3 years ago

@ryderlee1110 thanks for the tips, let me try it out. is it somewhat like this:

diff --git a/mt7615/pci_mac.c b/mt7615/pci_mac.c
index 806cd2dc..89bd4717 100644
--- a/mt7615/pci_mac.c
+++ b/mt7615/pci_mac.c
@@ -361,6 +361,10 @@ void mt7615_mac_reset_work(struct work_struct *work)

        mt7615_update_beacons(dev);

+       ieee80211_clear_tx_pending(hw_to_local(mt76_hw(dev)));
+       if (ext_phy)
+               ieee80211_clear_tx_pending(hw_to_local(ext_phy->hw));
+
        mt7615_mutex_release(dev);

        ieee80211_queue_delayed_work(mt76_hw(dev), &dev->mphy.mac_work,

I do the clear when reset_work is almost done. If we clear too early, more pending frames could come in, right?

ryderlee1110 commented 3 years ago

Does it work? However, it seems that ieee80211_local cannot be touched from a low-level driver:

struct ieee80211_local { /* embed the driver visible part.

ptpt52 commented 3 years ago

It does not even compile. Maybe we should do something similar to `ifconfig wlan1 down`?

ptpt52 commented 3 years ago

It seems that mt76_txq_schedule_list() is not implemented correctly. It should either dequeue and do the DMA TX or just free the skb; it should not stop while more frames are pending in the queue.

ptpt52 commented 3 years ago

Maybe the mac80211 framework does not expect this situation and cannot handle it ^_^

ryderlee1110 commented 3 years ago

well. you have to consider running out of tx tokens, which could cause packet drops under load.

ptpt52 commented 3 years ago

you mean call mt7615_tx_token_put() first and then mt76_txq_schedule_all() ?

ryderlee1110 commented 3 years ago

i meant why do we need a stop in mt76_txq_schedule_list.

ptpt52 commented 3 years ago

In mt7615_irq_tasklet(), an MT_INT_MCU_CMD event arrives and reset_work is triggered.

ptpt52 commented 3 years ago

Why does that event happen? I don't know, it just happens.

ptpt52 commented 3 years ago
463 static int
464 mt76_txq_schedule_list(struct mt76_phy *phy, enum mt76_txq_id qid)
465 {                  
466         struct mt76_queue *q = phy->q_tx[qid];
467         struct mt76_dev *dev = phy->dev;
468         struct ieee80211_txq *txq;
469         struct mt76_txq *mtxq;
470         struct mt76_wcid *wcid;
471         int ret = 0;
472 
473         while (1) {
474                 int n_frames = 0;
475 
476                 if (test_bit(MT76_STATE_PM, &phy->state)) {
477                         return -EBUSY;
478                 }
479 
480                 if (dev->queue_ops->tx_cleanup &&
481                     q->queued + 2 * MT_TXQ_FREE_THR >= q->ndesc) {
482                         dev->queue_ops->tx_cleanup(dev, q, false);
483                 }
484 
485                 txq = ieee80211_next_txq(phy->hw, qid);
486                 if (!txq)
487                         break;
488 
489                 if (test_bit(MT76_RESET, &phy->state)) {
490                         mtxq = (struct mt76_txq *)txq->drv_priv;
491                         while ((skb = mt76_txq_dequeue(phy, mtxq)) != NULL) {
492                                 hw = mt76_tx_status_get_hw(dev, skb);
493                                 ieee80211_free_txskb(hw, skb);
494                         }
495                         ieee80211_return_txq(phy->hw, txq, false);
496                         continue
497                 }
......................
....................

I am trying this change in mt76_txq_schedule_list(): while in the MT76_RESET state, it keeps dequeuing and freeing skbs until the txq is empty. Is that reasonable?

ptpt52 commented 3 years ago

This way we ensure that no TX is pending.

ptpt52 commented 3 years ago

I get the log:

[14668.777532] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-38809,-38553, aql 4304,12000 4304,24000
[14668.790405] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-38553,-38297, aql 4304,12000 4304,24000
[14668.803255] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-38297,-38041, aql 4304,12000 4304,24000
[14668.816115] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-38041,-37785, aql 4304,12000 4304,24000
[14668.828953] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-37785,-37529, aql 4304,12000 4304,24000
[14672.738663] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA
[14672.743527] mt7615_irq_tasklet val MT7663_MCU_CMD_ERROR_MASK, 4
[14672.749475] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA call reset_work
[14672.755683] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA call reset_work done
[14672.755820] mt7615_mac_reset_work
[14672.767319] mt7615_mac_reset_work start
[14672.771715] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA
[14672.776582] mt7615_irq_tasklet val MT7663_MCU_CMD_ERROR_MASK, 8
[14672.780889] mt7615_mac_reset_work wait MT_MCU_CMD_RESET_DONE
[14672.782584] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA call reset_work
[14672.782616] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA call reset_work done
[14672.808830] mt7615_mac_reset_work wait MT_MCU_CMD_RECOVERY_DONE
[14672.808918] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA
[14672.819706] mt7615_irq_tasklet val MT7663_MCU_CMD_ERROR_MASK, 10
[14672.825781] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA call reset_work
[14672.832031] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA call reset_work done
[14672.832193] mt7615_mac_reset_work wait MT_MCU_CMD_NORMAL_STATE
[14672.839674] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA
[14672.849534] mt7615_irq_tasklet val MT7663_MCU_CMD_ERROR_MASK, 20
[14672.855560] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA call reset_work
[14672.861759] mt7615_irq_tasklet MCU2HOST_SW_INT_ENA call reset_work done
[14672.869563] mt7615_mac_reset_work end
[14672.887398] mt7615_mac_reset_work
[14672.891589] mt7615_mac_reset_work reset_state=20 not handled
[14673.716710] net_ratelimit: 471040 callbacks suppressed
[14673.716755] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-194651,-194395, aql 6592,12000 19460,24000
[14673.734978] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-194395,-194139, aql 6592,12000 19460,24000
[14673.748022] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-194139,-193883, aql 6592,12000 19460,24000
[14673.761067] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-193883,-193627, aql 6592,12000 19460,24000
[14673.774108] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-193627,-193371, aql 6592,12000 19460,24000
[14673.787182] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-193371,-193115, aql 6592,12000 19460,24000
[14673.800232] ieee80211_next_txq ac=2,2 vif c8:5b:a0:e7:34:65 sta e4:67:8e:ab:68:5b aql_check=1 deficit=-193115,-192859, aql 6592,12000 19460,24000

Before reset_work runs, local->aql_total_pending_airtime is equal to sta->airtime[ac].aql_tx_pending (both are 4304), but after reset_work they are no longer equal: local->aql_total_pending_airtime=19460 and sta->airtime[ac].aql_tx_pending=6592.

ptpt52 commented 3 years ago

Is there a memory leak here?

157 void mt7615_unregister_device(struct mt7615_dev *dev)
158 {
159         bool mcu_running;
160 
161         mcu_running = mt7615_wait_for_mcu_init(dev);
162 
163         mt7615_unregister_ext_phy(dev);
164         mt76_unregister_device(&dev->mt76);
165         if (mcu_running)
166                 mt7615_mcu_exit(dev);
167         mt7615_dma_cleanup(dev);
168 
169         mt7615_tx_token_put(dev);
170 
171         tasklet_disable(&dev->irq_tasklet);
172 
173         mt76_free_device(&dev->mt76);
174 }

mt7615_dma_cleanup() calls mt76_free_pending_txwi() to remove the txwi entries from dev->txwi_cache, but afterwards mt7615_tx_token_put() puts txwi entries back into dev->txwi_cache.

ryderlee1110 commented 3 years ago

but mt7615_tx_token_put() put the txwi back to dev->txwi_cache

right. will fix it

ptpt52 commented 3 years ago
 671         
 672 static void ieee80211_report_used_skb(struct ieee80211_local *local,
 673                                       struct sk_buff *skb, bool dropped)
 674 {       
 675         struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
 676         u16 tx_time_est = ieee80211_info_get_tx_time_est(info);
 677         struct ieee80211_hdr *hdr = (void *)skb->data;
 678         bool acked = info->flags & IEEE80211_TX_STAT_ACK;
 679 
 680         if (dropped)
 681                 acked = false; 
 682                 
 683         if (tx_time_est) {
 684                 struct sta_info *sta;
 685         
 686                 rcu_read_lock();
 687                 
 688                 sta = sta_info_get_by_addrs(local, hdr->addr1, hdr->addr2);
 689                 ieee80211_sta_update_pending_airtime(local, sta, 
 690                                                      skb_get_queue_mapping(skb),
 691                                                      tx_time_est,
 692                                                      true);
 693                 rcu_read_unlock();
 694         }
.......

This code is not very reliable: what if sta_info_get_by_addrs() returns NULL for some reason?

ryderlee1110 commented 3 years ago

what if sta_info_get_by_addrs return NULL for some reason?

The `if (sta)` check is actually inside ieee80211_sta_update_pending_airtime(). Back to this issue: was it resolved by the packet-dropping change you made in mt76_txq_schedule_list()?

ptpt52 commented 3 years ago

Dropping packets in mt76_txq_schedule_list() seems incorrect. Packets pending in the txqi are fine: their airtime is never added to the txq airtime until you call ieee80211_tx_dequeue() to dequeue them.

ryderlee1110 commented 3 years ago

yeah, so still have to find a way to clean tx_pending

ptpt52 commented 3 years ago

before the reset_work, local->aql_total_pending_airtime is equal to sta->airtime[ac].aql_tx_pending, it is 4304 but after the reset_work, the become not equal. local->aql_total_pending_airtime=19460 and sta->airtime[ac].aql_tx_pending=6592

This log directly shows the cause of the TX stop issue.

If reset_work is triggered again, does aql_total_pending_airtime grow even larger, so that TX stops working?

ptpt52 commented 3 years ago

But what causes aql_total_pending_airtime to become larger?

ptpt52 commented 3 years ago
3826 bool ieee80211_txq_airtime_check(struct ieee80211_hw *hw,
3827                                  struct ieee80211_txq *txq)
3828 {
3829         struct sta_info *sta;
3830         struct ieee80211_local *local = hw_to_local(hw);
3831 
3832         if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL))
3833                 return true;
3834 
3835         if (!txq->sta)
3836                 return true;
3837 
3838         sta = container_of(txq->sta, struct sta_info, sta);
3839         if (atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
3840             sta->airtime[txq->ac].aql_limit_low)
3841                 return true;
3842 
3843         if (atomic_read(&local->aql_total_pending_airtime) <
3844             local->aql_threshold &&
3845             atomic_read(&sta->airtime[txq->ac].aql_tx_pending) <
3846             sta->airtime[txq->ac].aql_limit_high)
3847                 return true;
3848 
3849         return false;
3850 }

If aql_tx_pending leaks and becomes larger than aql_limit_high, the txq for this STA stops working forever.

ptpt52 commented 3 years ago

ieee80211_tx_dequeue() adds airtime to local->aql_total_pending_airtime and sta->airtime[ac].aql_tx_pending.

ieee80211_report_used_skb() subtracts airtime from local->aql_total_pending_airtime and sta->airtime[ac].aql_tx_pending.

I have just one client connected to AP

At first, when I dump aql_tx_pending and aql_total_pending_airtime, they are always equal.

But at some point, when reset_work is triggered, they become NOT equal.

What could be the cause?

ryderlee1110 commented 3 years ago

@ptpt52 switch to use ieee80211_free_txskb in mt7615_tx_token_put to see if it solves your issues.

ptpt52 commented 3 years ago

seems make sense! Let me try it out.

ptpt52 commented 3 years ago

in testing

void mt7615_mac_reset_work(struct work_struct *work)
.......
288         ieee80211_stop_queues(mt76_hw(dev));
289         if (ext_phy)
290                 ieee80211_stop_queues(ext_phy->hw);
291 
292         set_bit(MT76_RESET, &dev->mphy.state);
293         set_bit(MT76_MCU_RESET, &dev->mphy.state);
294         wake_up(&dev->mt76.mcu.wait);
295         cancel_delayed_work_sync(&dev->mphy.mac_work);
296         del_timer_sync(&dev->phy.roc_timer);
297         cancel_work_sync(&dev->phy.roc_work);
298         if (phy2) {
299                 set_bit(MT76_RESET, &phy2->mt76->state);
300                 cancel_delayed_work_sync(&phy2->mt76->mac_work);
301                 del_timer_sync(&phy2->roc_timer);
302                 cancel_work_sync(&phy2->roc_work);
303         }
304 
305         /* lock/unlock all queues to ensure that no tx is pending */
306         mt76_txq_schedule_all(&dev->mphy);
307         if (ext_phy)
308                 mt76_txq_schedule_all(ext_phy);

One more question: this code sets the MT76_RESET state and then calls mt76_txq_schedule_all(). Isn't that call useless, given that mt76_txq_schedule_list() returns -EBUSY while MT76_RESET is set?

ptpt52 commented 3 years ago

got a warning log:

[  145.770286] mt7615e 0000:02:00.0: mt7615_mac_reset_work start
[  145.778015] ------------[ cut here ]------------
[  145.783016] WARNING: CPU: 0 PID: 3128 at backports-5.10.16-1/net/mac80211/tx.c:3589 ieee80211_tx_dequeue+0xf08/0x15cc [mac80211]
[  145.794620] Modules linked in: xt_connlimit nf_conncount iptable_nat xt_state xt_nat xt_helper xt_conntrack xt_connmark xt_connbytes xt_REDIRECT xt_MASQUERADE xt_CT nft_redir nft_ct nf_nat nt
[  145.794859]  ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfc
[  145.915669] CPU: 0 PID: 3128 Comm: kworker/u8:3 Not tainted 5.4.105 #0
[  145.922219] Workqueue: phy1 mt7615_mac_reset_work [mt7615e]
[  145.927787] Stack : 805e7e7c 8565bc0c 80660000 806b0000 8566e200 805f8d4c 85e36cb4 00000009
[  145.936141]         85cc5ec0 85cc5fec 85431000 80072fdc 00000000 00000001 8565bbc8 68b20ce1
[  145.944481]         00000000 00000000 00000000 00000000 0000005d 00000186 65725f63 5f746573
[  145.952821]         00000000 00000007 00000000 000e126b 00000000 806d0000 00000000 85e36cb4
[  145.961162]         00000009 85cc5ec0 85cc5fec 85431000 00000002 803063f0 00000000 80810000
[  145.969496]         ...
[  145.971940] Call Trace:
[  145.974405] [<8000b61c>] show_stack+0x30/0x100
[  145.978842] [<8054248c>] dump_stack+0xa4/0xdc
[  145.983202] [<8002bcd0>] __warn+0xc0/0x10c
[  145.987281] [<8002bd78>] warn_slowpath_fmt+0x5c/0xac
[  145.992424] [<85e36cb4>] ieee80211_tx_dequeue+0xf08/0x15cc [mac80211]
[  145.998950] [<85d15f34>] mt76_tx+0x1b0/0x1e0 [mt76]
[  146.004152] ---[ end trace 04c8cf92ac28895b ]---
[  146.018809] mt7615e 0000:02:00.0: mt7615_mac_reset_work end

It looks like ieee80211_tx_dequeue() assumes it is being called in softirq context:

WARN_ON_ONCE(softirq_count() == 0);

I am trying to clear the pending TX frames when reset_work runs:

diff --git a/tx.c b/tx.c
index 451ed60c..d84fe79c 100644
--- a/tx.c
+++ b/tx.c
@@ -463,8 +463,7 @@ mt76_txq_schedule_list(struct mt76_phy *phy, enum mt76_txq_id qid)
        while (1) {
                int n_frames = 0;

-               if (test_bit(MT76_STATE_PM, &phy->state) ||
-                   test_bit(MT76_RESET, &phy->state))
+               if (test_bit(MT76_STATE_PM, &phy->state))
                        return -EBUSY;

                if (dev->queue_ops->tx_cleanup &&
@@ -476,6 +475,18 @@ mt76_txq_schedule_list(struct mt76_phy *phy, enum mt76_txq_id qid)
                if (!txq)
                        break;

+               if (unlikely(test_bit(MT76_RESET, &phy->state))) {
+                       /* in reset state clear all tx pending */
+                       struct sk_buff *skb;
+                       mtxq = (struct mt76_txq *)txq->drv_priv;
+                       while ((skb = mt76_txq_dequeue(phy, mtxq)) != NULL) {
+                               ieee80211_free_txskb(phy->hw, skb);
+                               ret++;
+                       }
+                       ieee80211_return_txq(phy->hw, txq, false);
+                       continue;
+               }
+
                mtxq = (struct mt76_txq *)txq->drv_priv;
                wcid = mtxq->wcid;
                if (wcid && test_bit(MT_WCID_FLAG_PS, &wcid->flags))