multipath-tcp / mptcp_net-next

Development version of the Upstream MultiPath TCP Linux kernel 🐧
https://mptcp.dev
Other
284 stars 42 forks source link

mptcp_join.sh: WARNING: possible recursive locking detected #138

Closed matttbe closed 3 years ago

matttbe commented 3 years ago

With a debug kernel, I get this warning when executing mptcp_join.sh tests:

[  604.998915] ============================================
[  604.998915] WARNING: possible recursive locking detected
[  604.998915] 5.11.0-rc3+ #866 Not tainted
[  604.998915] --------------------------------------------
[  604.998915] pm_nl_ctl/3335 is trying to acquire lock:
[  604.998915] ffff8880082ca260 (k-slock-AF_INET){+.-.}-{2:2}, at: mptcp_subflow_process_delegated (./include/net/sock.h:1586)
[  604.998915]
[  604.998915] but task is already holding lock:
[  604.998915] ffff8880059252e0 (k-slock-AF_INET){+.-.}-{2:2}, at: release_sock (net/core/sock.c:3065)
[  604.998915]
[  604.998915] other info that might help us debug this:
[  604.998915]  Possible unsafe locking scenario:
[  604.998915]
[  604.998915]        CPU0
[  604.998915]        ----
[  604.998915]   lock(k-slock-AF_INET);
[  604.998915]   lock(k-slock-AF_INET);
[  604.998915]
[  604.998915]  *** DEADLOCK ***
[  604.998915]
[  604.998915]  May be due to missing lock nesting notation
[  604.998915]
[  604.998915] 4 locks held by pm_nl_ctl/3335:
[  604.998915] #0: ffffffff995e5c30 (cb_lock){++++}-{3:3}, at: genl_rcv (net/netlink/genetlink.c:811)
[  604.998915] #1: ffffffff995e5ce8 (genl_mutex){+.+.}-{3:3}, at: genl_rcv_msg (net/netlink/genetlink.c:33)
[  604.998915] #2: ffff888005925360 (k-sk_lock-AF_INET){+.+.}-{0:0}, at: __inet_bind (net/ipv4/af_inet.c:518)
[  604.998915] #3: ffff8880059252e0 (k-slock-AF_INET){+.-.}-{2:2}, at: release_sock (net/core/sock.c:3065)
[  604.998915]
[  604.998915] stack backtrace:
[  604.998915] CPU: 0 PID: 3335 Comm: pm_nl_ctl Not tainted 5.11.0-rc3+ #866
[  604.998915] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014
[  604.998915] Call Trace:
[  604.998915] dump_stack (lib/dump_stack.c:122)
[  604.998915] __lock_acquire.cold (kernel/locking/lockdep.c:2761)
[  604.998915] ? register_lock_class (kernel/locking/lockdep.c:4690)
[  604.998915] ? rcu_read_lock_sched_held (./include/linux/lockdep.h:271)
[  604.998915] lock_acquire (kernel/locking/lockdep.c:437)
[  604.998915] ? mptcp_subflow_process_delegated (./include/net/sock.h:1586)
[  604.998915] ? check_flags (kernel/locking/lockdep.c:5405)
[  604.998915] ? lock_acquire (kernel/locking/lockdep.c:437)
[  604.998915] ? check_flags (kernel/locking/lockdep.c:5405)
[  604.998915] ? lock_downgrade (kernel/locking/lockdep.c:5445)
[  604.998915] _raw_spin_lock_bh (./include/linux/spinlock_api_smp.h:136)
[  604.998915] ? mptcp_subflow_process_delegated (./include/net/sock.h:1586)
[  604.998915] mptcp_subflow_process_delegated (./include/net/sock.h:1586)
[  604.998915] tcp_release_cb_override (net/mptcp/subflow.c:1533)
[  604.998915] release_sock (./include/net/sock.h:1557)
[  604.998915] __inet_bind (net/ipv4/af_inet.c:553)
[  604.998915] mptcp_pm_nl_create_listen_socket (net/mptcp/pm_netlink.c:732)
[  604.998915] ? mptcp_pm_free_addr_entry (net/mptcp/pm_netlink.c:707)
[  604.998915] ? kmem_cache_alloc_trace (mm/slub.c:2920)
[  604.998915] mptcp_nl_cmd_add_addr (net/mptcp/pm_netlink.c:972)
[  604.998915] ? mptcp_pm_create_subflow_or_signal_addr (net/mptcp/pm_netlink.c:953)
[  604.998915] ? nla_get_range_signed (lib/nlattr.c:563)
[  604.998915] ? unpoison_range (mm/kasan/shadow.c:106)
[  604.998915] ? __nla_parse (lib/nlattr.c:685)
[  604.998915] ? genl_family_rcv_msg_attrs_parse.isra.0 (net/netlink/genetlink.c:550)
[  604.998915] ? genl_family_rcv_msg_doit.isra.0 (net/netlink/genetlink.c:741)
[  604.998915] genl_family_rcv_msg_doit.isra.0 (net/netlink/genetlink.c:741)
[  604.998915] ? genl_start (net/netlink/genetlink.c:703)
[  604.998915] ? security_capable (security/security.c:782 (discriminator 10))
[  604.998915] genl_rcv_msg (net/netlink/genetlink.c:783)
[  604.998915] ? genl_family_rcv_msg_doit.isra.0 (net/netlink/genetlink.c:789)
[  604.998915] ? mptcp_pm_create_subflow_or_signal_addr (net/mptcp/pm_netlink.c:953)
[  604.998915] netlink_rcv_skb (net/netlink/af_netlink.c:2495)
[  604.998915] ? genl_family_rcv_msg_doit.isra.0 (net/netlink/genetlink.c:789)
[  604.998915] ? netlink_ack (net/netlink/af_netlink.c:2471)
[  604.998915] genl_rcv (net/netlink/genetlink.c:812)
[  604.998915] netlink_unicast (net/netlink/af_netlink.c:1305)
[  604.998915] ? netlink_attachskb (net/netlink/af_netlink.c:1315)
[  604.998915] ? _copy_from_iter_full (lib/iov_iter.c:806)
[  604.998915] netlink_sendmsg (net/netlink/af_netlink.c:1919)
[  604.998915] ? netlink_unicast (net/netlink/af_netlink.c:1845)
[  604.998915] ? netlink_unicast (net/netlink/af_netlink.c:1845)
[  604.998915] sock_sendmsg (net/socket.c:652)
[  604.998915] __sys_sendto (net/socket.c:1975)
[  604.998915] ? __ia32_sys_getpeername (net/socket.c:1946)
[  604.998915] ? do_user_addr_fault (./arch/x86/include/asm/jump_label.h:25)
[  604.998915] ? _raw_spin_unlock (./arch/x86/include/asm/preempt.h:102)
[  604.998915] ? up_read (kernel/locking/rwsem.c:1296)
[  604.998915] ? down_read_non_owner (kernel/locking/rwsem.c:1447)
[  604.998915] ? syscall_enter_from_user_mode (kernel/entry/common.c:106)
[  604.998915] ? syscall_enter_from_user_mode (kernel/entry/common.c:106)
[  604.998915] ? rcu_read_lock_sched_held (./include/linux/lockdep.h:271)
[  604.998915] ? rcu_read_lock_bh_held (kernel/rcu/update.c:118)
[  604.998915] ? mark_held_locks (kernel/locking/lockdep.c:4000)
[  604.998915] __x64_sys_sendto (net/socket.c:1983)
[  604.998915] do_syscall_64 (arch/x86/entry/common.c:46)
[  604.998915] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:127)
[  604.998915] RIP: 0033:0x7fb08fc077ea
[  604.998915] Code: 48 c7 c0 ff ff ff ff eb bc 0f 1f 80 00 00 00 00 f3 0f 1e fa 41 89 ca 64 8b 04 25 18 00 00 00 85 c0 75 15 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 76 c3 0f 1f 44 00 00 55 48 83 ec 30 44 89 4c

All code
========
   0:   48 c7 c0 ff ff ff ff    mov    $0xffffffffffffffff,%rax
   7:   eb bc                   jmp    0xffffffffffffffc5
   9:   0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  10:   f3 0f 1e fa             endbr64
  14:   41 89 ca                mov    %ecx,%r10d
  17:   64 8b 04 25 18 00 00    mov    %fs:0x18,%eax
  1e:   00
  1f:   85 c0                   test   %eax,%eax
  21:   75 15                   jne    0x38
  23:   b8 2c 00 00 00          mov    $0x2c,%eax
  28:   0f 05                   syscall
  2a:*  48 3d 00 f0 ff ff       cmp    $0xfffffffffffff000,%rax         <-- trapping instruction
  30:   77 76                   ja     0xa8
  32:   c3                      ret
  33:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  38:   55                      push   %rbp
  39:   48 83 ec 30             sub    $0x30,%rsp
  3d:   44                      rex.R
  3e:   89                      .byte 0x89
  3f:   4c                      rex.WR

Code starting with the faulting instruction
===========================================
   0:   48 3d 00 f0 ff ff       cmp    $0xfffffffffffff000,%rax
   6:   77 76                   ja     0x7e
   8:   c3                      ret
   9:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
   e:   55                      push   %rbp
   f:   48 83 ec 30             sub    $0x30,%rsp
  13:   44                      rex.R
  14:   89                      .byte 0x89
  15:   4c                      rex.WR
[  604.998915] RSP: 002b:00007fffe44a42c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
[  604.998915] RAX: ffffffffffffffda RBX: 00007fffe44a4350 RCX: 00007fb08fc077ea
[  604.998915] RDX: 0000000000000038 RSI: 00007fffe44a4350 RDI: 0000000000000003
[  604.998915] RBP: 0000000000000038 R08: 00007fffe44a42dc R09: 000000000000000c
[  604.998915] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
[  604.998915] R13: 0000000000000003 R14: 00007fffe44a42dc R15: 00007fffe44a4898

It looks like it was introduced by ADD_ADDR: ports support (allowing incoming connections on a different port) series.

cc: @geliangtang

geliangtang commented 3 years ago

@pabeni

Hi Paolo,

Thanks for your advice for this issue. I tried the two solutions you mentioned yesterday, they all works well.

This is the patch for the first solution, adding nested annotation:


diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5dbeae3ad666..8aa064c362f1 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2984,13 +2984,14 @@ void mptcp_subflow_process_delegated(struct sock *ssk)
 {
    struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
    struct sock *sk = subflow->conn;
+   unsigned long flags;

-   mptcp_data_lock(sk);
+   spin_lock_irqsave_nested(&sk->sk_lock.slock, flags, SINGLE_DEPTH_NESTING);
    if (!sock_owned_by_user(sk))
        __mptcp_subflow_push_pending(sk, ssk);
    else
        set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags);
-   mptcp_data_unlock(sk);
+   spin_unlock_irqrestore(&sk->sk_lock.slock, flags);
    mptcp_subflow_delegated_done(subflow);
 }

And this is the patch for the second solution, changing the lockdep class:


diff --git a/include/net/sock.h b/include/net/sock.h
index bdc4323ce53c..f79ce454b0e9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1666,6 +1666,8 @@ static inline bool sock_allow_reclassification(const struct sock *csk)
    return !sk->sk_lock.owned && !spin_is_locked(&sk->sk_lock.slock);
 }

+void sock_lock_reclassify_user(struct sock *sk);
+
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
              struct proto *prot, int kern);
 void sk_free(struct sock *sk);
diff --git a/net/core/sock.c b/net/core/sock.c
index bbcd4b97eddd..26e25864631a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1646,6 +1646,19 @@ static inline void sock_lock_init(struct sock *sk)
            af_family_keys + sk->sk_family);
 }

+void sock_lock_reclassify_user(struct sock *sk)
+{
+   if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
+       return;
+
+   lockdep_set_class_and_name(&sk->sk_lock.slock,
+                  af_family_slock_keys + sk->sk_family,
+                  af_family_slock_key_strings[sk->sk_family]);
+   lockdep_init_map(&sk->sk_lock.dep_map,
+            af_family_key_strings[sk->sk_family],
+            af_family_keys + sk->sk_family, 0);
+}
+
 /*
  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
  * even temporarly, because of RCU lookups. sk_node should also be left as is.
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 39998c521133..93614929d931 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -716,6 +716,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
    if (err)
        return err;

+   sock_lock_reclassify_user(entry->lsk->sk);
+
    msk = mptcp_sk(entry->lsk->sk);
    if (!msk) {
        err = -EINVAL;

I think the first one is better, and I had sent it to the ML. Please review it for me. Thanks.

geliangtang commented 3 years ago

Hi Paolo,

Here is the third patch for skipping lsk in mptcp_subflow_process_delegated:


diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 39998c521133..06e38ffeeeb9 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -705,6 +705,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
 static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
                        struct mptcp_pm_addr_entry *entry)
 {
+   struct mptcp_subflow_context *subflow;
    struct sockaddr_storage addr;
    struct mptcp_sock *msk;
    struct socket *ssock;
@@ -728,6 +729,9 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
        goto out;
    }

+   subflow = mptcp_subflow_ctx(ssock->sk);
+   subflow->lsk = 1;
+
    mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
    err = kernel_bind(ssock, (struct sockaddr *)&addr,
              sizeof(struct sockaddr_in));
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5dbeae3ad666..6e6260f05423 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2985,6 +2985,9 @@ void mptcp_subflow_process_delegated(struct sock *ssk)
    struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
    struct sock *sk = subflow->conn;

+   if (subflow->lsk)
+       return;
+
    mptcp_data_lock(sk);
    if (!sock_owned_by_user(sk))
        __mptcp_subflow_push_pending(sk, ssk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index a0d321dcaeb4..2fc621606934 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -406,6 +406,7 @@ struct mptcp_subflow_context {
        mpc_map : 1,
        backup : 1,
        send_mp_prio : 1,
+       lsk : 1,
        rx_eof : 1,
        can_ack : 1,        /* only after processing the remote a key */
        disposable : 1;     /* ctx can be free at ulp release time */

matttbe commented 3 years ago

Fixed thanks to @pabeni 's patch: 870abbfb3a63: "squashed" in "mptcp: implement delegated actions" Thanks @geliangtang for having tried the different solutions!