In libnetconf, a deadlock was observed between the notification thread and the regular thread used for sending/receiving RPC messages. It occurs when the notification thread happens to receive an RPC reply while the regular thread receives a notification message.
The notification thread holds mut_equeue and waits for mut_mqueue.
Similarly, the RPC thread holds mut_mqueue and waits for mut_equeue.
I believe the RPC thread may need to relinquish mut_mqueue while processing a notification message.
Could you please look into this problem?
Backtraces
(gdb) info thread
Id Target Id Frame
3 Thread 0x7fd55dada780 (LWP 4328) 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178ef8) at ../nptl/pthread_mutex_lock.c:114
2 Thread 0x7fd554ab0700 (LWP 4334) __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
1 Thread 0x7fd5554c6700 (LWP 4329) 0x00007fd55cb51ae3 in select () at ../sysdeps/unix/syscall-template.S:81
RPC thread
(gdb) bt
0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
1 0x00007fd55c0664d4 in _L_lock_952 () from /lib/x86_64-linux-gnu/libpthread.so.0
2 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178ef8) at ../nptl/pthread_mutex_lock.c:114
3 0x00007fd55c49d008 in nc_session_recv_reply (session=0x2178e40, timeout=-1, reply=0x7ffe9f75aca8) at src/session.c:2445
4 0x00007fd55c49e7ef in nc_session_send_recv (session=0x2178e40, rpc=0x2e69940, reply=0x7ffe9f75aca8) at src/session.c:3031
(gdb) frame 2
2 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178ef8) at ../nptl/pthread_mutex_lock.c:114
114 in ../nptl/pthread_mutex_lock.c
(gdb) print mutex.data.owner
$8 = 4334
(gdb) info local
__PRETTY_FUNCTION__ = "__pthread_mutex_lock"
type = 4294966784
(gdb) frame 3
3 0x00007fd55c49d008 in nc_session_recv_reply (session=0x2178e40, timeout=-1, reply=0x7ffe9f75aca8) at src/session.c:2445
2445 src/session.c: No such file or directory.
(gdb) info local
msg_aux = 0x0
msg = 0x21b8d90
ret = NC_MSG_NOTIFICATION
error = 0x3ebc7f345cabf00
local_timeout = 100
Notification thread
(gdb) thread 2
[Switching to thread 2 (Thread 0x7fd554ab0700 (LWP 4334))]
0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
135 ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
(gdb) bt
0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
1 0x00007fd55c0664d4 in _L_lock_952 () from /lib/x86_64-linux-gnu/libpthread.so.0
2 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178f20) at ../nptl/pthread_mutex_lock.c:114
3 0x00007fd55c49cda2 in nc_session_recv_reply (session=0x2178e40, timeout=10000, reply=0x7fd554aafc90) at src/session.c:2391
4 0x00007fd55c499d86 in announce_nc_session_closing (session=0x2178e40) at src/session.c:1173
5 0x00007fd55c49a025 in nc_session_close (session=0x2178e40, reason=NC_SESSION_TERM_OTHER) at src/session.c:1231
6 0x00007fd55c49ccbd in nc_session_receive (session=0x2178e40, timeout=0, msg=0x7fd554aafe20) at src/session.c:2349
7 0x00007fd55c49ccec in nc_session_recv_msg (session=0x2178e40, timeout=0, msg=0x7fd554aafe20) at src/session.c:2358
8 0x00007fd55c49d449 in nc_session_recv_notif (session=0x2178e40, timeout=0, ntf=0x7fd554aafe60) at src/session.c:2537
9 0x00007fd55c4ab48a in ncntf_dispatch_receive (session=0x2178e40, process_ntf=0x7fd55d04f65a <notification_receiver+64>) at src/notifications.c:2681
(gdb) frame 2
2 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178f20) at ../nptl/pthread_mutex_lock.c:114
114 ../nptl/pthread_mutex_lock.c: No such file or directory.
(gdb) print mutex.data.owner
$9 = 4328
(gdb) frame 3
3 0x00007fd55c49cda2 in nc_session_recv_reply (session=0x2178e40, timeout=10000, reply=0x7fd554aafc90) at src/session.c:2391
2391 src/session.c: No such file or directory.
(gdb) info local
msg_aux = 0x2178e40
msg = 0x0
ret = 1409298993
error = 0x3ebc7f345cabf00
local_timeout = 100
Hi,
In libnetconf, a deadlock was observed between the notification thread and the regular thread used for sending/receiving RPC messages. It occurs when the notification thread happens to receive an RPC reply while the regular thread receives a notification message.
The notification thread holds mut_equeue and waits for mut_mqueue. Similarly, the RPC thread holds mut_mqueue and waits for mut_equeue. I believe the RPC thread may need to relinquish mut_mqueue while processing a notification message. Could you please look into this problem?
Backtraces
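(gdb) info thread
Id Target Id Frame
3 Thread 0x7fd55dada780 (LWP 4328) 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178ef8) at ../nptl/pthread_mutex_lock.c:114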
2 Thread 0x7fd554ab0700 (LWP 4334) __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
1 Thread 0x7fd5554c6700 (LWP 4329) 0x00007fd55cb51ae3 in select () at ../sysdeps/unix/syscall-template.S:81
RPC thread
(gdb) bt
0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
1 0x00007fd55c0664d4 in _L_lock_952 () from /lib/x86_64-linux-gnu/libpthread.so.0
2 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178ef8) at ../nptl/pthread_mutex_lock.c:114
3 0x00007fd55c49d008 in nc_session_recv_reply (session=0x2178e40, timeout=-1, reply=0x7ffe9f75aca8) at src/session.c:2445
4 0x00007fd55c49e7ef in nc_session_send_recv (session=0x2178e40, rpc=0x2e69940, reply=0x7ffe9f75aca8) at src/session.c:3031
(gdb) frame 2
2 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178ef8) at ../nptl/pthread_mutex_lock.c:114
114 in ../nptl/pthread_mutex_lock.c
(gdb) print mutex.data.owner
$8 = 4334
(gdb) info local
__PRETTY_FUNCTION__ = "__pthread_mutex_lock"
type = 4294966784
(gdb) frame 3
3 0x00007fd55c49d008 in nc_session_recv_reply (session=0x2178e40, timeout=-1, reply=0x7ffe9f75aca8) at src/session.c:2445
2445 src/session.c: No such file or directory.
(gdb) info local
msg_aux = 0x0
msg = 0x21b8d90
ret = NC_MSG_NOTIFICATION
error = 0x3ebc7f345cabf00
local_timeout = 100
Notification thread
(gdb) thread 2
[Switching to thread 2 (Thread 0x7fd554ab0700 (LWP 4334))]
0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
135 ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
(gdb) bt
0 __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
1 0x00007fd55c0664d4 in _L_lock_952 () from /lib/x86_64-linux-gnu/libpthread.so.0
2 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178f20) at ../nptl/pthread_mutex_lock.c:114
3 0x00007fd55c49cda2 in nc_session_recv_reply (session=0x2178e40, timeout=10000, reply=0x7fd554aafc90) at src/session.c:2391
4 0x00007fd55c499d86 in announce_nc_session_closing (session=0x2178e40) at src/session.c:1173
5 0x00007fd55c49a025 in nc_session_close (session=0x2178e40, reason=NC_SESSION_TERM_OTHER) at src/session.c:1231
6 0x00007fd55c49ccbd in nc_session_receive (session=0x2178e40, timeout=0, msg=0x7fd554aafe20) at src/session.c:2349
7 0x00007fd55c49ccec in nc_session_recv_msg (session=0x2178e40, timeout=0, msg=0x7fd554aafe20) at src/session.c:2358
8 0x00007fd55c49d449 in nc_session_recv_notif (session=0x2178e40, timeout=0, ntf=0x7fd554aafe60) at src/session.c:2537
9 0x00007fd55c4ab48a in ncntf_dispatch_receive (session=0x2178e40, process_ntf=0x7fd55d04f65a <notification_receiver+64>) at src/notifications.c:2681
(gdb) frame 2
2 0x00007fd55c066336 in __GI___pthread_mutex_lock (mutex=0x2178f20) at ../nptl/pthread_mutex_lock.c:114
114 ../nptl/pthread_mutex_lock.c: No such file or directory.
(gdb) print mutex.data.owner
$9 = 4328
(gdb) frame 3
3 0x00007fd55c49cda2 in nc_session_recv_reply (session=0x2178e40, timeout=10000, reply=0x7fd554aafc90) at src/session.c:2391
2391 src/session.c: No such file or directory.
(gdb) info local
msg_aux = 0x2178e40
msg = 0x0
ret = 1409298993
error = 0x3ebc7f345cabf00
local_timeout = 100