zeromq / libzmq

ZeroMQ core engine in C++, implements ZMTP/3.1
https://www.zeromq.org
Mozilla Public License 2.0
9.7k stars 2.35k forks source link

Assertion failed: check () (bundled/zeromq/src/msg.cpp:248) (pyzmq 15.1.0) #1689

Closed umitanuki closed 8 years ago

umitanuki commented 8 years ago

I'm running a python server that uses two ZMQ sockets, one for PUB and one for REP. The server runs python's threads up to 8, and using numpy etc. Intermittently, it aborts like below.

(gdb) bt
#0  0x00007f3e81ab1cc9 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
#1  0x00007f3e81ab50d8 in __GI_abort () at abort.c:89
#2  0x00007f3e7585b949 in zmq::zmq_abort (errmsg_=errmsg_@entry=0x7f3e758931ee "check ()")
    at bundled/zeromq/src/err.cpp:84
#3  0x00007f3e758520d6 in zmq::msg_t::data (this=0x7f3e18199b50) at bundled/zeromq/src/msg.cpp:230
#4  0x00007f3e7587c873 in zmq::pipe_t::read (this=0x7f3e18199fa0, msg_=msg_@entry=0x7f3e18199b50)
    at bundled/zeromq/src/pipe.cpp:169
#5  0x00007f3e75867b2d in zmq::session_base_t::pull_msg (this=0x7f3e18199500, msg_=msg_@entry=0x7f3e18199b50)
    at bundled/zeromq/src/session_base.cpp:126
#6  0x00007f3e75857455 in zmq::stream_engine_t::pull_and_encode (this=0x7f3e18199b30, msg_=0x7f3e18199b50)
    at bundled/zeromq/src/stream_engine.cpp:877
#7  0x00007f3e75856f08 in zmq::stream_engine_t::out_event (this=0x7f3e18199b30)
    at bundled/zeromq/src/stream_engine.cpp:361
#8  0x00007f3e758686b3 in zmq::session_base_t::read_activated (this=0x7f3e18199500, pipe_=0x7f3e18199fa0)
    at bundled/zeromq/src/session_base.cpp:264
#9  0x00007f3e7587a40c in zmq::io_thread_t::in_event (this=0x2454c80) at bundled/zeromq/src/io_thread.cpp:83
#10 0x00007f3e7588a82e in zmq::epoll_t::loop (this=0x2454f00) at bundled/zeromq/src/epoll.cpp:176
#11 0x00007f3e7586f7a0 in thread_routine (arg_=0x2454f80) at bundled/zeromq/src/thread.cpp:96
#12 0x00007f3e81e48182 in start_thread (arg=0x7f3e5b9ab700) at pthread_create.c:312
#13 0x00007f3e81b7547d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111

Seems like msg is broken.

(gdb) p u
$1 = {base = {metadata = 0x3638302e312c302e,
    unused = "1700000000001,1.08617", '9' <repeats 11 times>, ",1.08617000000", type = 48 '0', flags = 48 '0'},
  vsm = {metadata = 0x3638302e312c302e, data = "1700000000001,1.08617", '9' <repeats 11 times>, ",1.0861700000",
    size = 48 '0', type = 48 '0', flags = 48 '0'}, lmsg = {metadata = 0x3638302e312c302e,
    content = 0x3030303030303731, unused = "00001,1.08617", '9' <repeats 11 times>, ",1.08617000000", type = 48 '0',
    flags = 48 '0'}, cmsg = {metadata = 0x3638302e312c302e, data = 0x3030303030303731, size = 3328490189376991280,
    unused = "08617", '9' <repeats 11 times>, ",1.08617000000", type = 48 '0', flags = 48 '0'}, delimiter = {
    metadata = 0x3638302e312c302e, unused = "1700000000001,1.08617", '9' <repeats 11 times>, ",1.08617000000",
    type = 48 '0', flags = 48 '0'}}
(gdb) p u.base.type
$2 = 48 '0'
(gdb) p type_min
$3 = zmq::msg_t::type_min
(gdb) p $3
$4 = zmq::msg_t::type_min
(gdb) p (char)type_min
$5 = 101 'e'
(gdb) p (char)type_max
$6 = 104 'h'
(gdb) p/x u
$7 = {base = {metadata = 0x3638302e312c302e, unused = {0x31, 0x37, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
      0x30, 0x30, 0x31, 0x2c, 0x31, 0x2e, 0x30, 0x38, 0x36, 0x31, 0x37, 0x39 <repeats 11 times>, 0x2c, 0x31, 0x2e,
      0x30, 0x38, 0x36, 0x31, 0x37, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, type = 0x30, flags = 0x30}, vsm = {
    metadata = 0x3638302e312c302e, data = {0x31, 0x37, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
      0x31, 0x2c, 0x31, 0x2e, 0x30, 0x38, 0x36, 0x31, 0x37, 0x39 <repeats 11 times>, 0x2c, 0x31, 0x2e, 0x30, 0x38,
      0x36, 0x31, 0x37, 0x30, 0x30, 0x30, 0x30, 0x30}, size = 0x30, type = 0x30, flags = 0x30}, lmsg = {
    metadata = 0x3638302e312c302e, content = 0x3030303030303731, unused = {0x30, 0x30, 0x30, 0x30, 0x31, 0x2c, 0x31,
      0x2e, 0x30, 0x38, 0x36, 0x31, 0x37, 0x39 <repeats 11 times>, 0x2c, 0x31, 0x2e, 0x30, 0x38, 0x36, 0x31, 0x37,
      0x30, 0x30, 0x30, 0x30, 0x30, 0x30}, type = 0x30, flags = 0x30}, cmsg = {metadata = 0x3638302e312c302e,
    data = 0x3030303030303731, size = 0x2e312c3130303030, unused = {0x30, 0x38, 0x36, 0x31, 0x37,
      0x39 <repeats 11 times>, 0x2c, 0x31, 0x2e, 0x30, 0x38, 0x36, 0x31, 0x37, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
    type = 0x30, flags = 0x30}, delimiter = {metadata = 0x3638302e312c302e, unused = {0x31, 0x37, 0x30, 0x30, 0x30,
      0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x31, 0x2c, 0x31, 0x2e, 0x30, 0x38, 0x36, 0x31, 0x37,
      0x39 <repeats 11 times>, 0x2c, 0x31, 0x2e, 0x30, 0x38, 0x36, 0x31, 0x37, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
    type = 0x30, flags = 0x30}}

(gdb) x/32w (void*) &u
0x7f3e18199b58:    0x312c302e    0x3638302e    0x30303731    0x30303030
0x7f3e18199b68:    0x30303030    0x2e312c31    0x31363830    0x39393937
0x7f3e18199b78:    0x39393939    0x39393939    0x302e312c    0x37313638
0x7f3e18199b88:    0x30303030    0x30303030    0x18312230    0x00007f3e
0x7f3e18199b98:    0x183182ae    0x00007f3e    0x00000000    0x00000000
0x7f3e18199ba8:    0x18339540    0x00007f3e    0x18316290    0x00007f3e
0x7f3e18199bb8:    0x000000a0    0x00000000    0x1819a060    0x00007f3e
0x7f3e18199bc8:    0x18339660    0x00007f3e    0x3915f100    0x00007f3e

Here's the threads info.

(gdb) info threads
  Id   Target Id         Frame
  20   Thread 0x7f3e39160700 (LWP 28695) 0x00007f3e7587cd99 in zmq::ypipe_t<zmq::msg_t, 256>::write (
    this=0x7f3e182a49a0, value_=..., incomplete_=<optimized out>) at bundled/zeromq/src/ypipe.hpp:86
  19   Thread 0x7f3e5c1ac700 (LWP 28693) 0x00007f3e81b75b13 in epoll_wait () at ../sysdeps/unix/syscall-template.S:81
  18   Thread 0x7f3e3615a700 (LWP 28701) 0x00007f3e7587cd99 in zmq::ypipe_t<zmq::msg_t, 256>::write (
    this=0x7f3e18614780, value_=..., incomplete_=<optimized out>) at bundled/zeromq/src/ypipe.hpp:86
  17   Thread 0x7f3e67618700 (LWP 28692) pthread_barrier_wait ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
  16   Thread 0x7f3e67e19700 (LWP 28691) pthread_barrier_wait ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
  15   Thread 0x7f3e7796c700 (LWP 28688) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  14   Thread 0x7f3e7816d700 (LWP 28687) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  13   Thread 0x7f3e7896e700 (LWP 28686) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  12   Thread 0x7f3e7916f700 (LWP 28685) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  11   Thread 0x7f3e3695b700 (LWP 28700) __memmove_ssse3_back ()
    at ../sysdeps/x86_64/multiarch/memcpy-ssse3-back.S:1627
  10   Thread 0x7f3e79970700 (LWP 28684) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  9    Thread 0x7f3e7a171700 (LWP 28683) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  8    Thread 0x7f3e7b173700 (LWP 28681) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  7    Thread 0x7f3e7a972700 (LWP 28682) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  6    Thread 0x7f3e3715c700 (LWP 28699) sem_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S:85
  5    Thread 0x7f3e82272740 (LWP 28677) 0x00007f3e81b6812d in poll () at ../sysdeps/unix/syscall-template.S:81
  4    Thread 0x7f3e3795d700 (LWP 28698) 0x00007f3e81b6cda3 in select () at ../sysdeps/unix/syscall-template.S:81
  3    Thread 0x7f3e3815e700 (LWP 28697) __memmove_ssse3_back ()
    at ../sysdeps/x86_64/multiarch/memcpy-ssse3-back.S:1619
  2    Thread 0x7f3e3895f700 (LWP 28696) 0x00000000005506c7 in PyObject_Free ()
* 1    Thread 0x7f3e5b9ab700 (LWP 28694) 0x00007f3e81ab1cc9 in __GI_raise (sig=sig@entry=6)
    at ../nptl/sysdeps/unix/sysv/linux/raise.c:56

I suspected thread 2 which is freeing something, but it is completely unrelated to the main issue.

(gdb) bt
#0  0x00000000005506c7 in PyObject_Free ()
#1  0x00007f3e808080b0 in NpyIter_Deallocate (iter=0x7f3e5ad961d8) at numpy/core/src/multiarray/nditer_constr.c:698
#2  0x00007f3e804c48e7 in PyUFunc_ReduceWrapper (operand=operand@entry=0x7f3e5ad84c60, out=out@entry=0x0,
    wheremask=wheremask@entry=0x0, operand_dtype=operand_dtype@entry=0x7f3e80a93020 <BOOL_Descr>,
    result_dtype=result_dtype@entry=0x7f3e80a93020 <BOOL_Descr>, casting=casting@entry=NPY_UNSAFE_CASTING,
    axis_flags=axis_flags@entry=0x7f3e3895d450 "\001", reorderable=reorderable@entry=1, keepdims=keepdims@entry=0,
    subok=subok@entry=0, assign_identity=assign_identity@entry=0x7f3e804f68d0 <assign_reduce_identity_zero>,
    loop=loop@entry=0x7f3e804f80d0 <reduce_loop>, data=data@entry=0xaff180, buffersize=8192,
    funcname=funcname@entry=0x7f3e805052f9 "logical_or") at numpy/core/src/umath/reduction.c:554
#3  0x00007f3e804f90f9 in PyUFunc_Reduce (keepdims=0, odtype=<optimized out>, axes=0x7f3e3895d150,
    naxes=<optimized out>, out=0x0, arr=0x7f3e5ad84c60, ufunc=0xaff180) at numpy/core/src/umath/ufunc_object.c:2884
#4  PyUFunc_GenericReduction (ufunc=<optimized out>, args=<optimized out>, kwds=<optimized out>,
    operation=<optimized out>) at numpy/core/src/umath/ufunc_object.c:3854
#5  0x000000000049968d in PyEval_EvalFrameEx ()
#6  0x00000000004a1c9a in ?? ()
#7  0x0000000000505f96 in PyObject_Call ()
#8  0x00007f3e807f02e3 in forward_ndarray_method (self=0x7f3e5ad84c60, args=0x7f3e82232050, kwds=0x0,
    forwarding_callable=0x7f3e804ae668) at numpy/core/src/multiarray/methods.c:101
#9  0x000000000049968d in PyEval_EvalFrameEx ()
#10 0x0000000000499ef2 in PyEval_EvalFrameEx ()
#11 0x00000000004a090c in PyEval_EvalCodeEx ()
#12 0x0000000000499a52 in PyEval_EvalFrameEx ()
#13 0x00000000004a090c in PyEval_EvalCodeEx ()
#14 0x0000000000499a52 in PyEval_EvalFrameEx ()
#15 0x00000000004a090c in PyEval_EvalCodeEx ()
#16 0x000000000049ab45 in PyEval_EvalFrameEx ()
#17 0x00000000004a1c9a in ?? ()
#18 0x00000000004dfe94 in ?? ()
#19 0x0000000000505f96 in PyObject_Call ()
#20 0x00000000004cac9f in ?? ()
#21 0x000000000049a974 in PyEval_EvalFrameEx ()
#22 0x00000000004a2d2e in ?? ()
#23 0x000000000049990f in PyEval_EvalFrameEx ()
#24 0x00000000004a1c9a in ?? ()
#25 0x0000000000505f96 in PyObject_Call ()
#26 0x000000000049b07a in PyEval_EvalFrameEx ()
#27 0x0000000000499ef2 in PyEval_EvalFrameEx ()
#28 0x0000000000499ef2 in PyEval_EvalFrameEx ()
#29 0x00000000004a1c9a in ?? ()
#30 0x00000000004dfe94 in ?? ()
#31 0x00000000004dc9cb in PyEval_CallObjectWithKeywords ()
#32 0x0000000000588b12 in ?? ()
#33 0x00007f3e81e48182 in start_thread (arg=0x7f3e3895f700) at pthread_create.c:312
#34 0x00007f3e81b7547d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111

Any idea?

umitanuki commented 8 years ago

Another instance.

(gdb) bt
#0  0x00007f0b4eb06cc9 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
#1  0x00007f0b4eb0a0d8 in __GI_abort () at abort.c:89
#2  0x00007f0b428b0949 in zmq::zmq_abort (errmsg_=errmsg_@entry=0x7f0b428e81ee "check ()")
    at bundled/zeromq/src/err.cpp:84
#3  0x00007f0b428a71a7 in zmq::msg_t::size (this=0x7f0ae8a050f0) at bundled/zeromq/src/msg.cpp:248
#4  0x00007f0b428e5d62 in zmq::v2_encoder_t::message_ready (this=0x7f0ae8a7a2c0)
    at bundled/zeromq/src/v2_encoder.cpp:53
#5  0x00007f0b428abf21 in zmq::stream_engine_t::out_event (this=0x7f0ae8a050d0)
    at bundled/zeromq/src/stream_engine.cpp:363
#6  0x00007f0b428bd6b3 in zmq::session_base_t::read_activated (this=0x7f0ae8a05540, pipe_=0x7f0ae8a10cb0)
    at bundled/zeromq/src/session_base.cpp:264
#7  0x00007f0b428cf40c in zmq::io_thread_t::in_event (this=0x3dc7b60) at bundled/zeromq/src/io_thread.cpp:83
#8  0x00007f0b428df82e in zmq::epoll_t::loop (this=0x3dc7de0) at bundled/zeromq/src/epoll.cpp:176
#9  0x00007f0b428c47a0 in thread_routine (arg_=0x3dc7e60) at bundled/zeromq/src/thread.cpp:96
#10 0x00007f0b4ee9d182 in start_thread (arg=0x7f0b28a00700) at pthread_create.c:312
#11 0x00007f0b4ebca47d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
(gdb) info threads
  Id   Target Id         Frame
  20   Thread 0x7f0b039b0700 (LWP 30296) 0x00007f0b4eea43bd in read () at ../sysdeps/unix/syscall-template.S:81
  19   Thread 0x7f0b459c3700 (LWP 30282) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  18   Thread 0x7f0b451c2700 (LWP 30283) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  17   Thread 0x7f0b449c1700 (LWP 30284) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  16   Thread 0x7f0b34e6e700 (LWP 30287) pthread_barrier_wait ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
  15   Thread 0x7f0b29201700 (LWP 30289) 0x00007f0b4ebcab13 in epoll_wait () at ../sysdeps/unix/syscall-template.S:81
  14   Thread 0x7f0b061b5700 (LWP 30291) sem_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S:85
  13   Thread 0x7f0b471c6700 (LWP 30279) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  12   Thread 0x7f0b3466d700 (LWP 30288) pthread_barrier_wait ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_barrier_wait.S:71
  11   Thread 0x7f0b481c8700 (LWP 30277) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  10   Thread 0x7f0b461c4700 (LWP 30281) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  9    Thread 0x7f0b4f2c7740 (LWP 30273) 0x00007f0b4ebbd12d in poll () at ../sysdeps/unix/syscall-template.S:81
  8    Thread 0x7f0b479c7700 (LWP 30278) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  7    Thread 0x7f0b469c5700 (LWP 30280) pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  6    Thread 0x7f0b031af700 (LWP 30297) 0x00007f0b4ebc1da3 in select () at ../sysdeps/unix/syscall-template.S:81
  5    Thread 0x7f0b051b3700 (LWP 30293) 0x00007f0b4ebc1da3 in select () at ../sysdeps/unix/syscall-template.S:81
  4    Thread 0x7f0b049b2700 (LWP 30294) 0x00007f0b49250606 in __pyx_pf_6pandas_5algos_196is_monotonic_int64 (
    __pyx_self=<optimized out>, __pyx_v_timelike=1, __pyx_v_arr=<optimized out>) at pandas/algos.c:74482
  3    Thread 0x7f0b041b1700 (LWP 30295) 0x00007f0b4ebc1da3 in select () at ../sysdeps/unix/syscall-template.S:81
  2    Thread 0x7f0b059b4700 (LWP 30292) sem_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S:85
* 1    Thread 0x7f0b28a00700 (LWP 30290) 0x00007f0b4eb06cc9 in __GI_raise (sig=sig@entry=6)
    at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
(gdb) p u
$1 = {base = {metadata = 0x0, unused = '\000' <repeats 45 times>, type = 0 '\000', flags = 0 '\000'}, vsm = {
    metadata = 0x0, data = '\000' <repeats 44 times>, size = 0 '\000', type = 0 '\000', flags = 0 '\000'}, lmsg = {
    metadata = 0x0, content = 0x0, unused = '\000' <repeats 37 times>, type = 0 '\000', flags = 0 '\000'}, cmsg = {
    metadata = 0x0, data = 0x0, size = 0, unused = '\000' <repeats 29 times>, type = 0 '\000', flags = 0 '\000'},
  delimiter = {metadata = 0x0, unused = '\000' <repeats 45 times>, type = 0 '\000', flags = 0 '\000'}}
umitanuki commented 8 years ago

I found the socket is not thread-safe, as described in the doc. Using forwarder device, I'm not seeing this issue anymore.