sipwise / rtpengine

The Sipwise media proxy for Kamailio
GNU General Public License v3.0
763 stars 360 forks source link

rtpengine mr10.5 crashed in __dtx_send_later #1817

Open andingv opened 2 months ago

andingv commented 2 months ago

rtpengine version the issue has been seen with

mr10.5.1

Used distribution and its version

Ubuntu 18.04.6 LTS

Linux kernel version used

No response

CPU architecture issue was seen on (see uname -m)

x86_64

Expected behaviour you didn't see

No response

Unexpected behaviour you saw

I am seeing a seldom crash, last week two hosts crashed independently.

Steps to reproduce the problem

Not sure, but there's one idea . Am I seeing it right that some endpoint is sending the RTP packet with huge ts causing this behavior? Program terminated with signal SIGSEGV, Segmentation fault.

0 0x000055bed2c174a5 in decoder_input_data_ptime (dec=0x7fc6ec17e690, data=0xc0089a793fe274df, ts=4715930405566051902, ptime=ptime@entry=0x7fc7bb5f9288,

callback=0x55bed2bebbe0 <packet_decoded_fifo>, u1=0x7fc73c089d90, u2=0x7fc7bb5f91c0) at codeclib.c:936

Additional program output to the terminal or logs illustrating the issue

Here is the backtrace:

(gdb) 
#0  0x000055bed2c174a5 in decoder_input_data_ptime (dec=0x7fc6ec17e690, data=0xc0089a793fe274df, ts=4715930405566051902, ptime=ptime@entry=0x7fc7bb5f9288, 
    callback=0x55bed2bebbe0 <packet_decoded_fifo>, u1=0x7fc73c089d90, u2=0x7fc7bb5f91c0) at codeclib.c:936
No locals.
#1  0x000055bed2bef42f in __rtp_decode (ch=<optimized out>, input_ch=0x7fc73c089d90, packet=<optimized out>, mp=0x7fc7bb5f91c0) at codec.c:3682
        ret = 0
#2  0x000055bed2bec9e7 in __dtx_send_later (ct=0x7fc6a401d5f0) at codec.c:3072
        dtxb = 0x7fc6a401d5f0
        mp_copy = {raw = {s = 0x0, len = 0}, fsin = {address = {family = 0x55bed2e51b80 <__socket_families>, u = {ipv4 = {s_addr = 113162329}, ipv6 = {__in6_u = {
                    __u6_addr8 = "Y\270\276\006", '\000' <repeats 11 times>, __u6_addr16 = {47193, 1726, 0, 0, 0, 0, 0, 0}, __u6_addr32 = {113162329, 0, 0, 0}}}}}, port = 26856}, tv = {
            tv_sec = 1713279718, tv_usec = 40418}, sfd = 0x7fc630056450, call = 0x7fc6fc0105a0, stream = 0x7fc71c0900d0, media = 0x7fc66c0d0810, media_out = 0x7fc62c0db7b0, sink = {
            sink = 0x7fc6cc111080, handler = 0x55bed2e486d0 <__sh_noop_rtp>, kernel_output_idx = -1, rtcp_only = 0, transcoding = 1}, rtp = 0x7fc48c038080, rtcp = 0x0, ssrc_in = 0x7fc69814a570, 
          ssrc_out = 0x7fc69814cbf0, payload = {s = 0x0, len = 0}, packets_out = {head = 0x0, tail = 0x0, length = 0}, ptime = -1}
        ret = 0
        ts = 4715930405566051902
        p_left = 7
        tv_diff = 130003
        ts_diff = 4715930405563795742
        dtxp = <optimized out>
        call = 0x7fc6fc0105a0
        ch = 0x7fc73c089d90
        ps = <optimized out>
        input_ch = 0x7fc73c089d90
        ptime = 20
#3  0x000055bed2bfb0c8 in timerthread_run (p=0x55bed2e53bc0 <codec_timers_thread>) at timerthread.c:62
        tt_obj = <optimized out>
        sleeptime = <optimized out>
        tv = <optimized out>
        tt = <optimized out>
        waker = {lock = 0x55bed2e53bc8 <codec_timers_thread+8>, cond = 0x55bed2e53bf0 <codec_timers_thread+48>}
#4  0x000055bed2b79f99 in thread_detach_func (d=0x55bed4f94e60) at aux.c:250
        __cancel_buf = {__cancel_jmp_buf = {{__cancel_jmp_buf = {94278105255520, -5219573935894328112, 140495818822912, 0, 140482340461232, 140720364687680, 5196924824070491344, 
                2084599588019342544}, __mask_was_saved = 0}}, __pad = {0x7fc7bb5f9450, 0x0, 0x0, 0x0}}
        __cancel_routine = 0x55bed2b79e00 <thread_detach_cleanup>
        __cancel_arg = 0x55bed4f94e60
        __not_first_call = <optimized out>
        dt = 0x55bed4f94e60
        t = <optimized out>
        scheduler = <optimized out>
#5  0x00007fc8104596db in start_thread (arg=0x7fc7bb5fa700) at pthread_create.c:463
        pd = 0x7fc7bb5fa700
        now = <optimized out>
        unwind_buf = {cancel_jmp_buf = {{jmp_buf = {140495818827520, -5219573935894328112, 140495818822912, 0, 94278105255520, 140720364687680, 5196924823827221712, 5188280515029208272}, 
              mask_was_saved = 0}}, priv = {pad = {0x0, 0x0, 0x0, 0x0}, data = {prev = 0x0, cleanup = 0x0, canceltype = 0}}}
        not_first_call = <optimized out>
#6  0x00007fc80fcf061f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95

something more:

(gdb) fr 2
#2  0x000055bed2bec9e7 in __dtx_send_later (ct=0x7fc6a401d5f0) at codec.c:3072
3072    in codec.c
(gdb) p *enc
No symbol "enc" in current context.
(gdb) p *ch
$1 = {h = {obj = {ref = 21, free_func = 0x55bed2be91c0 <__free_ssrc_handler>, size = 376}, lock = {__data = {__lock = 0, __count = 0, __owner = 0, __nusers = 0, __kind = 0, __spins = 0, 
        __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}, ssrc = 3270805003, last_used = 1713279718}, handler = 0x7fc6bc0fb070, 
  decoder = 0x7fc6ec17e690, encoder = 0x7fc6b8110230, encoder_format = {clockrate = 16000, channels = 1, format = 1}, bitrate = 14250, ptime = 20, bytes_per_packet = 80, first_ts = 4800, 
  last_ts = 2256640, first_send = {tv_sec = 1713279436, tv_usec = 750421}, first_send_ts = 4160, output_skew = 0, sample_buffer = 0x7fc728157920, dtx_buffer = 0x7fc6a401d5f0, dtmf_dsp = 0x0, 
  dtmf_resampler = {swresample = 0x0, no_filter = false}, dtmf_format = {clockrate = 0, channels = 0, format = 0}, dtmf_ts = 0, last_dtmf_event_ts = 0, dtmf_events = {head = 0x0, tail = 0x0, 
    length = 0}, dtmf_event = {code = 0, volume = 0, ts = 0, rand_code = 0, block_dtmf = BLOCK_DTMF_OFF}, dtmf_state = {code = 0, volume = 0, ts = 0, rand_code = 0, block_dtmf = BLOCK_DTMF_OFF}, 
  silence_events = {head = 0x0, tail = 0x0, length = 0}, dtmf_start_ts = 0, dtmf_first_duration = 0, skip_pts = 0, rtp_mark = 0}


### Anything else?

The machines are doing transcoding and AMR DTX so the load is quite high.
We have manually picked the fix for #1591 to this locally built version.
rfuchs commented 2 months ago

A large timestamp shouldn't be a problem, but the data pointer looks corrupted, which probably means ts was corrupted as well. Possibly a double-free of the packet object or something like it. What are the contents of *dtxb in frame 2?

andingv commented 2 months ago

ok thank you, here you go:

(gdb) fr 2
#2  0x000055bed2bec9e7 in __dtx_send_later (ct=0x7fc6a401d5f0) at codec.c:3072
3072    codec.c: No such file or directory.
(gdb) p *dtxb
$1 = {ct = {tt_obj = {obj = {ref = 2, free_func = 0x55bed2be9390 <__dtx_free>, size = 416}, tt = 0x55bed2e53bc0 <codec_timers_thread>, next_check = {tv_sec = 0, tv_usec = 0}, last_run = {
        tv_sec = 1713279718, tv_usec = 170421}}, next = {tv_sec = 1713279718, tv_usec = 170421}, timer_func = 0x55bed2bec4c0 <__dtx_send_later>}, lock = {__data = {__lock = 0, __count = 0, 
      __owner = 0, __nusers = 0, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 0x0}}, __size = '\000' <repeats 39 times>, __align = 0}, csh = 0x7fc73c089d90, ptime = 20, 
  tspp = 160, clockrate = 8000, call = 0x7fc6fc0105a0, packets = {head = 0x7fbfd406f820, tail = 0x7fbfeffbbd00, length = 7}, last_mp = {raw = {s = 0x0, len = 0}, fsin = {address = {
        family = 0x55bed2e51b80 <__socket_families>, u = {ipv4 = {s_addr = 113162329}, ipv6 = {__in6_u = {__u6_addr8 = "Y\270\276\006", '\000' <repeats 11 times>, __u6_addr16 = {47193, 1726, 0, 0, 
                0, 0, 0, 0}, __u6_addr32 = {113162329, 0, 0, 0}}}}}, port = 26856}, tv = {tv_sec = 1713279718, tv_usec = 40418}, sfd = 0x7fc630056450, call = 0x7fc6fc0105a0, stream = 0x7fc71c0900d0, 
    media = 0x7fc66c0d0810, media_out = 0x7fc62c0db7b0, sink = {sink = 0x7fc6cc111080, handler = 0x55bed2e486d0 <__sh_noop_rtp>, kernel_output_idx = -1, rtcp_only = 0, transcoding = 1}, 
    rtp = 0x7fc48c0df360, rtcp = 0x0, ssrc_in = 0x7fc69814a570, ssrc_out = 0x7fc69814cbf0, payload = {s = 0x0, len = 0}, packets_out = {head = 0x0, tail = 0x0, length = 0}, ptime = 0}, 
  head_ts = 4715930405566051902, ssrc = 3270805003, start = 1713279718}