timescale / timescaledb

An open-source time-series SQL database optimized for fast ingest and complex queries. Packaged as a PostgreSQL extension.
https://www.timescale.com/
Other
17.97k stars 883 forks source link

[Bug]: Crash in bgw_scheduler test inside ResetLatch during CI run #6543

Open erimatnor opened 10 months ago

erimatnor commented 10 months ago

What type of bug is this?

Crash

What subsystems and features are affected?

Background worker

What happened?

Crashed during a CI run

TimescaleDB version affected

2.14-dev

PostgreSQL version used

15.5

What operating system did you use?

Ubuntu 22.04

What installation method did you use?

Source

What platform did you run on?

Other

Relevant log output and stack trace

(gdb) +bt full
#0  __pthread_kill_implementation (no_tid=0, signo=6, 
    threadid=139666941264384) at ./nptl/pthread_kill.c:44
        tid = <optimized out>
        ret = 0
        pd = 0x7f06be6bc200
Reading in symbols for abort.c...
        old_mask = {__val = {140726388645160, 140726388645168, 140726388645176, 139666943001130, 
            140726388645192, 94483703240837, 140726388637936, 
            140726388637928, 0, 139666933630630, 94483702132656, 
            838725508530176, 0, 112, 128, 140726388638128}}
        ret = <optimized out>
        pd = <optimized out>
        old_mask = <optimized out>
        ret = <optimized out>
        tid = <optimized out>
        ret = <optimized out>
        resultvar = <optimized out>
        resultvar = <optimized out>
        __arg3 = <optimized out>
        __arg2 = <optimized out>
        __arg1 = <optimized out>
        _a3 = <optimized out>
        _a2 = <optimized out>
        _a1 = <optimized out>
        __futex = <optimized out>
        resultvar = <optimized out>
        __arg3 = <optimized out>
        __arg2 = <optimized out>
        __arg1 = <optimized out>
        _a3 = <optimized out>
        _a2 = <optimized out>
        _a1 = <optimized out>
        __futex = <optimized out>
        __private = <optimized out>
        __oldval = <optimized out>
        result = <optimized out>
#1  __pthread_kill_internal (signo=6, threadid=139666941264384)
    at ./nptl/pthread_kill.c:78
No locals.
#2  __GI___pthread_kill (threadid=139666941264384, signo=signo@entry=6)
    at ./nptl/pthread_kill.c:89
No locals.
Reading in symbols for assert.c...
#3  0x00007f06bde42476 in __GI_raise (sig=sig@entry=6) at ../sysdeps/posix/raise.c:26
        ret = <optimized out>
#4  0x00007f06bde287f3 in __GI_abort () at ./stdlib/abort.c:79
        save_stage = 1
Reading in symbols for latch.c...
        act = {__sigaction_handler = {sa_handler = 0x7ffd6a67c1be, 
            sa_sigaction = 0x7ffd6a67c1be}, sa_mask = {__val = {2, 
              139666934046263, 1, 139666934053069, 3, 140726388638116, 12, 
              139666934053073, 2, 7306304475261219008, 3834876900996100962, 
              140726388638208, 3833236429663910835, 140726388638224, 
              4538996557681408000, 140726388645192}}, sa_flags = 686, 
          sa_restorer = 0x12}
        sigs = {__val = {32, 4, 18446744073709551576, 139666943185632, 
            140726388647078, 94483701796864, 94483702263956, 94483703240800, 
            140726388647078, 40, 139666934040530, 1, 94483702263956, 6, 
            139666934053061, 3}}
Reading in symbols for /home/runner/work/timescaledb-private/timescaledb-private/test/src/bgw/params.c...
#5  0x000055eeb3dbb188 in ExceptionalCondition (
    conditionName=conditionName@entry=0x55eeb3f42323 "latch->owner_pid == MyProcPid", errorType=errorType@entry=0x55eeb3e1700b "FailedAssertion", 
    fileName=fileName@entry=0x55eeb3f42257 "latch.c", 
    lineNumber=lineNumber@entry=686) at assert.c:69
No locals.
Reading in symbols for /home/runner/work/timescaledb-private/timescaledb-private/test/src/bgw/timer_mock.c...
#6  0x000055eeb3c5ef29 in ResetLatch (latch=<optimized out>) at latch.c:686
No locals.
#7  0x00007f06b43633aa in ts_reset_and_wait_timer_latch ()
    at /home/runner/work/timescaledb-private/timescaledb-private/test/src/bgw/params.c:205
Reading in symbols for /home/runner/work/timescaledb-private/timescaledb-private/src/bgw/timer.c...
        do_close = true
        wrapper = 0x7f06be848000
        __func__ = "ts_reset_and_wait_timer_latch"
Reading in symbols for /home/runner/work/timescaledb-private/timescaledb-private/src/bgw/scheduler.c...
#8  0x00007f06b435fef8 in mock_wait (until=28863000000)
    at /home/runner/work/timescaledb-private/timescaledb-private/test/src/bgw/timer_mock.c:71
        __func__ = "mock_wait"
        lc = 0x0
Reading in symbols for /home/runner/work/timescaledb-private/timescaledb-private/test/src/bgw/scheduler_mock.c...
#9  0x00007f06b42eb3f4 in ts_timer_wait (until=28863000000)
    at /home/runner/work/timescaledb-private/timescaledb-private/src/bgw/timer.c:106
No locals.
Reading in symbols for bgworker.c...
#10 0x00007f06b42ea7a9 in ts_bgw_scheduler_process (run_for_interval_ms=-1, 
    bgw_register=0x7f06b435fb86 <ts_timer_mock_register_bgw_handle>)
    at /home/runner/work/timescaledb-private/timescaledb-private/src/bgw/scheduler.c:785

How can we reproduce the bug?

It happened during a CI run; there are no specific steps to reproduce.

mkindahl commented 10 months ago

This is an assertion failure: ResetLatch checks that the latch is owned by the process resetting it (latch->owner_pid == MyProcPid). Either the latch is being used incorrectly, or it was not initialized properly.