this_thread::sleep_for(short duration) can sleep forever on Linux


Bugzilla Link	PR48560
Status	NEW
Importance	P enhancement
Reported by	Paul Groke (paul.groke@dynatrace.com)
Reported on	2020-12-19 10:56:21 -0800
Last modified on	2020-12-19 11:00:08 -0800
Version	11.0
Hardware	All Linux
CC	llvm-bugs@lists.llvm.org, mclow.lists@gmail.com
Fixed by commit(s)
Attachments
Blocks
Blocked by
See also
I was able to reproduce the problem with the following program (see below). It
won't reproduce well on a loaded machine and you may have to fiddle with the
spin counts to get a successful repro.

In my tests, I was able to reproduce the problem on:
* Ubuntu 20.04 LTS x86-64
* Cent OS 5 x86-64, PPC-64, PPCLE-64, ARM-64 and s390x

Note that this cannot be reproduced with WSL1 since Microsoft's implementation
of the syscall does not have this bug. It can be reproduced with WSL2 though.

Output from my machine:

Remaining: 0.020000000 - signal count: 0
Remaining: 0.312026100 - signal count: 6322
Remaining: 0.610834500 - signal count: 12729
Remaining: 0.907113100 - signal count: 19096
Remaining: 1.202000100 - signal count: 25452
Remaining: 1.499204500 - signal count: 31842
Remaining: 1.799767600 - signal count: 38277
Remaining: 2.095204700 - signal count: 44636
(...snip...)
Remaining: 86.829101800 - signal count: 1865855
Remaining: 87.126722500 - signal count: 1872251
Remaining: 87.427912300 - signal count: 1878704
Remaining: 87.730148000 - signal count: 1885171
Remaining: 88.029529500 - signal count: 1891599
Remaining: 88.329424200 - signal count: 1898039
Remaining: 88.629499900 - signal count: 1904482
Remaining: 88.926003300 - signal count: 1910861
ERROR: timeout. sleep thread seems hung.

---

#include <signal.h>
#include <time.h>
#include <atomic>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <thread>

using namespace std::chrono_literals;

constexpr auto acq = std::memory_order_acquire;
constexpr auto rlx = std::memory_order_relaxed;

constexpr uint64_t sigHandlerSpin = 10'000;
constexpr uint64_t sigSenderSpin = 100;
constexpr int64_t sleepTimeNs = 20'000'000; // 20ms
constexpr int64_t reportIntervalNs = 200'000'000; // 200ms
constexpr auto timeout = 60s;

thread_local bool tls_isSleepThread{false};
std::atomic<uint32_t> s_signalCount{0};
std::atomic<bool> s_signalPending{false};

struct timespec makeTs(time_t sec, int64_t nsec) {
    struct timespec ts = {};
    ts.tv_sec = sec;
    ts.tv_nsec = nsec;
    return ts;
}

void spinWait(uint64_t spinCount) {
    [[maybe_unused]] volatile int32_t dummy = 0;
    volatile int32_t divisor = 123456789;
    volatile int32_t dividend = 33;
    for (uint64_t i = 0; i < spinCount; i++)
        dummy = divisor / dividend;
}

void signalHandler(int) {
    if (tls_isSleepThread) {
        s_signalCount++;
        spinWait(sigHandlerSpin);
    }
    s_signalPending = false;
};

void spinBarrier(std::atomic<uint32_t>& barrier) {
    barrier--;
    while (barrier.load(acq)) {
        // wait...
    }
}

struct State {
    std::atomic<uint32_t> startBarrier{3};
    std::atomic<bool> stop{false};
    std::atomic<uint32_t> reportGen{0};
    std::atomic<int64_t> reportSec{0};
    std::atomic<int64_t> reportNsec{0};
};

[[maybe_unused]] void nanosleepThreadFn(State& state) {
    tls_isSleepThread = true;
    auto sleepTime = makeTs(0, sleepTimeNs);
    state.reportSec.store(sleepTime.tv_sec, rlx);
    state.reportNsec.store(sleepTime.tv_nsec, rlx);

    spinBarrier(state.startBarrier);

    while (!state.stop) {
        if (::nanosleep(&sleepTime, &sleepTime) != -1)
            break;
        if (errno != EINTR)
            break;

        auto gen = state.reportGen.load(acq);
        state.reportSec.store(sleepTime.tv_sec, rlx);
        state.reportNsec.store(sleepTime.tv_nsec, rlx);
        state.reportGen.store(gen + 1);
    }

    state.stop = true;
}

[[maybe_unused]] void stdSleepThreadFn(State& state) {
    tls_isSleepThread = true;
    auto sleepTime = 1ns * sleepTimeNs;

    spinBarrier(state.startBarrier);
    std::this_thread::sleep_for(sleepTime);

    state.stop = true;
}

void reportThreadFn(State& state) {
    auto const delay = makeTs(0, reportIntervalNs);

    spinBarrier(state.startBarrier);
    while (!state.stop) {
        // try to get consistent snapshot...
        int64_t sec;
        int64_t nsec;
        for (size_t i = 0; i < 10; i++) {
            auto const gen = state.reportGen.load(acq);
            spinWait(10);
            sec = state.reportSec.load(rlx);
            nsec = state.reportNsec.load(rlx);
            spinWait(10);
            auto const gen2 = state.reportGen.load(acq);
            if (gen2 == gen)
                break;
        }

        std::cout << "Remaining: " << sec << "."
                  << std::setfill('0') << std::setw(9) << nsec
                  << " - signal count: " << s_signalCount
                  << std::endl;
        nanosleep(&delay, nullptr);
    }
}

int runTest() {
    s_signalCount = 0;
    s_signalPending = false;

    // install SIGURG handler...
    struct sigaction oldSigUrgAction = {};
    struct sigaction sigUrgAction = {};
    sigUrgAction.sa_handler = signalHandler;
    sigaction(SIGURG, &sigUrgAction, &oldSigUrgAction);

    auto* const state = new State{};

    std::thread sleepThread{[state] { nanosleepThreadFn(*state); }};
    std::thread reportThread{[state] { reportThreadFn(*state); }};

    auto const sleepThreadHandle = sleepThread.native_handle();

    spinBarrier(state->startBarrier);

    using Clock = std::chrono::steady_clock;
    auto const t0 = Clock::now();

    // Hammer the sleep thread with signals...
    bool isTimedOut = false;
    while (!state->stop && !isTimedOut) {
        if (!s_signalPending) {
            s_signalPending = true;
            spinWait(sigSenderSpin);
            pthread_kill(sleepThreadHandle, SIGURG);
        }

        if (Clock::now() - t0 > timeout) {
            isTimedOut = true;
            break;
        }
    }

    state->stop = true;
    reportThread.join();

    if (isTimedOut) {
        std::cout << "ERROR: timeout. sleep thread seems hung." << std::endl;
        // Leak sleepThread and state to avoid crash
        sleepThread.detach();
    } else {
        sleepThread.join();
        delete state;
    }

    // Restore old SIGCHLD action.
    sigaction(SIGURG, &oldSigUrgAction, nullptr);

    return isTimedOut ? 1 : 0;
}

int main() {
    return runTest();
}
Quuxplusone / LLVMBugzillaTest

this_thread::sleep_for(short duration) can sleep forever on Linux #47530