Chuyu-Team / YY-Thunks

Fix DecodePointer, EncodePointer, RegDeleteKeyEx etc. APIs not found in Windows XP RTM.
MIT License
569 stars 103 forks source link

使用 wait_for 等待 std::async 返回的 future,有概率导致程序卡死 #122

Open gexgd0419 opened 3 days ago

gexgd0419 commented 3 days ago

YY-Thunks 版本:1.1.2

使用 std::async 启动异步任务,并在返回的 std::future 上用 wait_for 反复检查任务的完成状态。这种做法在 XP 系统中有概率导致程序卡死,无法继续运行。

如下代码可复现此问题:

#include <iostream>
#include <future>
#include <random>

// Reproducer: repeatedly launch an async task of random duration and poll its
// future with a 50 ms wait_for until it stops reporting a timeout.
int main()
{
    std::random_device entropy;
    std::uniform_int_distribution<int> delayMs(50, 1000);
    while (true)
    {
        // Start a task that sleeps for a random 50-1000 ms.
        const auto sleepRandomly = [&]() {
            std::this_thread::sleep_for(std::chrono::milliseconds(delayMs(entropy)));
        };
        auto pending = std::async(std::launch::async, sleepRandomly);

        size_t ticks = 0;
        for (;;)
        {
            const std::future_status status = pending.wait_for(std::chrono::milliseconds(50));
            if (status != std::future_status::timeout)
                break;
            // Print an increasing counter to show the program is still running.
            std::cout << ++ticks << std::endl;
        }
    }
    return 0;
}
gexgd0419 commented 3 days ago

下面是程序卡死后,主线程的调用堆栈:(其他线程此时没有工作)

ntdll.dll!_KiFastSystemCallRet@0()
ntdll.dll!_NtWaitForKeyedEvent@16()
YYThunksDeadlockTest.exe!SleepConditionVariableSRW(_RTL_CONDITION_VARIABLE * ConditionVariable=0x00153460, _RTL_SRWLOCK * SRWLock=0x00153434, unsigned long dwMilliseconds=50, unsigned long Flags=0) 行 2332
YYThunksDeadlockTest.exe!Concurrency::details::stl_condition_variable_win7::wait_for(_Stl_critical_section * lock=0x00153430, unsigned int timeout=50) 行 29
YYThunksDeadlockTest.exe!_Cnd_timedwait_for(_Cnd_internal_imp_t * const cond=0x0015345c, _Mtx_internal_imp_t * const mtx=0x0015342c, const unsigned int target_ms=50) 行 53
[内联框架] YYThunksDeadlockTest.exe!std::condition_variable::wait_until(std::unique_lock<std::mutex> &) 行 600
[内联框架] YYThunksDeadlockTest.exe!std::condition_variable::wait_until(std::unique_lock<std::mutex> &) 行 613
[内联框架] YYThunksDeadlockTest.exe!std::condition_variable::wait_for(std::unique_lock<std::mutex> &) 行 578
YYThunksDeadlockTest.exe!std::_Associated_state<int>::_Wait_for<__int64,std::ratio<1,1000>>(const std::chrono::duration<__int64,std::ratio<1,1000>> & _Rel_time={...}) 行 260
[内联框架] YYThunksDeadlockTest.exe!std::_State_manager<int>::wait_for(const std::chrono::duration<__int64,std::ratio<1,1000>> &) 行 759
YYThunksDeadlockTest.exe!main() 行 17
[内联框架] YYThunksDeadlockTest.exe!invoke_main() 行 78
YYThunksDeadlockTest.exe!__scrt_common_main_seh() 行 288
kernel32.dll!_BaseProcessStart@4()

发现疑似和 std::condition_variable 有关,于是改成如下代码,发现也能在 wait_for 的位置触发卡死:

#include <iostream>
#include <future>
#include <random>

// Reproducer: wait on a std::condition_variable with a 50 ms timeout and a
// predicate while a helper thread sets the flag and notifies after a random
// 50-1000 ms delay.
int main()
{
    std::random_device entropy;
    std::uniform_int_distribution<int> delayMs(50, 1000);
    while (true)
    {
        // Declaration order is kept as-is: locals are destroyed in reverse
        // order, so the jthread is joined before the lock, the condition
        // variable and the mutex go away.
        std::mutex guard;
        std::condition_variable signal;
        std::unique_lock<std::mutex> held(guard);
        std::atomic_bool done = false;
        std::jthread notifier([&]() {
            const auto pause = std::chrono::milliseconds(delayMs(entropy));
            std::this_thread::sleep_for(pause);
            done.store(true, std::memory_order_release);
            signal.notify_all();
        });

        size_t ticks = 0;
        for (;;)
        {
            const bool finished = signal.wait_for(held, std::chrono::milliseconds(50),
                [&]() { return done.load(std::memory_order_acquire); });
            if (finished)
                break;
            // Each timed-out wait with the flag still clear prints the next number.
            std::cout << ++ticks << std::endl;
        }
    }
    return 0;
}
gexgd0419 commented 3 days ago

直接调用 Windows API 操作 CONDITION_VARIABLE 也会有同样的问题,程序会卡在 NtWaitForKeyedEvent 的位置,无论是配合 SRWLOCK 还是 CRITICAL_SECTION 都会出现同样的问题。

#include <iostream>
#include <thread>
#include <random>
#include <Windows.h>

int main()
{
    std::random_device rd;
    std::uniform_int_distribution<int> dist(50, 1000);
    for (;;)
    {
        SRWLOCK srw;
        InitializeSRWLock(&srw);
        CONDITION_VARIABLE cv;
        InitializeConditionVariable(&cv);
        AcquireSRWLockExclusive(&srw);
        std::atomic_bool flag = false;
        std::jthread thrd([&]() {
            Sleep(dist(rd));
            flag.store(true, std::memory_order_release);
            WakeAllConditionVariable(&cv);
        });
        size_t i = 0;
        while (!SleepConditionVariableSRW(&cv, &srw, 50, 0) && !flag.load(std::memory_order_acquire))
        {
            std::cout << ++i << std::endl;
        }
        ReleaseSRWLockExclusive(&srw);
    }
    return 0;
}
#include <iostream>
#include <thread>
#include <random>
#include <Windows.h>

int main()
{
    std::random_device rd;
    std::uniform_int_distribution<int> dist(50, 1000);
    for (;;)
    {
        CRITICAL_SECTION cs;
        InitializeCriticalSection(&cs);
        CONDITION_VARIABLE cv;
        InitializeConditionVariable(&cv);
        EnterCriticalSection(&cs);
        std::atomic_bool flag = false;
        std::jthread thrd([&]() {
            Sleep(dist(rd));
            flag.store(true, std::memory_order_release);
            WakeAllConditionVariable(&cv);
        });
        size_t i = 0;
        while (!SleepConditionVariableCS(&cv, &cs, 50) && !flag.load(std::memory_order_acquire))
        {
            std::cout << ++i << std::endl;
        }
        LeaveCriticalSection(&cs);
    }
    return 0;
}
gexgd0419 commented 3 days ago

目前可以把问题锁定在这个位置:

https://github.com/Chuyu-Team/YY-Thunks/blob/a41b77a3db2245375577422171665edb5b1600e0/src/Thunks/api-ms-win-core-synch.hpp#L2323-L2334

如果程序走到了里层的 if,没有设定超时的 NtWaitForKeyedEvent 就会把程序卡死。