dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License
15.19k stars 4.72k forks source link

[ARM/Linux] paltest_namedmutex_test1 hangs #6014

Closed jyoungyun closed 4 years ago

jyoungyun commented 8 years ago

The paltest_namedmutex_test1 hangs on Raspberry Pi2.

I will add comments soon if I take a hint to resolve it.

jyoungyun commented 8 years ago

The gdb info:

Program received signal SIGINT, Interrupt.
0xb6fa5094 in ?? () from /lib/libpthread.so.0
(gdb) bt full
#0  0xb6fa5094 in ?? () from /lib/libpthread.so.0
No symbol table info available.
dotnet/coreclr#1  0xb6f9ef98 in pthread_mutex_timedlock () from /lib/libpthread.so.0
No symbol table info available.
dotnet/coreclr#2  0x000ac05a in MutexHelpers::TryAcquireLock (mutex=0xb6ff2008, 
    timeoutMilliseconds=500)
    at /home/jyoung/git/dotnet/coreclr_upstream/src/pal/src/synchobj/mutex.cpp:863
        timeoutTime = {tv_sec = 1420095963, tv_nsec = 792755126}
        palError = 0
        lockResult = 568845
dotnet/coreclr#3  0x000adb0c in NamedMutexProcessData::TryAcquireLock (this=0x13e5b0, 
    timeoutMilliseconds=500)
    at /home/jyoung/git/dotnet/coreclr_upstream/src/pal/src/synchobj/mutex.cpp:1329
        sharedData = 0xb6ff2008
        result = MutexTryAcquireLockResult::TimedOut
        currentThread = 0x13e418
dotnet/coreclr#4  0x000bee40 in CorUnix::InternalWaitForMultipleObjectsEx (pThread=0x135298, 
    nCount=1, lpHandles=0xbefff6c8, bWaitAll=0, dwMilliseconds=500, 
    bAlertable=0)
jyoungyun commented 8 years ago
TestAssert(WaitForSingleObject(m, 0) == WAIT_TIMEOUT); // try to lock the mutex without waiting
TestAssert(WaitForSingleObject(m, g_expectedTimeoutMilliseconds) == WAIT_TIMEOUT); // try to lock the mutex with a timeout

The tc blocked if the timeout event occured repeatedly. When I commented out the second line, it works properly. Need to check the internal logic. (Even it was passed, the Abandon test failed again by the same token.)

adityamandaleeka commented 8 years ago

@kouvel

leemgs commented 8 years ago

Need to check the internal logic. (Even it was passed, the Abandon test failed again by the same token.)

Here is lowlevelrobustlock.c from glibc 2.21 - https://sourceware.org/git/?p=glibc.git;a=blob;f=nptl/lowlevelrobustlock.c;h=3a2725c47ea81e1c5cac946532629db276e9411e;hb=4e42b5b8f89f0e288e68be7ad70f9525aebc2cff

As we can see, the native posix thread library (NPTL) of glibc correctly return value in case of time out as following:

  84 #if (!defined __ASSUME_FUTEX_CLOCK_REALTIME \
  85      || !defined lll_futex_timed_wait_bitset)
  86       struct timeval tv;
  87       struct timespec rt;
  88 
  89       /* Get the current time.  */
  90       (void) __gettimeofday (&tv, NULL);
  91 
  92       /* Compute relative timeout.  */
  93       rt.tv_sec = abstime->tv_sec - tv.tv_sec;
  94       rt.tv_nsec = abstime->tv_nsec - tv.tv_usec * 1000;
  95       if (rt.tv_nsec < 0)
  96         {
  97           rt.tv_nsec += 1000000000;
  98           --rt.tv_sec;
  99         }
 100 
 101       /* Already timed out?  */
 102       if (rt.tv_sec < 0)
 103         return ETIMEDOUT;
 104 #endif

At least, the ETIMEDOUT handling of glibc is reasonable to me. It seems that the WaitForSingleObject() function repeatedly try to get timed lock until it will get the lock within specified lock time (e.g. 30000).

When I tried to monitor system call behavior of FUTEX (Fast User-space muTEX), the futex system call normally returns "ETIMEODOUT(-1)" value. So, I think that @kouvel have to modify WaitForSignle Object() function for Raspberry Pi2 (Linux/ARM) device as well as X86 desktop. @jyoungyun , please re-test paltest_namedmutex_test1 test after increasing the TimedoutMilliseconds.

  30 #define MaxPathSize (200)
  31 const DWORD PollLoopSleepMilliseconds = 100;
  32 const DWORD FailTimeoutMilliseconds = 30000;
  33 DWORD g_expectedTimeoutMilliseconds = 500;    
futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)

Below is additional debug information.

(gdb) bt
#0  __libc_do_syscall () at ../ports/sysdeps/unix/sysv/linux/arm/libc-do-syscall.S:44
dotnet/coreclr#1  0xb6f2041a in __lll_robust_timedlock_wait (futex=futex@entry=0xb6f34008, 
    abstime=abstime@entry=0xbea3364c, private=private@entry=128)
    at ../nptl/sysdeps/unix/sysv/linux/lowlevelrobustlock.c:115
dotnet/coreclr#2  0xb6f1ce84 in pthread_mutex_timedlock (mutex=0xb6f34008, abstime=0xbea3364c)
    at pthread_mutex_timedlock.c:206
dotnet/coreclr#3  0x0003ee46 in MutexHelpers::TryAcquireLock (mutex=0xb6f34008, 
    timeoutMilliseconds=<optimized out>)
    at /work/dotnet/coreclr.git/src/pal/src/synchobj/mutex.cpp:863
dotnet/coreclr#4  0x0003f1e8 in NamedMutexProcessData::TryAcquireLock (this=0x783c8, timeoutMilliseconds=265)
    at /work/dotnet/coreclr.git/src/pal/src/synchobj/mutex.cpp:1329
dotnet/coreclr#5  0x000439ba in CorUnix::InternalWaitForMultipleObjectsEx (pThread=0x6f540, nCount=1, 
    lpHandles=<optimized out>, bWaitAll=<optimized out>, dwMilliseconds=<optimized out>, 
    bAlertable=-110) at /work/dotnet/coreclr.git/src/pal/src/synchmgr/wait.cpp:639
dotnet/coreclr#6  0x0004380c in WaitForSingleObject (
    hHandle=<error reading variable: Cannot access memory at address 0x800075a7>, 
    dwMilliseconds=500) at /work/dotnet/coreclr.git/src/pal/src/synchmgr/wait.cpp:67
dotnet/coreclr#7  0x00021ac0 in MutualExclusionTests_Parent ()
    at /work/dotnet/coreclr.git/src/pal/tests/palsuite/threading/NamedMutex/test1/namedmutex.cpp:510
dotnet/coreclr#8  0x0002244e in MutualExclusionTests ()
    at /work/dotnet/coreclr.git/src/pal/tests/palsuite/threading/NamedMutex/test1/namedmutex.cpp:590
dotnet/coreclr#9  0x0002504e in RunTests ()
    at /work/dotnet/coreclr.git/src/pal/tests/palsuite/threading/NamedMutex/test1/namedmutex.cpp:901
dotnet/coreclr#10 main (argc=<optimized out>, argv=<optimized out>)
    at /work/dotnet/coreclr.git/src/pal/tests/palsuite/threading/NamedMutex/test1/namedmutex.cpp:994
(gdb) 
{0xd17-0x85715} EXIT  [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.70: WaitForSingleObject returns DWORD 0
{0xd19-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.63: WaitForSingleObject(hHandle=000000A8, dwMilliseconds=30000)
{0xd17-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchobj/mutex.cpp.395: ReleaseMutex(hMutex=00000098)
{0xd17-0x85715} EXIT  [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchobj/mutex.cpp.406: ReleaseMutex returns BOOL 1
{0xd17-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.63: WaitForSingleObject(hHandle=00000088, dwMilliseconds=0)
{0xd17-0x85715} EXIT  [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.70: WaitForSingleObject returns DWORD 0
{0xd17-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.63: WaitForSingleObject(hHandle=000000A4, dwMilliseconds=0)
{0xd17-0x85715} EXIT  [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.70: WaitForSingleObject returns DWORD 258
{0xd17-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.63: WaitForSingleObject(hHandle=000000A4, dwMilliseconds=500)
{0xd17-0x85715} EXIT  [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.70: WaitForSingleObject returns DWORD 0
{0xd19-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.63: WaitForSingleObject(hHandle=000000A8, dwMilliseconds=30000)
{0xd17-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchobj/mutex.cpp.395: ReleaseMutex(hMutex=00000098)
{0xd17-0x85715} EXIT  [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchobj/mutex.cpp.406: ReleaseMutex returns BOOL 1
{0xd17-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.63: WaitForSingleObject(hHandle=00000088, dwMilliseconds=0)
{0xd17-0x85715} EXIT  [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.70: WaitForSingleObject returns DWORD 0
{0xd17-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.63: WaitForSingleObject(hHandle=000000A4, dwMilliseconds=0)
{0xd17-0x85715} EXIT  [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.70: WaitForSingleObject returns DWORD 258
{0xd17-0x85715} ENTRY [SYNC   ] at /home/jyoung/git/dotnet/coreclr/src/pal/src/synchmgr/wait.cpp.63: WaitForSingleObject(hHandle=000000A4, dwMilliseconds=500)
leemgs commented 8 years ago

futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out) futex(0xb6f34008, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, -2147453529, {1465791176, 347627253}, ffffffff) = -1 ETIMEDOUT (Connection timed out)

According to the FUTEX manual, the ETIMEDOUT value means Timeout during the FUTEX_WAIT operation. Note that Intel X64 CPU(Desktop PC) and ARM Cortex-A CPU (Embedded) have different computing power. So, I wonder about the goal of this unit test.

%Cpu0  : 14.8 us, 83.9 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  1.3 si,  0.0 st
%Cpu1  : 95.1 us,  4.9 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
KiB Mem:   2069744 total,  1231888 used,   837856 free,   155784 buffers
KiB Swap:        0 total,        0 used,        0 free.   952588 cached Mem

  PID USER      PR  NI    VIRT    RES    SHR S %CPU %MEM     TIME+ COMMAND                  
30250 root      20   0   22240   3300   2796 R 42.3  0.2   5:11.71 paltest_namedmu          
30252 root      20   0   22240   3300   2796 t 11.1  0.2   5:11.86 paltest_namedmu          

FYI, Below is the hardware timer information to check the timekeeping using clocksource and clockevent. I could not find any problem from my development boards.

rasp-pi2@u14.04$> cat /sys/devices/system/clocksource/clocksource0/current_clocksource 
arch_sys_counter

rasp-pi2@u14.04$> cat /sys/devices/system/clocksource/clocksource0/available_clocksource 
arch_sys_counter mct-frc
crhome1@u14.04$> cat /sys/devices/system/clocksource/clocksource0/current_clocksource 
arch_sys_counter

chrome1@u14.04$> cat /sys/devices/system/clocksource/clocksource0/available_clocksource 
arch_sys_counter
x64@u14.04$> cat /sys/devices/system/clocksource/clocksource0/current_clocksource 
tsc
x64@u14.04$> cat /sys/devices/system/clocksource/clocksource0/available_clocksource 
tsc hpet acpi_pm
jyoungyun commented 8 years ago

This issue is related to robust mutex. The _lll_robust_timedlock_wait function (in glibc) was hanged when the robust mutex was locked by A thread and B thread tried to lock it with a specified time by usig pthread_mutex_timedlock. My simple test case was fixed when the PTHREAD_PRIO_INHERIT protocol set. But the namedtest still failed. Now the error log:

paltest_namedmutex_test1: pthread_mutex_timedlock.c:306: pthread_mutex_timedlock: Assertion `(-(e)) != 35 || (kind != PTHREAD_MUTEX_ERRORCHECK_NP && kind != PTHREAD_MUTEX_RECURSIVE_NP)' failed.
leemgs commented 8 years ago

In order to prevent unexpected priority inversion on Linux/ARM, using PTHREAD_PRIO_INHERIT protocol is good to me. Please share us the output of 'strace -s paltrst_namedmutex_test1' command?

ghost commented 8 years ago

I am seeing similar behavior with this test on Alpine Linux (x86_64): https://github.com/dotnet/coreclr/issues/5872#issuecomment-227012425.

jyoungyun commented 8 years ago

@leemgs The paltest_namedmutex_test1 occured EDEADLK (Resource deadlock avoided)

clock_gettime(CLOCK_REALTIME, {1421112297, 40466308}) = 0
futex(0xb6f98008, FUTEX_LOCK_PI, 1)     = -1 EDEADLK (Resource deadlock avoided)
write(2, "paltest_namedmutex_test1: pthrea"..., 194paltest_namedmutex_test1: pthread_mutex_timedlock.c:306: pthread_mutex_timedlock: Assertion `(-(e)) != 35 || (kind != PTHREAD_MUTEX_ERRORCHECK_NP && kind != PTHREAD_MUTEX_RECURSIVE_NP)' failed.
) = 194
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb6f97000
rt_sigprocmask(SIG_UNBLOCK, [ABRT], NULL, 8) = 0
tgkill(3661, 3661, SIGABRT)             = 0
--- SIGABRT {si_signo=SIGABRT, si_code=SI_TKILL, si_pid=3661, si_uid=0} ---
+++ killed by SIGABRT (core dumped) +++
Aborted (core dumped)
kouvel commented 8 years ago

Thanks for looking into this. I don't understand why PTHREAD_PRIO_INHERIT should fix the deadlock, although it makes sense to use inherited priority. Even considering priorities, shouldn't a timed wait still time out?

I believe EDEADLK should only occur for non-recursive mutexes when a thread that already owns the lock tries to lock the mutex a second time. This pthread mutex should be set up as a recursive mutex here: https://github.com/dotnet/coreclr/blob/master/src/pal/src/synchobj/mutex.cpp#L825.

There is an alternate implementation that uses file locks instead of a pthread process-shared, robust, recursive mutex. If there are continuing issues related to pthread mutexes, maybe it would be better to switch to the file lock implementation. Currently, the detection is purely based on availability of the pthread functions on the system: https://github.com/dotnet/coreclr/blob/master/src/pal/src/configure.cmake#L993. If we can somehow detect these issues there by running some code, the macro HAVE_FULLY_FEATURED_PTHREAD_MUTEXES can be disabled and that will cause it to use the file lock implementation.

jyoungyun commented 8 years ago

I think I found the reason why paltest_namedmutex_test1 was hang. It is related to glibc. After applying below code, paltest_namedmutex_test1 is working well. Of course my simple code is also passed. (https://sourceware.org/bugzilla/show_bug.cgi?id=20263)

--- a/nptl/lowlevelrobustlock.c
+++ b/nptl/lowlevelrobustlock.c
@@ -113,15 +113,20 @@ __lll_robust_timedlock_wait (int *futex, const struct timespec *abstime,
          && atomic_compare_and_exchange_bool_acq (futex, newval, oldval))
        continue;

+      int err;
       /* If *futex == 2, wait until woken or timeout.  */
 #if (!defined __ASSUME_FUTEX_CLOCK_REALTIME \
      || !defined lll_futex_timed_wait_bitset)
-      lll_futex_timed_wait (futex, newval, &rt, private);
+      err = lll_futex_timed_wait (futex, newval, &rt, private);
 #else
-      lll_futex_timed_wait_bitset (futex, newval, abstime,
+      err = lll_futex_timed_wait_bitset (futex, newval, abstime,
                                   FUTEX_CLOCK_REALTIME, private);
 #endif

+      /* the futex call time out */
+      if (err == -ETIMEDOUT)
+           return ETIMEDOUT;
+
     try:
       ;
     }

I will update this patch soon but it takes a long time to be applied our reference targets. So I think this test should be changed to testsFailingOutsideWindows.txt. @kouvel And I think this issue can't detect in configure.cmake file because it is a deadlock situation. Can we detect it in advance ?

I attached my simple test code. a.c.txt

kouvel commented 8 years ago

Oh great, thank you! I'll send out a PR to disable the test for the time being.