>>> bt
#0 0x00007f7430470fbb in raise () from /home/rusty/futex/sysroot/lib/x86_64-linux-gnu/libc.so.6
#1 0x00007f7430456864 in abort () from /home/rusty/futex/sysroot/lib/x86_64-linux-gnu/libc.so.6
#2 0x00007f74304b949c in ?? () from /home/rusty/futex/sysroot/lib/x86_64-linux-gnu/libc.so.6
#3 0x00007f74304b97b0 in __libc_fatal () from /home/rusty/futex/sysroot/lib/x86_64-linux-gnu/libc.so.6
#4 0x00007f743062f01c in __lll_lock_wait () from /home/rusty/futex/sysroot/lib/x86_64-linux-gnu/libpthread.so.0
#5 0x00007f7430627733 in pthread_mutex_lock () from /home/rusty/futex/sysroot/lib/x86_64-linux-gnu/libpthread.so.0
#6 0x0000000001acedc0 in ?? ()
#7 0x0000000000000000 in ?? ()
/* Wait on the lock word *FUTEX until this thread succeeds in exchanging a
   previous value of 0 for 2; the function does not return before that.

   FUTEX points to the lock word.  PRIVATE is forwarded unchanged to
   futex_wait.

   NOTE(review): presumably 0 means "unlocked" and 2 means "locked, possibly
   with waiters", following glibc's low-level lock convention -- confirm
   against lowlevellock.h.  */
void
__lll_lock_wait (int *futex, int private)
{
/* If the word already reads 2, skip the exchange and jump straight into
   the loop body to wait.  */
if (atomic_load_relaxed (futex) == 2)
goto futex;
/* Store 2 unconditionally; the loop ends only when the value that was
   replaced is 0.  */
while (atomic_exchange_acquire (futex, 2) != 0)
{
futex:
/* NOTE(review): presumably a static tracing probe point -- confirm.  */
LIBC_PROBE (lll_lock_wait, 1, futex);
/* The return value of futex_wait is ignored here; the loop condition
   re-examines the word with another exchange.  */
futex_wait ((unsigned int *) futex, 2, private); /* Wait if *futex == 2. */
}
}
libc_hidden_def (__lll_lock_wait)
futex_wait的代码如下
/* Call lll_futex_timed_wait on FUTEX_WORD with the expected value EXPECTED
   and no timeout (NULL); PRIVATE is passed through unchanged.

   Returns 0, EAGAIN or EINTR, i.e. the negated result of
   lll_futex_timed_wait when that result is 0, -EAGAIN or -EINTR.

   Every other result is treated as unrecoverable and ends in
   futex_fatal_error:
     -ETIMEDOUT  cannot have happened, as no timeout was provided;
     -EFAULT     must have been caused by a glibc or application bug;
     -EINVAL     either wrong alignment or a timeout that is not
                 normalized; must have been caused by a glibc or
                 application bug;
     -ENOSYS     must have been caused by a glibc bug;
     any other value: no other errors are documented at this time.  */
static __always_inline int
futex_wait (unsigned int *futex_word, unsigned int expected, int private)
{
  const int status = lll_futex_timed_wait (futex_word, expected, NULL, private);

  /* The only outcomes handed back to the caller.  */
  if (status == 0 || status == -EAGAIN || status == -EINTR)
    return -status;

  /* Anything else indicates a bug; see the list above.  */
  futex_fatal_error ();
}
Debugging a futex crash https://rustylife.github.io/2023/08/15/futex-crash.html
很精彩的文章。首先,core 里的堆栈只有 futex 相关的几帧。
怎么抓bug?
首先,堆栈里有 __libc_fatal(或者 futex_fatal_error),基本就能定位到是 futex_wait 出了问题:__lll_lock_wait 基本就是包了一层 futex_wait,而 futex_wait 是 __always_inline 的,被内联之后在堆栈里看不到。
futex_wait的代码如下
这几个错误是最值得关注的,按图索骥就行了。首先不可能是 ETIMEDOUT(调用时没有传超时);其次 EFAULT 和 ENOSYS 比较难查,EINVAL 反而比较好查:先确认是不是 wrong alignment,也就是对齐问题。简单来说,就是查 futex 的地址。思路有了,怎么查?切到 frame 5。
抓到了,地址不对齐!看懂是怎么定位的了没?
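__lll_lock_wait (int *futex, int private) 的 futex 和 private 是怎么传进去的?
按 x86-64 System V 调用约定,第一个参数 futex 放在 %rdi,第二个参数 private 放在 %esi。调用 __lll_lock_wait 之前有一条 mov 0x8(%rsp),%rdi,说明顺着这条指令往回找,就能拿到传进去的 futex 和 private,对吧?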
不记得的琢磨一下汇编哈,看看这个 https://godbolt.org/z/bbET88a1x
复现一下
为什么锁两次?因为锁一次不能复现,正常来说顶多死锁。
编译执行
能复现,接下来就要确认futex哪里报错了
trace-cmd 录一下调用栈
能看到死在get_futex_key里,看一下代码
就是你了