Open mpoeter opened 3 years ago
Please provide a link to source code and a full report. Is it latest clang version? Does it reproduce with the latest clang?
This was with clang-12. I stumbled over this issue while running some integrations from https://github.com/arangodb/arangodb, but you probably don't want to debug this issue with that source base!
I will try to find some time and create a stripped down example to reproduce this.
I am able to reproduce this with the following sample program:
#include <atomic>
#include <deque>
#include <memory>
#include <thread>
// Plain (non-atomic) payload type stored in Foo::snapshots; only its
// `number` field is read, and only on the non-empty-deque branch.
struct Dummy {
unsigned number;
};
// Holds the two atomic counters. In this program only last_sequence is
// ever stored to (by thread t1 in main, with memory_order_relaxed); both
// loads in Foo::foo are likewise relaxed, so no data race exists.
struct Versions {
std::atomic<unsigned> last_sequence;
std::atomic<unsigned> last_published_sequence;
};
// Reproducer type. foo() only ever reads atomic fields (through
// `versions`) or data never concurrently modified (`snapshots`), yet a
// TSan build reports a data race. Per the analysis later in this thread,
// clang tail-merges the two conditional atomic-load branches and then
// re-reads the selected address with a plain load (instrumented as
// __tsan_read4), which TSan flags as a non-atomic read racing with the
// atomic store. Do not restructure this code: the exact branch shape is
// what triggers the miscompile.
struct Foo {
explicit Foo(Versions* versions) : versions(versions) {}
std::deque<Dummy> snapshots;
bool cond = true;
// Non-owning pointer; points at the Versions owned by main().
const Versions* versions;
unsigned foo() {
unsigned oldest_snapshot;
if (snapshots.empty()) {
// Both arms are relaxed atomic loads; only the address differs.
oldest_snapshot = cond
? versions->last_sequence.load(std::memory_order_relaxed)
: versions->last_published_sequence.load(std::memory_order_relaxed);
} else {
oldest_snapshot = snapshots.begin()->number;
}
return oldest_snapshot;
}
};
// Driver: t1 hammers the atomic last_sequence with relaxed stores while
// t2 repeatedly calls Foo::foo(). All shared mutable state is atomic, so
// a correct build must produce no TSan report.
int main() {
std::unique_ptr<Versions> versions = std::make_unique<Versions>();
std::atomic<bool> stop{false};
// Writer thread: touches only the atomic counter, relaxed stores.
std::thread t1([&stop, &versions]() {
unsigned i = 0;
while (!stop)
versions->last_sequence.store(++i, std::memory_order_relaxed);
});
// Reader thread: volatile keeps each foo() result from being elided.
std::thread t2([&stop, &versions]() {
Foo f(versions.get());
while (!stop)
volatile unsigned v = f.foo();
});
std::this_thread::sleep_for(std::chrono::seconds(1));
stop = true;
t1.join();
t2.join();
return 0;
}
We only ever access last_sequence
(which is atomic), but when running clang-12 main.cpp -fsanitize=thread -lstdc++ -O3 && ./a.out
I get the following report:
WARNING: ThreadSanitizer: data race (pid=6164)
Read of size 4 at 0x7b0400000000 by thread T2:
#0 std::thread::_State_impl<std::thread::_Invoker<std::tuple<main::$_1> > >::_M_run() <null> (a.out+0x4b6347)
#1 <null> <null> (libstdc++.so.6+0xd6d83)
Previous atomic write of size 4 at 0x7b0400000000 by thread T1:
#0 __tsan_atomic32_store <null> (a.out+0x471004)
#1 std::thread::_State_impl<std::thread::_Invoker<std::tuple<main::$_0> > >::_M_run() <null> (a.out+0x4b622e)
#2 <null> <null> (libstdc++.so.6+0xd6d83)
Location is heap block of size 8 at 0x7b0400000000 allocated by main thread:
#0 malloc <null> (a.out+0x4233ab)
#1 operator new(unsigned long) <null> (libstdc++.so.6+0xaac28)
#2 __libc_start_main <null> (libc.so.6+0x28cb1)
Thread T2 (tid=6167, running) created by main thread at:
#0 pthread_create <null> (a.out+0x424bdb)
#1 std::thread::_M_start_thread(std::unique_ptr<std::thread::_State, std::default_delete<std::thread::_State> >, void (*)()) <null> (libstdc++.so.6+0xd7048)
#2 __libc_start_main <null> (libc.so.6+0x28cb1)
Thread T1 (tid=6166, running) created by main thread at:
#0 pthread_create <null> (a.out+0x424bdb)
#1 std::thread::_M_start_thread(std::unique_ptr<std::thread::_State, std::default_delete<std::thread::_State> >, void (*)()) <null> (libstdc++.so.6+0xd7048)
#2 __libc_start_main <null> (libc.so.6+0x28cb1)
SUMMARY: ThreadSanitizer: data race (/mnt/d/cpptest/a.out+0x4b6347) in std::thread::_State_impl<std::thread::_Invoker<std::tuple<main::$_1> > >::_M_run()
==================
ThreadSanitizer: reported 1 warnings
I can reproduce this with clang version 13.0.0 (https://github.com/dvyukov/llvm-project.git cb6c9976f982b4ac7fc212e05ef390ab396d7c69)
.
I've added attribute((noinline)) to foo and got this:
00000000004d0360 <_ZN3Foo3fooEv>:
_ZN3Foo3fooEv():
/tmp/test.cc:22
std::deque<Dummy> snapshots;
bool cond = true;
const Versions* versions;
__attribute__((noinline)) unsigned foo() {
4d0360: 55 push %rbp
4d0361: 41 57 push %r15
4d0363: 41 56 push %r14
4d0365: 53 push %rbx
4d0366: 50 push %rax
4d0367: 48 89 fb mov %rdi,%rbx
4d036a: 48 8b 7c 24 28 mov 0x28(%rsp),%rdi
4d036f: e8 fc 21 ff ff callq 4c2570 <__tsan_func_entry>
_ZSteqRKSt15_Deque_iteratorI5DummyRS0_PS0_ES5_():
/usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/bits/stl_deque.h:269
_M_last = _M_first + difference_type(_S_buffer_size());
}
friend bool
operator==(const _Self& __x, const _Self& __y) _GLIBCXX_NOEXCEPT
{ return __x._M_cur == __y._M_cur; }
4d0374: 48 8d 7b 30 lea 0x30(%rbx),%rdi
4d0378: e8 f3 d4 fe ff callq 4bd870 <__tsan_read8>
4d037d: 4c 8b 7b 30 mov 0x30(%rbx),%r15
4d0381: 48 8d 7b 10 lea 0x10(%rbx),%rdi
4d0385: e8 e6 d4 fe ff callq 4bd870 <__tsan_read8>
4d038a: 4c 8b 73 10 mov 0x10(%rbx),%r14
4d038e: 4d 39 f7 cmp %r14,%r15
_ZN3Foo3fooEv():
/tmp/test.cc:24
unsigned oldest_snapshot;
if (snapshots.empty()) {
4d0391: 75 43 jne 4d03d6 <_ZN3Foo3fooEv+0x76>
/tmp/test.cc:25
oldest_snapshot = cond
4d0393: 48 8d 7b 50 lea 0x50(%rbx),%rdi
4d0397: e8 84 c4 fe ff callq 4bc820 <__tsan_read1>
4d039c: 40 8a 6b 50 mov 0x50(%rbx),%bpl
4d03a0: 48 8d 7b 58 lea 0x58(%rbx),%rdi
4d03a4: e8 c7 d4 fe ff callq 4bd870 <__tsan_read8>
4d03a9: 4c 8b 73 58 mov 0x58(%rbx),%r14
4d03ad: 40 84 ed test %bpl,%bpl
4d03b0: 74 0f je 4d03c1 <_ZN3Foo3fooEv+0x61>
_ZNKSt13__atomic_baseIjE4loadESt12memory_order():
/usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/bits/atomic_base.h:426
{
memory_order __b = __m & __memory_order_mask;
__glibcxx_assert(__b != memory_order_release);
__glibcxx_assert(__b != memory_order_acq_rel);
return __atomic_load_n(&_M_i, int(__m));
4d03b2: 4c 89 f7 mov %r14,%rdi
4d03b5: 31 f6 xor %esi,%esi
4d03b7: e8 34 9b fd ff callq 4a9ef0 <__tsan_atomic32_load>
4d03bc: 41 8b 06 mov (%r14),%eax
4d03bf: eb 15 jmp 4d03d6 <_ZN3Foo3fooEv+0x76>
4d03c1: 49 8d 5e 04 lea 0x4(%r14),%rbx
4d03c5: 48 89 df mov %rbx,%rdi
4d03c8: 31 f6 xor %esi,%esi
4d03ca: e8 21 9b fd ff callq 4a9ef0 <__tsan_atomic32_load>
4d03cf: 41 8b 46 04 mov 0x4(%r14),%eax
4d03d3: 49 89 de mov %rbx,%r14
_ZN3Foo3fooEv():
4d03d6: 4c 89 f7 mov %r14,%rdi
4d03d9: e8 22 cf fe ff callq 4bd300 <__tsan_read4>
4d03de: 41 8b 1e mov (%r14),%ebx
/tmp/test.cc:32
: versions->last_published_sequence.load(std::memory_order_relaxed);
} else {
oldest_snapshot = snapshots.begin()->number;
}
return oldest_snapshot;
4d03e1: e8 fa 21 ff ff callq 4c25e0 <__tsan_func_exit>
4d03e6: 89 d8 mov %ebx,%eax
4d03e8: 48 83 c4 08 add $0x8,%rsp
4d03ec: 5b pop %rbx
4d03ed: 41 5e pop %r14
4d03ef: 41 5f pop %r15
4d03f1: 5d pop %rbp
4d03f2: c3 retq
4d03de
is the non-atomic load from the atomic var.
That block is reused/merged; there are several jumps to 4d03d6.
The code is quite questionable, it does the atomic loads, discards the values and does non-atomic load...
Without -fsanitize=thread the code looks strange as well:
0000000000401510 <_ZN3Foo3fooEv>:
401510: 48 8b 47 10 mov 0x10(%rdi),%rax
401514: 48 39 47 30 cmp %rax,0x30(%rdi)
401518: 74 03 je 40151d <_ZN3Foo3fooEv+0xd>
40151a: 8b 00 mov (%rax),%eax
40151c: c3 retq
40151d: 80 7f 50 00 cmpb $0x0,0x50(%rdi)
401521: 48 8b 47 58 mov 0x58(%rdi),%rax
401525: 74 05 je 40152c <_ZN3Foo3fooEv+0x1c>
401527: 8b 08 mov (%rax),%ecx
401529: 8b 00 mov (%rax),%eax
40152b: c3 retq
40152c: 8b 48 04 mov 0x4(%rax),%ecx
40152f: 48 83 c0 04 add $0x4,%rax
401533: 8b 00 mov (%rax),%eax
401535: c3 retq
WAT?
401527: 8b 08 mov (%rax),%ecx
401529: 8b 00 mov (%rax),%eax
40152b: c3 retq
WAT?
40152c: 8b 48 04 mov 0x4(%rax),%ecx
40152f: 48 83 c0 04 add $0x4,%rax
401533: 8b 00 mov (%rax),%eax
401535: c3 retq
It seems that this is a generic codegen bug that also affects normal builds. I would assume one of these duplicated loads is "atomic" and another is "non-atomic". In normal build this only leads to a performance regression. In tsan it becomes more visible.
Yes, I also found the code rather strange, but TBH I did not check what it looks like without tsan. Seems like this should be reported to clang then.
Here is a bit more simplified code:
#include <atomic>
#include <deque>
#include <memory>
#include <thread>
// Minimal reproducer: same shape as the original but with the deque and
// the Versions indirection removed. A pointer check selects between a
// plain load (*ptr) and one of two relaxed atomic loads; the compiler
// still merges the branch tails into one plain load. Keep the structure
// as-is -- it exists to reproduce the miscompile.
struct Foo {
unsigned *ptr = nullptr;
bool cond = true;
std::atomic<unsigned> a1 = {0};
std::atomic<unsigned> a2 = {0};
// noinline so the offending codegen can be inspected in isolation.
__attribute__((noinline)) unsigned foo() {
unsigned oldest_snapshot;
if (!ptr) {
oldest_snapshot = cond
? a1.load(std::memory_order_relaxed)
: a2.load(std::memory_order_relaxed);
} else {
oldest_snapshot = *ptr;
}
return oldest_snapshot;
}
};
// Driver for the simplified reproducer; same pattern as before.
int main() {
Foo foo;
std::atomic<bool> stop{false};
// Writer thread: relaxed atomic stores to a1 only.
// NOTE(review): i is bumped twice per iteration (i++ and ++i); appears
// harmless for the repro -- the stored values merely skip numbers.
std::thread t1([&]() {
for (unsigned i = 0; !stop; i++)
foo.a1.store(++i, std::memory_order_relaxed);
});
// Reader thread: volatile keeps foo() calls from being optimized away.
std::thread t2([&]() {
while (!stop)
volatile unsigned v = foo.foo();
});
std::this_thread::sleep_for(std::chrono::seconds(1));
stop = true;
t1.join();
t2.join();
return 0;
}
I've also re-tested with the current llvm HEAD: clang version 13.0.0 (https://github.com/llvm/llvm-project.git 3feb84a36f5128dd0f6e9c65314609a9ce4372bc) It's still reproducible.
code with -O1 (looks bad):
0000000000401420 <_ZN3Foo3fooEv>:
401420: 48 8b 07 mov (%rdi),%rax
401423: 48 85 c0 test %rax,%rax
401426: 74 03 je 40142b <_ZN3Foo3fooEv+0xb>
401428: 8b 00 mov (%rax),%eax
40142a: c3 retq
40142b: 48 8d 47 0c lea 0xc(%rdi),%rax
40142f: 48 8d 4f 10 lea 0x10(%rdi),%rcx
401433: 31 d2 xor %edx,%edx
401435: 80 7f 08 00 cmpb $0x0,0x8(%rdi)
401439: 0f 94 c2 sete %dl
40143c: 48 0f 44 c1 cmove %rcx,%rax
401440: 8b 4c 97 0c mov 0xc(%rdi,%rdx,4),%ecx
401444: 8b 00 mov (%rax),%eax
401446: c3 retq
code with -O1 -fsanitize=thread (looks bad as well):
00000000004d6500 <_ZN3Foo3fooEv>:
4d6500: 41 57 push %r15
4d6502: 41 56 push %r14
4d6504: 53 push %rbx
4d6505: 49 89 fe mov %rdi,%r14
4d6508: 48 8b 7c 24 18 mov 0x18(%rsp),%rdi
4d650d: e8 8e 20 ff ff callq 4c85a0 <__tsan_func_entry>
4d6512: 4c 89 f7 mov %r14,%rdi
4d6515: e8 c6 b0 fe ff callq 4c15e0 <__tsan_read8>
4d651a: 49 8b 1e mov (%r14),%rbx
4d651d: 48 85 db test %rbx,%rbx
4d6520: 75 30 jne 4d6552 <_ZN3Foo3fooEv+0x52>
4d6522: 49 8d 7e 08 lea 0x8(%r14),%rdi
4d6526: e8 65 97 fe ff callq 4bfc90 <__tsan_read1>
4d652b: 49 8d 46 0c lea 0xc(%r14),%rax
4d652f: 45 31 ff xor %r15d,%r15d
4d6532: 41 80 7e 08 00 cmpb $0x0,0x8(%r14)
4d6537: 49 8d 5e 10 lea 0x10(%r14),%rbx
4d653b: 41 0f 94 c7 sete %r15b
4d653f: 48 0f 45 d8 cmovne %rax,%rbx
4d6543: 48 89 df mov %rbx,%rdi
4d6546: 31 f6 xor %esi,%esi
4d6548: e8 a3 5a fd ff callq 4abff0 <__tsan_atomic32_load>
4d654d: 43 8b 44 be 0c mov 0xc(%r14,%r15,4),%eax
4d6552: 48 89 df mov %rbx,%rdi
4d6555: e8 06 a8 fe ff callq 4c0d60 <__tsan_read4> // offending non-atomic read
4d655a: 8b 1b mov (%rbx),%ebx // offending non-atomic read
4d655c: e8 df 20 ff ff callq 4c8640 <__tsan_func_exit>
4d6561: 89 d8 mov %ebx,%eax
4d6563: 5b pop %rbx
4d6564: 41 5e pop %r14
4d6566: 41 5f pop %r15
4d6568: c3 retq
This needs to be filed at https://bugs.llvm.org/ Do you want to file it yourself?
I think I don't have an account yet at https://bugs.llvm.org - could you file it instead? Otherwise I'll contact the admin to create an account for me.
I will file it.
Ouch! I see a bad miscompilation in release build at least on arm64, but I won't be surprised if this can lead to some miscompilations on slightly modified code on x86 as well. Here the resulting load misses an acquire fence:
// Non-TSan variant: plain unsigned fields read via explicit
// __atomic_load_n builtins, showing the same tail-merge bug in a release
// build on arm64 (disassembly follows).
struct Foo {
unsigned *ptr = nullptr;
bool cond = true;
unsigned a1 = 0;
unsigned a2 = 0;
unsigned foo();
};
// Both conditional arms are ACQUIRE loads. Per the report above, after
// the compiler merges the branch tails, the arm64 output performs
// `ldar wzr, [x8]` (acquire load whose value is discarded) followed by a
// plain `ldr` that produces the returned value -- i.e. the returned load
// misses the acquire ordering. Do not restructure; this is a reproducer.
unsigned Foo::foo() {
unsigned oldest_snapshot;
if (!ptr) {
oldest_snapshot = cond
? __atomic_load_n(&a1, __ATOMIC_ACQUIRE)
: __atomic_load_n(&a2, __ATOMIC_ACQUIRE);
} else {
oldest_snapshot = *ptr;
}
return oldest_snapshot;
}
0000000000000000 <_ZN3Foo3fooEv>:
0: f9400008 ldr x8, [x0]
4: b4000068 cbz x8, 10 <_ZN3Foo3fooEv+0x10>
8: b9400100 ldr w0, [x8]
c: d65f03c0 ret
10: 39402008 ldrb w8, [x0, #8]
14: 34000068 cbz w8, 20 <_ZN3Foo3fooEv+0x20>
18: 91003008 add x8, x0, #0xc
1c: 14000002 b 24 <_ZN3Foo3fooEv+0x24>
20: 91004008 add x8, x0, #0x10
24: 88dffd1f ldar wzr, [x8]
28: b9400100 ldr w0, [x8]
2c: d65f03c0 ret
Filed https://bugs.llvm.org/show_bug.cgi?id=50139 Thanks for digging and reporting!
Thanks for filing! Seems to be a good test case... :)
While running some code using RocksDB I received the following data race warning:
The relevant code in
DBImpl::ReleaseSnapshot
looks like this: The generated assembler code looks like this:
Effectively the compiler generated code that stores different addresses in
rbx
based on the conditions, and later simply dereferences that register to populate the variable oldest_snapshot.
However, this causes false positive data race warnings in TSan. I did not yet have the time to try to create a stripped down example that reproduces this problem, but I hope I have provided all relevant information.