llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
29.15k stars 12.03k forks source link

Miscompiled C++ code for mingw/x86_64 target #64253

Closed mstorsjo closed 1 year ago

mstorsjo commented 1 year ago

Since c65b4d64d4b09795fe237b62a4226121c5b13248, https://reviews.llvm.org/D135462, some C++ code is miscompiled for the mingw/x86_64 target.

This breaks the libcxx testcase libcxx/test/std/ranges/range.factories/range.repeat.view/ctor.piecewise.pass.cpp.

Noticing the issue is somewhat tricky since this testcase is rather new; the libcxx CI uses a prebuilt release with Clang 16 (where the testcase passes correctly). At the point of the regression, the testcase didn't exist yet (so a configuration that tests libcxx with latest git Clang didn't catch it immediately until long after the miscompilation was introduced); the test (and corresponding libc++ implementation) was added only later.

The issue can be reproduced with a reduced testcase like this:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <ctype.h>
#include <wctype.h>
#include <stdio.h>

namespace std {
template <class, int b> struct c { static const bool f = b; };
template <bool d> using e = c<bool, d>;
template <class a, class g> using h = e<__is_same(a, g)>;
void ab();
inline namespace ad {
template <class, class> concept i = requires { ab; };
template <class a, class g> concept aa = h<a, g>::f;
template <class a, class g> concept j = aa<g, a>;
template <bool, class, class> using ah = int;
template <class, class> using ae = int;
template <class, class> using af = decltype(0);
template <class, class, class, class> struct ag;
template <class aj, class ai> using aq = af<ae<aj, ai>, ae<ai, aj>>;
template <class ak, class al, class aj, class ai> requires requires {
  typename aq<aj, ai>;
}
struct ag<ak, al, aj, ai>;
template <class a> concept am = requires { a{}; };
template <class a> concept an = i<a, a>;
template <class a> concept ao = i<a, a>;
template <class a> constexpr bool ap = __is_object(a);
} // namespace ad
struct n {
  template <class g> using k = g;
};
template <class, class g> using ar = n::k<g>;
template <class a> concept as = am<a>;
namespace ad {
template <class a> a *addressof(a &);
template <class at> concept au = requires(at aw) { aw; };
template <class at> concept ax = requires(at aw) { aw; };
struct {
  template <class a> auto operator()(a &&ay) { return ay.az(); }
} az;
template <size_t...> struct o;
template <class ba, ba... bb> struct bc {
  template <size_t> using l = o<bb...>;
};
template <size_t bd, size_t be>
using m = __make_integer_seq<bc, size_t, bd>::template l<be>;
template <class...> class tuple;
template <size_t...> struct o {};
template <size_t bd, size_t be = 0> struct q { typedef m<bd, be> ac; };
template <class...> struct bf {};
template <class> struct bg;
template <class... a> struct bg<tuple<a...>> : c<size_t, sizeof...(a)> {};
template <class, class> struct bh;
template <template <class...> class s, class... av, size_t... bi>
struct bh<s<av...>, o<bi...>> {
  template <class a> using p = bf<ar<a, __type_pack_element<bi, av>>...>;
};
template <class a, size_t bd = bg<a>::f> struct bj {
  using bk = a;
  using r = bh<bk, typename q<bd>::ac>;
  using ac = r::template p<a>;
};
int piecewise_construct;
template <size_t, class> class v {
public:
  template <class a> v(a) {}
};
template <class...> struct w;
template <size_t... t, class... a> struct w<o<t...>, a...> : v<t, a>... {
  template <size_t... x, class... y> w(o<x...>, bf<y...>) : v<x, y>(0)... {}
};
template <class... a> class tuple {
  w<typename q<sizeof...(a)>::ac, a...> bl;

public:
  template <class... g>
  tuple(g...) : bl(typename q<sizeof...(g)>::ac(), typename bj<tuple>::ac()) {}
};
template <class... a> tuple(a...) -> tuple<a...>;
template <class a, class s> a bm(s) { return a(); }
template <class a, class s> a bn(s ay) { return bm<a>(ay); }
template <class a> concept bo = ax<a>;
template <class z> class bp {
  z u() { return static_cast<z &>(*this); }

public:
  template <bo bq> auto operator[](bq br) { return az(u())[br]; }
};
struct bs {
} unreachable_sentinel;
template <class a> concept bt = ap<a>;
namespace ranges {
template <bt> class bu;
template <class a> concept bv = ao<a>;
template <bt a> requires bv<a> class bu<a> {
  a bw;

public:
  bu(...) {}
  a &operator*() { return bw; }
};
template <class a> concept bx = au<a>;
template <an a, as by = bs>
requires(ap<a> &&j<a, a> && (bx<by> || j<by, bs>)) class repeat_view
    : public bp<repeat_view<a>> {
  class bz;

public:
  template <class... ca>
  repeat_view(int, tuple<> cb, tuple<ca...> cc)
      : ce(bn<a>(cb)), cf(bn<by>(cc)) {}
  bz az() { return addressof(*ce); }
  bu<a> ce;
  [[__no_unique_address__]] by cf;
};
template <an a, as by>
requires(ap<a> &&j<a, a> &&
         (bx<by> || j<by, bs>)) class repeat_view<a, by>::bz {
  friend repeat_view;
  using cg = ah<j<by, bs>, ptrdiff_t, by>;
  bz(a *ch) : ce(ch) {}

public:
  using cd = cg;
  a operator*() { return *ce; }
  a operator[](cd) { return **this; }
  a *ce;
};
} // namespace ranges
} // namespace ad
} // namespace std

struct A {
  int x = 111;
  int y = 222;

  constexpr A() = default;
  constexpr A(int _x, int _y) : x(_x), y(_y) {}
};

int main(int, char**) {
  std::ranges::repeat_view<A> rv(std::piecewise_construct, std::tuple{}, std::tuple{std::unreachable_sentinel});
  printf("rv[0].x = %d - %s\n", rv[0].x, rv[0].x == 111 ? "ok" : "incorrect");
  if (rv[0].x != 111)
    return 1;

  return 0;
}

misopt.zip

(I'll rerun the reduction to try to retain the libc++ header names to keep it slightly more readable.)

If compiled with clang -target x86_64-w64-mingw32 -std=c++2b misopt.cpp -o misopt.exe, the resulting binary outputs rv[0].x = 0 - incorrect. If optimization is added with e.g. -O2, it correctly outputs rv[0].x = 111 - ok instead. If compiled with Clang 16, it runs correctly in both cases.

CC @asavonic

mstorsjo commented 1 year ago

Attached is a less obfuscated version of the reduced testcase, where the symbol names haven't been changed - making it hopefully slightly more understandable, and having more likeness to the libc++ original: misopt2.zip

mstorsjo commented 1 year ago

CC @efriedma-quic who commented on this before in https://reviews.llvm.org/D135462 - hopefully you can help at least with looping in the right people to look at this.

I've got a fair bit of data pointing towards this being a Clang/frontend issue, not specific to the codegen backends. CC @AaronBallman who hopefully is good at spotting a bug or might know who might be good at it. And CC @rnk who generally is knowledgeable on the Windows specific quirks.

To look closer at the issue, I've split the testcase into three files; range.cpp contains the reduced/minimized libc++ implementation of the relevant classes. get.cpp includes range.cpp and instantiates some classes and returns an integer. get-main.cpp calls get.cpp and prints the verdict. get.cpp is almost entirely untangled from any platform specifics.

split.zip

If compiled for a mingw target, we get the failure:

$ clang -target x86_64-w64-mingw32 -std=c++2b get.cpp get-main.cpp -o get.exe; ./get.exe
rv[0].x = 0 - incorrect

With optimization, it is ok:

$ clang -target x86_64-w64-mingw32 -std=c++2b get.cpp get-main.cpp -o get.exe -O2; ./get.exe
rv[0].x = 111 - ok

If we split out generating the unoptimized IR for get.cpp, we can do the following:

$ clang -target x86_64-w64-mingw32 -std=c++2b get.cpp -S -emit-llvm -Xclang -disable-llvm-passes -o get.ll
$ llc -mtriple=x86_64-w64-mingw32 get.ll -filetype obj -o get.o
$ clang -target x86_64-w64-mingw32 get.o get-main.cpp -o get.exe; ./get.exe
rv[0].x = 0 - incorrect

At this point, the generated IR in get.ll seems hopelessly broken though - adding -O2 to llc (which IIRC is the default even) doesn't make the optimizer make this do the desired thing like it did when passing -O2 to clang. -O2 -Xclang -disable-llvm-passes doesn't get working IR either, only -O2 without -disable-llvm-passes to clang produces IR which then ends up compiled successfully.

To further show that the fault lies in the IR from -Xclang -disable-llvm-passes with a mingw target, let's try cross pollination (which isn't generally expected to work I guess, but the object file is selfcontained enough here that it's worth doing).

Generate IR for an x86_64-w64-mingw32 target, but compile it for x86_64-linux-gnu:

$ clang -target x86_64-w64-mingw32 get.cpp -std=c++2b -S -emit-llvm -Xclang -disable-llvm-passes -o get.ll
$ llc -mtriple=x86_64-linux-gnu get.ll -filetype obj -o get.o
$ clang -target x86_64-linux-gnu get.o get-main.cpp -o get.exe -Wl,--no-pie; ./get.exe
rv[0].x = 0 - incorrect

Also doing the same in reverse - generating IR for x86_64-linux-gnu but compiling it for x86_64-w64-mingw32 produces a working result:

$ clang -target x86_64-linux-gnu get.cpp -std=c++2b -S -emit-llvm -Xclang -disable-llvm-passes -o get.ll
$ llc -mtriple=x86_64-w64-mingw32 get.ll -filetype obj -o get.o  
$ clang -target x86_64-w64-mingw32 get.o get-main.cpp -o get.exe; ./get.exe
rv[0].x = 111 - ok

To make things more interesting, let's try compiling get.cpp for an MSVC target:

$ clang -target x86_64-windows-msvc get.cpp -std=c++2b -S -emit-llvm -Xclang -disable-llvm-passes -o get.ll
In file included from get.cpp:4:
./range.cpp:136:5: warning: unknown attribute '__no_unique_address__' ignored [-Wunknown-attributes]
  136 |   [[__no_unique_address__]] _Bound __bound_;
      |     ^~~~~~~~~~~~~~~~~~~~~
1 warning generated.
$ llc -mtriple=x86_64-w64-mingw32 get.ll -filetype obj -o get.o
$ clang -target x86_64-w64-mingw32 get.o get-main.cpp -o get.exe; ./get.exe
rv[0].x = 111 - ok

The [[__no_unique_address__]] seems to be essential for this testcase, and as the MSVC target doesn't implement that, it succeeds and doesn't show the issue. So it's plausible that the implementation of [[__no_unique_address__]] for mingw targets, that otherwise should use all the generic itanium logic, is part of the root cause of this issue here.

Further attaching get.ll when generated with clang -target x86_64-w64-mingw32 -std=c++2b get.cpp -S -emit-llvm -Xclang -disable-llvm-passes -o get.ll: get.ll.zip

The same issue also manifests if generating the IR with an older (16 release) version of Clang but compiling it with a newer version of llc (past https://github.com/llvm/llvm-project/commit/c65b4d64d4b09795fe237b62a4226121c5b13248).

Here's a Compiler Explorer link with the generated IR for this case: https://gcc.godbolt.org/z/KxxMxMhPa

efriedma-quic commented 1 year ago

Maybe you can try selectively enabling/disabling the change from D135462 for specific functions, and see which change in code generation is actually causing the bug to show up?

Looking at the generated code, I can't see any reason for the alignment to have any effect; the most likely issue is some sort of buffer overflow.

asavonic commented 1 year ago

D135462 is supposed to have no effect on generated LLVM IR - it changes SelectionDAG and below. You can dump assembly output from llc and compare it with and without the patch. Difference should only be around stack layout.

Another approach to reduce this case is to specify stack alignment in source code using __attribute__ ((aligned (N))). Try setting higher alignment for stack variables and see if that fixes the issue.

asavonic commented 1 year ago

Another approach to reduce this case is to specify stack alignment in source code using __attribute__ ((aligned (N))). Try setting higher alignment for stack variables and see if that fixes the issue.

Although for this C++ code it is not clear where to put the attribute. It is probably easier to manually change alignment of alloca instructions in LLVM IR, and then recompile with llc.

mstorsjo commented 1 year ago

D135462 is supposed to have no effect on generated LLVM IR - it changes SelectionDAG and below.

Yep - it seems like the generated IR always (or at least since the 16 release) has had this flaw, and only recent changes expose the breakage.

I'll see if it's possible to diff the generated IR between the working linux case and broken mingw case, or if there's too much differences to spot what might be relevant.

You can dump assembly output from llc and compare it with and without the patch. Difference should only be around stack layout.

That sounds like a good idea! I guess I could also try to diff the output from -print-after-all.

Maybe you can try selectively enabling/disabling the change from D135462 for specific functions, and see which change in code generation is actually causing the bug to show up?

Thanks for the idea, I'll try that if I don't make progress other ways.

Looking at the generated code, I can't see any reason for the alignment to have any effect; the most likely issue is some sort of buffer overflow.

Yeah, it feels like something such...

mstorsjo commented 1 year ago

I'll see if it's possible to diff the generated IR between the working linux case and broken mingw case, or if there's too much differences to spot what might be relevant.

There's a lot of differences there, too much to spot something relevant right off the bat.

Maybe you can try selectively enabling/disabling the change from D135462 for specific functions, and see which change in code generation is actually causing the bug to show up?

Thanks for the idea, I'll try that if I don't make progress other ways.

This turned out to give some fairly interesting pointers. Enabling the change from D135462 changed the outcome of the test when activated on the functions _ZNSt12_GLOBAL__N_115make_from_tupleINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_ and _ZNSt12_GLOBAL__N_122__make_from_tuple_implINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_ (std::(anonymous namespace)::unreachable_sentinel_t std::(anonymous namespace)::make_from_tuple<std::(anonymous namespace)::unreachable_sentinel_t, std::(anonymous namespace)::tuple<std::(anonymous namespace)::unreachable_sentinel_t> >(std::(anonymous namespace)::tuple<std::(anonymous namespace)::unreachable_sentinel_t>) and std::(anonymous namespace)::unreachable_sentinel_t std::(anonymous namespace)::__make_from_tuple_impl<std::(anonymous namespace)::unreachable_sentinel_t, std::(anonymous namespace)::tuple<std::(anonymous namespace)::unreachable_sentinel_t> >(std::(anonymous namespace)::tuple<std::(anonymous namespace)::unreachable_sentinel_t>)).

The latter of these has the following diff in output assembly:

 _ZNSt12_GLOBAL__N_122__make_from_tuple_implINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_: # @_ZNSt12_GLOBAL__N_122__make_from_tuple_implINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_
 .seh_proc _ZNSt12_GLOBAL__N_122__make_from_tuple_implINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_
 # %bb.0:
-   subq    $16, %rsp
-   .seh_stackalloc 16
+   pushq   %rax
+   .seh_stackalloc 8
    .seh_endprologue
-   movb    %cl, (%rsp)
-   movb    8(%rsp), %al
-   addq    $16, %rsp
+   movb    %cl, 6(%rsp)
+   movb    7(%rsp), %al
+   popq    %rcx
    retq
    .seh_endproc

The only difference here is the layout of the stack variable. Other than that, the function seems to be doing total nonsense - it writes one byte from the input parameter into one location on the stack, movb %cl, (%rsp) and reads one uninitialized byte from the stack, movb 8(%rsp), %al, into the return value register. The only thing that differs is where these are laid out on the newly allocated stack.

This seems to revolve around empty classes. unreachable_sentinel_t is defined like this:

struct unreachable_sentinel_t {
} unreachable_sentinel;

In the LLVM IR, it is defined like this:

%"struct.std::(anonymous namespace)::unreachable_sentinel_t" = type { i8 } 

It is defined in the same way for both the mingw and linux cases.

The mingw IR for this function looks like this:

; Function Attrs: mustprogress noinline nounwind optnone uwtable
define internal i8 @_ZNSt12_GLOBAL__N_122__make_from_tuple_implINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_(i8 %0) #2 {
  %2 = alloca %"struct.std::(anonymous namespace)::unreachable_sentinel_t", align 1
  %3 = alloca %"class.std::(anonymous namespace)::tuple.0", align 1 
  %4 = getelementptr inbounds %"class.std::(anonymous namespace)::tuple.0", ptr %3, i32 0, i32 0
  %5 = getelementptr inbounds %"struct.std::(anonymous namespace)::__tuple_impl.1", ptr %4, i32 0, i32 0
  store i8 %0, ptr %5, align 1 
  %6 = getelementptr inbounds %"struct.std::(anonymous namespace)::unreachable_sentinel_t", ptr %2, i32 0, i32 0
  %7 = load i8, ptr %6, align 1 
  ret i8 %7
}

For the linux case, the function does exist but looks entirely different:

; Function Attrs: mustprogress noinline nounwind optnone uwtable 
define internal void @_ZNSt12_GLOBAL__N_122__make_from_tuple_implINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_() #2 {
  %1 = alloca %"class.std::(anonymous namespace)::tuple.0", align 1
  ret void
}

So in the linux case, the clang codegen knows to skip loads/stores to this struct entirely, while on mingw it generates these dummy loads/stores, even if it's probably not meant to be included in allocations at all.

I tried stepping around the code in a debugger, and managed to pinpoint the issue further. The smoking gun seems to reside in this function:

; Function Attrs: noinline optnone uwtable
define internal void @_ZNSt12_GLOBAL__N_16ranges11repeat_viewI1ANS_22unreachable_sentinel_tEEC2IJS3_EEEiNS_5tupleIJEEENS6_IJDpT_EEE(ptr noundef nonnull align 4 dereferenceable(8) %0, i32 noundef %1, i8 %2, i8 %3) unnamed_addr #1 align 2 {
  %5 = alloca %"class.std::(anonymous namespace)::tuple", align 1
  %6 = alloca %"class.std::(anonymous namespace)::tuple.0", align 1
  %7 = alloca ptr, align 8
  %8 = alloca i32, align 4
  %9 = alloca %struct.A, align 4 
  %10 = alloca %"class.std::(anonymous namespace)::tuple", align 1
  %11 = alloca %"class.std::(anonymous namespace)::tuple.0", align 1 
  %12 = getelementptr inbounds %"class.std::(anonymous namespace)::tuple", ptr %5, i32 0, i32 0
  %13 = getelementptr inbounds %"struct.std::(anonymous namespace)::__tuple_impl", ptr %12, i32 0, i32 0
  store i8 %2, ptr %13, align 1 
  %14 = getelementptr inbounds %"class.std::(anonymous namespace)::tuple.0", ptr %6, i32 0, i32 0
  %15 = getelementptr inbounds %"struct.std::(anonymous namespace)::__tuple_impl.1", ptr %14, i32 0, i32 0
  store i8 %3, ptr %15, align 1
  store ptr %0, ptr %7, align 8 
  store i32 %1, ptr %8, align 4
  %16 = load ptr, ptr %7, align 8 
  %17 = getelementptr inbounds %"class.std::(anonymous namespace)::ranges::repeat_view", ptr %16, i32 0, i32 0 
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %10, ptr align 1 %5, i64 1, i1 false) 
  %18 = getelementptr inbounds %"class.std::(anonymous namespace)::tuple", ptr %10, i32 0, i32 0
  %19 = getelementptr inbounds %"struct.std::(anonymous namespace)::__tuple_impl", ptr %18, i32 0, i32 0
  %20 = load i8, ptr %19, align 1
  %21 = call i64 @_ZNSt12_GLOBAL__N_115make_from_tupleI1ANS_5tupleIJEEEEET_T0_(i8 %20) 
  store i64 %21, ptr %9, align 4
  %22 = load i64, ptr %9, align 4 
  call void (ptr, ...) @_ZNSt12_GLOBAL__N_113__movable_boxI1AEC2Ez(ptr noundef nonnull align 4 dereferenceable(8) %17, i64 %22) 
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %11, ptr align 1 %6, i64 1, i1 false) 
  %23 = getelementptr inbounds %"class.std::(anonymous namespace)::tuple.0", ptr %11, i32 0, i32 0
  %24 = getelementptr inbounds %"struct.std::(anonymous namespace)::__tuple_impl.1", ptr %23, i32 0, i32 0
  %25 = load i8, ptr %24, align 1 
  %26 = call i8 @_ZNSt12_GLOBAL__N_115make_from_tupleINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_(i8 %25)
  %27 = getelementptr inbounds %"struct.std::(anonymous namespace)::unreachable_sentinel_t", ptr %16, i32 0, i32 0
  store i8 %26, ptr %27, align 4
  ret void
} 

It's not very obvious until after stepping through the code back and forth a dozen times in a debugger, but... The user data payload gets written into %17; we initialize that as %17 = getelementptr inbounds %"class.std::(anonymous namespace)::ranges::repeat_view", ptr %16, i32 0, i32 0, i.e. effectively equal to %16. We pass it to call void (ptr, ...) @_ZNSt12_GLOBAL__N_113__movable_boxI1AEC2Ez(ptr noundef nonnull align 4 dereferenceable(8) %17, i64 %22) which writes the user data payload into %17. Later at the end, we do this bit:

  %26 = call i8 @_ZNSt12_GLOBAL__N_115make_from_tupleINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_(i8 %25)
  %27 = getelementptr inbounds %"struct.std::(anonymous namespace)::unreachable_sentinel_t", ptr %16, i32 0, i32 0
  store i8 %26, ptr %27, align 4

Here we get another pointer %27 which also ends up equal to %16, equal to %17 and write one byte (representing the unreachable sentinel empty class) into it with store i8 %26, ptr %27, align 4.

When we were lucky before, this byte that was written here just by luck happened to be equal to the first byte written to %17. When unlucky, it is something else, corrupting the user data structure.

I.e., this is a case of "How on earth did this thing work in the first place?".

This boils down to a much more concrete Clang level question: Why doesn't mingw get the same treatment for empty classes like linux targets get here? That one byte dummy struct seem to be accounted for in some places, but seem to alias other valid data elsewhere, and writes to it clobber the desired payload data.

mstorsjo commented 1 year ago

Adding more observations after a brief discussion with @AaronBallman on irc. It does seem to be related to the __no_unique_address__ attribute. If that is removed, the clobbering writes in _ZNSt12_GLOBAL__N_16ranges11repeat_viewI1ANS_22unreachable_sentinel_tEEC2IJS3_EEEiNS_5tupleIJEEENS6_IJDpT_EEE that seem to corrupt the wanted data, seem to disappear.

So it is likely that some code relating to __no_unique_address__, incorrectly check for a Windows target, when the correct distinction is checking for MSVC (or checking whether the C++ ABI is Itanium or MSVC). (CC @amykhuang who supposedly is going to look into __no_unique_address__ for MSVC targets.)

However the weird discrepancy in _ZNSt12_GLOBAL__N_122__make_from_tuple_implINS_22unreachable_sentinel_tENS_5tupleIJS1_EEEEET_T0_ - which does loads of uninitialized stack data on mingw, but is a near-empty function for linux, remains even if the __no_unique_address__ attribute is removed. So that seems to be a separate difference.

efriedma-quic commented 1 year ago

As far as I can tell, lowering struct S {}; S f2(); to a function that returns i8 is a thing that only happens on x86-64 Windows. Probably happened as a result of trying to directly copy what MSVC is doing; from looking at the generated assembly, it internally does something similar. Not sure exactly where the relevant code is; clang/lib/CodeGen/Targets/X86.cpp is a reasonable starting point.

The part that specifically results in weirdness, is when we try to construct a value of the empty struct type: it ends up storing the returned i8 to memory, in a location that doesn't actually have any backing storage.

struct S {};
S f();
struct Z { int x; [[__no_unique_address__]]S y; Z(); };
Z::Z() : x(111), y(f()) { }
mstorsjo commented 1 year ago

As far as I can tell, lowering struct S {}; S f2(); to a function that returns i8 is a thing that only happens on x86-64 Windows. Probably happened as a result of trying to directly copy what MSVC is doing; from looking at the generated assembly, it internally does something similar. Not sure exactly where the relevant code is; clang/lib/CodeGen/Targets/X86.cpp is a reasonable starting point.

The part that specifically results in weirdness, is when we try to construct a value of the empty struct type: it ends up storing the returned i8 to memory, in a location that doesn't actually have any backing storage.

struct S {};
S f();
struct Z { int x; [[__no_unique_address__]]S y; Z(); };
Z::Z() : x(111), y(f()) { }

Thanks, that's a great reduction of the issue! I wouldn't have gotten it reduced to that myself.

Regarding mimicing weirdness that MSVC does; that's a good reason indeed. struct S {}; S f2(); itself probably behaves the same for pure C too, and mingw generally tries to match MSVC for pure C ABIs. (For C++ ABIs they diverge entirely though.)

Also, as for no_unique_address, while MSVC doesn't support it (or has it as a no-op), they do have [[msvc::no_unique_address]] which has an effect, see #49358.

Funnily enough, MSVC does seem to behave quite weirdly/inconsistently wrt this specific case as well. With your example above, compiled with cl -c empty.cpp, with MSVC 19.36.32532 (MSVC 2022 17.6.0), it gives roughly this output - regardless of whether [[no_unique_address]] is omitted, kept as is, or changed into [[msvc::no_unique_address]].

0000000000000000 <??0Z@@QEAA@XZ>:
       0: 48 89 4c 24 08                movq    %rcx, 0x8(%rsp)
       5: 48 83 ec 38                   subq    $0x38, %rsp 
       9: 48 8b 44 24 40                movq    0x40(%rsp), %rax
       e: c7 00 6f 00 00 00             movl    $0x6f, (%rax)
      14: e8 00 00 00 00                callq   0x19 <??0Z@@QEAA@XZ+0x19>
                0000000000000015:  IMAGE_REL_AMD64_REL32        ?f@@YA?AUS@@XZ
      19: 88 44 24 20                   movb    %al, 0x20(%rsp)
      1d: 48 8b 44 24 40                movq    0x40(%rsp), %rax
      22: 48 83 c4 38                   addq    $0x38, %rsp
      26: c3                            retq    

I.e. it takes the dummy one byte return from the f() call, but writes it into an otherwise unused stack slot, and does nothing further with it. This seems reasonable.

If we, however, add -std:c++20 to the MSVC compile command, we get more interesting results. First if the [[no_unique_address]] attribute is omitted or kept as such, we get the following output:

0000000000000000 <??0Z@@QEAA@XZ>:
       0: 48 89 4c 24 08                movq    %rcx, 0x8(%rsp) 
       5: 48 83 ec 28                   subq    $0x28, %rsp
       9: 48 8b 44 24 30                movq    0x30(%rsp), %rax
       e: c7 00 6f 00 00 00             movl    $0x6f, (%rax)
      14: e8 00 00 00 00                callq   0x19 <??0Z@@QEAA@XZ+0x19>
                0000000000000015:  IMAGE_REL_AMD64_REL32        ?f@@YA?AUS@@XZ
      19: 48 8b 4c 24 30                movq    0x30(%rsp), %rcx
      1e: 88 41 04                      movb    %al, 0x4(%rcx) 
      21: 48 8b 44 24 30                movq    0x30(%rsp), %rax
      26: 48 83 c4 28                   addq    $0x28, %rsp
      2a: c3                            retq

In this case, it suddenly writes the dummy one byte return value into the z member (movb %al, 0x4(%rcx)), which it didn't do before. Now for the really interesting case, if we make it [[msvc::no_unique_address]] instead, we seem to get the same bug as we're having in Clang:

0000000000000000 <??0Z@@QEAA@XZ>:
       0: 48 89 4c 24 08                movq    %rcx, 0x8(%rsp)
       5: 48 83 ec 28                   subq    $0x28, %rsp 
       9: 48 8b 44 24 30                movq    0x30(%rsp), %rax
       e: c7 00 6f 00 00 00             movl    $0x6f, (%rax)
      14: e8 00 00 00 00                callq   0x19 <??0Z@@QEAA@XZ+0x19>
                0000000000000015:  IMAGE_REL_AMD64_REL32        ?f@@YA?AUS@@XZ
      19: 48 8b 4c 24 30                movq    0x30(%rsp), %rcx
      1e: 88 01                         movb    %al, (%rcx)
      20: 48 8b 44 24 30                movq    0x30(%rsp), %rax
      25: 48 83 c4 28                   addq    $0x28, %rsp
      29: c3                            retq    

Now suddenly, the dummy byte is written into the start of %rcx, in movb %al, (%rcx), where %rcx is equal to %rax earlier where we wrote the other value in movl $0x6f, (%rax) (if I'm reading the x86 assembly correctly).

mstorsjo commented 1 year ago

I haven't found anything really relevant for the x86_64 special case for returning empty classes yet... I've tried to grep around in tests to see if there's anything specific for that (if that could lead me back to a commit where the implementation was added), but it seems to have been in place for a very long time. At least according to Compiler Explorer, Clang 3.1 seems to have had this behaviour already.

I did find 83a125834252ab885efc6784bb1146f5ed753487 which seems remotely related, but not quite.

I noted that Clang 3.8 seems to have the same behaviour for

struct S {};
S f() { return S(); }

on i386 as for x86_64, while Clang 3.9 has got the linux-like ret void behaviour instead. So I guess I could try to bisect where that changed in case it would give some pointers.

mstorsjo commented 1 year ago

I noted that Clang 3.8 seems to have the same behaviour for

struct S {};
S f() { return S(); }

on i386 as for x86_64, while Clang 3.9 has got the linux-like ret void behaviour instead. So I guess I could try to bisect where that changed in case it would give some pointers.

I bisected this behavior change down to 380b2243595046d472884f7902f6f76749306508.

mstorsjo commented 1 year ago

As far as I can tell, lowering struct S {}; S f2(); to a function that returns i8 is a thing that only happens on x86-64 Windows. Probably happened as a result of trying to directly copy what MSVC is doing; from looking at the generated assembly, it internally does something similar.

Actually, it's apparently not entirely specific for Windows/x86_64 - the same happens for e.g. powerpc64le-linux targets as well - see https://github.com/llvm/llvm-project/issues/64427. I haven't found any code which looks like it intentionally tries to do this - this happens for some targets, based on the lack of code like in https://github.com/llvm/llvm-project/commit/380b2243595046d472884f7902f6f76749306508.

This tweak avoids the issue by ignoring empty structs in return values:

diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
index 5bbf78a9b69e..5c435a5d001f 100644
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -4292,6 +4292,10 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
   if (Ty->isVoidType())
     return ABIArgInfo::getIgnore();

+  // Ignore empty structs/unions.
+  if (IsReturnType && isEmptyRecord(getContext(), Ty, true))
+    return ABIArgInfo::getIgnore();
+
   if (const EnumType *EnumTy = Ty->getAs<EnumType>())
     Ty = EnumTy->getDecl()->getIntegerType();

I'm not sure if this is to be considered an ABI break or not - for functions that returned an empty struct, we previously returned nonsense data in the al register, now we don't try to set it at all (so al contains some other unspecified data instead). But we'd need to do this for all targets to avoid this bug...

The better fix would be to avoid emitting a write at all, when the initialized class member doesn't have any storage (has got the no_unique_address attribute); I haven't found where that code resides yet.

mstorsjo commented 1 year ago

I think this diff should fix the issue:

diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp
index 0795ea598411..cb637c084f88 100644
--- a/clang/lib/CodeGen/CGClass.cpp
+++ b/clang/lib/CodeGen/CGClass.cpp
@@ -710,6 +710,8 @@ void CodeGenFunction::EmitInitializerForField(FieldDecl *Field, LValue LHS,
         getOverlapForFieldInit(Field), AggValueSlot::IsNotZeroed,
         // Checks are made by the code that calls constructor.
         AggValueSlot::IsSanitizerChecked);
+    if (Field->hasAttr<NoUniqueAddressAttr>())
+      Slot = AggValueSlot::ignored();
     EmitAggExpr(Init, Slot);
     break;
   }

The diff in generated LLVM IR looks like this:

--- old.ll      2023-08-07 09:52:40.706588049 +0300
+++ new.ll      2023-08-07 09:53:11.022702973 +0300
@@ -12,13 +12,14 @@
 define dso_local void @_ZN1ZC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) unnamed_addr #0 align 2 {
 entry:
   %this.addr = alloca ptr, align 8
+  %coerce = alloca %struct.S, align 1
   store ptr %this, ptr %this.addr, align 8
   %this1 = load ptr, ptr %this.addr, align 8
   %x = getelementptr inbounds %struct.Z, ptr %this1, i32 0, i32 0
   store i32 111, ptr %x, align 4
   %call = call i8 @_Z1fv()
-  %coerce.dive = getelementptr inbounds %struct.S, ptr %this1, i32 0, i32 0
-  store i8 %call, ptr %coerce.dive, align 4
+  %coerce.dive = getelementptr inbounds %struct.S, ptr %coerce, i32 0, i32 0
+  store i8 %call, ptr %coerce.dive, align 1
   ret void
 }

There's still a store, but to a safe unused alloca; with optimization, the dead store goes away.

I'll try to make a proper patch to upstream this fix within a day or so; CC @crtrott @amy-kwan.

crtrott commented 1 year ago

Woohoo: good work tracking this down. Btw. my tests (and if I understand your findings correct, your diagnosis) indicate this has been an issue essentially forever right? Does that make this a candidate for backport, at least to a 16.0.7?

llvmbot commented 1 year ago

@llvm/issue-subscribers-clang-codegen

AaronBallman commented 1 year ago

Woohoo: good work tracking this down. Btw. my tests (and if I understand your findings correct, your diagnosis) indicate this has been an issue essentially forever right? Does that make this a candidate for backport, at least to a 16.0.7?

We're not currently planning any additional releases of 16.x, and because it isn't a regression I'm not certain if it should be backported to 17.x (I'd have to see the final patch and any fallout from it to know better on that, though).

crtrott commented 1 year ago

It does make a few tests in libcxx 17.x fail though. We would need to identify exactly which platforms are affected and disable those tests there I guess? I did disable the mdspan test on MinGW since we test that in CI, Power is not tested in CI, so currently on Power the libcxx tests will fail with the 17.x.

mstorsjo commented 1 year ago

Woohoo: good work tracking this down. Btw. my tests (and if I understand your findings correct, your diagnosis) indicate this has been an issue essentially forever right? Does that make this a candidate for backport, at least to a 16.0.7?

We're not currently planning any additional releases of 16.x, and because it isn't a regression I'm not certain if it should be backported to 17.x (I'd have to see the final patch and any fallout from it to know better on that, though).

The proper patch is up now at https://reviews.llvm.org/D157332.

IMO the fix looks quite targeted and safe enough IMO (most platforms already treat it as a void and should be entirely unaffected, and the fix seems fairly safe to me). But anyway let's review it properly, then hopefully keep it in main for a few days, and then raise the question of whether we can backport it.

It's not a regression, but it seems to be a longstanding flaw for a C++ code pattern that doesn't seem to have been common earlier, but seems to be becoming more common in recently added libccxx code.

mstorsjo commented 1 year ago

Reopening, as I'd like to have the fix in 17.x.

mstorsjo commented 1 year ago

/cherry-pick d60c3d08e78dfbb4b180776b83e910d810e1f36a

I removed the regression tag, as the root issue wasn't a regression, it has always been present - it was only an unrelated change that exposed the issue in this testcase - but the issue has been present all along in issues shown in #64077 and #64427.

llvmbot commented 1 year ago

/branch llvm/llvm-project-release-prs/issue64253

llvmbot commented 1 year ago

/pull-request llvm/llvm-project-release-prs#589

mstorsjo commented 1 year ago

FYI, I reported the same issue as a bug in MSVC (when using [[msvc::no_unique_address]]) at https://developercommunity.visualstudio.com/t/Init-of-empty-structs-with-msvc::no_un/10440912.