davidlattimore / wild

Apache License 2.0
414 stars 11 forks source link

Fix tests on openSUSE #17

Closed marxin closed 3 weeks ago

marxin commented 1 month ago

Noticed that for the current main branch commit (042615a):

test integration_test ... FAILED

failures:

---- integration_test stdout ----
wild: /home/marxin/Programming/wild/wild/tests/build/ifunc.c-default.wild
ld: /home/marxin/Programming/wild/wild/tests/build/ifunc.c-default.ld
asm._start
  ORIG            `/home/marxin/Programming/wild/wild/tests/build/ifunc.default-f778dc51c46a70e0.o`
                  push %rbp
                  mov %rsp,%rbp
                  sub $0x10,%rsp
                  call 0x00000000000000F8
                  mov %eax,-4(%rbp)
                  cmpl $0,-4(%rbp)
                  je 0x0000000000000020
                  mov -4(%rbp),%eax
                  mov %eax,%edi
                  call 0x000000000000015F
                  call 0  // 0xAAA=PLT(IFUNC(resolve_compute_value10))
                  cmp $0xA,%eax
                  je 0x0000000000000034
                  mov $1,%edi
                  call 0x000000000000015F
                  call 0xFFFFFFFFFFFFFFF0
                  cmp $0x20,%eax
                  je 0x0000000000000048
                  mov $2,%edi
                  call 0x000000000000015F

  wild 0x00401798 b8 09 18 40 00 mov $0x401809,%eax  // Mov_r32_imm32(0x401809) resolve_compute_value10 compute_value10
  ld   0x00401058 b8 08 10 40 00 mov $0x401008,%eax  // Mov_r32_imm32(0x401008) PLT(IFUNC(resolve_compute_value10)) PLT(IFUNC(compute_value10))
  ORIG            b8 00 00 00 00 mov $0,%eax  // R_X86_64_32 -> `compute_value10`
  TRACE           value_flags=IFUNC | CAN_BYPASS_GOT resolution_flags=DIRECT | GOT | PLT

                  call *%rax
                  cmp $0xA,%eax
                  je 0x000000000000005E
                  mov $3,%edi
                  call 0x000000000000015F
                  mov 0,%eax  // 0xAAA=resolve_count
                  cmp $2,%eax
                  je 0x0000000000000073
                  mov $4,%edi
                  call 0x000000000000015F

  wild 0x004017c3 b8 09 18 40 00 mov $0x401809,%eax  // Mov_r32_imm32(0x401809) resolve_compute_value10 compute_value10
  ld   0x00401083 b8 08 10 40 00 mov $0x401008,%eax  // Mov_r32_imm32(0x401008) PLT(IFUNC(resolve_compute_value10)) PLT(IFUNC(compute_value10))
  ORIG            b8 00 00 00 00 mov $0,%eax  // R_X86_64_32 -> `compute_value10`
  TRACE           value_flags=IFUNC | CAN_BYPASS_GOT resolution_flags=DIRECT | GOT | PLT

  wild 0x004017c8 48 3d 2e 18 40 00 cmp $0x40182E,%rax  // Cmp_RAX_imm32(0x40182e) resolve_compute_value32 compute_value32
  ld   0x00401088 48 3d 00 10 40 00 cmp $0x401000,%rax  // Cmp_RAX_imm32(0x401000) PLT(IFUNC(resolve_compute_value32)) PLT(IFUNC(compute_value32))
  ORIG            48 3d 00 00 00 00 cmp $0,%rax  // R_X86_64_32S -> `compute_value32`
  TRACE           value_flags=IFUNC | CAN_BYPASS_GOT resolution_flags=DIRECT | GOT | PLT

                  jne 0x000000000000008A
                  mov $5,%edi
                  call 0x000000000000015F

  wild 0x004017da b8 09 18 40 00 mov $0x401809,%eax  // Mov_r32_imm32(0x401809) resolve_compute_value10 compute_value10
  ld   0x0040109a b8 08 10 40 00 mov $0x401008,%eax  // Mov_r32_imm32(0x401008) PLT(IFUNC(resolve_compute_value10)) PLT(IFUNC(compute_value10))
  ORIG            b8 00 00 00 00 mov $0,%eax  // R_X86_64_32 -> `compute_value10`
  TRACE           value_flags=IFUNC | CAN_BYPASS_GOT resolution_flags=DIRECT | GOT | PLT

  wild 0x004017df 48 3d 09 18 40 00 cmp $0x401809,%rax  // Cmp_RAX_imm32(0x401809) resolve_compute_value10 compute_value10
  ld   0x0040109f 48 3d 08 10 40 00 cmp $0x401008,%rax  // Cmp_RAX_imm32(0x401008) PLT(IFUNC(resolve_compute_value10)) PLT(IFUNC(compute_value10))
  ORIG            48 3d 00 00 00 00 cmp $0,%rax  // R_X86_64_32S -> `compute_value10`
  TRACE           value_flags=IFUNC | CAN_BYPASS_GOT resolution_flags=DIRECT | GOT | PLT

                  je 0x00000000000000A1
                  mov $5,%edi
                  call 0x000000000000015F
                  mov $0x2A,%edi
                  call 0x000000000000015F
                  nop
                  leave
                  ret

asm.init_ifuncs
  ORIG            `/home/marxin/Programming/wild/wild/tests/build/ifunc_init.default-46162e6be85e5a0c.o`
                  push %rbp
                  mov %rsp,%rbp
                  sub $0x20,%rsp

  wild 0x00401850 48 c7 45 f8 d8 28 40 00 movq $0x4028D8,-8(%rbp)  // Mov_rm64_imm32(0xfffffffffffffff8) UNKNOWN-TLS
  ld   0x00401110 48 c7 45 f8 30 02 40 00 movq $0x400230,-8(%rbp)  // Mov_rm64_imm32(0xfffffffffffffff8) UNKNOWN-TLS
  ORIG            48 c7 45 f8 00 00 00 00 movq $0,-8(%rbp)  // R_X86_64_32S -> `__rela_iplt_start`
  TRACE           value_flags=ADDRESS | CAN_BYPASS_GOT resolution_flags=DIRECT

                  jmp 0x0000000000000056
                  mov -8(%rbp),%rax
                  mov 8(%rax),%rax
                  mov %eax,%edx
                  mov $0x25,%eax
                  cmp %rax,%rdx
                  je 0x000000000000002D
                  mov $7,%eax
                  jmp 0x0000000000000065
                  mov -8(%rbp),%rax
                  mov (%rax),%rax
                  mov %rax,-0x10(%rbp)
                  mov -8(%rbp),%rax
                  mov 0x10(%rax),%rax
                  mov %rax,-0x18(%rbp)
                  mov -0x18(%rbp),%rax
                  call *%rax
                  mov -0x10(%rbp),%rdx
                  mov %rax,(%rdx)
                  addq $0x18,-8(%rbp)

  wild 0x0040189e 48 81 7d f8 08 29 40 00 cmpq $0x402908,-8(%rbp)  // Cmp_rm64_imm32(0xfffffffffffffff8) UNKNOWN-TLS
  ld   0x0040115e 48 81 7d f8 60 02 40 00 cmpq $0x400260,-8(%rbp)  // Cmp_rm64_imm32(0xfffffffffffffff8) UNKNOWN-TLS
  ORIG            48 81 7d f8 00 00 00 00 cmpq $0,-8(%rbp)  // R_X86_64_32S -> `__rela_iplt_end`
  TRACE           value_flags=ADDRESS | CAN_BYPASS_GOT resolution_flags=DIRECT

                  jb 0x0000000000000012
                  mov $0,%eax
                  leave
                  ret

section.rela.plt.link
  wild .symtab
  ld Invalid ELF section index

Error: Validation failed.
Binary `/home/marxin/Programming/wild/wild/tests/build/ifunc.c-default.wild`. Relink with:
cargo run --bin wild -- --gc-sections -static -o /home/marxin/Programming/wild/wild/tests/build/ifunc.c-default.wild /home/marxin/Programming/wild/wild/tests/build/ifunc.default-f778dc51c46a70e0.o /home/marxin/Programming/wild/wild/tests/build/ifunc1.default-9aeb3e763f3dee95.o /home/marxin/Programming/wild/wild/tests/build/ifunc_init.default-46162e6be85e5a0c.o /home/marxin/Programming/wild/wild/tests/build/exit.default-debbe7c7ab67f747.o
 To revalidate:
cargo run --bin linker-diff -- --wild-defaults --ignore 'section.plt.entsize' --ref /home/marxin/Programming/wild/wild/tests/build/ifunc.c-default.ld /home/marxin/Programming/wild/wild/tests/build/ifunc.c-default.wild
❯ readelf -SW /home/marxin/Programming/wild/wild/tests/build/ifunc.c-default.ld
There are 12 section headers, starting at offset 0x3380:

Section Headers:
  [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
  [ 0]                   NULL            0000000000000000 000000 000000 00      0   0  0
  [ 1] .note.gnu.property NOTE            0000000000400200 000200 000030 00   A  0   0  8
  [ 2] .rela.plt         RELA            0000000000400230 000230 000030 18  AI  0   6  8
  [ 3] .plt              PROGBITS        0000000000401000 001000 000010 00  AX  0   0  8
  [ 4] .text             PROGBITS        0000000000401010 001010 000173 00  AX  0   0  1
  [ 5] .eh_frame         PROGBITS        0000000000402000 002000 0000f8 00   A  0   0  8
  [ 6] .got.plt          PROGBITS        0000000000403fe8 002fe8 000028 08  WA  0   0  8
  [ 7] .bss              NOBITS          0000000000404010 003010 000008 00  WA  0   0  4
  [ 8] .comment          PROGBITS        0000000000000000 003010 000019 01  MS  0   0  1
  [ 9] .symtab           SYMTAB          0000000000000000 003030 000210 18     10  13  8
  [10] .strtab           STRTAB          0000000000000000 003240 0000da 00      0   0  1
  [11] .shstrtab         STRTAB          0000000000000000 00331a 00005f 00      0   0  1
❯ readelf -SW /home/marxin/Programming/wild/wild/tests/build/ifunc.c-default.wild
There are 13 section headers, starting at offset 0x120:

Section Headers:
  [Nr] Name              Type            Address          Off    Size   ES Flg Lk Inf Al
  [ 0]                   NULL            0000000000400000 000000 000000 00      0   0  0
  [ 1] .phdr             NULL            0000000000400040 000040 000000 00      0   0  0
  [ 2] .shdr             NULL            0000000000400120 000120 000000 00      0   0  0
  [ 3] .shstrtab         STRTAB          0000000000400460 000460 000059 00      0   0  1
  [ 4] .symtab           SYMTAB          00000000004004c0 0004c0 000198 18      5   9  8
  [ 5] .strtab           STRTAB          0000000000400658 000658 0000d8 00      0   0  1
  [ 6] .plt              PROGBITS        0000000000401730 000730 000020 10  AX  0   0 16
  [ 7] .text             PROGBITS        0000000000401750 000750 000173 00  AX  0   0  1
  [ 8] .got              PROGBITS        00000000004028c8 0008c8 000010 08  WA  0   0  8
  [ 9] .rela.plt         RELA            00000000004028d8 0008d8 000030 18  AI  4   0  8
  [10] .eh_frame         PROGBITS        0000000000402908 000908 000140 00   A  0   0  8
  [11] .bss              NOBITS          0000000000402a48 000a48 000004 00  WA  0   0  4
  [12] .comment          PROGBITS        0000000000402a4c 000a48 000035 01  MS  0   0  1
❯ ld --version
GNU ld (GNU Binutils; openSUSE Tumbleweed) 2.42.0.20240130-4
marxin commented 1 month ago

The issue is actually caused by the fact that the following section is missing in the output generated by Wild linker:

[ 1] .note.gnu.property NOTE 0000000000400200 000200 000030 00 A 0 0 8

❯ readelf -n /home/marxin/Programming/wild/wild/tests/build/ifunc.c-default.ld

Displaying notes found in: .note.gnu.property
  Owner                Data size    Description
  GNU                  0x00000020   NT_GNU_PROPERTY_TYPE_0
      Properties: x86 feature used: x86
    x86 ISA used: x86-64-baseline
davidlattimore commented 1 month ago

Hi @marxin. Thanks for the report.

Wild doesn't output exactly the same sections as GNU ld and that's expected. That alone shouldn't cause the test to fail. I'd guess GNU ld has changed some things and linker-diff needs to be changed to accommodate those changes.

I'm stuck on a relatively old version of GNU ld - 2.38. I also set up a docker image, but even that only has 2.40. So I'll need to probably set up a different docker image in order to get something even newer.

linker-diff is reporting more than one failure. So the link field on the .rela.plt section is only one of the failures. The other failures are the asm diffs printed above that. Some of them look to be due to wild bypassing the GOT (global offset table) for an ifunc - which doesn't seem like a valid thing to do. I just checked and it's not doing that for me. One possible clue is the line:

ORIG            b8 00 00 00 00 mov $0,%eax  // R_X86_64_32 -> `compute_value10`

That shows the relocation from the original input file. It's requesting the absolute address of the function compute_value10.

If I force my test to fail so as to get the same line, it looks like this:

ORIG            e8 00 00 00 00 call 5  // R_X86_64_PLT32 -> `compute_value10`

The compiler requested a reference to the PLT (procedure linkage table) for the function compute_value10.

So I think the difference is perhaps the version of the compiler that was used. Or perhaps the default flags for the compiler.

marxin commented 1 month ago

> Hi @marxin. Thanks for the report.

You're welcome, I'm very much curious about the project!

> I'm stuck on a relatively old version of GNU ld - 2.38. I also set up a docker image, but even that only has 2.40. So I'll need to probably set up a different docker image in order to get something even newer.

Please try testing that on opensuse/tumbleweed which is an openSUSE rolling distro that contains the latest binutils.

> So I think the difference is perhaps the version of the compiler that was used. Or perhaps the default flags for the compiler.

Yeah, that's very likely the case. I know openSUSE uses -fPIE by default and the binutils might also have some non-default configuration options:

❯ gcc -v
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/lib64/gcc/x86_64-suse-linux/13/lto-wrapper
OFFLOAD_TARGET_NAMES=nvptx-none:amdgcn-amdhsa
OFFLOAD_TARGET_DEFAULT=1
Target: x86_64-suse-linux
Configured with: ../configure CFLAGS=' -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g' CXXFLAGS=' -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g' XCFLAGS=' -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g' TCFLAGS=' -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -Werror=return-type -g' GDCFLAGS=' -O2 -funwind-tables -fasynchronous-unwind-tables -fstack-clash-protection -g' --prefix=/usr --infodir=/usr/share/info --mandir=/usr/share/man --libdir=/usr/lib64 --libexecdir=/usr/lib64 --enable-languages=c,c++,objc,fortran,obj-c++,ada,go,d,jit,m2 --enable-offload-targets=nvptx-none,amdgcn-amdhsa, --enable-offload-defaulted --without-cuda-driver --enable-host-shared --enable-checking=release --disable-werror --with-gxx-include-dir=/usr/include/c++/13 --with-libstdcxx-zoneinfo=/usr/share/zoneinfo --enable-ssp --disable-libssp --disable-libvtv --enable-cet=auto --disable-libcc1 --enable-plugin --with-bugurl=https://bugs.opensuse.org/ --with-pkgversion='SUSE Linux' --with-slibdir=/lib64 --with-system-zlib --enable-libstdcxx-allocator=new --disable-libstdcxx-pch --enable-libphobos --enable-version-specific-runtime-libs --with-gcc-major-version-only --enable-linker-build-id --enable-linux-futex --enable-gnu-indirect-function --program-suffix=-13 --without-system-libunwind --enable-multilib --with-arch-32=x86-64 --with-tune=generic --with-build-config=bootstrap-lto-lean --enable-link-serialization --build=x86_64-suse-linux --host=x86_64-suse-linux
Thread model: posix
Supported LTO compression algorithms: zlib zstd
gcc version 13.3.0 (SUSE Linux) 
davidlattimore commented 1 month ago

I've built a docker image based on openSUSE that successfully reproduces this issue and others. I've been busy this week preparing for a talk, but now that that's out of the way I can hopefully get through some of the bugs.

marxin commented 1 month ago

Great then! May I ask whether the talk will be public? If so, I would be interested in watching it.

davidlattimore commented 1 month ago

I hope so. I believe it was recorded, but I'm not sure when it will get posted. I have however put up a blog post that I wrote in conjunction with the talk. The content is pretty similar. It's at https://davidlattimore.github.io/posts/2024/07/17/testing-a-linker.html

davidlattimore commented 3 weeks ago

Just an update on progress here... there are various things about openSUSE that make it different than the Pop-OS (Ubuntu-based) system I've primarily been testing on. One difference is that binaries are compiled with -fno-pie by default. I added support for this a couple of days ago, although I haven't extensively tested it as yet.

Another difference is that the gcc version on openSUSE seems to be gcc-7 by default, which is pretty old. Some of the tests aren't working because the test is using attributes that weren't supported by that version of gcc - e.g. the retain attribute. I'm fixing this by changing the tests to look explicitly for a newer version of gcc, e.g. gcc-13 and only use gcc if that isn't found.

I'm also finding various packages that need to be installed in order for some of the tests to work. For example, tests that link with -static-pie need glibc-devel-static to be installed. I'm updating opensuse.Dockerfile as I find such things.

Lastly, there are some false positives from linker-diff that I need to fix. In particular, it currently doesn't do a good job when a relocation points to somewhere within the bytes of a symbol rather than to the start.

marxin commented 3 weeks ago

> One difference is that binaries are compiled with -fno-pie by default. I added support for this a couple of days ago, although I haven't extensively tested it as yet.

Thanks for working on that. I've tested that briefly and it seems it's not working for a simple test-case:

❯ printf '#include<iostream>\nint main() { std::cout << "XXX" << std::endl; return 0; }' | gcc-13 -x c++ - -lstdc++ -B ~/Programming/wild -fpie
~/Programming/wild on  use-opensuse-tumbleweed [$?] via 🦀 v1.82.0-nightly 
❯ echo $?
0
❯ printf '#include<iostream>\nint main() { std::cout << "XXX" << std::endl; return 0; }' | gcc-13 -x c++ - -lstdc++ -B ~/Programming/wild -fno-pie
~/Programming/wild on  use-opensuse-tumbleweed [$?] via 🦀 v1.82.0-nightly 
❯ ./a.out 
Segmentation fault (core dumped)
❯ valgrind a.out
...
==67944== Process terminating with default action of signal 11 (SIGSEGV): dumping core
==67944==  Bad permissions for mapped region at address 0x404F80
==67944==    at 0x404F80: ??? (in /home/marxin/Programming/wild/a.out)
==67944==    by 0x4B2F1EF: (below main) (in /usr/lib64/libc.so.6)

> Another difference is that the gcc version on openSUSE seems to be gcc-7 by default, which is pretty old. Some of the tests aren't working because the test is using attributes that weren't supported by that version of gcc - e.g. the retain attribute. I'm fixing this by changing the tests to look explicitly for a newer version of gcc, e.g. gcc-13 and only use gcc if that isn't found.

Yeah, it's ancient and I would prefer using Tumbleweed (that I have installed on my machine). It's a rolling distro that provides GCC 13 (and will provide GCC 14 by default soonish): #23.

davidlattimore commented 3 weeks ago

I'm trying to reproduce the segfault. Does it repro for you in the docker image? Here's what I tried:

docker build -t wild-dev-opensuse . -f docker/opensuse.Dockerfile && docker run -it wild-dev-opensuse

Then in the resulting shell:

10292fab3b37:/wild # cargo build
   ...
    Finished `dev` profile [unoptimized] target(s) in 5.37s
10292fab3b37:/wild # printf '#include<iostream>\nint main() { std::cout << "XXX" << std::endl; return 0; }' | gcc-13 -x c++ - -lstdc++ -B target/debug/wild -fno-pie
10292fab3b37:/wild # ./a.out 
XXX
10292fab3b37:/wild # 
10292fab3b37:/wild # echo $?
0
10292fab3b37:/wild # 
davidlattimore commented 3 weeks ago

Ah, wait, that didn't link with wild - I didn't use -B correctly

davidlattimore commented 3 weeks ago

OK, I copied target/release/wild to /usr/bin/ld, and can now reproduce the segfault. Thanks, will investigate...

davidlattimore commented 3 weeks ago

The issue seemed to be due to applying copy relocations to functions, which won't work. I've fixed it to now generate a PLT entry instead of doing a copy relocation when the target of the relocation is a function.

davidlattimore commented 3 weeks ago

Good news... After a few more changes, the tests now pass on the openSUSE Tumbleweed docker image. Thanks for all your bug reports on this and the other issues BTW :)

marxin commented 2 weeks ago

Thanks for the fixes ;) I've noticed one more issue while building mold with wild: #24.