jart / cosmopolitan

build-once run-anywhere c library
ISC License
18.36k stars 630 forks source link

Hello.com, redbean.com fail to run on MacOS M1 under Rosetta #429

Closed pkulchenko closed 2 years ago

pkulchenko commented 2 years ago

hello.com is built on linux following instructions published here: https://justine.lol/cosmopolitan/ and then executed on a MacBook Pro with the M1 Max chip (arm64) running macOS Monterey 12.4 under Rosetta translation.

#To confirm that it's running under Rosetta:
❯ arch -arch x86_64 ./hello.com
[1]    79473 segmentation fault  arch -arch x86_64 ./hello.com

❯ sh ./hello.com
[1]    73377 segmentation fault  sh ./hello.com
❯ sh ./hello.com --assimilate
❯ ./hello.com
[1]    73395 segmentation fault  ./hello.com
❯ lldb --file ./hello.com
(lldb) target create "./hello.com"
Current executable set to '/Users/user/cosmo/hello.com' (x86_64).
(lldb) r
Process 73409 launched: '/Users/user/cosmo/hello.com' (x86_64)
Process 73409 stopped
* thread #1, stop reason = EXC_BAD_ACCESS (code=1, address=0x1f)
    frame #0: 0x000000000040239f hello.com
->  0x40239f: movl   (%rsp), %ebx
    0x4023a2: leaq   0x8(%rsp), %rsi
    0x4023a7: leaq   0x10(%rsp,%rbx,8), %rdx
    0x4023ac: movq   %rsp, 0x1e97d(%rip)
Target 0: (hello.com) stopped.
(lldb) bt
* thread #1, stop reason = EXC_BAD_ACCESS (code=1, address=0x1f)
  * frame #0: 0x000000000040239f hello.com
(lldb)
❯ file hello.com
hello.com: Mach-O 64-bit executable x86_64

Full crash report:

{"app_name":"hello.com","timestamp":"2022-06-17 12:25:25.00 -0700","app_version":"","slice_uuid":"45e417b7-2c6d-a6da-45e4-17b72c6da6da","build_version":"","platform":0,"share_with_app_devs":1,"is_first_party":1,"bug_type":"309","os_version":"macOS 12.4 (21F79)","incident_id":"E185D602-4547-4382-874C-6A44AF63AAD3","name":"hello.com"}
{
  "uptime" : 2300000,
  "procLaunch" : "2022-06-17 12:25:25.2262 -0700",
  "procRole" : "Unspecified",
  "version" : 2,
  "userID" : 501,
  "deployVersion" : 210,
  "modelCode" : "MacBookPro18,4",
  "procStartAbsTime" : 55200184527962,
  "coalitionID" : 3240,
  "osVersion" : {
    "train" : "macOS 12.4",
    "build" : "21F79",
    "releaseType" : "User"
  },
  "captureTime" : "2022-06-17 12:25:25.4489 -0700",
  "incident" : "E185D602-4547-4382-874C-6A44AF63AAD3",
  "bug_type" : "309",
  "pid" : 73395,
  "procExitAbsTime" : 55200189857901,
  "translated" : true,
  "cpuType" : "X86-64",
  "procName" : "[hello.com](http://hello.com/)",
  "procPath" : "\/Users\/USER\/*\/[hello.com](http://hello.com/)",
  "parentProc" : "zsh",
  "parentPid" : 73319,
  "coalitionName" : "com.googlecode.iterm2",
  "crashReporterKey" : "917168CC-9F91-8E09-0E83-6F8C1759F8E6",
  "responsiblePid" : 10569,
  "responsibleProc" : "iTerm2",
  "wakeTime" : 29832,
  "sleepWakeUUID" : "5126274E-4713-4921-8ECB-404A80F155BC",
  "sip" : "enabled",
  "vmRegionInfo" : "0x1f is not in any region.  Bytes before following region: 140718563835873\n      REGION TYPE                    START - END         [ VSIZE] PRT\/MAX SHRMOD  REGION DETAIL\n      UNUSED SPACE AT START\n--->  \n      mapped file              7ffb9802c000-7ffbc5b10000 [730.9M] r-x\/r-x SM=COW  ...t_id=a25954a5",
  "isCorpse" : 1,
  "exception" : {"codes":"0x0000000000000001, 0x000000000000001f","rawCodes":[1,31],"type":"EXC_BAD_ACCESS","signal":"SIGSEGV","subtype":"KERN_INVALID_ADDRESS at 0x000000000000001f"},
  "termination" : {"flags":0,"code":11,"namespace":"SIGNAL","indicator":"Segmentation fault: 11","byProc":"exc handler","byPid":73395},
  "vmregioninfo" : "0x1f is not in any region.  Bytes before following region: 140718563835873\n      REGION TYPE                    START - END         [ VSIZE] PRT\/MAX SHRMOD  REGION DETAIL\n      UNUSED SPACE AT START\n--->  \n      mapped file              7ffb9802c000-7ffbc5b10000 [730.9M] r-x\/r-x SM=COW  ...t_id=a25954a5",
  "extMods" : {"caller":{"thread_create":0,"thread_set_state":0,"task_for_pid":0},"system":{"thread_create":0,"thread_set_state":1,"task_for_pid":2},"targeted":{"thread_create":0,"thread_set_state":0,"task_for_pid":0},"warnings":0},
  "usedImages" : [
  {
    "size" : 0,
    "source" : "A",
    "base" : 0,
    "uuid" : "00000000-0000-0000-0000-000000000000"
  }
],
  "legacyInfo" : {
  "threadHighlighted" : 0
},
  "trialInfo" : {
  "rollouts" : [
    {
      "rolloutId" : "6112e14f37f5d11121dcd519",
      "factorPackIds" : {
        "SIRI_TEXT_TO_SPEECH" : "629e5353a0d1644ed53959b8"
      },
      "deploymentId" : 240000148
    },
    {
      "rolloutId" : "61301e3a61217b3110231469",
      "factorPackIds" : {
        "SIRI_FIND_MY_CONFIGURATION_FILES" : "6216ae152a40e71046e16225"
      },
      "deploymentId" : 240000016
    }
  ],
  "experiments" : [

  ]
},
  "reportNotes" : [
  "_dyld_process_info_create failed with 5",
  "dyld_process_snapshot_get_shared_cache failed",
  "Failed to create CSSymbolicatorRef - corpse still valid ¯\\_(ツ)_\/¯"
]
}

@jart, is there any additional information I can get to troubleshoot this? --ftrace doesn't show any information at all, so it appears to be crashing before it gets to any of the function frames.

It may be related to AVX instruction support (even though EXC_BAD_ACCESS seems to indicate a memory issue); Rosetta documentation indicates the following:

Rosetta translates all x86_64 instructions, but it doesn’t support the execution of some newer instruction sets and processor features, such as AVX, AVX2, and AVX512 vector instructions. If you include these newer instructions in your code, execute them only after verifying that they are available. For example, to determine if AVX512 vector instructions are available, use the sysctlbyname function to check the hw.optional.avx512f attribute.

dkulchenko commented 2 years ago

The issue appears to be caused by line 54 in libc/crt/crt.S:

//  translates arguments from old stack abi
    mov (%rsp),%ebx         # argc <-- this line
    lea 8(%rsp),%rsi            # argv
    lea 16(%rsp,%rbx,8),%rdx        # envp

Found via lldb and objdump:

(lldb) run
Process 90454 launched: '/Users/daniilk/cosmo/hello.com' (x86_64)
Process 90454 stopped
* thread #1, stop reason = EXC_BAD_ACCESS (code=1, address=0x1f)
    frame #0: 0x00000000004023df hello.com
->  0x4023df: movl   (%rsp), %ebx
    0x4023e2: leaq   0x8(%rsp), %rsi
    0x4023e7: leaq   0x10(%rsp,%rbx,8), %rdx
    0x4023ec: movq   %rsp, 0xbfd6d(%rip)
Target 0: (hello.com) stopped.

objdump disassembly:

00000000004023c2 <_start>:
  4023c2:   48 85 ff                test   %rdi,%rdi
  4023c5:   48 0f 45 e7             cmovne %rdi,%rsp
  4023c9:   74 02                   je     4023cd <_start+0xb>
  4023cb:   b1 20                   mov    $0x20,%cl
  4023cd:   88 0d 4d ae 0a 00       mov    %cl,0xaae4d(%rip)        # 4ad220 <__hostos>
  4023d3:   0f 31                   rdtsc  
  4023d5:   bb 10 23 4c 00          mov    $0x4c2310,%ebx
  4023da:   89 03                   mov    %eax,(%rbx)
  4023dc:   89 53 04                mov    %edx,0x4(%rbx)
  4023df:   8b 1c 24                mov    (%rsp),%ebx
  4023e2:   48 8d 74 24 08          lea    0x8(%rsp),%rsi
  4023e7:   48 8d 54 dc 10          lea    0x10(%rsp,%rbx,8),%rdx
jart commented 2 years ago

If it's possible for our software to run on MacOS M1 via Rosetta then I'd like to see that happen. However I can't help with this issue because I don't own an M1. I don't anticipate that changing anytime soon, so this issue is community supported and contributions are welcome.

pkulchenko commented 2 years ago

> If it's possible for our software to run on MacOS M1 via Rosetta then I'd like to see that happen. However I can't help with this issue because I don't own an M1. I don't anticipate that changing anytime soon, so this issue is community supported and contributions are welcome.

@jart, assuming we have access to M1, is there something further that can be investigated with respect to this code?

//  translates arguments from old stack abi
    mov (%rsp),%ebx         # argc <-- this line
    lea 8(%rsp),%rsi            # argv
    lea 16(%rsp,%rbx,8),%rdx        # envp

It looks like the value of %rsp is wrong, but it's not clear why.

jart commented 2 years ago

Something would have to be very irregular about Rosetta for it to fail at that line. That instruction is only a few opcodes away from the program entrypoint. It's possible that when Rosetta spawns the program, it's using a different ABI. If you're authorized to use a debugger on your M1 machine, then it would help to know the values of every single x86 general register at the time that instruction happens. It would also be worth confirming that execution does in fact start at _start.

jart commented 2 years ago

It's also worth making sure we're talking about an assimilated binary here, just to rule out that _start() wasn't called by the APE loader.

dkulchenko commented 2 years ago

This is on an assimilated binary (the assimilate step runs without issue).

Here's the registers, formatted as binary, then decimal, then string:

❯ lldb --file ./hello.com
(lldb) target create "./hello.com"
Current executable set to '/Users/daniilk/cosmo/hello.com' (x86_64).
(lldb) r
Process 42094 launched: '/Users/daniilk/cosmo/hello.com' (x86_64)
Process 42094 stopped
* thread #1, stop reason = EXC_BAD_ACCESS (code=1, address=0x1f)
    frame #0: 0x00000000004023df hello.com
->  0x4023df: movl   (%rsp), %ebx
    0x4023e2: leaq   0x8(%rsp), %rsi
    0x4023e7: leaq   0x10(%rsp,%rbx,8), %rdx
    0x4023ec: movq   %rsp, 0xbfd6d(%rip)
Target 0: (hello.com) stopped.

## as binary:

(lldb) register read --all
General Purpose Registers:
       rax = 0x00000000b4b2d1f7
       rbx = 0x00000000004c2310
       rcx = 0x00000001044d4020
       rdx = 0x00000000000abdda
       rdi = 0x000000000000001f
       rsi = 0x000000b900000000
       rbp = 0x0000000000000020
       rsp = 0x000000000000001f
        r8 = 0x57cfd102a32d005a
        r9 = 0x57cfd102a32d005a
       r10 = 0x00007fffffe00fe0
       r11 = 0x000000010027e1f0
       r12 = 0x000000010027e0b0
       r13 = 0x0000000100008260
       r14 = 0x0000000000000000
       r15 = 0x0000000000000000
       rip = 0x00000000004023df
    rflags = 0x0000000000000203

@jart I'd be more than happy to spin up an M1 Mac server in the cloud for a week or two and give you full access (with hello.com etc already loaded on), if that would help at all!

jart commented 2 years ago

That's a lot of clobbered data in registers. Something looks very not right. If Rosetta is able to have that big of an ABI issue after executing only five instructions, then imagine how many more issues there'll be with the other billion? I can't support this because I don't have one of these machines in my automated test fleet. That's why this issue needs to be community supported.

dkulchenko commented 2 years ago

Progress so far: it looks like something's going wrong with platform detection. It's trying to call syscalls by their Linux IDs instead of XNU.

Replacing #if SupportsFreebsd() in crt.S with #if 0 to bypass that section, and adding mov $XNU,%al right after _init_systemfive_detected: in systemfive.S resolves the segfaults and gets us to a cosmopolitan error instead:

❯ ./hello.com
error: 0x1044c0000 size 131'072 overlaps shadow space
❯ ./hello.com --strace
SYS      0            281'496 bell system five system call support 331 magnums loaded on xnu's not unix!
error: 0x1044c0000 size 131'072 overlaps shadow space
SYS  73037            552'885 _Exit(1)
❯ ./hello.com --ftrace
error: 0x1044c0000 size 131'072 overlaps shadow space
❯

Digging further, but it seems XNU's not being detected correctly someplace.

dkulchenko commented 2 years ago

Stubbing out OverlapsShadowSpace in libc/runtime/memtrack.internal.h to always return 0 in combination with the above two changes gets us to a fully "working" hello.com on M1 under Rosetta:

❯ ./hello.com
hello world
dkulchenko commented 2 years ago

As a simpler hack, adding:

mov $0, %rdi
mov $XNU,%cl

immediately under _start: in crt.S (and still stubbing out OverlapsShadowSpace) also leads us to a working hello world (only on an assimilated binary).

jart commented 2 years ago

That's reassuring. Now we need to find a safe hack for detecting XNU. The way we currently do it is with the MAC_LC_UNIXTHREAD data structure.

https://github.com/jart/cosmopolitan/blob/5d837c4e7c8ea332a9e8f87a023ad12e682586a4/ape/ape.S#L862

It's probably not recognizing the MAC_THREAD_NEXGEN32E command because it's on ARM. If we could find out what the magnum is for flavaflav on M1 then we could add a second section.

pkulchenko commented 2 years ago

> If we could find out what the magnum is for flavaflav on M1 then we could add a second section.

How would we do that and what is flavaflav?

dkulchenko commented 2 years ago

> > If we could find out what the magnum is for flavaflav on M1 then we could add a second section.
>
> How would we do that and what is flavaflav?

Looking into it - it's in the Mach-o headers (see: https://github.com/aidansteele/osx-abi-macho-file-format-reference#thread_command).

dkulchenko commented 2 years ago

MAC_THREAD_NEXGEN32E is currently set to 4 in macho.internal.h because of this line in the i386 Mach-O headers:

#define x86_THREAD_STATE64 4

the equivalent line in the ARM headers is:

#define ARM_THREAD_STATE64 6

but setting MAC_THREAD_NEXGEN32E to 6 breaks the Mach-O executable under Rosetta (it doesn't recognize it as an executable file anymore). So it looks like Rosetta is translating the flavor along with everything else, so sadly it doesn't look like the issue is there.

I'm wondering if it's the use of LC_UNIXTHREAD (deprecated since macOS 10.7) that's being poorly translated by Rosetta (maybe that's why %rdi and %rcx aren't being passed along correctly) - I might try tomorrow to see if that block can be ported to the newer LC_MAIN instead - before I dig too much there, @jart is there a reason LC_MAIN didn't fit?

jart commented 2 years ago

It'd be great if we can find out what the non-deprecated way of encoding static binaries for Mac OS X is. I wouldn't be surprised if there isn't one. Apple doesn't like static linking and last time I checked they removed a lot of their Mach-O documentation off the web because they want developers to use their build tools. It took me ages to pull the docs out of the Wayback machine and find the current hack we're using. If there's a better officially blessed one then I'm totally open to using it. Just so long as it doesn't require rewriting all the system calls.

dkulchenko commented 2 years ago

I took a deeper look today and you're right - there's no way of avoiding dyld without using LC_UNIXTHREAD. Ah well, let's make it work then.

I wrote a script to analyze all the binaries on my system that run under Rosetta - for each, it dropped into a debugger and looked at the initial registers on spawn into the process, then kept track of registers that were always the same between all the executables tested. The script spit out the following:

rbx always 0x00000000ffffffff
rdx always 0x0000000000000001
rflags always 0x0000000000000246
cs always 0x000000000000002b
ebx always 0xffffffff
edx always 0x00000001
bx always 0xffff
dx always 0x0001
bh always 0xff
bl always 0xff
dl always 0x01

I included a bunch of executables from wildly different toolchains (Golang, C via GCC, C via Clang/llvm, and patched Cosmopolitan) to make sure we covered a variety of different entrypoints, and those were the initial registers that were in common between all of the executables (x64 running under Rosetta on ARM).

To rule out an environment-specific coincidence, I ran the same script on a similar set of binaries on a completely different machine (M1 Mac Mini in the cloud) with a slightly different macOS version and got the same results.

So I figured let's use that to detect Rosetta. The following patch detects and de-clobbers Rosetta for both assimilated binaries (crt.S) and non-assimilated (ape) binaries:

diff --git a/ape/loader-elf.S b/ape/loader-elf.S
index 7ba774525..ea7975f1a 100644
--- a/ape/loader-elf.S
+++ b/ape/loader-elf.S
@@ -214,6 +214,28 @@ macho: .long   0xFEEDFACE+1
 // @see    APE_LOADER_ENTRY
 // @see    ape/loader.h
 _start:    mov %rsp,%rsi
+#if SupportsXnu()
+   cmp $0x0000000000000001, %rdx
+   jne 0f
+   movq $0x00000000ffffffff, %rdx
+   cmp %rdx, %rbx
+   jne 0f
+   mov $0, %rax
+   mov $0, %rbx
+   mov $0, %rcx
+   mov $XNU, %rdx
+   mov $0, %rdi
+   mov $0, %rbp
+   mov $0, %r8
+   mov $0, %r9
+   mov $0, %r10
+   mov $0, %r11
+   mov $0, %r12
+   mov $0, %r13
+   mov $0, %r14
+   mov $0, %r15
+0:
+#endif
    jmp ApeLoader
    .endfn  _start,globl

diff --git a/ape/loader-macho.S b/ape/loader-macho.S
index b027760a7..13c3e5261 100644
--- a/ape/loader-macho.S
+++ b/ape/loader-macho.S
@@ -114,7 +114,29 @@ macho: .long   0xFEEDFACE+1

    .align  64
 _start:    mov %rsp,%rsi
-   jmp ApeLoader
+#if SupportsXnu()
+   cmp $0x0000000000000001, %rdx
+   jne 0f
+   movq $0x00000000ffffffff, %rdx
+   cmp %rdx, %rbx
+   jne 0f
+   mov $0, %rax
+   mov $0, %rbx
+   mov $0, %rcx
+   mov $XNU, %rdx
+   mov $0, %rdi
+   mov $0, %rbp
+   mov $0, %r8
+   mov $0, %r9
+   mov $0, %r10
+   mov $0, %r11
+   mov $0, %r12
+   mov $0, %r13
+   mov $0, %r14
+   mov $0, %r15
+0:
+#endif
+  jmp  ApeLoader
    .endfn  _start,globl

 __syscall_loader:
diff --git a/libc/crt/crt.S b/libc/crt/crt.S
index f30fde679..993bf4d82 100644
--- a/libc/crt/crt.S
+++ b/libc/crt/crt.S
@@ -30,14 +30,38 @@
 // @note   ape.S and ape-loader both set RCX to XNU on Darwin
 // @noreturn
 _start:
+#if SupportsXnu()
+   // detect and declobber Rosetta
+   cmp $0x0000000000000001, %rdx
+   jne 0f
+   movq $0x00000000ffffffff, %rdx
+   cmp %rdx, %rbx
+   jne 0f
+   mov $0, %rax
+   mov $IMAGE_BASE_VIRTUAL, %rbx
+   mov $XNU, %rcx
+   mov $0, %rdx
+   mov $0, %rdi
+   mov $0, %rsi
+   mov $0, %rbp
+   mov $0, %r8
+   mov $0, %r9
+   mov $0, %r10
+   mov $0, %r11
+   mov $0, %r12
+   mov $0, %r13
+   mov $0, %r14
+   mov $0, %r15
+0:
+#endif

 #if SupportsFreebsd()
 // detect free besiyata dishmaya
    test    %rdi,%rdi
    cmovnz  %rdi,%rsp
-   jz  0f
+   jz  1f
    movb    $FREEBSD,%cl
-0:
+1:
 #endif

 // set operating system when already detected
@@ -80,14 +104,14 @@ _start:
 #if SupportsXnu()
 // xnu doesn't have auxiliary values
    testb   IsXnu()
-   jz  1f              # polyfill xnu auxv
+   jz  2f              # polyfill xnu auxv
    push    $0              # auxv[1][1]=0
    push    $0              # auxv[1][0]=0
    mov %rsp,%rcx           # auxv
 #endif

 // enter cosmopolitan runtime
-1: mov %ebx,%edi
+2: mov %ebx,%edi
    call    cosmo
 9: .unreachable
    .endfn  _start,weak,hidden

It likely needs to be cleaned up to be PR-worthy (I had to learn assembler for this, so I'm very rusty) but the above makes things fully work on M1 (both assimilated and non-assimilated) and things still seem to be okay on regular Intel macOS and Linux as well. Tested on both hello.com and redbean.com and everything is groovy.

Note that on MODE=dbg I'm still having to stub out OverlapsShadowSpace in memtrack.internal.h to always return 0 - I'm not quite sure how to fix that one.

dkulchenko commented 2 years ago

Here's a tidier patch:

diff --git a/ape/loader-elf.S b/ape/loader-elf.S
index 7ba774525..11cf22c10 100644
--- a/ape/loader-elf.S
+++ b/ape/loader-elf.S
@@ -214,6 +214,28 @@ macho: .long   0xFEEDFACE+1
 // @see    APE_LOADER_ENTRY
 // @see    ape/loader.h
 _start:    mov %rsp,%rsi
+#if SupportsXnu()
+   cmp $0x1, %rdx
+   jne 0f
+   movq $0xffffffff, %rdx
+   cmp %rdx, %rbx
+   jne 0f
+   mov $0, %rax
+   mov $0, %rbx
+   mov $0, %rcx
+   mov $XNU, %rdx
+   mov $0, %rdi
+   mov $0, %rbp
+   mov $0, %r8
+   mov $0, %r9
+   mov $0, %r10
+   mov $0, %r11
+   mov $0, %r12
+   mov $0, %r13
+   mov $0, %r14
+   mov $0, %r15
+0:
+#endif
    jmp ApeLoader
    .endfn  _start,globl

diff --git a/ape/loader-macho.S b/ape/loader-macho.S
index b027760a7..3c738c66e 100644
--- a/ape/loader-macho.S
+++ b/ape/loader-macho.S
@@ -114,7 +114,29 @@ macho: .long   0xFEEDFACE+1

    .align  64
 _start:    mov %rsp,%rsi
-   jmp ApeLoader
+#if SupportsXnu()
+   cmp $0x1, %rdx
+   jne 0f
+   movq $0xffffffff, %rdx
+   cmp %rdx, %rbx
+   jne 0f
+   mov $0, %rax
+   mov $0, %rbx
+   mov $0, %rcx
+   mov $XNU, %rdx
+   mov $0, %rdi
+   mov $0, %rbp
+   mov $0, %r8
+   mov $0, %r9
+   mov $0, %r10
+   mov $0, %r11
+   mov $0, %r12
+   mov $0, %r13
+   mov $0, %r14
+   mov $0, %r15
+0:
+#endif
+  jmp  ApeLoader
    .endfn  _start,globl

 __syscall_loader:
diff --git a/libc/crt/crt.S b/libc/crt/crt.S
index f30fde679..1305fc422 100644
--- a/libc/crt/crt.S
+++ b/libc/crt/crt.S
@@ -30,6 +30,30 @@
 // @note   ape.S and ape-loader both set RCX to XNU on Darwin
 // @noreturn
 _start:
+#if SupportsXnu()
+   // detect and declobber Rosetta
+   cmp $0x1, %rdx
+   jne notm1
+   movq $0xffffffff, %rdx
+   cmp %rdx, %rbx
+   jne notm1
+   mov $0, %rax
+   mov $IMAGE_BASE_VIRTUAL, %rbx
+   mov $XNU, %rcx
+   mov $0, %rdx
+   mov $0, %rdi
+   mov $0, %rsi
+   mov $0, %rbp
+   mov $0, %r8
+   mov $0, %r9
+   mov $0, %r10
+   mov $0, %r11
+   mov $0, %r12
+   mov $0, %r13
+   mov $0, %r14
+   mov $0, %r15
+notm1:
+#endif

 #if SupportsFreebsd()
 // detect free besiyata dishmaya
dkulchenko commented 2 years ago

I also submitted a bug report upstream to Apple to see if they'd be interested in not disregarding LC_UNIXTHREAD registers under Rosetta, but I doubt that'll go anywhere. So I think detecting running under Rosetta as above is the way to go.

jart commented 2 years ago

Great find! I'm loving the new hack. You certainly learned assembly quickly. Your comments are already pull request worthy. Some things like clearing the registers aren't strictly necessary. Here's an example of how you could simplify the change, assuming it works.

diff --git a/ape/loader-elf.S b/ape/loader-elf.S
index 7ba774525..fb38e1daf 100644
--- a/ape/loader-elf.S
+++ b/ape/loader-elf.S
@@ -213,7 +213,17 @@ macho: .long   0xFEEDFACE+1
 //
 // @see    APE_LOADER_ENTRY
 // @see    ape/loader.h
-_start:    mov %rsp,%rsi
+_start:
+
+// Hack for detecting M1 Rosetta environment.
+// https://github.com/jart/cosmopolitan/issues/429#issuecomment-1166704377
+   cmp $-1,%ebx
+   jne 0f
+   cmp $+1,%edx
+   jne 0f
+   mov $XNU,%dl
+
+0: mov %rsp,%rsi
    jmp ApeLoader
    .endfn  _start,globl

diff --git a/ape/loader-macho.S b/ape/loader-macho.S
index b027760a7..c572ec40b 100644
--- a/ape/loader-macho.S
+++ b/ape/loader-macho.S
@@ -113,7 +113,17 @@ macho: .long   0xFEEDFACE+1
    .endobj macho,globl

    .align  64
-_start:    mov %rsp,%rsi
+_start:
+
+// Hack for detecting M1 Rosetta environment.
+// https://github.com/jart/cosmopolitan/issues/429#issuecomment-1166704377
+   cmp $-1,%ebx
+   jne 0f
+   cmp $+1,%edx
+   jne 0f
+   mov $XNU,%dl
+
+0: mov %rsp,%rsi
    jmp ApeLoader
    .endfn  _start,globl

diff --git a/ape/loader.c b/ape/loader.c
index d38e4b838..1653b53ab 100644
--- a/ape/loader.c
+++ b/ape/loader.c
@@ -595,11 +595,11 @@ __attribute__((__noreturn__)) void ApeLoader(long di, long *sp, char dl,
   // detect freebsd
   if (handoff) {
     os = handoff->os;
+  } else if (SupportsXnu() && dl == XNU) {
+    os = XNU;
   } else if (SupportsFreebsd() && di) {
     os = FREEBSD;
     sp = (long *)di;
-  } else if (SupportsXnu() && dl == XNU) {
-    os = XNU;
   } else {
     os = 0;
   }
diff --git a/libc/crt/crt.S b/libc/crt/crt.S
index f30fde679..c7079218a 100644
--- a/libc/crt/crt.S
+++ b/libc/crt/crt.S
@@ -40,6 +40,17 @@ _start:
 0:
 #endif

+#if SupportsXnu()
+// Hack for detecting M1 Rosetta environment.
+// https://github.com/jart/cosmopolitan/issues/429#issuecomment-1166704377
+   cmp $-1,%ebx
+   jne 0f
+   cmp $+1,%edx
+   jne 0f
+   mov $XNU,%cl
+0:
+#endif
+
 // set operating system when already detected
    mov %cl,__hostos(%rip)

Want to send me a PR?

dkulchenko commented 2 years ago

Thanks!

PR opened, just had to make a minor tweak to your simplified patch - it looks like we can't avoid clearing %rcx in loader-macho and loader-elf without having a segfault, so added that one back in, and same with %rdi in crt.S so we don't erroneously pass the FreeBSD check (as %rdi is also non-zero under Rosetta, as it is on FreeBSD).

Tested the PR against hello.com, standalone ape, and redbean.com on M1 and all appears to be working.

jart commented 2 years ago

Yes, the need to clear RCX in the loader makes sense, since RCX always holds the fourth parameter to a C function, which in our case is the handoff variable.

pkulchenko commented 2 years ago

Closing as resolved by #453. Thanks @dkulchenko!