dougallj / applegpu

Apple G13 GPU architecture docs and tools
BSD 3-Clause "New" or "Revised" License
552 stars 41 forks source link

CompilerExplorer: Support extracting helper functions #52

Closed TellowKrinkle closed 1 year ago

TellowKrinkle commented 1 year ago

Usually, GPU binaries have all their functions inlined into the main function, but this may not always happen, e.g.

This adds support for decompiling functions other than the main function to compiler explorer, so you don't end up with a mysterious call to a negative PC

Example Shader ```metal uint helper(uint pos, device const uint* data) { uint res = data[pos]; if (res == 0) { res = helper(pos * 2, data) + helper(pos * 2 + 1, data); } return res; } kernel void test(uint pos [[thread_position_in_grid]], device uint* odata, device const uint* idata) { odata[pos] = helper(pos, idata); } ``` ``` compute l__Z6helperjPU9MTLdeviceKj: 0: 62120000 mov_imm r4h, 0 4: b50081052000 stack_store i8, 1, 2, 0, 8, 0 a: b509000501c03000 stack_store i16, 1, 0, xy, 4, r1l_r1h, 16, 0 12: b5b9c00500c03000 stack_store i16, 1, 0, xy, 4, r23l_r23h, 12, 0 1a: b5c1800500c03000 stack_store i16, 1, 0, xy, 4, r24l_r24h, 8, 0 22: b5c9400500c03000 stack_store i16, 1, 0, xy, 4, r25l_r25h, 4, 0 2a: b5d1000500c03000 stack_store i16, 1, 0, xy, 4, r26l_r26h, 0, 0 32: 7e5d980a8000 mov r23, r12.cache 38: 7e615a0a8000 mov r24, r13 3e: 05116e0621c01200 device_load 0, i32, x, r2, r23_r24, r11, unsigned 46: 3800 wait 0 48: d28884020000 if_icmp r0l.cache, seq, r2.cache, 0, 1 4e: 8e090060a9000000 iadd r2.cache, 0, r11.cache, lsl 1 56: 0e690160a9000000 iadd r26, 1, r11.cache, lsl 1 5e: 7e2d440a8000 mov r11, r2 64: 7e31ae0a8000 mov r12, r23.cache 6a: 7e35700a8000 mov r13, r24 70: 421000000000 push_exec r0l, 2 76: 10c08affffff call 0x0 7c: d21600000000 pop_exec r0l.cache, 2 82: 7e65960a8000 mov r25, r11.cache 88: 7e2d740a8000 mov r11, r26 8e: 7e31ae0a8000 mov r12, r23.cache 94: 7e35700a8000 mov r13, r24 9a: 421000000000 push_exec r0l, 2 a0: 10c060ffffff call 0x0 a6: d21600000000 pop_exec r0l.cache, 2 ac: 8e09562227000000 iadd r2.cache, r11, r25 b4: 520e00000000 pop_exec r0l, 1 ba: 7e2d440a8000 mov r11, r2 c0: 35d1000500c03000 stack_load r26l_r26h, i16, 1, 0, xy, 4, 0, 0 c8: 35c9400500c03000 stack_load r25l_r25h, i16, 1, 0, xy, 4, 4, 0 d0: 35c1800500c03000 stack_load r24l_r24h, i16, 1, 0, xy, 4, 8, 0 d8: 35b9c00500c03000 stack_load r23l_r23h, i16, 1, 0, xy, 4, 12, 0 e0: 3509000501c03000 stack_load r1l_r1h, i16, 1, 0, xy, 4, 16, 0 e8: b50081052f8000ff stack_store i8, 1, 2, 0, -8, 0 f0: 62120000 mov_imm r4h, 0 f4: 3800 wait 0 f6: 1402 ret r1 compute shader: 140: 62000000 mov_imm r0l, 0 144: 62020000 mov_imm r0h, 0 148: b50081052000 stack_store i8, 1, 2, 0, 8, 0 14e: 725d1004 get_sr r23, sr80 (thread_position_in_grid.x) 152: 722d1004 get_sr r11, sr80 (thread_position_in_grid.x) 156: 7e3184098000 mov r12, u2 15c: 7e3586098000 mov r13, u3 162: e2000000 mov_imm r0l.cache, 0 166: 62120000 mov_imm r4h, 0 16a: 421000000000 push_exec r0l, 2 170: 10c090feffff call 0x0 176: 521600000000 pop_exec r0l, 2 17c: 4559e00e02c01200 device_store 0, i32, x, r11, u0_u1, r23, unsigned, 0 184: 8800 stop ```