strayacode / yuugen

A DS Emulator written in C++
https://strayacode.github.io/
GNU General Public License v3.0
26 stars 0 forks source link

Optimise flag calculations in Jit #27

Open strayacode opened 6 months ago

strayacode commented 6 months ago

Example of how a single guest instruction translates to many host instructions:

block[0200003c][a][1f] guest code:
  cmppls r3, #0x00000014 53534150 0200003c
block[0200003c][a][1f] guest code -> ir | 1 instructions emitted:
  v0 = load_cpsr
  v1 = set_bit v0, 0x00000000, 0x0000001d
  v2 = load_gpr r3
  v3 = subtract v2, 0x00000014
  v4 = get_bit v3, 0x0000001f
  v6 = set_bit v1, v4, 0x0000001f
  v7 = compare_eq v3, 0x00000000
  v9 = set_bit v6, v7, 0x0000001e
  v10 = compare_ge v2, 0x00000014
  v12 = set_bit v9, v10, 0x0000001d
  v13 = xor v2, 0x00000014
  v14 = xor v2, v3
  v15 = and v13, v14
  v16 = get_bit v15, 0x0000001f
  v18 = set_bit v12, v16, 0x0000001c
  store_cpsr v18
  store_gpr r15, 0x02000048
block[0200003c][a][1f] ir -> a64 assembly | 54 instructions emitted | entry at 0x12270043c:
  a9ba53f3      stp     x19, x20, [sp, #-96]!
  a9015bf5      stp     x21, x22, [sp, #16]
  a90263f7      stp     x23, x24, [sp, #32]
  a9036bf9      stp     x25, x26, [sp, #48]
  a90473fb      stp     x27, x28, [sp, #64]
  a9057bfd      stp     x29, x30, [sp, #80]
  aa0003f3      mov     x19, x0
  2a0103f4      mov     w20, w1
  b940f275      ldr     w21, [x19, #240]
  12040eb5      and     w21, w21, #0xf0000000
  d51b4215      msr     NZCV, x21
  540000a5      b.pl    #20
  52800916      mov     w22, #72
  72a04016      movk    w22, #512, lsl #16
  b9004676      str     w22, [x19, #68]
  1400001e      b       #120
  b940f275      ldr     w21, [x19, #240]
  52800017      mov     w23, #0
  12027ab6      and     w22, w21, #0xdfffffff
  2a1776d6      orr     w22, w22, w23, lsl #29
  b9401675      ldr     w21, [x19, #20]
  52800298      mov     w24, #20
  4b1802b7      sub     w23, w21, w24
  531f7ef8      lsr     w24, w23, #31
  12007ad9      and     w25, w22, #0x7fffffff
  2a187f39      orr     w25, w25, w24, lsl #31
  710002ff      cmp     w23, #0
  1a9f17f6      cset    w22, eq
  12017b38      and     w24, w25, #0xbfffffff
  2a167b18      orr     w24, w24, w22, lsl #30
  710052bf      cmp     w21, #20
  1a9f37f6      cset    w22, hs
  12027b19      and     w25, w24, #0xdfffffff
  2a167739      orr     w25, w25, w22, lsl #29
  52800298      mov     w24, #20
  4a1802b6      eor     w22, w21, w24
  4a1702b8      eor     w24, w21, w23
  0a1802d5      and     w21, w22, w24
  531f7eb6      lsr     w22, w21, #31
  12037b35      and     w21, w25, #0xefffffff
  2a1672b5      orr     w21, w21, w22, lsl #28
  b900f275      str     w21, [x19, #240]
  52800915      mov     w21, #72
  72a04015      movk    w21, #512, lsl #16
  b9004675      str     w21, [x19, #68]
  51000694      sub     w20, w20, #1
  2a1403e0      mov     w0, w20
  a9457bfd      ldp     x29, x30, [sp, #80]
  a94473fb      ldp     x27, x28, [sp, #64]
  a9436bf9      ldp     x25, x26, [sp, #48]
  a94263f7      ldp     x23, x24, [sp, #32]
  a9415bf5      ldp     x21, x22, [sp, #16]
  a8c653f3      ldp     x19, x20, [sp], #96
  d65f03c0      ret

Ideally, the flag calculation should be done natively as part of a host instruction, and the resulting flags can then be written directly back to the CPSR. Since A32 and A64 have roughly the same flag layout (N, Z, C and V occupy bits 31–28 in both the guest CPSR and the host NZCV register), this should be more efficient.

strayacode commented 4 months ago

In relation to this: see if there's a better way to handle instructions that change flags. The only reason we have to break a basic block is that updating the flags can change the outcome of the condition check. For a basic block, we assume that every instruction has the same condition as the first instruction and that this condition evaluates to true for all of them.