Optimise flag calculations in Jit

Example of how a single guest instruction translates into many host instructions:

block[0200003c][a][1f] guest code:
  cmppls r3, #0x00000014 53534150 0200003c
block[0200003c][a][1f] guest code -> ir | 1 instructions emitted:
  v0 = load_cpsr
  v1 = set_bit v0, 0x00000000, 0x0000001d
  v2 = load_gpr r3
  v3 = subtract v2, 0x00000014
  v4 = get_bit v3, 0x0000001f
  v6 = set_bit v1, v4, 0x0000001f
  v7 = compare_eq v3, 0x00000000
  v9 = set_bit v6, v7, 0x0000001e
  v10 = compare_ge v2, 0x00000014
  v12 = set_bit v9, v10, 0x0000001d
  v13 = xor v2, 0x00000014
  v14 = xor v2, v3
  v15 = and v13, v14
  v16 = get_bit v15, 0x0000001f
  v18 = set_bit v12, v16, 0x0000001c
  store_cpsr v18
  store_gpr r15, 0x02000048
block[0200003c][a][1f] ir -> a64 assembly | 54 instructions emitted | entry at 0x12270043c:
  a9ba53f3      stp     x19, x20, [sp, #-96]!
  a9015bf5      stp     x21, x22, [sp, #16]
  a90263f7      stp     x23, x24, [sp, #32]
  a9036bf9      stp     x25, x26, [sp, #48]
  a90473fb      stp     x27, x28, [sp, #64]
  a9057bfd      stp     x29, x30, [sp, #80]
  aa0003f3      mov     x19, x0
  2a0103f4      mov     w20, w1
  b940f275      ldr     w21, [x19, #240]
  12040eb5      and     w21, w21, #0xf0000000
  d51b4215      msr     NZCV, x21
  540000a5      b.pl    #20
  52800916      mov     w22, #72
  72a04016      movk    w22, #512, lsl #16
  b9004676      str     w22, [x19, #68]
  1400001e      b       #120
  b940f275      ldr     w21, [x19, #240]
  52800017      mov     w23, #0
  12027ab6      and     w22, w21, #0xdfffffff
  2a1776d6      orr     w22, w22, w23, lsl #29
  b9401675      ldr     w21, [x19, #20]
  52800298      mov     w24, #20
  4b1802b7      sub     w23, w21, w24
  531f7ef8      lsr     w24, w23, #31
  12007ad9      and     w25, w22, #0x7fffffff
  2a187f39      orr     w25, w25, w24, lsl #31
  710002ff      cmp     w23, #0
  1a9f17f6      cset    w22, eq
  12017b38      and     w24, w25, #0xbfffffff
  2a167b18      orr     w24, w24, w22, lsl #30
  710052bf      cmp     w21, #20
  1a9f37f6      cset    w22, hs
  12027b19      and     w25, w24, #0xdfffffff
  2a167739      orr     w25, w25, w22, lsl #29
  52800298      mov     w24, #20
  4a1802b6      eor     w22, w21, w24
  4a1702b8      eor     w24, w21, w23
  0a1802d5      and     w21, w22, w24
  531f7eb6      lsr     w22, w21, #31
  12037b35      and     w21, w25, #0xefffffff
  2a1672b5      orr     w21, w21, w22, lsl #28
  b900f275      str     w21, [x19, #240]
  52800915      mov     w21, #72
  72a04015      movk    w21, #512, lsl #16
  b9004675      str     w21, [x19, #68]
  51000694      sub     w20, w20, #1
  2a1403e0      mov     w0, w20
  a9457bfd      ldp     x29, x30, [sp, #80]
  a94473fb      ldp     x27, x28, [sp, #64]
  a9436bf9      ldp     x25, x26, [sp, #48]
  a94263f7      ldp     x23, x24, [sp, #32]
  a9415bf5      ldp     x21, x22, [sp, #16]
  a8c653f3      ldp     x19, x20, [sp], #96
  d65f03c0      ret

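Ideally, the flag calculation should be performed natively as part of a host instruction (for example, a flag-setting `subs`/`cmp`), so that the resulting flags can be written directly back to cpsr. Since A32 and A64 use roughly the same flag layout (N, Z, C and V in bits 31–28), this should be more efficient than computing each flag separately as shown above.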

strayacode / yuugen

Optimise flag calculations in Jit #27