sikthehedgehog / mdtools

A set of tools for developing Mega Drive homebrew
76 stars 8 forks source link

Check how well the C decompressors for SLZ and UFTC work #11

Open sikthehedgehog opened 4 years ago

sikthehedgehog commented 4 years ago

Because I bet they're gonna be slow even with -O3 (honestly I should make them wrappers for inline asm, at least the UFTC one).

andwn commented 4 years ago

How's this? I started using SLZ for CS maps and decided to make the asm version GCC compatible instead of using the C one. Same with UFTC for another project. Well, it's not inline, but I figured I'd share anyway. You need to pass --register-prefix-optional to GCC/GAS unless you feel like typing out the % prefix on every register.

slz.h

/*
 * Decompresses the SLZ stream at `in` into the buffer at `out`.
 *
 * The stream begins with a 16-bit uncompressed size (high byte first);
 * the routine keeps decoding until that many bytes have been produced.
 * Nothing is returned, and neither buffer is bounds-checked, so `out`
 * must be large enough for the size stored in the stream.
 */
void DecompressSlz(const void *in, void *out);

slz.s

    .globl DecompressSlz
/*
 * void DecompressSlz(const void *in, void *out);
 *
 * Unpacks the SLZ stream at `in` into the buffer at `out`.
 * Both arguments are read from the stack: in at 4(sp), out at 8(sp).
 * Registers are written without the % prefix, so GAS needs
 * --register-prefix-optional to assemble this.
 *
 * Stream layout, as consumed below:
 *   - 2 bytes: uncompressed size, high byte first
 *   - then groups made of one token byte followed by up to 8 items.
 *     Token bits are taken from the top bit down:
 *       0 = one literal byte, copied as is
 *       1 = two-byte string reference: upper 12 bits are (distance - 3),
 *           lower 4 bits are (length - 3)
 *
 * Register roles:
 *   a0 = output pointer            a1 = input pointer
 *   d0 = current token byte        d1 = token bit countdown
 *   d2 = bytes still to produce    d3 = distance, d4 = length
 *
 * Clobbers d0, d1, a0, a1 and the flags; d2-d4 are saved and restored.
 *
 * NOTE(review): the main loop only stops when d2 is exactly zero. A stream
 * whose last string reference runs past the stored size makes d2 wrap, and
 * decoding continues. No bounds checks are done on either buffer.
 */
DecompressSlz:
    /* Pull parameters from stack; a0, a1, d0, and d1 are caller saved */
    move.l  4(sp), a1               /* a1 = in */
    move.l  8(sp), a0               /* a0 = out */

    movem.l d2-d4, -(sp)            /* Save the registers used as scratch */

    move.b  (a1)+, d2               /* d2 = uncompressed size, assembled */
    lsl.w   #8, d2                    /* one byte at a time (high byte */
    move.b  (a1)+, d2                 /* first) */

    moveq   #1, d1                  /* Countdown of 1 hits zero on the first */
                                    /* pass, forcing a token byte fetch */
SLZ_MainLoop:
    tst.w   d2                      /* Anything left to produce? */
    beq     SLZ_End                   /* No: done */

    subq.w  #1, d1                  /* One token bit used up; out of bits? */
    bne.s   SLZ_HasTokens
    move.b  (a1)+, d0               /* Yes: fetch the next token byte */
    moveq   #8, d1                    /* which holds 8 token bits */
SLZ_HasTokens:

    add.b   d0, d0                  /* Shift the top token bit into carry */
    bcc.s   SLZ_Uncompressed          /* 0 = uncompressed, 1 = compressed */

    move.b  (a1)+, d3               /* Compressed: read the 16-bit string info */
    lsl.w   #8, d3                    /* d3 = upper 12 bits (distance) */
    move.b  (a1)+, d3                 /* d4 = lower 4 bits (length) */
    move.b  d3, d4
    lsr.w   #4, d3
    and.w   #0x0F, d4

    subq.w  #3, d2                  /* The string is (d4 + 3) bytes long; */
    sub.w   d4, d2                    /* take that off the amount of data */
                                      /* still to be produced */
                                      /* (d4 itself stays the raw 0..15 value) */

    addq.w  #3, d3                  /* Distance is stored offset by 3 */
    neg.w   d3                      /* Negate so (a0,d3.w) points backwards */

    add.w   d4, d4                  /* d4 = length * 4, then the eor turns it */
    add.w   d4, d4                    /* into (15 - length) * 4: the byte offset */
    eor.w   #0x0F<<2, d4              /* of the first copy to run. This relies on */
    jmp     SLZ_Duff(pc,d4.w)         /* each move.b below being 4 bytes long */
SLZ_Duff:
    .rept   0x12                    /* 18 copies; entering (15 - length) copies */
    move.b  (a0,d3.w), (a0)+          /* in leaves length + 3 of them to execute */
    .endr

    bra     SLZ_MainLoop            /* Keep processing data */

SLZ_Uncompressed:
    move.b  (a1)+, (a0)+            /* Literal: copy one byte from the stream */
    subq.w  #1, d2                  /* It's always one byte long */
    bra     SLZ_MainLoop              /* Keep processing data */

SLZ_End:
    movem.l (sp)+, d2-d4            /* Restore registers */

    rts                             /* End of subroutine */

uftc.h

/*
 * Decompresses `num` tiles from the UFTC data at `in`, beginning with tile
 * index `start`, and writes them consecutively to `out`.
 *
 * Each tile consumes 8 bytes of tile data and produces 32 bytes of output;
 * `out` is not bounds-checked. Nothing is returned.
 *
 * NOTE(review): uint16_t comes from <stdint.h>, which this header does not
 * include itself, so the including file must pull it in first.
 * NOTE(review): the accompanying uftc.s reads `start` and `num` from the
 * stack offsets used when building without -mshort; confirm the build flags.
 */
void DecompressUftc(const void *in, void *out, uint16_t start, uint16_t num);

uftc.s

    .globl DecompressUftc
/*
 * void DecompressUftc(const void *in, void *out, uint16_t start, uint16_t num);
 *
 * Decodes `num` UFTC tiles, beginning with tile index `start`, into `out`.
 * Registers are written without the % prefix, so GAS needs
 * --register-prefix-optional to assemble this.
 *
 * Input layout, as consumed below:
 *   - 1 word: size of the dictionary in bytes
 *   - the dictionary itself
 *   - tile data: 4 words per tile, each a byte offset into the dictionary
 *
 * Every tile is built from two pairs of dictionary entries. For each pair,
 * four words are taken alternately from the first and the second entry, so
 * one tile yields 16 words (32 bytes) of output.
 *
 * Register roles:
 *   a0 = dictionary base       a1 = output pointer
 *   a2, a3 = dictionary entries of the current pair
 *   a4 = tile data pointer     d1 = tiles left
 *   d2 = zero-extended 16-bit scratch (dictionary size / entry offset)
 *
 * Clobbers d0, d1, a0, a1 and the flags; d2 and a2-a4 are saved and restored.
 */
DecompressUftc:
    /* Arguments live on the stack; d0, d1, a0 and a1 are caller saved */
    move.l  4(sp), a0                   /* a0 = in */
    move.l  8(sp), a1                   /* a1 = out */

    /*
     * The offsets below are for builds without -mshort, where each
     * uint16_t argument occupies a 4-byte stack slot and its value sits in
     * the low word. When building with -mshort, use 12(sp) for start and
     * 14(sp) for num instead.
     */
    move.w  14(sp), d0                  /* d0 = start */
    move.w  18(sp), d1                  /* d1 = num */

    movem.l d2/a2-a4, -(sp)             /* Preserve callee-saved registers */

    moveq   #0, d2                      /* Upper word of d2 stays clear from */
    move.w  (a0)+, d2                     /* here on; d2 = dictionary size and */
                                          /* a0 now points at the dictionary */

    lea     (a0,d2.l), a4               /* a4 = tile data, right after the */
    and.l   #0xFFFF, d0                   /* dictionary, advanced by start * 8 */
    lsl.l   #3, d0                        /* (32-bit math, so the product does */
    lea     (a4,d0.l), a4                 /* not get cut down to 16 bits) */

    bra.s   UFTC_NextTile               /* Enter at the counter check */
UFTC_DecodeTile:
    /* First pair of dictionary entries for this tile */
    move.w  (a4)+, d2
    lea     (a0,d2.l), a3
    move.w  (a4)+, d2
    lea     (a0,d2.l), a2

    .rept   4                           /* Interleave four words of each */
    move.w  (a3)+, (a1)+
    move.w  (a2)+, (a1)+
    .endr

    /* Second pair of dictionary entries for this tile */
    move.w  (a4)+, d2
    lea     (a0,d2.l), a3
    move.w  (a4)+, d2
    lea     (a0,d2.l), a2

    .rept   4                           /* Interleave four words of each */
    move.w  (a3)+, (a1)+
    move.w  (a2)+, (a1)+
    .endr

UFTC_NextTile:
    dbf     d1, UFTC_DecodeTile         /* Loop while tiles remain */

    movem.l  (sp)+, d2/a2-a4            /* Bring back the saved registers */
    rts                                 /* Return to the caller */
sikthehedgehog commented 3 years ago

And now I've replaced them with inline asm (not pure asm routines, because LTO doesn't like that, and they would also bring problems with -mshort). Only SLZ16 and UFTC16 for now though; I still have to get around to doing the same for SLZ24.