unicorn-engine / unicorn

Unicorn CPU emulator framework (ARM, AArch64, M68K, Mips, Sparc, PowerPC, RiscV, S390x, TriCore, X86)
http://www.unicorn-engine.org
GNU General Public License v2.0
7.67k stars 1.35k forks source link

BUG - PDEP & PEXT in x86_64 computes wrong results #1852

Closed basavesh closed 1 year ago

basavesh commented 1 year ago

I believe the src and mask order is exchanged here while computing the result. PDEP DEST, SRC, MASK but I think Unicorn is doing PDEP DEST, MASK, SRC

Before Executing:

int64_t r_rax = 0x0;     // RAX register
int64_t r_rbx = 0x3;     // RBX register
int64_t r_rcx = 0xd;     // RCX register

INSTR: pdep rax,rbx,rcx in INTEL syntax

AFTER Executing:

>>> RAX = 0x1                     <------------ wrong result (it should be 0x5)
>>> RBX = 0x3
>>> RCX = 0xd
>>> PDEP Intrinsic = 0x5

@wtdcode

Repro code: Build cmd: clang <test_keystone_unicorn.c> -o test_keystone_unicorn -lkeystone -lunicorn -mbmi2

#include <stdio.h>
#include <immintrin.h>
#include <keystone/keystone.h>
#include <unicorn/unicorn.h>

// memory address where emulation starts
#define ADDRESS 0x1000000

// separate assembly instructions by ; or \n
#define CODE "pdep rax,rbx,rcx"

int main(int argc, char **argv)
{
    ks_engine *ks;
    ks_err err;
    size_t count;
    unsigned char *encode;
    size_t size;

    err = ks_open(KS_ARCH_X86, KS_MODE_64, &ks);
    if (err != KS_ERR_OK) {
        printf("ERROR: failed on ks_open(), quit\n");
        return -1;
    }

    if (ks_asm(ks, CODE, 0, &encode, &size, &count) != KS_ERR_OK) {
        printf("ERROR: ks_asm() failed & count = %lu, error = %u\n",
                count, ks_errno(ks));
    } else {
        size_t i;

        printf("%s = ", CODE);
        for (i = 0; i < size; i++) {
            printf("%02x ", encode[i]);
        }
        printf("\n");
        printf("Compiled: %lu bytes, statements: %lu\n", size, count);
    }

    uc_engine *uc;
    uc_err err_uc;
    int64_t r_rax = 0x0;     // RAX register
    int64_t r_rbx = 0x3;     // RBX register
    int64_t r_rcx = 0xd;     // RCX register

    printf("Emulate i386 code\n");

    // Initialize emulator in X86-32bit mode
    err_uc = uc_open(UC_ARCH_X86, UC_MODE_64, &uc);
    if (err_uc != UC_ERR_OK) {
        printf("Failed on uc_open() with error returned: %u\n", err_uc);
        return -1;
    }

    // map 2MB memory for this emulation
    uc_mem_map(uc, ADDRESS, 2 * 1024 * 1024, UC_PROT_ALL);

    // write machine code to be emulated to memory
    if (uc_mem_write(uc, ADDRESS, encode, size)) {
        printf("Failed to write emulation code to memory, quit!\n");
        return -1;
    }

    // initialize machine registers
    uc_reg_write(uc, UC_X86_REG_RAX, &r_rax);
    uc_reg_write(uc, UC_X86_REG_RBX, &r_rbx);
    uc_reg_write(uc, UC_X86_REG_RCX, &r_rcx);

    // emulate code in infinite time & unlimited instructions
    err_uc=uc_emu_start(uc, ADDRESS, ADDRESS + size, 0, 0);
    if (err_uc) {
        printf("Failed on uc_emu_start() with error returned %u: %s\n",
            err_uc, uc_strerror(err_uc));
    }

    // now print out some registers
    printf("Emulation done. Below is the CPU context\n");

    uc_reg_read(uc, UC_X86_REG_RAX, &r_rax);
    uc_reg_read(uc, UC_X86_REG_RBX, &r_rbx);
    uc_reg_read(uc, UC_X86_REG_RCX, &r_rcx);

    printf(">>> RAX = 0x%lx\n", r_rax);
    printf(">>> RBX = 0x%lx\n", r_rbx);
    printf(">>> RCX = 0x%lx\n", r_rcx);
    printf("PDEP Intrinsic = 0x%llx\n", _pdep_u64(r_rbx, r_rcx));
    uc_close(uc);

    // NOTE: free encode after usage to avoid leaking memory
    ks_free(encode);

    // close Keystone instance when done
    ks_close(ks);

    return 0;
}
basavesh commented 1 year ago

Test case for PEXT Before Executing

int64_t r_rax = 0x0;     // RAX register
int64_t r_rbx = 0x7;     // RBX register
int64_t r_rcx = 0x5;     // RCX register

Instruction

pext rax,rbx,rcx

After Executing

Emulation done. Below is the CPU context
>>> RAX = 0x5                                     <--------- wrong
>>> RBX = 0x7
>>> RCX = 0x5
PEXT Intrinsic = 0x3
basavesh commented 1 year ago

Fix for PDEP and PEXT. I can create a pull request.

Is it possible that QEMU also has same bug? (didn't test QEMU yet)

diff --git a/qemu/target/i386/translate.c b/qemu/target/i386/translate.c
index b4dc56f2..4a7b0045 100644
--- a/qemu/target/i386/translate.c
+++ b/qemu/target/i386/translate.c
@@ -4226,7 +4226,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                 } else {
                     tcg_gen_ext32u_tl(tcg_ctx, s->T1, tcg_ctx->cpu_regs[s->vex_v]);
                 }
-                gen_helper_pdep(tcg_ctx, tcg_ctx->cpu_regs[reg], s->T0, s->T1);
+                gen_helper_pdep(tcg_ctx, tcg_ctx->cpu_regs[reg], s->T1, s->T0);
                 break;

             case 0x2f5: /* pext Gy, By, Ey */
@@ -4244,7 +4244,7 @@ static void gen_sse(CPUX86State *env, DisasContext *s, int b,
                 } else {
                     tcg_gen_ext32u_tl(tcg_ctx, s->T1, tcg_ctx->cpu_regs[s->vex_v]);
                 }
-                gen_helper_pext(tcg_ctx, tcg_ctx->cpu_regs[reg], s->T0, s->T1);
+                gen_helper_pext(tcg_ctx, tcg_ctx->cpu_regs[reg], s->T1, s->T0);
                 break;

             case 0x1f6: /* adcx Gy, Ey */
wtdcode commented 1 year ago

Is it possible that QEMU also has same bug? (didn't test QEMU yet)

Yes, see my comments in your PR

wtdcode commented 1 year ago

Closed as upstream patch gets backport