CPU: distinct decode step / disassembly

fengb commented 5 years ago

Extracted from https://github.com/fengb/fundude/issues/12, https://github.com/fengb/fundude/issues/16

We need a way to save decoded instructions.

Note: this will most likely hurt performance, since we're converting the "step" jump table into a "decode" jump table plus a function-pointer call. The follow-up cache should speed everything back up.

fengb commented 5 years ago

/// One operand slot of a decoded `Op`.
///
/// The active tag says how the payload is to be read: a raw 8-bit value,
/// a raw 16-bit value, or a selector for an 8-bit / 16-bit CPU register.
const OpArg = union(enum) {
    /// Raw 8-bit value.
    u8: u8,
    /// Raw 16-bit value.
    u16: u16,
    /// Selector for an 8-bit register.
    reg8: CpuReg8,
    /// Selector for a 16-bit register.
    reg16: CpuReg16,
};

/// A decoded instruction: the routine to execute, the two operands it is
/// called with, and metadata kept alongside them.
const Op = struct {
    /// Routine that carries out the instruction; it receives the emulator
    /// state plus both operands and reports an `OpResult`.
    microop: fn (fd: *Fundude, arg0: OpArg, arg1: OpArg) OpResult,
    /// First operand handed to `microop`.
    arg0: OpArg,
    /// Second operand handed to `microop`.
    arg1: OpArg,
    /// Instruction length (presumably in bytes — TODO confirm).
    op_len: u4,
    /// Text form of the instruction, e.g. "LD"; presumably used for
    /// disassembly output — TODO confirm.
    zasm: []const u8,
};

/// Value returned by a microop after it has run.
const OpResult = struct {
    /// 16-bit jump value reported by the microop (presumably the address
    /// execution continues at — TODO confirm).
    jump: u16,
    /// Cycle count reported by the microop.
    cycles: u16,
};


/// Namespace for the LD sketch: a public constructor that builds the decoded
/// `Op`, plus the private microop that the `Op` points at.
const LD = struct {
    /// Microop behind `rr_d8`.
    ///
    /// Takes the emulator state first so that its type matches the
    /// `Op.microop` field (`fn(fd: *Fundude, arg0: OpArg, arg1: OpArg) OpResult`).
    /// The sketch does not touch `fd` yet; it only echoes the operands back:
    /// `arg1.u16` as `jump` and `arg0.u8` as `cycles`.
    fn micro_rr_d8(fd: *Fundude, arg0: OpArg, arg1: OpArg) OpResult {
        _ = fd; // unused in this sketch
        return OpResult{ .jump = arg1.u16, .cycles = arg0.u8 };
    }

    /// Builds the decoded `Op` for this LD form, wrapping the raw operands
    /// into `OpArg` values.
    ///
    /// NOTE(review): `Op.op_len` has no default and is not set here; the
    /// correct instruction length must be supplied before this compiles
    /// against the `Op` definition.
    pub fn rr_d8(arg0: u8, arg1: u16) Op {
        return Op{
            .microop = micro_rr_d8,
            .arg0 = OpArg{ .u8 = arg0 },
            .arg1 = OpArg{ .u16 = arg1 },
            .zasm = "LD",
        };
    }
};

fengb commented 5 years ago

Might be worth deferring until https://github.com/ziglang/zig/issues/1717 for function expressions:

// Sketch of the same LD constructor written with function expressions.
// This syntax is the proposal tracked in ziglang/zig#1717, not current Zig.
//
// The outer function wraps the raw operands into `OpArg` values and returns
// an `Op` whose `microop` is defined inline.
const ld__rr_d8 = fn(arg0: u8, arg1: u16) Op {
    return Op{
       .zasm = "LD",
       .arg0 = OpArg{ .u8 = arg0 },
       .arg1 = OpArg{ .u16 = arg1 },
       // Inline microop: echoes `arg1.u16` as `jump` and `arg0.u8` as `cycles`.
       // NOTE(review): its parameters reuse the outer names `arg0`/`arg1` and
       // it has no `fd` parameter — confirm against the intended `Op.microop` type.
       .microop = fn(arg0: OpArg, arg1: OpArg) OpResult {
             return OpResult{ .jump = arg1.u16, .cycles = arg0.u8 };
       }
    };
};

fengb commented 4 years ago

More CPU cache friendly layout:

// Layout sketch (pseudo-syntax, not compilable Zig): replaces the function
// pointer with an enum tag so that a decoded op stays small.
const Op = struct {
    // 2 bytes — enum tag naming the microop instead of a function pointer
    .microop = enum { .ld__rr_d8, ... };
    // 2 bytes — first operand
    arg0: OpArg,
    // 2 bytes — second operand
    arg1: OpArg,

    // Instruction length.
    len: u8,
    // Two 4-bit values packed together (presumably the two possible cycle
    // counts of an instruction — TODO confirm).
    cycles: packed tuple { u4, u4 },
};

Thoughts:

This makes it obvious that all we're doing is caching the decode step. It probably won't be any faster, but it shouldn't be any slower either.
We can semi-easily test if a function pointer is better. It'd be slightly less cache friendly so we should benchmark the differences.

fengb commented 4 years ago

Maybe try comptime magic similar to wazm:

/// Decodes the instruction that starts at `data` into an `Op`.
///
/// Dispatches on the opcode byte `data[0]`. Only opcode 0x06 is spelled out
/// in this sketch; the remaining opcodes are elided.
pub fn decode(data: [*]u8) Op {
    return switch (data[0]) {
        // `with8` is handed the instruction bytes; presumably it extracts the
        // 8-bit operand that follows the opcode — TODO confirm.
        0x06 => Op.init(.ld__rr_d8, Reg8.B, with8(data)),
    };
}

/// Executes a previously decoded `op` against the CPU and MMU.
///
/// Picks the implementation that matches the op's microop tag, reinterprets
/// the two stored operands as that implementation's own operand types, calls
/// it, and reinterprets its return value as the common `Result`.
pub fn run(cpu: *main.Cpu, mmu: *main.Mmu, op: Op) Result {
    // Microps should be a generated enum
    // NOTE(review): the field is read as `microp` here; confirm the spelling
    // against the `Op` definition that is finally adopted.
    const func = switch (op.microp) {
        .ld__rr_d8 => ld__rr_d8,
    };
    // Casts should be noops due to packed struct magic.
    // Parameters 0 and 1 of every microop are `cpu` and `mmu`, so the operand
    // types live at indices 2 and 3; each entry's type is optional and is
    // unwrapped with `.?`.
    const params = @typeInfo(@TypeOf(func)).Fn.args;
    const arg0 = @bitCast(params[2].arg_type.?, op.arg0);
    const arg1 = @bitCast(params[3].arg_type.?, op.arg1);
    return @bitCast(Result, func(cpu, mmu, arg0, arg1));
}

/// Microop: stores the 8-bit value `d8` into the 8-bit register `tgt`.
///
/// `mmu` is accepted for signature uniformity but not used. The return value
/// is a default-initialised `Result.Fixed(1, 8)`.
pub fn ld__rr_d8(cpu: *main.Cpu, mmu: *main.Mmu, tgt: Arg.Reg8, d8: Arg.U8) Result.Fixed(1, 8) {
    const Outcome = Result.Fixed(1, 8);
    cpu.reg._8.set(tgt, d8);
    return Outcome{};
}

If we want to keep using microp, we cannot map 1-to-1 like I did with wazm so we'll need the manual decode step. It might be worth investigating flattening the decode, but I'm not sure I want to go from ~80 microps to ~240 ops.

fengb commented 4 years ago

I like this direction. I also feel like there should be better standardization of type abbreviations (because I really don't remember the current ones):

r- — register
i- — immediate
R- — register-as-pointer
I- — immediate-as-pointer
-b — byte (8 bit)
-w — wide/word (16 bit)

fengb commented 4 years ago

With the separate decode step, we can finally get rid of the instruction offset hacks in all of the call ops and irqStep!

/// Runs one instruction and returns how long it took.
///
/// The program counter is advanced by the decoded length *before* the op is
/// executed. Afterwards the reported duration is asserted to be one of the
/// two durations the decoded op advertises.
fn step() u16 {
    const decoded = decode();
    cpu.reg.PC += decoded.length;

    const elapsed = run(decoded);
    const took_next = elapsed == decoded.next_duration;
    const took_jump = elapsed == decoded.jump_duration;
    assert(took_next or took_jump);
    return elapsed;
}

// Sketch (pseudo-syntax, not compilable Zig) of the common result type and
// two type constructors that attach per-instruction metadata as constants.
const Op.Result = extern struct {
    // The duration actually taken (type elided in the sketch).
    duration,

    // Result type for an instruction with a single possible duration:
    // `next_duration` and `jump_duration` are both `duration`, and the
    // `duration` field defaults to it, so `.{}` is a complete value.
    // NOTE(review): the inner constants reuse the parameter names
    // (`length`, `duration`) — confirm this resolves as intended.
    fn Fixed(length: u8, duration: u8) type {
        return extern struct {
            const length = length;
            const next_duration = duration;
            const jump_duration = duration;

            duration: u16 = duration,
        };
    }

    // Result type for an instruction with two possible durations. The
    // `duration` field has no default here, so the microop must set it.
    // NOTE(review): the inner constants reuse the parameter names — confirm
    // this resolves as intended.
    fn Cond(length: u8, next_duration: u8, jump_duration: u8) type {
        return extern struct {
            const length = length;
            const next_duration = next_duration;
            const jump_duration = jump_duration;

            duration: u16,
        };
    }
};

fengb / fundude

CPU: distinct decode step / disassembly #27