Flow out of InjectPayloadCallother implementation

shuffle2 commented 2 years ago

I have the following:

package AndeStar;

import java.util.HashMap;
import java.util.Map;

import ghidra.app.plugin.processors.sleigh.SleighLanguage;
import ghidra.program.model.lang.InjectPayload;
import ghidra.program.model.lang.InjectPayloadCallother;
import ghidra.program.model.lang.PcodeInjectLibrary;

public class PcodeInjectLibraryAndeStar extends PcodeInjectLibrary {
    // names of defined pcode ops that require pcode injection
    public static final String LMW = "lmw";
    public static final String SMW = "smw";

    private Map<String, InjectPayloadCallother> implementedOps;

    public static final String SOURCENAME = "AndeStarInternal";

    public PcodeInjectLibraryAndeStar(SleighLanguage l) {
        super(l);
        implementedOps = new HashMap<>();
        implementedOps.put(LMW, new InjectLSMW(SOURCENAME, l, uniqueBase, false));
        uniqueBase += 0x100;
        implementedOps.put(SMW, new InjectLSMW(SOURCENAME, l, uniqueBase, true));
        uniqueBase += 0x100;
    }

    public PcodeInjectLibraryAndeStar(PcodeInjectLibraryAndeStar op2) {
        super(op2);
        implementedOps = op2.implementedOps; // Immutable
    }

    @Override
    public PcodeInjectLibrary clone() {
        return new PcodeInjectLibraryAndeStar(this);
    }

    @Override
    public InjectPayload allocateInject(String sourceName, String name, int tp) {
        if (tp == InjectPayload.CALLOTHERFIXUP_TYPE) {
            InjectPayloadCallother payload = implementedOps.get(name);
            if (payload != null) {
                return payload;
            }
        }
        return super.allocateInject(sourceName, name, tp);
    }
}

public class InjectLSMW extends InjectPayloadCallother {
    private SleighLanguage language;
    private long uniqueBase;
    private boolean store;
    private AddressSpace constSpace;
    private AddressSpace uniqueSpace;
    private Address opAddress;
    private int seqnum;
    private Varnode defSpaceId;

    public InjectLSMW(String sourceName, SleighLanguage language, long uniqueBase, boolean store) {
        super(sourceName);
        this.language = language;
        this.uniqueBase = uniqueBase;
        this.store = store;
        constSpace = language.getAddressFactory().getConstantSpace();
        uniqueSpace = language.getAddressFactory().getUniqueSpace();
        defSpaceId = getConstant(language.getDefaultSpace().getSpaceID(), 4);
        seqnum = 0;
    }

    private Varnode convertRegisterToVarnode(Register reg) {
        return new Varnode(reg.getAddress(), reg.getBitLength() / 8);
    }

    private Varnode getConstant(long val, int size) {
        return new Varnode(constSpace.getAddress(val), size);
    }

    @Override
    public PcodeOp[] getPcode(Program program, InjectContext con) {
        int Rb = (int) con.inputlist.get(0).getOffset();
        Varnode Ra = con.inputlist.get(1);
        int Re = (int) con.inputlist.get(2).getOffset();
        int Enable4 = (int) con.inputlist.get(3).getOffset();
        int BeforeAfter = (int) con.inputlist.get(4).getOffset();
        int IncDec = (int) con.inputlist.get(5).getOffset();
        int Modify = (int) con.inputlist.get(6).getOffset();

        boolean noRegs = Rb == Re && Rb == 31;
        if (Rb > Re || (!noRegs && Rb > 28) || (noRegs && Enable4 == 0)) {
            throw new IllegalArgumentException("illformed lsmw");
        }

        int reglist = Enable4;
        if (!noRegs) {
            for (int i = Rb; i <= Re; i++) {
                reglist |= 1 << (31 - i);
            }
        }
        int totalSize = 0;
        for (int i = 0; i < 32; i++) {
            if ((reglist & (1 << i)) != 0) {
                totalSize += 4;
            }
        }

        opAddress = con.baseAddr;
        ArrayList<PcodeOp> opList = new ArrayList<PcodeOp>();

        int RaSize = Ra.getSize();
        Varnode regSize = getConstant(RaSize, RaSize);
        Varnode numBytes = getConstant(totalSize, RaSize);

        // bAddr = Ra
        Varnode bAddr = new Varnode(uniqueSpace.getAddress(uniqueBase), RaSize);
        uniqueBase += 16; // ?
        opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.COPY, new Varnode[] { Ra }, bAddr));

        // int b_addr;
        if (BeforeAfter == 0 && IncDec == 0) {
            // b_addr = Ra;
        } else if (BeforeAfter != 0 && IncDec == 0) {
            // b_addr = Ra + 4;
            opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.INT_ADD, new Varnode[] { bAddr, regSize }, bAddr));
        } else if (BeforeAfter == 0 && IncDec != 0) {
            // b_addr = Ra - totalSize + 4;
            opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.INT_SUB, new Varnode[] { bAddr, numBytes }, bAddr));
            opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.INT_ADD, new Varnode[] { bAddr, regSize }, bAddr));
        } else {
            // b_addr = Ra - totalSize;
            opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.INT_SUB, new Varnode[] { bAddr, numBytes }, bAddr));
        }

        Address a0Addr = language.getRegister("a0").getAddress();
        for (int i = 0; i < 32; i++) {
            if ((reglist & (1 << (31 - i))) == 0) {
                continue;
            }
            Varnode reg = convertRegisterToVarnode(language.getRegister(a0Addr.add(i * RaSize), RaSize));
            if (store) {
                // generate the store from R[i]
                opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.STORE, new Varnode[] { defSpaceId, bAddr, reg }));
            } else {
                // generate the load to R[i]
                opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.LOAD, new Varnode[] { defSpaceId, bAddr }, reg));
            }
            // b_addr += 4;
            opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.INT_ADD, new Varnode[] { bAddr, regSize }, bAddr));
        }

        // BUG: the writeback does not flow to the successive instructions for some
        // reason. Because the inject type is CALLOTHERFIXUP_TYPE?
        if (Modify != 0) {
            if (IncDec == 0) {
                // Ra += totalSize;
                opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.INT_ADD, new Varnode[] { Ra, numBytes }, Ra));
            } else {
                // Ra -= totalSize;
                opList.add(new PcodeOp(opAddress, seqnum++, PcodeOp.INT_SUB, new Varnode[] { Ra, numBytes }, Ra));
            }
        }

        PcodeOp[] res = new PcodeOp[opList.size()];
        opList.toArray(res);
        return res;
    }

}

<callotherfixup targetop="lmw">
        <pcode dynamic="true">
            <input name="Rb"/>
            <input name="Ra"/>
            <input name="Re"/>
            <input name="Enable4"/>
            <input name="BeforeAfter"/>
            <input name="IncDec"/>
            <input name="Modify"/>
        </pcode>
    </callotherfixup>
    <callotherfixup targetop="smw">
        <pcode dynamic="true">
            <input name="Rb"/>
            <input name="Ra"/>
            <input name="Re"/>
            <input name="Enable4"/>
            <input name="BeforeAfter"/>
            <input name="IncDec"/>
            <input name="Modify"/>
        </pcode>
    </callotherfixup>

:LMW.^lmw4^lmw3^lmw2 u20_24, [u15_19], u10_14, u6_9 is u20_24 & u20_24i & u15_19 & u10_14 & u10_14i & u6_9 & lmw5=0 & lmw4 & lmw3 & lmw2 & u0_1=0b00 {
    lmw(u20_24i:1, u15_19, u10_14i:1, u6_9:1, lmw4:1, lmw3:1, lmw2:1);
}
:SMW.^lmw4^lmw3^lmw2 u20_24, [u15_19], u10_14, u6_9 is u20_24 & u20_24i & u15_19 & u10_14 & u10_14i & u6_9 & lmw5=1 & lmw4 & lmw3 & lmw2 & u0_1=0b00 {
    smw(u20_24i:1, u15_19, u10_14i:1, u6_9:1, lmw4:1, lmw3:1, lmw2:1);
}

What I observe is that it appears to work mostly correct, except that the writeback to Ra is not propagated to following instructions.
For example (stack depth should be 4 after the first SMW.adm):

The problem can also be seen here, where a5 should have been updated by 0001868a but is not (also, stack depth should be 0xc after first SMW):

Any tips for how to go about this? Also, any tips for debugging it (viewing effective pcode considering dynamically generated ops / representation as it goes through optimization)?

As I hint in the BUG comment, I guess this is fairly expected as [something in ghidra] cannot figure out sideeffects of CALLOTHER . But, how do you truly implement a user-defined instruction in dynamic pcode? Trying to implement this in sleigh was somewhat of a nightmare and doesn't seem like a good solution.

shuffle2 commented 2 years ago

I've tried a few workarounds without success:

creating a new register in the language spec which the pcode injector writes to, then setting Ra = shared_reg in sleigh.
dividing the constructors up to specialize the Modify=1 case, then recomputing the displacement to add to Ra and doing the add in sleigh. This seems like it should work but it doesn't. The sleigh looks like:

total_size: is u10_14i & u20_24i & u6_9 & lmw4 {
    local reg_size:1 = ((u10_14i - u20_24i) + popcount(u6_9:1)) * 4;
    local disp:4 = sext(reg_size * ((lmw4:1 << 1) - 1));
    export *[const]:4 disp;
}
:SMW.^lmw4^lmw3^lmw2 u20_24, [u15_19], u10_14, u6_9 is u20_24 & u20_24i & u15_19 & u10_14 & u10_14i & u6_9 & lmw5=1 & lmw4 & lmw3 & lmw2 & lmw2=0 & u0_1=0b00 {
    smw(u20_24i:1, u15_19, u10_14i:1, u6_9:1, lmw4:1, lmw3:1, lmw2:1);
}
:SMW.^lmw4^lmw3^lmw2 u20_24, [u15_19], u10_14, u6_9 is u20_24 & u20_24i & u15_19 & u10_14 & u10_14i & u6_9 & lmw5=1 & lmw4 & lmw3 & lmw2 & lmw2=1 & u0_1=0b00 & total_size {
    smw(u20_24i:1, u15_19, u10_14i:1, u6_9:1, lmw4:1, lmw3:1, lmw2:1);
    build total_size;
    u15_19 = u15_19 + total_size;
}

Which generates pcode like:

                             **************************************************************
                             *                          FUNCTION                          *
                             **************************************************************
                             undefined FUN_0008de48()
                               assume gp = 0x2046000
             undefined         a0:1           <RETURN>
                             FUN_0008de48                                       XREF[1]:     FUN_0008c348:0008c4dc(c)  
        0008de48 3a 6f 98 bc       0           SMW.adm    s0,[sp],s0,0x2
                                                      CALLOTHER "smw", 6:1, sp, 6:1, 2:1, 1:1, 1:1, 1:1
                                                      $U3c00:1 = INT_SUB 6:1, 6:1
                                                      $U3c80:1 = POPCOUNT 2:1
                                                      $U3d00:1 = INT_ADD $U3c00:1, $U3c80:1
                                                      $U3e00:1 = INT_MULT $U3d00:1, 4:1
                                                      $U3e80:1 = INT_LEFT 1:1, 1:4
                                                      $U3f00:1 = INT_SUB $U3e80:1, 1:1
                                                      $U3f80:1 = INT_MULT $U3e00:1, $U3f00:1
                                                      $U4080:4 = INT_SEXT $U3f80:1
                                                      $U4100:4 = LOAD const($U4080:4)
                                                      sp = INT_ADD sp, $U4100:4
        0008de4c 3c 1c 5a 68     - ? -         LWI.gp     a1,[+0x169a0]=>DAT_0205c9a0                      = ??
                                                      $U6700:4 = INT_ADD gp, 0x169a0:4
                                                      a1 = LOAD ram($U6700:4)

Note that stack depth becomes - ? -. Is there a way to make the computation actually be constant?

shuffle2 commented 2 years ago

OK seems to be working now:

mw_wb_disp: squash reg_size disp is u10_14i & u20_24i & lmw_fp & lmw_gp & lmw_lp & lmw_sp & lmw3 [
    # squash = (u20_24i == 31) ? 0 : 1
    squash = 1 ^ (((u20_24i >> 4) & 1) & ((u20_24i >> 3) & 1) & ((u20_24i >> 2) & 1) & ((u20_24i >> 1) & 1) & (u20_24i & 1));
    reg_size = ((squash * (u10_14i - u20_24i + 1)) + lmw_fp + lmw_gp + lmw_lp + lmw_sp) * 4;
    # mul by {inc:1,dec:-1}
    disp = reg_size * (((lmw3^1) << 1) - 1);
] {
    export *[const]:4 disp;
}
:LMW.^lmw4^lmw3^lmw2 u20_24, [u15_19], u10_14, u6_9 is u20_24 & u20_24i & u15_19 & u10_14 & u10_14i & u6_9 & lmw5=0 & lmw4 & lmw3 & lmw2 & lmw2=0 & u0_1=0b00 {
    lmw(u20_24i:1, u15_19, u10_14i:1, u6_9:1, lmw4:1, lmw3:1, lmw2:1);
}
:LMW.^lmw4^lmw3^lmw2 u20_24, [u15_19], u10_14, u6_9 is u20_24 & u20_24i & u15_19 & u10_14 & u10_14i & u6_9 & lmw5=0 & lmw4 & lmw3 & lmw2 & lmw2=1 & u0_1=0b00 & mw_wb_disp {
    lmw(u20_24i:1, u15_19, u10_14i:1, u6_9:1, lmw4:1, lmw3:1, lmw2:1);
    build mw_wb_disp;
    u15_19 = u15_19 + mw_wb_disp;
}
:SMW.^lmw4^lmw3^lmw2 u20_24, [u15_19], u10_14, u6_9 is u20_24 & u20_24i & u15_19 & u10_14 & u10_14i & u6_9 & lmw5=1 & lmw4 & lmw3 & lmw2 & lmw2=0 & u0_1=0b00 {
    smw(u20_24i:1, u15_19, u10_14i:1, u6_9:1, lmw4:1, lmw3:1, lmw2:1);
}
:SMW.^lmw4^lmw3^lmw2 u20_24, [u15_19], u10_14, u6_9 is u20_24 & u20_24i & u15_19 & u10_14 & u10_14i & u6_9 & lmw5=1 & lmw4 & lmw3 & lmw2 & lmw2=1 & u0_1=0b00 & mw_wb_disp {
    smw(u20_24i:1, u15_19, u10_14i:1, u6_9:1, lmw4:1, lmw3:1, lmw2:1);
    build mw_wb_disp;
    u15_19 = u15_19 + mw_wb_disp;
}

This seems very gross and I still wish there were a way to "fully" implement an instruction (opposed to a "call") in java...

p.s. I assume this will still cause e.g. loads of registers from lmw to be missed by some analysis passes (all memory/register writes except for the writeback target still have the problem). Which is pretty bad...

ghidracadabra commented 1 year ago

I'm not familiar with this particular architecture, and it seems like you've mostly solved your problem, but here goes.

First, some background. Dynamic pcode injection was designed for situations such as modelling java bytecode. The semantics of some of the instructions in that language depend on information elsewhere in the program. For example, there's an instruction to push a field of an object on to the operand stack. Nothing local to the instruction tells you the size of the field (which can be either 4 or 8 bytes) - you have to look up that information in the "constant pool", which is essentially a data structure in a separate section of the binary. So, given an instance of that instruction, you need to consult that data structure for the information needed to generate the pcode. Different instances of the same instruction can correspond to different pcode.

That's not to say that dynamic injection wouldn't be useful in other contexts, such as this one. So I guess my question is: are you generating the pcode programmatically because you have to (as in the java case), or because it's much easier than implementing the semantics in sleigh? From your post I think it's the second, but as I said, I'm not familiar with the architecture.

It's possible to implement an instruction in java for emulation. For the new emulator (associated with the debugger) there's a even a facility to generate sleigh in java ("structured sleigh"). We're also working on some improvements regarding different pcode implementations for different analyses. It's possible that they would be useful for this case.

As for debugging, if you run Ghidra though eclipse you can put a breakpoint on the getPcode method of your injection and see what's calling it (which you've probably already done). This will at least given you an idea of which analyses are using the dynamically generated pcode.

shuffle2 commented 1 year ago

My first try at implementing the instructions in sleigh looked like this: https://gist.github.com/shuffle2/eee9b1bde68232799d2dc4f848ad9e10 Which had the problem of generating that kind of complex pcode for each instance of the instruction, which did not get simplified by the decompiler / analysis was not able to see e.g. writeback addend as constant (could probably be improved tho):

A workable solution to implement the functionality in sleigh would be to emit specialized constructors for all possible combinations of operands to the instructions, like https://github.com/xyzz/ghidra-rl78/blob/master/data/languages/rl78.slaspec.in#L149 . But using an external preprocessing tool is rather annoying and just feels like there's something missing in sleigh which would make these type of specifications easier to express. I'll probably eventually move to that method, but would like to know if there's an alternative.

I tried my luck at implementing it in a similar manner as the arm ldm register list, but got frustrated at sleigh and moved on to dynamic pcode before I got it working...(and, the ldm register list implementation seems like a manually-typed-out version of something that could be better expressed in some form of meta-sleigh (i.e. preprocessor tool), anyway).

For reference, the manual for this instruction is:

ryanmkurtz commented 1 year ago

It appears this question has been answered.

NationalSecurityAgency / ghidra

Flow out of InjectPayloadCallother implementation #4792