NationalSecurityAgency / ghidra

Ghidra is a software reverse engineering (SRE) framework
https://www.nsa.gov/ghidra
Apache License 2.0
50.51k stars 5.77k forks source link

8051: Decompiling the simplest code. #1282

Open beketata opened 4 years ago

beketata commented 4 years ago

Below is an example of a C function in the “Hello World” style for the 8051:

typedef unsigned short word;

word xdata _buf[0x100];

void Fn1()
{
  word i;

  for( i=0; i<0x100; i++ )
  {
    _buf[i] = 0;
  }
}

After compiling it with Keil and decompiling it with Ghidra, it looks like this:

void FUN_CODE_0000(void)
{
 char cVar1;
 byte bVar2;
 undefined *puVar3;

 bVar2 = 0;
 cVar1 = 0;
 while ((cVar1 == '\0') << 7 < '\0') {
   puVar3 = (undefined *)CONCAT11(cVar1 << 1 | CARRY1(bVar2,bVar2),bVar2 * '\x02');
   *puVar3 = 0;
   puVar3[1] = 0;
   bVar2 = bVar2 + 1;
   if (bVar2 == 0) {
     cVar1 = cVar1 + '\x01';
   }
 }
 return;
}

Disassembly listing:

                             //
                             // CODE 
                             // CODE: CODE:0000-CODE:0025
                             //
                             **************************************************************
                             *                          FUNCTION                          *
                             **************************************************************
                             void __stdcall FUN_CODE_0000(void)
             void              <VOID>         <RETURN>
                             FUN_CODE_0000
       CODE:0000 e4              CLR        A
       CODE:0001 fd              MOV        R5,A
       CODE:0002 fc              MOV        R4,A
                             LAB_CODE_0003                                   XREF[1]:     CODE:0023(j)  
       CODE:0003 c3              CLR        CY
       CODE:0004 ec              MOV        A,R4
       CODE:0005 94 01           SUBB       A,#0x1
       CODE:0007 50 1c           JNC        LAB_CODE_0025
       CODE:0009 ed              MOV        A,R5
       CODE:000a 25 e0           ADD        A,A
       CODE:000c ff              MOV        R7,A
       CODE:000d ec              MOV        A,R4
       CODE:000e 33              RLC        A
       CODE:000f fe              MOV        R6,A
       CODE:0010 74 00           MOV        A,#0x0
       CODE:0012 2f              ADD        A,R7
       CODE:0013 f5 82           MOV        DPL,A
       CODE:0015 74 00           MOV        A,#0x0
       CODE:0017 3e              ADDC       A,R6
       CODE:0018 f5 83           MOV        DPH,A
       CODE:001a e4              CLR        A
       CODE:001b f0              MOVX       @DPTR,A
       CODE:001c a3              INC        DPTR
       CODE:001d f0              MOVX       @DPTR,A
       CODE:001e 0d              INC        R5
       CODE:001f ed              MOV        A,R5
       CODE:0020 70 01           JNZ        LAB_CODE_0023
       CODE:0022 0c              INC        R4
                             LAB_CODE_0023                                   XREF[1]:     CODE:0020(j)  
       CODE:0023 80 de           SJMP       LAB_CODE_0003
                             LAB_CODE_0025                                   XREF[1]:     CODE:0007(j)  
       CODE:0025 22              RET

Could anyone suggest how to make it readable? Or is that not possible at the current stage of development?

Zipped binary file.

astrelsky commented 4 years ago

> Could anyone suggest me how to make it readable? Or is it not possible at the current stage of development process?

What does the disassembly look like? There is likely some optimization going on when Keil compiles the C code. What you write in C, C++, etc. isn't always what you get in machine code.

beketata commented 4 years ago

Here is the above example with optimization switched off. (I think the concept of "optimization" for the Keil compiler is not the same as for Ghidra.)

#pragma ot(0)

typedef unsigned short word;

word xdata _buf[0x100];

void Fn1()
{
  word i;

  for( i=0; i<0x100; i++ )
  {
    _buf[i] = 0;
  }
}

After decompiling, it now looks like this:

void FUN_CODE_0000(void)
{
  undefined *puVar1;

  DAT_EXTMEM_0200 = '\0';
  DAT_EXTMEM_0201 = 0;
  while ((DAT_EXTMEM_0200 == '\0') << 7 < '\0') {
    puVar1 = (undefined *)
             CONCAT11(DAT_EXTMEM_0200 << 1 | CARRY1(DAT_EXTMEM_0201,DAT_EXTMEM_0201),
                      DAT_EXTMEM_0201 * '\x02');
    *puVar1 = 0;
    puVar1[1] = 0;
    DAT_EXTMEM_0201 = DAT_EXTMEM_0201 + 1;
    if (DAT_EXTMEM_0201 == 0) {
      DAT_EXTMEM_0200 = DAT_EXTMEM_0200 + '\x01';
    }
  }
  return;
}

Disassembly listing:

                             //
                             // CODE 
                             // CODE: CODE:0000-CODE:0044
                             //
                             **************************************************************
                             *                          FUNCTION                          *
                             **************************************************************
                             void __stdcall FUN_CODE_0000(void)
             void              <VOID>         <RETURN>
                             FUN_CODE_0000
       CODE:0000 90 02 00        MOV        DPTR,#0x200
       CODE:0003 e4              CLR        A
       CODE:0004 f0              MOVX       @DPTR=>DAT_EXTMEM_0200,A
       CODE:0005 a3              INC        DPTR
       CODE:0006 e4              CLR        A
       CODE:0007 f0              MOVX       @DPTR=>DAT_EXTMEM_0201,A
                             LAB_CODE_0008                                   XREF[1]:     CODE:0042(j)  
       CODE:0008 90 02 00        MOV        DPTR,#0x200
       CODE:000b e0              MOVX       A,@DPTR=>DAT_EXTMEM_0200
       CODE:000c fe              MOV        R6,A
       CODE:000d a3              INC        DPTR
       CODE:000e e0              MOVX       A,@DPTR=>DAT_EXTMEM_0201
       CODE:000f ff              MOV        R7,A
       CODE:0010 c3              CLR        CY
       CODE:0011 ee              MOV        A,R6
       CODE:0012 94 01           SUBB       A,#0x1
       CODE:0014 50 2e           JNC        LAB_CODE_0044
       CODE:0016 90 02 00        MOV        DPTR,#0x200
       CODE:0019 e0              MOVX       A,@DPTR=>DAT_EXTMEM_0200
       CODE:001a fe              MOV        R6,A
       CODE:001b a3              INC        DPTR
       CODE:001c e0              MOVX       A,@DPTR=>DAT_EXTMEM_0201
       CODE:001d ff              MOV        R7,A
       CODE:001e ef              MOV        A,R7
       CODE:001f 25 e0           ADD        A,A
       CODE:0021 ff              MOV        R7,A
       CODE:0022 ee              MOV        A,R6
       CODE:0023 33              RLC        A
       CODE:0024 fe              MOV        R6,A
       CODE:0025 74 00           MOV        A,#0x0
       CODE:0027 2f              ADD        A,R7
       CODE:0028 f5 82           MOV        DPL,A
       CODE:002a 74 00           MOV        A,#0x0
       CODE:002c 3e              ADDC       A,R6
       CODE:002d f5 83           MOV        DPH,A
       CODE:002f e4              CLR        A
       CODE:0030 f0              MOVX       @DPTR,A
       CODE:0031 a3              INC        DPTR
       CODE:0032 e4              CLR        A
       CODE:0033 f0              MOVX       @DPTR,A
       CODE:0034 90 02 01        MOV        DPTR,#0x201
       CODE:0037 e0              MOVX       A,@DPTR=>DAT_EXTMEM_0201
       CODE:0038 04              INC        A
       CODE:0039 f0              MOVX       @DPTR=>DAT_EXTMEM_0201,A
       CODE:003a 70 06           JNZ        LAB_CODE_0042
       CODE:003c 90 02 00        MOV        DPTR,#0x200
       CODE:003f e0              MOVX       A,@DPTR=>DAT_EXTMEM_0200
       CODE:0040 04              INC        A
       CODE:0041 f0              MOVX       @DPTR=>DAT_EXTMEM_0200,A
                             LAB_CODE_0042                                   XREF[1]:     CODE:003a(j)  
       CODE:0042 80 c4           SJMP       LAB_CODE_0008
                             LAB_CODE_0044                                   XREF[1]:     CODE:0014(j)  
       CODE:0044 22              RET

Zipped binary file.

astrelsky commented 4 years ago

> The above example with switched off optimization: (I think "optimization" concept for Keil compiler is not the same as for Ghidra.)

I did mean optimization by the Keil compiler. A lot of optimization happens behind the scenes with C and C++ compilers. The code you write is almost never the code you get.

Defining puVar1 as a short* may help with decompilation. I'm not familiar with the 8051 processor, but from a quick glance at the instruction set it appears to operate only on bytes. When no debug information is available, Ghidra attempts to determine the datatype from the opcode. However, since these instructions operate only on bytes, it will determine every datatype to be a signed/unsigned byte or a pointer to one.

I'm curious to see what it would look like if you compile the C code with _buf declared as an array of unsigned char.

beketata commented 4 years ago

Without any "words" now :smile:. Keil C source:

typedef unsigned char byte;

byte xdata _buf[0x80];

void Fn1()
{
  byte i;

  for( i=0; i<0x80; i++ )
  {
    _buf[i] = 0;
  }
}

Ghidra decompilation result:

void Fn1(void)
{
  byte *local_R6R7_1;

  local_R6R7_1._1_1_ = 0;
  while ((0x80 < 0x80U - ((((byte)local_R6R7_1 < 0x80) << 7) >> 7)) << 7 < '\0') {
    *(undefined *)(ushort)(byte)local_R6R7_1 = 0;
    local_R6R7_1._1_1_ = (byte)local_R6R7_1 + 1;
  }
  return;
}

Disassembly listing:

                             //
                             // CODE 
                             // CODE: CODE:0000-CODE:001b
                             //
                             **************************************************************
                             *                          FUNCTION                          *
                             **************************************************************
                             void __stdcall Fn1(void)
             void              <VOID>         <RETURN>
             byte *            R6R7:2         local_R6R7_1                            XREF[1]:     CODE:0001(W)  
                             Fn1
       CODE:0000 e4              CLR        A
       CODE:0001 ff              MOV        local_R6R7_1,A
                             LAB_CODE_0002                                   XREF[1]:     CODE:0019(j)  
       CODE:0002 ef              MOV        A,local_R6R7_1
       CODE:0003 c3              CLR        CY
       CODE:0004 94 80           SUBB       A,#0x80
       CODE:0006 74 80           MOV        A,#0x80
       CODE:0008 94 80           SUBB       A,#0x80
       CODE:000a 50 0f           JNC        LAB_CODE_001b
       CODE:000c 74 00           MOV        A,#0x0
       CODE:000e 2f              ADD        A,local_R6R7_1
       CODE:000f f5 82           MOV        DPL,A
       CODE:0011 e4              CLR        A
       CODE:0012 34 00           ADDC       A,#0x0
       CODE:0014 f5 83           MOV        DPH,A
       CODE:0016 e4              CLR        A
       CODE:0017 f0              MOVX       @DPTR,A
       CODE:0018 0f              INC        local_R6R7_1
       CODE:0019 80 e7           SJMP       LAB_CODE_0002
                             LAB_CODE_001b                                   XREF[1]:     CODE:000a(j)  
       CODE:001b 22              RET

astrelsky commented 4 years ago

OK, so I think the decompilation could be cleaned up a bit, either by changing PSW to a context register and changing the following to be its bits:

@define CY      "PSW[7,1]"
@define AC      "PSW[6,1]"
@define N       "PSW[5,1]"
@define RS1     "PSW[4,1]"
@define RS0     "PSW[3,1]"
@define OV      "PSW[2,1]"
@define Z       "PSW[1,1]"

or by ensuring that the above do not appear in the code. The right and left shifting that you are seeing comes from checking whether a carry has occurred, i.e., CY results in PSW << 7. This is just a suggestion, though; I don't know much about this processor.