BinaryAnalysisPlatform / bap

Binary Analysis Platform
MIT License
2.05k stars 271 forks source link

Incomplete disassembly of a binary with ghidra-backend (vs default llvm-backend) #1611

Open basavesh opened 1 month ago

basavesh commented 1 month ago

Hi,

It looks like the ghidra backend is not exploring the binary properly and is missing a lot of instructions.

For example: Compile the attached mini word count file.

gcc wc.c -o wc 

llvm-backend

$bap ./wc --print-missing

Histogram:
38   ENDBR64

Lifted:  449
Failed:  0
Missed:  38

However, the ghidra backend lifts far fewer instructions.

$ bap ./wc --print-missing --x86-backend=ghidra

Histogram:

Lifted:  265
Failed:  0
Missed:  0

Source code:

/* Sample implementation of wc utility. */
//https://www.gnu.org/software/cflow/manual/html_node/Source-of-wc-command.html

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <ctype.h>

/* Unsigned integer type shared by every counter below. */
typedef unsigned long count_t;

/* Tallies for the file currently being read (characters, words, lines);
   counter () resets them to zero before each file. */
count_t ccount, wcount, lcount;

/* Running sums of the per-file tallies (characters, words, lines). */
count_t total_ccount = 0, total_wcount = 0, total_lcount = 0;

/* Write the message described by FMT and AP to stderr, then terminate the
 * process with exit status 1.  When PERR is zero a bare newline follows the
 * message; otherwise perror is called so that the description of the
 * current errno follows it instead. */
static void
error_print (int perr, char *fmt, va_list ap)
{
  vfprintf (stderr, fmt, ap);

  if (!perr)
    {
      fprintf (stderr, "\n");
    }
  else
    {
      perror (" ");
    }

  /* Unconditional: this function never returns to its caller. */
  exit (1);
}

/* Report a printf-style message on stderr and terminate the program.
 * The arguments are forwarded to error_print with PERR == 0, so no errno
 * description is appended. */
static void
errf (char *fmt, ...)
{
  va_list args;

  va_start (args, fmt);
  error_print (0, fmt, args);
  /* error_print ends in exit (), so control does not come back here; the
     va_end is kept to pair with va_start. */
  va_end (args);
}

/* Report a printf-style message on stderr, followed by the description of
 * the current errno, and terminate the program.  The arguments are
 * forwarded to error_print with PERR == 1. */
static void
perrf (char *fmt, ...)
{
  va_list args;

  va_start (args, fmt);
  error_print (1, fmt, args);
  /* error_print ends in exit (), so control does not come back here; the
     va_end is kept to pair with va_start. */
  va_end (args);
}

/* Print one result row on stdout: the line, word and character counts (in
 * that order, each in a field at least six columns wide) followed by FILE
 * and a newline. */
void
report (char *file, count_t ccount, count_t wcount, count_t lcount)
{
  fprintf (stdout, "%6lu %6lu %6lu %s\n", lcount, wcount, ccount, file);
}

/* Word-constituent predicate: yields non-zero exactly when isalpha accepts
 * C, and zero otherwise. */
static int
isword (unsigned char c)
{
  int alphabetic = isalpha (c);

  return alphabetic;
}

/* Account for one input character C: always increment the character
 * counter ccount and, when C is a newline, the line counter lcount too.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement.  Without the wrapper, `if (x) COUNT (c);' would guard only the
 * ccount++ and run the newline test unconditionally, and an `else' after
 * it would not compile.  Invoke as `COUNT (c);' with a trailing semicolon. */
#define COUNT(c)           \
  do                       \
    {                      \
      ccount++;            \
      if ((c) == '\n')     \
        lcount++;          \
    }                      \
  while (0)

/* Read the next word from FP, updating the global counters as it goes.
 *
 * Returns 0 when FP is already at end of file or when getc yields EOF
 * before or while reading a word (EOF covers both end of input and read
 * errors); returns 1 when a word was read and terminated by a non-word
 * character. */
int
getword (FILE *fp)
{
  int ch;

  if (feof (fp))
    return 0;

  /* Phase 1: count and skip everything up to the first word character.
     That character bumps wcount here but is counted as a character only
     in phase 2. */
  for (;;)
    {
      ch = getc (fp);
      if (ch == EOF)
        return 0;
      if (isword (ch))
        {
          wcount++;
          break;
        }
      COUNT (ch);
    }

  /* Phase 2: count the characters of the word, including the first
     non-word character that ends it. */
  do
    {
      COUNT (ch);
      if (!isword (ch))
        break;
      ch = getc (fp);
    }
  while (ch != EOF);

  return ch != EOF;
}

/* Count the characters, words and lines of the file named FILE, print its
 * row via report (), and add its tallies to the running totals.  If the
 * file cannot be opened, perrf () reports the failure and the program
 * exits. */
void
counter (char *file)
{
  FILE *stream = fopen (file, "r");

  if (stream == NULL)
    perrf ("cannot open file `%s'", file);

  /* Start this file from a clean slate. */
  lcount = 0;
  wcount = 0;
  ccount = 0;

  /* getword () does all the counting; loop until it signals EOF. */
  while (getword (stream) != 0)
    continue;

  fclose (stream);

  report (file, ccount, wcount, lcount);

  total_lcount += lcount;
  total_wcount += wcount;
  total_ccount += ccount;
}

/* Entry point: run counter () on every command-line argument after the
 * program name.  With no file arguments a usage message is printed through
 * errf (), which ends the program.  When more than one file was given, a
 * final "total" row is printed.  Returns 0. */
int
main (int argc, char **argv)
{
  int idx;

  if (argc < 2)
    errf ("usage: wc FILE [FILE...]");

  idx = 1;
  while (idx < argc)
    {
      counter (argv[idx]);
      idx++;
    }

  /* A grand total is only shown for two or more files. */
  if (argc > 2)
    report ("total", total_ccount, total_wcount, total_lcount);

  return 0;
}

@ivg

basavesh commented 2 weeks ago

One thing I noticed while comparing the llvm_disasm.cpp with ghidra_disasm.cpp is that llvm_disasm calls the step function recursively while ghidra_disasm does not. I don't know if this relates to the exploration issue.

Edit: It looks like the recursion is more about handling the prefix case.