secure-software-engineering / phasar

A LLVM-based static analysis framework.
Other
919 stars 140 forks source link

Can Phasar be used to identify control-dependent instructions #685

Open Mohannadcse opened 7 months ago

Mohannadcse commented 7 months ago

I'm using phasar taint analysis to identify data-dependent and control-dependent instructions on argv.

psr::IDEExtendedTaintAnalysis<1, false> TaintAnalysis(&IR, &T, &I, &P,
                                                          Config);
    TaintAnalysis.setIFDSIDESolverConfig(SolverConfig);
    std::stringstream SolverConfigStr;
    SolverConfigStr << "Using solver config: "
                    << TaintAnalysis.getIFDSIDESolverConfig() << '\n';
    llvm::outs() << SolverConfigStr.str();
    psr::IDESolver Solver(TaintAnalysis);
    llvm::outs() << "Solving data-flow analysis ...\n";
    Solver.solve();
    llvm::outs() << "Data-flow analysis has been solved.\n";

    auto SolverRes = Solver.getSolverResults();
    auto AllResEntries = SolverRes.getAllResultEntries();

    for (auto &Res : AllResEntries) {
      const llvm::Instruction *Inst = Res.getRowKey();
      auto ResAtInst = SolverRes.resultsAt(Inst);

      for ([[maybe_unused]] const auto &Op : Inst->operands()) {
        for (auto &[Fact, Value] : ResAtInst) {
          llvm::Value *PotentialGepPointerOp = nullptr;
          if (auto *Gep = llvm::dyn_cast<llvm::GetElementPtrInst>(Op)) {
            PotentialGepPointerOp = Gep->getPointerOperand();
          }
          if (Op == Fact->base() || (PotentialGepPointerOp &&
                                     PotentialGepPointerOp == Fact->base())) {
            NeckCandidates.push_back(
                const_cast<llvm::Instruction *>(Inst)); // NOLINT ;-)
          }
        }
      }

The IDESolver results are as follows:

***************************************************************
*                  Raw IDESolver results                      *
***************************************************************

============ Results for function '__psrCRuntimeGlobalCtorsModel' ============

N: ret void | ID: -1
--------------------
    D: (<ZERO>; Offsets={ } #0) | V: Top

N: call void @__psrCRuntimeGlobalDtorsModel() | ID: -1
------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Top

============ Results for function 'main' ============

N: %3 = alloca i32, align 4, !psr.id !20 | ID: 124
--------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %4 = alloca i32, align 4, !psr.id !21 | ID: 125
--------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %5 = alloca i8**, align 8, !psr.id !22 | ID: 126
---------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %6 = alloca i32, align 4, !psr.id !23 | ID: 127
--------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %7 = alloca i32, align 4, !psr.id !24 | ID: 128
--------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %8 = alloca i32, align 4, !psr.id !25 | ID: 129
--------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %9 = alloca i32, align 4, !psr.id !26 | ID: 130
--------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %10 = alloca i32, align 4, !psr.id !27 | ID: 131
---------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %11 = alloca [1024 x i8], align 16, !psr.id !28 | ID: 132
------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: store i32 0, i32* %3, align 4, !psr.id !29 | ID: 133
-------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: store i32 %0, i32* %4, align 4, !psr.id !30 | ID: 134
--------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: store i8** %1, i8*** %5, align 8, !psr.id !34 | ID: 136
----------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: store i32 0, i32* %6, align 4, !dbg !39, !psr.id !41 | ID: 139
-----------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized

N: store i32 0, i32* %7, align 4, !dbg !43, !psr.id !45 | ID: 141
-----------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: store i32 0, i32* %8, align 4, !dbg !47, !psr.id !49 | ID: 143
-----------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized

N: store i32 0, i32* %9, align 4, !dbg !51, !psr.id !53 | ID: 145
-----------------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: store i32 1, i32* %10, align 4, !dbg !56, !psr.id !58 | ID: 147
------------------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized

N: br label %12, !dbg !59, !psr.id !60 | ID: 148
------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %13 = load i32, i32* %10, align 4, !dbg !61, !psr.id !63 | ID: 149
---------------------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized

N: %14 = load i32, i32* %4, align 4, !dbg !64, !psr.id !65 | ID: 150
--------------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %15 = icmp slt i32 %13, %14, !dbg !66, !psr.id !67 | ID: 151
---------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: br i1 %15, label %16, label %51, !dbg !68, !psr.id !69 | ID: 152
-------------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %17 = load i8**, i8*** %5, align 8, !dbg !70, !psr.id !73 | ID: 153
----------------------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized

N: %18 = load i32, i32* %10, align 4, !dbg !74, !psr.id !75 | ID: 154
---------------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %19 = sext i32 %18 to i64, !dbg !70, !psr.id !76 | ID: 155
-------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %20 = getelementptr inbounds i8*, i8** %17, i64 %19, !dbg !70, !psr.id !77 | ID: 156
---------------------------------------------------------------------------------------
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %21 = load i8*, i8** %20, align 8, !dbg !70, !psr.id !78 | ID: 157
---------------------------------------------------------------------
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %22 = getelementptr inbounds i8, i8* %21, i64 0, !dbg !70, !psr.id !79 | ID: 158
-----------------------------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %23 = load i8, i8* %22, align 1, !dbg !70, !psr.id !80 | ID: 159
-------------------------------------------------------------------
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %24 = sext i8 %23 to i32, !dbg !70, !psr.id !81 | ID: 160
------------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %25 = icmp eq i32 %24, 45, !dbg !82, !psr.id !83 | ID: 161
-------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: br i1 %25, label %26, label %45, !dbg !84, !psr.id !85 | ID: 162
-------------------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized

N: %27 = load i8**, i8*** %5, align 8, !dbg !86, !psr.id !88 | ID: 163
----------------------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %28 = load i32, i32* %10, align 4, !dbg !89, !psr.id !90 | ID: 164
---------------------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %29 = sext i32 %28 to i64, !dbg !86, !psr.id !91 | ID: 165
-------------------------------------------------------------
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

how to interpret IDESolver raw results to identify control-dependent instructions. For example, N: %17 = load i8**, i8*** %5, align 8, !dbg !70, !psr.id !73 | ID: 153 uses %5 = alloca i8**, align 8, !psr.id !22 | ID: 126, which uses argv, but how can I determine that %25 = icmp eq i32 %24, 45, !dbg !82, !psr.id !83 | ID: 161 is also depends on %17?

Mohannadcse commented 7 months ago

While %88 = icmp ne i8* %87, null, !dbg !209, !psr.id !212 | ID: 231 isn't dependent on argv, however, its dataflow facts look similar to %25

N: br label %86, !dbg !206, !psr.id !208 | ID: 229
--------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: %87 = getelementptr inbounds [1024 x i8], [1024 x i8]* %11, i64 0, i64 0, !dbg !209, !psr.id !211 | ID: 230
--------------------------------------------------------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom

N: %88 = icmp ne i8* %87, null, !dbg !209, !psr.id !212 | ID: 231
-----------------------------------------------------------------
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized

N: br i1 %88, label %89, label %91, !dbg !213, !psr.id !214 | ID: 232
---------------------------------------------------------------------
    D: (i8** %1 | ID: main.1; Offsets={ 0 } #0) | V: NotSanitized
    D: (i8*** %5 | ID: 126; Offsets={ 0 } #0) | V: NotSanitized
    D: (<ZERO>; Offsets={ } #0) | V: Bottom
Mohannadcse commented 7 months ago

Here is my program

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv) {
  int count_chars = 0, count_lines = 0;
  int total_chars = 0, total_lines = 0;

  for (int i = 1; i < argc; i++) {
    if (argv[i][0] == '-') { // Check for `-`
      switch (argv[i][1]) {
      case 'c':
        count_chars = 1;
        break;
      case 'l':
        count_lines = 1;
        break;
      default:
        printf("wrong flag %s\n", argv[i]);
        exit(0);
      }
    } else {
      printf("Provide correct args");
    }
  }

  char buffer[1024];
  while (fgets(buffer, 1024, stdin)) {
    if (count_chars)
      total_chars += strlen(buffer); 
    if (count_lines)
      total_lines++; 
  }

  if (count_chars)
    printf("Total chars = %d\n", total_chars);
  if (count_lines)
    printf("Total lines = %d\n", total_lines);

  if (buffer) {
    printf("dump\n");
  }
  return 0;
}
Mohannadcse commented 7 months ago

Stmt if (argv[i][0] == '-') corresponds to %15 and if (buffer) corresponds to %88

MMory commented 7 months ago

Hi Mohannad,

I am not sure whether I understand your question correctly. How do you mean your notion of "control dependent"? Are you referring to the problem "which values influence which controlflow decision"? The examples you referred to look like dataflow properties though.

For example, N: %17 = load i8**, i8* %5, align 8, !dbg !70, !psr.id !73 | ID: 153 uses %5 = alloca i8, align 8, !psr.id !22 | ID: 126

The extended taint analysis is optimized particularly for identifying flows that reach a sink, choosing to not generate facts at loads but instead following back def-use chains at sinks, with the idea of reducing the size of the ESG and also to simplify the implementation of the field-sensitivity. With that, unfortunately you don't see the dataflow facts at the individual statements.

There is another taint analysis, the IFDS taint analysis, in which this kind of optimization is not present and it should report the dataflow that you asked for. The same applies to the other flow you asked for in the same post. Downside of that analysis is the lack of field sensitivity, but at least in the example you provided it does not seem to matter that much.

Generally, the report of the IFDS/IDE solver lists for each program statement which dataflow facts hold before executing that statement. In the taint analysis, it should list the variables that are tainted there. It does not describe control dependency.

We have another analysis in phasar, the InstInteractionAnalysis, which describes interactions between instructions, but not just for one particular seed, but for all instructions. I am not sure to what extent this might be helpful for your purposes though.

Hope that helps.

Best regards Martin

Mohannadcse commented 7 months ago

Let's close this, Thanks a lot @MMory

Mohannadcse commented 7 months ago

@MMory I used IFDSSolver as we discussed and dumped the facts. Now, focusing on

Stmt if (argv[i][0] == '-') corresponds to %15 and if (buffer) corresponds to %88 Here are the corresponding facts to the instructions %15 and %88, how to figure out that %15 should be tainted while %88 shouldn't?

N: %15 = icmp slt i32 %13, %14, !dbg !66, !psr.id !67 | ID: 151
---------------------------------------------------------------
    D: %27 = load i8**, i8*** %5, align 8, !dbg !86, !psr.id !88 | ID: 163 | V: TOP
    D: %23 = load i8, i8* %22, align 1, !dbg !70, !psr.id !80 | ID: 159 | V: TOP
    D: %31 = load i8*, i8** %30, align 8, !dbg !86, !psr.id !93 | ID: 167 | V: TOP
    D: %24 = sext i8 %23 to i32, !dbg !70, !psr.id !81 | ID: 160 | V: TOP
    D: br i1 %25, label %26, label %45, !dbg !84, !psr.id !85 | ID: 162 | V: TOP
    D: %21 = load i8*, i8** %20, align 8, !dbg !70, !psr.id !78 | ID: 157 | V: TOP
    D: @zero_value = constant i2 0, align 4 | ID: -1 | V: BOTTOM
    D: %34 = sext i8 %33 to i32, !dbg !86, !psr.id !96 | ID: 170 | V: TOP
    D: %33 = load i8, i8* %32, align 1, !dbg !86, !psr.id !95 | ID: 169 | V: TOP
    D: %32 = getelementptr inbounds i8, i8* %31, i64 1, !dbg !86, !psr.id !94 | ID: 168 | V: TOP
    D: %25 = icmp eq i32 %24, 45, !dbg !82, !psr.id !83 | ID: 161 | V: TOP
    D: %30 = getelementptr inbounds i8*, i8** %27, i64 %29, !dbg !86, !psr.id !92 | ID: 166 | V: TOP
    D: %22 = getelementptr inbounds i8, i8* %21, i64 0, !dbg !70, !psr.id !79 | ID: 158 | V: TOP
    D: i8** %1 | ID: main.1 | V: TOP
    D: %20 = getelementptr inbounds i8*, i8** %17, i64 %19, !dbg !70, !psr.id !77 | ID: 156 | V: TOP
    D: switch i32 %34, label %37 [
    i32 99, label %35
    i32 108, label %36
  ], !dbg !97, !psr.id !98 | ID: 171 | V: TOP
    D: %17 = load i8**, i8*** %5, align 8, !dbg !70, !psr.id !73 | ID: 153 | V: TOP
    D: %5 = alloca i8**, align 8, !psr.id !22 | ID: 126 | V: TOP

N: %88 = icmp ne i8* %87, null, !dbg !209, !psr.id !212 | ID: 231
-----------------------------------------------------------------
    D: %22 = getelementptr inbounds i8, i8* %21, i64 0, !dbg !70, !psr.id !79 | ID: 158 | V: TOP
    D: %17 = load i8**, i8*** %5, align 8, !dbg !70, !psr.id !73 | ID: 153 | V: TOP
    D: %27 = load i8**, i8*** %5, align 8, !dbg !86, !psr.id !88 | ID: 163 | V: TOP
    D: %21 = load i8*, i8** %20, align 8, !dbg !70, !psr.id !78 | ID: 157 | V: TOP
    D: %23 = load i8, i8* %22, align 1, !dbg !70, !psr.id !80 | ID: 159 | V: TOP
    D: %31 = load i8*, i8** %30, align 8, !dbg !86, !psr.id !93 | ID: 167 | V: TOP
    D: %24 = sext i8 %23 to i32, !dbg !70, !psr.id !81 | ID: 160 | V: TOP
    D: br i1 %25, label %26, label %45, !dbg !84, !psr.id !85 | ID: 162 | V: TOP
    D: i8** %1 | ID: main.1 | V: TOP
    D: %20 = getelementptr inbounds i8*, i8** %17, i64 %19, !dbg !70, !psr.id !77 | ID: 156 | V: TOP
    D: switch i32 %34, label %37 [
    i32 99, label %35
    i32 108, label %36
  ], !dbg !97, !psr.id !98 | ID: 171 | V: TOP
    D: %5 = alloca i8**, align 8, !psr.id !22 | ID: 126 | V: TOP
    D: %30 = getelementptr inbounds i8*, i8** %27, i64 %29, !dbg !86, !psr.id !92 | ID: 166 | V: TOP
    D: %25 = icmp eq i32 %24, 45, !dbg !82, !psr.id !83 | ID: 161 | V: TOP
    D: %32 = getelementptr inbounds i8, i8* %31, i64 1, !dbg !86, !psr.id !94 | ID: 168 | V: TOP
    D: %33 = load i8, i8* %32, align 1, !dbg !86, !psr.id !95 | ID: 169 | V: TOP
    D: %34 = sext i8 %33 to i32, !dbg !86, !psr.id !96 | ID: 170 | V: TOP
    D: @zero_value = constant i2 0, align 4 | ID: -1 | V: BOTTOM