GaloisInc / saw-script

The SAW scripting language.
BSD 3-Clause "New" or "Revised" License
441 stars 63 forks source link

llvm_sizeof: Use debugging information when alias information is optimized away #1291

Open RyanGlScott opened 3 years ago

RyanGlScott commented 3 years ago

Consider the following example:

// test.c

#include <stdlib.h>

struct foo {
  int a;
};

size_t baz() {
  struct foo bar = { .a = 1 };
  return sizeof(struct foo) * bar.a;
}

// test.saw

mod <- llvm_load_module "test.bc";

let baz_spec = do {
  llvm_execute_func [];
  let s = llvm_sizeof mod (llvm_alias "struct.foo");
  llvm_return (llvm_term {{ `(s) : [64] }});
};

llvm_verify mod "baz" [] false baz_spec abc;

When test.c is compiled without optimizations, the proof succeeds:

$ clang -g -emit-llvm -c test.c -O0
$ ~/Software/saw-0.8/bin/saw test.saw

[16:05:46.989] Loading file "/home/rscott/Documents/Hacking/SAW/test.saw"
[16:05:47.009] Verifying baz ...
[16:05:47.009] Simulating baz ...
[16:05:47.010] Checking proof obligations baz ...
[16:05:47.010] Proof succeeded! baz

If test.c is compiled with -O2, however, the proof fails:

$ clang -g -emit-llvm -c test.c -O2
$ ~/Software/saw-0.8/bin/saw test.saw

[16:06:03.113] Loading file "/home/rscott/Documents/Hacking/SAW/test.saw"
[16:06:03.116] Stack trace:
"llvm_verify" (/home/rscott/Documents/Hacking/SAW/test.saw:11:1-11:12):
"baz_spec" (/home/rscott/Documents/Hacking/SAW/test.saw:11:32-11:40):
"llvm_sizeof" (/home/rscott/Documents/Hacking/SAW/test.saw:7:11-7:22):
llvm_sizeof: Unsupported type: %struct.foo
Details:
Unknown type alias Ident "struct.foo"

This is because with -O2, LLVM will optimize away the alias information for foo in the compiled bitcode. Compare the bitcode for the -O0 version, which contains %struct.foo = type { i32 }:

```
$ clang -g -emit-llvm -S test.c -O0 -frecord-command-line
$ cat test.ll
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

%struct.foo = type { i32 }

@__const.baz.bar = private unnamed_addr constant %struct.foo { i32 1 }, align 4

; Function Attrs: noinline nounwind optnone uwtable
define dso_local i64 @baz() #0 !dbg !8 {
  %1 = alloca %struct.foo, align 4
  call void @llvm.dbg.declare(metadata %struct.foo* %1, metadata !14, metadata !DIExpression()), !dbg !19
  %2 = bitcast %struct.foo* %1 to i8*, !dbg !19
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %2, i8* align 4 bitcast (%struct.foo* @__const.baz.bar to i8*), i64 4, i1 false), !dbg !19
  %3 = getelementptr inbounds %struct.foo, %struct.foo* %1, i32 0, i32 0, !dbg !20
  %4 = load i32, i32* %3, align 4, !dbg !20
  %5 = sext i32 %4 to i64, !dbg !21
  %6 = mul i64 4, %5, !dbg !22
  ret i64 %6, !dbg !23
}

; Function Attrs: nounwind readnone speculatable willreturn
declare void @llvm.dbg.declare(metadata, metadata, metadata) #1

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #2

attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }
attributes #2 = { argmemonly nounwind willreturn }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
!llvm.commandline = !{!7}

!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0-4ubuntu1 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "test.c", directory: "/home/rscott/Documents/Hacking/SAW")
!2 = !{}
!3 = !{i32 7, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !{i32 1, !"wchar_size", i32 4}
!6 = !{!"clang version 10.0.0-4ubuntu1 "}
!7 = !{!"/usr/lib/llvm-10/bin/clang -g -emit-llvm -S test.c -O0 -frecord-command-line"}
!8 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 9, type: !9, scopeLine: 9, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
!9 = !DISubroutineType(types: !10)
!10 = !{!11}
!11 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !12, line: 46, baseType: !13)
!12 = !DIFile(filename: "/usr/lib/llvm-10/lib/clang/10.0.0/include/stddef.h", directory: "")
!13 = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned)
!14 = !DILocalVariable(name: "bar", scope: !8, file: !1, line: 10, type: !15)
!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo", file: !1, line: 5, size: 32, elements: !16)
!16 = !{!17}
!17 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !15, file: !1, line: 6, baseType: !18, size: 32)
!18 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!19 = !DILocation(line: 10, column: 14, scope: !8)
!20 = !DILocation(line: 11, column: 35, scope: !8)
!21 = !DILocation(line: 11, column: 31, scope: !8)
!22 = !DILocation(line: 11, column: 29, scope: !8)
!23 = !DILocation(line: 11, column: 3, scope: !8)
```

To the bitcode for the -O2 version, which does not:

```
$ clang -g -emit-llvm -S test.c -O2 -frecord-command-line
$ cat test.ll
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Function Attrs: norecurse nounwind readnone uwtable
define dso_local i64 @baz() local_unnamed_addr #0 !dbg !8 {
  call void @llvm.dbg.value(metadata i32 1, metadata !15, metadata !DIExpression()), !dbg !20
  ret i64 4, !dbg !21
}

; Function Attrs: nounwind readnone speculatable willreturn
declare void @llvm.dbg.value(metadata, metadata, metadata) #1

attributes #0 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
!llvm.commandline = !{!7}

!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0-4ubuntu1 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "test.c", directory: "/home/rscott/Documents/Hacking/SAW")
!2 = !{}
!3 = !{i32 7, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !{i32 1, !"wchar_size", i32 4}
!6 = !{!"clang version 10.0.0-4ubuntu1 "}
!7 = !{!"/usr/lib/llvm-10/bin/clang -g -emit-llvm -S test.c -O2 -frecord-command-line"}
!8 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 9, type: !9, scopeLine: 9, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !14)
!9 = !DISubroutineType(types: !10)
!10 = !{!11}
!11 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !12, line: 46, baseType: !13)
!12 = !DIFile(filename: "/usr/lib/llvm-10/lib/clang/10.0.0/include/stddef.h", directory: "")
!13 = !DIBasicType(name: "long unsigned int", size: 64, encoding: DW_ATE_unsigned)
!14 = !{!15}
!15 = !DILocalVariable(name: "bar", scope: !8, file: !1, line: 10, type: !16)
!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo", file: !1, line: 5, size: 32, elements: !17)
!17 = !{!18}
!18 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !16, file: !1, line: 6, baseType: !19, size: 32)
!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!20 = !DILocation(line: 0, scope: !8)
!21 = !DILocation(line: 11, column: 3, scope: !8)
```

Just because the bitcode is missing an alias doesn't mean we have to give up, however. The -O2 bitcode actually does contain the size of struct foo elsewhere in debugging information. See the size: 32 bit below:

!16 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo", file: !1, line: 5, size: 32, elements: !17)

I propose that in the event that SAW cannot resolve an alias, it should consult the debugging information as a fallback. This isn't guaranteed to always succeed, since it's possible that LLVM might also optimize away the debugging information. But for many cases, the info is right there, so we might as well use it.

RyanGlScott commented 3 years ago

I was originally hopeful that this could be a one-module fix, but things are rarely that simple. My first inclination was to modify crucible's lookupAlias function, which is responsible for throwing the "Unknown type alias" error:

lookupAlias :: (?lc :: TypeContext, MonadError String m) => Ident -> m SymType
lookupAlias i =
  case llvmAliasMap ?lc ^. at i of
    Just stp -> return stp
    Nothing  -> throwError $ unwords ["Unknown type alias", show i]

If looking up an Ident in the llvmAliasMap fails, then simply consult the llvmMetadataMap as a last resort... at least, that's what I thought at first. Unfortunately, as far as I can tell, LLVM's metadata doesn't provide you with enough information to faithfully construct an entire SymType out of it (e.g., it doesn't tell you a struct's packedness, or the padding of a struct's fields). As a result, we'll likely need to do the heavy lifting in saw-script's llvm_sizeof function:

https://github.com/GaloisInc/saw-script/blob/2e4fc0603da85bb1b188d4739a3386e25eea50ab/src/SAWScript/Crucible/LLVM/Builtins.hs#L1761-L1775

If liftMemType fails to look up an alias type, then we could fall back to using the llvmMetadataMap. The problem is: how do we know if liftMemType fails specifically due to an unknown type alias? There are a number of different things that can cause liftMemType to fail, but simply matching on Left err doesn't distinguish between any of these different failure modes.

One possible way forward is to change the MonadError String m constraints used in lookupAlias (and elsewhere in crucible) to something like MonadError TypeContextException m, where:

-- | Describes what problem was encountered in a computation using a
-- 'TypeContext'.
data TypeContextException
  = UnknownTypeAlias Ident
  | NonMemTypeEncountered SymType
  | NonRetTypeEncountered SymType
  | SymTypeConversionErrors [Doc Void]
  | TranslationError String
  deriving Show

instance Exception TypeContextException where
  displayException (UnknownTypeAlias i)            = unwords ["Unknown type alias", show i]
  displayException (NonMemTypeEncountered stp)     = unlines ["Expected memory type", show stp]
  displayException (NonRetTypeEncountered stp)     = unlines ["Expected return type", show stp]
  displayException (SymTypeConversionErrors edocs) = unlines (map show edocs)
  displayException (TranslationError s)            = s

Then llvm_sizeof could match on Left (UnknownTypeAlias _) to distinguish it from other forms of failure. In the UnknownTypeAlias case, we would look up the debugging info in llvmMetadataMap; for any other failure, we would simply propagate the error as llvm_sizeof currently does.

One annoying wrinkle: how exactly do we look up the relevant debugging info in llvmMetadataMap? I thought that llvm-pretty's guessAliasInfo function would be just the tool for the job, but the catch is that it returns an Info. Info has two issues:

  1. Info's Structure constructor doesn't record its size, nor does Union nor ArrInfo. I suppose we could change these constructors to store the size, but that feels a bit ad hoc. (Why not store all other fields of DICompositeType as well?)
  2. Even if we did decide to change Info to store the size, we'd only be able to do so for certain forms of DICompositeTypes. Notably, Info doesn't have a constructor that corresponds to enums.

There's enough of an impedance mismatch between Info and what I need that it would likely be easier just to cargo-cult what guessAliasInfo does, except making it return a DICompositeType instead of an Info. Either way, I'll need to change llvm-pretty, since I would need to make use of certain functions from Text.LLVM.DebugUtils that aren't currently exported.

robdockins commented 3 years ago

One intermediate step we could take would be to allow the user to explicitly give alias definitions. Then, if the alias exists in the bitcode we could just check that it matches, but if it doesn't exist, we can just take it as given.

RyanGlScott commented 3 years ago

That does sound appealing. What sort of API would you envision for this feature? Something like llvm_declare_alias : String -> LLVMType -> TopLevel LLVMType?

robdockins commented 3 years ago

I was thinking llvm_declare_alias : String -> LLVMType -> TopLevel (), but yes, basically. Then it would be pretty easy to cut-and-paste, e.g.:

llvm_declare_alias "struct.foo" (llvm_type "{ i32 }");

robdockins commented 3 years ago

I guess one wrinkle would be to make sure that recursive groups of aliases could be declared without causing errors.

RyanGlScott commented 2 years ago

Another use case for this feature is supporting union types, as Clang has an annoying habit of optimizing away aliases that are mentioned in unions. For instance, saw-script's own test_llvm_union test case will work when compiled without optimizations, but not with optimizations. Compare the bitcode at -O0 with Clang 10:

%struct.st = type { i32, %union.anon }
%union.anon = type { %struct.inc_2_st }
%struct.inc_2_st = type { i32, i32 }
%struct.inc_1_st = type { i32 }

Versus the bitcode at -O1:

%struct.st = type { i32, %union.anon }
%union.anon = type { %struct.inc_2_st }
%struct.inc_2_st = type { i32, i32 }

In the optimized version, the %struct.inc_1_st alias has been optimized away, which leaves SAW unable to resolve its fields:

$ ~/Software/saw-0.9.0.99-Linux-x86_64/bin/saw test.saw

[21:58:33.287] Loading file "/home/rscott/Documents/Hacking/Haskell/saw-script/intTests/test_llvm_union/test.saw"
[21:58:33.322] Verifying 'inc_1' using 'llvm_verify':
[21:58:33.416] Stack trace:
"llvm_verify" (/home/rscott/Documents/Hacking/Haskell/saw-script/intTests/test_llvm_union/test.saw:47:1-47:12):
"inc_spec" (/home/rscott/Documents/Hacking/Haskell/saw-script/intTests/test_llvm_union/test.saw:47:30-47:38):
"llvm_points_to" (/home/rscott/Documents/Hacking/Haskell/saw-script/intTests/test_llvm_union/test.saw:20:7-20:21):
Found struct field name: 'x'
in struct with name 'inc_1_st'.
However, the offset of this field found in the debug information could not
be correlated with the computed LLVM type of the setup value:
%struct.inc_1_st*

It would be convenient to be able to declare an alias for inc_1_st to ensure that it can be used even if Clang optimizes it away.