GaloisInc / llvm-pretty-bc-parser

Parser for the llvm bitcode format
Other
60 stars 6 forks source link

`llvm-pretty-bc-parser` inlines metadata nodes unnecessarily #260

Open RyanGlScott opened 1 year ago

RyanGlScott commented 1 year ago

If you compile this program:

// test.c
int main(void) {
  int x = 0;
  return x;
}

Like so:

$ clang-14 test.c -emit-llvm -g -c && clang-14 test.c -emit-llvm -g -S

Then this is what the resulting test.ll file will look like:

```ll ; ModuleID = 'test.c' source_filename = "test.c" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" ; Function Attrs: noinline nounwind optnone uwtable define dso_local i32 @main() #0 !dbg !10 { %1 = alloca i32, align 4 %2 = alloca i32, align 4 store i32 0, i32* %1, align 4 call void @llvm.dbg.declare(metadata i32* %2, metadata !15, metadata !DIExpression()), !dbg !16 store i32 0, i32* %2, align 4, !dbg !16 %3 = load i32, i32* %2, align 4, !dbg !17 ret i32 %3, !dbg !18 } ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } attributes #1 = { nofree nosync nounwind readnone speculatable willreturn } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} !llvm.ident = !{!9} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "Ubuntu clang version 14.0.0-1ubuntu1.1", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "test.c", directory: "/home/ryanscott/Documents/Hacking/Haskell/llvm-pretty-bc-parser", checksumkind: CSK_MD5, checksum: "374aba39e8173c9c7f53dd059756dcd6") !2 = !{i32 7, !"Dwarf Version", i32 5} !3 = !{i32 2, !"Debug Info Version", i32 3} !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"PIC Level", i32 2} !6 = !{i32 7, !"PIE Level", i32 2} !7 = !{i32 7, !"uwtable", i32 1} !8 = !{i32 7, !"frame-pointer", i32 2} !9 = !{!"Ubuntu clang version 14.0.0-1ubuntu1.1"} !10 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 2, type: !11, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !14) !11 = !DISubroutineType(types: !12) !12 = !{!13} !13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) !14 = !{} !15 = !DILocalVariable(name: "x", scope: !10, file: !1, line: 3, type: !13) !16 = !DILocation(line: 3, column: 7, scope: !10) !17 = !DILocation(line: 4, column: 10, scope: !10) !18 = !DILocation(line: 4, column: 3, scope: !10) ```

So far, so good. Now let's see what happens when we round-trip this through llvm-pretty-bc-parser:

``` λ> parseBitCodeFromFile "test.bc" >>= print . fmap (ppLLVM llvmVlatest ppModule) Right source_filename = "test.c" target triple = "x86_64-pc-linux-gnu-" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" declare default void @llvm.dbg.declare(metadata, metadata, metadata) define default i32 @main() !dbg !10 { ;

The part that I want to draw attention to is the call void @llvm.dbg.declare(...) statement. In the original test.ll file, we have this:

  call void @llvm.dbg.declare(metadata i32* %2, metadata !15, metadata !DIExpression()), !dbg !16

But in the roundtripped code, we instead have this:

  call void @llvm.dbg.declare(metadata i32* %2,
                              metadata !DILocalVariable(scope: !10, name: "x",
                                                        file: !DIFile(filename: "test.c",
                                                                      directory: "/home/ryanscott/Documents/Hacking/Haskell/llvm-pretty-bc-parser"),
                                                        line: 3, type: !11, arg: 0, flags: 0,
                                                        align: 0),
                              metadata !DIExpression()), !dbg !DILocation(line: 3, column: 7,
                                                                          scope: !10)

Note how instead of printing references to !15 and !16, the latter inlines the definitions of !15 and !16 entirely, resulting in much more verbose code.

Although strange, this is not wrong in this example, since both versions of the program are equivalent. This is not always the case, however. If you repeat this experiment with the test case in #258 (using Clang 17), however, you will see this in the original .ll file:

  call void @llvm.dbg.assign(metadata i1 undef, metadata !23, metadata !DIExpression(), metadata !24, metadata ptr undef, metadata !DIExpression()), !dbg !25

...

!24 = distinct !DIAssignID()

And this in the roundtripped version:

  call void @llvm.dbg.assign(metadata i1 undef,
                             metadata !DILocalVariable(scope: !17, name: "x",
                                                       file: !DIFile(filename: "test.c",
                                                                     directory: "/home/ryanscott/Documents/Hacking/Haskell/llvm-pretty-bc-parser"),
                                                       line: 4,
                                                       type: !DIBasicType(tag: 36, name: "int",
                                                                          size: 32, align: 0,
                                                                          encoding: 5, flags: 0),
                                                       arg: 0, flags: 0, align: 0),
                             metadata !DIExpression(), metadata !DIAssignID(),
                             metadata ptr undef,
                             metadata !DIExpression()), !dbg !DILocation(line: 0, column: 0,
                                                                         scope: !17)

Note that we are now inlining the expression metdata !DIAssignID. This is invalid, as LLVM requires that all DIAssignID nodes be distinct. This is the case in the original .ll file, but the roundtripped version drops the distinct keyword after inlining. By that point, it is too late, as it is not possible to attach the distinct to inline metadata nodes—the only way to do so is by putting the node in the top-level list of metadata nodes (e.g., !24 in the original .ll file).

In order to make the test case from #258 work, we will need to prevent llvm-pretty-bc-parser from performing this gratuitous inlining. This issue tracks that task.

RyanGlScott commented 1 year ago

I briefly looked at this, and there appear to be two separate (but related) issues involved:

  1. The metadata references after each use of !dbg are inlined. This happens because !dbg locations are not handled when parsing a PartialMetadata value, like most other forms of metadata. Instead, they are handled here, when parsing a FUNC_CODE_DEBUG_LOC:

    https://github.com/GaloisInc/llvm-pretty-bc-parser/blob/8901a9e54d6e0e1a755830c56efe48d9dc8ea758/src/Data/LLVM/BitCode/IR/Function.hs#L763-L771

  2. The metadata arguments to functions such as @llvm.dbg.declare are inlined. Again, this happens because these arguments are handled in parseCallArgs (when parsing FUNC_CODE_INST_CALL), outside of the PartialMetadata parsing loop.

What both of these issues have in common is that both of these forms of metadata are parsed outside of Data.LLVM.BitCode.IR.Metadata, where PartialMetadata is handled. This is important because PartialMetadata contains a MetadataTable, and the MetadataTable data type is largely responsible for the bookkeeping required to de-duplicate metadata references. Because the two forms of metadata above are parsed in a place without access to a MetadataTable, they do not have a convenient way to determine if there is already a reference that points to them.

It's not yet clear to me what the right way to fix this is. I think we will likely need to move some of the MetadataTable-related bookkeeping out of Data.LLVM.BitCode.IR.Metadata and into another place so that they are accessible when parsing things in Data.LLVM.BitCode.IR.Function. I tried looking at LLVM's source code for inspiration, but they appear to be using a rather different approach to metadata deduplication than what llvm-pretty-bc-parser is using (see the getMetadataSlot() function here).

RyanGlScott commented 10 months ago

@peterohanley found another occurrence of this issue in https://github.com/GaloisInc/llvm-pretty-bc-parser/pull/265#discussion_r1443609413:

struct h {
  bool a;
};

void g(struct h *s) {
}

void f() {
  struct h s;
}

When compiling directly with clang++ -emit-llvm -g -S, the .ll output is:

```ll ; ModuleID = 'p2.cpp' source_filename = "p2.cpp" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" %struct.h = type { i8 } ; Function Attrs: mustprogress noinline nounwind optnone uwtable define dso_local void @_Z1gP1h(%struct.h* noundef %0) #0 !dbg !10 { %2 = alloca %struct.h*, align 8 store %struct.h* %0, %struct.h** %2, align 8 call void @llvm.dbg.declare(metadata %struct.h** %2, metadata !19, metadata !DIExpression()), !dbg !20 ret void, !dbg !21 } ; Function Attrs: nofree nosync nounwind readnone speculatable willreturn declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 ; Function Attrs: mustprogress noinline nounwind optnone uwtable define dso_local void @_Z1fv() #0 !dbg !22 { %1 = alloca %struct.h, align 1 call void @llvm.dbg.declare(metadata %struct.h* %1, metadata !25, metadata !DIExpression()), !dbg !26 ret void, !dbg !27 } attributes #0 = { mustprogress noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } attributes #1 = { nofree nosync nounwind readnone speculatable willreturn } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} !llvm.ident = !{!9} !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "Ubuntu clang version 14.0.0-1ubuntu1.1", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "p2.cpp", directory: "/home/ryanscott/Documents/Hacking/Haskell/llvm-pretty-bc-parser", checksumkind: CSK_MD5, checksum: "3a886ebcac895e3acf83864f97430f6d") !2 = !{i32 7, !"Dwarf Version", i32 5} !3 = !{i32 2, !"Debug Info Version", i32 3} !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"PIC Level", i32 2} !6 = !{i32 7, !"PIE Level", i32 2} !7 = !{i32 7, !"uwtable", i32 1} !8 = !{i32 7, !"frame-pointer", i32 2} !9 = !{!"Ubuntu clang version 14.0.0-1ubuntu1.1"} !10 = distinct !DISubprogram(name: "g", linkageName: "_Z1gP1h", scope: !1, file: !1, line: 5, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !18) !11 = !DISubroutineType(types: !12) !12 = !{null, !13} !13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64) !14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "h", file: !1, line: 1, size: 8, flags: DIFlagTypePassByValue, elements: !15, identifier: "_ZTS1h") !15 = !{!16} !16 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !14, file: !1, line: 2, baseType: !17, size: 8) !17 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean) !18 = !{} !19 = !DILocalVariable(name: "s", arg: 1, scope: !10, file: !1, line: 5, type: !13) !20 = !DILocation(line: 5, column: 18, scope: !10) !21 = !DILocation(line: 6, column: 1, scope: !10) !22 = distinct !DISubprogram(name: "f", linkageName: "_Z1fv", scope: !1, file: !1, line: 8, type: !23, scopeLine: 8, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !18) !23 = !DISubroutineType(types: !24) !24 = !{null} !25 = !DILocalVariable(name: "s", scope: !22, file: !1, line: 9, type: !14) !26 = !DILocation(line: 9, column: 12, scope: !22) !27 = !DILocation(line: 10, column: 1, scope: !22) ```

After round-tripping through llvm-pretty-bc-parser, the .ll output is:

```ll source_filename = "p2.cpp" target triple = "x86_64-pc-linux-gnu-" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" %struct.h = type { i8 } declare default void @llvm.dbg.declare(metadata, metadata, metadata) define default void @_Z1gP1h(%struct.h* %0) !dbg !15 { ;

In the former .ll file, we have:

define dso_local void @_Z1fv() #0 !dbg !22 {
  %1 = alloca %struct.h, align 1
  call void @llvm.dbg.declare(metadata %struct.h* %1, metadata !25, metadata !DIExpression()), !dbg !26
  ret void, !dbg !27
}

!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "h", file: !1, line: 1, size: 8, flags: DIFlagTypePassByValue, elements: !15, identifier: "_ZTS1h")
!25 = !DILocalVariable(name: "s", scope: !22, file: !1, line: 9, type: !14)

Note how !25, the metadata node for the local s variable, is referenced from _Z1fv's call to llvm.dbg.declare. Moreover, !25's type is !1, a distinct node for the struct h type.

In the latter .ll file, we instead have:

define default void @_Z1fv() !dbg !20 {
; <label>: 0
  %1 = alloca %struct.h, align 1
  call void @llvm.dbg.declare(metadata %struct.h* %1,
                              metadata !DILocalVariable(scope: !20, name: "s",
                                                        file: !DIFile(filename: "p2.cpp",
                                                                      directory: "/home/ryanscott/Documents/Hacking/Haskell/llvm-pretty-bc-parser"),
                                                        line: 9,
                                                        type: !DICompositeType(tag: 19, name: "h",
                                                                               file: !2, line: 1,
                                                                               size: 8, align: 0,
                                                                               offset: 0,
                                                                               flags: 4194304,
                                                                               elements: !13,
                                                                               runtimeLang: 0,
                                                                               identifier: "_ZTS1h"),
                                                        arg: 0, flags: 0, align: 0),
                              metadata !DIExpression()), !dbg !DILocation(line: 9, column: 12,
                                                                          scope: !20)
  ret void, !dbg !DILocation(line: 10, column: 1, scope: !20)
}

!1 =
distinct !DICompositeType(tag: 19, name: "h", file: !2, line: 1,
                          size: 8, align: 0, offset: 0, flags: 4194304, elements: !13,
                          runtimeLang: 0, identifier: "_ZTS1h")
!23 =
!DILocalVariable(scope: !20, name: "s", file: !2, line: 9,
                 type: !1, arg: 0, flags: 0, align: 0)

This time, !23 (the metadata node for the local s variable) is not referenced from _Z1fv's call to llvm.dbg.declare. Instead, the contents of !23 and !1 (the metadata node for the struct h type) are inlined into the llvm.dbg.declare call, similar to the original example in this issue. This creates a problem because !1 is marked as distinct, but the inlined version of !1 in the llvm.dbg.declare call is not distinct. llvm-pretty-bc-parser's test suite uses a diff that is sensitive to this difference, and as a result, the test suite fails on this example.

This p2.cpp example is perhaps not as severe as the test case in #258, the latter of which llvm-as outright rejects. llvm-as will still accept the .ll output of p2.cpp even with all of the questionable inlining—it's just llvm-pretty-bc-parser's test suite that is sensitive to the difference. Still, this suggests that we shouldn't be doing the questionable inlining in the first place, and this p2.cpp failure is a symptom of that.