llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org
Other
28.53k stars 11.79k forks source link

[X86] [AMX] Misoptimized copy of a zero tile #112763

Open ienkovich opened 3 hours ago

ienkovich commented 3 hours ago

When a zero AMX tile has multiple uses, it is copied through the stack with a tilestored + tileloadd pair (a spill followed by a reload) instead of being re-created with another tilezero instruction. Example:

; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Reproducer: one zero tile (%zero_tile) with four uses. It is the first
; x86_amx operand of both tdpbf16ps calls in .comp and the .entry incoming
; value of both phis in .exit.
;   %0 - base pointer for the tile loads %lhs_1 (offset 0) and %lhs_2 (offset 1024)
;   %1 - pointer for the tile load %rhs_1, which feeds both tdpbf16ps calls
;   %2 - base pointer for the two tile stores (offsets 0 and 1024)
;   %3 - branch condition: true runs .comp, false jumps straight to .exit
define void @amx_phi(ptr %0, ptr %1, ptr %2, i1 %3) {
.entry:
  ; The only tilezero call in the function; every zero-tile use below refers to this value.
  %zero_tile = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64)
  br i1 %3, label %.comp, label %.exit

.comp:
  ; Two tiles are loaded from %0 (1024 bytes apart) and one from %1.
  %lhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %0, i64 128)
  %ptr_1 = getelementptr i8, ptr %0, i64 1024
  %lhs_2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %ptr_1, i64 128)
  %rhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %1, i64 128)
  ; Both calls take the same %zero_tile as their first tile operand, so the
  ; zero value must still be available after the first call.
  %acc_1 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_1, x86_amx %rhs_1)
  %acc_2 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_2, x86_amx %rhs_1)
  br label %.exit

.exit:
  ; When control arrives from .entry, both results are the same %zero_tile value;
  ; when it arrives from .comp, they are the two tdpbf16ps results.
  %res_1 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_1, %.comp ]
  %res_2 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_2, %.comp ]
  ; The two results are stored to %2 at offsets 0 and 1024.
  %ptr_out_1 = getelementptr i8, ptr %2, i64 0
  tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_1, i64 128, x86_amx %res_1)
  %ptr_out_2 = getelementptr i8, ptr %2, i64 1024
  tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_2, i64 128, x86_amx %res_2)

  ret void
}

; Function Attrs: nounwind
declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #0
; Function Attrs: nounwind
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #0
; Function Attrs: nounwind
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #0
; Function Attrs: nounwind
declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #0

attributes #0 = { nounwind }

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}

Generated asm using llc test.ll -mcpu=sapphirerapids -O3:

        tilezero        %tmm0
...
        tilestored      %tmm0, 896(%rsp,%rbp)   # 1024-byte Folded Spill
        tileloadd       896(%rsp), %tmm1        # 1024-byte Folded Reload
...
        tilestored      %tmm0, 1920(%rsp,%rbp)  # 1024-byte Folded Spill
        tileloadd       1920(%rsp), %tmm1       # 1024-byte Folded Reload

In both cases tilezero %tmm1 could be used instead of the spill/reload pair, because %tmm0 is known to be defined by a tilezero instruction.

llvmbot commented 2 hours ago

@llvm/issue-subscribers-backend-x86

Author: Ilya Enkovich (ienkovich)

When a zero AMX tile has multiple uses, it's copied using a `tileload` + `tilestore` pair instead of another `tilezero` instruction. Example: ``` ; ModuleID = 'LLVMDialectModule' source_filename = "LLVMDialectModule" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define void @amx_phi(ptr %0, ptr %1, ptr %2, i1 %3) { .entry: %zero_tile = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64) br i1 %3, label %.comp, label %.exit .comp: %lhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %0, i64 128) %ptr_1 = getelementptr i8, ptr %0, i64 1024 %lhs_2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %ptr_1, i64 128) %rhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %1, i64 128) %acc_1 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_1, x86_amx %rhs_1) %acc_2 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_2, x86_amx %rhs_1) br label %.exit .exit: %res_1 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_1, %.comp ] %res_2 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_2, %.comp ] %ptr_out_1 = getelementptr i8, ptr %2, i64 0 tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_1, i64 128, x86_amx %res_1) %ptr_out_2 = getelementptr i8, ptr %2, i64 1024 tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_2, i64 128, x86_amx %res_2) ret void } ; Function Attrs: nounwind declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #0 ; Function Attrs: nounwind declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #0 ; Function Attrs: nounwind declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #0 ; Function Attrs: nounwind declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #0 
attributes #0 = { nounwind } !llvm.module.flags = !{!0} !0 = !{i32 2, !"Debug Info Version", i32 3} ``` Generated asm using `llc test.ll -mcpu=sapphirerapids -O3`: ``` tilezero %tmm0 ... tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill tileloadd 896(%rsp), %tmm1 # 1024-byte Folded Reload ... tilestored %tmm0, 1920(%rsp,%rbp) # 1024-byte Folded Spill tileloadd 1920(%rsp), %tmm1 # 1024-byte Folded Reload ``` In both cases `tilezero %tmm1` could be used instead because `%tmm0` is known to be defined by a `tilezero` instruction.