Open ienkovich opened 1 month ago
When a zero AMX tile has multiple uses, it's copied using a tileload + tilestore pair instead of another tilezero instruction. Example:
tileload
tilestore
tilezero
; ModuleID = 'LLVMDialectModule' source_filename = "LLVMDialectModule" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define void @amx_phi(ptr %0, ptr %1, ptr %2, i1 %3) { .entry: %zero_tile = tail call x86_amx @llvm.x86.tilezero.internal(i16 16, i16 64) br i1 %3, label %.comp, label %.exit .comp: %lhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %0, i64 128) %ptr_1 = getelementptr i8, ptr %0, i64 1024 %lhs_2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %ptr_1, i64 128) %rhs_1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 16, i16 64, ptr %1, i64 128) %acc_1 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_1, x86_amx %rhs_1) %acc_2 = tail call x86_amx @llvm.x86.tdpbf16ps.internal(i16 16, i16 64, i16 64, x86_amx %zero_tile, x86_amx %lhs_2, x86_amx %rhs_1) br label %.exit .exit: %res_1 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_1, %.comp ] %res_2 = phi x86_amx [ %zero_tile, %.entry ], [ %acc_2, %.comp ] %ptr_out_1 = getelementptr i8, ptr %2, i64 0 tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_1, i64 128, x86_amx %res_1) %ptr_out_2 = getelementptr i8, ptr %2, i64 1024 tail call void @llvm.x86.tilestored64.internal(i16 16, i16 64, ptr %ptr_out_2, i64 128, x86_amx %res_2) ret void } ; Function Attrs: nounwind declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #0 ; Function Attrs: nounwind declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64) #0 ; Function Attrs: nounwind declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #0 ; Function Attrs: nounwind declare x86_amx @llvm.x86.tdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #0 attributes #0 = { nounwind } !llvm.module.flags = !{!0} !0 = !{i32 2, !"Debug Info Version", i32 3}
Generated asm using llc test.ll -mcpu=sapphirerapids -O3:
llc test.ll -mcpu=sapphirerapids -O3
tilezero %tmm0 ... tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill tileloadd 896(%rsp), %tmm1 # 1024-byte Folded Reload ... tilestored %tmm0, 1920(%rsp,%rbp) # 1024-byte Folded Spill tileloadd 1920(%rsp), %tmm1 # 1024-byte Folded Reload
In both cases, tilezero %tmm1 could be used instead, because %tmm0 is known to be defined by a tilezero instruction.
tilezero %tmm1
%tmm0
tilezero
@llvm/issue-subscribers-backend-x86
Author: Ilya Enkovich (ienkovich)
When a zero AMX tile has multiple uses, it's copied using a
tileload
+ tilestore
pair instead of another tilezero
instruction. Example: Generated asm using
llc test.ll -mcpu=sapphirerapids -O3
: In both cases
tilezero %tmm1
could be used instead because %tmm0
is known to be defined by a tilezero
instruction.