intel / intel-xpu-backend-for-triton

OpenAI Triton backend for Intel® GPUs
MIT License
143 stars 44 forks source link

[Performance] Inefficient code generated because of the limitation of the vectorization in IGC's scalar backend. #1254

Closed chengjunlu closed 3 months ago

chengjunlu commented 5 months ago

The Triton kernel mainly relies on IGC to vectorize the data, converting the SIMT kernel into a SIMD kernel that can be executed on Intel GPUs. (Compare this to control-flow vectorization.)

IGC uses a naive SoA (structure-of-arrays) layout to vectorize scalar and composite data types. (A hybrid SoA/AoS layout would have advantages.) The rough idea is demonstrated with C++:

// SIMT vector<2xfp16>
struct vector_2xfp16 {
  fp16 val_0;
  fp16 val_1;
};

// SIMD type after vectorized
struct vector_2xfp16 {
  fp16 val_0[sub_group_size];
  fp16 val_1[sub_group_size];
};

It is natural to vectorize a general binary operation by simply expanding it to a SIMD instruction.

# SIMT
add val_0, val_1
# SIMD
add (sub_group_size|M0) val_0, val_1

But it causes extra overhead in some other ops like the bitcast, extract/insert.

%85 = bitcast <2 x i16> %84 to i32, !dbg !449

Because the layout of the SIMT type <2xi16> and i32 are different after the vectorization in SIMD.

// SIMD type for <2xi16>
struct vector_2xi16 {
  i16 val_0[sub_group_size];
  i16 val_1[sub_group_size];
};

// SIMD type for i32
struct simd_i32 {
  i32 val_0[sub_group_size];
};

It requires an extra register value shuffle in SIMD, while in SIMT it is effectively a no-op that just changes the data type.

image

This really impacts performance, especially in cases such as memory-coalesced load/store and DPAS operand compose/decompose.

E.G disassemble for store:

# SIMT kernel for memory collapsed store.
%83 = insertelement <2 x i16> undef, i16 %bf_cvt23, i32 0, !dbg !449
  %84 = insertelement <2 x i16> %83, i16 %bf_cvt24, i32 1, !dbg !449
  %85 = bitcast <2 x i16> %84 to i32, !dbg !449
  %86 = insertelement <2 x i16> undef, i16 %bf_cvt25, i32 0, !dbg !449
  %87 = insertelement <2 x i16> %86, i16 %bf_cvt26, i32 1, !dbg !449
  %88 = bitcast <2 x i16> %87 to i32, !dbg !449
  %89 = insertelement <2 x i16> undef, i16 %bf_cvt27, i32 0, !dbg !449
  %90 = insertelement <2 x i16> %89, i16 %bf_cvt28, i32 1, !dbg !449
  %91 = bitcast <2 x i16> %90 to i32, !dbg !449
  %92 = insertelement <2 x i16> undef, i16 %bf_cvt29, i32 0, !dbg !449
  %93 = insertelement <2 x i16> %92, i16 %bf_cvt30, i32 1, !dbg !449
  %94 = bitcast <2 x i16> %93 to i32, !dbg !449
  %95 = insertelement <4 x i32> undef, i32 %85, i32 0, !dbg !449
  %96 = insertelement <4 x i32> %95, i32 %88, i32 1, !dbg !449
  %97 = insertelement <4 x i32> %96, i32 %91, i32 2, !dbg !449
  %98 = insertelement <4 x i32> %97, i32 %94, i32 3, !dbg !449
  %99 = mul i32 %14, %3, !dbg !450
  %100 = sext i32 %99 to i64, !dbg !451
  %101 = getelementptr i16, i16 addrspace(1)* %0, i64 %100, !dbg !451
  %102 = sext i32 %22 to i64, !dbg !452
  %103 = getelementptr i16, i16 addrspace(1)* %101, i64 %102, !dbg !452
  %104 = bitcast i16 addrspace(1)* %103 to <4 x i32> addrspace(1)*, !dbg !449
  store <4 x i32> %98, <4 x i32> addrspace(1)* %104, align 16, !dbg !449
  br label %105, !dbg !449

The VISA

    mov (M1, 32) bf_cvt23_0(0,0)<1> V0112(0,0)<1;1,0>                            /// $200
    mov (M1, 32) bf_cvt24_0(0,0)<1> V0120(0,0)<1;1,0>                            /// $201
    mov (M1, 32) bf_cvt25_0(0,0)<1> V0128(0,0)<1;1,0>                            /// $202
    mov (M1, 32) bf_cvt26_0(0,0)<1> V0136(0,0)<1;1,0>                            /// $203
    mov (M1, 32) bf_cvt27_0(0,0)<1> V0144(0,0)<1;1,0>                            /// $204
    mov (M1, 32) bf_cvt28_0(0,0)<1> V0152(0,0)<1;1,0>                            /// $205
    mov (M1, 32) bf_cvt29_0(0,0)<1> V0160(0,0)<1;1,0>                            /// $206
    mov (M1, 32) bf_cvt30_0(0,0)<1> V0168(0,0)<1;1,0>                            /// $207
    // The overhead in register value shuffle because of layout are not aligned.
    mov (M1, 32) V0169(0,0)<1> bf_cvt23(0,0)<1;1,0>                              /// $208
    mov (M1, 32) V0169(1,0)<1> bf_cvt24(0,0)<1;1,0>                              /// $209
    mov (M1, 32) V0171(0,0)<2> V0169(0,0)<1;1,0>                                 /// $210
    mov (M1, 32) V0171(0,1)<2> V0169(1,0)<1;1,0>                                 /// $211
    mov (M1, 32) V0172(0,0)<1> bf_cvt25(0,0)<1;1,0>                              /// $212
    mov (M1, 32) V0172(1,0)<1> bf_cvt26(0,0)<1;1,0>                              /// $213
    mov (M1, 32) V0174(0,0)<2> V0172(0,0)<1;1,0>                                 /// $214
    mov (M1, 32) V0174(0,1)<2> V0172(1,0)<1;1,0>                                 /// $215
    mov (M1, 32) V0175(0,0)<1> bf_cvt27(0,0)<1;1,0>                              /// $216
    mov (M1, 32) V0175(1,0)<1> bf_cvt28(0,0)<1;1,0>                              /// $217
    mov (M1, 32) V0177(0,0)<2> V0175(0,0)<1;1,0>                                 /// $218
    mov (M1, 32) V0177(0,1)<2> V0175(1,0)<1;1,0>                                 /// $219
    mov (M1, 32) V0178(0,0)<1> bf_cvt29(0,0)<1;1,0>                              /// $220
    mov (M1, 32) V0178(1,0)<1> bf_cvt30(0,0)<1;1,0>                              /// $221
    mov (M1, 32) V0180(0,0)<2> V0178(0,0)<1;1,0>                                 /// $222
    mov (M1, 32) V0180(0,1)<2> V0178(1,0)<1;1,0>                                 /// $223
    // The overhead end.
    mov (M1, 32) V0181(0,0)<1> V0170(0,0)<1;1,0>                                 /// $224
    mov (M1, 32) V0181(2,0)<1> V0173(0,0)<1;1,0>                                 /// $225
    mov (M1, 32) V0181(4,0)<1> V0176(0,0)<1;1,0>                                 /// $226
    mov (M1, 32) V0181(6,0)<1> V0179(0,0)<1;1,0>                                 /// $227
chengjunlu commented 3 months ago

There is no performance improvement from removing the bitcast in the vectorized gather/scatter load/store.

IGC will shuffle the values to pack them into i32 for memory access anyway. It has the same effect as packing them explicitly.

# Use the non-packed type for vectorized store.
store <8 x half> %64, <8 x half> addrspace(1)* %66, align 16, !dbg !339
# The VISA shows the shuffle in packing the value into V0101 and V0099.
   .decl V0099 v_type=G type=d num_elts=64 align=hword
   .decl V0100 v_type=G type=d num_elts=64 align=hword
   .decl V0101 v_type=G type=hf num_elts=128 align=hword alias=<V0099, 0>
   .decl V0102 v_type=G type=hf num_elts=128 align=hword alias=<V0100, 0>
...
    mov (M1, 16) V0101(0,0)<2> V0093(0,0)<1;1,0>                                 /// $98
    mov (M1, 16) V0101(0,1)<2> V0093(1,0)<1;1,0>                                 /// $99
    mov (M1, 16) V0101(2,0)<2> V0093(2,0)<1;1,0>                                 /// $100
    mov (M1, 16) V0101(2,1)<2> V0093(3,0)<1;1,0>                                 /// $101
    mov (M1, 16) V0101(4,0)<2> V0093(4,0)<1;1,0>                                 /// $102
    mov (M1, 16) V0101(4,1)<2> V0093(5,0)<1;1,0>                                 /// $103
    mov (M1, 16) V0101(6,0)<2> V0093(6,0)<1;1,0>                                 /// $104
    mov (M1, 16) V0101(6,1)<2> V0093(7,0)<1;1,0>                                 /// $105
    mov (M5, 16) V0102(0,0)<2> V0094(0,0)<1;1,0>                                 /// $106
    mov (M5, 16) V0102(0,1)<2> V0094(1,0)<1;1,0>                                 /// $107
    mov (M5, 16) V0102(2,0)<2> V0094(2,0)<1;1,0>                                 /// $108
    mov (M5, 16) V0102(2,1)<2> V0094(3,0)<1;1,0>                                 /// $109
    mov (M5, 16) V0102(4,0)<2> V0094(4,0)<1;1,0>                                 /// $110
    mov (M5, 16) V0102(4,1)<2> V0094(5,0)<1;1,0>                                 /// $111
    mov (M5, 16) V0102(6,0)<2> V0094(6,0)<1;1,0>                                 /// $112
    mov (M5, 16) V0102(6,1)<2> V0094(7,0)<1;1,0>                                 /// $113
    lsc_store.ugm.wb.wb (M1, 16)  bti(0x2)[V0097]:a32  V0099:d32x4               /// $114
    lsc_store.ugm.wb.wb (M5, 16)  bti(0x2)[V0098]:a32  V0100:d32x4               /// $115
chengjunlu commented 3 months ago

There is no difference in 01-vector-add with or without the explicit bitcast ops.

Closing this issue as nothing needs to be changed.