```
cd benchmarks/DeepLearning/Ops/GEMM
make gemm-affine  # builds an IJK loop order case
# cd into your build path and run
cmake --build .. --target clean && cmake --build .. && ./gemm-benchmark
```
Then you'll get some output like this:
```
2022-06-29T14:57:30+08:00
Running ./gemm-benchmark
Run on (6 X 3901.13 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x6)
  L1 Instruction 32 KiB (x6)
  L2 Unified 256 KiB (x6)
  L3 Unified 9216 KiB (x1)
Load Average: 1.58, 0.86, 0.79
***WARNING*** Library was built as DEBUG. Timings may be affected.
-----------------------------------------------------------------------
Benchmark           Time             CPU   Iterations UserCounters...
-----------------------------------------------------------------------
BM_GEMM/50     242182 ns       242091 ns         2874 GFLOPS=1.03681
BM_GEMM/100   2113227 ns      2112343 ns          331 GFLOPS=0.949723
BM_GEMM/150   7136396 ns      7133468 ns           98 GFLOPS=0.949057
BM_GEMM/200  16976603 ns     16969488 ns           41 GFLOPS=0.932451
... skipped ...
```
If you want to add tiling:

```
make gemm-affine && buddy-opt opt_gemm.mlir --affine-loop-tile=tile-sizes=96,96,96 -o opt_gemm.mlir
```
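The transformation that `--affine-loop-tile` applies to the loop nest can be pictured in plain C. This is only a hand-written sketch of 96×96×96 tiling of a row-major GEMM, not the code the pass actually generates:

```c
#include <stddef.h>

#define T 96 /* tile size, matching tile-sizes=96,96,96 */

/* Tiled GEMM sketch: C += A * B for n x n row-major matrices.
 * The three outer loops walk tiles; the inner loops stay within one
 * tile so the working set is more likely to fit in cache. */
void gemm_tiled(size_t n, const float *A, const float *B, float *C) {
    for (size_t ii = 0; ii < n; ii += T)
        for (size_t jj = 0; jj < n; jj += T)
            for (size_t kk = 0; kk < n; kk += T)
                for (size_t i = ii; i < ii + T && i < n; ++i)
                    for (size_t j = jj; j < jj + T && j < n; ++j) {
                        float acc = C[i * n + j];
                        for (size_t k = kk; k < kk + T && k < n; ++k)
                            acc += A[i * n + k] * B[k * n + j];
                        C[i * n + j] = acc;
                    }
}
```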
How-To

In order to run a benchmark, follow the build and run steps above. The tiling command writes its result back to opt_gemm.mlir.

To change the loop order into, e.g., JPI or PIJ, use the --loop-order-change option of buddy-opt. For more information about --loop-order-change, see here.

Experiments Result
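To make the loop-order names concrete: the three GEMM loops can be nested in any order without changing the result, only the memory access pattern. The following C sketch (my own illustration, assuming p names the reduction dimension that IJK calls k) contrasts the IJK baseline with the JPI order discussed below:

```c
#include <stddef.h>

/* IJK order: the reduction loop (k) is innermost; each inner-loop
 * iteration reads A row-wise and B column-wise. */
void gemm_ijk(size_t n, const float *A, const float *B, float *C) {
    for (size_t i = 0; i < n; ++i)
        for (size_t j = 0; j < n; ++j)
            for (size_t k = 0; k < n; ++k)
                C[i * n + j] += A[i * n + k] * B[k * n + j];
}

/* JPI order (p = the same reduction dimension): only the nesting
 * changes, so the result is identical, but each operand is now walked
 * with a different stride pattern, which is what moves performance. */
void gemm_jpi(size_t n, const float *A, const float *B, float *C) {
    for (size_t j = 0; j < n; ++j)
        for (size_t p = 0; p < n; ++p)
            for (size_t i = 0; i < n; ++i)
                C[i * n + j] += A[i * n + p] * B[p * n + j];
}
```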
After some experiments (i5-8400 @ 3.8 GHz, L1 32 KiB, L2 256 KiB, L3 9216 KiB), this chart shows how loop order and tiling strategy affect performance:

More clearly, here is the performance data for 6 different loop orders:

And if we add tiling to JPI, we get around a 9x improvement at larger data sizes:

For now, I think it's time to add explicit copy and packing for higher performance.
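"Explicit copy and packing" usually means copying each tile of an operand into a contiguous scratch buffer before the inner loops run, so those loops stream through memory with unit stride regardless of the original layout. A hypothetical sketch (the tile size and function name are my own, not from the benchmark):

```c
#include <stddef.h>
#include <string.h>

#define TB 4 /* illustrative tile size; a real kernel would use e.g. 96 */

/* Copy a TB x TB tile of row-major B (leading dimension n, tile origin
 * at row p0 / column j0) into a contiguous packed buffer. The inner
 * GEMM loops can then read `packed` sequentially. */
void pack_tile(size_t n, const float *B, size_t p0, size_t j0,
               float *packed) {
    for (size_t p = 0; p < TB; ++p)
        memcpy(&packed[p * TB], &B[(p0 + p) * n + j0],
               TB * sizeof(float));
}
```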
Experiments Data