Open lethean1 opened 2 years ago
Can you please add the commands you use and the generated IR? BTW if you are trying to replicate our results you can also try out the docker image: https://github.com/wsmoses/Polygeist-Script/
I used the hand-made heat-3d mlir, and I wanted to use the polymer to optimize it. the mlir file is as follow:
// Map for the exclusive upper loop bound: s0 - 1, so iteration runs over the
// interior points 1 .. s0-2 of the s0^3 grid (affine.for bounds are exclusive).
#map = affine_map<()[s0] -> (s0 - 1)>
module {
// PolyBench heat-3d: 1000 time steps; each step does a Jacobi-style sweep
// writing B (%arg1) from A (%arg0), then A from B (double buffering).
// %arg6 is the runtime grid extent N (here always 200, matching the memref type).
// NOTE(review): `func private@heat_3d` has no space before `@heat_3d`; most MLIR
// versions print/expect `func private @heat_3d` — confirm this parses as-is.
func private@heat_3d(%arg0: memref<200x200x200xf64>, %arg1: memref<200x200x200xf64>, %arg6:i32) attributes {llvm.emit_c_interface} {
%0 = arith.index_cast %arg6 : i32 to index
// Outer time loop (TSTEPS = 1000).
affine.for %arg5 = 0 to 1000{
// Sweep 1: B[i][j][k] = 0.125*(stencil over A in i) + 0.125*(in j) + 0.125*(in k) + A[i][j][k].
// Constants are duplicated per use (cst..cst_4) — presumably to keep each
// scop statement self-contained for --extract-scop-stmt; verify if intended.
affine.for %arg2 = 1 to #map()[%0] {
affine.for %arg3 = 1 to #map()[%0] {
affine.for %arg4 = 1 to #map()[%0] {
%cst = arith.constant 1.250000e-01 : f64
%1 = affine.load %arg0[%arg2 + 1, %arg3, %arg4] : memref<200x200x200xf64>
%cst_0 = arith.constant 2.000000e+00 : f64
%2 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
%3 = arith.mulf %cst_0, %2 : f64
%4 = arith.subf %1, %3 : f64
%5 = affine.load %arg0[%arg2 - 1, %arg3, %arg4] : memref<200x200x200xf64>
%6 = arith.addf %4, %5 : f64
%7 = arith.mulf %cst, %6 : f64
%cst_1 = arith.constant 1.250000e-01 : f64
%8 = affine.load %arg0[%arg2, %arg3 + 1, %arg4] : memref<200x200x200xf64>
%cst_2 = arith.constant 2.000000e+00 : f64
%9 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
%10 = arith.mulf %cst_2, %9 : f64
%11 = arith.subf %8, %10 : f64
%12 = affine.load %arg0[%arg2, %arg3 - 1, %arg4] : memref<200x200x200xf64>
%13 = arith.addf %11, %12 : f64
%14 = arith.mulf %cst_1, %13 : f64
%15 = arith.addf %7, %14 : f64
%cst_3 = arith.constant 1.250000e-01 : f64
%16 = affine.load %arg0[%arg2, %arg3, %arg4 + 1] : memref<200x200x200xf64>
%cst_4 = arith.constant 2.000000e+00 : f64
%17 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
%18 = arith.mulf %cst_4, %17 : f64
%19 = arith.subf %16, %18 : f64
%20 = affine.load %arg0[%arg2, %arg3, %arg4 - 1] : memref<200x200x200xf64>
%21 = arith.addf %19, %20 : f64
%22 = arith.mulf %cst_3, %21 : f64
%23 = arith.addf %15, %22 : f64
%24 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
%25 = arith.addf %23, %24 : f64
affine.store %25, %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
}
}
}
// Sweep 2: identical stencil with the roles of A and B swapped (A updated from B).
affine.for %arg2 = 1 to #map()[%0] {
affine.for %arg3 = 1 to #map()[%0] {
affine.for %arg4 = 1 to #map()[%0] {
%cst = arith.constant 1.250000e-01 : f64
%1 = affine.load %arg1[%arg2 + 1, %arg3, %arg4] : memref<200x200x200xf64>
%cst_0 = arith.constant 2.000000e+00 : f64
%2 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
%3 = arith.mulf %cst_0, %2 : f64
%4 = arith.subf %1, %3 : f64
%5 = affine.load %arg1[%arg2 - 1, %arg3, %arg4] : memref<200x200x200xf64>
%6 = arith.addf %4, %5 : f64
%7 = arith.mulf %cst, %6 : f64
%cst_1 = arith.constant 1.250000e-01 : f64
%8 = affine.load %arg1[%arg2, %arg3 + 1, %arg4] : memref<200x200x200xf64>
%cst_2 = arith.constant 2.000000e+00 : f64
%9 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
%10 = arith.mulf %cst_2, %9 : f64
%11 = arith.subf %8, %10 : f64
%12 = affine.load %arg1[%arg2, %arg3 - 1, %arg4] : memref<200x200x200xf64>
%13 = arith.addf %11, %12 : f64
%14 = arith.mulf %cst_1, %13 : f64
%15 = arith.addf %7, %14 : f64
%cst_3 = arith.constant 1.250000e-01 : f64
%16 = affine.load %arg1[%arg2, %arg3, %arg4 + 1] : memref<200x200x200xf64>
%cst_4 = arith.constant 2.000000e+00 : f64
%17 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
%18 = arith.mulf %cst_4, %17 : f64
%19 = arith.subf %16, %18 : f64
%20 = affine.load %arg1[%arg2, %arg3, %arg4 - 1] : memref<200x200x200xf64>
%21 = arith.addf %19, %20 : f64
%22 = arith.mulf %cst_3, %21 : f64
%23 = arith.addf %15, %22 : f64
%24 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
%25 = arith.addf %23, %24 : f64
affine.store %25, %arg0[%arg2, %arg3, %arg4] : memref<200x200x200xf64>
}
}
}
}
return
}
// Public entry point called from C via the _mlir_ciface_ wrapper
// (llvm.emit_c_interface); fixes the extent argument to 200.
func @heat_3d_iteration(%arg0: memref<200x200x200xf64>, %arg1: memref<200x200x200xf64>)attributes {llvm.emit_c_interface}{
%cst_200 = arith.constant 200 : i32
call @heat_3d(%arg0, %arg1, %cst_200) : (memref<200x200x200xf64>, memref<200x200x200xf64>, i32) -> ()
return
}
}
If I change this MLIR file to fit Polygeist-Script's LLVM version (e.g. changing arith.addf back to addf) and run the instructions above, I get a good optimization result. But I want to reproduce your Polygeist-Script optimization results with a newer version of Polymer, to fit my project.
The whole compilation process is as follow:
./bin/polymer-opt --demote-loop-reduction --extract-scop-stmt --pluto-opt='parallelize=1' --inline --canonicalize in.mlir 2>/dev/null > out.mlir
mlir-opt -affine-parallelize -lower-affine -convert-scf-to-openmp -convert-scf-to-std -convert-openmp-to-llvm out.mlir | mlir-translate -mlir-to-llvmir > out.ll
clang main.c -O3 out.ll -o out.exe -lm -fopenmp
numactl --physcpubind=1-8 ./out.exe
and main.c :
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/time.h>
/*
 * C-side mirror of the MLIR C-interface descriptor for a rank-3 memref of f64
 * (as generated by llvm.emit_c_interface). The layout is positional ABI:
 * allocated ptr, aligned ptr, offset, sizes[3], strides[3]. Do not reorder,
 * rename, or retype the fields.
 */
struct ThreeDMemrefF64 {
double *ptrToData;        /* allocated (base) pointer */
double *alignedPtrToData; /* aligned pointer actually indexed */
long offset;              /* element offset from the aligned pointer */
long shape[3];            /* extents per dimension */
long stride[3];           /* row-major strides, in elements */
};
/* Problem size: a 200^3 grid, matching the memref<200x200x200xf64> type in the MLIR. */
#define M 200
#define N 200
#define P 200
/* Start/stop timestamps for the tic()/tok() wall-clock stopwatch below. */
struct timeval begin, end;
/* Start the stopwatch: record the current wall-clock time in `begin`. */
void tic()
{
    (void)gettimeofday(&begin, NULL);
}
/*
 * Stop the stopwatch: capture `end` and return the elapsed wall-clock time
 * since the last tic(), in milliseconds.
 */
double tok()
{
    gettimeofday(&end, NULL);
    double whole_ms = (end.tv_sec - begin.tv_sec) * 1e3;
    double frac_ms = (end.tv_usec - begin.tv_usec) * 1e-3;
    return whole_ms + frac_ms;
}
extern void _mlir_ciface_heat_3d_iteration(struct ThreeDMemrefF64 *,
struct ThreeDMemrefF64 *);
/*
 * Driver: allocate two 200^3 double grids, initialize A PolyBench-style
 * (B stays all-zero, which calloc already guarantees), wrap both in MLIR
 * C-interface memref descriptors, time one call into the compiled kernel,
 * and print the elapsed milliseconds.
 *
 * Fixes vs. the original: calloc results are checked (CERT MEM32-C), the
 * magic element count 8000000 is derived from M*N*P, the write-only
 * `sumtime` accumulator is removed, and A/B are freed before exit.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    /* calloc zero-fills (IEEE-754 all-zero bytes == +0.0) and checks the
     * count*size multiplication for overflow. */
    double (*A)[N][P] = calloc((size_t)M * N * P, sizeof(double));
    double (*B)[N][P] = calloc((size_t)M * N * P, sizeof(double));
    if (!A || !B) {
        fprintf(stderr, "failed to allocate %dx%dx%d doubles\n", M, N, P);
        free(A); /* free(NULL) is a no-op */
        free(B);
        return EXIT_FAILURE;
    }

    /* PolyBench heat-3d initialization of the input grid. */
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            for (int k = 0; k < P; k++) {
                A[i][j][k] = ((double)i + j + k) / (i + j + k + 1);
            }
        }
    }

    /* Memref descriptors: base == aligned pointer, offset 0, row-major strides. */
    struct ThreeDMemrefF64 A_mem = {&A[0][0][0], &A[0][0][0], 0, {M, N, P}, {N*P, P, 1}};
    struct ThreeDMemrefF64 B_mem = {&B[0][0][0], &B[0][0][0], 0, {M, N, P}, {N*P, P, 1}};

    tic();
    _mlir_ciface_heat_3d_iteration(&A_mem, &B_mem);
    double elapsedTime = tok();
    printf("Time: %lf (ms)\n", elapsedTime);

    free(A);
    free(B);
    return 0;
}
Can you please check to get the same schedule/IR from Polymer using the provided docker and your newer version? If the schedules are different, that could be an explanation. Also, which version of mlir-opt
are you using? Do you also have the slow-down if you run the same Polymer version available in the docker?
I got a different schedule from the provided docker than from my newer version (updated Polymer). This is the point of my confusion. I used the same instructions and almost the same input (I only modified some IR expressions to fit LLVM, like addf -> arith.addf, since some std ops moved to the arith dialect). I also suspect this may be why I see a slowdown (assuming I am not using the wrong instructions), but the result is even slower than not optimizing at all. I think that is abnormal.
or can you update your Polygeist-Script project?
I got a different schedule from the provided docker than from my newer version (updated Polymer). This is the point of my confusion. I used the same instructions and almost the same input (I only modified some IR expressions to fit LLVM, like addf -> arith.addf, since some std ops moved to the arith dialect). I also suspect this may be why I see a slowdown (assuming I am not using the wrong instructions), but the result is even slower than not optimizing at all. I think that is abnormal.
Polymer depends on Polygeist and MLIR. These tools change daily, so having the same inputs and command-line options is sometimes not sufficient to obtain the same performance. We probably have some performance regression from updating Polymer to newer Polygeist and MLIR versions.
I saw the instruction in the Polygeist-Script project (https://github.com/wsmoses/Polygeist-Script/):
and I used almost the same instructions to run the MLIR file, but with an updated Polymer (e87c27c36b3d346612e505a1b5d7939e6b6aeb41, updated on 2022-01-03).
Then I got quite different polyhedral optimization results, and it ran nearly 8 times slower. Is there any mistake here?