Implementing Polyhedral Compiler

hikettei commented 2 months ago

Reading

Workload

[x] Fix for scalar outputs
[x] Refactor: render-expr
[x] autograd
[x] simplifying the indexing elements
[x] (!!!) Symbolic Compilation (by adding some assertions to the undetermined shapes, e.g.: (apply #'> all_permutations([a, b, c])))
[x] Using symbolic compilation, we can autogenerate each backend for AVM just by implementing an ajit backend.
- [x] Graph Mode, Conv2DとかをLowererせず実行，Moduleを事前にCompileしておいてPyTorchっぽく使う (N C H W)のC, H, WもSymbolicにする必要があるのでIDIVが必要になる
[x] Generate the vectorized/parallelized code w/ ARM NEON
[x] Generate the optimized code w/ METAL
[x] gc-reachable isl objects
[x] ./roswell/caten.ros, from onnx to pure c compiler
[x] Custom Operator at :CUSTOM_OP class

hikettei commented 2 months ago

Padding in a single kernel :v:

(jit (caten (!cos (!sin (!padding (make-tensor `(10 10) :initial-element 2.0) `((2 2) (2 2)) :value 0.0)))) :debug 4)

#include <math.h>
void main700672(float* val_0, float* val_3, float* val_10, float val_8);
void main700672(float* val_0, float* val_3, float* val_10, float val_8) {
  val_8 = 1.5707964;
  val_10[(0)] = val_8;
  for(int c0=0;(c0<=13);c0+=1) {
    for(int c1=0;(c1<=13);c1+=1) {
      val_0[14*(c0)+(c1)] = 0.0;
      if ((c0==2)&&(c1==2)) {
      }
      if ((((c0>=2)&&(c0<=11))&&(c1>=2))&&(c1<=11)) {
        val_3[10*((c0-2))+((c1-2))] = 2.0;
        val_0[14*((c0-2)+2)+((c1-2)+2)] = val_3[10*((c0-2))+((c1-2))];
      }
      val_0[14*(c0)+(c1)] = sin(val_0[14*(c0)+(c1)]);
      val_0[14*(c0)+(c1)] = val_0[14*(c0)+(c1)]+val_10[0+0];
      val_0[14*(c0)+(c1)] = sin(val_0[14*(c0)+(c1)]);
    }
  }
}

As well as the composed GEMM:

CATEN-USER> (jit (caten (!matmul (make-tensor `(128 32)) (!matmul (make-tensor `(32 64)) (make-tensor `(64 128))))) :debug 1)
Compiled:

#include <math.h>
void main232208(float* val_29, float* val_21, float* val_13, float* val_5, float* val_9);
void main232208(float* val_29, float* val_21, float* val_13, float* val_5, float* val_9) {
  for(int c0=0;(c0<=31);c0+=1) {
    for(int c1=0;(c1<=127);c1+=1) {
      val_13[128*(c0)+(c1)+(0)] = 0.0;
      for(int c2=0;(c2<=63);c2+=1) {
        val_13[128*(c0)+(c1)+0] += val_5[64*(c0)+0+(c2)] * val_9[0+64*(c1)+(c2)];
      }
    }
  }
  for(int c0=0;(c0<=127);c0+=1) {
    for(int c1=0;(c1<=127);c1+=1) {
      val_29[128*(c0)+(c1)+(0)] = 0.0;
      for(int c2=0;(c2<=31);c2+=1) {
        val_29[128*(c0)+(c1)+0] += val_21[32*(c0)+0+(c2)] * val_13[0+32*(c1)+(c2)];
      }
    }
  }
}

hikettei commented 2 months ago

Axpy

CATEN-USER> (jit (caten (!add (!view (make-tensor `(n)) `(froma toa bya)) (!view (make-tensor `(n)) `(fromb tob byb)))) :debug 1)
Compiled:

#include <math.h>
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
void main7637355(int n, int fromb, int tob, int byb, int toa, int froma, int bya, float* val_39, float* val_28);
void main7637355(int n, int fromb, int tob, int byb, int toa, int froma, int bya, float* val_39, float* val_28) {
  for(int c0=0;(c0<tob);c0+=1) {
    val_39[bya*(c0+froma)] = val_39[bya*(c0+froma)]+val_28[byb*(c0+fromb)];
  }
}

hikettei commented 2 months ago

Softmax

TEST> (caten (!softmax (make-tensor `(3 3) :initial-element 1.0)))
Compiled:

#include <math.h>
#include <stdint.h>
#define boolean _Bool
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
/*
Arrays:
  - val_6[float32]: (3 1)
  - val_0[float32]: (3 3)
*/
void main1599824(float* val_6, float* val_0);
void main1599824(float* val_6, float* val_0) {
  float val_2;
  for(int c0=0;(c0<=2);c0+=1) {
    val_6[(c0)+(0)] = 0.0;
    for(int c1=0;(c1<=2);c1+=1) {
      val_2 = 1.442695;
      val_0[3*(c0)+(c1)] = 1.0;
      val_0[3*(c0)+(c1)] = val_0[3*(c0)+(c1)]*val_2;
      val_0[3*(c0)+(c1)] = exp2(val_0[3*(c0)+(c1)]);
      val_6[(c0)+0] += val_0[3*(c0)+(c1)];
    }
    val_6[(c0)+0] = 1.0 / (val_6[(c0)+0]);
    for(int c1=2;(c1<=4);c1+=1) {
      val_0[3*(c0)+((c1-2))] = val_0[3*(c0)+((c1-2))]*val_6[(c0)+0];
    }
  }
}

hikettei commented 2 months ago

最適化に手をつける前に多分コンパイルしたいIRを取得しておくのと，予めUnittest書いた方が便利だと思うのでこっち優先

Workload
[x] activations
[x] linear, basic math functions
[x] 論理演算
[x] Norms: BatchNorm LayerNorm GroupNorm RMSNorm
[x] Embedding
[x] 確率分布の初期化 (randn etc ...) 初期化済みBufferから計算を継続する (_topi的なのを実装)
[ ] RNN
[ ] RoPE, PE
[ ] Unittest for them
[ ] Polyhedral Compilerに手をつける
- [x] Frontendから分離
- Renderer (特にRender-Expr)のrefactor
  - [x] MULTIEXPR=1で固定
  - [x] Backwardの計算が絡んでめんどくさいので後でやる
- 入出力がScalarの時のBug
- undeclared varを修正
- memory_plannerはもう少し頑張れる (+ 一時領域属性の判定 + Reuse)
- Parallelization/Vectorization (OpenMP)
- GPU (Metal, NV)
- Tiling
- gemmを頑張る (4x4 accumulation)

hikettei commented 1 month ago

getting sophisticated

hikettei / Caten

Implementing Polyhedral Compiler #6

Reading

Workload