Closed hikettei closed 1 month ago
Padding in a single kernel :v:
(jit (caten (!cos (!sin (!padding (make-tensor `(10 10) :initial-element 2.0) `((2 2) (2 2)) :value 0.0)))) :debug 4)
#include <math.h>
void main700672(float* val_0, float* val_3, float* val_10, float val_8);
void main700672(float* val_0, float* val_3, float* val_10, float val_8) {
val_8 = 1.5707964;
val_10[(0)] = val_8;
for(int c0=0;(c0<=13);c0+=1) {
for(int c1=0;(c1<=13);c1+=1) {
val_0[14*(c0)+(c1)] = 0.0;
if ((c0==2)&&(c1==2)) {
}
if ((((c0>=2)&&(c0<=11))&&(c1>=2))&&(c1<=11)) {
val_3[10*((c0-2))+((c1-2))] = 2.0;
val_0[14*((c0-2)+2)+((c1-2)+2)] = val_3[10*((c0-2))+((c1-2))];
}
val_0[14*(c0)+(c1)] = sin(val_0[14*(c0)+(c1)]);
val_0[14*(c0)+(c1)] = val_0[14*(c0)+(c1)]+val_10[0+0];
val_0[14*(c0)+(c1)] = sin(val_0[14*(c0)+(c1)]);
}
}
}
as well as the composed gemm:
CATEN-USER> (jit (caten (!matmul (make-tensor `(128 32)) (!matmul (make-tensor `(32 64)) (make-tensor `(64 128))))) :debug 1)
Compiled:
#include <math.h>
void main232208(float* val_29, float* val_21, float* val_13, float* val_5, float* val_9);
void main232208(float* val_29, float* val_21, float* val_13, float* val_5, float* val_9) {
for(int c0=0;(c0<=31);c0+=1) {
for(int c1=0;(c1<=127);c1+=1) {
val_13[128*(c0)+(c1)+(0)] = 0.0;
for(int c2=0;(c2<=63);c2+=1) {
val_13[128*(c0)+(c1)+0] += val_5[64*(c0)+0+(c2)] * val_9[0+64*(c1)+(c2)];
}
}
}
for(int c0=0;(c0<=127);c0+=1) {
for(int c1=0;(c1<=127);c1+=1) {
val_29[128*(c0)+(c1)+(0)] = 0.0;
for(int c2=0;(c2<=31);c2+=1) {
val_29[128*(c0)+(c1)+0] += val_21[32*(c0)+0+(c2)] * val_13[0+32*(c1)+(c2)];
}
}
}
}
Axpy
CATEN-USER> (jit (caten (!add (!view (make-tensor `(n)) `(froma toa bya)) (!view (make-tensor `(n)) `(fromb tob byb)))) :debug 1)
Compiled:
#include <math.h>
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
void main7637355(int n, int fromb, int tob, int byb, int toa, int froma, int bya, float* val_39, float* val_28);
void main7637355(int n, int fromb, int tob, int byb, int toa, int froma, int bya, float* val_39, float* val_28) {
for(int c0=0;(c0<tob);c0+=1) {
val_39[bya*(c0+froma)] = val_39[bya*(c0+froma)]+val_28[byb*(c0+fromb)];
}
}
Softmax
TEST> (caten (!softmax (make-tensor `(3 3) :initial-element 1.0)))
Compiled:
#include <math.h>
#include <stdint.h>
#define boolean _Bool
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
/*
Arrays:
- val_6[float32]: (3 1)
- val_0[float32]: (3 3)
*/
void main1599824(float* val_6, float* val_0);
void main1599824(float* val_6, float* val_0) {
float val_2;
for(int c0=0;(c0<=2);c0+=1) {
val_6[(c0)+(0)] = 0.0;
for(int c1=0;(c1<=2);c1+=1) {
val_2 = 1.442695;
val_0[3*(c0)+(c1)] = 1.0;
val_0[3*(c0)+(c1)] = val_0[3*(c0)+(c1)]*val_2;
val_0[3*(c0)+(c1)] = exp2(val_0[3*(c0)+(c1)]);
val_6[(c0)+0] += val_0[3*(c0)+(c1)];
}
val_6[(c0)+0] = 1.0 / (val_6[(c0)+0]);
for(int c1=2;(c1<=4);c1+=1) {
val_0[3*(c0)+((c1-2))] = val_0[3*(c0)+((c1-2))]*val_6[(c0)+0];
}
}
}
最適化に手をつける前に多分コンパイルしたいIRを取得しておくのと,予めUnittest書いた方が便利だと思うのでこっち優先
getting sophisticated
Reading
https://arxiv.org/pdf/2401.06665
https://www.researchgate.net/publication/320992060_Consecutivity_in_the_isl_Polyhedral_Scheduler
https://pliss2019.github.io/albert_cohen_slides.pdf
https://www.researchgate.net/publication/320992060_Consecutivity_in_the_isl_Polyhedral_Scheduler
https://libisl.sourceforge.io/manual.pdf
Workload
./roswell/caten.ros
, from onnx to pure c compiler:CUSTOM_OP
class