Closed BiradarSiddhant02 closed 3 months ago
Observed from code in #70
Observed that the difference is due to the order of the loop variables.
Still it's not the same.
Here's the code we are comparing with each other:
(c-lisp
; __MMult1: matrix-multiply kernel over flat float buffers.
; For every i in [0, m) and j in [0, n) it stores into C[j * m + i] the sum,
; over p in [0, k), of A[p * m + i] * B[j * k + p].
; A and B are only loaded from; C is only stored to. Returns nothing.
(define ((__MMult1 void) (A (ptr float))
(B (ptr float))
(C (ptr float))
(m int)
(n int)
(k int))
; Loop indices are declared once, at function scope.
(declare i int)
(declare j int)
(declare p int)
; Outer loop: i = 0 .. m-1
(for ((set i 0) (lt i m) (set i (add i 1)))
; Middle loop: j = 0 .. n-1
(for ((set j 0) (lt j n) (set j (add j 1)))
; Accumulator is declared inside the j loop and reset to 0.0 for each (i, j).
(declare sum float)
(set sum 0.0)
; Inner loop: p = 0 .. k-1
(for ((set p 0) (lt p k) (set p (add p 1)))
; sum += A[p * m + i] * B[j * k + p]
; Written as a separate fmul feeding an fadd; the product is the first
; fadd operand and the running sum is the second.
(set sum (fadd
(fmul
(load (ptradd A (add (mul p m) i)))
(load (ptradd B (add (mul j k) p))))
sum)))
; C[j * m + i] = sum
(store (ptradd C (add (mul j m) i)) sum)))
(ret)))
and
/*
 * ref_mult - reference matrix multiply over flat float buffers.
 *
 * For each i in [0, m) and j in [0, n) this stores
 *     C[j * m + i] = sum over p in [0, k) of A[p * m + i] * B[j * k + p]
 *
 * A and B are only read; C is only written. When k <= 0 the inner loop
 * does not run and every visited C entry is set to 0.
 */
void ref_mult(float* A, float* B, float* C, int m, int n, int k) {
    for (int row = 0; row < m; ++row) {
        for (int col = 0; col < n; ++col) {
            float acc = 0;
            for (int step = 0; step < k; ++step) {
                /* Multiply and accumulate stay in a single expression. */
                acc += A[step * m + row] * B[col * k + step];
            }
            C[col * m + row] = acc;
        }
    }
}
LLVM IR for the C version, without any optimization:
; ModuleID = 'refmult.c'
source_filename = "refmult.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @ref_mult(ptr noundef %0, ptr noundef %1, ptr noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5) #0 {
%7 = alloca ptr, align 8
%8 = alloca ptr, align 8
%9 = alloca ptr, align 8
%10 = alloca i32, align 4
%11 = alloca i32, align 4
%12 = alloca i32, align 4
%13 = alloca i32, align 4
%14 = alloca i32, align 4
%15 = alloca i32, align 4
%16 = alloca float, align 4
store ptr %0, ptr %7, align 8
store ptr %1, ptr %8, align 8
store ptr %2, ptr %9, align 8
store i32 %3, ptr %10, align 4
store i32 %4, ptr %11, align 4
store i32 %5, ptr %12, align 4
store i32 0, ptr %13, align 4
br label %17
17: ; preds = %69, %6
%18 = load i32, ptr %13, align 4
%19 = load i32, ptr %10, align 4
%20 = icmp slt i32 %18, %19
br i1 %20, label %21, label %72
21: ; preds = %17
store i32 0, ptr %14, align 4
br label %22
22: ; preds = %65, %21
%23 = load i32, ptr %14, align 4
%24 = load i32, ptr %11, align 4
%25 = icmp slt i32 %23, %24
br i1 %25, label %26, label %68
26: ; preds = %22
store float 0.000000e+00, ptr %16, align 4
store i32 0, ptr %15, align 4
br label %27
27: ; preds = %52, %26
%28 = load i32, ptr %15, align 4
%29 = load i32, ptr %12, align 4
%30 = icmp slt i32 %28, %29
br i1 %30, label %31, label %55
31: ; preds = %27
%32 = load ptr, ptr %7, align 8
%33 = load i32, ptr %15, align 4
%34 = load i32, ptr %10, align 4
%35 = mul nsw i32 %33, %34
%36 = load i32, ptr %13, align 4
%37 = add nsw i32 %35, %36
%38 = sext i32 %37 to i64
%39 = getelementptr inbounds float, ptr %32, i64 %38
%40 = load float, ptr %39, align 4
%41 = load ptr, ptr %8, align 8
%42 = load i32, ptr %14, align 4
%43 = load i32, ptr %12, align 4
%44 = mul nsw i32 %42, %43
%45 = load i32, ptr %15, align 4
%46 = add nsw i32 %44, %45
%47 = sext i32 %46 to i64
%48 = getelementptr inbounds float, ptr %41, i64 %47
%49 = load float, ptr %48, align 4
%50 = load float, ptr %16, align 4
%51 = call float @llvm.fmuladd.f32(float %40, float %49, float %50)
store float %51, ptr %16, align 4
br label %52
52: ; preds = %31
%53 = load i32, ptr %15, align 4
%54 = add nsw i32 %53, 1
store i32 %54, ptr %15, align 4
br label %27, !llvm.loop !6
55: ; preds = %27
%56 = load float, ptr %16, align 4
%57 = load ptr, ptr %9, align 8
%58 = load i32, ptr %14, align 4
%59 = load i32, ptr %10, align 4
%60 = mul nsw i32 %58, %59
%61 = load i32, ptr %13, align 4
%62 = add nsw i32 %60, %61
%63 = sext i32 %62 to i64
%64 = getelementptr inbounds float, ptr %57, i64 %63
store float %56, ptr %64, align 4
br label %65
65: ; preds = %55
%66 = load i32, ptr %14, align 4
%67 = add nsw i32 %66, 1
store i32 %67, ptr %14, align 4
br label %22, !llvm.loop !8
68: ; preds = %22
br label %69
69: ; preds = %68
%70 = load i32, ptr %13, align 4
%71 = add nsw i32 %70, 1
store i32 %71, ptr %13, align 4
br label %17, !llvm.loop !9
72: ; preds = %17
ret void
}
; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare float @llvm.fmuladd.f32(float, float, float) #1
attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 7, !"frame-pointer", i32 2}
!5 = !{!"clang version 15.0.7"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.mustprogress"}
!8 = distinct !{!8, !7}
!9 = distinct !{!9, !7}
and LLVM IR for c-lisp:
; ModuleID = ""
target triple = "unknown-unknown-unknown"
target datalayout = ""
define void @"__MMult1"(float* %"A", float* %"B", float* %"C", i32 %"m", i32 %"n", i32 %"k")
{
alloca-bbmrqjzu:
%"A.1" = alloca float*
%"B.1" = alloca float*
%"C.1" = alloca float*
%"m.1" = alloca i32
%"n.1" = alloca i32
%"k.1" = alloca i32
%"tmp_clisp-affwcftt" = alloca i32
%"i" = alloca i32
%"tmp_clisp.inp_0-mkkphcmp" = alloca i32
%"tmp_clisp.inp_1-hbzklbpu" = alloca i32
%"tmp_clisp.cond-piewakqq" = alloca i1
%"tmp_clisp-omedjamq" = alloca i32
%"j" = alloca i32
%"tmp_clisp.inp_0-wfsoufeb" = alloca i32
%"tmp_clisp.inp_1-ujctvsij" = alloca i32
%"tmp_clisp.cond-nqepxmpx" = alloca i1
%"tmp_clisp-aordgtmx" = alloca float
%"sum.krxidkvg.gtbosduc" = alloca float
%"tmp_clisp-jucjizah" = alloca i32
%"p" = alloca i32
%"tmp_clisp.inp_0-aiavcbuq" = alloca i32
%"tmp_clisp.inp_1-jndmhbze" = alloca i32
%"tmp_clisp.cond-eokbncpo" = alloca i1
%"tmp_clisp.inp_0-alwgxswg" = alloca i32
%"tmp_clisp.inp_1-bljojdzc" = alloca i32
%"tmp_clisp.inp_0-zyaxkyfb" = alloca i32
%"tmp_clisp.inp_1-mxzzloqp" = alloca i32
%"tmp_clisp-zlkwxzlv" = alloca i32
%"tmp_clisp-lsegeqfw" = alloca float*
%"tmp_clisp.inp_0-tyjnueua" = alloca float
%"tmp_clisp.inp_0-zwskayzm" = alloca i32
%"tmp_clisp.inp_1-duvxmwwc" = alloca i32
%"tmp_clisp.inp_0-xsgzizfs" = alloca i32
%"tmp_clisp.inp_1-iwjecgmx" = alloca i32
%"tmp_clisp-nmoirgcz" = alloca i32
%"tmp_clisp-gjwitaar" = alloca float*
%"tmp_clisp.inp_1-ubmlbnzu" = alloca float
%"tmp_clisp.inp_0-kdbwpial" = alloca float
%"tmp_clisp.inp_1-euvgwazb" = alloca float
%"tmp_clisp-mbttzjym" = alloca float
%"tmp_clisp.inp_0-hoxwqebl" = alloca i32
%"tmp_clisp.inp_1-tnfibwti" = alloca i32
%"tmp_clisp-qufnjzis" = alloca i32
%"tmp_clisp.inp_0-ouourali" = alloca i32
%"tmp_clisp.inp_1-hltiwqfw" = alloca i32
%"tmp_clisp.inp_0-vlfjpvro" = alloca i32
%"tmp_clisp.inp_1-gtwxqetc" = alloca i32
%"tmp_clisp-iefldmrm" = alloca i32
%"tmp_clisp.ptr-lvfdazjr" = alloca float*
%"tmp_clisp.val-ljmupwse" = alloca float
%"tmp_clisp-nvfiledd" = alloca float
%"tmp_clisp.inp_0-ausjwnoe" = alloca i32
%"tmp_clisp.inp_1-gxfjeqti" = alloca i32
%"tmp_clisp-vdiojlyt" = alloca i32
%"tmp_clisp.inp_0-hpakbkdl" = alloca i32
%"tmp_clisp.inp_1-wiolysdf" = alloca i32
%"tmp_clisp-hfomxqoq" = alloca i32
br label %"entry-thzhmlgy"
entry-thzhmlgy:
store float* %"A", float** %"A.1"
store float* %"B", float** %"B.1"
store float* %"C", float** %"C.1"
store i32 %"m", i32* %"m.1"
store i32 %"n", i32* %"n.1"
store i32 %"k", i32* %"k.1"
store i32 0, i32* %"tmp_clisp-affwcftt"
%".15" = load i32, i32* %"tmp_clisp-affwcftt"
store i32 %".15", i32* %"i"
br label %"tmp_clisp.loop-tawppkmx"
tmp_clisp.loop-tawppkmx:
%".18" = load i32, i32* %"i"
store i32 %".18", i32* %"tmp_clisp.inp_0-mkkphcmp"
%".20" = load i32, i32* %"m.1"
store i32 %".20", i32* %"tmp_clisp.inp_1-hbzklbpu"
%".22" = load i32, i32* %"tmp_clisp.inp_0-mkkphcmp"
%".23" = load i32, i32* %"tmp_clisp.inp_1-hbzklbpu"
%"tmp_clisp.cond-piewakqq.1" = icmp slt i32 %".22", %".23"
store i1 %"tmp_clisp.cond-piewakqq.1", i1* %"tmp_clisp.cond-piewakqq"
%".25" = load i1, i1* %"tmp_clisp.cond-piewakqq"
br i1 %".25", label %"tmp_clisp.cont-hoaeiczv", label %"tmp_clisp.break-dcqlzqiq"
tmp_clisp.cont-hoaeiczv:
store i32 0, i32* %"tmp_clisp-omedjamq"
%".28" = load i32, i32* %"tmp_clisp-omedjamq"
store i32 %".28", i32* %"j"
br label %"tmp_clisp.loop-cjfdjqts"
tmp_clisp.loop-cjfdjqts:
%".31" = load i32, i32* %"j"
store i32 %".31", i32* %"tmp_clisp.inp_0-wfsoufeb"
%".33" = load i32, i32* %"n.1"
store i32 %".33", i32* %"tmp_clisp.inp_1-ujctvsij"
%".35" = load i32, i32* %"tmp_clisp.inp_0-wfsoufeb"
%".36" = load i32, i32* %"tmp_clisp.inp_1-ujctvsij"
%"tmp_clisp.cond-nqepxmpx.1" = icmp slt i32 %".35", %".36"
store i1 %"tmp_clisp.cond-nqepxmpx.1", i1* %"tmp_clisp.cond-nqepxmpx"
%".38" = load i1, i1* %"tmp_clisp.cond-nqepxmpx"
br i1 %".38", label %"tmp_clisp.cont-kobknley", label %"tmp_clisp.break-wjvptbxu"
tmp_clisp.cont-kobknley:
store float 0x0, float* %"tmp_clisp-aordgtmx"
%".41" = load float, float* %"tmp_clisp-aordgtmx"
store float %".41", float* %"sum.krxidkvg.gtbosduc"
store i32 0, i32* %"tmp_clisp-jucjizah"
%".44" = load i32, i32* %"tmp_clisp-jucjizah"
store i32 %".44", i32* %"p"
br label %"tmp_clisp.loop-dokigiia"
tmp_clisp.loop-dokigiia:
%".47" = load i32, i32* %"p"
store i32 %".47", i32* %"tmp_clisp.inp_0-aiavcbuq"
%".49" = load i32, i32* %"k.1"
store i32 %".49", i32* %"tmp_clisp.inp_1-jndmhbze"
%".51" = load i32, i32* %"tmp_clisp.inp_0-aiavcbuq"
%".52" = load i32, i32* %"tmp_clisp.inp_1-jndmhbze"
%"tmp_clisp.cond-eokbncpo.1" = icmp slt i32 %".51", %".52"
store i1 %"tmp_clisp.cond-eokbncpo.1", i1* %"tmp_clisp.cond-eokbncpo"
%".54" = load i1, i1* %"tmp_clisp.cond-eokbncpo"
br i1 %".54", label %"tmp_clisp.cont-qrwqmrlh", label %"tmp_clisp.break-zkrgkvej"
tmp_clisp.cont-qrwqmrlh:
%".56" = load i32, i32* %"p"
store i32 %".56", i32* %"tmp_clisp.inp_0-alwgxswg"
%".58" = load i32, i32* %"m.1"
store i32 %".58", i32* %"tmp_clisp.inp_1-bljojdzc"
%".60" = load i32, i32* %"tmp_clisp.inp_0-alwgxswg"
%".61" = load i32, i32* %"tmp_clisp.inp_1-bljojdzc"
%"tmp_clisp.inp_0-zyaxkyfb.1" = mul i32 %".60", %".61"
store i32 %"tmp_clisp.inp_0-zyaxkyfb.1", i32* %"tmp_clisp.inp_0-zyaxkyfb"
%".63" = load i32, i32* %"i"
store i32 %".63", i32* %"tmp_clisp.inp_1-mxzzloqp"
%".65" = load i32, i32* %"tmp_clisp.inp_0-zyaxkyfb"
%".66" = load i32, i32* %"tmp_clisp.inp_1-mxzzloqp"
%"tmp_clisp-zlkwxzlv.1" = add i32 %".65", %".66"
store i32 %"tmp_clisp-zlkwxzlv.1", i32* %"tmp_clisp-zlkwxzlv"
%".68" = load float*, float** %"A.1"
%".69" = load i32, i32* %"tmp_clisp-zlkwxzlv"
%".70" = getelementptr float, float* %".68", i32 %".69"
store float* %".70", float** %"tmp_clisp-lsegeqfw"
%".72" = load float*, float** %"tmp_clisp-lsegeqfw"
%".73" = load float, float* %".72"
store float %".73", float* %"tmp_clisp.inp_0-tyjnueua"
%".75" = load i32, i32* %"j"
store i32 %".75", i32* %"tmp_clisp.inp_0-zwskayzm"
%".77" = load i32, i32* %"k.1"
store i32 %".77", i32* %"tmp_clisp.inp_1-duvxmwwc"
%".79" = load i32, i32* %"tmp_clisp.inp_0-zwskayzm"
%".80" = load i32, i32* %"tmp_clisp.inp_1-duvxmwwc"
%"tmp_clisp.inp_0-xsgzizfs.1" = mul i32 %".79", %".80"
store i32 %"tmp_clisp.inp_0-xsgzizfs.1", i32* %"tmp_clisp.inp_0-xsgzizfs"
%".82" = load i32, i32* %"p"
store i32 %".82", i32* %"tmp_clisp.inp_1-iwjecgmx"
%".84" = load i32, i32* %"tmp_clisp.inp_0-xsgzizfs"
%".85" = load i32, i32* %"tmp_clisp.inp_1-iwjecgmx"
%"tmp_clisp-nmoirgcz.1" = add i32 %".84", %".85"
store i32 %"tmp_clisp-nmoirgcz.1", i32* %"tmp_clisp-nmoirgcz"
%".87" = load float*, float** %"B.1"
%".88" = load i32, i32* %"tmp_clisp-nmoirgcz"
%".89" = getelementptr float, float* %".87", i32 %".88"
store float* %".89", float** %"tmp_clisp-gjwitaar"
%".91" = load float*, float** %"tmp_clisp-gjwitaar"
%".92" = load float, float* %".91"
store float %".92", float* %"tmp_clisp.inp_1-ubmlbnzu"
%".94" = load float, float* %"tmp_clisp.inp_0-tyjnueua"
%".95" = load float, float* %"tmp_clisp.inp_1-ubmlbnzu"
%"tmp_clisp.inp_0-kdbwpial.1" = fmul float %".94", %".95"
store float %"tmp_clisp.inp_0-kdbwpial.1", float* %"tmp_clisp.inp_0-kdbwpial"
%".97" = load float, float* %"sum.krxidkvg.gtbosduc"
store float %".97", float* %"tmp_clisp.inp_1-euvgwazb"
%".99" = load float, float* %"tmp_clisp.inp_0-kdbwpial"
%".100" = load float, float* %"tmp_clisp.inp_1-euvgwazb"
%"tmp_clisp-mbttzjym.1" = fadd float %".99", %".100"
store float %"tmp_clisp-mbttzjym.1", float* %"tmp_clisp-mbttzjym"
%".102" = load float, float* %"tmp_clisp-mbttzjym"
store float %".102", float* %"sum.krxidkvg.gtbosduc"
%".104" = load i32, i32* %"p"
store i32 %".104", i32* %"tmp_clisp.inp_0-hoxwqebl"
store i32 1, i32* %"tmp_clisp.inp_1-tnfibwti"
%".107" = load i32, i32* %"tmp_clisp.inp_0-hoxwqebl"
%".108" = load i32, i32* %"tmp_clisp.inp_1-tnfibwti"
%"tmp_clisp-qufnjzis.1" = add i32 %".107", %".108"
store i32 %"tmp_clisp-qufnjzis.1", i32* %"tmp_clisp-qufnjzis"
%".110" = load i32, i32* %"tmp_clisp-qufnjzis"
store i32 %".110", i32* %"p"
br label %"tmp_clisp.loop-dokigiia"
tmp_clisp.break-zkrgkvej:
%".113" = load i32, i32* %"j"
store i32 %".113", i32* %"tmp_clisp.inp_0-ouourali"
%".115" = load i32, i32* %"m.1"
store i32 %".115", i32* %"tmp_clisp.inp_1-hltiwqfw"
%".117" = load i32, i32* %"tmp_clisp.inp_0-ouourali"
%".118" = load i32, i32* %"tmp_clisp.inp_1-hltiwqfw"
%"tmp_clisp.inp_0-vlfjpvro.1" = mul i32 %".117", %".118"
store i32 %"tmp_clisp.inp_0-vlfjpvro.1", i32* %"tmp_clisp.inp_0-vlfjpvro"
%".120" = load i32, i32* %"i"
store i32 %".120", i32* %"tmp_clisp.inp_1-gtwxqetc"
%".122" = load i32, i32* %"tmp_clisp.inp_0-vlfjpvro"
%".123" = load i32, i32* %"tmp_clisp.inp_1-gtwxqetc"
%"tmp_clisp-iefldmrm.1" = add i32 %".122", %".123"
store i32 %"tmp_clisp-iefldmrm.1", i32* %"tmp_clisp-iefldmrm"
%".125" = load float*, float** %"C.1"
%".126" = load i32, i32* %"tmp_clisp-iefldmrm"
%".127" = getelementptr float, float* %".125", i32 %".126"
store float* %".127", float** %"tmp_clisp.ptr-lvfdazjr"
%".129" = load float, float* %"sum.krxidkvg.gtbosduc"
store float %".129", float* %"tmp_clisp.val-ljmupwse"
%".131" = load float*, float** %"tmp_clisp.ptr-lvfdazjr"
%".132" = load float, float* %"tmp_clisp.val-ljmupwse"
store float %".132", float* %".131"
%".134" = load float, float* %"tmp_clisp.val-ljmupwse"
store float %".134", float* %"tmp_clisp-nvfiledd"
%".136" = load i32, i32* %"j"
store i32 %".136", i32* %"tmp_clisp.inp_0-ausjwnoe"
store i32 1, i32* %"tmp_clisp.inp_1-gxfjeqti"
%".139" = load i32, i32* %"tmp_clisp.inp_0-ausjwnoe"
%".140" = load i32, i32* %"tmp_clisp.inp_1-gxfjeqti"
%"tmp_clisp-vdiojlyt.1" = add i32 %".139", %".140"
store i32 %"tmp_clisp-vdiojlyt.1", i32* %"tmp_clisp-vdiojlyt"
%".142" = load i32, i32* %"tmp_clisp-vdiojlyt"
store i32 %".142", i32* %"j"
br label %"tmp_clisp.loop-cjfdjqts"
tmp_clisp.break-wjvptbxu:
%".145" = load i32, i32* %"i"
store i32 %".145", i32* %"tmp_clisp.inp_0-hpakbkdl"
store i32 1, i32* %"tmp_clisp.inp_1-wiolysdf"
%".148" = load i32, i32* %"tmp_clisp.inp_0-hpakbkdl"
%".149" = load i32, i32* %"tmp_clisp.inp_1-wiolysdf"
%"tmp_clisp-hfomxqoq.1" = add i32 %".148", %".149"
store i32 %"tmp_clisp-hfomxqoq.1", i32* %"tmp_clisp-hfomxqoq"
%".151" = load i32, i32* %"tmp_clisp-hfomxqoq"
store i32 %".151", i32* %"i"
br label %"tmp_clisp.loop-tawppkmx"
tmp_clisp.break-dcqlzqiq:
br label %"tmp_clisp.ret_lbl-guamijsj"
tmp_clisp.ret_lbl-guamijsj:
ret void
}
We're creating a lot of temporary variables with alloca in c-lisp, while C does not. Does it have to be like this? @GlowingScrewdriver.
This is not a c-lisp problem. Let's look at the relevant brilisp and see whether it matches the LLVM.
Some more observations:
(brilisp
(define ((__MMult1 void)
(A (ptr float))
(B (ptr float))
(C (ptr float))
(m int)
(n int)
(k int))
; i = 0
(set (tmp_clisp-siugxvzj int) (const 0))
(set (i int) (id tmp_clisp-siugxvzj))
(label tmp_clisp.loop-ktqhbbhy)
; i < m
(set (tmp_clisp.inp_0-bolfwpgw int) (id i))
(set (tmp_clisp.inp_1-kcqayyyj int) (id m))
(set (tmp_clisp.cond-ncpwvzyj bool)
(lt tmp_clisp.inp_0-bolfwpgw
tmp_clisp.inp_1-kcqayyyj))
; continue or break check
(br tmp_clisp.cond-ncpwvzyj
tmp_clisp.cont-xhhzwymo
tmp_clisp.break-umrpdgwz)
(label tmp_clisp.cont-xhhzwymo)
; j = 0
(set (tmp_clisp-fuexemot int) (const 0))
(set (j int) (id tmp_clisp-fuexemot))
(label tmp_clisp.loop-ortsybnq)
; j < n
(set (tmp_clisp.inp_0-cawawdft int) (id j))
(set (tmp_clisp.inp_1-nbsonjyo int) (id n))
(set (tmp_clisp.cond-ytxoncmc bool)
(lt tmp_clisp.inp_0-cawawdft
tmp_clisp.inp_1-nbsonjyo))
; continue or break check
(br tmp_clisp.cond-ytxoncmc
tmp_clisp.cont-doskgrds
tmp_clisp.break-jrptpggi)
(label tmp_clisp.cont-doskgrds)
; sum = 0
(set (tmp_clisp-gxsgwjwa float) (const 0))
(set (sum.ntxblfwd.dejhoqjr float)
(id tmp_clisp-gxsgwjwa))
; p = 0
(set (tmp_clisp-nqacoxyw int) (const 0))
(set (p int) (id tmp_clisp-nqacoxyw))
(label tmp_clisp.loop-yzbmigkv)
; p < k
(set (tmp_clisp.inp_0-nbhjaaqn int) (id p))
(set (tmp_clisp.inp_1-jgaywwhk int) (id k))
(set (tmp_clisp.cond-peyiylke bool)
(lt tmp_clisp.inp_0-nbhjaaqn
tmp_clisp.inp_1-jgaywwhk))
; break check
(br tmp_clisp.cond-peyiylke
tmp_clisp.cont-vzonxxlp
tmp_clisp.break-lbsddwuy)
(label tmp_clisp.cont-vzonxxlp)
; p * m
(set (tmp_clisp.inp_0-ohpgvqtb int) (id p))
(set (tmp_clisp.inp_1-jbdqduba int) (id m))
(set (tmp_clisp.inp_0-zjvsemeb int)
(mul tmp_clisp.inp_0-ohpgvqtb
tmp_clisp.inp_1-jbdqduba))
; p * m + i
(set (tmp_clisp.inp_1-mfejnidx int) (id i))
(set (tmp_clisp-mxaqtskz int)
(add tmp_clisp.inp_0-zjvsemeb
tmp_clisp.inp_1-mfejnidx))
; A[p*m +i]
(set (tmp_clisp-bquzoawz (ptr float))
(ptradd A tmp_clisp-mxaqtskz))
(set (tmp_clisp.inp_0-lakwncit float)
(load tmp_clisp-bquzoawz))
; j*k
(set (tmp_clisp.inp_0-wjuhxkdq int) (id j))
(set (tmp_clisp.inp_1-vgfluxlc int) (id k))
(set (tmp_clisp.inp_0-nsctpjow int)
(mul tmp_clisp.inp_0-wjuhxkdq
tmp_clisp.inp_1-vgfluxlc))
; j*k + p
(set (tmp_clisp.inp_1-zjjooazg int) (id p))
(set (tmp_clisp-shaqabnc int)
(add tmp_clisp.inp_0-nsctpjow
tmp_clisp.inp_1-zjjooazg))
; B[j*k + p]
(set (tmp_clisp-thqgkynb (ptr float))
(ptradd B tmp_clisp-shaqabnc))
(set (tmp_clisp.inp_1-ouazzsfu float)
(load tmp_clisp-thqgkynb))
; sum = sum + A[p*m +i] * B[j*k + p]
(set (tmp_clisp.inp_0-etugbivl float)
(fmul tmp_clisp.inp_0-lakwncit
tmp_clisp.inp_1-ouazzsfu))
(set (tmp_clisp.inp_1-oruvfknq float)
(id sum.ntxblfwd.dejhoqjr))
(set (tmp_clisp-yaqtggmm float)
(fadd tmp_clisp.inp_0-etugbivl
tmp_clisp.inp_1-oruvfknq))
(set (sum.ntxblfwd.dejhoqjr float)
(id tmp_clisp-yaqtggmm))
; p = p + 1
(set (tmp_clisp.inp_0-vtakxodi int) (id p))
(set (tmp_clisp.inp_1-jqlswect int) (const 1))
(set (tmp_clisp-mvputqgg int)
(add tmp_clisp.inp_0-vtakxodi
tmp_clisp.inp_1-jqlswect))
(set (p int) (id tmp_clisp-mvputqgg))
(jmp tmp_clisp.loop-yzbmigkv)
; j * m
(label tmp_clisp.break-lbsddwuy)
(set (tmp_clisp.inp_0-iuslubbd int) (id j))
(set (tmp_clisp.inp_1-tjkrlbgn int) (id m))
(set (tmp_clisp.inp_0-cgzmfptq int)
(mul tmp_clisp.inp_0-iuslubbd
tmp_clisp.inp_1-tjkrlbgn))
; j * m + i
(set (tmp_clisp.inp_1-dhxxntaj int) (id i))
(set (tmp_clisp-nttorwaw int)
(add tmp_clisp.inp_0-cgzmfptq
tmp_clisp.inp_1-dhxxntaj))
; C[j*m + i] = sum
(set (tmp_clisp.ptr-kwxfpvuv (ptr float))
(ptradd C tmp_clisp-nttorwaw))
(set (tmp_clisp.val-dfgbkymx float)
(id sum.ntxblfwd.dejhoqjr))
(store tmp_clisp.ptr-kwxfpvuv
tmp_clisp.val-dfgbkymx)
; j = j + 1
(set (tmp_clisp-dpuizpsd float)
(id tmp_clisp.val-dfgbkymx))
(set (tmp_clisp.inp_0-qnpimfwr int) (id j))
(set (tmp_clisp.inp_1-lwkuwvyg int) (const 1))
(set (tmp_clisp-ewewatde int)
(add tmp_clisp.inp_0-qnpimfwr
tmp_clisp.inp_1-lwkuwvyg))
(set (j int) (id tmp_clisp-ewewatde))
(jmp tmp_clisp.loop-ortsybnq)
(label tmp_clisp.break-jrptpggi)
; i = i + 1
(set (tmp_clisp.inp_0-ssmzexee int) (id i))
(set (tmp_clisp.inp_1-uptnyjrh int) (const 1))
(set (tmp_clisp-evuknbie int)
(add tmp_clisp.inp_0-ssmzexee
tmp_clisp.inp_1-uptnyjrh))
(set (i int) (id tmp_clisp-evuknbie))
(jmp tmp_clisp.loop-ktqhbbhy)
(label tmp_clisp.break-umrpdgwz)
; ret
(jmp tmp_clisp.ret_lbl-dzendtoe)
(label tmp_clisp.ret_lbl-dzendtoe)
(ret)))
Brilisp would be quite readable if not for the `id` instructions.
Relevant c-lisp code:
def gen_set_expr(self, expr, res_sym):
    """Generate instructions for a ``set`` form.

    ``expr`` must pass the shape check ``[str, str, None]``: ``expr[1]`` is
    the target variable name and ``expr[2]`` is the value expression.

    The value expression is generated with ``res_sym`` as its result symbol.
    Then one more instruction is appended that copies ``res_sym`` into the
    scoped target via ``id``.

    Returns the instruction list produced by ``gen_expr``, extended in place.
    Raises ``CodegenError`` when ``expr`` fails the shape check.
    """
    well_formed = verify_shape(expr, [str, str, None])
    if not well_formed:
        raise CodegenError(f"Bad set expression: {expr}")

    target = self.scoped_lookup(expr[1])
    instructions = self.gen_expr(expr[2], res_sym=res_sym)

    # Look up the target's type only after the value code has been generated.
    typed_target = [target, self.symbol_types[target]]
    copy_result = ["set", typed_target, ["id", res_sym]]
    instructions.append(copy_result)
    return instructions
The biggest reason we have `id` is that `res_sym` is passed
as an input to the expression generators in c-lisp.
Newer Brilisp and LLVM after running with #77:
(brilisp
(define ((__MMult1 void)
(A (ptr float))
(B (ptr float))
(C (ptr float))
(m int)
(n int)
(k int))
(set (tmp_clisp-ivjrdagl int) (const 0))
(set (i int) (id tmp_clisp-ivjrdagl))
(label tmp_clisp.loop-ymbbjefd)
(set (tmp_clisp-crvgjtqq bool) (lt i m))
(br tmp_clisp-crvgjtqq
tmp_clisp.cont-elpzoqye
tmp_clisp.break-eebkjiem)
(label tmp_clisp.cont-elpzoqye)
(set (tmp_clisp-kgolqdma int) (const 0))
(set (j int) (id tmp_clisp-kgolqdma))
(label tmp_clisp.loop-kpdkibjn)
(set (tmp_clisp-ieeuniew bool) (lt j n))
(br tmp_clisp-ieeuniew
tmp_clisp.cont-tvkgjhho
tmp_clisp.break-qxwqorwq)
(label tmp_clisp.cont-tvkgjhho)
(set (tmp_clisp-quwsjetu float) (const 0))
(set (sum.qapyumhy.cuycchkx float)
(id tmp_clisp-quwsjetu))
(set (tmp_clisp-bivorips int) (const 0))
(set (p int) (id tmp_clisp-bivorips))
(label tmp_clisp.loop-mtgezhgu)
(set (tmp_clisp-bqmzektk bool) (lt p k))
(br tmp_clisp-bqmzektk
tmp_clisp.cont-eocirseo
tmp_clisp.break-hwdbyjjm)
(label tmp_clisp.cont-eocirseo)
(set (tmp_clisp-spvyhcsp int) (mul p m))
(set (tmp_clisp-unziitke int)
(add tmp_clisp-spvyhcsp i))
(set (tmp_clisp-vbbwbfiq (ptr float))
(ptradd A tmp_clisp-unziitke))
(set (tmp_clisp-idxigtop float)
(load tmp_clisp-vbbwbfiq))
(set (tmp_clisp-jpalikxo int) (mul j k))
(set (tmp_clisp-ymtqsshy int)
(add tmp_clisp-jpalikxo p))
(set (tmp_clisp-tkzvojmi (ptr float))
(ptradd B tmp_clisp-ymtqsshy))
(set (tmp_clisp-vkldsdyb float)
(load tmp_clisp-tkzvojmi))
(set (tmp_clisp-fvvzxxpp float)
(fmul tmp_clisp-idxigtop tmp_clisp-vkldsdyb))
(set (tmp_clisp-brccmkmu float)
(fadd tmp_clisp-fvvzxxpp sum.qapyumhy.cuycchkx))
(set (sum.qapyumhy.cuycchkx float)
(id tmp_clisp-brccmkmu))
(set (tmp_clisp-qjhrkumd int) (const 1))
(set (tmp_clisp-zxhnlmlp int)
(add p tmp_clisp-qjhrkumd))
(set (p int) (id tmp_clisp-zxhnlmlp))
(jmp tmp_clisp.loop-mtgezhgu)
(label tmp_clisp.break-hwdbyjjm)
(set (tmp_clisp-fgymqpop int) (mul j m))
(set (tmp_clisp-pthkcwvn int)
(add tmp_clisp-fgymqpop i))
(set (tmp_clisp-bjjoeriq (ptr float))
(ptradd C tmp_clisp-pthkcwvn))
(store tmp_clisp-bjjoeriq sum.qapyumhy.cuycchkx)
(set (tmp_clisp-nenxsrsr int) (const 1))
(set (tmp_clisp-bfwyqktl int)
(add j tmp_clisp-nenxsrsr))
(set (j int) (id tmp_clisp-bfwyqktl))
(jmp tmp_clisp.loop-kpdkibjn)
(label tmp_clisp.break-qxwqorwq)
(set (tmp_clisp-mejehkwk int) (const 1))
(set (tmp_clisp-dbgttfsb int)
(add i tmp_clisp-mejehkwk))
(set (i int) (id tmp_clisp-dbgttfsb))
(jmp tmp_clisp.loop-ymbbjefd)
(label tmp_clisp.break-eebkjiem)
(jmp tmp_clisp.ret_lbl-igrtxgmn)
(label tmp_clisp.ret_lbl-igrtxgmn)
(ret)))
and LLVM with -O1:
; ModuleID = 'MMulti_new.ll'
source_filename = "MMulti_new.ll"
target triple = "unknown-unknown-unknown"
; Function Attrs: argmemonly nofree norecurse nosync nounwind
define void @__MMult1(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture writeonly %C, i32 %m, i32 %n, i32 %k) local_unnamed_addr #0 {
alloca-ovaaulgt:
%tmp_clisp-ifyhhxbd.118 = icmp sgt i32 %m, 0
br i1 %tmp_clisp-ifyhhxbd.118, label %tmp_clisp.loop-inohvwsi.preheader.lr.ph, label %tmp_clisp.ret_lbl-yfiookbt
tmp_clisp.loop-inohvwsi.preheader.lr.ph: ; preds = %alloca-ovaaulgt
%tmp_clisp-eghmoqva.116 = icmp sgt i32 %n, 0
%tmp_clisp-smtahyaw.113 = icmp sgt i32 %k, 0
br label %tmp_clisp.loop-inohvwsi.preheader
tmp_clisp.loop-inohvwsi.preheader: ; preds = %tmp_clisp.loop-inohvwsi.preheader.lr.ph, %tmp_clisp.break-onhrayvl
%i.019 = phi i32 [ 0, %tmp_clisp.loop-inohvwsi.preheader.lr.ph ], [ %tmp_clisp-mcjogwyb.1, %tmp_clisp.break-onhrayvl ]
br i1 %tmp_clisp-eghmoqva.116, label %tmp_clisp.loop-safbgzsu.preheader, label %tmp_clisp.break-onhrayvl
tmp_clisp.loop-safbgzsu.preheader: ; preds = %tmp_clisp.loop-inohvwsi.preheader, %tmp_clisp.break-tmohrxbn
%j.017 = phi i32 [ %tmp_clisp-urcljuex.1, %tmp_clisp.break-tmohrxbn ], [ 0, %tmp_clisp.loop-inohvwsi.preheader ]
br i1 %tmp_clisp-smtahyaw.113, label %tmp_clisp.cont-gjjqluld.lr.ph, label %tmp_clisp.break-tmohrxbn
tmp_clisp.cont-gjjqluld.lr.ph: ; preds = %tmp_clisp.loop-safbgzsu.preheader
%tmp_clisp-jqhxwtve.1 = mul i32 %j.017, %k
br label %tmp_clisp.cont-gjjqluld
tmp_clisp.cont-gjjqluld: ; preds = %tmp_clisp.cont-gjjqluld.lr.ph, %tmp_clisp.cont-gjjqluld
%sum.mlbgqafz.komgfkcq.015 = phi float [ 0.000000e+00, %tmp_clisp.cont-gjjqluld.lr.ph ], [ %tmp_clisp-mmntcjil.1, %tmp_clisp.cont-gjjqluld ]
%p.014 = phi i32 [ 0, %tmp_clisp.cont-gjjqluld.lr.ph ], [ %tmp_clisp-bdmhpmgc.1, %tmp_clisp.cont-gjjqluld ]
%tmp_clisp-atfslbhe.1 = mul i32 %p.014, %m
%tmp_clisp-xqxkitro.1 = add i32 %tmp_clisp-atfslbhe.1, %i.019
%0 = sext i32 %tmp_clisp-xqxkitro.1 to i64
%.52 = getelementptr float, float* %A, i64 %0
%.55 = load float, float* %.52, align 4
%tmp_clisp-rauulcuv.1 = add i32 %p.014, %tmp_clisp-jqhxwtve.1
%1 = sext i32 %tmp_clisp-rauulcuv.1 to i64
%.65 = getelementptr float, float* %B, i64 %1
%.68 = load float, float* %.65, align 4
%tmp_clisp-ruhlchfv.1 = fmul float %.55, %.68
%tmp_clisp-mmntcjil.1 = fadd float %sum.mlbgqafz.komgfkcq.015, %tmp_clisp-ruhlchfv.1
%tmp_clisp-bdmhpmgc.1 = add nuw nsw i32 %p.014, 1
%tmp_clisp-smtahyaw.1 = icmp slt i32 %tmp_clisp-bdmhpmgc.1, %k
br i1 %tmp_clisp-smtahyaw.1, label %tmp_clisp.cont-gjjqluld, label %tmp_clisp.break-tmohrxbn
tmp_clisp.break-tmohrxbn: ; preds = %tmp_clisp.cont-gjjqluld, %tmp_clisp.loop-safbgzsu.preheader
%sum.mlbgqafz.komgfkcq.0.lcssa = phi float [ 0.000000e+00, %tmp_clisp.loop-safbgzsu.preheader ], [ %tmp_clisp-mmntcjil.1, %tmp_clisp.cont-gjjqluld ]
%tmp_clisp-iwpbmoqc.1 = mul i32 %j.017, %m
%tmp_clisp-brwzagwq.1 = add i32 %tmp_clisp-iwpbmoqc.1, %i.019
%2 = sext i32 %tmp_clisp-brwzagwq.1 to i64
%.93 = getelementptr float, float* %C, i64 %2
store float %sum.mlbgqafz.komgfkcq.0.lcssa, float* %.93, align 4
%tmp_clisp-urcljuex.1 = add nuw nsw i32 %j.017, 1
%tmp_clisp-eghmoqva.1 = icmp slt i32 %tmp_clisp-urcljuex.1, %n
br i1 %tmp_clisp-eghmoqva.1, label %tmp_clisp.loop-safbgzsu.preheader, label %tmp_clisp.break-onhrayvl
tmp_clisp.break-onhrayvl: ; preds = %tmp_clisp.break-tmohrxbn, %tmp_clisp.loop-inohvwsi.preheader
%tmp_clisp-mcjogwyb.1 = add nuw nsw i32 %i.019, 1
%tmp_clisp-ifyhhxbd.1 = icmp slt i32 %tmp_clisp-mcjogwyb.1, %m
br i1 %tmp_clisp-ifyhhxbd.1, label %tmp_clisp.loop-inohvwsi.preheader, label %tmp_clisp.ret_lbl-yfiookbt
tmp_clisp.ret_lbl-yfiookbt: ; preds = %tmp_clisp.break-onhrayvl, %alloca-ovaaulgt
ret void
}
attributes #0 = { argmemonly nofree norecurse nosync nounwind }
And the ref_mult C code with -O1:
; ModuleID = 'refmult.c'
source_filename = "refmult.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: argmemonly nofree nosync nounwind uwtable
define dso_local void @ref_mult(ptr nocapture noundef readonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5) local_unnamed_addr #0 {
%7 = icmp sgt i32 %3, 0
br i1 %7, label %8, label %46
8: ; preds = %6
%9 = icmp sgt i32 %4, 0
%10 = icmp sgt i32 %5, 0
%11 = sext i32 %3 to i64
%12 = sext i32 %5 to i64
%13 = sext i32 %3 to i64
%14 = zext i32 %3 to i64
%15 = zext i32 %4 to i64
%16 = zext i32 %5 to i64
br label %17
17: ; preds = %8, %43
%18 = phi i64 [ 0, %8 ], [ %44, %43 ]
br i1 %9, label %19, label %43
19: ; preds = %17, %36
%20 = phi i64 [ %41, %36 ], [ 0, %17 ]
br i1 %10, label %21, label %36
21: ; preds = %19
%22 = mul nsw i64 %20, %12
br label %23
23: ; preds = %21, %23
%24 = phi i64 [ 0, %21 ], [ %34, %23 ]
%25 = phi float [ 0.000000e+00, %21 ], [ %33, %23 ]
%26 = mul nsw i64 %24, %11
%27 = add nsw i64 %26, %18
%28 = getelementptr inbounds float, ptr %0, i64 %27
%29 = load float, ptr %28, align 4, !tbaa !5
%30 = add nsw i64 %24, %22
%31 = getelementptr inbounds float, ptr %1, i64 %30
%32 = load float, ptr %31, align 4, !tbaa !5
%33 = tail call float @llvm.fmuladd.f32(float %29, float %32, float %25)
%34 = add nuw nsw i64 %24, 1
%35 = icmp eq i64 %34, %16
br i1 %35, label %36, label %23, !llvm.loop !9
36: ; preds = %23, %19
%37 = phi float [ 0.000000e+00, %19 ], [ %33, %23 ]
%38 = mul nsw i64 %20, %13
%39 = add nsw i64 %38, %18
%40 = getelementptr inbounds float, ptr %2, i64 %39
store float %37, ptr %40, align 4, !tbaa !5
%41 = add nuw nsw i64 %20, 1
%42 = icmp eq i64 %41, %15
br i1 %42, label %43, label %19, !llvm.loop !12
43: ; preds = %36, %17
%44 = add nuw nsw i64 %18, 1
%45 = icmp eq i64 %44, %14
br i1 %45, label %46, label %17, !llvm.loop !13
46: ; preds = %43, %6
ret void
}
; Function Attrs: mustprogress nocallback nofree nosync nounwind readnone speculatable willreturn
declare float @llvm.fmuladd.f32(float, float, float) #1
attributes #0 = { argmemonly nofree nosync nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind readnone speculatable willreturn }
Added comments to the LLVM IR generated from the s-expression matmul. Ran mem2reg alone:
opt -passes=mem2reg -S MMulti_new.ll > MMulti_new_mem2reg.ll
; ModuleID = 'MMulti_new.ll'
source_filename = "MMulti_new.ll"
target triple = "unknown-unknown-unknown"
define void @__MMult1(float* %A, float* %B, float* %C, i32 %m, i32 %n, i32 %k) {
alloca-yobpmfwf:
br label %entry-nyiytwyy
entry-nyiytwyy: ; preds = %alloca-yobpmfwf
br label %sym.loop-cqatlhqk
sym.loop-cqatlhqk: ; preds = %sym.break-enfknquz, %entry-nyiytwyy
; i = 0 or i + 1
%i.0 = phi i32 [ 0, %entry-nyiytwyy ], [ %sym-aimutfvh.1, %sym.break-enfknquz ]
; i < m
%sym-zqlnadit.1 = icmp slt i32 %i.0, %m
br i1 %sym-zqlnadit.1, label %sym.cont-gtzeinpp, label %sym.break-sfwvbees
sym.cont-gtzeinpp: ; preds = %sym.loop-cqatlhqk
br label %sym.loop-ixsprijw
sym.loop-ixsprijw: ; preds = %sym.break-tlhnsjzq, %sym.cont-gtzeinpp
; j = 0 or j + 1
%j.0 = phi i32 [ 0, %sym.cont-gtzeinpp ], [ %sym-vschuphl.1, %sym.break-tlhnsjzq ]
; j < n
%sym-erdqidvw.1 = icmp slt i32 %j.0, %n
br i1 %sym-erdqidvw.1, label %sym.cont-lkreoyvv, label %sym.break-enfknquz
sym.cont-lkreoyvv: ; preds = %sym.loop-ixsprijw
br label %sym.loop-uuuiebvx
sym.loop-uuuiebvx: ; preds = %sym.cont-wjcvnepx, %sym.cont-lkreoyvv
; sum = 0 or sum + ...
%sum.ixwbxjcu.oiseueuu.0 = phi float [ 0.000000e+00, %sym.cont-lkreoyvv ], [ %sym-cmhiaxws.1, %sym.cont-wjcvnepx ]
; p = 0 or p + 1
%p.0 = phi i32 [ 0, %sym.cont-lkreoyvv ], [ %sym-ahtoruef.1, %sym.cont-wjcvnepx ]
; p < k
%sym-xivbpzih.1 = icmp slt i32 %p.0, %k
br i1 %sym-xivbpzih.1, label %sym.cont-wjcvnepx, label %sym.break-tlhnsjzq
sym.cont-wjcvnepx: ; preds = %sym.loop-uuuiebvx
; p * m
%sym-gheyvzkh.1 = mul i32 %p.0, %m
; i + p * m
%sym-mlumjxqw.1 = add i32 %sym-gheyvzkh.1, %i.0
; &A[i + p * m]
%.52 = getelementptr float, float* %A, i32 %sym-mlumjxqw.1
; A[i + p * m]
%.55 = load float, float* %.52, align 4
; j * k
%sym-vrxwgvld.1 = mul i32 %j.0, %k
; j * k + p
%sym-nwyrpbcn.1 = add i32 %sym-vrxwgvld.1, %p.0
; &B[j * k + p]
%.65 = getelementptr float, float* %B, i32 %sym-nwyrpbcn.1
; B[j * k + p]
%.68 = load float, float* %.65, align 4
; A[i + p * m] * B[j * k + p]
%sym-ipfrfuuq.1 = fmul float %.55, %.68
; sum + A[i + p * m] * B[j * k + p]
%sym-cmhiaxws.1 = fadd float %sym-ipfrfuuq.1, %sum.ixwbxjcu.oiseueuu.0
; p = p + 1
%sym-ahtoruef.1 = add i32 %p.0, 1
br label %sym.loop-uuuiebvx
sym.break-tlhnsjzq: ; preds = %sym.loop-uuuiebvx
; j * m
%sym-ilkuwtus.1 = mul i32 %j.0, %m
; i + j * m
%sym-icndiatv.1 = add i32 %sym-ilkuwtus.1, %i.0
; &C[i + j * m]
%.93 = getelementptr float, float* %C, i32 %sym-icndiatv.1
; C[i + j * m] = sum
store float %sum.ixwbxjcu.oiseueuu.0, float* %.93, align 4
; j + 1
%sym-vschuphl.1 = add i32 %j.0, 1
br label %sym.loop-ixsprijw
sym.break-enfknquz: ; preds = %sym.loop-ixsprijw
; i + 1
%sym-aimutfvh.1 = add i32 %i.0, 1
br label %sym.loop-cqatlhqk
sym.break-sfwvbees: ; preds = %sym.loop-cqatlhqk
br label %sym.ret_lbl-ertfxglj
sym.ret_lbl-ertfxglj: ; preds = %sym.break-sfwvbees
ret void
}
Now for the C code but with -O1:
clang -O1 -S -emit-llvm refmult.c
; ModuleID = 'refmult.c'
source_filename = "refmult.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: argmemonly nofree nosync nounwind uwtable
; clang -O1 output for ref_mult(A, B, C, m, n, k).
; Argument mapping: %0 = A, %1 = B, %2 = C, %3 = m, %4 = n, %5 = k.
; Computes C[j * m + i] = sum over p of A[p * m + i] * B[j * k + p]
; for i in [0, m), j in [0, n), p in [0, k).
; All three loops are bottom-tested: the "> 0" guards below replace the
; top-of-loop "<" checks, and each latch exits on "next == bound".
define dso_local void @ref_mult(ptr nocapture noundef readonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5) local_unnamed_addr #0 {
; m > 0 (otherwise the outer loop never runs: return immediately)
%7 = icmp sgt i32 %3, 0
br i1 %7, label %8, label %46
8: ; preds = %6
; n > 0 (guard for the j loop, computed once and reused every i iteration)
%9 = icmp sgt i32 %4, 0
; k > 0 (guard for the p loop, computed once and reused every j iteration)
%10 = icmp sgt i32 %5, 0
; widen the i32 sizes to i64:
;   %11 = sext m (multiplier in p * m), %12 = sext k (multiplier in j * k),
;   %13 = sext m (multiplier in j * m)
;   %14 = zext m, %15 = zext n, %16 = zext k (loop exit bounds)
%11 = sext i32 %3 to i64
%12 = sext i32 %5 to i64
%13 = sext i32 %3 to i64
%14 = zext i32 %3 to i64
%15 = zext i32 %4 to i64
%16 = zext i32 %5 to i64
br label %17
17: ; preds = %8, %43
; i = 0 or i + 1
%18 = phi i64 [ 0, %8 ], [ %44, %43 ]
; if n <= 0 skip the j loop and go straight to the i latch
br i1 %9, label %19, label %43
19: ; preds = %17, %36
; j = 0 or j + 1
%20 = phi i64 [ %41, %36 ], [ 0, %17 ]
; if k <= 0 skip the p loop; the store block then sees sum = 0
br i1 %10, label %21, label %36
21: ; preds = %19
; j * k (computed once per j, outside the p loop)
%22 = mul nsw i64 %20, %12
br label %23
23: ; preds = %21, %23
; p = 0 or p + 1
%24 = phi i64 [ 0, %21 ], [ %34, %23 ]
; sum = 0 or the updated sum from the previous iteration (%33)
%25 = phi float [ 0.000000e+00, %21 ], [ %33, %23 ]
; p * m
%26 = mul nsw i64 %24, %11
; p * m + i
%27 = add nsw i64 %26, %18
; &A[p * m + i]
%28 = getelementptr inbounds float, ptr %0, i64 %27
; A[p * m + i]
%29 = load float, ptr %28, align 4, !tbaa !5
; p + j * k
%30 = add nsw i64 %24, %22
; &B[p + j * k]
%31 = getelementptr inbounds float, ptr %1, i64 %30
; B[p + j * k]
%32 = load float, ptr %31, align 4, !tbaa !5
; intrinsic: sum = A[p * m + i] * B[p + j * k] + sum
%33 = tail call float @llvm.fmuladd.f32(float %29, float %32, float %25)
; p = p + 1
%34 = add nuw nsw i64 %24, 1
; exit the p loop when p + 1 == k (bottom test; k > 0 was checked via %10)
%35 = icmp eq i64 %34, %16
br i1 %35, label %36, label %23, !llvm.loop !9
36: ; preds = %23, %19
; sum = 0 (p loop skipped, coming from %19) or the final sum (from %23)
%37 = phi float [ 0.000000e+00, %19 ], [ %33, %23 ]
; j * m
%38 = mul nsw i64 %20, %13
; j * m + i
%39 = add nsw i64 %38, %18
; C[j * m + i] = sum
%40 = getelementptr inbounds float, ptr %2, i64 %39
store float %37, ptr %40, align 4, !tbaa !5
; j = j + 1
%41 = add nuw nsw i64 %20, 1
; exit the j loop when j + 1 == n (bottom test; n > 0 was checked via %9)
%42 = icmp eq i64 %41, %15
br i1 %42, label %43, label %19, !llvm.loop !12
43: ; preds = %36, %17
; i = i + 1
%44 = add nuw nsw i64 %18, 1
; exit the i loop when i + 1 == m (bottom test; m > 0 was checked via %7)
%45 = icmp eq i64 %44, %14
br i1 %45, label %46, label %17, !llvm.loop !13
46: ; preds = %43, %6
ret void
}
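The two LLVM IRs look very similar, except that the clang -O1 version uses 64-bit loop counters with `nsw`/`nuw` flags, `inbounds` GEPs, the `llvm.fmuladd` intrinsic instead of separate `fmul`/`fadd`, and bottom-tested loops guarded by up-front `> 0` checks.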
Need to create benchmarks again.
Thanks to @BiradarSiddhant02 for benchmarks. Perf has gotten better with #77
The reference multiplier (C code) is very slow compared to the c-lisp version. Both are compiled with -O1 optimization and linked with the main.o file (also compiled with -O1) at the end.
The c-lisp kernel of MMult_1x4_4 is comparatively worse.
Upon analysing the LLVM intermediate representation of both the C and c-lisp implementations of 1x4_4, the LLVM IR for c-lisp is at least 4x bigger than the LLVM IR for C in terms of lines of code.
The c-lisp kernel matches the C kernel in terms of performance; see commit https://github.com/chsasank/llama.lisp/pull/70/commits/ea7aa6fda5711513d44f96f8f2bdeae4698e79be
Fixed a mistake in the code where the for loops were dissimilar between the c-lisp and C kernels. Also fixed the matrix indexing mistake in the C kernel.
Clearly, #77 fixed the perf difference between C and c-lisp. We're now exactly the same speed as C. Closing the issue. Hopefully we don't have to bother about the perf of c-lisp anymore :)