chsasank / llama.lisp

Lisp dialect designed for HPC and AI
GNU Lesser General Public License v2.1
15 stars 6 forks source link

C-Lisp Lower GFlops vs C #72

Closed BiradarSiddhant02 closed 3 months ago

BiradarSiddhant02 commented 4 months ago

c-lisp vs c

chsasank commented 4 months ago

Observed from code in #70

chsasank commented 4 months ago

Observed that the difference is due to the order of the loop variables.

image

Still it's not the same.

chsasank commented 4 months ago

Here's the code we are comparing with each other:

; Naive triple-loop matrix multiply written in C-Lisp.
; For every i in [0, m) and j in [0, n) it stores
;     C[j*m + i] = sum over p in [0, k) of A[p*m + i] * B[j*k + p]
; A and B are only loaded from; C is only stored to.
(c-lisp

    (define ((__MMult1 void) (A (ptr float))
                            (B (ptr float))    
                            (C (ptr float))    
                            (m int)    
                            (n int)    
                            (k int))

        ; Loop counters, declared once at function scope.
        (declare i int)
        (declare j int)
        (declare p int)

        (for ((set i 0) (lt i m) (set i (add i 1)))
            (for ((set j 0) (lt j n) (set j (add j 1)))
                ; Accumulator for the (i, j) entry; reset before each p loop.
                (declare sum float)
                (set sum 0.0)
                (for ((set p 0) (lt p k) (set p (add p 1)))
                    ; sum += A[p * m + i] * B[j * k + p]
                    (set sum (fadd
                        (fmul 
                            (load (ptradd A (add (mul p m) i)))
                            (load (ptradd B (add (mul j k) p))))
                        sum)))
                ; C[j * m + i] = sum
                (store (ptradd C (add (mul j m) i)) sum)))
        (ret)))

and

/*
 * Reference matrix multiply (plain triple loop).
 *
 * For every i in [0, m) and j in [0, n) this stores
 *     C[j * m + i] = sum over p in [0, k) of A[p * m + i] * B[j * k + p]
 *
 * A and B are only read; each C element is written exactly once, after its
 * inner accumulation has finished. When k <= 0 the stored value is 0, and
 * when m <= 0 or n <= 0 nothing is written.
 */
void ref_mult(float* A, float* B, float* C, int m, int n, int k) {
    int row = 0;
    while (row < m) {
        int col = 0;
        while (col < n) {
            float acc = 0;
            int inner;
            /* Keep multiply and add in one expression, as in the original,
             * so the compiler's contraction choices are not affected. */
            for (inner = 0; inner < k; ++inner) {
                acc += A[inner * m + row] * B[col * k + inner];
            }
            C[col * m + row] = acc;
            ++col;
        }
        ++row;
    }
}
chsasank commented 4 months ago

LLVM IR without any optimization in C:

; ModuleID = 'refmult.c'
source_filename = "refmult.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: noinline nounwind optnone uwtable
define dso_local void @ref_mult(ptr noundef %0, ptr noundef %1, ptr noundef %2, i32 noundef %3, i32 noundef %4, i32 noundef %5) #0 {
  %7 = alloca ptr, align 8
  %8 = alloca ptr, align 8
  %9 = alloca ptr, align 8
  %10 = alloca i32, align 4
  %11 = alloca i32, align 4
  %12 = alloca i32, align 4
  %13 = alloca i32, align 4
  %14 = alloca i32, align 4
  %15 = alloca i32, align 4
  %16 = alloca float, align 4
  store ptr %0, ptr %7, align 8
  store ptr %1, ptr %8, align 8
  store ptr %2, ptr %9, align 8
  store i32 %3, ptr %10, align 4
  store i32 %4, ptr %11, align 4
  store i32 %5, ptr %12, align 4
  store i32 0, ptr %13, align 4
  br label %17

17:                                               ; preds = %69, %6
  %18 = load i32, ptr %13, align 4
  %19 = load i32, ptr %10, align 4
  %20 = icmp slt i32 %18, %19
  br i1 %20, label %21, label %72

21:                                               ; preds = %17
  store i32 0, ptr %14, align 4
  br label %22

22:                                               ; preds = %65, %21
  %23 = load i32, ptr %14, align 4
  %24 = load i32, ptr %11, align 4
  %25 = icmp slt i32 %23, %24
  br i1 %25, label %26, label %68

26:                                               ; preds = %22
  store float 0.000000e+00, ptr %16, align 4
  store i32 0, ptr %15, align 4
  br label %27

27:                                               ; preds = %52, %26
  %28 = load i32, ptr %15, align 4
  %29 = load i32, ptr %12, align 4
  %30 = icmp slt i32 %28, %29
  br i1 %30, label %31, label %55

31:                                               ; preds = %27
  %32 = load ptr, ptr %7, align 8
  %33 = load i32, ptr %15, align 4
  %34 = load i32, ptr %10, align 4
  %35 = mul nsw i32 %33, %34
  %36 = load i32, ptr %13, align 4
  %37 = add nsw i32 %35, %36
  %38 = sext i32 %37 to i64
  %39 = getelementptr inbounds float, ptr %32, i64 %38
  %40 = load float, ptr %39, align 4
  %41 = load ptr, ptr %8, align 8
  %42 = load i32, ptr %14, align 4
  %43 = load i32, ptr %12, align 4
  %44 = mul nsw i32 %42, %43
  %45 = load i32, ptr %15, align 4
  %46 = add nsw i32 %44, %45
  %47 = sext i32 %46 to i64
  %48 = getelementptr inbounds float, ptr %41, i64 %47
  %49 = load float, ptr %48, align 4
  %50 = load float, ptr %16, align 4
  %51 = call float @llvm.fmuladd.f32(float %40, float %49, float %50)
  store float %51, ptr %16, align 4
  br label %52

52:                                               ; preds = %31
  %53 = load i32, ptr %15, align 4
  %54 = add nsw i32 %53, 1
  store i32 %54, ptr %15, align 4
  br label %27, !llvm.loop !6

55:                                               ; preds = %27
  %56 = load float, ptr %16, align 4
  %57 = load ptr, ptr %9, align 8
  %58 = load i32, ptr %14, align 4
  %59 = load i32, ptr %10, align 4
  %60 = mul nsw i32 %58, %59
  %61 = load i32, ptr %13, align 4
  %62 = add nsw i32 %60, %61
  %63 = sext i32 %62 to i64
  %64 = getelementptr inbounds float, ptr %57, i64 %63
  store float %56, ptr %64, align 4
  br label %65

65:                                               ; preds = %55
  %66 = load i32, ptr %14, align 4
  %67 = add nsw i32 %66, 1
  store i32 %67, ptr %14, align 4
  br label %22, !llvm.loop !8

68:                                               ; preds = %22
  br label %69

69:                                               ; preds = %68
  %70 = load i32, ptr %13, align 4
  %71 = add nsw i32 %70, 1
  store i32 %71, ptr %13, align 4
  br label %17, !llvm.loop !9

72:                                               ; preds = %17
  ret void
}

; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare float @llvm.fmuladd.f32(float, float, float) #1

attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { nocallback nofree nosync nounwind readnone speculatable willreturn }

!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 7, !"frame-pointer", i32 2}
!5 = !{!"clang version 15.0.7"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.mustprogress"}
!8 = distinct !{!8, !7}
!9 = distinct !{!9, !7}

and the LLVM IR for C-Lisp:

; ModuleID = ""
target triple = "unknown-unknown-unknown"
target datalayout = ""

define void @"__MMult1"(float* %"A", float* %"B", float* %"C", i32 %"m", i32 %"n", i32 %"k")
{
alloca-bbmrqjzu:
  %"A.1" = alloca float*
  %"B.1" = alloca float*
  %"C.1" = alloca float*
  %"m.1" = alloca i32
  %"n.1" = alloca i32
  %"k.1" = alloca i32
  %"tmp_clisp-affwcftt" = alloca i32
  %"i" = alloca i32
  %"tmp_clisp.inp_0-mkkphcmp" = alloca i32
  %"tmp_clisp.inp_1-hbzklbpu" = alloca i32
  %"tmp_clisp.cond-piewakqq" = alloca i1
  %"tmp_clisp-omedjamq" = alloca i32
  %"j" = alloca i32
  %"tmp_clisp.inp_0-wfsoufeb" = alloca i32
  %"tmp_clisp.inp_1-ujctvsij" = alloca i32
  %"tmp_clisp.cond-nqepxmpx" = alloca i1
  %"tmp_clisp-aordgtmx" = alloca float
  %"sum.krxidkvg.gtbosduc" = alloca float
  %"tmp_clisp-jucjizah" = alloca i32
  %"p" = alloca i32
  %"tmp_clisp.inp_0-aiavcbuq" = alloca i32
  %"tmp_clisp.inp_1-jndmhbze" = alloca i32
  %"tmp_clisp.cond-eokbncpo" = alloca i1
  %"tmp_clisp.inp_0-alwgxswg" = alloca i32
  %"tmp_clisp.inp_1-bljojdzc" = alloca i32
  %"tmp_clisp.inp_0-zyaxkyfb" = alloca i32
  %"tmp_clisp.inp_1-mxzzloqp" = alloca i32
  %"tmp_clisp-zlkwxzlv" = alloca i32
  %"tmp_clisp-lsegeqfw" = alloca float*
  %"tmp_clisp.inp_0-tyjnueua" = alloca float
  %"tmp_clisp.inp_0-zwskayzm" = alloca i32
  %"tmp_clisp.inp_1-duvxmwwc" = alloca i32
  %"tmp_clisp.inp_0-xsgzizfs" = alloca i32
  %"tmp_clisp.inp_1-iwjecgmx" = alloca i32
  %"tmp_clisp-nmoirgcz" = alloca i32
  %"tmp_clisp-gjwitaar" = alloca float*
  %"tmp_clisp.inp_1-ubmlbnzu" = alloca float
  %"tmp_clisp.inp_0-kdbwpial" = alloca float
  %"tmp_clisp.inp_1-euvgwazb" = alloca float
  %"tmp_clisp-mbttzjym" = alloca float
  %"tmp_clisp.inp_0-hoxwqebl" = alloca i32
  %"tmp_clisp.inp_1-tnfibwti" = alloca i32
  %"tmp_clisp-qufnjzis" = alloca i32
  %"tmp_clisp.inp_0-ouourali" = alloca i32
  %"tmp_clisp.inp_1-hltiwqfw" = alloca i32
  %"tmp_clisp.inp_0-vlfjpvro" = alloca i32
  %"tmp_clisp.inp_1-gtwxqetc" = alloca i32
  %"tmp_clisp-iefldmrm" = alloca i32
  %"tmp_clisp.ptr-lvfdazjr" = alloca float*
  %"tmp_clisp.val-ljmupwse" = alloca float
  %"tmp_clisp-nvfiledd" = alloca float
  %"tmp_clisp.inp_0-ausjwnoe" = alloca i32
  %"tmp_clisp.inp_1-gxfjeqti" = alloca i32
  %"tmp_clisp-vdiojlyt" = alloca i32
  %"tmp_clisp.inp_0-hpakbkdl" = alloca i32
  %"tmp_clisp.inp_1-wiolysdf" = alloca i32
  %"tmp_clisp-hfomxqoq" = alloca i32
  br label %"entry-thzhmlgy"
entry-thzhmlgy:
  store float* %"A", float** %"A.1"
  store float* %"B", float** %"B.1"
  store float* %"C", float** %"C.1"
  store i32 %"m", i32* %"m.1"
  store i32 %"n", i32* %"n.1"
  store i32 %"k", i32* %"k.1"
  store i32 0, i32* %"tmp_clisp-affwcftt"
  %".15" = load i32, i32* %"tmp_clisp-affwcftt"
  store i32 %".15", i32* %"i"
  br label %"tmp_clisp.loop-tawppkmx"
tmp_clisp.loop-tawppkmx:
  %".18" = load i32, i32* %"i"
  store i32 %".18", i32* %"tmp_clisp.inp_0-mkkphcmp"
  %".20" = load i32, i32* %"m.1"
  store i32 %".20", i32* %"tmp_clisp.inp_1-hbzklbpu"
  %".22" = load i32, i32* %"tmp_clisp.inp_0-mkkphcmp"
  %".23" = load i32, i32* %"tmp_clisp.inp_1-hbzklbpu"
  %"tmp_clisp.cond-piewakqq.1" = icmp slt i32 %".22", %".23"
  store i1 %"tmp_clisp.cond-piewakqq.1", i1* %"tmp_clisp.cond-piewakqq"
  %".25" = load i1, i1* %"tmp_clisp.cond-piewakqq"
  br i1 %".25", label %"tmp_clisp.cont-hoaeiczv", label %"tmp_clisp.break-dcqlzqiq"
tmp_clisp.cont-hoaeiczv:
  store i32 0, i32* %"tmp_clisp-omedjamq"
  %".28" = load i32, i32* %"tmp_clisp-omedjamq"
  store i32 %".28", i32* %"j"
  br label %"tmp_clisp.loop-cjfdjqts"
tmp_clisp.loop-cjfdjqts:
  %".31" = load i32, i32* %"j"
  store i32 %".31", i32* %"tmp_clisp.inp_0-wfsoufeb"
  %".33" = load i32, i32* %"n.1"
  store i32 %".33", i32* %"tmp_clisp.inp_1-ujctvsij"
  %".35" = load i32, i32* %"tmp_clisp.inp_0-wfsoufeb"
  %".36" = load i32, i32* %"tmp_clisp.inp_1-ujctvsij"
  %"tmp_clisp.cond-nqepxmpx.1" = icmp slt i32 %".35", %".36"
  store i1 %"tmp_clisp.cond-nqepxmpx.1", i1* %"tmp_clisp.cond-nqepxmpx"
  %".38" = load i1, i1* %"tmp_clisp.cond-nqepxmpx"
  br i1 %".38", label %"tmp_clisp.cont-kobknley", label %"tmp_clisp.break-wjvptbxu"
tmp_clisp.cont-kobknley:
  store float              0x0, float* %"tmp_clisp-aordgtmx"
  %".41" = load float, float* %"tmp_clisp-aordgtmx"
  store float %".41", float* %"sum.krxidkvg.gtbosduc"
  store i32 0, i32* %"tmp_clisp-jucjizah"
  %".44" = load i32, i32* %"tmp_clisp-jucjizah"
  store i32 %".44", i32* %"p"
  br label %"tmp_clisp.loop-dokigiia"
tmp_clisp.loop-dokigiia:
  %".47" = load i32, i32* %"p"
  store i32 %".47", i32* %"tmp_clisp.inp_0-aiavcbuq"
  %".49" = load i32, i32* %"k.1"
  store i32 %".49", i32* %"tmp_clisp.inp_1-jndmhbze"
  %".51" = load i32, i32* %"tmp_clisp.inp_0-aiavcbuq"
  %".52" = load i32, i32* %"tmp_clisp.inp_1-jndmhbze"
  %"tmp_clisp.cond-eokbncpo.1" = icmp slt i32 %".51", %".52"
  store i1 %"tmp_clisp.cond-eokbncpo.1", i1* %"tmp_clisp.cond-eokbncpo"
  %".54" = load i1, i1* %"tmp_clisp.cond-eokbncpo"
  br i1 %".54", label %"tmp_clisp.cont-qrwqmrlh", label %"tmp_clisp.break-zkrgkvej"
tmp_clisp.cont-qrwqmrlh:
  %".56" = load i32, i32* %"p"
  store i32 %".56", i32* %"tmp_clisp.inp_0-alwgxswg"
  %".58" = load i32, i32* %"m.1"
  store i32 %".58", i32* %"tmp_clisp.inp_1-bljojdzc"
  %".60" = load i32, i32* %"tmp_clisp.inp_0-alwgxswg"
  %".61" = load i32, i32* %"tmp_clisp.inp_1-bljojdzc"
  %"tmp_clisp.inp_0-zyaxkyfb.1" = mul i32 %".60", %".61"
  store i32 %"tmp_clisp.inp_0-zyaxkyfb.1", i32* %"tmp_clisp.inp_0-zyaxkyfb"
  %".63" = load i32, i32* %"i"
  store i32 %".63", i32* %"tmp_clisp.inp_1-mxzzloqp"
  %".65" = load i32, i32* %"tmp_clisp.inp_0-zyaxkyfb"
  %".66" = load i32, i32* %"tmp_clisp.inp_1-mxzzloqp"
  %"tmp_clisp-zlkwxzlv.1" = add i32 %".65", %".66"
  store i32 %"tmp_clisp-zlkwxzlv.1", i32* %"tmp_clisp-zlkwxzlv"
  %".68" = load float*, float** %"A.1"
  %".69" = load i32, i32* %"tmp_clisp-zlkwxzlv"
  %".70" = getelementptr float, float* %".68", i32 %".69"
  store float* %".70", float** %"tmp_clisp-lsegeqfw"
  %".72" = load float*, float** %"tmp_clisp-lsegeqfw"
  %".73" = load float, float* %".72"
  store float %".73", float* %"tmp_clisp.inp_0-tyjnueua"
  %".75" = load i32, i32* %"j"
  store i32 %".75", i32* %"tmp_clisp.inp_0-zwskayzm"
  %".77" = load i32, i32* %"k.1"
  store i32 %".77", i32* %"tmp_clisp.inp_1-duvxmwwc"
  %".79" = load i32, i32* %"tmp_clisp.inp_0-zwskayzm"
  %".80" = load i32, i32* %"tmp_clisp.inp_1-duvxmwwc"
  %"tmp_clisp.inp_0-xsgzizfs.1" = mul i32 %".79", %".80"
  store i32 %"tmp_clisp.inp_0-xsgzizfs.1", i32* %"tmp_clisp.inp_0-xsgzizfs"
  %".82" = load i32, i32* %"p"
  store i32 %".82", i32* %"tmp_clisp.inp_1-iwjecgmx"
  %".84" = load i32, i32* %"tmp_clisp.inp_0-xsgzizfs"
  %".85" = load i32, i32* %"tmp_clisp.inp_1-iwjecgmx"
  %"tmp_clisp-nmoirgcz.1" = add i32 %".84", %".85"
  store i32 %"tmp_clisp-nmoirgcz.1", i32* %"tmp_clisp-nmoirgcz"
  %".87" = load float*, float** %"B.1"
  %".88" = load i32, i32* %"tmp_clisp-nmoirgcz"
  %".89" = getelementptr float, float* %".87", i32 %".88"
  store float* %".89", float** %"tmp_clisp-gjwitaar"
  %".91" = load float*, float** %"tmp_clisp-gjwitaar"
  %".92" = load float, float* %".91"
  store float %".92", float* %"tmp_clisp.inp_1-ubmlbnzu"
  %".94" = load float, float* %"tmp_clisp.inp_0-tyjnueua"
  %".95" = load float, float* %"tmp_clisp.inp_1-ubmlbnzu"
  %"tmp_clisp.inp_0-kdbwpial.1" = fmul float %".94", %".95"
  store float %"tmp_clisp.inp_0-kdbwpial.1", float* %"tmp_clisp.inp_0-kdbwpial"
  %".97" = load float, float* %"sum.krxidkvg.gtbosduc"
  store float %".97", float* %"tmp_clisp.inp_1-euvgwazb"
  %".99" = load float, float* %"tmp_clisp.inp_0-kdbwpial"
  %".100" = load float, float* %"tmp_clisp.inp_1-euvgwazb"
  %"tmp_clisp-mbttzjym.1" = fadd float %".99", %".100"
  store float %"tmp_clisp-mbttzjym.1", float* %"tmp_clisp-mbttzjym"
  %".102" = load float, float* %"tmp_clisp-mbttzjym"
  store float %".102", float* %"sum.krxidkvg.gtbosduc"
  %".104" = load i32, i32* %"p"
  store i32 %".104", i32* %"tmp_clisp.inp_0-hoxwqebl"
  store i32 1, i32* %"tmp_clisp.inp_1-tnfibwti"
  %".107" = load i32, i32* %"tmp_clisp.inp_0-hoxwqebl"
  %".108" = load i32, i32* %"tmp_clisp.inp_1-tnfibwti"
  %"tmp_clisp-qufnjzis.1" = add i32 %".107", %".108"
  store i32 %"tmp_clisp-qufnjzis.1", i32* %"tmp_clisp-qufnjzis"
  %".110" = load i32, i32* %"tmp_clisp-qufnjzis"
  store i32 %".110", i32* %"p"
  br label %"tmp_clisp.loop-dokigiia"
tmp_clisp.break-zkrgkvej:
  %".113" = load i32, i32* %"j"
  store i32 %".113", i32* %"tmp_clisp.inp_0-ouourali"
  %".115" = load i32, i32* %"m.1"
  store i32 %".115", i32* %"tmp_clisp.inp_1-hltiwqfw"
  %".117" = load i32, i32* %"tmp_clisp.inp_0-ouourali"
  %".118" = load i32, i32* %"tmp_clisp.inp_1-hltiwqfw"
  %"tmp_clisp.inp_0-vlfjpvro.1" = mul i32 %".117", %".118"
  store i32 %"tmp_clisp.inp_0-vlfjpvro.1", i32* %"tmp_clisp.inp_0-vlfjpvro"
  %".120" = load i32, i32* %"i"
  store i32 %".120", i32* %"tmp_clisp.inp_1-gtwxqetc"
  %".122" = load i32, i32* %"tmp_clisp.inp_0-vlfjpvro"
  %".123" = load i32, i32* %"tmp_clisp.inp_1-gtwxqetc"
  %"tmp_clisp-iefldmrm.1" = add i32 %".122", %".123"
  store i32 %"tmp_clisp-iefldmrm.1", i32* %"tmp_clisp-iefldmrm"
  %".125" = load float*, float** %"C.1"
  %".126" = load i32, i32* %"tmp_clisp-iefldmrm"
  %".127" = getelementptr float, float* %".125", i32 %".126"
  store float* %".127", float** %"tmp_clisp.ptr-lvfdazjr"
  %".129" = load float, float* %"sum.krxidkvg.gtbosduc"
  store float %".129", float* %"tmp_clisp.val-ljmupwse"
  %".131" = load float*, float** %"tmp_clisp.ptr-lvfdazjr"
  %".132" = load float, float* %"tmp_clisp.val-ljmupwse"
  store float %".132", float* %".131"
  %".134" = load float, float* %"tmp_clisp.val-ljmupwse"
  store float %".134", float* %"tmp_clisp-nvfiledd"
  %".136" = load i32, i32* %"j"
  store i32 %".136", i32* %"tmp_clisp.inp_0-ausjwnoe"
  store i32 1, i32* %"tmp_clisp.inp_1-gxfjeqti"
  %".139" = load i32, i32* %"tmp_clisp.inp_0-ausjwnoe"
  %".140" = load i32, i32* %"tmp_clisp.inp_1-gxfjeqti"
  %"tmp_clisp-vdiojlyt.1" = add i32 %".139", %".140"
  store i32 %"tmp_clisp-vdiojlyt.1", i32* %"tmp_clisp-vdiojlyt"
  %".142" = load i32, i32* %"tmp_clisp-vdiojlyt"
  store i32 %".142", i32* %"j"
  br label %"tmp_clisp.loop-cjfdjqts"
tmp_clisp.break-wjvptbxu:
  %".145" = load i32, i32* %"i"
  store i32 %".145", i32* %"tmp_clisp.inp_0-hpakbkdl"
  store i32 1, i32* %"tmp_clisp.inp_1-wiolysdf"
  %".148" = load i32, i32* %"tmp_clisp.inp_0-hpakbkdl"
  %".149" = load i32, i32* %"tmp_clisp.inp_1-wiolysdf"
  %"tmp_clisp-hfomxqoq.1" = add i32 %".148", %".149"
  store i32 %"tmp_clisp-hfomxqoq.1", i32* %"tmp_clisp-hfomxqoq"
  %".151" = load i32, i32* %"tmp_clisp-hfomxqoq"
  store i32 %".151", i32* %"i"
  br label %"tmp_clisp.loop-tawppkmx"
tmp_clisp.break-dcqlzqiq:
  br label %"tmp_clisp.ret_lbl-guamijsj"
tmp_clisp.ret_lbl-guamijsj:
  ret void
}
chsasank commented 4 months ago

We're creating a lot of temporary variables with alloca in c-lisp, while C is not. Does it have to be like this? @GlowingScrewdriver.

This is not a c-lisp problem. Let's look at the relevant brilisp and see if it matches the LLVM.

Some more observations:

  1. There are a lot of `id` instructions. Can we eliminate them?
(brilisp
  (define ((__MMult1 void)
           (A (ptr float))
           (B (ptr float))
           (C (ptr float))
           (m int)
           (n int)
           (k int))

    ; i = 0
    (set (tmp_clisp-siugxvzj int) (const 0))
    (set (i int) (id tmp_clisp-siugxvzj))

    (label tmp_clisp.loop-ktqhbbhy)

     ; i < m
    (set (tmp_clisp.inp_0-bolfwpgw int) (id i))
    (set (tmp_clisp.inp_1-kcqayyyj int) (id m))
    (set (tmp_clisp.cond-ncpwvzyj bool)
         (lt tmp_clisp.inp_0-bolfwpgw
             tmp_clisp.inp_1-kcqayyyj))

    ; continue or break check
    (br tmp_clisp.cond-ncpwvzyj
        tmp_clisp.cont-xhhzwymo
        tmp_clisp.break-umrpdgwz)

    (label tmp_clisp.cont-xhhzwymo)

    ; j = 0
    (set (tmp_clisp-fuexemot int) (const 0))
    (set (j int) (id tmp_clisp-fuexemot))

    (label tmp_clisp.loop-ortsybnq)

    ; j < n
    (set (tmp_clisp.inp_0-cawawdft int) (id j))
    (set (tmp_clisp.inp_1-nbsonjyo int) (id n))
    (set (tmp_clisp.cond-ytxoncmc bool)
         (lt tmp_clisp.inp_0-cawawdft
             tmp_clisp.inp_1-nbsonjyo))

    ; continue or break check
    (br tmp_clisp.cond-ytxoncmc
        tmp_clisp.cont-doskgrds
        tmp_clisp.break-jrptpggi)

    (label tmp_clisp.cont-doskgrds)

    ; sum = 0
    (set (tmp_clisp-gxsgwjwa float) (const 0))
    (set (sum.ntxblfwd.dejhoqjr float)
         (id tmp_clisp-gxsgwjwa))

    ; p = 0
    (set (tmp_clisp-nqacoxyw int) (const 0))
    (set (p int) (id tmp_clisp-nqacoxyw))

    (label tmp_clisp.loop-yzbmigkv)

    ; p < k
    (set (tmp_clisp.inp_0-nbhjaaqn int) (id p))
    (set (tmp_clisp.inp_1-jgaywwhk int) (id k))
    (set (tmp_clisp.cond-peyiylke bool)
         (lt tmp_clisp.inp_0-nbhjaaqn
             tmp_clisp.inp_1-jgaywwhk))

    ; break check
    (br tmp_clisp.cond-peyiylke
        tmp_clisp.cont-vzonxxlp
        tmp_clisp.break-lbsddwuy)
    (label tmp_clisp.cont-vzonxxlp)

     ; p * m
    (set (tmp_clisp.inp_0-ohpgvqtb int) (id p))
    (set (tmp_clisp.inp_1-jbdqduba int) (id m))
    (set (tmp_clisp.inp_0-zjvsemeb int)
         (mul tmp_clisp.inp_0-ohpgvqtb
              tmp_clisp.inp_1-jbdqduba))

    ; p * m + i
    (set (tmp_clisp.inp_1-mfejnidx int) (id i))
    (set (tmp_clisp-mxaqtskz int)
         (add tmp_clisp.inp_0-zjvsemeb
              tmp_clisp.inp_1-mfejnidx))

    ; A[p*m +i]
    (set (tmp_clisp-bquzoawz (ptr float))
         (ptradd A tmp_clisp-mxaqtskz))
    (set (tmp_clisp.inp_0-lakwncit float)
         (load tmp_clisp-bquzoawz))

     ; j*k
    (set (tmp_clisp.inp_0-wjuhxkdq int) (id j))
    (set (tmp_clisp.inp_1-vgfluxlc int) (id k))
    (set (tmp_clisp.inp_0-nsctpjow int)
         (mul tmp_clisp.inp_0-wjuhxkdq
              tmp_clisp.inp_1-vgfluxlc))

    ; j*k + p
    (set (tmp_clisp.inp_1-zjjooazg int) (id p))
    (set (tmp_clisp-shaqabnc int)
         (add tmp_clisp.inp_0-nsctpjow
              tmp_clisp.inp_1-zjjooazg))

    ; B[j*k + p]
    (set (tmp_clisp-thqgkynb (ptr float))
         (ptradd B tmp_clisp-shaqabnc))
    (set (tmp_clisp.inp_1-ouazzsfu float)
         (load tmp_clisp-thqgkynb))

    ; sum = sum + A[p*m +i] *  B[j*k + p]
    (set (tmp_clisp.inp_0-etugbivl float)
         (fmul tmp_clisp.inp_0-lakwncit
               tmp_clisp.inp_1-ouazzsfu))
    (set (tmp_clisp.inp_1-oruvfknq float)
         (id sum.ntxblfwd.dejhoqjr))
    (set (tmp_clisp-yaqtggmm float)
         (fadd tmp_clisp.inp_0-etugbivl
               tmp_clisp.inp_1-oruvfknq))
    (set (sum.ntxblfwd.dejhoqjr float)
         (id tmp_clisp-yaqtggmm))

    ; p = p + 1
    (set (tmp_clisp.inp_0-vtakxodi int) (id p))
    (set (tmp_clisp.inp_1-jqlswect int) (const 1))
    (set (tmp_clisp-mvputqgg int)
         (add tmp_clisp.inp_0-vtakxodi
              tmp_clisp.inp_1-jqlswect))
    (set (p int) (id tmp_clisp-mvputqgg))

    (jmp tmp_clisp.loop-yzbmigkv)

    ; j * m
    (label tmp_clisp.break-lbsddwuy)
    (set (tmp_clisp.inp_0-iuslubbd int) (id j))
    (set (tmp_clisp.inp_1-tjkrlbgn int) (id m))
    (set (tmp_clisp.inp_0-cgzmfptq int)
         (mul tmp_clisp.inp_0-iuslubbd
              tmp_clisp.inp_1-tjkrlbgn))

    ; j * m + i
    (set (tmp_clisp.inp_1-dhxxntaj int) (id i))
    (set (tmp_clisp-nttorwaw int)
         (add tmp_clisp.inp_0-cgzmfptq
              tmp_clisp.inp_1-dhxxntaj))

    ; C[j*m + i] = sum
    (set (tmp_clisp.ptr-kwxfpvuv (ptr float))
         (ptradd C tmp_clisp-nttorwaw))
    (set (tmp_clisp.val-dfgbkymx float)
         (id sum.ntxblfwd.dejhoqjr))
    (store tmp_clisp.ptr-kwxfpvuv
           tmp_clisp.val-dfgbkymx)

     ; j = j + 1
    (set (tmp_clisp-dpuizpsd float)
         (id tmp_clisp.val-dfgbkymx))
    (set (tmp_clisp.inp_0-qnpimfwr int) (id j))
    (set (tmp_clisp.inp_1-lwkuwvyg int) (const 1))
    (set (tmp_clisp-ewewatde int)
         (add tmp_clisp.inp_0-qnpimfwr
              tmp_clisp.inp_1-lwkuwvyg))
    (set (j int) (id tmp_clisp-ewewatde))

    (jmp tmp_clisp.loop-ortsybnq)

    (label tmp_clisp.break-jrptpggi)

    ; i = i + 1
    (set (tmp_clisp.inp_0-ssmzexee int) (id i))
    (set (tmp_clisp.inp_1-uptnyjrh int) (const 1))
    (set (tmp_clisp-evuknbie int)
         (add tmp_clisp.inp_0-ssmzexee
              tmp_clisp.inp_1-uptnyjrh))
    (set (i int) (id tmp_clisp-evuknbie))

    (jmp tmp_clisp.loop-ktqhbbhy)
    (label tmp_clisp.break-umrpdgwz)

    ; ret
    (jmp tmp_clisp.ret_lbl-dzendtoe)
    (label tmp_clisp.ret_lbl-dzendtoe)
    (ret)))

BRILisp would be quite readable if it were not for the `id` instructions.

chsasank commented 4 months ago

Relevant c-lisp code:

    def gen_set_expr(self, expr, res_sym):
        """Generate the instruction list for a ``set`` expression.

        The value sub-expression ``expr[2]`` is generated via ``gen_expr``
        with ``res_sym`` as its result symbol. A final ``set`` instruction
        then copies ``res_sym`` into the scoped target variable using ``id``.

        Args:
            expr: The ``set`` expression; ``expr[1]`` is the target name and
                ``expr[2]`` is the value expression.
            res_sym: Symbol that receives the value expression's result.

        Returns:
            The instruction list from ``gen_expr`` with the copying ``set``
            instruction appended.

        Raises:
            CodegenError: If ``verify_shape`` rejects ``expr``.
        """
        shape_ok = verify_shape(expr, [str, str, None])
        if not shape_ok:
            raise CodegenError(f"Bad set expression: {expr}")

        # Resolve the target to its scoped name before generating the value,
        # keeping the same lookup order as before.
        target = self.scoped_lookup(expr[1])
        instructions = self.gen_expr(expr[2], res_sym=res_sym)

        # Copy the computed result into the target variable.
        typed_target = [target, self.symbol_types[target]]
        instructions.append(["set", typed_target, ["id", res_sym]])
        return instructions

The biggest reason we have `id` is that `res_sym` is an input to the gen-expression functions in c-lisp.

chsasank commented 4 months ago

Newer Brilisp and LLVM after running with #77:

(brilisp
  (define ((__MMult1 void)
           (A (ptr float))
           (B (ptr float))
           (C (ptr float))
           (m int)
           (n int)
           (k int))

    (set (tmp_clisp-ivjrdagl int) (const 0))
    (set (i int) (id tmp_clisp-ivjrdagl))

    (label tmp_clisp.loop-ymbbjefd)

    (set (tmp_clisp-crvgjtqq bool) (lt i m))
    (br tmp_clisp-crvgjtqq
        tmp_clisp.cont-elpzoqye
        tmp_clisp.break-eebkjiem)

    (label tmp_clisp.cont-elpzoqye)
    (set (tmp_clisp-kgolqdma int) (const 0))
    (set (j int) (id tmp_clisp-kgolqdma))

    (label tmp_clisp.loop-kpdkibjn)
    (set (tmp_clisp-ieeuniew bool) (lt j n))

    (br tmp_clisp-ieeuniew
        tmp_clisp.cont-tvkgjhho
        tmp_clisp.break-qxwqorwq)

    (label tmp_clisp.cont-tvkgjhho)

    (set (tmp_clisp-quwsjetu float) (const 0))
    (set (sum.qapyumhy.cuycchkx float)
         (id tmp_clisp-quwsjetu))

    (set (tmp_clisp-bivorips int) (const 0))
    (set (p int) (id tmp_clisp-bivorips))

    (label tmp_clisp.loop-mtgezhgu)
    (set (tmp_clisp-bqmzektk bool) (lt p k))
    (br tmp_clisp-bqmzektk
        tmp_clisp.cont-eocirseo
        tmp_clisp.break-hwdbyjjm)
    (label tmp_clisp.cont-eocirseo)
    (set (tmp_clisp-spvyhcsp int) (mul p m))
    (set (tmp_clisp-unziitke int)
         (add tmp_clisp-spvyhcsp i))
    (set (tmp_clisp-vbbwbfiq (ptr float))
         (ptradd A tmp_clisp-unziitke))
    (set (tmp_clisp-idxigtop float)
         (load tmp_clisp-vbbwbfiq))
    (set (tmp_clisp-jpalikxo int) (mul j k))
    (set (tmp_clisp-ymtqsshy int)
         (add tmp_clisp-jpalikxo p))
    (set (tmp_clisp-tkzvojmi (ptr float))
         (ptradd B tmp_clisp-ymtqsshy))
    (set (tmp_clisp-vkldsdyb float)
         (load tmp_clisp-tkzvojmi))
    (set (tmp_clisp-fvvzxxpp float)
         (fmul tmp_clisp-idxigtop tmp_clisp-vkldsdyb))
    (set (tmp_clisp-brccmkmu float)
         (fadd tmp_clisp-fvvzxxpp sum.qapyumhy.cuycchkx))
    (set (sum.qapyumhy.cuycchkx float)
         (id tmp_clisp-brccmkmu))
    (set (tmp_clisp-qjhrkumd int) (const 1))
    (set (tmp_clisp-zxhnlmlp int)
         (add p tmp_clisp-qjhrkumd))
    (set (p int) (id tmp_clisp-zxhnlmlp))
    (jmp tmp_clisp.loop-mtgezhgu)
    (label tmp_clisp.break-hwdbyjjm)
    (set (tmp_clisp-fgymqpop int) (mul j m))
    (set (tmp_clisp-pthkcwvn int)
         (add tmp_clisp-fgymqpop i))
    (set (tmp_clisp-bjjoeriq (ptr float))
         (ptradd C tmp_clisp-pthkcwvn))
    (store tmp_clisp-bjjoeriq sum.qapyumhy.cuycchkx)
    (set (tmp_clisp-nenxsrsr int) (const 1))
    (set (tmp_clisp-bfwyqktl int)
         (add j tmp_clisp-nenxsrsr))
    (set (j int) (id tmp_clisp-bfwyqktl))
    (jmp tmp_clisp.loop-kpdkibjn)
    (label tmp_clisp.break-qxwqorwq)
    (set (tmp_clisp-mejehkwk int) (const 1))
    (set (tmp_clisp-dbgttfsb int)
         (add i tmp_clisp-mejehkwk))
    (set (i int) (id tmp_clisp-dbgttfsb))
    (jmp tmp_clisp.loop-ymbbjefd)
    (label tmp_clisp.break-eebkjiem)
    (jmp tmp_clisp.ret_lbl-igrtxgmn)
    (label tmp_clisp.ret_lbl-igrtxgmn)
    (ret)))

and LLVM with -O1:

; ModuleID = 'MMulti_new.ll'
source_filename = "MMulti_new.ll"
target triple = "unknown-unknown-unknown"

; Function Attrs: argmemonly nofree norecurse nosync nounwind
define void @__MMult1(float* nocapture readonly %A, float* nocapture readonly %B, float* nocapture writeonly %C, i32 %m, i32 %n, i32 %k) local_unnamed_addr #0 {
alloca-ovaaulgt:
  %tmp_clisp-ifyhhxbd.118 = icmp sgt i32 %m, 0
  br i1 %tmp_clisp-ifyhhxbd.118, label %tmp_clisp.loop-inohvwsi.preheader.lr.ph, label %tmp_clisp.ret_lbl-yfiookbt

tmp_clisp.loop-inohvwsi.preheader.lr.ph:          ; preds = %alloca-ovaaulgt
  %tmp_clisp-eghmoqva.116 = icmp sgt i32 %n, 0
  %tmp_clisp-smtahyaw.113 = icmp sgt i32 %k, 0
  br label %tmp_clisp.loop-inohvwsi.preheader

tmp_clisp.loop-inohvwsi.preheader:                ; preds = %tmp_clisp.loop-inohvwsi.preheader.lr.ph, %tmp_clisp.break-onhrayvl
  %i.019 = phi i32 [ 0, %tmp_clisp.loop-inohvwsi.preheader.lr.ph ], [ %tmp_clisp-mcjogwyb.1, %tmp_clisp.break-onhrayvl ]
  br i1 %tmp_clisp-eghmoqva.116, label %tmp_clisp.loop-safbgzsu.preheader, label %tmp_clisp.break-onhrayvl

tmp_clisp.loop-safbgzsu.preheader:                ; preds = %tmp_clisp.loop-inohvwsi.preheader, %tmp_clisp.break-tmohrxbn
  %j.017 = phi i32 [ %tmp_clisp-urcljuex.1, %tmp_clisp.break-tmohrxbn ], [ 0, %tmp_clisp.loop-inohvwsi.preheader ]
  br i1 %tmp_clisp-smtahyaw.113, label %tmp_clisp.cont-gjjqluld.lr.ph, label %tmp_clisp.break-tmohrxbn

tmp_clisp.cont-gjjqluld.lr.ph:                    ; preds = %tmp_clisp.loop-safbgzsu.preheader
  %tmp_clisp-jqhxwtve.1 = mul i32 %j.017, %k
  br label %tmp_clisp.cont-gjjqluld

tmp_clisp.cont-gjjqluld:                          ; preds = %tmp_clisp.cont-gjjqluld.lr.ph, %tmp_clisp.cont-gjjqluld
  %sum.mlbgqafz.komgfkcq.015 = phi float [ 0.000000e+00, %tmp_clisp.cont-gjjqluld.lr.ph ], [ %tmp_clisp-mmntcjil.1, %tmp_clisp.cont-gjjqluld ]
  %p.014 = phi i32 [ 0, %tmp_clisp.cont-gjjqluld.lr.ph ], [ %tmp_clisp-bdmhpmgc.1, %tmp_clisp.cont-gjjqluld ]
  %tmp_clisp-atfslbhe.1 = mul i32 %p.014, %m
  %tmp_clisp-xqxkitro.1 = add i32 %tmp_clisp-atfslbhe.1, %i.019
  %0 = sext i32 %tmp_clisp-xqxkitro.1 to i64
  %.52 = getelementptr float, float* %A, i64 %0
  %.55 = load float, float* %.52, align 4
  %tmp_clisp-rauulcuv.1 = add i32 %p.014, %tmp_clisp-jqhxwtve.1
  %1 = sext i32 %tmp_clisp-rauulcuv.1 to i64
  %.65 = getelementptr float, float* %B, i64 %1
  %.68 = load float, float* %.65, align 4
  %tmp_clisp-ruhlchfv.1 = fmul float %.55, %.68
  %tmp_clisp-mmntcjil.1 = fadd float %sum.mlbgqafz.komgfkcq.015, %tmp_clisp-ruhlchfv.1
  %tmp_clisp-bdmhpmgc.1 = add nuw nsw i32 %p.014, 1
  %tmp_clisp-smtahyaw.1 = icmp slt i32 %tmp_clisp-bdmhpmgc.1, %k
  br i1 %tmp_clisp-smtahyaw.1, label %tmp_clisp.cont-gjjqluld, label %tmp_clisp.break-tmohrxbn

tmp_clisp.break-tmohrxbn:                         ; preds = %tmp_clisp.cont-gjjqluld, %tmp_clisp.loop-safbgzsu.preheader
  %sum.mlbgqafz.komgfkcq.0.lcssa = phi float [ 0.000000e+00, %tmp_clisp.loop-safbgzsu.preheader ], [ %tmp_clisp-mmntcjil.1, %tmp_clisp.cont-gjjqluld ]
  %tmp_clisp-iwpbmoqc.1 = mul i32 %j.017, %m
  %tmp_clisp-brwzagwq.1 = add i32 %tmp_clisp-iwpbmoqc.1, %i.019
  %2 = sext i32 %tmp_clisp-brwzagwq.1 to i64
  %.93 = getelementptr float, float* %C, i64 %2
  store float %sum.mlbgqafz.komgfkcq.0.lcssa, float* %.93, align 4
  %tmp_clisp-urcljuex.1 = add nuw nsw i32 %j.017, 1
  %tmp_clisp-eghmoqva.1 = icmp slt i32 %tmp_clisp-urcljuex.1, %n
  br i1 %tmp_clisp-eghmoqva.1, label %tmp_clisp.loop-safbgzsu.preheader, label %tmp_clisp.break-onhrayvl

tmp_clisp.break-onhrayvl:                         ; preds = %tmp_clisp.break-tmohrxbn, %tmp_clisp.loop-inohvwsi.preheader
  %tmp_clisp-mcjogwyb.1 = add nuw nsw i32 %i.019, 1
  %tmp_clisp-ifyhhxbd.1 = icmp slt i32 %tmp_clisp-mcjogwyb.1, %m
  br i1 %tmp_clisp-ifyhhxbd.1, label %tmp_clisp.loop-inohvwsi.preheader, label %tmp_clisp.ret_lbl-yfiookbt

tmp_clisp.ret_lbl-yfiookbt:                       ; preds = %tmp_clisp.break-onhrayvl, %alloca-ovaaulgt
  ret void
}

attributes #0 = { argmemonly nofree norecurse nosync nounwind }

And the ref_mult C code with -O1:

; ModuleID = 'refmult.c'
source_filename = "refmult.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: argmemonly nofree nosync nounwind uwtable
; ref_mult(A = %0, B = %1, C = %2, m = %3, n = %4, k = %5)
; Triple loop over i, j, p: accumulates A[p * m + i] * B[j * k + p] over p
; and stores the result to C[j * m + i]. Loop counters are carried as i64.
define dso_local void @ref_mult(ptr nocapture noundef readonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5) local_unnamed_addr #0 {
  ; m > 0, otherwise return immediately
  %7 = icmp sgt i32 %3, 0
  br i1 %7, label %8, label %46

8:                                                ; preds = %6
  ; n > 0 and k > 0 are computed once and reused as guards for the j and p loops
  %9 = icmp sgt i32 %4, 0
  %10 = icmp sgt i32 %5, 0
  ; sign-extended m and k feed the index arithmetic (p * m, j * k, j * m)
  %11 = sext i32 %3 to i64
  %12 = sext i32 %5 to i64
  %13 = sext i32 %3 to i64
  ; zero-extended m, n, k are the trip counts compared against at the loop bottoms
  %14 = zext i32 %3 to i64
  %15 = zext i32 %4 to i64
  %16 = zext i32 %5 to i64
  br label %17

17:                                               ; preds = %8, %43
  ; i = 0 or i + 1
  %18 = phi i64 [ 0, %8 ], [ %44, %43 ]
  ; skip the j loop entirely when n <= 0
  br i1 %9, label %19, label %43

19:                                               ; preds = %17, %36
  ; j = 0 or j + 1
  %20 = phi i64 [ %41, %36 ], [ 0, %17 ]
  ; skip the p loop when k <= 0 (sum then stays 0)
  br i1 %10, label %21, label %36

21:                                               ; preds = %19
  ; j * k, computed once per j iteration outside the p loop
  %22 = mul nsw i64 %20, %12
  br label %23

23:                                               ; preds = %21, %23
  ; p = 0 or p + 1
  %24 = phi i64 [ 0, %21 ], [ %34, %23 ]
  ; sum = 0 or the fmuladd result of the previous iteration
  %25 = phi float [ 0.000000e+00, %21 ], [ %33, %23 ]
  ; p * m + i
  %26 = mul nsw i64 %24, %11
  %27 = add nsw i64 %26, %18
  ; A[p * m + i]
  %28 = getelementptr inbounds float, ptr %0, i64 %27
  %29 = load float, ptr %28, align 4, !tbaa !5
  ; p + j * k
  %30 = add nsw i64 %24, %22
  ; B[j * k + p]
  %31 = getelementptr inbounds float, ptr %1, i64 %30
  %32 = load float, ptr %31, align 4, !tbaa !5
  ; sum = A[p * m + i] * B[j * k + p] + sum, as a single multiply-add intrinsic
  %33 = tail call float @llvm.fmuladd.f32(float %29, float %32, float %25)
  ; p + 1
  %34 = add nuw nsw i64 %24, 1
  ; leave the p loop when p + 1 == k
  %35 = icmp eq i64 %34, %16
  br i1 %35, label %36, label %23, !llvm.loop !9

36:                                               ; preds = %23, %19
  ; sum = 0 (p loop skipped) or the final accumulated value
  %37 = phi float [ 0.000000e+00, %19 ], [ %33, %23 ]
  ; j * m + i
  %38 = mul nsw i64 %20, %13
  %39 = add nsw i64 %38, %18
  ; C[j * m + i] = sum
  %40 = getelementptr inbounds float, ptr %2, i64 %39
  store float %37, ptr %40, align 4, !tbaa !5
  ; j + 1
  %41 = add nuw nsw i64 %20, 1
  ; leave the j loop when j + 1 == n
  %42 = icmp eq i64 %41, %15
  br i1 %42, label %43, label %19, !llvm.loop !12

43:                                               ; preds = %36, %17
  ; i + 1
  %44 = add nuw nsw i64 %18, 1
  ; leave the i loop when i + 1 == m
  %45 = icmp eq i64 %44, %14
  br i1 %45, label %46, label %17, !llvm.loop !13

46:                                               ; preds = %43, %6
  ret void
}

; Function Attrs: mustprogress nocallback nofree nosync nounwind readnone speculatable willreturn
declare float @llvm.fmuladd.f32(float, float, float) #1

attributes #0 = { argmemonly nofree nosync nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind readnone speculatable willreturn }
chsasank commented 4 months ago

Added comments for LLVM code from sexp mult. Ran mem2reg alone.

opt -passes=mem2reg -S MMulti_new.ll > MMulti_new_mem2reg.ll
; ModuleID = 'MMulti_new.ll'
source_filename = "MMulti_new.ll"
target triple = "unknown-unknown-unknown"

; __MMult1(A, B, C, m, n, k)
; Triple loop over i, j, p: accumulates A[p * m + i] * B[j * k + p] over p
; and stores the result to C[j * m + i]. Each loop tests its condition in the
; header block and all index arithmetic is done in i32.
define void @__MMult1(float* %A, float* %B, float* %C, i32 %m, i32 %n, i32 %k) {
alloca-yobpmfwf:
  br label %entry-nyiytwyy

entry-nyiytwyy:                                   ; preds = %alloca-yobpmfwf
  br label %sym.loop-cqatlhqk

sym.loop-cqatlhqk:                          ; preds = %sym.break-enfknquz, %entry-nyiytwyy
  ; i = 0 or i + 1
  %i.0 = phi i32 [ 0, %entry-nyiytwyy ], [ %sym-aimutfvh.1, %sym.break-enfknquz ]
  ; i < m
  %sym-zqlnadit.1 = icmp slt i32 %i.0, %m
  br i1 %sym-zqlnadit.1, label %sym.cont-gtzeinpp, label %sym.break-sfwvbees

sym.cont-gtzeinpp:                          ; preds = %sym.loop-cqatlhqk
  br label %sym.loop-ixsprijw

sym.loop-ixsprijw:                          ; preds = %sym.break-tlhnsjzq, %sym.cont-gtzeinpp
  ; j = 0 or j + 1
  %j.0 = phi i32 [ 0, %sym.cont-gtzeinpp ], [ %sym-vschuphl.1, %sym.break-tlhnsjzq ]
  ; j < n
  %sym-erdqidvw.1 = icmp slt i32 %j.0, %n
  br i1 %sym-erdqidvw.1, label %sym.cont-lkreoyvv, label %sym.break-enfknquz

sym.cont-lkreoyvv:                          ; preds = %sym.loop-ixsprijw
  br label %sym.loop-uuuiebvx

sym.loop-uuuiebvx:                          ; preds = %sym.cont-wjcvnepx, %sym.cont-lkreoyvv
  ; sum = 0 or sum + A[i + p * m] * B[j * k + p] from the previous iteration
  %sum.ixwbxjcu.oiseueuu.0 = phi float [ 0.000000e+00, %sym.cont-lkreoyvv ], [ %sym-cmhiaxws.1, %sym.cont-wjcvnepx ]
  ; p = 0 or p + 1
  %p.0 = phi i32 [ 0, %sym.cont-lkreoyvv ], [ %sym-ahtoruef.1, %sym.cont-wjcvnepx ]
  ; p < k
  %sym-xivbpzih.1 = icmp slt i32 %p.0, %k
  br i1 %sym-xivbpzih.1, label %sym.cont-wjcvnepx, label %sym.break-tlhnsjzq

sym.cont-wjcvnepx:                          ; preds = %sym.loop-uuuiebvx
  ; p * m
  %sym-gheyvzkh.1 = mul i32 %p.0, %m
  ; i + p * m
  %sym-mlumjxqw.1 = add i32 %sym-gheyvzkh.1, %i.0
  ; &A[i + p * m]
  %.52 = getelementptr float, float* %A, i32 %sym-mlumjxqw.1
  ; A[i + p * m]
  %.55 = load float, float* %.52, align 4
  ; j * k (recomputed on every p iteration in this block)
  %sym-vrxwgvld.1 = mul i32 %j.0, %k
  ; j * k + p
  %sym-nwyrpbcn.1 = add i32 %sym-vrxwgvld.1, %p.0
  ; &B[j * k + p]
  %.65 = getelementptr float, float* %B, i32 %sym-nwyrpbcn.1
  ; B[j * k + p]
  %.68 = load float, float* %.65, align 4
  ; A[i + p * m] * B[j * k + p]
  %sym-ipfrfuuq.1 = fmul float %.55, %.68
  ; sum + A[i + p * m] * B[j * k + p] (separate fmul and fadd)
  %sym-cmhiaxws.1 = fadd float %sym-ipfrfuuq.1, %sum.ixwbxjcu.oiseueuu.0
  ; p = p + 1
  %sym-ahtoruef.1 = add i32 %p.0, 1

  br label %sym.loop-uuuiebvx

sym.break-tlhnsjzq:                         ; preds = %sym.loop-uuuiebvx
  ; j * m
  %sym-ilkuwtus.1 = mul i32 %j.0, %m
  ; i + j * m
  %sym-icndiatv.1 = add i32 %sym-ilkuwtus.1, %i.0
  ; &C[i + j * m]
  %.93 = getelementptr float, float* %C, i32 %sym-icndiatv.1
  ; C[i + j * m] = sum
  store float %sum.ixwbxjcu.oiseueuu.0, float* %.93, align 4
  ; j = j + 1
  %sym-vschuphl.1 = add i32 %j.0, 1
  br label %sym.loop-ixsprijw

sym.break-enfknquz:                         ; preds = %sym.loop-ixsprijw
  ; i = i + 1
  %sym-aimutfvh.1 = add i32 %i.0, 1
  br label %sym.loop-cqatlhqk

sym.break-sfwvbees:                         ; preds = %sym.loop-cqatlhqk
  br label %sym.ret_lbl-ertfxglj

sym.ret_lbl-ertfxglj:                       ; preds = %sym.break-sfwvbees
  ret void
}

Now for the C code but with -O1:

clang -O1 -S -emit-llvm  refmult.c
; ModuleID = 'refmult.c'
source_filename = "refmult.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: argmemonly nofree nosync nounwind uwtable
; ref_mult(A = %0, B = %1, C = %2, m = %3, n = %4, k = %5)
define dso_local void @ref_mult(ptr nocapture noundef readonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef writeonly %2, i32 noundef %3, i32 noundef %4, i32 noundef %5) local_unnamed_addr #0 {
  ; m > 0, otherwise return immediately
  %7 = icmp sgt i32 %3, 0
  br i1 %7, label %8, label %46

8:                                                ; preds = %6
  ; n > 0 (guard for the j loop)
  %9 = icmp sgt i32 %4, 0
  ; k > 0 (guard for the p loop)
  %10 = icmp sgt i32 %5, 0
  ; m and k sign-extended to i64 for index math; m, n, k zero-extended to i64 as loop bounds
  %11 = sext i32 %3 to i64
  %12 = sext i32 %5 to i64
  %13 = sext i32 %3 to i64
  %14 = zext i32 %3 to i64
  %15 = zext i32 %4 to i64
  %16 = zext i32 %5 to i64
  br label %17

17:                                               ; preds = %8, %43
  ; i = 0 or i + 1
  %18 = phi i64 [ 0, %8 ], [ %44, %43 ]
  br i1 %9, label %19, label %43

19:                                               ; preds = %17, %36
  ; j = 0 or j + 1
  %20 = phi i64 [ %41, %36 ], [ 0, %17 ]
  br i1 %10, label %21, label %36

21:                                               ; preds = %19
  ; j * k, computed once per j iteration outside the p loop
  %22 = mul nsw i64 %20, %12
  br label %23

23:                                               ; preds = %21, %23
  ; p = 0 or p + 1
  %24 = phi i64 [ 0, %21 ], [ %34, %23 ]
  ; sum = 0 or the fmuladd result of the previous iteration
  %25 = phi float [ 0.000000e+00, %21 ], [ %33, %23 ]
  ; p * m
  %26 = mul nsw i64 %24, %11
  ; p * m + i
  %27 = add nsw i64 %26, %18

  ; A[p * m + i]
  %28 = getelementptr inbounds float, ptr %0, i64 %27
  %29 = load float, ptr %28, align 4, !tbaa !5

  ; p + j * k
  %30 = add nsw i64 %24, %22

  ; B[p + j * k]
  %31 = getelementptr inbounds float, ptr %1, i64 %30
  %32 = load float, ptr %31, align 4, !tbaa !5

  ; intrinsic: sum + A[p * m + i] * B[p + j * k]
  %33 = tail call float @llvm.fmuladd.f32(float %29, float %32, float %25)

  ; p = p + 1
  %34 = add nuw nsw i64 %24, 1
  ; exit the p loop when p == k (bottom-of-loop test replacing p < k)
  %35 = icmp eq i64 %34, %16
  br i1 %35, label %36, label %23, !llvm.loop !9

36:                                               ; preds = %23, %19
  ; sum = 0 (p loop skipped) or the final accumulated sum
  %37 = phi float [ 0.000000e+00, %19 ], [ %33, %23 ]
  ; j * m
  %38 = mul nsw i64 %20, %13
  ; j * m + i
  %39 = add nsw i64 %38, %18
  ; C[j * m + i] = sum
  %40 = getelementptr inbounds float, ptr %2, i64 %39
  store float %37, ptr %40, align 4, !tbaa !5

  ; j = j + 1
  %41 = add nuw nsw i64 %20, 1
  ; exit the j loop when j == n (bottom-of-loop test replacing j < n)
  %42 = icmp eq i64 %41, %15

  br i1 %42, label %43, label %19, !llvm.loop !12

43:                                               ; preds = %36, %17
  ; i = i + 1
  %44 = add nuw nsw i64 %18, 1
  ; exit the i loop when i == m (bottom-of-loop test replacing i < m)
  %45 = icmp eq i64 %44, %14
  br i1 %45, label %46, label %17, !llvm.loop !13

46:                                               ; preds = %43, %6
  ret void
}
chsasank commented 4 months ago

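The LLVM IRs look very similar, except for:

  1. The C version calls llvm.fmuladd.f32 (fused multiply-add), while the c-lisp version emits a separate fmul and fadd
  2. The C version casts its indices to int64, while the c-lisp version keeps them as i32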
chsasank commented 4 months ago

Need to create benchmarks again.

chsasank commented 4 months ago

Thanks to @BiradarSiddhant02 for benchmarks. Perf has gotten better with #77

MMult_1x4_3_performance_plot_02_12

BiradarSiddhant02 commented 4 months ago

MMult_1x4_4_performance_plot_13:03 MMult_4x4_4_performance_plot_13:33

The reference multiplier (C code) is very slow compared to the c-lisp version. Both are compiled with -O1 optimization and linked with the main.o file (also compiled with -O1) at the end.

BiradarSiddhant02 commented 4 months ago

The c-lisp kernel of MMult_1x4_4 is comparably worse

Upon analysing the LLVM intermediate representation of both the C and c-lisp implementations of 1x4_4, the LLVM IR for c-lisp is at least 4x bigger than the LLVM IR for C in terms of lines of code.

BiradarSiddhant02 commented 3 months ago

MMult_4x4_4_performance_plot_12:55

The c-lisp kernel now matches the C kernel in terms of performance; see commit https://github.com/chsasank/llama.lisp/pull/70/commits/ea7aa6fda5711513d44f96f8f2bdeae4698e79be

Fixed a mistake in the code where the for loops were dissimilar between the c-lisp and C kernels. Also fixed the matrix indexing mistake in the C kernel.

chsasank commented 3 months ago

Clearly, #77 fixed the perf difference b/w C and C-lisp. We're now exactly the same speed as C. Closing the issue. Hopefully we don't have to bother about the perf of c-lisp anymore :)