BlueBrain / nmodl

Code Generation Framework For NEURON MODeling Language
https://bluebrain.github.io/nmodl/
Apache License 2.0
55 stars 15 forks source link

[LLVM] Optimising code for vectorisation #544

Open georgemitenkov opened 3 years ago

georgemitenkov commented 3 years ago

This is a placeholder for discussion. I am not 100% up-to-date with what optimisations are done at the AST level, but I am catching up :)

Consider the following kernel, which is vectorised with a vector width of 4:

VOID nrn_state_hh(INSTANCE_STRUCT *mech){
    INTEGER id
    for(id = 0; id<mech->node_count; id = id+4) {
        INTEGER node_id
        DOUBLE v
        mech->m[id] = mech->m[id]+(1.0-exp(mech->dt*((((-1.0)))/mech->mtau[id])))*(-(((mech->minf[id]))/mech->mtau[id])/((((-1.0)))/mech->mtau[id])-mech->m[id])
        mech->h[id] = mech->h[id]+(1.0-exp(mech->dt*((((-1.0)))/mech->htau[id])))*(-(((mech->hinf[id]))/mech->htau[id])/((((-1.0)))/mech->htau[id])-mech->h[id])
        mech->n[id] = mech->n[id]+(1.0-exp(mech->dt*((((-1.0)))/mech->ntau[id])))*(-(((mech->ninf[id]))/mech->ntau[id])/((((-1.0)))/mech->ntau[id])-mech->n[id])
    }
}

The corresponding LLVM IR is:

; ModuleID = 'hh'
source_filename = "hh"

%hh__instance_var__type = type { double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, i32*, i32*, i32*, i32*, i32*, i32*, double*, i32*, double, double, double, i32, i32 }

define void @nrn_state_hh(%hh__instance_var__type* %mech1) {
  %mech = alloca %hh__instance_var__type*, align 8
  store %hh__instance_var__type* %mech1, %hh__instance_var__type** %mech, align 8
  %id = alloca i32, align 4
  store i32 0, i32* %id, align 4
  %__vec_id = alloca <4 x i32>, align 16
  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %__vec_id, align 16
  br label %for.cond

for.cond:                                         ; preds = %for.inc, %0
  %1 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %2 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %1, i32 0, i32 43
  %3 = load i32, i32* %2, align 4
  %4 = load i32, i32* %id, align 4
  %5 = icmp slt i32 %4, %3
  br i1 %5, label %for.body, label %for.exit

for.body:                                         ; preds = %for.cond
  %node_id = alloca i32, align 4
  %v = alloca double, align 8
  %6 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %7 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %6, i32 0, i32 13
  %8 = load i32, i32* %id, align 4
  %9 = sext i32 %8 to i64
  %10 = load double*, double** %7, align 8
  %11 = bitcast double* %10 to <4 x double>*
  %12 = getelementptr inbounds <4 x double>, <4 x double>* %11, i64 %9
  %13 = load <4 x double>, <4 x double>* %12, align 32
  %14 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %15 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %14, i32 0, i32 10
  %16 = load i32, i32* %id, align 4
  %17 = sext i32 %16 to i64
  %18 = load double*, double** %15, align 8
  %19 = bitcast double* %18 to <4 x double>*
  %20 = getelementptr inbounds <4 x double>, <4 x double>* %19, i64 %17
  %21 = load <4 x double>, <4 x double>* %20, align 32
  %22 = fdiv <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00>, %21
  %23 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %24 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %23, i32 0, i32 10
  %25 = load i32, i32* %id, align 4
  %26 = sext i32 %25 to i64
  %27 = load double*, double** %24, align 8
  %28 = bitcast double* %27 to <4 x double>*
  %29 = getelementptr inbounds <4 x double>, <4 x double>* %28, i64 %26
  %30 = load <4 x double>, <4 x double>* %29, align 32
  %31 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %32 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %31, i32 0, i32 7
  %33 = load i32, i32* %id, align 4
  %34 = sext i32 %33 to i64
  %35 = load double*, double** %32, align 8
  %36 = bitcast double* %35 to <4 x double>*
  %37 = getelementptr inbounds <4 x double>, <4 x double>* %36, i64 %34
  %38 = load <4 x double>, <4 x double>* %37, align 32
  %39 = fdiv <4 x double> %38, %30
  %40 = fneg <4 x double> %39
  %41 = fdiv <4 x double> %40, %22
  %42 = fsub <4 x double> %41, %13
  %43 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %44 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %43, i32 0, i32 10
  %45 = load i32, i32* %id, align 4
  %46 = sext i32 %45 to i64
  %47 = load double*, double** %44, align 8
  %48 = bitcast double* %47 to <4 x double>*
  %49 = getelementptr inbounds <4 x double>, <4 x double>* %48, i64 %46
  %50 = load <4 x double>, <4 x double>* %49, align 32
  %51 = fdiv <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00>, %50
  %52 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %53 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %52, i32 0, i32 40
  %54 = load double, double* %53, align 8
  %.splatinsert = insertelement <4 x double> undef, double %54, i32 0
  %.splat = shufflevector <4 x double> %.splatinsert, <4 x double> undef, <4 x i32> zeroinitializer
  %55 = fmul <4 x double> %.splat, %51
  %56 = call <4 x double> @llvm.exp.v4f64(<4 x double> %55)
  %57 = fsub <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %56
  %58 = fmul <4 x double> %57, %42
  %59 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %60 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %59, i32 0, i32 13
  %61 = load i32, i32* %id, align 4
  %62 = sext i32 %61 to i64
  %63 = load double*, double** %60, align 8
  %64 = bitcast double* %63 to <4 x double>*
  %65 = getelementptr inbounds <4 x double>, <4 x double>* %64, i64 %62
  %66 = load <4 x double>, <4 x double>* %65, align 32
  %67 = fadd <4 x double> %66, %58
  %68 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %69 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %68, i32 0, i32 13
  %70 = load i32, i32* %id, align 4
  %71 = sext i32 %70 to i64
  %72 = load double*, double** %69, align 8
  %73 = bitcast double* %72 to <4 x double>*
  %74 = getelementptr inbounds <4 x double>, <4 x double>* %73, i64 %71
  store <4 x double> %67, <4 x double>* %74, align 32
  %75 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %76 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %75, i32 0, i32 14
  %77 = load i32, i32* %id, align 4
  %78 = sext i32 %77 to i64
  %79 = load double*, double** %76, align 8
  %80 = bitcast double* %79 to <4 x double>*
  %81 = getelementptr inbounds <4 x double>, <4 x double>* %80, i64 %78
  %82 = load <4 x double>, <4 x double>* %81, align 32
  %83 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %84 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %83, i32 0, i32 11
  %85 = load i32, i32* %id, align 4
  %86 = sext i32 %85 to i64
  %87 = load double*, double** %84, align 8
  %88 = bitcast double* %87 to <4 x double>*
  %89 = getelementptr inbounds <4 x double>, <4 x double>* %88, i64 %86
  %90 = load <4 x double>, <4 x double>* %89, align 32
  %91 = fdiv <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00>, %90
  %92 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %93 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %92, i32 0, i32 11
  %94 = load i32, i32* %id, align 4
  %95 = sext i32 %94 to i64
  %96 = load double*, double** %93, align 8
  %97 = bitcast double* %96 to <4 x double>*
  %98 = getelementptr inbounds <4 x double>, <4 x double>* %97, i64 %95
  %99 = load <4 x double>, <4 x double>* %98, align 32
  %100 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %101 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %100, i32 0, i32 8
  %102 = load i32, i32* %id, align 4
  %103 = sext i32 %102 to i64
  %104 = load double*, double** %101, align 8
  %105 = bitcast double* %104 to <4 x double>*
  %106 = getelementptr inbounds <4 x double>, <4 x double>* %105, i64 %103
  %107 = load <4 x double>, <4 x double>* %106, align 32
  %108 = fdiv <4 x double> %107, %99
  %109 = fneg <4 x double> %108
  %110 = fdiv <4 x double> %109, %91
  %111 = fsub <4 x double> %110, %82
  %112 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %113 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %112, i32 0, i32 11
  %114 = load i32, i32* %id, align 4
  %115 = sext i32 %114 to i64
  %116 = load double*, double** %113, align 8
  %117 = bitcast double* %116 to <4 x double>*
  %118 = getelementptr inbounds <4 x double>, <4 x double>* %117, i64 %115
  %119 = load <4 x double>, <4 x double>* %118, align 32
  %120 = fdiv <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00>, %119
  %121 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %122 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %121, i32 0, i32 40
  %123 = load double, double* %122, align 8
  %.splatinsert2 = insertelement <4 x double> undef, double %123, i32 0
  %.splat3 = shufflevector <4 x double> %.splatinsert2, <4 x double> undef, <4 x i32> zeroinitializer
  %124 = fmul <4 x double> %.splat3, %120
  %125 = call <4 x double> @llvm.exp.v4f64(<4 x double> %124)
  %126 = fsub <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %125
  %127 = fmul <4 x double> %126, %111
  %128 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %129 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %128, i32 0, i32 14
  %130 = load i32, i32* %id, align 4
  %131 = sext i32 %130 to i64
  %132 = load double*, double** %129, align 8
  %133 = bitcast double* %132 to <4 x double>*
  %134 = getelementptr inbounds <4 x double>, <4 x double>* %133, i64 %131
  %135 = load <4 x double>, <4 x double>* %134, align 32
  %136 = fadd <4 x double> %135, %127
  %137 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %138 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %137, i32 0, i32 14
  %139 = load i32, i32* %id, align 4
  %140 = sext i32 %139 to i64
  %141 = load double*, double** %138, align 8
  %142 = bitcast double* %141 to <4 x double>*
  %143 = getelementptr inbounds <4 x double>, <4 x double>* %142, i64 %140
  store <4 x double> %136, <4 x double>* %143, align 32
  %144 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %145 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %144, i32 0, i32 15
  %146 = load i32, i32* %id, align 4
  %147 = sext i32 %146 to i64
  %148 = load double*, double** %145, align 8
  %149 = bitcast double* %148 to <4 x double>*
  %150 = getelementptr inbounds <4 x double>, <4 x double>* %149, i64 %147
  %151 = load <4 x double>, <4 x double>* %150, align 32
  %152 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %153 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %152, i32 0, i32 12
  %154 = load i32, i32* %id, align 4
  %155 = sext i32 %154 to i64
  %156 = load double*, double** %153, align 8
  %157 = bitcast double* %156 to <4 x double>*
  %158 = getelementptr inbounds <4 x double>, <4 x double>* %157, i64 %155
  %159 = load <4 x double>, <4 x double>* %158, align 32
  %160 = fdiv <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00>, %159
  %161 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %162 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %161, i32 0, i32 12
  %163 = load i32, i32* %id, align 4
  %164 = sext i32 %163 to i64
  %165 = load double*, double** %162, align 8
  %166 = bitcast double* %165 to <4 x double>*
  %167 = getelementptr inbounds <4 x double>, <4 x double>* %166, i64 %164
  %168 = load <4 x double>, <4 x double>* %167, align 32
  %169 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %170 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %169, i32 0, i32 9
  %171 = load i32, i32* %id, align 4
  %172 = sext i32 %171 to i64
  %173 = load double*, double** %170, align 8
  %174 = bitcast double* %173 to <4 x double>*
  %175 = getelementptr inbounds <4 x double>, <4 x double>* %174, i64 %172
  %176 = load <4 x double>, <4 x double>* %175, align 32
  %177 = fdiv <4 x double> %176, %168
  %178 = fneg <4 x double> %177
  %179 = fdiv <4 x double> %178, %160
  %180 = fsub <4 x double> %179, %151
  %181 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %182 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %181, i32 0, i32 12
  %183 = load i32, i32* %id, align 4
  %184 = sext i32 %183 to i64
  %185 = load double*, double** %182, align 8
  %186 = bitcast double* %185 to <4 x double>*
  %187 = getelementptr inbounds <4 x double>, <4 x double>* %186, i64 %184
  %188 = load <4 x double>, <4 x double>* %187, align 32
  %189 = fdiv <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00>, %188
  %190 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %191 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %190, i32 0, i32 40
  %192 = load double, double* %191, align 8
  %.splatinsert4 = insertelement <4 x double> undef, double %192, i32 0
  %.splat5 = shufflevector <4 x double> %.splatinsert4, <4 x double> undef, <4 x i32> zeroinitializer
  %193 = fmul <4 x double> %.splat5, %189
  %194 = call <4 x double> @llvm.exp.v4f64(<4 x double> %193)
  %195 = fsub <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %194
  %196 = fmul <4 x double> %195, %180
  %197 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %198 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %197, i32 0, i32 15
  %199 = load i32, i32* %id, align 4
  %200 = sext i32 %199 to i64
  %201 = load double*, double** %198, align 8
  %202 = bitcast double* %201 to <4 x double>*
  %203 = getelementptr inbounds <4 x double>, <4 x double>* %202, i64 %200
  %204 = load <4 x double>, <4 x double>* %203, align 32
  %205 = fadd <4 x double> %204, %196
  %206 = load %hh__instance_var__type*, %hh__instance_var__type** %mech, align 8
  %207 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %206, i32 0, i32 15
  %208 = load i32, i32* %id, align 4
  %209 = sext i32 %208 to i64
  %210 = load double*, double** %207, align 8
  %211 = bitcast double* %210 to <4 x double>*
  %212 = getelementptr inbounds <4 x double>, <4 x double>* %211, i64 %209
  store <4 x double> %205, <4 x double>* %212, align 32
  br label %for.inc

for.inc:                                          ; preds = %for.body
  %213 = load i32, i32* %id, align 4
  %214 = add i32 %213, 4
  store i32 %214, i32* %id, align 4
  %215 = load <4 x i32>, <4 x i32>* %__vec_id, align 16
  %216 = add <4 x i32> %215, <i32 4, i32 4, i32 4, i32 4>
  store <4 x i32> %216, <4 x i32>* %__vec_id, align 16
  br label %for.cond

for.exit:                                         ; preds = %for.cond
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x double> @llvm.exp.v4f64(<4 x double>) #0

attributes #0 = { nounwind readnone speculatable willreturn }

Running opt <llvm_file> -O3 -S -o - gives:

; ModuleID = 'vectorised.ll'
source_filename = "hh"

%hh__instance_var__type = type { double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, i32*, i32*, i32*, i32*, i32*, i32*, double*, i32*, double, double, double, i32, i32 }

; Function Attrs: nofree nounwind
define void @nrn_state_hh(%hh__instance_var__type* nocapture readonly %mech1) local_unnamed_addr #0 {
  %1 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 43
  %2 = load i32, i32* %1, align 4
  %3 = icmp sgt i32 %2, 0
  br i1 %3, label %for.body.lr.ph, label %for.exit

for.body.lr.ph:                                   ; preds = %0
  %4 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 13
  %5 = bitcast double** %4 to <4 x double>**
  %6 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 10
  %7 = bitcast double** %6 to <4 x double>**
  %8 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 7
  %9 = bitcast double** %8 to <4 x double>**
  %10 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 40
  %11 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 14
  %12 = bitcast double** %11 to <4 x double>**
  %13 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 11
  %14 = bitcast double** %13 to <4 x double>**
  %15 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 8
  %16 = bitcast double** %15 to <4 x double>**
  %17 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 15
  %18 = bitcast double** %17 to <4 x double>**
  %19 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 12
  %20 = bitcast double** %19 to <4 x double>**
  %21 = getelementptr inbounds %hh__instance_var__type, %hh__instance_var__type* %mech1, i64 0, i32 9
  %22 = bitcast double** %21 to <4 x double>**
  br label %for.body

for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %id.047 = phi i32 [ 0, %for.body.lr.ph ], [ %84, %for.body ]
  %23 = sext i32 %id.047 to i64
  %24 = load <4 x double>*, <4 x double>** %5, align 8
  %25 = getelementptr inbounds <4 x double>, <4 x double>* %24, i64 %23
  %26 = load <4 x double>, <4 x double>* %25, align 32
  %27 = load <4 x double>*, <4 x double>** %7, align 8
  %28 = getelementptr inbounds <4 x double>, <4 x double>* %27, i64 %23
  %29 = load <4 x double>, <4 x double>* %28, align 32
  %30 = fdiv <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00>, %29
  %31 = load <4 x double>*, <4 x double>** %9, align 8
  %32 = getelementptr inbounds <4 x double>, <4 x double>* %31, i64 %23
  %33 = load <4 x double>, <4 x double>* %32, align 32
  %34 = fneg <4 x double> %33
  %35 = fdiv <4 x double> %34, %29
  %36 = fdiv <4 x double> %35, %30
  %37 = fsub <4 x double> %36, %26
  %38 = load double, double* %10, align 8
  %.splatinsert = insertelement <4 x double> undef, double %38, i32 0
  %.splat = shufflevector <4 x double> %.splatinsert, <4 x double> undef, <4 x i32> zeroinitializer
  %39 = fmul <4 x double> %30, %.splat
  %40 = tail call <4 x double> @llvm.exp.v4f64(<4 x double> %39)
  %41 = fsub <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %40
  %42 = fmul <4 x double> %37, %41
  %43 = fadd <4 x double> %26, %42
  store <4 x double> %43, <4 x double>* %25, align 32
  %44 = load <4 x double>*, <4 x double>** %12, align 8
  %45 = getelementptr inbounds <4 x double>, <4 x double>* %44, i64 %23
  %46 = load <4 x double>, <4 x double>* %45, align 32
  %47 = load <4 x double>*, <4 x double>** %14, align 8
  %48 = getelementptr inbounds <4 x double>, <4 x double>* %47, i64 %23
  %49 = load <4 x double>, <4 x double>* %48, align 32
  %50 = fdiv <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00>, %49
  %51 = load <4 x double>*, <4 x double>** %16, align 8
  %52 = getelementptr inbounds <4 x double>, <4 x double>* %51, i64 %23
  %53 = load <4 x double>, <4 x double>* %52, align 32
  %54 = fneg <4 x double> %53
  %55 = fdiv <4 x double> %54, %49
  %56 = fdiv <4 x double> %55, %50
  %57 = fsub <4 x double> %56, %46
  %58 = load double, double* %10, align 8
  %.splatinsert2 = insertelement <4 x double> undef, double %58, i32 0
  %.splat3 = shufflevector <4 x double> %.splatinsert2, <4 x double> undef, <4 x i32> zeroinitializer
  %59 = fmul <4 x double> %50, %.splat3
  %60 = tail call <4 x double> @llvm.exp.v4f64(<4 x double> %59)
  %61 = fsub <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %60
  %62 = fmul <4 x double> %57, %61
  %63 = fadd <4 x double> %46, %62
  store <4 x double> %63, <4 x double>* %45, align 32
  %64 = load <4 x double>*, <4 x double>** %18, align 8
  %65 = getelementptr inbounds <4 x double>, <4 x double>* %64, i64 %23
  %66 = load <4 x double>, <4 x double>* %65, align 32
  %67 = load <4 x double>*, <4 x double>** %20, align 8
  %68 = getelementptr inbounds <4 x double>, <4 x double>* %67, i64 %23
  %69 = load <4 x double>, <4 x double>* %68, align 32
  %70 = fdiv <4 x double> <double -1.000000e+00, double -1.000000e+00, double -1.000000e+00, double -1.000000e+00>, %69
  %71 = load <4 x double>*, <4 x double>** %22, align 8
  %72 = getelementptr inbounds <4 x double>, <4 x double>* %71, i64 %23
  %73 = load <4 x double>, <4 x double>* %72, align 32
  %74 = fneg <4 x double> %73
  %75 = fdiv <4 x double> %74, %69
  %76 = fdiv <4 x double> %75, %70
  %77 = fsub <4 x double> %76, %66
  %78 = load double, double* %10, align 8
  %.splatinsert4 = insertelement <4 x double> undef, double %78, i32 0
  %.splat5 = shufflevector <4 x double> %.splatinsert4, <4 x double> undef, <4 x i32> zeroinitializer
  %79 = fmul <4 x double> %70, %.splat5
  %80 = tail call <4 x double> @llvm.exp.v4f64(<4 x double> %79)
  %81 = fsub <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %80
  %82 = fmul <4 x double> %77, %81
  %83 = fadd <4 x double> %66, %82
  store <4 x double> %83, <4 x double>* %65, align 32
  %84 = add i32 %id.047, 4
  %85 = load i32, i32* %1, align 4
  %86 = icmp slt i32 %84, %85
  br i1 %86, label %for.body, label %for.exit

for.exit:                                         ; preds = %for.body, %0
  ret void
}

; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x double> @llvm.exp.v4f64(<4 x double>) #1

attributes #0 = { nofree nounwind }
attributes #1 = { nounwind readnone speculatable willreturn }
georgemitenkov commented 3 years ago

This is good: we now have about 2x fewer instructions! However, we still observe some redundancy:

%78 = load double, double* %10, align 8
%.splatinsert4 = insertelement <4 x double> undef, double %78, i32 0
%.splat5 = shufflevector <4 x double> %.splatinsert4, <4 x double> undef, <4 x i32> zeroinitializer

I suspect that LLVM does not pick this up because the code is vectorised and uses more complicated vector instructions such as shufflevector. If we look at what generated this code, we see that it is actually loop-invariant:

%78 = mech->dt // constant for all loop iterations

By adopting some kind of loop-invariant code motion at the AST level (or by changing the way the kernel is constructed), we would be able to remove the duplicated code blocks.

pramodk commented 3 years ago

%78 = mech->dt // constant for all loop iterations

Yeah, this repeated load of dt would be trivial to avoid. I will do this in the next PR.