BlueBrain / nmodl

Code Generation Framework For NEURON MODeling Language
https://bluebrain.github.io/nmodl/
Apache License 2.0

LLVM vector code generation : How to? #473

Open pramodk opened 3 years ago

pramodk commented 3 years ago

This is a ticket to gather references and discuss vector code generation strategies for LLVM IR. cc: @georgemitenkov

pramodk commented 3 years ago

Starting with a simple pattern that is often useful in the NMODL context:

void compute(double *restrict x, double *restrict y, double *restrict z, int *restrict index) {
    for(int i = 0; i < 1024; i++) {
        x[i] = y[i] + 3*z[index[i]];
    }
}

and the generated IR:

$ clang -c -std=c99 test.c -S -emit-llvm -O3 && cat test.ll
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @compute(double* noalias nocapture, double* noalias nocapture readonly, double* noalias nocapture readonly, i32* noalias nocapture readonly) local_unnamed_addr #0 {
  br label %6

5:                                                ; preds = %6
  ret void

6:                                                ; preds = %6, %4
  %7 = phi i64 [ 0, %4 ], [ %29, %6 ]
  %8 = getelementptr inbounds double, double* %1, i64 %7
  %9 = load double, double* %8, align 8, !tbaa !2
  %10 = getelementptr inbounds i32, i32* %3, i64 %7
  %11 = load i32, i32* %10, align 4, !tbaa !6
  %12 = sext i32 %11 to i64
  %13 = getelementptr inbounds double, double* %2, i64 %12
  %14 = load double, double* %13, align 8, !tbaa !2
  %15 = fmul double %14, 3.000000e+00
  %16 = fadd double %9, %15
  %17 = getelementptr inbounds double, double* %0, i64 %7
  store double %16, double* %17, align 8, !tbaa !2
  %18 = or i64 %7, 1
  %19 = getelementptr inbounds double, double* %1, i64 %18
  %20 = load double, double* %19, align 8, !tbaa !2
  %21 = getelementptr inbounds i32, i32* %3, i64 %18
  %22 = load i32, i32* %21, align 4, !tbaa !6
  %23 = sext i32 %22 to i64
  %24 = getelementptr inbounds double, double* %2, i64 %23
  %25 = load double, double* %24, align 8, !tbaa !2
  %26 = fmul double %25, 3.000000e+00
  %27 = fadd double %20, %26
  %28 = getelementptr inbounds double, double* %0, i64 %18
  store double %27, double* %28, align 8, !tbaa !2
  %29 = add nuw nsw i64 %7, 2
  %30 = icmp eq i64 %29, 1024
  br i1 %30, label %5, label %6
}

and enabling AVX-512 gives:

$ clang -c -std=c99 test.c -S -emit-llvm -O3 -march=skylake-avx512 && cat test.ll
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @compute(double* noalias nocapture, double* noalias nocapture readonly, double* noalias nocapture readonly, i32* noalias nocapture readonly) local_unnamed_addr #0 {
  br label %5

5:                                                ; preds = %5, %4
  %6 = phi i64 [ 0, %4 ], [ %59, %5 ]
  %7 = getelementptr inbounds double, double* %1, i64 %6
  %8 = bitcast double* %7 to <8 x double>*
  %9 = load <8 x double>, <8 x double>* %8, align 8, !tbaa !2
  %10 = getelementptr inbounds double, double* %7, i64 8
  %11 = bitcast double* %10 to <8 x double>*
  %12 = load <8 x double>, <8 x double>* %11, align 8, !tbaa !2
  %13 = getelementptr inbounds double, double* %7, i64 16
  %14 = bitcast double* %13 to <8 x double>*
  %15 = load <8 x double>, <8 x double>* %14, align 8, !tbaa !2
  %16 = getelementptr inbounds double, double* %7, i64 24
  %17 = bitcast double* %16 to <8 x double>*
  %18 = load <8 x double>, <8 x double>* %17, align 8, !tbaa !2
  %19 = getelementptr inbounds i32, i32* %3, i64 %6
  %20 = bitcast i32* %19 to <8 x i32>*
  %21 = load <8 x i32>, <8 x i32>* %20, align 4, !tbaa !6
  %22 = getelementptr inbounds i32, i32* %19, i64 8
  %23 = bitcast i32* %22 to <8 x i32>*
  %24 = load <8 x i32>, <8 x i32>* %23, align 4, !tbaa !6
  %25 = getelementptr inbounds i32, i32* %19, i64 16
  %26 = bitcast i32* %25 to <8 x i32>*
  %27 = load <8 x i32>, <8 x i32>* %26, align 4, !tbaa !6
  %28 = getelementptr inbounds i32, i32* %19, i64 24
  %29 = bitcast i32* %28 to <8 x i32>*
  %30 = load <8 x i32>, <8 x i32>* %29, align 4, !tbaa !6
  %31 = sext <8 x i32> %21 to <8 x i64>
  %32 = sext <8 x i32> %24 to <8 x i64>
  %33 = sext <8 x i32> %27 to <8 x i64>
  %34 = sext <8 x i32> %30 to <8 x i64>
  %35 = getelementptr inbounds double, double* %2, <8 x i64> %31
  %36 = getelementptr inbounds double, double* %2, <8 x i64> %32
  %37 = getelementptr inbounds double, double* %2, <8 x i64> %33
  %38 = getelementptr inbounds double, double* %2, <8 x i64> %34
  %39 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %35, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> undef), !tbaa !2
  %40 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %36, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> undef), !tbaa !2
  %41 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %37, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> undef), !tbaa !2
  %42 = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %38, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x double> undef), !tbaa !2
  %43 = fmul <8 x double> %39, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  %44 = fmul <8 x double> %40, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  %45 = fmul <8 x double> %41, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  %46 = fmul <8 x double> %42, <double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00, double 3.000000e+00>
  %47 = fadd <8 x double> %9, %43
  %48 = fadd <8 x double> %12, %44
  %49 = fadd <8 x double> %15, %45
  %50 = fadd <8 x double> %18, %46
  %51 = getelementptr inbounds double, double* %0, i64 %6
  %52 = bitcast double* %51 to <8 x double>*
  store <8 x double> %47, <8 x double>* %52, align 8, !tbaa !2
  %53 = getelementptr inbounds double, double* %51, i64 8
  %54 = bitcast double* %53 to <8 x double>*
  store <8 x double> %48, <8 x double>* %54, align 8, !tbaa !2
  %55 = getelementptr inbounds double, double* %51, i64 16
  %56 = bitcast double* %55 to <8 x double>*
  store <8 x double> %49, <8 x double>* %56, align 8, !tbaa !2
  %57 = getelementptr inbounds double, double* %51, i64 24
  %58 = bitcast double* %57 to <8 x double>*
  store <8 x double> %50, <8 x double>* %58, align 8, !tbaa !2
  %59 = add i64 %6, 32
  %60 = icmp eq i64 %59, 1024
  br i1 %60, label %61, label %5, !llvm.loop !8

61:                                               ; preds = %5
  ret void
}

So this example shows one approach: generate LLVM IR with vector types, which the backend later lowers to the target's vector instructions.
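For completeness, here is a rough sketch of how such a vectorised body could be emitted directly through LLVM's C++ IRBuilder instead of relying on the loop vectoriser. This is only an illustration: the helper emitBody and its arguments are hypothetical, and exact IRBuilder signatures differ between LLVM versions (this assumes roughly an LLVM 11-style API).

// Sketch (untested): emit one <8 x double> step of x[i] = y[i] + 3*z[i]
// for the contiguous part of the pattern.
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

void emitBody(IRBuilder<> &b, Value *x, Value *y, Value *z, Value *iv) {
    Type *f64 = b.getDoubleTy();
    auto *vecTy = FixedVectorType::get(f64, 8);                  // <8 x double>

    // &y[iv], &z[iv], &x[iv], reinterpreted as <8 x double>*
    Value *yPtr = b.CreateBitCast(b.CreateGEP(f64, y, iv), vecTy->getPointerTo());
    Value *zPtr = b.CreateBitCast(b.CreateGEP(f64, z, iv), vecTy->getPointerTo());
    Value *xPtr = b.CreateBitCast(b.CreateGEP(f64, x, iv), vecTy->getPointerTo());

    Value *yVec = b.CreateAlignedLoad(vecTy, yPtr, Align(8));
    Value *zVec = b.CreateAlignedLoad(vecTy, zPtr, Align(8));

    // splat the constant 3.0 across all lanes, then multiply and add
    Value *three = b.CreateVectorSplat(8, ConstantFP::get(f64, 3.0));
    Value *res = b.CreateFAdd(yVec, b.CreateFMul(zVec, three));
    b.CreateAlignedStore(res, xPtr, Align(8));
}

The same splat/fmul/fadd structure generalises to the indexed case by building a vector of indices and using CreateMaskedGather instead of the contiguous load.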

pramodk commented 3 years ago

Another common pattern is the use of math functions:

#include <math.h>

void compute(double *restrict x, double *restrict y, double *restrict z, int *restrict index) {
    for(int i = 0; i < 1024; i++) {
        x[i] = y[i] + exp(z[i]);
    }
}

which inhibits vectorisation:

$ clang -c -std=c99 test.c -S -emit-llvm -O3 -march=skylake-avx512 -Rpass-missed=loop-vectorize && cat test.ll
test.c:4:5: remark: loop not vectorized [-Rpass-missed=loop-vectorize]
    for(int i = 0; i < 1024; i++) {
    ^
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: nofree nounwind uwtable
define dso_local void @compute(double* noalias nocapture, double* noalias nocapture readonly, double* noalias nocapture readonly, i32* noalias nocapture readnone) local_unnamed_addr #0 !dbg !6 {
  br label %6, !dbg !8

5:                                                ; preds = %6
  ret void, !dbg !9

6:                                                ; preds = %6, %4
  %7 = phi i64 [ 0, %4 ], [ %15, %6 ]
  %8 = getelementptr inbounds double, double* %1, i64 %7, !dbg !10
  %9 = load double, double* %8, align 8, !dbg !10, !tbaa !11
  %10 = getelementptr inbounds double, double* %2, i64 %7, !dbg !15
  %11 = load double, double* %10, align 8, !dbg !15, !tbaa !11
  %12 = tail call double @exp(double %11) #2, !dbg !16
  %13 = fadd double %9, %12, !dbg !17
  %14 = getelementptr inbounds double, double* %0, i64 %7, !dbg !18
  store double %13, double* %14, align 8, !dbg !19, !tbaa !11
  %15 = add nuw nsw i64 %7, 1, !dbg !20
  %16 = icmp eq i64 %15, 1024, !dbg !21
  br i1 %16, label %5, label %6, !dbg !8, !llvm.loop !22
}

The Clang documentation says:

[screenshot of the Clang documentation for -fmath-errno / -fno-math-errno: by default, libm calls such as exp() must be assumed to set errno, so the vectoriser cannot treat them as side-effect-free]

Using -fno-math-errno gives the expected result:

$ clang -c -std=c99 test.c -S -emit-llvm -O3 -march=skylake-avx512 -Rpass-missed=loop-vectorize -fno-math-errno && cat test.ll
; ModuleID = 'test.c'
source_filename = "test.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Function Attrs: nofree nounwind uwtable
define dso_local void @compute(double* noalias nocapture, double* noalias nocapture readonly, double* noalias nocapture readonly, i32* noalias nocapture readnone) local_unnamed_addr #0 !dbg !6 {
  br label %5, !dbg !8

5:                                                ; preds = %5, %4
  %6 = phi i64 [ 0, %4 ], [ %94, %5 ], !dbg !9
  %7 = getelementptr inbounds double, double* %1, i64 %6, !dbg !10
  %8 = bitcast double* %7 to <8 x double>*, !dbg !10
  %9 = load <8 x double>, <8 x double>* %8, align 8, !dbg !10, !tbaa !11
  %10 = getelementptr inbounds double, double* %2, i64 %6, !dbg !15
  %11 = bitcast double* %10 to <8 x double>*, !dbg !15
  %12 = load <8 x double>, <8 x double>* %11, align 8, !dbg !15, !tbaa !11
  %13 = call <8 x double> @llvm.exp.v8f64(<8 x double> %12), !dbg !16
  %14 = fadd <8 x double> %9, %13, !dbg !17
  %15 = getelementptr inbounds double, double* %0, i64 %6, !dbg !18
  %16 = bitcast double* %15 to <8 x double>*, !dbg !19
  store <8 x double> %14, <8 x double>* %16, align 8, !dbg !19, !tbaa !11
  %17 = or i64 %6, 8, !dbg !9
  %18 = getelementptr inbounds double, double* %1, i64 %17, !dbg !10
  %19 = bitcast double* %18 to <8 x double>*, !dbg !10
  %20 = load <8 x double>, <8 x double>* %19, align 8, !dbg !10, !tbaa !11
  %21 = getelementptr inbounds double, double* %2, i64 %17, !dbg !15
  %22 = bitcast double* %21 to <8 x double>*, !dbg !15
  %23 = load <8 x double>, <8 x double>* %22, align 8, !dbg !15, !tbaa !11
  %24 = call <8 x double> @llvm.exp.v8f64(<8 x double> %23), !dbg !16
  %25 = fadd <8 x double> %20, %24, !dbg !17
  %26 = getelementptr inbounds double, double* %0, i64 %17, !dbg !18
  %27 = bitcast double* %26 to <8 x double>*, !dbg !19
  store <8 x double> %25, <8 x double>* %27, align 8, !dbg !19, !tbaa !11
  %28 = or i64 %6, 16, !dbg !9
  %29 = getelementptr inbounds double, double* %1, i64 %28, !dbg !10
  %30 = bitcast double* %29 to <8 x double>*, !dbg !10
  %31 = load <8 x double>, <8 x double>* %30, align 8, !dbg !10, !tbaa !11
  %32 = getelementptr inbounds double, double* %2, i64 %28, !dbg !15
  %33 = bitcast double* %32 to <8 x double>*, !dbg !15
  %34 = load <8 x double>, <8 x double>* %33, align 8, !dbg !15, !tbaa !11
  %35 = call <8 x double> @llvm.exp.v8f64(<8 x double> %34), !dbg !16
  %36 = fadd <8 x double> %31, %35, !dbg !17
  %37 = getelementptr inbounds double, double* %0, i64 %28, !dbg !18
  %38 = bitcast double* %37 to <8 x double>*, !dbg !19
  store <8 x double> %36, <8 x double>* %38, align 8, !dbg !19, !tbaa !11
  %39 = or i64 %6, 24, !dbg !9
  %40 = getelementptr inbounds double, double* %1, i64 %39, !dbg !10
  %41 = bitcast double* %40 to <8 x double>*, !dbg !10
  %42 = load <8 x double>, <8 x double>* %41, align 8, !dbg !10, !tbaa !11
  %43 = getelementptr inbounds double, double* %2, i64 %39, !dbg !15
  %44 = bitcast double* %43 to <8 x double>*, !dbg !15
  %45 = load <8 x double>, <8 x double>* %44, align 8, !dbg !15, !tbaa !11
  %46 = call <8 x double> @llvm.exp.v8f64(<8 x double> %45), !dbg !16
  %47 = fadd <8 x double> %42, %46, !dbg !17
  %48 = getelementptr inbounds double, double* %0, i64 %39, !dbg !18
  %49 = bitcast double* %48 to <8 x double>*, !dbg !19
  store <8 x double> %47, <8 x double>* %49, align 8, !dbg !19, !tbaa !11
  %50 = or i64 %6, 32, !dbg !9
  %51 = getelementptr inbounds double, double* %1, i64 %50, !dbg !10
  %52 = bitcast double* %51 to <8 x double>*, !dbg !10
  %53 = load <8 x double>, <8 x double>* %52, align 8, !dbg !10, !tbaa !11
  %54 = getelementptr inbounds double, double* %2, i64 %50, !dbg !15
  %55 = bitcast double* %54 to <8 x double>*, !dbg !15
  %56 = load <8 x double>, <8 x double>* %55, align 8, !dbg !15, !tbaa !11
  %57 = call <8 x double> @llvm.exp.v8f64(<8 x double> %56), !dbg !16
  %58 = fadd <8 x double> %53, %57, !dbg !17
  %59 = getelementptr inbounds double, double* %0, i64 %50, !dbg !18
  %60 = bitcast double* %59 to <8 x double>*, !dbg !19
  store <8 x double> %58, <8 x double>* %60, align 8, !dbg !19, !tbaa !11
  %61 = or i64 %6, 40, !dbg !9
  %62 = getelementptr inbounds double, double* %1, i64 %61, !dbg !10
  %63 = bitcast double* %62 to <8 x double>*, !dbg !10
  %64 = load <8 x double>, <8 x double>* %63, align 8, !dbg !10, !tbaa !11
  %65 = getelementptr inbounds double, double* %2, i64 %61, !dbg !15
  %66 = bitcast double* %65 to <8 x double>*, !dbg !15
  %67 = load <8 x double>, <8 x double>* %66, align 8, !dbg !15, !tbaa !11
  %68 = call <8 x double> @llvm.exp.v8f64(<8 x double> %67), !dbg !16
  %69 = fadd <8 x double> %64, %68, !dbg !17
  %70 = getelementptr inbounds double, double* %0, i64 %61, !dbg !18
  %71 = bitcast double* %70 to <8 x double>*, !dbg !19
  store <8 x double> %69, <8 x double>* %71, align 8, !dbg !19, !tbaa !11
  %72 = or i64 %6, 48, !dbg !9
  %73 = getelementptr inbounds double, double* %1, i64 %72, !dbg !10
  %74 = bitcast double* %73 to <8 x double>*, !dbg !10
  %75 = load <8 x double>, <8 x double>* %74, align 8, !dbg !10, !tbaa !11
  %76 = getelementptr inbounds double, double* %2, i64 %72, !dbg !15
  %77 = bitcast double* %76 to <8 x double>*, !dbg !15
  %78 = load <8 x double>, <8 x double>* %77, align 8, !dbg !15, !tbaa !11
  %79 = call <8 x double> @llvm.exp.v8f64(<8 x double> %78), !dbg !16
  %80 = fadd <8 x double> %75, %79, !dbg !17
  %81 = getelementptr inbounds double, double* %0, i64 %72, !dbg !18
  %82 = bitcast double* %81 to <8 x double>*, !dbg !19
  store <8 x double> %80, <8 x double>* %82, align 8, !dbg !19, !tbaa !11
  %83 = or i64 %6, 56, !dbg !9
  %84 = getelementptr inbounds double, double* %1, i64 %83, !dbg !10
  %85 = bitcast double* %84 to <8 x double>*, !dbg !10
  %86 = load <8 x double>, <8 x double>* %85, align 8, !dbg !10, !tbaa !11
  %87 = getelementptr inbounds double, double* %2, i64 %83, !dbg !15
  %88 = bitcast double* %87 to <8 x double>*, !dbg !15
  %89 = load <8 x double>, <8 x double>* %88, align 8, !dbg !15, !tbaa !11
  %90 = call <8 x double> @llvm.exp.v8f64(<8 x double> %89), !dbg !16
  %91 = fadd <8 x double> %86, %90, !dbg !17
  %92 = getelementptr inbounds double, double* %0, i64 %83, !dbg !18
  %93 = bitcast double* %92 to <8 x double>*, !dbg !19
  store <8 x double> %91, <8 x double>* %93, align 8, !dbg !19, !tbaa !11
  %94 = add nuw nsw i64 %6, 64, !dbg !9
  %95 = icmp eq i64 %94, 1024, !dbg !9
  br i1 %95, label %96, label %5, !dbg !9, !llvm.loop !20

96:                                               ; preds = %5
  ret void, !dbg !23
}
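Side note for our own code generation: since NMODL controls the emitted IR, another option is to call the llvm.exp intrinsic on a vector type directly instead of going through the libm exp() declaration, so the errno question never arises. A minimal sketch, assuming an LLVM 11-style C++ API (the helper name is hypothetical):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Sketch (untested): vec is e.g. an <8 x double> value; the returned call
// lowers to llvm.exp.v8f64, exactly like the vectorised IR above.
Value *emitVectorExp(IRBuilder<> &b, Module &m, Value *vec) {
    Function *expDecl = Intrinsic::getDeclaration(&m, Intrinsic::exp, {vec->getType()});
    return b.CreateCall(expDecl, {vec});
}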

georgemitenkov commented 3 years ago

Thanks for starting a ticket on this! I am currently reading up on LLVM's SLP and loop vectorizers, their APIs, etc.

I am wondering if there are particular vectorisation cases in NMODL that occur frequently and on which we can focus first?

pramodk commented 3 years ago

Thanks for starting a ticket on this! I am currently reading up on LLVM's SLP and loop vectorizers, their APIs, etc.

👍

I am wondering if there are particular vectorisation cases in NMODL that occur frequently and on which we can focus first?

I would take the following example as the most common use case; it covers ~90% of the cases we have:


// Not proper compilable code, but it gives the idea

void rates(double v, int id) {
    minf[id] = v + 0.01;
}

void nrn_state_hh(..) {
    double dt = 0.25;
    double v = 0;
    int id, node_id, node_count;
    for (id = 0; id < node_count; id = id + 1) {
        node_id = node_index[id];          // shows a load using an indirect index; gather
        v = voltage[node_id];
        {
            rates(v, id);
            m[id] = m[id]+(1.0-exp(dt*((((-1.0)))/mtau[id])))*(-(((minf[id]))/mtau[id])/((((-1.0)))/mtau[id])-m[id]);
            h[id] = h[id]+(1.0-exp(dt*((((-1.0)))/htau[id])))*(-(((hinf[id]))/htau[id])/((((-1.0)))/htau[id])-h[id]);
            n[id] = n[id]+(1.0-exp(dt*((((-1.0)))/ntau[id])))*(-(((ninf[id]))/ntau[id])/((((-1.0)))/ntau[id])-n[id]);
        }
        ion_ena[node_id] += n[id];         // shows a store using an indirect index; scatter
    }
}

And here is a summary of the corresponding data layout:

struct Mechanism {
   int *node_index;
   double *voltage;
   double *m;
   double *h;
   double *n;
   double *mtau;
   ...
};

Mechanism mech;

I would say nrn_state_hh will be our first benchmarking kernel! :)

I hope the above gives some idea of the patterns required for vectorisation.
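As a starting point for the indirect accesses through node_index, the natural IR constructs are llvm.masked.gather / llvm.masked.scatter, which IRBuilder exposes as CreateMaskedGather and CreateMaskedScatter. A minimal sketch, again assuming an LLVM 11-style API and a hypothetical helper (not part of the current code base):

#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch (untested): indexVec holds 8 sign-extended node_index values.
Value *emitIndirectAccess(IRBuilder<> &b, Value *voltage, Value *ion_ena,
                          Value *indexVec, Value *update) {
    Type *f64 = b.getDoubleTy();
    Value *mask = b.CreateVectorSplat(8, b.getTrue());

    // v = voltage[node_index[id]] for 8 ids at once (gather)
    Value *vPtrs = b.CreateGEP(f64, voltage, indexVec);          // <8 x double*>
    Value *v = b.CreateMaskedGather(vPtrs, Align(8), mask);

    // ion_ena[node_index[id]] += update (gather, add, scatter); only safe if
    // the node_id values inside one vector are unique -- see the comment
    // below about store conflicts.
    Value *ePtrs = b.CreateGEP(f64, ion_ena, indexVec);
    Value *old = b.CreateMaskedGather(ePtrs, Align(8), mask);
    b.CreateMaskedScatter(b.CreateFAdd(old, update), ePtrs, Align(8), mask);
    return v;
}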

pramodk commented 3 years ago

By the way, currently you don't see for loops with vectorisable patterns (as shown in nrn_state_hh() above) in FUNCTION / PROCEDURE blocks. This is what I am working on in #476. With that PR, you will see functions containing loops for which you can emit LLVM IR.

pramodk commented 3 years ago

Another point to keep in mind: in a small number of mechanism kernels we could have store conflicts, i.e. multiple SIMD lanes writing to the same location:

for (id = 0; id < node_count; id = id + 1) {
      node_id = node_index[id];
      ...
      ion_ena[node_id] += n[id];         // here, node_id could be the same for different values of id
}

In this case, the SIMD store needs atomic (or otherwise conflict-safe) writes. Also imagine the scenario where multiple threads are active at the same time.
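One simple, conservative option (a sketch of the idea, not a decided design) is to vectorise the independent per-id work but keep the indirect accumulation scalar, making it atomic when several threads update ion_ena concurrently:

// Sketch: the indirect accumulation is kept as a separate scalar loop so that
// repeated node_id values cannot race within a vector; with multiple threads
// the update would additionally need e.g. "#pragma omp atomic update".
void accumulate(double *ion_ena, const double *n, const int *node_index,
                int node_count) {
    for (int id = 0; id < node_count; ++id) {
        ion_ena[node_index[id]] += n[id];
    }
}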

Here are some references:

castigli commented 3 years ago

Regarding today's discussion about exp: it turns out that _mm256_exp_pd / _mm512_exp_pd are not hardware intrinsics but part of Intel's SVML library, so I guess it is too late to include them at the LLVM IR level? Regardless, for portability we could expand our own fast math library (possibly writing it in LLVM IR directly?).

Minimal example

// llvm 11.0.0
// clang++ -std=c++17 -O3 -mavx512f -fveclib=SVML

#include <immintrin.h>
#include <cmath>

constexpr size_t len = 64;
using vec = double[len];

void Exp(vec ret, vec x)
{
    for (size_t i = 0; i < len; ++i)
    {
        ret[i] = exp(x[i]);
    }
}
Exp(double*, double*):                             # @Exp(double*, double*)
        pushq   %r15
        pushq   %r14
        pushq   %rbx
        movq    %rsi, %r15
        movq    %rdi, %r14
        leaq    512(%rsi), %rax
        cmpq    %rdi, %rax
        jbe     .LBB0_4
        leaq    512(%r14), %rax
        cmpq    %r15, %rax
        jbe     .LBB0_4
        xorl    %ebx, %ebx
.LBB0_3:                                # =>This Inner Loop Header: Depth=1
        vmovsd  (%r15,%rbx,8), %xmm0            # xmm0 = mem[0],zero
        callq   exp
        vmovsd  %xmm0, (%r14,%rbx,8)
        addq    $1, %rbx
        cmpq    $64, %rbx
        jne     .LBB0_3
        popq    %rbx
        popq    %r14
        popq    %r15
        retq
.LBB0_4:
        vmovups (%r15), %zmm0
        callq   __svml_exp8
        vmovups %zmm0, (%r14)
        vmovups 64(%r15), %zmm0
        callq   __svml_exp8
        vmovups %zmm0, 64(%r14)
        vmovups 128(%r15), %zmm0
        callq   __svml_exp8
        vmovups %zmm0, 128(%r14)
        vmovups 192(%r15), %zmm0
        callq   __svml_exp8
        vmovups %zmm0, 192(%r14)
        vmovups 256(%r15), %zmm0
        callq   __svml_exp8
        vmovups %zmm0, 256(%r14)
        vmovups 320(%r15), %zmm0
        callq   __svml_exp8
        vmovups %zmm0, 320(%r14)
        vmovups 384(%r15), %zmm0
        callq   __svml_exp8
        vmovups %zmm0, 384(%r14)
        vmovups 448(%r15), %zmm0
        callq   __svml_exp8
        vmovups %zmm0, 448(%r14)
        popq    %rbx
        popq    %r14
        popq    %r15
        vzeroupper
        retq

https://godbolt.org/z/vnshMGqKq

pramodk commented 3 years ago

so I guess it is too late to include them at the LLVM IR level? Regardless, for portability we could expand our own fast math library (possibly writing it in LLVM IR directly?)

@castigli: I missed your comments above. See the relevant discussion in #589 - we are hoping to use an LLVM pass to replace LLVM math intrinsics with SVML or another math library.
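For illustration only, here is a rough sketch of what such a replacement could look like as a small IR rewrite that swaps llvm.exp.v8f64 calls for the __svml_exp8 symbol seen in the assembly above. This is not the mechanism discussed in #589 (LLVM's own -fveclib / TargetLibraryInfo machinery may be a better fit); it just shows the shape of the transformation:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include <vector>

using namespace llvm;

// Sketch (untested): replace vector llvm.exp intrinsics with __svml_exp8 calls.
void replaceExpWithSVML(Module &m) {
    std::vector<IntrinsicInst *> worklist;
    for (Function &f : m)
        for (BasicBlock &bb : f)
            for (Instruction &i : bb)
                if (auto *ii = dyn_cast<IntrinsicInst>(&i))
                    if (ii->getIntrinsicID() == Intrinsic::exp && ii->getType()->isVectorTy())
                        worklist.push_back(ii);

    for (IntrinsicInst *ii : worklist) {
        // A real pass would derive the SVML name from the element count and
        // type; here we assume <8 x double> for simplicity.
        Type *vecTy = ii->getType();
        FunctionCallee svml = m.getOrInsertFunction(
            "__svml_exp8", FunctionType::get(vecTy, {vecTy}, /*isVarArg=*/false));
        IRBuilder<> b(ii);
        Value *call = b.CreateCall(svml, {ii->getArgOperand(0)});
        ii->replaceAllUsesWith(call);
        ii->eraseFromParent();
    }
}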