pramodk opened 3 years ago
Using the mod file example from above, I looked at 4 different kernels:
For the first kernel, I looked at the following LLVM vector sizes:
As in the example, I looked at the following microarchitectures:
For reference, this is the data struct with offsets in bytes:
INSTANCE_STRUCT {
    DOUBLE  *minf       : 0
    DOUBLE  *mtau       : 8
    DOUBLE  *m          : 16
    DOUBLE  *Dm         : 24
    DOUBLE  *v_unused   : 32
    DOUBLE  *g_unused   : 40
    DOUBLE  *voltage    : 48
    INTEGER *node_index : 56
    DOUBLE  t           : 64
    DOUBLE  dt          : 72
    DOUBLE  celsius     : 80
    INTEGER secondorder : 88
    INTEGER node_count  : 92
}
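For context, here is a minimal C++ mirror of this layout; the field names are taken from the dump above, the offsets assume a 64-bit target (8-byte pointers), and the struct name Hh_Instance is only illustrative:
#include <cstddef>

// Hypothetical mirror of the instance struct dumped above.
struct Hh_Instance {
    double* minf;        // offset 0
    double* mtau;        // offset 8
    double* m;           // offset 16
    double* Dm;          // offset 24
    double* v_unused;    // offset 32
    double* g_unused;    // offset 40
    double* voltage;     // offset 48
    int*    node_index;  // offset 56
    double  t;           // offset 64
    double  dt;          // offset 72
    double  celsius;     // offset 80
    int     secondorder; // offset 88
    int     node_count;  // offset 92
};

// Sanity checks against the offsets listed above (valid because the struct is standard layout).
static_assert(offsetof(Hh_Instance, voltage) == 48, "unexpected layout");
static_assert(offsetof(Hh_Instance, node_count) == 92, "unexpected layout");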
nrn_state_hh: # @nrn_state_hh
# %bb.0:
movl 92(%rdi), %ecx # mech->nodecount
leal -7(%rcx), %eax # mech->nodecount - 7
testl %eax, %eax # mech->nodecount - 7 <= 0?
jle .LBB0_1
# %bb.6: # %for.body.preheader
xorl %eax, %eax # eax will be id, set counter to zero
.p2align 4, 0x90
.LBB0_7: # %for.body
# =>This Inner Loop Header: Depth=1
cltq # sign extend eax -> rax (this instruction could be avoided with a size_t loop var)
movq (%rdi), %rcx # mech->minf to rcx | (these 3 ops could be outside the loop)
movq 8(%rdi), %rdx # mech->mtau to rdx |
movq 16(%rdi), %rsi # mech->m to rsi |
vmovapd (%rcx,%rax,8), %zmm0 # load packed double mech->minf[id] to zmm0
vsubpd (%rsi,%rax,8), %zmm0, %zmm0 # mech->minf[id] - load pd mech->m[id]
vdivpd (%rdx,%rax,8), %zmm0, %zmm0 # (mech->minf[id] - mech->m[id]) / load pd mech->mtau[id]
vmovapd %zmm0, (%rsi,%rax,8) # store packed double to mech->m[id]
addl $8, %eax # increment id
movl 92(%rdi), %ecx # mech->nodecount | (these 2 ops could be outside the loop)
leal -7(%rcx), %edx # mech->nodecount - 7 |
cmpl %edx, %eax # id < mech->nodecount -7?
jl .LBB0_7
# ...scalar loop...
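For reference, a hedged C++ sketch of the source-level loop this first kernel appears to implement, using the hypothetical Hh_Instance mirror from above; the actual NMODL-generated code may differ:
void nrn_state_hh_sketch(Hh_Instance* mech) {
    // The 8-wide AVX-512 loop above computes this update one zmm register
    // (8 doubles) at a time, then falls back to a scalar remainder loop.
    for (int id = 0; id < mech->node_count; ++id) {
        mech->m[id] = (mech->minf[id] - mech->m[id]) / mech->mtau[id];
    }
}
The wider vector-size variants below are the same loop body, unrolled by the compiler to two and four zmm registers per iteration.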
# ...only vec loop body...
.LBB0_7: # %for.body
# =>This Inner Loop Header: Depth=1
cltq
movq (%rdi), %rcx
movq 8(%rdi), %rdx
movq 16(%rdi), %rsi
vmovapd (%rcx,%rax,8), %zmm0
vmovapd 64(%rcx,%rax,8), %zmm1
vsubpd (%rsi,%rax,8), %zmm0, %zmm0
vsubpd 64(%rsi,%rax,8), %zmm1, %zmm1
vdivpd 64(%rdx,%rax,8), %zmm1, %zmm1
vdivpd (%rdx,%rax,8), %zmm0, %zmm0
vmovapd %zmm0, (%rsi,%rax,8)
vmovapd %zmm1, 64(%rsi,%rax,8)
addl $16, %eax
movl 92(%rdi), %ecx
leal -15(%rcx), %edx
cmpl %edx, %eax
jl .LBB0_7
# ...only vec loop body...
.LBB0_7: # %for.body
# =>This Inner Loop Header: Depth=1
cltq
movq (%rdi), %rcx
movq 8(%rdi), %rdx
movq 16(%rdi), %rsi
vmovapd (%rcx,%rax,8), %zmm0
vmovapd 64(%rcx,%rax,8), %zmm1
vmovapd 128(%rcx,%rax,8), %zmm2
vmovapd 192(%rcx,%rax,8), %zmm3
vsubpd 192(%rsi,%rax,8), %zmm3, %zmm3
vdivpd 192(%rdx,%rax,8), %zmm3, %zmm3
vsubpd (%rsi,%rax,8), %zmm0, %zmm0
vdivpd (%rdx,%rax,8), %zmm0, %zmm0
vsubpd 64(%rsi,%rax,8), %zmm1, %zmm1
vdivpd 64(%rdx,%rax,8), %zmm1, %zmm1
vsubpd 128(%rsi,%rax,8), %zmm2, %zmm2
vdivpd 128(%rdx,%rax,8), %zmm2, %zmm2
vmovapd %zmm2, 128(%rsi,%rax,8)
vmovapd %zmm1, 64(%rsi,%rax,8)
vmovapd %zmm0, (%rsi,%rax,8)
vmovapd %zmm3, 192(%rsi,%rax,8)
addl $32, %eax
movl 92(%rdi), %ecx
leal -31(%rcx), %edx
cmpl %edx, %eax
jl .LBB0_7
nrn_state_hh: # @nrn_state_hh
# %bb.0:
pushq %r15 # | save callee registers on stack
pushq %r14 # |
pushq %rbx # |
movq %rdi, %r14 # copy mech in r14
movl 92(%rdi), %eax # mech->nodecount
leal -7(%rax), %ecx # mech->nodecount - 7
xorl %ebx, %ebx # set id = 0
testl %ecx, %ecx # |
jle .LBB0_1 # | mech->nodecount - 7 <= 0?
.p2align 4, 0x90
.LBB0_5: # %for.body
# =>This Inner Loop Header: Depth=1
movslq %ebx, %rbx # copy id and sign extend (this op could be avoided)
movq (%r14), %rax # mech->minf to rax | (these 3 ops could be outside the loop)
movq 8(%r14), %rcx # mech->mtau to rcx |
movq 16(%r14), %rdx # mech->m to rdx |
vmovapd (%rax,%rbx,8), %zmm0 # load packed double mech->minf[id]
vsubpd (%rdx,%rbx,8), %zmm0, %zmm0 # mech->minf[id] - load pd mech->m[id]
vdivpd (%rcx,%rbx,8), %zmm0, %zmm0 # (mech->minf[id] - mech->m[id]) / load pd mech->mtau[id]
vmovapd %zmm0, (%rdx,%rbx,8) # store pd to mech->m[id]
movq 16(%r14), %r15 # mech->m to r15 (this reload seems unnecessary)
vmovaps (%r15,%rbx,8), %zmm0 # move packed single into 1st arg for function call (technically should be vmovapd)
callq __svml_exp8@PLT # call to svml vec exp
vmovaps %zmm0, (%r15,%rbx,8) # store ps to mech->m[id] (technically should be vmovapd)
addl $8, %ebx # id += 8
movl 92(%r14), %eax # mech->nodecount | (these 2 ops could be outside the loop)
leal -7(%rax), %ecx # mech->nodecount - 7 |
cmpl %ecx, %ebx # |
jl .LBB0_5 # |id < mech->nodecount -7?
# ...scalar loop...
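The corresponding source-level sketch for this kernel, assumed from the asm comments (the exp() call is what gets lowered to __svml_exp8 for 8 lanes):
#include <cmath>

void nrn_state_hh_exp_sketch(Hh_Instance* mech) {
    for (int id = 0; id < mech->node_count; ++id) {
        mech->m[id] = (mech->minf[id] - mech->m[id]) / mech->mtau[id];
        mech->m[id] = std::exp(mech->m[id]);  // vectorized as a call to __svml_exp8
    }
}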
nrn_state_hh: # @nrn_state_hh
# %bb.0:
movl 92(%rdi), %ecx # mech->nodecount
leal -7(%rcx), %eax # mech->nodecount - 7
testl %eax, %eax # | mech->nodecount - 7 <= 0?
jle .LBB0_1 # |
# %bb.6: # %for.body.preheader
xorl %eax, %eax # set id = 0
.p2align 4, 0x90
.LBB0_7: # %for.body
# =>This Inner Loop Header: Depth=1
cltq # sign extend eax -> rax (this instruction could be avoided with a size_t loop var)
movq 56(%rdi), %rcx # copy mech->nodeindex in rcx (this could be moved outside the loop)
vmovapd (%rcx,%rax,4), %ymm0 # load 8 x 32-bit node indices mech->node_index[id] into ymm0
movq 48(%rdi), %rcx # copy mech->voltage in rcx (this could be moved outside the loop)
kxnorw %k0, %k0, %k1 # set all bit mask to 1 in k1
vgatherdpd (%rcx,%ymm0,8), %zmm1 {%k1} # indirect load mech->voltage[node_id] to zmm1
movq (%rdi), %rcx # copy mech->minf to rcx
movq 16(%rdi), %rdx # copy mech->m to rdx
vaddpd (%rcx,%rax,8), %zmm1, %zmm0 # mech->voltage[node_id] + mech->minf[id]
vmovapd %zmm0, (%rdx,%rax,8) # store pd in mech->m[id]
addl $8, %eax # id +=8
movl 92(%rdi), %ecx # mech->nodecount | (these 2 ops could be outside the loop)
leal -7(%rcx), %edx # mech->nodecount - 7 |
cmpl %edx, %eax # |
jl .LBB0_7 # |id < mech->nodecount -7?
# ...scalar loop...
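A hedged sketch of the indirect access this kernel vectorizes with vgatherdpd (again assuming the Hh_Instance mirror above):
void nrn_state_hh_gather_sketch(Hh_Instance* mech) {
    for (int id = 0; id < mech->node_count; ++id) {
        const int node_id = mech->node_index[id];                // 32-bit index
        mech->m[id] = mech->voltage[node_id] + mech->minf[id];   // gather + add
    }
}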
nrn_state_hh: # @nrn_state_hh
# %bb.0:
pushq %r15 # | save callee registers on stack
pushq %r14 # |
pushq %rbx # |
subq $352, %rsp # imm = 0x160 (allocate 352 bytes on the stack)
movq %rdi, %r14 # copy mech in r14
movl 92(%rdi), %eax # mech->nodecount
leal -7(%rax), %ecx # mech->nodecount -7
xorl %ebx, %ebx # set id = 0
testl %ecx, %ecx # | mech->nodecount - 7 <= 0?
jle .LBB0_1 # |
# %bb.5: # %for.body.preheader
vbroadcastsd .LCPI0_0(%rip), %zmm0 # zmm0 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
vmovups %zmm0, 256(%rsp) # 64-byte Spill, unaligned packed single store (technically should be vmovupd)
vbroadcastsd .LCPI0_1(%rip), %zmm0 # zmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
vmovups %zmm0, 192(%rsp) # 64-byte Spill, unaligned packed single store (technically should be vmovupd)
vbroadcastsd .LCPI0_2(%rip), %zmm0 # zmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
vmovupd %zmm0, 128(%rsp) # 64-byte Spill, unaligned packed double store
.p2align 4, 0x90
.LBB0_6: # %for.body
# =>This Inner Loop Header: Depth=1
movslq %ebx, %rbx # move id to rbx and sign extend
movq 16(%r14), %r15 # copy mech->m in r15
vmovapd (%r15,%rbx,8), %zmm3 # load packed double mech->m[id] | why
vmovupd %zmm3, 64(%rsp) # 64-byte Spill, store unaligned packed double mech->m[id] on the stack | ?
movq 8(%r14), %rax # copy mech->mtau in rax
vmovapd (%rax,%rbx,8), %zmm0 # load packed double mech->mtau[id]
vmovupd 256(%rsp), %zmm1 # 64-byte Reload, -1
vdivpd %zmm0, %zmm1, %zmm1 # -1/mech->mtau[id]
movq (%r14), %rax # copy mech->minf in rax
vmovupd 192(%rsp), %zmm2 # 64-byte Reload, -0.0 |
vxorpd (%rax,%rbx,8), %zmm2, %zmm2 # load pd mech->minf[id] XOR -0.0 | (sign flip: -mech->minf[id])
vdivpd %zmm0, %zmm2, %zmm0 # -mech->minf[id] / mech->mtau[id]
vdivpd %zmm1, %zmm0, %zmm0 # (-mech->minf[id] / mech->mtau[id]) / (-1/mech->mtau[id])
vsubpd %zmm3, %zmm0, %zmm0 # (mech->minf[id] / mech->mtau[id]) / (-1/mech->mtau[id]) - mech->m[id]
vmovupd %zmm0, (%rsp) # 64-byte Spill, store unaligned pd to stack (there are enough registers to avoid this)
vmulpd 72(%r14){1to8}, %zmm1, %zmm0 # broadcast mech->dt then * -1/mech->mtau[id]
callq __svml_exp8@PLT # call SVML
vmovupd 128(%rsp), %zmm1 # 64-byte Reload, load unaligned packed from stack, 1.0
vsubpd %zmm0, %zmm1, %zmm0 # 1.0 - exp(...)
vmulpd (%rsp), %zmm0, %zmm0 # 64-byte Folded Reload, uload pd from stack, (...) * (1 - exp(...))
vaddpd 64(%rsp), %zmm0, %zmm0 # 64-byte Folded Reload, uload pd from stack, (...) + mech->m[id]
vmovapd %zmm0, (%r15,%rbx,8) # aligned store pd to mech->m[id]
addl $8, %ebx # id+=8
movl 92(%r14), %eax # mech->nodecount | (these 2 ops could be outside the loop)
leal -7(%rax), %ecx # mech->nodecount - 7 |
cmpl %ecx, %ebx # | id < mech->nodecount -7?
jl .LBB0_6 # |
# ...scalar loop...
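A hedged source-level sketch of this last kernel, reconstructed from the asm comments; note the unsimplified (minf/mtau)/(-1/mtau) shape, which is exactly what the follow-up discussion below is about:
#include <cmath>

void nrn_state_hh_cnexp_sketch(Hh_Instance* mech) {
    for (int id = 0; id < mech->node_count; ++id) {
        const double mrate = -1.0 / mech->mtau[id];                      // -1/mtau
        const double mss   = (-mech->minf[id] / mech->mtau[id]) / mrate; // equals minf, but not simplified
        mech->m[id] += (mss - mech->m[id]) * (1.0 - std::exp(mech->dt * mrate));
    }
}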
@castigli Great summary, thanks a lot! Some points from LLVM's side:
- The mod file could be simplified; the compiler does not simplify things like (a/b)/(1/b) because b can be 0 => trap, or undef => possibly any value, 0 included. I think that for such optimisations there is a flag in LLVM that allows performing occasionally unsafe optimisations (not sure about division: hoisting 1/x out of the loop is definitely illegal, but instructions that may overflow are fine).
- There are some ops that could be removed from the loop. I am looking into this in more detail, but the "copy mech-> in rax" loads (i.e. the movq instructions) are not handled by the optimiser due to weak alias analysis. In #606 I am looking more into this: these loads definitely should go out of the loop.
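For illustration, a hedged sketch of what hoisting those loads would look like at the source level; the __restrict__ qualifiers (a GCC/Clang extension) are an assumption about what would let alias analysis keep the member loads outside the loop:
void nrn_state_hh_hoisted(Hh_Instance* mech) {
    // Load the member pointers and the trip count once, before the loop.
    double* __restrict__ minf = mech->minf;
    double* __restrict__ mtau = mech->mtau;
    double* __restrict__ m    = mech->m;
    const int node_count      = mech->node_count;

    for (int id = 0; id < node_count; ++id) {
        m[id] = (minf[id] - m[id]) / mtau[id];
    }
}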
@georgemitenkov welcome!
> I think that for such optimisations there is a flag in LLVM that allows to perform occasionally unsafe optimisations
I'll try to have a look at this; if I were to write that code manually I would simplify those expressions.
> I am looking into this in more detail, but "copy mech-> in rax" (i.e. movq instructions) are not handled in optimiser due to weak alias analysis. In #606 I am looking more into this: these loads definitely should go out of the loop.
Good to know, it looks like you already found a solution!
Do you have some idea on why there are so many stack stores/loads when there are plenty of vector registers available?
> if I were to write that code manually I would simplify those expressions
Agreed!
> Do you have some idea on why there are so many stack stores/loads when there are plenty of vector registers available?
Not yet, I am looking more into this.
Just a quick update with regard to code generation for ARMv8+SVE.
It seems like LLVM 13 is still not able to generate SVE instructions. I tried to target the a64fx CPU or a generic aarch64+sve with

llc-13 -o - -O3 -march=aarch64 -mcpu=a64fx test.bc

or

llc-13 -o - -O3 -march=aarch64 -mattr=+v8.4a,+sve test.bc

Only Advanced SIMD (Neon) instructions are generated, no SVE:
// simd loop only
.LBB0_2: // %for.body
// =>This Inner Loop Header: Depth=1
ldp x11, x10, [x0, #8]
ldr x12, [x0]
sbfiz x9, x8, #3, #32
add w8, w8, #8 // =8
add x11, x11, x9
add x10, x10, x9
add x9, x12, x9
ldp q0, q1, [x10]
ldp q3, q2, [x10, #32]
ldp q4, q5, [x9]
ldp q7, q6, [x9, #32]
ldp q17, q16, [x11]
fsub v3.2d, v7.2d, v3.2d
fsub v1.2d, v5.2d, v1.2d
fsub v0.2d, v4.2d, v0.2d
fsub v2.2d, v6.2d, v2.2d
fdiv v0.2d, v0.2d, v17.2d
fdiv v1.2d, v1.2d, v16.2d
ldp q4, q5, [x11, #32]
fdiv v2.2d, v2.2d, v5.2d
fdiv v3.2d, v3.2d, v4.2d
stp q0, q1, [x10]
stp q3, q2, [x10, #32]
ldr w9, [x0, #92]
sub w10, w9, #7 // =7
cmp w8, w10
b.lt .LBB0_2
This is consistent with some autovectorization tests I made for a simple daxpy function.
#include <cstddef>

namespace scalar
{
void daxpy(double a, double* x, double* y, std::size_t size, double* ret)
{
for(std::size_t i=0; i < size; ++i)
{
ret[i] = a * x[i] + y[i];
}
}
}
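A hedged variant that makes the fused multiply-add explicit in the source via std::fma; whether the plain a * x[i] + y[i] form is contracted into an fma depends on the compiler's floating-point contraction settings:
#include <cmath>
#include <cstddef>

namespace scalar
{
    // Same daxpy, but requesting the fused multiply-add explicitly.
    void daxpy_fma(double a, const double* x, const double* y, std::size_t size, double* ret)
    {
        for (std::size_t i = 0; i < size; ++i)
        {
            ret[i] = std::fma(a, x[i], y[i]);  // a * x[i] + y[i] with a single rounding
        }
    }
}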
clang++ (12.0.0) -march=armv8-a -std=c++17 -O1
(gcc 10.3 generates a single fma instruction)
scalar::daxpy(double, double*, double*, unsigned long, double*):
cbz x2, .LBB0_2
.LBB0_1:
ldr d1, [x0], #8
ldr d2, [x1], #8
subs x2, x2, #1
fmul d1, d1, d0 // |
fadd d1, d1, d2 // | --> this could be a fma, gcc does that
str d1, [x3], #8
b.ne .LBB0_1
.LBB0_2:
clang++ (12.0.0) -march=armv8-a -std=c++17 -O1
(2x unroll, again no fma)
// simd loop only
.LBB0_6: // =>This Inner Loop Header: Depth=1
ldp q2, q3, [x11, #-16]
ldp q4, q5, [x10, #-16]
subs x12, x12, #4 // =4
add x10, x10, #32 // =32
fmul v2.2d, v2.2d, v1.2d
fmul v3.2d, v3.2d, v1.2d
fadd v2.2d, v2.2d, v4.2d
fadd v3.2d, v3.2d, v5.2d
stp q2, q3, [x9, #-16]
add x9, x9, #32 // =32
add x11, x11, #32 // =32
b.ne .LBB0_6
cmp x8, x2
b.eq .LBB0_10
clang++ does not generate SVE code. With g++ (10.3) -march=armv8-a+sve -std=c++17 -O3, gcc generates SVE instructions. However, it also generates a tail loop that should not be necessary with the predicated instructions in SVE:
// simd loop only
.L5:
ld1d z0.d, p0/z, [x0, x4, lsl 3]
ld1d z2.d, p0/z, [x1, x4, lsl 3]
fmad z0.d, p1/m, z1.d, z2.d
st1d z0.d, p0, [x3, x4, lsl 3]
add x4, x4, x5
whilelo p0.d, x4, x2
b.any .L5
Thanks @castigli for the above summary. For SVE, here is an example George provided: https://godbolt.org/z/jc8v8vbrx
For the record, copying the example here:
void foo(double *a, int n) {
int id;
#pragma clang loop vectorize_width(4, scalable) unroll(disable) distribute(disable)
for (id = 0; id < n; id = id+1) {
a[id] = 7;
}
}
And the article I mentioned in the email is this one.
Anyway, we could check SVE in the coming days. On x86/BB5, it would be great if we could compare the performance of kernels executed via JIT vs the performance of kernels compiled offline via the Intel or Clang+SVML toolchain. I am thinking of the following:
- From test.mod, take the nrn_state_hh kernel and the hh_Instance struct generated from the cpp file. We need to write/tweak the instance struct manually to match what we generate in NMODL for JIT execution, but it won't take that much time (@iomaganaris or I will help). Compile this kernel and instance struct with clang+svml and create a library called foo.a.
- Allow linking the foo.a library (e.g. via CXX/LD flags).
- Call nrn_state_hh from foo.a as well and measure execution time (a minimal timing sketch is given after this list).

The above process is a bit manual, but I think it would be quicker to get started. It would be nice if we could get this early, so that we are confident about the performance numbers and can provide feedback to George if further changes/optimisations are needed.
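For the offline-vs-JIT comparison, a minimal timing harness could look like the sketch below; the extern "C" signature of nrn_state_hh and the way the instance struct is passed are assumptions that would have to match what actually ends up in foo.a:
#include <chrono>

// Assumed to be provided by foo.a (clang+SVML build of the generated kernel).
extern "C" void nrn_state_hh(void* mech_instance);

double seconds_per_call(void* mech_instance, int iterations) {
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < iterations; ++i) {
        nrn_state_hh(mech_instance);  // offline-compiled kernel
    }
    const std::chrono::duration<double> elapsed =
        std::chrono::steady_clock::now() - start;
    return elapsed.count() / iterations;
}
The JIT-compiled kernel would be timed the same way through its own entry point, so the two numbers stay comparable.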
A small update on the benchmark results of JIT vs Intel.
With reference to the "compute bound" kernel, preliminary results showed a significant difference in speed between the JIT-compiled kernel and the Intel compiler with flags -O2 -qopt-zmm-usage=high -xCORE-AVX512 -fimf-use-svml.
Upon comparing the asm, the main difference seems to be that the Intel compiler replaces the exact division with an approximate reciprocal (vrcp14pd).
You can see the effect of the kernel and flags here: https://godbolt.org/z/oqq9hbfEh
Here is a snippet of the relevant instructions for the exact division
...
vmovups (%rbx,%r14,8), %zmm5 #31.89
vsubpd %zmm18, %zmm16, %zmm1 #31.76
vdivpd %zmm5, %zmm1, %zmm3 #31.89
vaddpd %zmm0, %zmm17, %zmm2 #31.40
vaddpd %zmm3, %zmm2, %zmm4 #31.89
...
versus reciprocal
...
vmovups (%rbx,%r14,8), %zmm6 #31.89
vsubpd %zmm18, %zmm16, %zmm5 #31.76
vrcp14pd %zmm6, %zmm3 #31.89
vaddpd %zmm0, %zmm17, %zmm4 #31.40
vfpclasspd $30, %zmm3, %k0 #31.89
vmovaps %zmm3, %zmm1 #31.89
...
To eliminate the use of the approximate reciprocal, I added the -prec-div flag to icpc.
As part of this ticket, we should look at the generated LLVM IR + assembly kernels and see if there are any obvious performance issues. Here are the steps to generate the .ll file with a certain vector width while targeting different ISAs:
- Check all available architectures for a specific target:
- Generate the .ll file with a vector width of 8 once:
- And generate assembly kernels for different targets easily:

And the generated assembly kernels: