Closed mppf closed 5 years ago
Looking at muladdsub. First, I have a small performance reproducer:
use Time;
// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 50000000; // "short"
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 171; // "short"
proc main() {
var ltimer:Timer;
var RealArray_1D: [0..10][0..#len] real;
// The stuff from LCALSLoops.chpl loopInit
for i in 1..5 do
initData(RealArray_1D[i-1], i);
// This portion from RunBRawLoops.chpl runBRawLoops e.g.
ref out1 = RealArray_1D[0];
ref out2 = RealArray_1D[1];
ref out3 = RealArray_1D[2];
ref in1 = RealArray_1D[3];
ref in2 = RealArray_1D[4];
ltimer.start();
for 0..#num_samples {
for i in 0..#len {
out1[i] = in1[i] * in2[i];
out2[i] = in1[i] + in2[i];
out3[i] = in1[i] - in2[i];
}
}
ltimer.stop();
var chksum: real = 0.0;
// Get this portion from LCALSLoops.chpl loopFinalize
updateChksum(chksum, len, RealArray_1D[0]);
updateChksum(chksum, len, RealArray_1D[1]);
updateChksum(chksum, len, RealArray_1D[2]);
// Get the Checksum from LCALSChecksums.chpl
var expect = 391727437.25186200960888527333736;
writeln("Got checksum ", chksum, " expected ", expect);
writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}
proc initData(ra: [] real, id: int) {
const factor: real = if id % 2 != 0 then 0.1 else 0.2;
for (r,j) in zip(ra, 0..) {
r = factor*(j + 1.1)/(j + 1.12345);
}
}
proc updateChksum(ref chksum:real, length: int,
ra: [] real, scale_factor: real = 1.0) {
use LongDouble;
ref data = ra;
var len = ra.numElements;
var tchk: longdouble = chksum;
for (j, dat) in zip(0..#len, data) {
tchk += (j+1)*dat*scale_factor;
}
chksum = tchk:real;
}
Next, I ran the (optimized) C version in gdb and hit control-c in the middle of execution. This loop is apparently the kernel:
0x423108 <chpl_user_main+6184>: vmovupd (%r11,%rdx,1),%ymm0
0x42310e <chpl_user_main+6190>: vmulpd (%rbx,%rdx,1),%ymm0,%ymm0
0x423113 <chpl_user_main+6195>: add $0x1,%r10
0x423117 <chpl_user_main+6199>: vmovupd %ymm0,(%r15,%rdx,1)
=> 0x42311d <chpl_user_main+6205>: vmovupd (%r11,%rdx,1),%ymm0
0x423123 <chpl_user_main+6211>: vaddpd (%rbx,%rdx,1),%ymm0,%ymm0
0x423128 <chpl_user_main+6216>: vmovupd %ymm0,(%r14,%rdx,1)
0x42312e <chpl_user_main+6222>: vmovapd (%rbx,%rdx,1),%ymm0
0x423133 <chpl_user_main+6227>: vsubpd (%r11,%rdx,1),%ymm0,%ymm0
0x423139 <chpl_user_main+6233>: vmovupd %ymm0,(%r9,%rdx,1)
0x42313f <chpl_user_main+6239>: add $0x20,%rdx
0x423143 <chpl_user_main+6243>: cmp %r10,%r13
0x423146 <chpl_user_main+6246>: ja 0x423108 <chpl_user_main+6184>
Here is the kernel in the optimized LLVM version:
0x428e00 <chpl_user_main+6400>: vmovsd 0x8(%rcx,%rbp,8),%xmm0
0x428e06 <chpl_user_main+6406>: vmulsd 0x8(%rdx,%rbp,8),%xmm0,%xmm0
0x428e0c <chpl_user_main+6412>: vmovsd %xmm0,0x8(%rax,%rbp,8)
0x428e12 <chpl_user_main+6418>: vmovsd 0x8(%rcx,%rbp,8),%xmm0
0x428e18 <chpl_user_main+6424>: vaddsd 0x8(%rdx,%rbp,8),%xmm0,%xmm0
0x428e1e <chpl_user_main+6430>: vmovsd %xmm0,0x8(%rsi,%rbp,8)
=> 0x428e24 <chpl_user_main+6436>: vmovsd 0x8(%rcx,%rbp,8),%xmm0
0x428e2a <chpl_user_main+6442>: vsubsd 0x8(%rdx,%rbp,8),%xmm0,%xmm0
0x428e30 <chpl_user_main+6448>: vmovsd %xmm0,0x8(%rdi,%rbp,8)
0x428e36 <chpl_user_main+6454>: vmovsd 0x10(%rcx,%rbp,8),%xmm0
0x428e3c <chpl_user_main+6460>: vmulsd 0x10(%rdx,%rbp,8),%xmm0,%xmm0
0x428e42 <chpl_user_main+6466>: vmovsd %xmm0,0x10(%rax,%rbp,8)
0x428e48 <chpl_user_main+6472>: vmovsd 0x10(%rcx,%rbp,8),%xmm0
0x428e4e <chpl_user_main+6478>: vaddsd 0x10(%rdx,%rbp,8),%xmm0,%xmm0
0x428e54 <chpl_user_main+6484>: vmovsd %xmm0,0x10(%rsi,%rbp,8)
0x428e5a <chpl_user_main+6490>: vmovsd 0x10(%rcx,%rbp,8),%xmm0
0x428e60 <chpl_user_main+6496>: vsubsd 0x10(%rdx,%rbp,8),%xmm0,%xmm0
0x428e66 <chpl_user_main+6502>: vmovsd %xmm0,0x10(%rdi,%rbp,8)
0x428e6c <chpl_user_main+6508>: add $0x2,%rbp
0x428e70 <chpl_user_main+6512>: cmp %r14,%rbp
0x428e73 <chpl_user_main+6515>: jl 0x428e00 <chpl_user_main+6400>
This benchmark has equivalent performance with clang-included and with --llvm, so it's not necessarily the case that we're simply encoding the LLVM IR wrong. It seems that LLVM is not vectorizing.
If I adjust the program to run with vectorizeOnly, the LLVM version becomes faster than the GCC one.
Possible remedy: extend no-alias metadata for the sub-array case.
The following version of the benchmark shows LLVM can be faster:
use Time;
// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 50000000; // "short"
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 171; // "short"
proc main() {
var ltimer:Timer;
var RealArray_1D0: [0..#len] real;
var RealArray_1D1: [0..#len] real;
var RealArray_1D2: [0..#len] real;
var RealArray_1D3: [0..#len] real;
var RealArray_1D4: [0..#len] real;
// The stuff from LCALSLoops.chpl loopInit
initData(RealArray_1D0, 1);
initData(RealArray_1D1, 2);
initData(RealArray_1D2, 3);
initData(RealArray_1D3, 4);
initData(RealArray_1D4, 5);
// This portion from RunBRawLoops.chpl runBRawLoops e.g.
ltimer.start();
for 0..#num_samples {
//for i in vectorizeOnly(0..#len) {
for i in 0..#len {
RealArray_1D0[i] = RealArray_1D3[i] * RealArray_1D4[i];
RealArray_1D1[i] = RealArray_1D3[i] + RealArray_1D4[i];
RealArray_1D2[i] = RealArray_1D3[i] - RealArray_1D4[i];
}
}
ltimer.stop();
var chksum: real = 0.0;
// Get this portion from LCALSLoops.chpl loopFinalize
updateChksum(chksum, len, RealArray_1D0);
updateChksum(chksum, len, RealArray_1D1);
updateChksum(chksum, len, RealArray_1D2);
// Get the Checksum from LCALSChecksums.chpl
var expect = 391727437.25186200960888527333736;
writeln("Got checksum ", chksum, " expected ", expect);
writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}
proc initData(ra: [] real, id: int) {
const factor: real = if id % 2 != 0 then 0.1 else 0.2;
for (r,j) in zip(ra, 0..) {
r = factor*(j + 1.1)/(j + 1.12345);
}
}
proc updateChksum(ref chksum:real, length: int,
ra: [] real, scale_factor: real = 1.0) {
use LongDouble;
ref data = ra;
var len = ra.numElements;
var tchk: longdouble = chksum;
for (j, dat) in zip(0..#len, data) {
tchk += (j+1)*dat*scale_factor;
}
chksum = tchk:real;
}
Here LLVM is about 2x faster than GCC (due to #11324).
pressure_calc: serial LLVM version is much slower than C version, using gcc or clang-included. With vectorizeOnly, the LLVM version is still slower than the C version (from either).
Here is a performance reproducer:
use Time;
// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 30000;
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 44217;
proc main() {
var ltimer:Timer;
var RealArray_1D: [0..10][0..#len] real;
var RealArray_scalars: [0..#10] real;
// The stuff from LCALSLoops.chpl loopInit
for i in 1..5 do
initData(RealArray_1D[i-1], i);
initData(RealArray_scalars, 1);
// This portion from RunBRawLoops.chpl runBRawLoops e.g.
ref compression = RealArray_1D[0];
ref bvc = RealArray_1D[1];
ref p_new = RealArray_1D[2];
ref e_old = RealArray_1D[3];
ref vnewc = RealArray_1D[4];
const cls = RealArray_scalars[0];
const p_cut = RealArray_scalars[1];
const pmin = RealArray_scalars[2];
const eosvmax = RealArray_scalars[3];
ltimer.start();
for 0..#num_samples {
//for i in vectorizeOnly(0..#len) {
for i in 0..#len {
bvc[i] = cls * (compression[i] + 1.0);
}
//for i in vectorizeOnly(0..#len) {
for i in 0..#len {
p_new[i] = bvc[i] * e_old[i];
if ( abs(p_new[i]) < p_cut ) then p_new[i] = 0.0;
if ( vnewc[i] >= eosvmax ) then p_new[i] = 0.0;
if ( p_new[i] < pmin ) then p_new[i] = pmin;
}
}
ltimer.stop();
var chksum: real = 0.0;
// Get this portion from LCALSLoops.chpl loopFinalize
updateChksum(chksum, len, RealArray_1D[2]);
// Get the Checksum from LCALSChecksums.chpl
var expect = 97197939.191977054695598781108856;
writeln("Got checksum ", chksum, " expected ", expect);
writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}
proc initData(ra: [] real, id: int) {
const factor: real = if id % 2 != 0 then 0.1 else 0.2;
for (r,j) in zip(ra, 0..) {
r = factor*(j + 1.1)/(j + 1.12345);
}
}
proc updateChksum(ref chksum:real, length: int,
ra: [] real, scale_factor: real = 1.0) {
use LongDouble;
ref data = ra;
var len = ra.numElements;
var tchk: longdouble = chksum;
for (j, dat) in zip(0..#len, data) {
tchk += (j+1)*dat*scale_factor;
}
chksum = tchk:real;
}
This benchmark has two loops. Only the one setting p_new shows the performance difference.
Here is the asm for the gcc version of the inner loop:
0x423138 <chpl_user_main+6088>: vmovupd (%r15,%rdx,1),%ymm0
0x42313e <chpl_user_main+6094>: vmulpd (%r14,%rdx,1),%ymm0,%ymm1
=> 0x423144 <chpl_user_main+6100>: add $0x1,%r9
0x423148 <chpl_user_main+6104>: vandpd %ymm8,%ymm1,%ymm0
0x42314d <chpl_user_main+6109>: vcmpltpd %ymm11,%ymm0,%ymm0
0x423153 <chpl_user_main+6115>: vandnpd %ymm1,%ymm0,%ymm0
0x423157 <chpl_user_main+6119>: vmovapd %ymm0,(%rbx,%rdx,1)
0x42315c <chpl_user_main+6124>: vcmplepd (%rcx,%rdx,1),%ymm10,%ymm1
0x423162 <chpl_user_main+6130>: vandnpd %ymm0,%ymm1,%ymm0
0x423166 <chpl_user_main+6134>: vmaxpd %ymm0,%ymm9,%ymm0
0x42316a <chpl_user_main+6138>: vmovapd %ymm0,(%rbx,%rdx,1)
0x42316f <chpl_user_main+6143>: add $0x20,%rdx
0x423173 <chpl_user_main+6147>: cmp %r9,%r11
0x423176 <chpl_user_main+6150>: ja 0x423138 <chpl_user_main+6088>
Here is the asm for the LLVM version of the inner loop:
0x4293b0 <chpl_user_main+7616>: vmovsd 0x8(%rcx,%rdi,8),%xmm1
0x4293b6 <chpl_user_main+7622>: vmulsd 0x8(%rdx,%rdi,8),%xmm1,%xmm1
0x4293bc <chpl_user_main+7628>: vandpd %xmm0,%xmm1,%xmm2
0x4293c0 <chpl_user_main+7632>: vcmpltsd %xmm3,%xmm2,%xmm2
0x4293c5 <chpl_user_main+7637>: vandnpd %xmm1,%xmm2,%xmm1
0x4293c9 <chpl_user_main+7641>: vmovsd %xmm1,0x8(%rax,%rdi,8)
0x4293cf <chpl_user_main+7647>: vmovsd 0x8(%rsi,%rdi,8),%xmm2
0x4293d5 <chpl_user_main+7653>: vucomisd %xmm4,%xmm2
0x4293d9 <chpl_user_main+7657>: jb 0x4293e8 <chpl_user_main+7672>
0x4293db <chpl_user_main+7659>: movq $0x0,0x8(%rax,%rdi,8)
0x4293e4 <chpl_user_main+7668>: vxorpd %xmm1,%xmm1,%xmm1
0x4293e8 <chpl_user_main+7672>: vucomisd %xmm1,%xmm5
0x4293ec <chpl_user_main+7676>: jbe 0x4293f3 <chpl_user_main+7683>
0x4293ee <chpl_user_main+7678>: mov %rbx,0x8(%rax,%rdi,8)
0x4293f3 <chpl_user_main+7683>: vmovsd 0x10(%rcx,%rdi,8),%xmm1
0x4293f9 <chpl_user_main+7689>: vmulsd 0x10(%rdx,%rdi,8),%xmm1,%xmm1
0x4293ff <chpl_user_main+7695>: vandpd %xmm0,%xmm1,%xmm2
0x429403 <chpl_user_main+7699>: vcmpltsd %xmm3,%xmm2,%xmm2
=> 0x429408 <chpl_user_main+7704>: vandnpd %xmm1,%xmm2,%xmm1
0x42940c <chpl_user_main+7708>: vmovsd %xmm1,0x10(%rax,%rdi,8)
0x429412 <chpl_user_main+7714>: vmovsd 0x10(%rsi,%rdi,8),%xmm2
0x429418 <chpl_user_main+7720>: vucomisd %xmm4,%xmm2
0x42941c <chpl_user_main+7724>: jae 0x429440 <chpl_user_main+7760>
0x42941e <chpl_user_main+7726>: vucomisd %xmm1,%xmm5
0x429422 <chpl_user_main+7730>: jbe 0x429453 <chpl_user_main+7779>
0x429424 <chpl_user_main+7732>: mov %rbx,0x10(%rax,%rdi,8)
0x429429 <chpl_user_main+7737>: add $0x2,%rdi
0x42942d <chpl_user_main+7741>: cmp %r12,%rdi
0x429430 <chpl_user_main+7744>: jl 0x4293b0 <chpl_user_main+7616>
0x429436 <chpl_user_main+7750>: jmp 0x429460 <chpl_user_main+7792>
0x429438 <chpl_user_main+7752>: nopl 0x0(%rax,%rax,1)
0x429440 <chpl_user_main+7760>: movq $0x0,0x10(%rax,%rdi,8)
0x429449 <chpl_user_main+7769>: vxorpd %xmm1,%xmm1,%xmm1
0x42944d <chpl_user_main+7773>: vucomisd %xmm1,%xmm5
0x429451 <chpl_user_main+7777>: ja 0x429424 <chpl_user_main+7732>
0x429453 <chpl_user_main+7779>: add $0x2,%rdi
0x429457 <chpl_user_main+7783>: cmp %r12,%rdi
0x42945a <chpl_user_main+7786>: jl 0x4293b0 <chpl_user_main+7616>
Here again vectorization is not occurring.
Here is the LLVM vectorizeOnly version
0x429420 <chpl_user_main+7648>: vmovsd 0x8(%rsi,%rcx,8),%xmm0
0x429426 <chpl_user_main+7654>: vmulsd 0x8(%rdi,%rcx,8),%xmm0,%xmm0
0x42942c <chpl_user_main+7660>: vandpd %xmm5,%xmm0,%xmm1
0x429430 <chpl_user_main+7664>: vcmpltsd %xmm2,%xmm1,%xmm1
0x429435 <chpl_user_main+7669>: vandnpd %xmm0,%xmm1,%xmm0
0x429439 <chpl_user_main+7673>: vmovsd %xmm0,0x8(%rdx,%rcx,8)
0x42943f <chpl_user_main+7679>: vmovq 0x8(%rbx,%rcx,8),%xmm1
=> 0x429445 <chpl_user_main+7685>: vucomisd %xmm3,%xmm1
0x429449 <chpl_user_main+7689>: jae 0x429470 <chpl_user_main+7728>
0x42944b <chpl_user_main+7691>: vucomisd %xmm0,%xmm4
0x42944f <chpl_user_main+7695>: jbe 0x429483 <chpl_user_main+7747>
0x429451 <chpl_user_main+7697>: mov %r14,0x8(%rdx,%rcx,8)
0x429456 <chpl_user_main+7702>: add $0x1,%rcx
0x42945a <chpl_user_main+7706>: cmp %rax,%rcx
0x42945d <chpl_user_main+7709>: jl 0x429420 <chpl_user_main+7648>
0x42945f <chpl_user_main+7711>: jmp 0x42948c <chpl_user_main+7756>
0x429461 <chpl_user_main+7713>:
data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
0x429470 <chpl_user_main+7728>: movq $0x0,0x8(%rdx,%rcx,8)
0x429479 <chpl_user_main+7737>: vxorpd %xmm0,%xmm0,%xmm0
0x42947d <chpl_user_main+7741>: vucomisd %xmm0,%xmm4
0x429481 <chpl_user_main+7745>: ja 0x429451 <chpl_user_main+7697>
0x429483 <chpl_user_main+7747>: add $0x1,%rcx
0x429487 <chpl_user_main+7751>: cmp %rax,%rcx
0x42948a <chpl_user_main+7754>: jl 0x429420 <chpl_user_main+7648>
Here is part of the asm for the clang-included version:
=> 0x4164d0 <chpl_user_main+8032>: vmovupd (%rdi,%rdx,1),%ymm4
0x4164d5 <chpl_user_main+8037>: vmulpd (%rcx,%rdx,1),%ymm4,%ymm4
0x4164da <chpl_user_main+8042>:
vbroadcastsd 0x9c6fd(%rip),%ymm5 # 0x4b2be0
0x4164e3 <chpl_user_main+8051>: vandpd %ymm5,%ymm4,%ymm6
0x4164e7 <chpl_user_main+8055>: vcmpnltpd %ymm8,%ymm6,%ymm6
0x4164ed <chpl_user_main+8061>: vandpd %ymm4,%ymm6,%ymm4
0x4164f1 <chpl_user_main+8065>: vmovupd %ymm4,0x0(%rbp,%rdx,1)
0x4164f7 <chpl_user_main+8071>: vmovupd (%rax,%rdx,1),%ymm6
0x4164fc <chpl_user_main+8076>: vcmplepd %ymm6,%ymm10,%ymm7
0x416501 <chpl_user_main+8081>: vcmpnlepd %ymm6,%ymm10,%ymm6
0x416506 <chpl_user_main+8086>: vandpd %ymm4,%ymm6,%ymm4
0x41650a <chpl_user_main+8090>: vcmpltpd %ymm2,%ymm4,%ymm6
0x41650f <chpl_user_main+8095>: vorpd %ymm7,%ymm6,%ymm6
0x416513 <chpl_user_main+8099>: vmaxpd %ymm4,%ymm2,%ymm4
0x416517 <chpl_user_main+8103>: vmaskmovpd %ymm4,%ymm6,0x0(%rbp,%rdx,1)
0x41651e <chpl_user_main+8110>: vmovupd 0x20(%rdi,%rdx,1),%ymm4
0x416524 <chpl_user_main+8116>: vmulpd 0x20(%rcx,%rdx,1),%ymm4,%ymm4
0x41652a <chpl_user_main+8122>: vandpd %ymm5,%ymm4,%ymm5
0x41652e <chpl_user_main+8126>: vcmpnltpd %ymm8,%ymm5,%ymm5
0x416534 <chpl_user_main+8132>: vandpd %ymm4,%ymm5,%ymm4
0x416538 <chpl_user_main+8136>: vmovupd %ymm4,0x20(%rbp,%rdx,1)
0x41653e <chpl_user_main+8142>: vmovupd 0x20(%rax,%rdx,1),%ymm5
0x416544 <chpl_user_main+8148>: vcmplepd %ymm5,%ymm10,%ymm6
0x416549 <chpl_user_main+8153>: vcmpnlepd %ymm5,%ymm10,%ymm5
0x41654e <chpl_user_main+8158>: vandpd %ymm4,%ymm5,%ymm4
0x416552 <chpl_user_main+8162>: vcmpltpd %ymm2,%ymm4,%ymm5
0x416557 <chpl_user_main+8167>: vorpd %ymm6,%ymm5,%ymm5
0x41655b <chpl_user_main+8171>: vmaxpd %ymm4,%ymm2,%ymm4
0x41655f <chpl_user_main+8175>:
vmaskmovpd %ymm4,%ymm5,0x20(%rbp,%rdx,1)
0x416566 <chpl_user_main+8182>: add $0x8,%rbx
0x41656a <chpl_user_main+8186>: add $0x40,%rdx
0x41656e <chpl_user_main+8190>: add $0x2,%rsi
0x416572 <chpl_user_main+8194>: jne 0x4164d0 <chpl_user_main+8032>
LLVM IR for --llvm vectorizeOnly version
; <label>:791: ; preds = %811, %778
%792 = phi i64 [ %812, %811 ], [ %776, %778 ]
%793 = getelementptr inbounds double, double* %781, i64 %792
%794 = getelementptr inbounds double, double* %784, i64 %792
%795 = getelementptr inbounds double, double* %787, i64 %792
%796 = load double, double* %794, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
%797 = load double, double* %795, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
%798 = fmul contract double %796, %797, !chpl.ast.id !2214
%799 = tail call contract double @llvm.fabs.f64(double %798)
%800 = fcmp contract olt double %799, %695, !chpl.ast.id !2215
%801 = select i1 %800, double 0.000000e+00, double %798
store double %801, double* %793, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193
%802 = getelementptr inbounds double, double* %790, i64 %792
%803 = load double, double* %802, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
%804 = fcmp contract ult double %803, %700, !chpl.ast.id !2216
br i1 %804, label %806, label %805
; <label>:805: ; preds = %791
store double 0.000000e+00, double* %793, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
br label %806
; <label>:806: ; preds = %805, %791
%807 = phi double [ %801, %791 ], [ 0.000000e+00, %805 ]
%808 = fcmp contract olt double %807, %706, !chpl.ast.id !2217
br i1 %808, label %809, label %811
; <label>:809: ; preds = %806
%810 = bitcast double* %793 to i64*
store i64 %698, i64* %810, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
br label %811
; <label>:811: ; preds = %809, %806
%812 = add nsw i64 %792, 1
%813 = icmp slt i64 %792, %775
br i1 %813, label %791, label %814, !llvm.loop !2213
Here's the optimized clang-included version of the same loop:
vector.body1035: ; preds = %vector.body1035, %vector.ph1071.new
%index1074 = phi i64 [ 0, %vector.ph1071.new ], [ %index.next1075.1, %vector.body1035 ]
%niter = phi i64 [ %unroll_iter, %vector.ph1071.new ], [ %niter.nsub.1, %vector.body1035 ]
%402 = add i64 %118, %index1074
%403 = add nsw i64 %402, 1
%404 = getelementptr inbounds double, double* %372, i64 %403
%405 = getelementptr inbounds double, double* %375, i64 %403
%406 = getelementptr inbounds double, double* %378, i64 %403
%407 = bitcast double* %405 to <4 x double>*
%wide.load1083 = load <4 x double>, <4 x double>* %407, align 8, !tbaa !26, !alias.scope !384
%408 = bitcast double* %406 to <4 x double>*
%wide.load1084 = load <4 x double>, <4 x double>* %408, align 8, !tbaa !26, !alias.scope !387
%409 = fmul <4 x double> %wide.load1083, %wide.load1084
%410 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %409)
%411 = fcmp olt <4 x double> %410, %broadcast.splat1086
%412 = select <4 x i1> %411, <4 x double> zeroinitializer, <4 x double> %409
%413 = bitcast double* %404 to <4 x double>*
store <4 x double> %412, <4 x double>* %413, align 8, !tbaa !26, !alias.scope !389, !noalias !391
%414 = getelementptr inbounds double, double* %381, i64 %403
%415 = bitcast double* %414 to <4 x double>*
%wide.load1087 = load <4 x double>, <4 x double>* %415, align 8, !tbaa !26, !alias.scope !393
%416 = fcmp ult <4 x double> %wide.load1087, %broadcast.splat1089
%417 = select <4 x i1> %416, <4 x double> %412, <4 x double> zeroinitializer
%418 = fcmp olt <4 x double> %417, %broadcast.splat1091
%419 = xor <4 x i1> %416, <i1 true, i1 true, i1 true, i1 true>
%420 = or <4 x i1> %418, %419
%421 = select <4 x i1> %418, <4 x double> %broadcast.splat1091, <4 x double> %417
%422 = bitcast double* %404 to <4 x double>*
call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %421, <4 x double>* %422, i32 8, <4 x i1> %420), !tbaa !26, !alias.scope !389, !noalias !391
%index.next1075 = or i64 %index1074, 4
%423 = add i64 %118, %index.next1075
%424 = add nsw i64 %423, 1
Here's the un-optimized clang-included IR
%520 = load double*, double** %519, align 8, !tbaa !177
store double* %520, double** %101, align 8, !tbaa !2
%521 = load double*, double** %101, align 8, !tbaa !2
%522 = load i64, i64* %87, align 8, !tbaa !16
%523 = getelementptr inbounds double, double* %521, i64 %522
store double* %523, double** %102, align 8, !tbaa !2
%524 = load double*, double** %98, align 8, !tbaa !2
%525 = load double, double* %524, align 8, !tbaa !178
%526 = load double*, double** %102, align 8, !tbaa !2
%527 = load double, double* %526, align 8, !tbaa !178
%528 = fmul double %525, %527
%529 = load double*, double** %94, align 8, !tbaa !2
store double %528, double* %529, align 8, !tbaa !178
%530 = load %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %52, align 8, !tbaa !2
%531 = getelementptr inbounds %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %530, i32 0, i32 1
%532 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %531, align 8, !tbaa !171
store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %532, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %103, align 8, !tbaa !2
%533 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %103, align 8, !tbaa !2
store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %533, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %104, align 8, !tbaa !2
%534 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %104, align 8, !tbaa !2
%535 = getelementptr inbounds %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %534, i32 0, i32 8
%536 = load double*, double** %535, align 8, !tbaa !177
store double* %536, double** %105, align 8, !tbaa !2
%537 = load double*, double** %105, align 8, !tbaa !2
%538 = load i64, i64* %87, align 8, !tbaa !16
%539 = getelementptr inbounds double, double* %537, i64 %538
store double* %539, double** %106, align 8, !tbaa !2
%540 = load double*, double** %106, align 8, !tbaa !2
%541 = load double, double* %540, align 8, !tbaa !178
%542 = call double @llvm.fabs.f64(double %541)
store double %542, double* %107, align 8, !tbaa !178
%543 = load double, double* %107, align 8, !tbaa !178
%544 = load double, double* %65, align 8, !tbaa !178
%545 = fcmp olt double %543, %544
%546 = zext i1 %545 to i8
store i8 %546, i8* %108, align 1, !tbaa !141
%547 = load i8, i8* %108, align 1, !tbaa !141, !range !161
%548 = trunc i8 %547 to i1
br i1 %548, label %549, label %561
; <label>:549: ; preds = %493
%550 = load %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %52, align 8, !tbaa !2
%551 = getelementptr inbounds %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %550, i32 0, i32 1
%552 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %551, align 8, !tbaa !171
store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %552, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %109, align 8, !tbaa !2
%553 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %109, align 8, !tbaa !2
store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %553, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %110, align 8, !tbaa !2
%554 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %110, align 8, !tbaa !2
%555 = getelementptr inbounds %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %554, i32 0, i32 8
%556 = load double*, double** %555, align 8, !tbaa !177
store double* %556, double** %111, align 8, !tbaa !2
%557 = load double*, double** %111, align 8, !tbaa !2
%558 = load i64, i64* %87, align 8, !tbaa !16
%559 = getelementptr inbounds double, double* %557, i64 %558
store double* %559, double** %112, align 8, !tbaa !2
%560 = load double*, double** %112, align 8, !tbaa !2
store double 0.000000e+00, double* %560, align 8, !tbaa !178
br label %561
But that's hard to read so here's the -O1 version
; <label>:99: ; preds = %116, %.lr.ph
%.0361371 = phi i64 [ %.sroa.0158.0.copyload, %.lr.ph ], [ %117, %116 ]
%100 = getelementptr inbounds double, double* %89, i64 %.0361371
%101 = getelementptr inbounds double, double* %92, i64 %.0361371
%102 = getelementptr inbounds double, double* %95, i64 %.0361371
%103 = load double, double* %101, align 8, !tbaa !152
%104 = load double, double* %102, align 8, !tbaa !152
%105 = fmul double %103, %104
%106 = tail call double @llvm.fabs.f64(double %105)
%107 = fcmp olt double %106, %70
%storemerge = select i1 %107, double 0.000000e+00, double %105
store double %storemerge, double* %100, align 8, !tbaa !152
%108 = getelementptr inbounds double, double* %98, i64 %.0361371
%109 = load double, double* %108, align 8, !tbaa !152
%110 = fcmp ult double %109, %74
br i1 %110, label %112, label %111
; <label>:111: ; preds = %99
store double 0.000000e+00, double* %100, align 8, !tbaa !152
br label %112
; <label>:112: ; preds = %99, %111
%113 = load double, double* %100, align 8, !tbaa !152
%114 = fcmp olt double %113, %72
br i1 %114, label %115, label %116
; <label>:115: ; preds = %112
store double %72, double* %100, align 8, !tbaa !152
br label %116
; <label>:116: ; preds = %112, %115
%117 = add nsw i64 %.0361371, 1
%118 = icmp slt i64 %.0361371, %.sroa.5.0.copyload
br i1 %118, label %99, label %._crit_edge
._crit_edge: ; preds = %116, %85
%119 = add nuw nsw i64 %.0362372, 1
%120 = icmp slt i64 %.0362372, %.0363
br i1 %120, label %85, label %._crit_edge375
It looks like much of the issue is coming from an llvm.invariant.start which ends up (somehow) causing this code to be generated:
%810 = bitcast double* %793 to i64*
store i64 %698, i64* %810, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
Removing the llvm.invariant.start related to that pointer or replacing that bitcast/store with a store of 0 allow the loop to vectorize.
This is reported as an LLVM bug here: https://bugs.llvm.org/show_bug.cgi?id=39852
planckian: Here --llvm and clang-included are slower than GCC, when --no-ieee-float is provided. Here is a performance reproducer:
use Time;
// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 5000000; // "short"
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 171; // "short"
proc main() {
var ltimer:Timer;
var RealArray_1D: [0..10][0..#len] real;
// The stuff from LCALSLoops.chpl loopInit
for i in 1..5 do
initData(RealArray_1D[i-1], i);
// This portion from RunBRawLoops.chpl runBRawLoops e.g.
ref x = RealArray_1D[0];
ref y = RealArray_1D[1];
ref u = RealArray_1D[2];
ref v = RealArray_1D[3];
ref w = RealArray_1D[4];
var expmax = 20.0;
u[len-1] = 0.99 * expmax*v[len-1];
ltimer.start();
for 0..#num_samples {
//for k in vectorizeOnly(0..#len) {
for k in 0..#len {
y[k] = u[k] / v[k];
w[k] = x[k] / (exp(y[k]) - 1.0);
}
}
ltimer.stop();
var chksum: real = 0.0;
// Get this portion from LCALSLoops.chpl loopFinalize
updateChksum(chksum, len, RealArray_1D[4]);
// Get the Checksum from LCALSChecksums.chpl
var expect = 391727437.25186200960888527333736;
writeln("Got checksum ", chksum, " expected ", expect);
writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}
proc initData(ra: [] real, id: int) {
const factor: real = if id % 2 != 0 then 0.1 else 0.2;
for (r,j) in zip(ra, 0..) {
r = factor*(j + 1.1)/(j + 1.12345);
}
}
proc updateChksum(ref chksum:real, length: int,
ra: [] real, scale_factor: real = 1.0) {
use LongDouble;
ref data = ra;
var len = ra.numElements;
var tchk: longdouble = chksum;
for (j, dat) in zip(0..#len, data) {
tchk += (j+1)*dat*scale_factor;
}
chksum = tchk:real;
}
It's important to compile this with --no-ieee-float to see the difference. Here is the GCC assembly:
0x417b68 <chpl_user_main+8040>: mov -0x828(%rbp),%rax
0x417b6f <chpl_user_main+8047>: vmovupd 0x0(%r13,%rbx,1),%ymm6
0x417b76 <chpl_user_main+8054>: vdivpd (%r14,%rbx,1),%ymm6,%ymm0
0x417b7c <chpl_user_main+8060>: vmovupd %ymm0,(%rax,%rbx,1)
0x417b81 <chpl_user_main+8065>: vmovupd (%r15,%rbx,1),%ymm1
0x417b87 <chpl_user_main+8071>: vmovapd %ymm1,-0x810(%rbp)
0x417b8f <chpl_user_main+8079>: callq 0x4d78c0 <_ZGVdN4v___exp_finite>
0x417b94 <chpl_user_main+8084>: vaddpd 0xc4304(%rip),%ymm0,%ymm0 # 0x4dbea0
0x417b9c <chpl_user_main+8092>: vmovapd -0x810(%rbp),%ymm1
0x417ba4 <chpl_user_main+8100>: vdivpd %ymm0,%ymm1,%ymm1
=> 0x417ba8 <chpl_user_main+8104>: vmovupd %ymm1,(%r12,%rbx,1)
0x417bae <chpl_user_main+8110>: add $0x20,%rbx
0x417bb2 <chpl_user_main+8114>: cmp %rbx,-0x830(%rbp)
0x417bb9 <chpl_user_main+8121>: jne 0x417b68 <chpl_user_main+8040>
Vs the clang-included version:
0x415f90 <chpl_user_main+6688>: callq 0x404a20 <exp@plt>
0x415f95 <chpl_user_main+6693>: vmovapd %xmm0,0x70(%rsp)
0x415f9b <chpl_user_main+6699>: vpermilpd $0x1,0xe0(%rsp),%xmm0
0x415fa6 <chpl_user_main+6710>: callq 0x404a20 <exp@plt>
0x415fab <chpl_user_main+6715>: vmovapd 0x70(%rsp),%xmm1
0x415fb1 <chpl_user_main+6721>: vunpcklpd %xmm0,%xmm1,%xmm0
0x415fb5 <chpl_user_main+6725>: vmovapd %xmm0,0x70(%rsp)
0x415fbb <chpl_user_main+6731>: vmovups 0x80(%rsp),%ymm0
0x415fc4 <chpl_user_main+6740>: vzeroupper
0x415fc7 <chpl_user_main+6743>: callq 0x404a20 <exp@plt>
=> 0x415fcc <chpl_user_main+6748>: vmovaps %xmm0,0xe0(%rsp)
0x415fd5 <chpl_user_main+6757>: vpermilpd $0x1,0x80(%rsp),%xmm0
0x415fe0 <chpl_user_main+6768>: callq 0x404a20 <exp@plt>
0x415fe5 <chpl_user_main+6773>: vmovapd 0xe0(%rsp),%xmm1
0x415fee <chpl_user_main+6782>: vunpcklpd %xmm0,%xmm1,%xmm0
0x415ff2 <chpl_user_main+6786>: vinsertf128 $0x1,0x70(%rsp),%ymm0,%ymm0
0x415ffa <chpl_user_main+6794>: vaddpd 0x320(%rsp),%ymm0,%ymm0
0x416003 <chpl_user_main+6803>: vmovupd 0x160(%rsp),%ymm1
0x41600c <chpl_user_main+6812>: vdivpd %ymm0,%ymm1,%ymm0
0x416010 <chpl_user_main+6816>: mov 0x140(%rsp),%rax
0x416018 <chpl_user_main+6824>: vmovupd %ymm0,(%rax,%rbx,8)
0x41601d <chpl_user_main+6829>: or $0x20,%r14
Or this version from --llvm
0x425350 <chpl_user_main+6544>: vmovsd 0x8(%r13,%r14,8),%xmm0
0x425357 <chpl_user_main+6551>: vdivsd 0x8(%r15,%r14,8),%xmm0,%xmm0
0x42535e <chpl_user_main+6558>: vmovsd %xmm0,0x8(%rbx,%r14,8)
0x425365 <chpl_user_main+6565>: vmovsd 0x8(%r12,%r14,8),%xmm1
0x42536c <chpl_user_main+6572>: vmovsd %xmm1,0x80(%rsp)
0x425375 <chpl_user_main+6581>: callq 0x404730 <__exp_finite@plt>
0x42537a <chpl_user_main+6586>: vmovsd 0x8d3f6(%rip),%xmm1 # 0x4b2778
0x425382 <chpl_user_main+6594>: mov 0xe0(%rsp),%rax
0x42538a <chpl_user_main+6602>: vaddsd %xmm1,%xmm0,%xmm0
0x42538e <chpl_user_main+6606>: vmovsd 0x80(%rsp),%xmm1
0x425397 <chpl_user_main+6615>: vdivsd %xmm0,%xmm1,%xmm0
=> 0x42539b <chpl_user_main+6619>: vmovsd %xmm0,0x8(%rbp,%r14,8)
0x4253a2 <chpl_user_main+6626>: add $0x1,%r14
0x4253a6 <chpl_user_main+6630>: cmp %rax,%r14
0x4253a9 <chpl_user_main+6633>: jl 0x425350 <chpl_user_main+6544>
I believe the problem here is that it's not calling a vectorized version of exp.
I've created #11772 to summarize the results of this spike.
LCALS performance differs for --llvm especially for the raw,short configuration, but to some degree for other benchmarks.
https://chapel-lang.org/perf/chapcs/llvm/?startdate=2018/09/12&graphs=lcalsrawshort,lcalsrawmedium,lcalsrawlong,lcalsrawompshort,lcalsrawompmedium,lcalsrawomplong
For example,
This task is a spike to investigate the cause of these differences.