chapel-lang / chapel

a Productive Parallel Programming Language
https://chapel-lang.org

spike: investigate LLVM performance difference for LCALS #11159

Closed mppf closed 5 years ago

mppf commented 6 years ago

LCALS performance differs for --llvm, especially for the raw, short configuration, but to some degree for the other benchmarks as well.

https://chapel-lang.org/perf/chapcs/llvm/?startdate=2018/09/12&graphs=lcalsrawshort,lcalsrawmedium,lcalsrawlong,lcalsrawompshort,lcalsrawompmedium,lcalsrawomplong

For example:

test            C        llvm
muladdsub       1.23 s   3.36 s
planckian       0.96 s   3.12 s
init3           0.74 s   1.8 s
trap_int        0.77 s   1.38 s
imp_hydro_2d    0.65 s   0.81 s
couple          1.16 s   1.32 s
pic_1d          1.25 s   1.3 s
pressure_calc   1.19 s   1.27 s

This task is a spike to investigate the cause of these differences.

mppf commented 5 years ago

Looking at muladdsub. First, here is a small performance reproducer:

use Time;

// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 50000000; // "short"
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 171; // "short"

proc main() {
  var ltimer:Timer;
  var RealArray_1D: [0..10][0..#len] real;

  // The stuff from LCALSLoops.chpl loopInit
  for i in 1..5 do
    initData(RealArray_1D[i-1], i);

  // This portion from RunBRawLoops.chpl runBRawLoops e.g.
  ref out1 = RealArray_1D[0];
  ref out2 = RealArray_1D[1];
  ref out3 = RealArray_1D[2];
  ref in1  = RealArray_1D[3];
  ref in2  = RealArray_1D[4];
  ltimer.start();
  for 0..#num_samples {
    for i in 0..#len {
      out1[i] = in1[i] * in2[i];
      out2[i] = in1[i] + in2[i];
      out3[i] = in1[i] - in2[i];
    }
  }
  ltimer.stop();

  var chksum: real = 0.0;
  // Get this portion from LCALSLoops.chpl loopFinalize 
  updateChksum(chksum, len, RealArray_1D[0]);
  updateChksum(chksum, len, RealArray_1D[1]);
  updateChksum(chksum, len, RealArray_1D[2]);

  // Get the Checksum from LCALSChecksums.chpl
  var expect = 391727437.25186200960888527333736;
  writeln("Got checksum ", chksum, " expected ", expect);
  writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}

proc initData(ra: [] real, id: int) {
  const factor: real = if id % 2 != 0 then 0.1 else 0.2;
  for (r,j) in zip(ra, 0..) {
    r = factor*(j + 1.1)/(j + 1.12345);
  }
}

proc updateChksum(ref chksum:real, length: int, 
                  ra: [] real, scale_factor: real = 1.0) {
  use LongDouble;

  ref data = ra;
  var len = ra.numElements;

  var tchk: longdouble = chksum;
  for (j, dat) in zip(0..#len, data) {
    tchk += (j+1)*dat*scale_factor;
  }
  chksum = tchk:real;
}

Next, I ran the (optimized) C version in gdb and hit control-C in the middle of execution. This loop is apparently the kernel:

   0x423108 <chpl_user_main+6184>:  vmovupd (%r11,%rdx,1),%ymm0
   0x42310e <chpl_user_main+6190>:  vmulpd (%rbx,%rdx,1),%ymm0,%ymm0
   0x423113 <chpl_user_main+6195>:  add    $0x1,%r10
   0x423117 <chpl_user_main+6199>:  vmovupd %ymm0,(%r15,%rdx,1)
=> 0x42311d <chpl_user_main+6205>:  vmovupd (%r11,%rdx,1),%ymm0
   0x423123 <chpl_user_main+6211>:  vaddpd (%rbx,%rdx,1),%ymm0,%ymm0
   0x423128 <chpl_user_main+6216>:  vmovupd %ymm0,(%r14,%rdx,1)
   0x42312e <chpl_user_main+6222>:  vmovapd (%rbx,%rdx,1),%ymm0
   0x423133 <chpl_user_main+6227>:  vsubpd (%r11,%rdx,1),%ymm0,%ymm0
   0x423139 <chpl_user_main+6233>:  vmovupd %ymm0,(%r9,%rdx,1)
   0x42313f <chpl_user_main+6239>:  add    $0x20,%rdx
   0x423143 <chpl_user_main+6243>:  cmp    %r10,%r13
   0x423146 <chpl_user_main+6246>:  ja     0x423108 <chpl_user_main+6184>

Here is the kernel in the optimized LLVM version:

   0x428e00 <chpl_user_main+6400>:  vmovsd 0x8(%rcx,%rbp,8),%xmm0
   0x428e06 <chpl_user_main+6406>:  vmulsd 0x8(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e0c <chpl_user_main+6412>:  vmovsd %xmm0,0x8(%rax,%rbp,8)
   0x428e12 <chpl_user_main+6418>:  vmovsd 0x8(%rcx,%rbp,8),%xmm0
   0x428e18 <chpl_user_main+6424>:  vaddsd 0x8(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e1e <chpl_user_main+6430>:  vmovsd %xmm0,0x8(%rsi,%rbp,8)
=> 0x428e24 <chpl_user_main+6436>:  vmovsd 0x8(%rcx,%rbp,8),%xmm0
   0x428e2a <chpl_user_main+6442>:  vsubsd 0x8(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e30 <chpl_user_main+6448>:  vmovsd %xmm0,0x8(%rdi,%rbp,8)
   0x428e36 <chpl_user_main+6454>:  vmovsd 0x10(%rcx,%rbp,8),%xmm0
   0x428e3c <chpl_user_main+6460>:  vmulsd 0x10(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e42 <chpl_user_main+6466>:  vmovsd %xmm0,0x10(%rax,%rbp,8)
   0x428e48 <chpl_user_main+6472>:  vmovsd 0x10(%rcx,%rbp,8),%xmm0
   0x428e4e <chpl_user_main+6478>:  vaddsd 0x10(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e54 <chpl_user_main+6484>:  vmovsd %xmm0,0x10(%rsi,%rbp,8)
   0x428e5a <chpl_user_main+6490>:  vmovsd 0x10(%rcx,%rbp,8),%xmm0
   0x428e60 <chpl_user_main+6496>:  vsubsd 0x10(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e66 <chpl_user_main+6502>:  vmovsd %xmm0,0x10(%rdi,%rbp,8)
   0x428e6c <chpl_user_main+6508>:  add    $0x2,%rbp
   0x428e70 <chpl_user_main+6512>:  cmp    %r14,%rbp
   0x428e73 <chpl_user_main+6515>:  jl     0x428e00 <chpl_user_main+6400>

This benchmark has equivalent performance with clang-included and with --llvm, so it's not necessarily the case that we're simply encoding the LLVM IR wrong. It seems that LLVM is not vectorizing.

If I adjust the program to run with vectorizeOnly, the LLVM version becomes faster than the GCC one.
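
For reference, the vectorizeOnly adjustment is just a change to the inner loop of the reproducer above; the later reproducers in this issue include the same change as a commented-out line. A minimal sketch of that adjustment:

  for 0..#num_samples {
    // vectorizeOnly marks the iterations as order-independent (emitting
    // vectorization hints without creating tasks), sidestepping the
    // missing aliasing information for the row references.
    for i in vectorizeOnly(0..#len) {
      out1[i] = in1[i] * in2[i];
      out2[i] = in1[i] + in2[i];
      out3[i] = in1[i] - in2[i];
    }
  }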

Possible remedy: extend no-alias metadata for the sub-array case.
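
To make the "sub-array case" concrete, here is a minimal sketch (my framing, not from the original report) of the two patterns in play:

  // Sub-array case: these refs are all views into rows of the same 2-D
  // array, and presumably no noalias metadata relates them, so LLVM must
  // assume the stores through out1 may overlap the loads through in1/in2.
  var RealArray_1D: [0..10][0..#len] real;
  ref out1 = RealArray_1D[0];
  ref in1  = RealArray_1D[3];
  ref in2  = RealArray_1D[4];

  // Distinct-array case: separately declared arrays, as in the next
  // comment's reproducer, evidently carry enough aliasing information
  // for LLVM to vectorize the same kernel.
  var out1b: [0..#len] real;
  var in1b:  [0..#len] real;
  var in2b:  [0..#len] real;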

mppf commented 5 years ago

The following version of the benchmark shows LLVM can be faster:

use Time;

// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 50000000; // "short"
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 171; // "short"

proc main() {
  var ltimer:Timer;
  var RealArray_1D0: [0..#len] real;
  var RealArray_1D1: [0..#len] real;
  var RealArray_1D2: [0..#len] real;
  var RealArray_1D3: [0..#len] real;
  var RealArray_1D4: [0..#len] real;
  // The stuff from LCALSLoops.chpl loopInit
  initData(RealArray_1D0, 1);
  initData(RealArray_1D1, 2);
  initData(RealArray_1D2, 3);
  initData(RealArray_1D3, 4);
  initData(RealArray_1D4, 5);

  // This portion from RunBRawLoops.chpl runBRawLoops e.g.
  ltimer.start();
  for 0..#num_samples {
    //for i in vectorizeOnly(0..#len) {
    for i in 0..#len {
      RealArray_1D0[i] = RealArray_1D3[i] * RealArray_1D4[i];
      RealArray_1D1[i] = RealArray_1D3[i] + RealArray_1D4[i];
      RealArray_1D2[i] = RealArray_1D3[i] - RealArray_1D4[i];
    }
  }
  ltimer.stop();

  var chksum: real = 0.0;
  // Get this portion from LCALSLoops.chpl loopFinalize 
  updateChksum(chksum, len, RealArray_1D0);
  updateChksum(chksum, len, RealArray_1D1);
  updateChksum(chksum, len, RealArray_1D2);

  // Get the Checksum from LCALSChecksums.chpl
  var expect = 391727437.25186200960888527333736;
  writeln("Got checksum ", chksum, " expected ", expect);
  writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}

proc initData(ra: [] real, id: int) {
  const factor: real = if id % 2 != 0 then 0.1 else 0.2;
  for (r,j) in zip(ra, 0..) {
    r = factor*(j + 1.1)/(j + 1.12345);
  }
}

proc updateChksum(ref chksum:real, length: int, 
                  ra: [] real, scale_factor: real = 1.0) {
  use LongDouble;

  ref data = ra;
  var len = ra.numElements;

  var tchk: longdouble = chksum;
  for (j, dat) in zip(0..#len, data) {
    tchk += (j+1)*dat*scale_factor;
  }
  chksum = tchk:real;
}

Here LLVM is about 2x faster than GCC (due to #11324).

mppf commented 5 years ago

pressure_calc: the serial LLVM version is much slower than the C version, whether the latter is compiled with gcc or with clang-included. With vectorizeOnly, the LLVM version is still slower than the C version (from either).

Here is a performance reproducer:

use Time;

// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 30000;
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 44217;

proc main() {
  var ltimer:Timer;
  var RealArray_1D: [0..10][0..#len] real;
  var RealArray_scalars: [0..#10] real;

  // The stuff from LCALSLoops.chpl loopInit
  for i in 1..5 do
    initData(RealArray_1D[i-1], i);
  initData(RealArray_scalars, 1);

  // This portion from RunBRawLoops.chpl runBRawLoops e.g.
  ref compression = RealArray_1D[0];
  ref bvc = RealArray_1D[1];
  ref p_new = RealArray_1D[2];
  ref e_old = RealArray_1D[3];
  ref vnewc = RealArray_1D[4];

  const cls = RealArray_scalars[0];
  const p_cut = RealArray_scalars[1];
  const pmin = RealArray_scalars[2];
  const eosvmax = RealArray_scalars[3];
  ltimer.start();
  for 0..#num_samples {
    //for i in vectorizeOnly(0..#len) {
    for i in 0..#len {
      bvc[i] = cls * (compression[i] + 1.0);
    }
    //for i in vectorizeOnly(0..#len) {
    for i in 0..#len {
      p_new[i] = bvc[i] * e_old[i];
      if ( abs(p_new[i]) <  p_cut ) then p_new[i] = 0.0;
      if ( vnewc[i] >= eosvmax ) then p_new[i] = 0.0;
      if ( p_new[i]  <  pmin ) then p_new[i] = pmin;
    }
  }
  ltimer.stop();

  var chksum: real = 0.0;
  // Get this portion from LCALSLoops.chpl loopFinalize 
  updateChksum(chksum, len, RealArray_1D[2]);

  // Get the Checksum from LCALSChecksums.chpl
  var expect = 97197939.191977054695598781108856;
  writeln("Got checksum ", chksum, " expected ", expect);
  writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}

proc initData(ra: [] real, id: int) {
  const factor: real = if id % 2 != 0 then 0.1 else 0.2;
  for (r,j) in zip(ra, 0..) {
    r = factor*(j + 1.1)/(j + 1.12345);
  }
}

proc updateChksum(ref chksum:real, length: int, 
                  ra: [] real, scale_factor: real = 1.0) {
  use LongDouble;

  ref data = ra;
  var len = ra.numElements;

  var tchk: longdouble = chksum;
  for (j, dat) in zip(0..#len, data) {
    tchk += (j+1)*dat*scale_factor;
  }
  chksum = tchk:real;
}

This benchmark has two loops. Only the one setting p_new shows the performance difference.
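
One way to confirm which loop matters (a measurement sketch, not code from the original issue; it slots into the reproducer above, which already has use Time) is to give each loop its own Timer:

  var t1, t2: Timer;
  for 0..#num_samples {
    t1.start();
    for i in 0..#len do
      bvc[i] = cls * (compression[i] + 1.0);
    t1.stop();

    t2.start();
    for i in 0..#len {
      p_new[i] = bvc[i] * e_old[i];
      if ( abs(p_new[i]) <  p_cut ) then p_new[i] = 0.0;
      if ( vnewc[i] >= eosvmax ) then p_new[i] = 0.0;
      if ( p_new[i]  <  pmin ) then p_new[i] = pmin;
    }
    t2.stop();
  }
  // Timer accumulates across start/stop pairs, so these totals cover all
  // num_samples iterations of each loop.
  writeln("loop 1 total: ", t1.elapsed(), " s, loop 2 total: ", t2.elapsed(), " s");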

Here is the asm for the gcc version of the inner loop:

   0x423138 <chpl_user_main+6088>:  vmovupd (%r15,%rdx,1),%ymm0
   0x42313e <chpl_user_main+6094>:  vmulpd (%r14,%rdx,1),%ymm0,%ymm1
=> 0x423144 <chpl_user_main+6100>:  add    $0x1,%r9
   0x423148 <chpl_user_main+6104>:  vandpd %ymm8,%ymm1,%ymm0
   0x42314d <chpl_user_main+6109>:  vcmpltpd %ymm11,%ymm0,%ymm0
   0x423153 <chpl_user_main+6115>:  vandnpd %ymm1,%ymm0,%ymm0
   0x423157 <chpl_user_main+6119>:  vmovapd %ymm0,(%rbx,%rdx,1)
   0x42315c <chpl_user_main+6124>:  vcmplepd (%rcx,%rdx,1),%ymm10,%ymm1
   0x423162 <chpl_user_main+6130>:  vandnpd %ymm0,%ymm1,%ymm0
   0x423166 <chpl_user_main+6134>:  vmaxpd %ymm0,%ymm9,%ymm0
   0x42316a <chpl_user_main+6138>:  vmovapd %ymm0,(%rbx,%rdx,1)
   0x42316f <chpl_user_main+6143>:  add    $0x20,%rdx
   0x423173 <chpl_user_main+6147>:  cmp    %r9,%r11
   0x423176 <chpl_user_main+6150>:  ja     0x423138 <chpl_user_main+6088>

Here is the asm for the LLVM version of the inner loop:

   0x4293b0 <chpl_user_main+7616>:  vmovsd 0x8(%rcx,%rdi,8),%xmm1
   0x4293b6 <chpl_user_main+7622>:  vmulsd 0x8(%rdx,%rdi,8),%xmm1,%xmm1
   0x4293bc <chpl_user_main+7628>:  vandpd %xmm0,%xmm1,%xmm2
   0x4293c0 <chpl_user_main+7632>:  vcmpltsd %xmm3,%xmm2,%xmm2
   0x4293c5 <chpl_user_main+7637>:  vandnpd %xmm1,%xmm2,%xmm1
   0x4293c9 <chpl_user_main+7641>:  vmovsd %xmm1,0x8(%rax,%rdi,8)
   0x4293cf <chpl_user_main+7647>:  vmovsd 0x8(%rsi,%rdi,8),%xmm2
   0x4293d5 <chpl_user_main+7653>:  vucomisd %xmm4,%xmm2
   0x4293d9 <chpl_user_main+7657>:  jb     0x4293e8 <chpl_user_main+7672>
   0x4293db <chpl_user_main+7659>:  movq   $0x0,0x8(%rax,%rdi,8)
   0x4293e4 <chpl_user_main+7668>:  vxorpd %xmm1,%xmm1,%xmm1
   0x4293e8 <chpl_user_main+7672>:  vucomisd %xmm1,%xmm5
   0x4293ec <chpl_user_main+7676>:  jbe    0x4293f3 <chpl_user_main+7683>
   0x4293ee <chpl_user_main+7678>:  mov    %rbx,0x8(%rax,%rdi,8)
   0x4293f3 <chpl_user_main+7683>:  vmovsd 0x10(%rcx,%rdi,8),%xmm1
   0x4293f9 <chpl_user_main+7689>:  vmulsd 0x10(%rdx,%rdi,8),%xmm1,%xmm1
   0x4293ff <chpl_user_main+7695>:  vandpd %xmm0,%xmm1,%xmm2
   0x429403 <chpl_user_main+7699>:  vcmpltsd %xmm3,%xmm2,%xmm2
=> 0x429408 <chpl_user_main+7704>:  vandnpd %xmm1,%xmm2,%xmm1
   0x42940c <chpl_user_main+7708>:  vmovsd %xmm1,0x10(%rax,%rdi,8)
   0x429412 <chpl_user_main+7714>:  vmovsd 0x10(%rsi,%rdi,8),%xmm2
   0x429418 <chpl_user_main+7720>:  vucomisd %xmm4,%xmm2
   0x42941c <chpl_user_main+7724>:  jae    0x429440 <chpl_user_main+7760>
   0x42941e <chpl_user_main+7726>:  vucomisd %xmm1,%xmm5
   0x429422 <chpl_user_main+7730>:  jbe    0x429453 <chpl_user_main+7779>
   0x429424 <chpl_user_main+7732>:  mov    %rbx,0x10(%rax,%rdi,8)
   0x429429 <chpl_user_main+7737>:  add    $0x2,%rdi
   0x42942d <chpl_user_main+7741>:  cmp    %r12,%rdi
   0x429430 <chpl_user_main+7744>:  jl     0x4293b0 <chpl_user_main+7616>
   0x429436 <chpl_user_main+7750>:  jmp    0x429460 <chpl_user_main+7792>
   0x429438 <chpl_user_main+7752>:  nopl   0x0(%rax,%rax,1)
   0x429440 <chpl_user_main+7760>:  movq   $0x0,0x10(%rax,%rdi,8)
   0x429449 <chpl_user_main+7769>:  vxorpd %xmm1,%xmm1,%xmm1
   0x42944d <chpl_user_main+7773>:  vucomisd %xmm1,%xmm5
   0x429451 <chpl_user_main+7777>:  ja     0x429424 <chpl_user_main+7732>
   0x429453 <chpl_user_main+7779>:  add    $0x2,%rdi
   0x429457 <chpl_user_main+7783>:  cmp    %r12,%rdi
   0x42945a <chpl_user_main+7786>:  jl     0x4293b0 <chpl_user_main+7616>

Here again vectorization is not occurring.

Here is the LLVM vectorizeOnly version:

   0x429420 <chpl_user_main+7648>:  vmovsd 0x8(%rsi,%rcx,8),%xmm0
   0x429426 <chpl_user_main+7654>:  vmulsd 0x8(%rdi,%rcx,8),%xmm0,%xmm0
   0x42942c <chpl_user_main+7660>:  vandpd %xmm5,%xmm0,%xmm1
   0x429430 <chpl_user_main+7664>:  vcmpltsd %xmm2,%xmm1,%xmm1
   0x429435 <chpl_user_main+7669>:  vandnpd %xmm0,%xmm1,%xmm0
   0x429439 <chpl_user_main+7673>:  vmovsd %xmm0,0x8(%rdx,%rcx,8)
   0x42943f <chpl_user_main+7679>:  vmovq  0x8(%rbx,%rcx,8),%xmm1
=> 0x429445 <chpl_user_main+7685>:  vucomisd %xmm3,%xmm1
   0x429449 <chpl_user_main+7689>:  jae    0x429470 <chpl_user_main+7728>
   0x42944b <chpl_user_main+7691>:  vucomisd %xmm0,%xmm4
   0x42944f <chpl_user_main+7695>:  jbe    0x429483 <chpl_user_main+7747>
   0x429451 <chpl_user_main+7697>:  mov    %r14,0x8(%rdx,%rcx,8)
   0x429456 <chpl_user_main+7702>:  add    $0x1,%rcx
   0x42945a <chpl_user_main+7706>:  cmp    %rax,%rcx
   0x42945d <chpl_user_main+7709>:  jl     0x429420 <chpl_user_main+7648>
   0x42945f <chpl_user_main+7711>:  jmp    0x42948c <chpl_user_main+7756>
   0x429461 <chpl_user_main+7713>:  data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
   0x429470 <chpl_user_main+7728>:  movq   $0x0,0x8(%rdx,%rcx,8)
   0x429479 <chpl_user_main+7737>:  vxorpd %xmm0,%xmm0,%xmm0
   0x42947d <chpl_user_main+7741>:  vucomisd %xmm0,%xmm4
   0x429481 <chpl_user_main+7745>:  ja     0x429451 <chpl_user_main+7697>
   0x429483 <chpl_user_main+7747>:  add    $0x1,%rcx
   0x429487 <chpl_user_main+7751>:  cmp    %rax,%rcx
   0x42948a <chpl_user_main+7754>:  jl     0x429420 <chpl_user_main+7648>

Here is part of the asm for the clang-included version:

=> 0x4164d0 <chpl_user_main+8032>:  vmovupd (%rdi,%rdx,1),%ymm4
   0x4164d5 <chpl_user_main+8037>:  vmulpd (%rcx,%rdx,1),%ymm4,%ymm4
   0x4164da <chpl_user_main+8042>:  vbroadcastsd 0x9c6fd(%rip),%ymm5        # 0x4b2be0
   0x4164e3 <chpl_user_main+8051>:  vandpd %ymm5,%ymm4,%ymm6
   0x4164e7 <chpl_user_main+8055>:  vcmpnltpd %ymm8,%ymm6,%ymm6
   0x4164ed <chpl_user_main+8061>:  vandpd %ymm4,%ymm6,%ymm4
   0x4164f1 <chpl_user_main+8065>:  vmovupd %ymm4,0x0(%rbp,%rdx,1)
   0x4164f7 <chpl_user_main+8071>:  vmovupd (%rax,%rdx,1),%ymm6
   0x4164fc <chpl_user_main+8076>:  vcmplepd %ymm6,%ymm10,%ymm7
   0x416501 <chpl_user_main+8081>:  vcmpnlepd %ymm6,%ymm10,%ymm6
   0x416506 <chpl_user_main+8086>:  vandpd %ymm4,%ymm6,%ymm4
   0x41650a <chpl_user_main+8090>:  vcmpltpd %ymm2,%ymm4,%ymm6
   0x41650f <chpl_user_main+8095>:  vorpd  %ymm7,%ymm6,%ymm6
   0x416513 <chpl_user_main+8099>:  vmaxpd %ymm4,%ymm2,%ymm4
   0x416517 <chpl_user_main+8103>:  vmaskmovpd %ymm4,%ymm6,0x0(%rbp,%rdx,1)
   0x41651e <chpl_user_main+8110>:  vmovupd 0x20(%rdi,%rdx,1),%ymm4
   0x416524 <chpl_user_main+8116>:  vmulpd 0x20(%rcx,%rdx,1),%ymm4,%ymm4
   0x41652a <chpl_user_main+8122>:  vandpd %ymm5,%ymm4,%ymm5
   0x41652e <chpl_user_main+8126>:  vcmpnltpd %ymm8,%ymm5,%ymm5
   0x416534 <chpl_user_main+8132>:  vandpd %ymm4,%ymm5,%ymm4
   0x416538 <chpl_user_main+8136>:  vmovupd %ymm4,0x20(%rbp,%rdx,1)
   0x41653e <chpl_user_main+8142>:  vmovupd 0x20(%rax,%rdx,1),%ymm5
   0x416544 <chpl_user_main+8148>:  vcmplepd %ymm5,%ymm10,%ymm6
   0x416549 <chpl_user_main+8153>:  vcmpnlepd %ymm5,%ymm10,%ymm5
   0x41654e <chpl_user_main+8158>:  vandpd %ymm4,%ymm5,%ymm4
   0x416552 <chpl_user_main+8162>:  vcmpltpd %ymm2,%ymm4,%ymm5
   0x416557 <chpl_user_main+8167>:  vorpd  %ymm6,%ymm5,%ymm5
   0x41655b <chpl_user_main+8171>:  vmaxpd %ymm4,%ymm2,%ymm4
   0x41655f <chpl_user_main+8175>:  vmaskmovpd %ymm4,%ymm5,0x20(%rbp,%rdx,1)
   0x416566 <chpl_user_main+8182>:  add    $0x8,%rbx
   0x41656a <chpl_user_main+8186>:  add    $0x40,%rdx
   0x41656e <chpl_user_main+8190>:  add    $0x2,%rsi
   0x416572 <chpl_user_main+8194>:  jne    0x4164d0 <chpl_user_main+8032>

Here is the LLVM IR for the --llvm vectorizeOnly version:

; <label>:791:                                    ; preds = %811, %778
  %792 = phi i64 [ %812, %811 ], [ %776, %778 ]
  %793 = getelementptr inbounds double, double* %781, i64 %792
  %794 = getelementptr inbounds double, double* %784, i64 %792
  %795 = getelementptr inbounds double, double* %787, i64 %792
  %796 = load double, double* %794, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  %797 = load double, double* %795, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  %798 = fmul contract double %796, %797, !chpl.ast.id !2214
  %799 = tail call contract double @llvm.fabs.f64(double %798)
  %800 = fcmp contract olt double %799, %695, !chpl.ast.id !2215
  %801 = select i1 %800, double 0.000000e+00, double %798
  store double %801, double* %793, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193
  %802 = getelementptr inbounds double, double* %790, i64 %792
  %803 = load double, double* %802, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  %804 = fcmp contract ult double %803, %700, !chpl.ast.id !2216
  br i1 %804, label %806, label %805
; <label>:805:                                    ; preds = %791
  store double 0.000000e+00, double* %793, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  br label %806

; <label>:806:                                    ; preds = %805, %791
  %807 = phi double [ %801, %791 ], [ 0.000000e+00, %805 ]
  %808 = fcmp contract olt double %807, %706, !chpl.ast.id !2217
  br i1 %808, label %809, label %811

; <label>:809:                                    ; preds = %806
  %810 = bitcast double* %793 to i64*
  store i64 %698, i64* %810, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  br label %811

; <label>:811:                                    ; preds = %809, %806
  %812 = add nsw i64 %792, 1
  %813 = icmp slt i64 %792, %775
  br i1 %813, label %791, label %814, !llvm.loop !2213

Here's the optimized clang-included version of the same loop:

vector.body1035:                                  ; preds = %vector.body1035, %vector.ph1071.new
  %index1074 = phi i64 [ 0, %vector.ph1071.new ], [ %index.next1075.1, %vector.body1035 ]
  %niter = phi i64 [ %unroll_iter, %vector.ph1071.new ], [ %niter.nsub.1, %vector.body1035 ]
  %402 = add i64 %118, %index1074
  %403 = add nsw i64 %402, 1
  %404 = getelementptr inbounds double, double* %372, i64 %403
  %405 = getelementptr inbounds double, double* %375, i64 %403
  %406 = getelementptr inbounds double, double* %378, i64 %403
  %407 = bitcast double* %405 to <4 x double>*
  %wide.load1083 = load <4 x double>, <4 x double>* %407, align 8, !tbaa !26, !alias.scope !384
  %408 = bitcast double* %406 to <4 x double>*
  %wide.load1084 = load <4 x double>, <4 x double>* %408, align 8, !tbaa !26, !alias.scope !387
  %409 = fmul <4 x double> %wide.load1083, %wide.load1084
  %410 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %409)
  %411 = fcmp olt <4 x double> %410, %broadcast.splat1086
  %412 = select <4 x i1> %411, <4 x double> zeroinitializer, <4 x double> %409
  %413 = bitcast double* %404 to <4 x double>*
  store <4 x double> %412, <4 x double>* %413, align 8, !tbaa !26, !alias.scope !389, !noalias !391
  %414 = getelementptr inbounds double, double* %381, i64 %403
  %415 = bitcast double* %414 to <4 x double>*
  %wide.load1087 = load <4 x double>, <4 x double>* %415, align 8, !tbaa !26, !alias.scope !393
  %416 = fcmp ult <4 x double> %wide.load1087, %broadcast.splat1089
  %417 = select <4 x i1> %416, <4 x double> %412, <4 x double> zeroinitializer
  %418 = fcmp olt <4 x double> %417, %broadcast.splat1091
  %419 = xor <4 x i1> %416, <i1 true, i1 true, i1 true, i1 true>
  %420 = or <4 x i1> %418, %419
  %421 = select <4 x i1> %418, <4 x double> %broadcast.splat1091, <4 x double> %417
  %422 = bitcast double* %404 to <4 x double>*
  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %421, <4 x double>* %422, i32 8, <4 x i1> %420), !tbaa !26, !alias.scope !389, !noalias !391
  %index.next1075 = or i64 %index1074, 4
  %423 = add i64 %118, %index.next1075
  %424 = add nsw i64 %423, 1

Here is the un-optimized clang-included IR:

  %520 = load double*, double** %519, align 8, !tbaa !177
  store double* %520, double** %101, align 8, !tbaa !2
  %521 = load double*, double** %101, align 8, !tbaa !2
  %522 = load i64, i64* %87, align 8, !tbaa !16
  %523 = getelementptr inbounds double, double* %521, i64 %522
  store double* %523, double** %102, align 8, !tbaa !2
  %524 = load double*, double** %98, align 8, !tbaa !2
  %525 = load double, double* %524, align 8, !tbaa !178
  %526 = load double*, double** %102, align 8, !tbaa !2
  %527 = load double, double* %526, align 8, !tbaa !178
  %528 = fmul double %525, %527
  %529 = load double*, double** %94, align 8, !tbaa !2
  store double %528, double* %529, align 8, !tbaa !178
  %530 = load %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %52, align 8, !tbaa !2
  %531 = getelementptr inbounds %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %530, i32 0, i32 1
  %532 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %531, align 8, !tbaa !171
  store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %532, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %103, align 8, !tbaa !2
  %533 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %103, align 8, !tbaa !2
  store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %533, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %104, align 8, !tbaa !2
  %534 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %104, align 8, !tbaa !2
  %535 = getelementptr inbounds %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %534, i32 0, i32 8
  %536 = load double*, double** %535, align 8, !tbaa !177
  store double* %536, double** %105, align 8, !tbaa !2
  %537 = load double*, double** %105, align 8, !tbaa !2
  %538 = load i64, i64* %87, align 8, !tbaa !16
  %539 = getelementptr inbounds double, double* %537, i64 %538
  store double* %539, double** %106, align 8, !tbaa !2
  %540 = load double*, double** %106, align 8, !tbaa !2
  %541 = load double, double* %540, align 8, !tbaa !178
  %542 = call double @llvm.fabs.f64(double %541)
  store double %542, double* %107, align 8, !tbaa !178
  %543 = load double, double* %107, align 8, !tbaa !178
  %544 = load double, double* %65, align 8, !tbaa !178
  %545 = fcmp olt double %543, %544
  %546 = zext i1 %545 to i8
  store i8 %546, i8* %108, align 1, !tbaa !141
  %547 = load i8, i8* %108, align 1, !tbaa !141, !range !161
  %548 = trunc i8 %547 to i1
  br i1 %548, label %549, label %561

; <label>:549:                                    ; preds = %493
  %550 = load %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %52, align 8, !tbaa !2
  %551 = getelementptr inbounds %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %550, i32 0, i32 1
  %552 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %551, align 8, !tbaa !171
  store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %552, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %109, align 8, !tbaa !2
  %553 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %109, align 8, !tbaa !2
  store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %553, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %110, align 8, !tbaa !2
  %554 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %110, align 8, !tbaa !2
  %555 = getelementptr inbounds %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %554, i32 0, i32 8
  %556 = load double*, double** %555, align 8, !tbaa !177
  store double* %556, double** %111, align 8, !tbaa !2
  %557 = load double*, double** %111, align 8, !tbaa !2
  %558 = load i64, i64* %87, align 8, !tbaa !16
  %559 = getelementptr inbounds double, double* %557, i64 %558
  store double* %559, double** %112, align 8, !tbaa !2
  %560 = load double*, double** %112, align 8, !tbaa !2
  store double 0.000000e+00, double* %560, align 8, !tbaa !178
  br label %561

But that is hard to read, so here is the -O1 version:

; <label>:99:                                     ; preds = %116, %.lr.ph
  %.0361371 = phi i64 [ %.sroa.0158.0.copyload, %.lr.ph ], [ %117, %116 ]
  %100 = getelementptr inbounds double, double* %89, i64 %.0361371
  %101 = getelementptr inbounds double, double* %92, i64 %.0361371
  %102 = getelementptr inbounds double, double* %95, i64 %.0361371
  %103 = load double, double* %101, align 8, !tbaa !152
  %104 = load double, double* %102, align 8, !tbaa !152
  %105 = fmul double %103, %104
  %106 = tail call double @llvm.fabs.f64(double %105)
  %107 = fcmp olt double %106, %70
  %storemerge = select i1 %107, double 0.000000e+00, double %105
  store double %storemerge, double* %100, align 8, !tbaa !152
  %108 = getelementptr inbounds double, double* %98, i64 %.0361371
  %109 = load double, double* %108, align 8, !tbaa !152
  %110 = fcmp ult double %109, %74
  br i1 %110, label %112, label %111

; <label>:111:                                    ; preds = %99
  store double 0.000000e+00, double* %100, align 8, !tbaa !152
  br label %112
; <label>:112:                                    ; preds = %99, %111
  %113 = load double, double* %100, align 8, !tbaa !152
  %114 = fcmp olt double %113, %72
  br i1 %114, label %115, label %116

; <label>:115:                                    ; preds = %112
  store double %72, double* %100, align 8, !tbaa !152
  br label %116

; <label>:116:                                    ; preds = %112, %115
  %117 = add nsw i64 %.0361371, 1
  %118 = icmp slt i64 %.0361371, %.sroa.5.0.copyload
  br i1 %118, label %99, label %._crit_edge

._crit_edge:                                      ; preds = %116, %85
  %119 = add nuw nsw i64 %.0362372, 1
  %120 = icmp slt i64 %.0362372, %.0363
  br i1 %120, label %85, label %._crit_edge375

It looks like much of the issue comes from an llvm.invariant.start that ends up (somehow) causing this code to be generated:

  %810 = bitcast double* %793 to i64*
  store i64 %698, i64* %810, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213

Removing the llvm.invariant.start related to that pointer, or replacing that bitcast/store with a store of 0, allows the loop to vectorize.

This is reported as an LLVM bug here: https://bugs.llvm.org/show_bug.cgi?id=39852

mppf commented 5 years ago

planckian: here both --llvm and clang-included are slower than GCC when --no-ieee-float is provided. Here is a performance reproducer:

use Time;

// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 5000000; // "short"
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 171; // "short"

proc main() {
  var ltimer:Timer;
  var RealArray_1D: [0..10][0..#len] real;

  // The stuff from LCALSLoops.chpl loopInit
  for i in 1..5 do
    initData(RealArray_1D[i-1], i);

  // This portion from RunBRawLoops.chpl runBRawLoops e.g.
  ref x = RealArray_1D[0];
  ref y = RealArray_1D[1];
  ref u = RealArray_1D[2];
  ref v  = RealArray_1D[3];
  ref w  = RealArray_1D[4];
  var expmax = 20.0;
  u[len-1] = 0.99 * expmax*v[len-1];
  ltimer.start();
  for 0..#num_samples {
    //for k in vectorizeOnly(0..#len) {
    for k in 0..#len {
      y[k] = u[k] / v[k];
      w[k] = x[k] / (exp(y[k]) - 1.0);
    }
  }
  ltimer.stop();

  var chksum: real = 0.0;
  // Get this portion from LCALSLoops.chpl loopFinalize 
  updateChksum(chksum, len, RealArray_1D[4]);

  // Get the Checksum from LCALSChecksums.chpl
  var expect = 391727437.25186200960888527333736;
  writeln("Got checksum ", chksum, " expected ", expect);
  writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}

proc initData(ra: [] real, id: int) {
  const factor: real = if id % 2 != 0 then 0.1 else 0.2;
  for (r,j) in zip(ra, 0..) {
    r = factor*(j + 1.1)/(j + 1.12345);
  }
}

proc updateChksum(ref chksum:real, length: int, 
                  ra: [] real, scale_factor: real = 1.0) {
  use LongDouble;

  ref data = ra;
  var len = ra.numElements;

  var tchk: longdouble = chksum;
  for (j, dat) in zip(0..#len, data) {
    tchk += (j+1)*dat*scale_factor;
  }
  chksum = tchk:real;
}

It's important to compile this with --no-ieee-float to see the difference. Here is the GCC assembly:

   0x417b68 <chpl_user_main+8040>:  mov    -0x828(%rbp),%rax
   0x417b6f <chpl_user_main+8047>:  vmovupd 0x0(%r13,%rbx,1),%ymm6
   0x417b76 <chpl_user_main+8054>:  vdivpd (%r14,%rbx,1),%ymm6,%ymm0
   0x417b7c <chpl_user_main+8060>:  vmovupd %ymm0,(%rax,%rbx,1)
   0x417b81 <chpl_user_main+8065>:  vmovupd (%r15,%rbx,1),%ymm1
   0x417b87 <chpl_user_main+8071>:  vmovapd %ymm1,-0x810(%rbp)
   0x417b8f <chpl_user_main+8079>:  callq  0x4d78c0 <_ZGVdN4v___exp_finite>
   0x417b94 <chpl_user_main+8084>:  vaddpd 0xc4304(%rip),%ymm0,%ymm0        # 0x4dbea0
   0x417b9c <chpl_user_main+8092>:  vmovapd -0x810(%rbp),%ymm1
   0x417ba4 <chpl_user_main+8100>:  vdivpd %ymm0,%ymm1,%ymm1
=> 0x417ba8 <chpl_user_main+8104>:  vmovupd %ymm1,(%r12,%rbx,1)
   0x417bae <chpl_user_main+8110>:  add    $0x20,%rbx
   0x417bb2 <chpl_user_main+8114>:  cmp    %rbx,-0x830(%rbp)
   0x417bb9 <chpl_user_main+8121>:  jne    0x417b68 <chpl_user_main+8040>

Vs the clang-included version:

   0x415f90 <chpl_user_main+6688>:  callq  0x404a20 <exp@plt>
   0x415f95 <chpl_user_main+6693>:  vmovapd %xmm0,0x70(%rsp)
   0x415f9b <chpl_user_main+6699>:  vpermilpd $0x1,0xe0(%rsp),%xmm0
   0x415fa6 <chpl_user_main+6710>:  callq  0x404a20 <exp@plt>
   0x415fab <chpl_user_main+6715>:  vmovapd 0x70(%rsp),%xmm1
   0x415fb1 <chpl_user_main+6721>:  vunpcklpd %xmm0,%xmm1,%xmm0
   0x415fb5 <chpl_user_main+6725>:  vmovapd %xmm0,0x70(%rsp)
   0x415fbb <chpl_user_main+6731>:  vmovups 0x80(%rsp),%ymm0
   0x415fc4 <chpl_user_main+6740>:  vzeroupper 
   0x415fc7 <chpl_user_main+6743>:  callq  0x404a20 <exp@plt>
=> 0x415fcc <chpl_user_main+6748>:  vmovaps %xmm0,0xe0(%rsp)
   0x415fd5 <chpl_user_main+6757>:  vpermilpd $0x1,0x80(%rsp),%xmm0
   0x415fe0 <chpl_user_main+6768>:  callq  0x404a20 <exp@plt>
   0x415fe5 <chpl_user_main+6773>:  vmovapd 0xe0(%rsp),%xmm1
   0x415fee <chpl_user_main+6782>:  vunpcklpd %xmm0,%xmm1,%xmm0
   0x415ff2 <chpl_user_main+6786>:  vinsertf128 $0x1,0x70(%rsp),%ymm0,%ymm0
   0x415ffa <chpl_user_main+6794>:  vaddpd 0x320(%rsp),%ymm0,%ymm0
   0x416003 <chpl_user_main+6803>:  vmovupd 0x160(%rsp),%ymm1
   0x41600c <chpl_user_main+6812>:  vdivpd %ymm0,%ymm1,%ymm0
   0x416010 <chpl_user_main+6816>:  mov    0x140(%rsp),%rax
   0x416018 <chpl_user_main+6824>:  vmovupd %ymm0,(%rax,%rbx,8)
   0x41601d <chpl_user_main+6829>:  or     $0x20,%r14

Or this version from --llvm:

   0x425350 <chpl_user_main+6544>:  vmovsd 0x8(%r13,%r14,8),%xmm0
   0x425357 <chpl_user_main+6551>:  vdivsd 0x8(%r15,%r14,8),%xmm0,%xmm0
   0x42535e <chpl_user_main+6558>:  vmovsd %xmm0,0x8(%rbx,%r14,8)
   0x425365 <chpl_user_main+6565>:  vmovsd 0x8(%r12,%r14,8),%xmm1
   0x42536c <chpl_user_main+6572>:  vmovsd %xmm1,0x80(%rsp)
   0x425375 <chpl_user_main+6581>:  callq  0x404730 <__exp_finite@plt>
   0x42537a <chpl_user_main+6586>:  vmovsd 0x8d3f6(%rip),%xmm1        # 0x4b2778
   0x425382 <chpl_user_main+6594>:  mov    0xe0(%rsp),%rax
   0x42538a <chpl_user_main+6602>:  vaddsd %xmm1,%xmm0,%xmm0
   0x42538e <chpl_user_main+6606>:  vmovsd 0x80(%rsp),%xmm1
   0x425397 <chpl_user_main+6615>:  vdivsd %xmm0,%xmm1,%xmm0
=> 0x42539b <chpl_user_main+6619>:  vmovsd %xmm0,0x8(%rbp,%r14,8)
   0x4253a2 <chpl_user_main+6626>:  add    $0x1,%r14
   0x4253a6 <chpl_user_main+6630>:  cmp    %rax,%r14
   0x4253a9 <chpl_user_main+6633>:  jl     0x425350 <chpl_user_main+6544>

I believe the problem here is that the LLVM-based versions are not calling a vectorized version of exp (note that the GCC assembly calls _ZGVdN4v___exp_finite, while the others call scalar exp / __exp_finite).
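
A quick way to check that the scalar exp calls account for the gap (a hypothetical experiment on the reproducer above, not something measured here) would be to substitute a cheap arithmetic stand-in for exp and see whether the --llvm and GCC timings converge:

  for 0..#num_samples {
    for k in 0..#len {
      y[k] = u[k] / v[k];
      // stand-in for exp(y[k]) - 1.0; the checksum will no longer match,
      // so only the relative timings are meaningful
      w[k] = x[k] / (y[k] * y[k] + 1.0);
    }
  }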

mppf commented 5 years ago

I've created #11772 to summarize the results of this spike.