chapel-lang / chapel

a Productive Parallel Programming Language
https://chapel-lang.org

spike: investigate LLVM performance difference for LCALS #11159

Closed mppf closed 5 years ago

mppf commented 6 years ago

LCALS performance differs for --llvm, especially for the raw, short configuration, but to some degree for the other benchmarks as well.

https://chapel-lang.org/perf/chapcs/llvm/?startdate=2018/09/12&graphs=lcalsrawshort,lcalsrawmedium,lcalsrawlong,lcalsrawompshort,lcalsrawompmedium,lcalsrawomplong

For example:

test            C        llvm
muladdsub       1.23 s   3.36 s
planckian       0.96 s   3.12 s
init3           0.74 s   1.8 s
trap_int        0.77 s   1.38 s
imp_hydro_2d    0.65 s   0.81 s
couple          1.16 s   1.32 s
pic_1d          1.25 s   1.3 s
pressure_calc   1.19 s   1.27 s

This task is a spike to investigate the cause of these differences.

mppf commented 5 years ago

Looking at muladdsub. First, here is a small performance reproducer:

use Time;

// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 50000000; // "short"
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 171; // "short"

proc main() {
  var ltimer:Timer;
  var RealArray_1D: [0..10][0..#len] real;

  // The stuff from LCALSLoops.chpl loopInit
  for i in 1..5 do
    initData(RealArray_1D[i-1], i);

  // This portion from RunBRawLoops.chpl runBRawLoops e.g.
  ref out1 = RealArray_1D[0];
  ref out2 = RealArray_1D[1];
  ref out3 = RealArray_1D[2];
  ref in1  = RealArray_1D[3];
  ref in2  = RealArray_1D[4];
  ltimer.start();
  for 0..#num_samples {
    for i in 0..#len {
      out1[i] = in1[i] * in2[i];
      out2[i] = in1[i] + in2[i];
      out3[i] = in1[i] - in2[i];
    }
  }
  ltimer.stop();

  var chksum: real = 0.0;
  // Get this portion from LCALSLoops.chpl loopFinalize 
  updateChksum(chksum, len, RealArray_1D[0]);
  updateChksum(chksum, len, RealArray_1D[1]);
  updateChksum(chksum, len, RealArray_1D[2]);

  // Get the Checksum from LCALSChecksums.chpl
  var expect = 391727437.25186200960888527333736;
  writeln("Got checksum ", chksum, " expected ", expect);
  writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}

proc initData(ra: [] real, id: int) {
  const factor: real = if id % 2 != 0 then 0.1 else 0.2;
  for (r,j) in zip(ra, 0..) {
    r = factor*(j + 1.1)/(j + 1.12345);
  }
}

proc updateChksum(ref chksum:real, length: int, 
                  ra: [] real, scale_factor: real = 1.0) {
  use LongDouble;

  ref data = ra;
  var len = ra.numElements;

  var tchk: longdouble = chksum;
  for (j, dat) in zip(0..#len, data) {
    tchk += (j+1)*dat*scale_factor;
  }
  chksum = tchk:real;
}

Next, I ran the (optimized) C version in gdb and hit control-C in the middle of execution. This loop is apparently the kernel:

   0x423108 <chpl_user_main+6184>:  vmovupd (%r11,%rdx,1),%ymm0
   0x42310e <chpl_user_main+6190>:  vmulpd (%rbx,%rdx,1),%ymm0,%ymm0
   0x423113 <chpl_user_main+6195>:  add    $0x1,%r10
   0x423117 <chpl_user_main+6199>:  vmovupd %ymm0,(%r15,%rdx,1)
=> 0x42311d <chpl_user_main+6205>:  vmovupd (%r11,%rdx,1),%ymm0
   0x423123 <chpl_user_main+6211>:  vaddpd (%rbx,%rdx,1),%ymm0,%ymm0
   0x423128 <chpl_user_main+6216>:  vmovupd %ymm0,(%r14,%rdx,1)
   0x42312e <chpl_user_main+6222>:  vmovapd (%rbx,%rdx,1),%ymm0
   0x423133 <chpl_user_main+6227>:  vsubpd (%r11,%rdx,1),%ymm0,%ymm0
   0x423139 <chpl_user_main+6233>:  vmovupd %ymm0,(%r9,%rdx,1)
   0x42313f <chpl_user_main+6239>:  add    $0x20,%rdx
   0x423143 <chpl_user_main+6243>:  cmp    %r10,%r13
   0x423146 <chpl_user_main+6246>:  ja     0x423108 <chpl_user_main+6184>

Here is the kernel in the optimized LLVM version:

   0x428e00 <chpl_user_main+6400>:  vmovsd 0x8(%rcx,%rbp,8),%xmm0
   0x428e06 <chpl_user_main+6406>:  vmulsd 0x8(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e0c <chpl_user_main+6412>:  vmovsd %xmm0,0x8(%rax,%rbp,8)
   0x428e12 <chpl_user_main+6418>:  vmovsd 0x8(%rcx,%rbp,8),%xmm0
   0x428e18 <chpl_user_main+6424>:  vaddsd 0x8(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e1e <chpl_user_main+6430>:  vmovsd %xmm0,0x8(%rsi,%rbp,8)
=> 0x428e24 <chpl_user_main+6436>:  vmovsd 0x8(%rcx,%rbp,8),%xmm0
   0x428e2a <chpl_user_main+6442>:  vsubsd 0x8(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e30 <chpl_user_main+6448>:  vmovsd %xmm0,0x8(%rdi,%rbp,8)
   0x428e36 <chpl_user_main+6454>:  vmovsd 0x10(%rcx,%rbp,8),%xmm0
   0x428e3c <chpl_user_main+6460>:  vmulsd 0x10(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e42 <chpl_user_main+6466>:  vmovsd %xmm0,0x10(%rax,%rbp,8)
   0x428e48 <chpl_user_main+6472>:  vmovsd 0x10(%rcx,%rbp,8),%xmm0
   0x428e4e <chpl_user_main+6478>:  vaddsd 0x10(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e54 <chpl_user_main+6484>:  vmovsd %xmm0,0x10(%rsi,%rbp,8)
   0x428e5a <chpl_user_main+6490>:  vmovsd 0x10(%rcx,%rbp,8),%xmm0
   0x428e60 <chpl_user_main+6496>:  vsubsd 0x10(%rdx,%rbp,8),%xmm0,%xmm0
   0x428e66 <chpl_user_main+6502>:  vmovsd %xmm0,0x10(%rdi,%rbp,8)
   0x428e6c <chpl_user_main+6508>:  add    $0x2,%rbp
   0x428e70 <chpl_user_main+6512>:  cmp    %r14,%rbp
   0x428e73 <chpl_user_main+6515>:  jl     0x428e00 <chpl_user_main+6400>

This benchmark has equivalent performance with clang-included and with --llvm, so it's not necessarily the case that we're simply encoding the LLVM IR wrong. It seems that LLVM is not vectorizing.

If I adjust the program to run with vectorizeOnly, the LLVM version becomes faster than the GCC one.
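
For reference, the vectorizeOnly adjustment is just a change to the inner loop of the reproducer above; the later reproducers in this issue include the same change as a commented-out line. A minimal sketch of that adjustment:

  for 0..#num_samples {
    // vectorizeOnly marks the iterations as order-independent (emitting
    // vectorization hints without creating tasks), sidestepping the
    // missing aliasing information for the row references.
    for i in vectorizeOnly(0..#len) {
      out1[i] = in1[i] * in2[i];
      out2[i] = in1[i] + in2[i];
      out3[i] = in1[i] - in2[i];
    }
  }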

Possible remedy: extend no-alias metadata for the sub-array case.
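
To make the "sub-array case" concrete, here is a minimal sketch (my framing, not from the original report) of the two patterns in play:

  // Sub-array case: these refs are all views into rows of the same 2-D
  // array, and presumably no noalias metadata relates them, so LLVM must
  // assume the stores through out1 may overlap the loads through in1/in2.
  var RealArray_1D: [0..10][0..#len] real;
  ref out1 = RealArray_1D[0];
  ref in1  = RealArray_1D[3];
  ref in2  = RealArray_1D[4];

  // Distinct-array case: separately declared arrays, as in the next
  // comment's reproducer, evidently carry enough aliasing information
  // for LLVM to vectorize the same kernel.
  var out1b: [0..#len] real;
  var in1b:  [0..#len] real;
  var in2b:  [0..#len] real;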

mppf commented 5 years ago

The following version of the benchmark shows LLVM can be faster:

use Time;

// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 50000000; // "short"
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 171; // "short"

proc main() {
  var ltimer:Timer;
  var RealArray_1D0: [0..#len] real;
  var RealArray_1D1: [0..#len] real;
  var RealArray_1D2: [0..#len] real;
  var RealArray_1D3: [0..#len] real;
  var RealArray_1D4: [0..#len] real;
  // The stuff from LCALSLoops.chpl loopInit
  initData(RealArray_1D0, 1);
  initData(RealArray_1D1, 2);
  initData(RealArray_1D2, 3);
  initData(RealArray_1D3, 4);
  initData(RealArray_1D4, 5);

  // This portion from RunBRawLoops.chpl runBRawLoops e.g.
  ltimer.start();
  for 0..#num_samples {
    //for i in vectorizeOnly(0..#len) {
    for i in 0..#len {
      RealArray_1D0[i] = RealArray_1D3[i] * RealArray_1D4[i];
      RealArray_1D1[i] = RealArray_1D3[i] + RealArray_1D4[i];
      RealArray_1D2[i] = RealArray_1D3[i] - RealArray_1D4[i];
    }
  }
  ltimer.stop();

  var chksum: real = 0.0;
  // Get this portion from LCALSLoops.chpl loopFinalize 
  updateChksum(chksum, len, RealArray_1D0);
  updateChksum(chksum, len, RealArray_1D1);
  updateChksum(chksum, len, RealArray_1D2);

  // Get the Checksum from LCALSChecksums.chpl
  var expect = 391727437.25186200960888527333736;
  writeln("Got checksum ", chksum, " expected ", expect);
  writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}

proc initData(ra: [] real, id: int) {
  const factor: real = if id % 2 != 0 then 0.1 else 0.2;
  for (r,j) in zip(ra, 0..) {
    r = factor*(j + 1.1)/(j + 1.12345);
  }
}

proc updateChksum(ref chksum:real, length: int, 
                  ra: [] real, scale_factor: real = 1.0) {
  use LongDouble;

  ref data = ra;
  var len = ra.numElements;

  var tchk: longdouble = chksum;
  for (j, dat) in zip(0..#len, data) {
    tchk += (j+1)*dat*scale_factor;
  }
  chksum = tchk:real;
}

Here LLVM is about 2x faster than GCC (due to #11324).

mppf commented 5 years ago

pressure_calc: the serial LLVM version is much slower than the C version, whether the latter is compiled with gcc or with clang-included. With vectorizeOnly, the LLVM version is still slower than the C version (from either).

Here is a performance reproducer:

use Time;

// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 30000;
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 44217;

proc main() {
  var ltimer:Timer;
  var RealArray_1D: [0..10][0..#len] real;
  var RealArray_scalars: [0..#10] real;

  // The stuff from LCALSLoops.chpl loopInit
  for i in 1..5 do
    initData(RealArray_1D[i-1], i);
  initData(RealArray_scalars, 1);

  // This portion from RunBRawLoops.chpl runBRawLoops e.g.
  ref compression = RealArray_1D[0];
  ref bvc = RealArray_1D[1];
  ref p_new = RealArray_1D[2];
  ref e_old = RealArray_1D[3];
  ref vnewc = RealArray_1D[4];

  const cls = RealArray_scalars[0];
  const p_cut = RealArray_scalars[1];
  const pmin = RealArray_scalars[2];
  const eosvmax = RealArray_scalars[3];
  ltimer.start();
  for 0..#num_samples {
    //for i in vectorizeOnly(0..#len) {
    for i in 0..#len {
      bvc[i] = cls * (compression[i] + 1.0);
    }
    //for i in vectorizeOnly(0..#len) {
    for i in 0..#len {
      p_new[i] = bvc[i] * e_old[i];
      if ( abs(p_new[i]) <  p_cut ) then p_new[i] = 0.0;
      if ( vnewc[i] >= eosvmax ) then p_new[i] = 0.0;
      if ( p_new[i]  <  pmin ) then p_new[i] = pmin;
    }
  }
  ltimer.stop();

  var chksum: real = 0.0;
  // Get this portion from LCALSLoops.chpl loopFinalize 
  updateChksum(chksum, len, RealArray_1D[2]);

  // Get the Checksum from LCALSChecksums.chpl
  var expect = 97197939.191977054695598781108856;
  writeln("Got checksum ", chksum, " expected ", expect);
  writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}

proc initData(ra: [] real, id: int) {
  const factor: real = if id % 2 != 0 then 0.1 else 0.2;
  for (r,j) in zip(ra, 0..) {
    r = factor*(j + 1.1)/(j + 1.12345);
  }
}

proc updateChksum(ref chksum:real, length: int, 
                  ra: [] real, scale_factor: real = 1.0) {
  use LongDouble;

  ref data = ra;
  var len = ra.numElements;

  var tchk: longdouble = chksum;
  for (j, dat) in zip(0..#len, data) {
    tchk += (j+1)*dat*scale_factor;
  }
  chksum = tchk:real;
}

This benchmark has two loops. Only the one setting p_new shows the performance difference.
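
One way to confirm which loop matters (a measurement sketch, not code from the original issue; it slots into the reproducer above, which already has use Time) is to give each loop its own Timer:

  var t1, t2: Timer;
  for 0..#num_samples {
    t1.start();
    for i in 0..#len do
      bvc[i] = cls * (compression[i] + 1.0);
    t1.stop();

    t2.start();
    for i in 0..#len {
      p_new[i] = bvc[i] * e_old[i];
      if ( abs(p_new[i]) <  p_cut ) then p_new[i] = 0.0;
      if ( vnewc[i] >= eosvmax ) then p_new[i] = 0.0;
      if ( p_new[i]  <  pmin ) then p_new[i] = pmin;
    }
    t2.stop();
  }
  // Timer accumulates across start/stop pairs, so these totals cover all
  // num_samples iterations of each loop.
  writeln("loop 1 total: ", t1.elapsed(), " s, loop 2 total: ", t2.elapsed(), " s");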

Here is the asm for the gcc version of the inner loop:

   0x423138 <chpl_user_main+6088>:  vmovupd (%r15,%rdx,1),%ymm0
   0x42313e <chpl_user_main+6094>:  vmulpd (%r14,%rdx,1),%ymm0,%ymm1
=> 0x423144 <chpl_user_main+6100>:  add    $0x1,%r9
   0x423148 <chpl_user_main+6104>:  vandpd %ymm8,%ymm1,%ymm0
   0x42314d <chpl_user_main+6109>:  vcmpltpd %ymm11,%ymm0,%ymm0
   0x423153 <chpl_user_main+6115>:  vandnpd %ymm1,%ymm0,%ymm0
   0x423157 <chpl_user_main+6119>:  vmovapd %ymm0,(%rbx,%rdx,1)
   0x42315c <chpl_user_main+6124>:  vcmplepd (%rcx,%rdx,1),%ymm10,%ymm1
   0x423162 <chpl_user_main+6130>:  vandnpd %ymm0,%ymm1,%ymm0
   0x423166 <chpl_user_main+6134>:  vmaxpd %ymm0,%ymm9,%ymm0
   0x42316a <chpl_user_main+6138>:  vmovapd %ymm0,(%rbx,%rdx,1)
   0x42316f <chpl_user_main+6143>:  add    $0x20,%rdx
   0x423173 <chpl_user_main+6147>:  cmp    %r9,%r11
   0x423176 <chpl_user_main+6150>:  ja     0x423138 <chpl_user_main+6088>

Here is the asm for the LLVM version of the inner loop:

   0x4293b0 <chpl_user_main+7616>:  vmovsd 0x8(%rcx,%rdi,8),%xmm1
   0x4293b6 <chpl_user_main+7622>:  vmulsd 0x8(%rdx,%rdi,8),%xmm1,%xmm1
   0x4293bc <chpl_user_main+7628>:  vandpd %xmm0,%xmm1,%xmm2
   0x4293c0 <chpl_user_main+7632>:  vcmpltsd %xmm3,%xmm2,%xmm2
   0x4293c5 <chpl_user_main+7637>:  vandnpd %xmm1,%xmm2,%xmm1
   0x4293c9 <chpl_user_main+7641>:  vmovsd %xmm1,0x8(%rax,%rdi,8)
   0x4293cf <chpl_user_main+7647>:  vmovsd 0x8(%rsi,%rdi,8),%xmm2
   0x4293d5 <chpl_user_main+7653>:  vucomisd %xmm4,%xmm2
   0x4293d9 <chpl_user_main+7657>:  jb     0x4293e8 <chpl_user_main+7672>
   0x4293db <chpl_user_main+7659>:  movq   $0x0,0x8(%rax,%rdi,8)
   0x4293e4 <chpl_user_main+7668>:  vxorpd %xmm1,%xmm1,%xmm1
   0x4293e8 <chpl_user_main+7672>:  vucomisd %xmm1,%xmm5
   0x4293ec <chpl_user_main+7676>:  jbe    0x4293f3 <chpl_user_main+7683>
   0x4293ee <chpl_user_main+7678>:  mov    %rbx,0x8(%rax,%rdi,8)
   0x4293f3 <chpl_user_main+7683>:  vmovsd 0x10(%rcx,%rdi,8),%xmm1
   0x4293f9 <chpl_user_main+7689>:  vmulsd 0x10(%rdx,%rdi,8),%xmm1,%xmm1
   0x4293ff <chpl_user_main+7695>:  vandpd %xmm0,%xmm1,%xmm2
   0x429403 <chpl_user_main+7699>:  vcmpltsd %xmm3,%xmm2,%xmm2
=> 0x429408 <chpl_user_main+7704>:  vandnpd %xmm1,%xmm2,%xmm1
   0x42940c <chpl_user_main+7708>:  vmovsd %xmm1,0x10(%rax,%rdi,8)
   0x429412 <chpl_user_main+7714>:  vmovsd 0x10(%rsi,%rdi,8),%xmm2
   0x429418 <chpl_user_main+7720>:  vucomisd %xmm4,%xmm2
   0x42941c <chpl_user_main+7724>:  jae    0x429440 <chpl_user_main+7760>
   0x42941e <chpl_user_main+7726>:  vucomisd %xmm1,%xmm5
   0x429422 <chpl_user_main+7730>:  jbe    0x429453 <chpl_user_main+7779>
   0x429424 <chpl_user_main+7732>:  mov    %rbx,0x10(%rax,%rdi,8)
   0x429429 <chpl_user_main+7737>:  add    $0x2,%rdi
   0x42942d <chpl_user_main+7741>:  cmp    %r12,%rdi
   0x429430 <chpl_user_main+7744>:  jl     0x4293b0 <chpl_user_main+7616>
   0x429436 <chpl_user_main+7750>:  jmp    0x429460 <chpl_user_main+7792>
   0x429438 <chpl_user_main+7752>:  nopl   0x0(%rax,%rax,1)
   0x429440 <chpl_user_main+7760>:  movq   $0x0,0x10(%rax,%rdi,8)
   0x429449 <chpl_user_main+7769>:  vxorpd %xmm1,%xmm1,%xmm1
   0x42944d <chpl_user_main+7773>:  vucomisd %xmm1,%xmm5
   0x429451 <chpl_user_main+7777>:  ja     0x429424 <chpl_user_main+7732>
   0x429453 <chpl_user_main+7779>:  add    $0x2,%rdi
   0x429457 <chpl_user_main+7783>:  cmp    %r12,%rdi
   0x42945a <chpl_user_main+7786>:  jl     0x4293b0 <chpl_user_main+7616>

Here again vectorization is not occurring.

Here is the LLVM vectorizeOnly version:

   0x429420 <chpl_user_main+7648>:  vmovsd 0x8(%rsi,%rcx,8),%xmm0
   0x429426 <chpl_user_main+7654>:  vmulsd 0x8(%rdi,%rcx,8),%xmm0,%xmm0
   0x42942c <chpl_user_main+7660>:  vandpd %xmm5,%xmm0,%xmm1
   0x429430 <chpl_user_main+7664>:  vcmpltsd %xmm2,%xmm1,%xmm1
   0x429435 <chpl_user_main+7669>:  vandnpd %xmm0,%xmm1,%xmm0
   0x429439 <chpl_user_main+7673>:  vmovsd %xmm0,0x8(%rdx,%rcx,8)
   0x42943f <chpl_user_main+7679>:  vmovq  0x8(%rbx,%rcx,8),%xmm1
=> 0x429445 <chpl_user_main+7685>:  vucomisd %xmm3,%xmm1
   0x429449 <chpl_user_main+7689>:  jae    0x429470 <chpl_user_main+7728>
   0x42944b <chpl_user_main+7691>:  vucomisd %xmm0,%xmm4
   0x42944f <chpl_user_main+7695>:  jbe    0x429483 <chpl_user_main+7747>
   0x429451 <chpl_user_main+7697>:  mov    %r14,0x8(%rdx,%rcx,8)
   0x429456 <chpl_user_main+7702>:  add    $0x1,%rcx
   0x42945a <chpl_user_main+7706>:  cmp    %rax,%rcx
   0x42945d <chpl_user_main+7709>:  jl     0x429420 <chpl_user_main+7648>
   0x42945f <chpl_user_main+7711>:  jmp    0x42948c <chpl_user_main+7756>
   0x429461 <chpl_user_main+7713>:  data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
   0x429470 <chpl_user_main+7728>:  movq   $0x0,0x8(%rdx,%rcx,8)
   0x429479 <chpl_user_main+7737>:  vxorpd %xmm0,%xmm0,%xmm0
   0x42947d <chpl_user_main+7741>:  vucomisd %xmm0,%xmm4
   0x429481 <chpl_user_main+7745>:  ja     0x429451 <chpl_user_main+7697>
   0x429483 <chpl_user_main+7747>:  add    $0x1,%rcx
   0x429487 <chpl_user_main+7751>:  cmp    %rax,%rcx
   0x42948a <chpl_user_main+7754>:  jl     0x429420 <chpl_user_main+7648>

Here is part of the asm for the clang-included version:

=> 0x4164d0 <chpl_user_main+8032>:  vmovupd (%rdi,%rdx,1),%ymm4
   0x4164d5 <chpl_user_main+8037>:  vmulpd (%rcx,%rdx,1),%ymm4,%ymm4
   0x4164da <chpl_user_main+8042>:  vbroadcastsd 0x9c6fd(%rip),%ymm5        # 0x4b2be0
   0x4164e3 <chpl_user_main+8051>:  vandpd %ymm5,%ymm4,%ymm6
   0x4164e7 <chpl_user_main+8055>:  vcmpnltpd %ymm8,%ymm6,%ymm6
   0x4164ed <chpl_user_main+8061>:  vandpd %ymm4,%ymm6,%ymm4
   0x4164f1 <chpl_user_main+8065>:  vmovupd %ymm4,0x0(%rbp,%rdx,1)
   0x4164f7 <chpl_user_main+8071>:  vmovupd (%rax,%rdx,1),%ymm6
   0x4164fc <chpl_user_main+8076>:  vcmplepd %ymm6,%ymm10,%ymm7
   0x416501 <chpl_user_main+8081>:  vcmpnlepd %ymm6,%ymm10,%ymm6
   0x416506 <chpl_user_main+8086>:  vandpd %ymm4,%ymm6,%ymm4
   0x41650a <chpl_user_main+8090>:  vcmpltpd %ymm2,%ymm4,%ymm6
   0x41650f <chpl_user_main+8095>:  vorpd  %ymm7,%ymm6,%ymm6
   0x416513 <chpl_user_main+8099>:  vmaxpd %ymm4,%ymm2,%ymm4
   0x416517 <chpl_user_main+8103>:  vmaskmovpd %ymm4,%ymm6,0x0(%rbp,%rdx,1)
   0x41651e <chpl_user_main+8110>:  vmovupd 0x20(%rdi,%rdx,1),%ymm4
   0x416524 <chpl_user_main+8116>:  vmulpd 0x20(%rcx,%rdx,1),%ymm4,%ymm4
   0x41652a <chpl_user_main+8122>:  vandpd %ymm5,%ymm4,%ymm5
   0x41652e <chpl_user_main+8126>:  vcmpnltpd %ymm8,%ymm5,%ymm5
   0x416534 <chpl_user_main+8132>:  vandpd %ymm4,%ymm5,%ymm4
   0x416538 <chpl_user_main+8136>:  vmovupd %ymm4,0x20(%rbp,%rdx,1)
   0x41653e <chpl_user_main+8142>:  vmovupd 0x20(%rax,%rdx,1),%ymm5
   0x416544 <chpl_user_main+8148>:  vcmplepd %ymm5,%ymm10,%ymm6
   0x416549 <chpl_user_main+8153>:  vcmpnlepd %ymm5,%ymm10,%ymm5
   0x41654e <chpl_user_main+8158>:  vandpd %ymm4,%ymm5,%ymm4
   0x416552 <chpl_user_main+8162>:  vcmpltpd %ymm2,%ymm4,%ymm5
   0x416557 <chpl_user_main+8167>:  vorpd  %ymm6,%ymm5,%ymm5
   0x41655b <chpl_user_main+8171>:  vmaxpd %ymm4,%ymm2,%ymm4
   0x41655f <chpl_user_main+8175>:  vmaskmovpd %ymm4,%ymm5,0x20(%rbp,%rdx,1)
   0x416566 <chpl_user_main+8182>:  add    $0x8,%rbx
   0x41656a <chpl_user_main+8186>:  add    $0x40,%rdx
   0x41656e <chpl_user_main+8190>:  add    $0x2,%rsi
   0x416572 <chpl_user_main+8194>:  jne    0x4164d0 <chpl_user_main+8032>

Here is the LLVM IR for the --llvm vectorizeOnly version:

; <label>:791:                                    ; preds = %811, %778
  %792 = phi i64 [ %812, %811 ], [ %776, %778 ]
  %793 = getelementptr inbounds double, double* %781, i64 %792
  %794 = getelementptr inbounds double, double* %784, i64 %792
  %795 = getelementptr inbounds double, double* %787, i64 %792
  %796 = load double, double* %794, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  %797 = load double, double* %795, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  %798 = fmul contract double %796, %797, !chpl.ast.id !2214
  %799 = tail call contract double @llvm.fabs.f64(double %798)
  %800 = fcmp contract olt double %799, %695, !chpl.ast.id !2215
  %801 = select i1 %800, double 0.000000e+00, double %798
  store double %801, double* %793, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193
  %802 = getelementptr inbounds double, double* %790, i64 %792
  %803 = load double, double* %802, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  %804 = fcmp contract ult double %803, %700, !chpl.ast.id !2216
  br i1 %804, label %806, label %805
; <label>:805:                                    ; preds = %791
  store double 0.000000e+00, double* %793, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  br label %806

; <label>:806:                                    ; preds = %805, %791
  %807 = phi double [ %801, %791 ], [ 0.000000e+00, %805 ]
  %808 = fcmp contract olt double %807, %706, !chpl.ast.id !2217
  br i1 %808, label %809, label %811

; <label>:809:                                    ; preds = %806
  %810 = bitcast double* %793 to i64*
  store i64 %698, i64* %810, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213
  br label %811

; <label>:811:                                    ; preds = %809, %806
  %812 = add nsw i64 %792, 1
  %813 = icmp slt i64 %792, %775
  br i1 %813, label %791, label %814, !llvm.loop !2213

Here's the optimized clang-included version of the same loop:

vector.body1035:                                  ; preds = %vector.body1035, %vector.ph1071.new
  %index1074 = phi i64 [ 0, %vector.ph1071.new ], [ %index.next1075.1, %vector.body1035 ]
  %niter = phi i64 [ %unroll_iter, %vector.ph1071.new ], [ %niter.nsub.1, %vector.body1035 ]
  %402 = add i64 %118, %index1074
  %403 = add nsw i64 %402, 1
  %404 = getelementptr inbounds double, double* %372, i64 %403
  %405 = getelementptr inbounds double, double* %375, i64 %403
  %406 = getelementptr inbounds double, double* %378, i64 %403
  %407 = bitcast double* %405 to <4 x double>*
  %wide.load1083 = load <4 x double>, <4 x double>* %407, align 8, !tbaa !26, !alias.scope !384
  %408 = bitcast double* %406 to <4 x double>*
  %wide.load1084 = load <4 x double>, <4 x double>* %408, align 8, !tbaa !26, !alias.scope !387
  %409 = fmul <4 x double> %wide.load1083, %wide.load1084
  %410 = call <4 x double> @llvm.fabs.v4f64(<4 x double> %409)
  %411 = fcmp olt <4 x double> %410, %broadcast.splat1086
  %412 = select <4 x i1> %411, <4 x double> zeroinitializer, <4 x double> %409
  %413 = bitcast double* %404 to <4 x double>*
  store <4 x double> %412, <4 x double>* %413, align 8, !tbaa !26, !alias.scope !389, !noalias !391
  %414 = getelementptr inbounds double, double* %381, i64 %403
  %415 = bitcast double* %414 to <4 x double>*
  %wide.load1087 = load <4 x double>, <4 x double>* %415, align 8, !tbaa !26, !alias.scope !393
  %416 = fcmp ult <4 x double> %wide.load1087, %broadcast.splat1089
  %417 = select <4 x i1> %416, <4 x double> %412, <4 x double> zeroinitializer
  %418 = fcmp olt <4 x double> %417, %broadcast.splat1091
  %419 = xor <4 x i1> %416, <i1 true, i1 true, i1 true, i1 true>
  %420 = or <4 x i1> %418, %419
  %421 = select <4 x i1> %418, <4 x double> %broadcast.splat1091, <4 x double> %417
  %422 = bitcast double* %404 to <4 x double>*
  call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %421, <4 x double>* %422, i32 8, <4 x i1> %420), !tbaa !26, !alias.scope !389, !noalias !391
  %index.next1075 = or i64 %index1074, 4
  %423 = add i64 %118, %index.next1075
  %424 = add nsw i64 %423, 1

Here is the un-optimized clang-included IR:

  %520 = load double*, double** %519, align 8, !tbaa !177
  store double* %520, double** %101, align 8, !tbaa !2
  %521 = load double*, double** %101, align 8, !tbaa !2
  %522 = load i64, i64* %87, align 8, !tbaa !16
  %523 = getelementptr inbounds double, double* %521, i64 %522
  store double* %523, double** %102, align 8, !tbaa !2
  %524 = load double*, double** %98, align 8, !tbaa !2
  %525 = load double, double* %524, align 8, !tbaa !178
  %526 = load double*, double** %102, align 8, !tbaa !2
  %527 = load double, double* %526, align 8, !tbaa !178
  %528 = fmul double %525, %527
  %529 = load double*, double** %94, align 8, !tbaa !2
  store double %528, double* %529, align 8, !tbaa !178
  %530 = load %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %52, align 8, !tbaa !2
  %531 = getelementptr inbounds %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %530, i32 0, i32 1
  %532 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %531, align 8, !tbaa !171
  store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %532, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %103, align 8, !tbaa !2
  %533 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %103, align 8, !tbaa !2
  store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %533, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %104, align 8, !tbaa !2
  %534 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %104, align 8, !tbaa !2
  %535 = getelementptr inbounds %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %534, i32 0, i32 8
  %536 = load double*, double** %535, align 8, !tbaa !177
  store double* %536, double** %105, align 8, !tbaa !2
  %537 = load double*, double** %105, align 8, !tbaa !2
  %538 = load i64, i64* %87, align 8, !tbaa !16
  %539 = getelementptr inbounds double, double* %537, i64 %538
  store double* %539, double** %106, align 8, !tbaa !2
  %540 = load double*, double** %106, align 8, !tbaa !2
  %541 = load double, double* %540, align 8, !tbaa !178
  %542 = call double @llvm.fabs.f64(double %541)
  store double %542, double* %107, align 8, !tbaa !178
  %543 = load double, double* %107, align 8, !tbaa !178
  %544 = load double, double* %65, align 8, !tbaa !178
  %545 = fcmp olt double %543, %544
  %546 = zext i1 %545 to i8
  store i8 %546, i8* %108, align 1, !tbaa !141
  %547 = load i8, i8* %108, align 1, !tbaa !141, !range !161
  %548 = trunc i8 %547 to i1
  br i1 %548, label %549, label %561

; <label>:549:                                    ; preds = %493
  %550 = load %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %52, align 8, !tbaa !2
  %551 = getelementptr inbounds %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl__array_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %550, i32 0, i32 1
  %552 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %551, align 8, !tbaa !171
  store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %552, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %109, align 8, !tbaa !2
  %553 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %109, align 8, !tbaa !2
  store %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %553, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %110, align 8, !tbaa !2
  %554 = load %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s*, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s** %110, align 8, !tbaa !2
  %555 = getelementptr inbounds %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s, %struct.chpl_DefaultRectangularArr_1_int64_t_F__real64_int64_t_s* %554, i32 0, i32 8
  %556 = load double*, double** %555, align 8, !tbaa !177
  store double* %556, double** %111, align 8, !tbaa !2
  %557 = load double*, double** %111, align 8, !tbaa !2
  %558 = load i64, i64* %87, align 8, !tbaa !16
  %559 = getelementptr inbounds double, double* %557, i64 %558
  store double* %559, double** %112, align 8, !tbaa !2
  %560 = load double*, double** %112, align 8, !tbaa !2
  store double 0.000000e+00, double* %560, align 8, !tbaa !178
  br label %561

But that is hard to read, so here is the -O1 version:

; <label>:99:                                     ; preds = %116, %.lr.ph
  %.0361371 = phi i64 [ %.sroa.0158.0.copyload, %.lr.ph ], [ %117, %116 ]
  %100 = getelementptr inbounds double, double* %89, i64 %.0361371
  %101 = getelementptr inbounds double, double* %92, i64 %.0361371
  %102 = getelementptr inbounds double, double* %95, i64 %.0361371
  %103 = load double, double* %101, align 8, !tbaa !152
  %104 = load double, double* %102, align 8, !tbaa !152
  %105 = fmul double %103, %104
  %106 = tail call double @llvm.fabs.f64(double %105)
  %107 = fcmp olt double %106, %70
  %storemerge = select i1 %107, double 0.000000e+00, double %105
  store double %storemerge, double* %100, align 8, !tbaa !152
  %108 = getelementptr inbounds double, double* %98, i64 %.0361371
  %109 = load double, double* %108, align 8, !tbaa !152
  %110 = fcmp ult double %109, %74
  br i1 %110, label %112, label %111

; <label>:111:                                    ; preds = %99
  store double 0.000000e+00, double* %100, align 8, !tbaa !152
  br label %112
; <label>:112:                                    ; preds = %99, %111
  %113 = load double, double* %100, align 8, !tbaa !152
  %114 = fcmp olt double %113, %72
  br i1 %114, label %115, label %116

; <label>:115:                                    ; preds = %112
  store double %72, double* %100, align 8, !tbaa !152
  br label %116

; <label>:116:                                    ; preds = %112, %115
  %117 = add nsw i64 %.0361371, 1
  %118 = icmp slt i64 %.0361371, %.sroa.5.0.copyload
  br i1 %118, label %99, label %._crit_edge

._crit_edge:                                      ; preds = %116, %85
  %119 = add nuw nsw i64 %.0362372, 1
  %120 = icmp slt i64 %.0362372, %.0363
  br i1 %120, label %85, label %._crit_edge375

It looks like much of the issue comes from an llvm.invariant.start that ends up (somehow) causing this code to be generated:

  %810 = bitcast double* %793 to i64*
  store i64 %698, i64* %810, align 8, !tbaa !134, !alias.scope !2191, !noalias !2193, !llvm.mem.parallel_loop_access !2213

Removing the llvm.invariant.start related to that pointer, or replacing that bitcast/store with a store of 0, allows the loop to vectorize.

This is reported as an LLVM bug here: https://bugs.llvm.org/show_bug.cgi?id=39852

mppf commented 5 years ago

planckian: here both --llvm and clang-included are slower than GCC when --no-ieee-float is provided. Here is a performance reproducer:

use Time;

// num_samples: 3000 for "long", 300000 for "medium", 50000000 for "short"
config const num_samples = 5000000; // "short"
// array length: 44217 for "long", 5001 for "medium", 171 for "short"
config const len = 171; // "short"

proc main() {
  var ltimer:Timer;
  var RealArray_1D: [0..10][0..#len] real;

  // The stuff from LCALSLoops.chpl loopInit
  for i in 1..5 do
    initData(RealArray_1D[i-1], i);

  // This portion from RunBRawLoops.chpl runBRawLoops e.g.
  ref x = RealArray_1D[0];
  ref y = RealArray_1D[1];
  ref u = RealArray_1D[2];
  ref v  = RealArray_1D[3];
  ref w  = RealArray_1D[4];
  var expmax = 20.0;
  u[len-1] = 0.99 * expmax*v[len-1];
  ltimer.start();
  for 0..#num_samples {
    //for k in vectorizeOnly(0..#len) {
    for k in 0..#len {
      y[k] = u[k] / v[k];
      w[k] = x[k] / (exp(y[k]) - 1.0);
    }
  }
  ltimer.stop();

  var chksum: real = 0.0;
  // Get this portion from LCALSLoops.chpl loopFinalize 
  updateChksum(chksum, len, RealArray_1D[4]);

  // Get the Checksum from LCALSChecksums.chpl
  var expect = 391727437.25186200960888527333736;
  writeln("Got checksum ", chksum, " expected ", expect);
  writeln("Kernel ran in ", ltimer.elapsed(), " seconds");
}

proc initData(ra: [] real, id: int) {
  const factor: real = if id % 2 != 0 then 0.1 else 0.2;
  for (r,j) in zip(ra, 0..) {
    r = factor*(j + 1.1)/(j + 1.12345);
  }
}

proc updateChksum(ref chksum:real, length: int, 
                  ra: [] real, scale_factor: real = 1.0) {
  use LongDouble;

  ref data = ra;
  var len = ra.numElements;

  var tchk: longdouble = chksum;
  for (j, dat) in zip(0..#len, data) {
    tchk += (j+1)*dat*scale_factor;
  }
  chksum = tchk:real;
}

It's important to compile this with --no-ieee-float to see the difference. Here is the GCC assembly:

   0x417b68 <chpl_user_main+8040>:  mov    -0x828(%rbp),%rax
   0x417b6f <chpl_user_main+8047>:  vmovupd 0x0(%r13,%rbx,1),%ymm6
   0x417b76 <chpl_user_main+8054>:  vdivpd (%r14,%rbx,1),%ymm6,%ymm0
   0x417b7c <chpl_user_main+8060>:  vmovupd %ymm0,(%rax,%rbx,1)
   0x417b81 <chpl_user_main+8065>:  vmovupd (%r15,%rbx,1),%ymm1
   0x417b87 <chpl_user_main+8071>:  vmovapd %ymm1,-0x810(%rbp)
   0x417b8f <chpl_user_main+8079>:  callq  0x4d78c0 <_ZGVdN4v___exp_finite>
   0x417b94 <chpl_user_main+8084>:  vaddpd 0xc4304(%rip),%ymm0,%ymm0        # 0x4dbea0
   0x417b9c <chpl_user_main+8092>:  vmovapd -0x810(%rbp),%ymm1
   0x417ba4 <chpl_user_main+8100>:  vdivpd %ymm0,%ymm1,%ymm1
=> 0x417ba8 <chpl_user_main+8104>:  vmovupd %ymm1,(%r12,%rbx,1)
   0x417bae <chpl_user_main+8110>:  add    $0x20,%rbx
   0x417bb2 <chpl_user_main+8114>:  cmp    %rbx,-0x830(%rbp)
   0x417bb9 <chpl_user_main+8121>:  jne    0x417b68 <chpl_user_main+8040>

Vs the clang-included version:

   0x415f90 <chpl_user_main+6688>:  callq  0x404a20 <exp@plt>
   0x415f95 <chpl_user_main+6693>:  vmovapd %xmm0,0x70(%rsp)
   0x415f9b <chpl_user_main+6699>:  vpermilpd $0x1,0xe0(%rsp),%xmm0
   0x415fa6 <chpl_user_main+6710>:  callq  0x404a20 <exp@plt>
   0x415fab <chpl_user_main+6715>:  vmovapd 0x70(%rsp),%xmm1
   0x415fb1 <chpl_user_main+6721>:  vunpcklpd %xmm0,%xmm1,%xmm0
   0x415fb5 <chpl_user_main+6725>:  vmovapd %xmm0,0x70(%rsp)
   0x415fbb <chpl_user_main+6731>:  vmovups 0x80(%rsp),%ymm0
   0x415fc4 <chpl_user_main+6740>:  vzeroupper 
   0x415fc7 <chpl_user_main+6743>:  callq  0x404a20 <exp@plt>
=> 0x415fcc <chpl_user_main+6748>:  vmovaps %xmm0,0xe0(%rsp)
   0x415fd5 <chpl_user_main+6757>:  vpermilpd $0x1,0x80(%rsp),%xmm0
   0x415fe0 <chpl_user_main+6768>:  callq  0x404a20 <exp@plt>
   0x415fe5 <chpl_user_main+6773>:  vmovapd 0xe0(%rsp),%xmm1
   0x415fee <chpl_user_main+6782>:  vunpcklpd %xmm0,%xmm1,%xmm0
   0x415ff2 <chpl_user_main+6786>:  vinsertf128 $0x1,0x70(%rsp),%ymm0,%ymm0
   0x415ffa <chpl_user_main+6794>:  vaddpd 0x320(%rsp),%ymm0,%ymm0
   0x416003 <chpl_user_main+6803>:  vmovupd 0x160(%rsp),%ymm1
   0x41600c <chpl_user_main+6812>:  vdivpd %ymm0,%ymm1,%ymm0
   0x416010 <chpl_user_main+6816>:  mov    0x140(%rsp),%rax
   0x416018 <chpl_user_main+6824>:  vmovupd %ymm0,(%rax,%rbx,8)
   0x41601d <chpl_user_main+6829>:  or     $0x20,%r14

Or this version from --llvm:

   0x425350 <chpl_user_main+6544>:  vmovsd 0x8(%r13,%r14,8),%xmm0
   0x425357 <chpl_user_main+6551>:  vdivsd 0x8(%r15,%r14,8),%xmm0,%xmm0
   0x42535e <chpl_user_main+6558>:  vmovsd %xmm0,0x8(%rbx,%r14,8)
   0x425365 <chpl_user_main+6565>:  vmovsd 0x8(%r12,%r14,8),%xmm1
   0x42536c <chpl_user_main+6572>:  vmovsd %xmm1,0x80(%rsp)
   0x425375 <chpl_user_main+6581>:  callq  0x404730 <__exp_finite@plt>
   0x42537a <chpl_user_main+6586>:  vmovsd 0x8d3f6(%rip),%xmm1        # 0x4b2778
   0x425382 <chpl_user_main+6594>:  mov    0xe0(%rsp),%rax
   0x42538a <chpl_user_main+6602>:  vaddsd %xmm1,%xmm0,%xmm0
   0x42538e <chpl_user_main+6606>:  vmovsd 0x80(%rsp),%xmm1
   0x425397 <chpl_user_main+6615>:  vdivsd %xmm0,%xmm1,%xmm0
=> 0x42539b <chpl_user_main+6619>:  vmovsd %xmm0,0x8(%rbp,%r14,8)
   0x4253a2 <chpl_user_main+6626>:  add    $0x1,%r14
   0x4253a6 <chpl_user_main+6630>:  cmp    %rax,%r14
   0x4253a9 <chpl_user_main+6633>:  jl     0x425350 <chpl_user_main+6544>

I believe the problem here is that the LLVM-based versions are not calling a vectorized version of exp (note that the GCC assembly calls _ZGVdN4v___exp_finite, while the others call scalar exp / __exp_finite).
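
A quick way to check that the scalar exp calls account for the gap (a hypothetical experiment on the reproducer above, not something measured here) would be to substitute a cheap arithmetic stand-in for exp and see whether the --llvm and GCC timings converge:

  for 0..#num_samples {
    for k in 0..#len {
      y[k] = u[k] / v[k];
      // stand-in for exp(y[k]) - 1.0; the checksum will no longer match,
      // so only the relative timings are meaningful
      w[k] = x[k] / (y[k] * y[k] + 1.0);
    }
  }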

mppf commented 5 years ago

I've created #11772 to summarize the results of this spike.