Slowdown in YZ transform on PrgEnv-gcc

npadmana commented 5 years ago

@ronawho is finding slowdowns on the YZ transforms when running on PrgEnv-gcc, but not on PrgEnv-intel. It would be good to understand why this is happening...

See here :https://github.com/npadmana/DistributedFFT/issues/21#issuecomment-536147613

ronawho commented 5 years ago

This is still a problem for gcc and it looks like it impacts llvm as well. It looks like the Y-transform is the one that slows down. Results for intel, gcc, and llvm on swan for size E at 64 nodes below. For all configs, the Y step starts at .26s and remains that way for intel, but for gcc/llvm times go up to 0.50s.

Intel (Y=0.26, Z=~1.10, X=0.33):

yt=0.26, zt=1.09, xt=0.33  :Checksum(1) =  5.121601045346e+02 + 5.117395998266e+02i
yt=0.26, zt=1.10, xt=0.32  :Checksum(2) =  5.120905403678e+02 + 5.118614716182e+02i
yt=0.26, zt=1.00, xt=0.32  :Checksum(3) =  5.120623229306e+02 + 5.119074203747e+02i
yt=0.26, zt=1.08, xt=0.32  :Checksum(4) =  5.120438418997e+02 + 5.119345900733e+02i
yt=0.26, zt=1.09, xt=0.33  :Checksum(5) =  5.120311521872e+02 + 5.119551325550e+02i
yt=0.26, zt=1.09, xt=0.33  :Checksum(6) =  5.120226088809e+02 + 5.119720179919e+02i
yt=0.26, zt=1.09, xt=0.32  :Checksum(7) =  5.120169296534e+02 + 5.119861371665e+02i
yt=0.26, zt=1.11, xt=0.33  :Checksum(8) =  5.120131225172e+02 + 5.119979364402e+02i
yt=0.26, zt=1.11, xt=0.33  :Checksum(9) =  5.120104767108e+02 + 5.120077674092e+02i
yt=0.26, zt=1.10, xt=0.33  :Checksum(10) =  5.120085127969e+02 + 5.120159443121e+02i
yt=0.26, zt=1.08, xt=0.33  :Checksum(11) =  5.120069224127e+02 + 5.120227453670e+02i
yt=0.26, zt=1.10, xt=0.33  :Checksum(12) =  5.120055158164e+02 + 5.120284096041e+02i
yt=0.26, zt=1.09, xt=0.33  :Checksum(13) =  5.120041820159e+02 + 5.120331373793e+02i
yt=0.26, zt=1.10, xt=0.32  :Checksum(14) =  5.120028605402e+02 + 5.120370938679e+02i
yt=0.26, zt=1.08, xt=0.33  :Checksum(15) =  5.120015223011e+02 + 5.120404138831e+02i
yt=0.26, zt=1.09, xt=0.33  :Checksum(16) =  5.120001570022e+02 + 5.120432068837e+02i
yt=0.26, zt=1.10, xt=0.32  :Checksum(17) =  5.119987650555e+02 + 5.120455615860e+02i
yt=0.26, zt=1.09, xt=0.33  :Checksum(18) =  5.119973525091e+02 + 5.120475499442e+02i
yt=0.26, zt=1.08, xt=0.33  :Checksum(19) =  5.119959279472e+02 + 5.120492304629e+02i
yt=0.26, zt=1.12, xt=0.33  :Checksum(20) =  5.119945006558e+02 + 5.120506508902e+02i
yt=0.26, zt=1.09, xt=0.33  :Checksum(21) =  5.119930795911e+02 + 5.120518503782e+02i
yt=0.26, zt=1.08, xt=0.33  :Checksum(22) =  5.119916728462e+02 + 5.120528612016e+02i
yt=0.26, zt=1.04, xt=0.33  :Checksum(23) =  5.119902874185e+02 + 5.120537101195e+02i
yt=0.26, zt=1.10, xt=0.33  :Checksum(24) =  5.119889291565e+02 + 5.120544194514e+02i
yt=0.26, zt=1.08, xt=0.33  :Checksum(25) =  5.119876028049e+02 + 5.120550079284e+02i
Successful completion of NPB FT Benchmark Class E
Elapsed time :    59.4328
MFLOPS : 1319261.8877

gcc (Y=0.26->0.50, Z=~1.10, X=0.33):

yt=0.26, zt=1.08, xt=0.33  :Checksum(1) =  5.121601045346e+02 + 5.117395998266e+02i
yt=0.25, zt=1.13, xt=0.33  :Checksum(2) =  5.120905403678e+02 + 5.118614716182e+02i
yt=0.26, zt=1.02, xt=0.33  :Checksum(3) =  5.120623229306e+02 + 5.119074203747e+02i
yt=0.25, zt=1.11, xt=0.33  :Checksum(4) =  5.120438418997e+02 + 5.119345900733e+02i
yt=0.67, zt=0.87, xt=0.33  :Checksum(5) =  5.120311521872e+02 + 5.119551325550e+02i
yt=0.48, zt=1.03, xt=0.33  :Checksum(6) =  5.120226088809e+02 + 5.119720179919e+02i
yt=0.60, zt=0.94, xt=0.33  :Checksum(7) =  5.120169296534e+02 + 5.119861371665e+02i
yt=0.58, zt=0.99, xt=0.33  :Checksum(8) =  5.120131225172e+02 + 5.119979364402e+02i
yt=0.55, zt=1.02, xt=0.33  :Checksum(9) =  5.120104767108e+02 + 5.120077674092e+02i
yt=0.52, zt=0.98, xt=0.33  :Checksum(10) =  5.120085127969e+02 + 5.120159443121e+02i
yt=0.41, zt=0.99, xt=0.33  :Checksum(11) =  5.120069224127e+02 + 5.120227453670e+02i
yt=0.52, zt=0.95, xt=0.33  :Checksum(12) =  5.120055158164e+02 + 5.120284096041e+02i
yt=0.50, zt=0.96, xt=0.33  :Checksum(13) =  5.120041820159e+02 + 5.120331373793e+02i
yt=0.50, zt=0.94, xt=0.33  :Checksum(14) =  5.120028605402e+02 + 5.120370938679e+02i
yt=0.38, zt=0.98, xt=0.33  :Checksum(15) =  5.120015223011e+02 + 5.120404138831e+02i
yt=0.46, zt=0.98, xt=0.33  :Checksum(16) =  5.120001570022e+02 + 5.120432068837e+02i
yt=0.47, zt=1.00, xt=0.33  :Checksum(17) =  5.119987650555e+02 + 5.120455615860e+02i
yt=0.41, zt=0.99, xt=0.33  :Checksum(18) =  5.119973525091e+02 + 5.120475499442e+02i
yt=0.45, zt=0.95, xt=0.33  :Checksum(19) =  5.119959279472e+02 + 5.120492304629e+02i
yt=0.41, zt=1.02, xt=0.33  :Checksum(20) =  5.119945006558e+02 + 5.120506508902e+02i
yt=0.39, zt=1.01, xt=0.33  :Checksum(21) =  5.119930795911e+02 + 5.120518503782e+02i
yt=0.44, zt=0.97, xt=0.33  :Checksum(22) =  5.119916728462e+02 + 5.120528612016e+02i
yt=0.43, zt=1.07, xt=0.33  :Checksum(23) =  5.119902874185e+02 + 5.120537101195e+02i
yt=0.43, zt=0.97, xt=0.33  :Checksum(24) =  5.119889291565e+02 + 5.120544194514e+02i
yt=0.43, zt=1.03, xt=0.33  :Checksum(25) =  5.119876028049e+02 + 5.120550079284e+02i
Successful completion of NPB FT Benchmark Class E
Elapsed time :    64.7337
MFLOPS : 1211229.1869

llvm (Y=0.26->0.50, Z=~1.10, X=0.33):

yt=0.26, zt=1.05, xt=0.33  :Checksum(1) =  5.121601045346e+02 + 5.117395998266e+02i
yt=0.26, zt=1.08, xt=0.33  :Checksum(2) =  5.120905403678e+02 + 5.118614716182e+02i
yt=0.27, zt=0.99, xt=0.33  :Checksum(3) =  5.120623229306e+02 + 5.119074203747e+02i
yt=0.26, zt=1.06, xt=0.33  :Checksum(4) =  5.120438418997e+02 + 5.119345900733e+02i
yt=0.68, zt=0.86, xt=0.33  :Checksum(5) =  5.120311521872e+02 + 5.119551325550e+02i
yt=0.48, zt=0.93, xt=0.33  :Checksum(6) =  5.120226088809e+02 + 5.119720179919e+02i
yt=0.61, zt=0.89, xt=0.33  :Checksum(7) =  5.120169296534e+02 + 5.119861371665e+02i
yt=0.59, zt=1.02, xt=0.33  :Checksum(8) =  5.120131225172e+02 + 5.119979364402e+02i
yt=0.56, zt=0.85, xt=0.33  :Checksum(9) =  5.120104767108e+02 + 5.120077674092e+02i
yt=0.53, zt=0.95, xt=0.33  :Checksum(10) =  5.120085127969e+02 + 5.120159443121e+02i
yt=0.42, zt=1.02, xt=0.33  :Checksum(11) =  5.120069224127e+02 + 5.120227453670e+02i
yt=0.53, zt=0.96, xt=0.33  :Checksum(12) =  5.120055158164e+02 + 5.120284096041e+02i
yt=0.50, zt=0.97, xt=0.33  :Checksum(13) =  5.120041820159e+02 + 5.120331373793e+02i
yt=0.51, zt=0.97, xt=0.33  :Checksum(14) =  5.120028605402e+02 + 5.120370938679e+02i
yt=0.37, zt=0.97, xt=0.33  :Checksum(15) =  5.120015223011e+02 + 5.120404138831e+02i
yt=0.46, zt=0.99, xt=0.33  :Checksum(16) =  5.120001570022e+02 + 5.120432068837e+02i
yt=0.47, zt=1.02, xt=0.33  :Checksum(17) =  5.119987650555e+02 + 5.120455615860e+02i
yt=0.41, zt=1.03, xt=0.33  :Checksum(18) =  5.119973525091e+02 + 5.120475499442e+02i
yt=0.45, zt=1.00, xt=0.33  :Checksum(19) =  5.119959279472e+02 + 5.120492304629e+02i
yt=0.41, zt=1.00, xt=0.33  :Checksum(20) =  5.119945006558e+02 + 5.120506508902e+02i
yt=0.39, zt=1.02, xt=0.33  :Checksum(21) =  5.119930795911e+02 + 5.120518503782e+02i
yt=0.45, zt=1.04, xt=0.33  :Checksum(22) =  5.119916728462e+02 + 5.120528612016e+02i
yt=0.44, zt=0.98, xt=0.33  :Checksum(23) =  5.119902874185e+02 + 5.120537101195e+02i
yt=0.43, zt=1.00, xt=0.33  :Checksum(24) =  5.119889291565e+02 + 5.120544194514e+02i
yt=0.43, zt=1.06, xt=0.33  :Checksum(25) =  5.119876028049e+02 + 5.120550079284e+02i
Successful completion of NPB FT Benchmark Class E
Elapsed time :    64.7233
MFLOPS : 1211424.8601

Diff I'm using so I don't lose it:

```diff diff --git a/src/DistributedFFT.chpl b/src/DistributedFFT.chpl index e69ed78..b939f10 100644 --- a/src/DistributedFFT.chpl +++ b/src/DistributedFFT.chpl @@ -273,6 +273,7 @@ prototype module DistributedFFT { if SrcDom.dim(3) != DstDom.dim(3) then halt("Mismatched z ranges"); coforall loc in Locales do on loc { + var xt, yt, zt: Timer; var timeTrack = new TimeTracker(); const (xSrc, ySrc, zSrc) = SrcDom.localSubdomain().dims(); @@ -298,13 +299,16 @@ prototype module DistributedFFT { for ix in xSrc { // Y-transform timeTrack.start(); + yt.start(); forall (plan, myzRange) in yPlan.batch() { plan.execute(myplane[0, ySrc.first, myzRange.first]); } + yt.stop(); timeTrack.stop(TimeStages.Y); // Z-transform, offset to reduce comm congestion/collision timeTrack.start(); + zt.start(); forall iy in offset(ySrc) { zPlan.execute(myplane[0, iy, zSrc.first]); // Transpose data into Dst, and copy the next Src slice into myplane @@ -321,6 +325,7 @@ prototype module DistributedFFT { } } } + zt.stop(); timeTrack.stop(TimeStages.Z); } @@ -329,12 +334,15 @@ prototype module DistributedFFT { // X-transform timeTrack.start(); + xt.start(); forall (plan, myzRange) in xPlan.batch() { for iy in yDst { plan.execute(Dst[iy, xDst.first, myzRange.first]); } } + xt.stop(); timeTrack.stop(TimeStages.X); + if here.id == 0 then writef("yt=%.2dr, zt=%.2dr, xt=%.2dr :", yt.elapsed(), zt.elapsed(), xt.elapsed()); } } ```

ronawho commented 5 years ago

I think I need to see if I can create a smaller reproducer. My current guesses (bad task resetting, AVX memcpy slowing down processor for next Y iteration, and some other things) haven't really panned out, so I need something smaller to profile.

It's particularly confusing to me that performance starts off the same, and then drifts for gcc/llvm. I thought this could indicate some sort of numa-affinity drift (but I can't think of any reason that would be compiler dependent) or some sort of AVX/vector instruction triggering thermal throttling and lowering the clock. My guess was that a vectorized memcpy was throttling and we only saw it after a few iterations once the processors warmed up, but I put in some cool-down loops before the Y-transform and that didn't seem to impact timings.

npadmana commented 5 years ago

Yeah, this is bizarre. Note that the X transforms are basically doing the same thing as the Y transforms (a strided FFT). The only difference is that the arrays for the X transform are always the same, whereas for Y(and Z), we allocate a single plane as scratch space. I wonder if something funky is going on with the allocation.

ronawho commented 5 years ago

Could be, but I haven't found anything conclusive yet (affinity for myplane looks right to me) and operating directly on Src still has the same slowdown.

I stumbled across this, but it's probably not relevant http://www.fftw.org/faq/section3.html#slows (repeated fftw calls can slow down if there's a lot of NaNs.) I don't think we're producing NaN's but if we were this could make sense for intel vs gcc differences. Gcc does not optimize floating-point operations by default (you have to throw --no-ieee-float) but intel does more aggressive floating-point optimizations so that might explain why the compiler matters.

I'll try to come up with a smaller reproducer later, and I should probably also run the reference version with gcc to see if they have slowdowns too (to hopefully narrow down between Chapel and fftw being the root cause)

npadmana commented 5 years ago

I don't think we're producing NaNs.... otherwise our results would be all messed up.

Also, the reference version doesn't call into FFTW -- it has its own FFT implementation. The UPC version does call into FFTW. I'll aim to set that up later today, or maybe tomorrow.

Another point -- as far as I remember, the y-transform is just a call into FFTW. There's basically no Chapel in there. So I would have guessed that the backend shouldn't have mattered, unless FFTW is somehow taking a different path/plan in this second case.

ronawho commented 5 years ago

Also, the reference version doesn't call into FFTW -- it has its own FFT implementation. The UPC version does call into FFTW. I'll aim to set that up later today, or maybe tomorrow.

Ah, interesting.

I'm only looking at this as a background task so no rush to get the reference in for me.

Another point -- as far as I remember, the y-transform is just a call into FFTW. There's basically no Chapel in there. So I would have guessed that the backend shouldn't have mattered, unless FFTW is somehow taking a different path/plan in this second case.

I think FFTW itself gets compiled separately for each backend on crays (and maybe each possible target processor.) For --llvm we end up linking in gcc libraries since they'll be ABI compatible.

ronawho commented 4 years ago

I still see non-trivial slowdowns for using gcc/llvm compared to intel (and times are much worse when running serially):

Intel:

./ft_transposed_intel -nl 256 --NPBClass=NPB.E --runSerial

ytAvg=0.94(0.97), ztAvg=1.24(1.32), xtAvg=1.08(1.10)  :Checksum(1) =  5.121601045346e+02 + 5.117395998266e+02i
ytAvg=0.95(1.02), ztAvg=1.24(1.33), xtAvg=1.09(1.14)  :Checksum(2) =  5.120905403678e+02 + 5.118614716182e+02i
ytAvg=0.93(0.97), ztAvg=1.20(1.26), xtAvg=1.08(1.10)  :Checksum(3) =  5.120623229306e+02 + 5.119074203747e+02i
ytAvg=0.94(0.99), ztAvg=1.21(1.29), xtAvg=1.09(1.11)  :Checksum(4) =  5.120438418997e+02 + 5.119345900733e+02i
ytAvg=0.95(1.00), ztAvg=1.21(1.29), xtAvg=1.13(1.15)  :Checksum(5) =  5.120311521872e+02 + 5.119551325550e+02i
ytAvg=0.94(1.01), ztAvg=1.22(1.33), xtAvg=1.11(1.16)  :Checksum(6) =  5.120226088809e+02 + 5.119720179919e+02i
ytAvg=0.93(0.96), ztAvg=1.21(1.28), xtAvg=1.11(1.13)  :Checksum(7) =  5.120169296534e+02 + 5.119861371665e+02i
ytAvg=0.94(0.96), ztAvg=1.23(1.33), xtAvg=1.12(1.14)  :Checksum(8) =  5.120131225172e+02 + 5.119979364402e+02i
ytAvg=0.93(0.96), ztAvg=1.21(1.28), xtAvg=1.11(1.13)  :Checksum(9) =  5.120104767108e+02 + 5.120077674092e+02i
ytAvg=0.93(0.96), ztAvg=1.23(1.32), xtAvg=1.11(1.13)  :Checksum(10) =  5.120085127969e+02 + 5.120159443121e+02i
ytAvg=0.95(0.97), ztAvg=1.24(1.32), xtAvg=1.12(1.14)  :Checksum(11) =  5.120069224127e+02 + 5.120227453670e+02i
ytAvg=0.93(0.96), ztAvg=1.25(1.34), xtAvg=1.11(1.13)  :Checksum(12) =  5.120055158164e+02 + 5.120284096041e+02i
ytAvg=0.94(0.96), ztAvg=1.25(1.35), xtAvg=1.12(1.14)  :Checksum(13) =  5.120041820159e+02 + 5.120331373793e+02i
ytAvg=0.94(0.97), ztAvg=1.23(1.32), xtAvg=1.08(1.11)  :Checksum(14) =  5.120028605402e+02 + 5.120370938679e+02i
ytAvg=0.93(0.96), ztAvg=1.19(1.26), xtAvg=1.08(1.20)  :Checksum(15) =  5.120015223011e+02 + 5.120404138831e+02i
ytAvg=0.93(0.96), ztAvg=1.19(1.28), xtAvg=1.07(1.10)  :Checksum(16) =  5.120001570022e+02 + 5.120432068837e+02i
ytAvg=0.94(0.96), ztAvg=1.20(1.28), xtAvg=1.08(1.09)  :Checksum(17) =  5.119987650555e+02 + 5.120455615860e+02i
ytAvg=0.93(0.99), ztAvg=1.17(1.26), xtAvg=1.07(1.11)  :Checksum(18) =  5.119973525091e+02 + 5.120475499442e+02i
ytAvg=0.94(0.98), ztAvg=1.23(1.30), xtAvg=1.08(1.10)  :Checksum(19) =  5.119959279472e+02 + 5.120492304629e+02i
ytAvg=0.95(1.02), ztAvg=1.21(1.30), xtAvg=1.09(1.14)  :Checksum(20) =  5.119945006558e+02 + 5.120506508902e+02i
ytAvg=0.93(0.98), ztAvg=1.21(1.30), xtAvg=1.08(1.11)  :Checksum(21) =  5.119930795911e+02 + 5.120518503782e+02i
ytAvg=0.94(0.99), ztAvg=1.21(1.29), xtAvg=1.09(1.11)  :Checksum(22) =  5.119916728462e+02 + 5.120528612016e+02i
ytAvg=0.95(1.00), ztAvg=1.26(1.37), xtAvg=1.13(1.15)  :Checksum(23) =  5.119902874185e+02 + 5.120537101195e+02i
ytAvg=0.94(1.02), ztAvg=1.23(1.31), xtAvg=1.11(1.15)  :Checksum(24) =  5.119889291565e+02 + 5.120544194514e+02i
ytAvg=0.93(0.96), ztAvg=1.23(1.33), xtAvg=1.11(1.13)  :Checksum(25) =  5.119876028049e+02 + 5.120550079284e+02i
Successful completion of NPB FT Benchmark Class E
Elapsed time :    92.0165
MFLOPS : 852101.3003

Gnu:

./ft_transposed_gnu -nl 256 --NPBClass=NPB.E --runSerial

ytAvg=0.94(0.98), ztAvg=1.17(1.25), xtAvg=1.08(1.11)  :Checksum(1) =  5.121601045346e+02 + 5.117395998266e+02i
ytAvg=0.95(1.04), ztAvg=1.21(1.29), xtAvg=1.09(1.14)  :Checksum(2) =  5.120905403678e+02 + 5.118614716182e+02i
ytAvg=0.93(1.00), ztAvg=1.18(1.27), xtAvg=1.08(1.12)  :Checksum(3) =  5.120623229306e+02 + 5.119074203747e+02i
ytAvg=1.10(1.51), ztAvg=1.12(1.23), xtAvg=1.09(1.17)  :Checksum(4) =  5.120438418997e+02 + 5.119345900733e+02i
ytAvg=1.35(1.44), ztAvg=1.21(1.30), xtAvg=1.13(1.16)  :Checksum(5) =  5.120311521872e+02 + 5.119551325550e+02i
ytAvg=1.30(1.38), ztAvg=1.21(1.29), xtAvg=1.11(1.15)  :Checksum(6) =  5.120226088809e+02 + 5.119720179919e+02i
ytAvg=1.28(1.36), ztAvg=1.23(1.34), xtAvg=1.11(1.14)  :Checksum(7) =  5.120169296534e+02 + 5.119861371665e+02i
ytAvg=1.27(1.36), ztAvg=1.26(1.38), xtAvg=1.12(1.15)  :Checksum(8) =  5.120131225172e+02 + 5.119979364402e+02i
ytAvg=1.25(1.35), ztAvg=1.22(1.31), xtAvg=1.11(1.13)  :Checksum(9) =  5.120104767108e+02 + 5.120077674092e+02i
ytAvg=1.24(1.36), ztAvg=1.25(1.36), xtAvg=1.11(1.14)  :Checksum(10) =  5.120085127969e+02 + 5.120159443121e+02i
ytAvg=1.24(1.37), ztAvg=1.21(1.30), xtAvg=1.12(1.16)  :Checksum(11) =  5.120069224127e+02 + 5.120227453670e+02i
ytAvg=1.22(1.38), ztAvg=1.19(1.27), xtAvg=1.11(1.13)  :Checksum(12) =  5.120055158164e+02 + 5.120284096041e+02i
ytAvg=1.23(1.41), ztAvg=1.22(1.31), xtAvg=1.12(1.15)  :Checksum(13) =  5.120041820159e+02 + 5.120331373793e+02i
ytAvg=1.22(1.46), ztAvg=1.22(1.34), xtAvg=1.08(1.13)  :Checksum(14) =  5.120028605402e+02 + 5.120370938679e+02i
ytAvg=1.21(1.51), ztAvg=1.17(1.41), xtAvg=1.08(1.11)  :Checksum(15) =  5.120015223011e+02 + 5.120404138831e+02i
ytAvg=1.21(1.62), ztAvg=1.16(1.53), xtAvg=1.07(1.11)  :Checksum(16) =  5.120001570022e+02 + 5.120432068837e+02i
ytAvg=1.23(1.97), ztAvg=1.18(1.72), xtAvg=1.08(1.13)  :Checksum(17) =  5.119987650555e+02 + 5.120455615860e+02i
ytAvg=1.24(2.37), ztAvg=1.46(15.75), xtAvg=1.07(1.11)  :Checksum(18) =  5.119973525091e+02 + 5.120475499442e+02i
ytAvg=1.23(2.35), ztAvg=1.50(15.58), xtAvg=1.08(1.11)  :Checksum(19) =  5.119959279472e+02 + 5.120492304629e+02i
ytAvg=1.23(2.32), ztAvg=1.53(15.57), xtAvg=1.09(1.15)  :Checksum(20) =  5.119945006558e+02 + 5.120506508902e+02i
ytAvg=1.20(2.25), ztAvg=1.49(15.55), xtAvg=1.08(1.11)  :Checksum(21) =  5.119930795911e+02 + 5.120518503782e+02i
ytAvg=1.20(2.21), ztAvg=1.47(14.58), xtAvg=1.09(1.17)  :Checksum(22) =  5.119916728462e+02 + 5.120528612016e+02i
ytAvg=1.19(2.24), ztAvg=1.49(14.04), xtAvg=1.13(1.16)  :Checksum(23) =  5.119902874185e+02 + 5.120537101195e+02i
ytAvg=1.17(2.08), ztAvg=1.51(15.66), xtAvg=1.11(1.15)  :Checksum(24) =  5.119889291565e+02 + 5.120544194514e+02i
ytAvg=1.16(2.11), ztAvg=1.54(14.89), xtAvg=1.11(1.14)  :Checksum(25) =  5.119876028049e+02 + 5.120550079284e+02i
Successful completion of NPB FT Benchmark Class E
Elapsed time :   219.0316
MFLOPS : 357973.0537

Here's the current diff I'm using -- it prints average (max) y, z, and x timings:

```diff diff --git a/src/DistributedFFT.chpl b/src/DistributedFFT.chpl index 58780c1..c3c85ce 100644 --- a/src/DistributedFFT.chpl +++ b/src/DistributedFFT.chpl @@ -43,6 +43,7 @@ prototype module DistributedFFT { use FFT_Timers; require "npFFTW.h"; + config const runSerial = false; /* Compile time parameters for higher performance. @@ -272,7 +273,9 @@ prototype module DistributedFFT { if SrcDom.dim(2) != DstDom.dim(1) then halt("Mismatched y-x ranges"); if SrcDom.dim(3) != DstDom.dim(3) then halt("Mismatched z ranges"); - coforall loc in Locales do on loc { + var timer: [1..3][LocaleSpace] real; + coforall loc in Locales do on loc { serial runSerial { + var xt, yt, zt: Timer; var timeTrack = new TimeTracker(); const (xSrc, ySrc, zSrc) = SrcDom.localSubdomain().dims(); @@ -298,13 +301,16 @@ prototype module DistributedFFT { for ix in xSrc { // Y-transform timeTrack.start(); + yt.start(); forall (plan, myzRange) in yPlan.batch() { plan.execute(myplane[0, ySrc.first, myzRange.first]); } + yt.stop(); timeTrack.stop(TimeStages.Y); // Z-transform, offset to reduce comm congestion/collision timeTrack.start(); + zt.start(); forall iy in offset(ySrc) { zPlan.execute(myplane[0, iy, zSrc.first]); // Transpose data into Dst, and copy the next Src slice into myplane @@ -321,6 +327,7 @@ prototype module DistributedFFT { } } } + zt.stop(); timeTrack.stop(TimeStages.Z); } @@ -329,13 +336,29 @@ prototype module DistributedFFT { // X-transform timeTrack.start(); + xt.start(); forall (plan, myzRange) in xPlan.batch() { for iy in yDst { plan.execute(Dst[iy, xDst.first, myzRange.first]); } } + xt.stop(); timeTrack.stop(TimeStages.X); + // if here.id == 0 then writef("yt=%.2dr, zt=%.2dr, xt=%.2dr :", yt.elapsed(), zt.elapsed(), xt.elapsed()); + timer[1][here.id] = yt.elapsed(); + timer[2][here.id] = zt.elapsed(); + timer[3][here.id] = xt.elapsed(); + } } + var ytAvg = +reduce timer[1]/numLocales:real; + var ztAvg = +reduce timer[2]/numLocales:real; + var xtAvg = +reduce timer[3]/numLocales:real; + var ytMax = max reduce timer[1]; + var ztMax = max reduce timer[2]; + var xtMax = max reduce timer[3]; + + writef("ytAvg=%.2dr(%.2dr), ztAvg=%.2dr(%.2dr), xtAvg=%.2dr(%.2dr) :", ytAvg, ytMax, ztAvg,ztMax, xtAvg, xtMax); + } /* ```

Misc notes so I don't forget:

Using FFTW_NO_SIMD and disabling target specialization had no impact
For some reason running with CHPL_RT_NUM_THREADS_PER_LOCALE=1 segfaults (need to dig)
Running serially I see significantly worse timings (size E at 256 nodes -- 90s icc, 220s gcc)
Identical times with doFFT_Transposed comm removed, no longer convinced this is from fftw
If I build the runtime/3rd-party with gcc and then do final compilation with intel, times are bad
- If I then rebuild just the runtime, timings are good, suggesting the difference lies within the runtime, not fftw.

So maybe this is something in our ugni layer that is getting better optimized under intel. I need to dig. I have forced gcc to use CHPL_ATOMICS=intrinsics to match intel, but that didn't matter (neither did using cstdlib with intel.)

ronawho commented 4 years ago

I'll try to come up with a smaller/faster reproducer next. Using the full NPB-FT code has made this pretty slow.

ronawho commented 3 years ago

Was clearing out some old notes and wanted to try something here before deleting them.

I no longer see segfaults with CHPL_RT_NUM_THREADS_PER_LOCALE=1. Not exactly sure what changed there, but my notes say I thought it could have been from a misaligned GET, that would have been fixed by https://github.com/chapel-lang/chapel/pull/14652 or https://github.com/chapel-lang/chapel/pull/15111
cacheline aligning all allocations doesn't do anything (not really a surprise since we see the perf issues running serially so it's unlikely to be a false-sharing issue)
I still see the worse performance with gcc compared to intel. Below is my current debugging diff:

```diff diff --git a/test/npb/ft/npadmana/DistributedFFT.chpl b/test/npb/ft/npadmana/DistributedFFT.chpl index 0d427fb336..f9bac477f5 100644 --- a/test/npb/ft/npadmana/DistributedFFT.chpl +++ b/test/npb/ft/npadmana/DistributedFFT.chpl @@ -43,6 +43,7 @@ prototype module DistributedFFT { use FFT_Locks; use FFT_Timers; use SysCTypes, CPtr; + use Time; /* Compile time parameters for higher performance. @@ -264,7 +265,9 @@ prototype module DistributedFFT { signOrKind) { checkDims(SrcDom, DstDom); + var timer: [1..3][LocaleSpace] real; coforall loc in Locales do on loc { + var xt, yt, zt: Timer; var timeTrack = new TimeTracker(); const (xSrc, ySrc, zSrc) = SrcDom.localSubdomain().dims(); @@ -288,15 +291,20 @@ prototype module DistributedFFT { } for ix in xSrc { + allLocalesBarrier.barrier(); // Y-transform timeTrack.start(); + yt.start(); forall (plan, myzRange) in yPlan.batch() { plan.execute(myplane[0, ySrc.first, myzRange.first]); } + yt.stop(); timeTrack.stop(TimeStages.Y); + allLocalesBarrier.barrier(); // Z-transform, offset to reduce comm congestion/collision timeTrack.start(); + zt.start(); forall iy in offset(ySrc) { zPlan.execute(myplane[0, iy, zSrc.first]); // Transpose data into Dst, and copy the next Src slice into myplane @@ -313,6 +321,7 @@ prototype module DistributedFFT { } } } + zt.stop(); timeTrack.stop(TimeStages.Z); } @@ -321,13 +330,26 @@ prototype module DistributedFFT { // X-transform timeTrack.start(); + xt.start(); forall (plan, myzRange) in xPlan.batch() { for iy in yDst { plan.execute(Dst[iy, xDst.first, myzRange.first]); } } + xt.stop(); timeTrack.stop(TimeStages.X); + timer[1][here.id] = yt.elapsed(); + timer[2][here.id] = zt.elapsed(); + timer[3][here.id] = xt.elapsed(); } + var ytAvg = +reduce timer[1]/numLocales:real; + var ztAvg = +reduce timer[2]/numLocales:real; + var xtAvg = +reduce timer[3]/numLocales:real; + var ytMax = max reduce timer[1]; + var ztMax = max reduce timer[2]; + var xtMax = max reduce timer[3]; + + writef("ytAvg=%.2dr(%.2dr), ztAvg=%.2dr(%.2dr), xtAvg=%.2dr(%.2dr) :", ytAvg, ytMax, ztAvg,ztMax, xtAvg, xtMax); } /* ```

npadmana / DistributedFFT

Slowdown in YZ transform on PrgEnv-gcc #25