devitocodes / devito

DSL and compiler framework for automated finite-differences and stencil computation
http://www.devitoproject.org
MIT License

nvc+MPI issues with non-grid devito dimensions #2277

Closed: deckerla closed this issue 6 months ago

deckerla commented 7 months ago

Consider the following MFE, which simulates wave propagation and projection onto a frequency basis, followed by extraction of the projected values at receiver locations:

import devito
import numpy as np
shape = (51,51,51)
nfreq = 5
nrec = 6
space_order = 8
dt = 0.01
time_M = 200

grid = devito.Grid(shape=shape, extent=(1,1,1))

# Non-grid dimension carrying the nfreq projection frequencies
freq_dim = devito.DefaultDimension(name="freq_dim", default_value=nfreq)
f = devito.Function(name="frequencies", dimensions=(freq_dim,), shape=(nfreq,), grid=grid)
f.data[:] = np.arange(1,nfreq+1)

# Time-domain wavefield and its accumulated frequency-domain projection
p0   = devito.TimeFunction(name="p0", grid=grid, time_order=2, space_order=space_order)
p0re = devito.Function(name="p0re", grid=grid, dimensions=(*grid.dimensions, freq_dim), shape=(*shape, nfreq), space_order=space_order)

src = devito.SparseTimeFunction(name="src", grid=grid, nt=time_M, npoint=1, coordinates=np.array([0.5,0.5,0.5]))

# Subclass placing the sparse (receiver) dimension first instead of last,
# so that the non-grid freq_dim can trail it
class CoordSlowSparseFunction(devito.SparseFunction):
    _sparse_position = 0

recdim = devito.Dimension(name="recdim")
p0recre = CoordSlowSparseFunction(name="p0recre", grid=grid, dimensions=(recdim, freq_dim), shape=(nrec, nfreq), npoint=nrec, coordinates=np.array([0.4,0.4,0.4]))

# Wave propagation update, sine source injection, and on-the-fly projection
# onto the frequency basis, fused into a single operator
wp_eq = devito.Eq(p0.forward, -5/1000000*(p0.dx2 + p0.dy2 + p0.dz2) + p0.backward)
inj_eq = src.inject(field=p0.forward, expr=devito.sin(2*3.14159*f*grid.time_dim*dt)/100000)
proj_eq_re = devito.Eq(p0re, p0re + devito.sin(2*3.14159*f*grid.time_dim*dt)*p0)
wp_proj_op = devito.Operator([wp_eq, inj_eq, proj_eq_re], name="propagator")

wp_proj_op.apply(time_M=time_M)

# Sample the projected wavefield at the receiver locations
interp_eq_re = p0recre.interpolate(expr=p0re)

interp_op = devito.Operator([interp_eq_re], name="interpolator")

interp_op.apply()
print(p0recre.data)
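
To spell out what the MFE computes (my reading of the equations above): over the timestepping loop, proj_eq_re accumulates an on-the-fly sine transform of the wavefield,

p0re(x, y, z, k) = sum_t sin(2*pi*f_k*t*dt) * p0(t, x, y, z),

with f_k the entries of frequencies, after which the second operator samples p0re at the receiver coordinates. The serial and MPI runs should therefore agree on the final 6 x 5 p0recre array.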

Running this script without MPI, we observe the following behavior:

cvx@cbox-lukedecker-extfwimpi:~$ DEVITO_LOGGING=DEBUG python mfe.py
Allocating host memory for frequencies(5,) [20 B]
Allocating host memory for src_coords(1, 3) [12 B]
Allocating host memory for p0recre_coords(6, 3) [72 B]
Operator `propagator` generated in 0.88 s
  * lowering.Clusters: 0.39 s (44.5 %)
     * specializing.Clusters: 0.24 s (27.4 %)
  * lowering.IET: 0.23 s (26.3 %)
  * lowering.Expressions: 0.22 s (25.2 %)
Flops reduction after symbolic optimization: [152 --> 54]
Allocating host memory for p0(3, 67, 67, 67) [3 MB]
Allocating host memory for p0re(67, 67, 67, 5) [6 MB]
Operator `propagator` fetched `/tmp/devito-jitcache-uid1000/94c579b990e202a56a5c59b81fe83ce007d2aff6.cpp` in 0.11 s from jit-cache
Operator `propagator` ran in 0.10 s
Global performance: [OI=0.55, 14.07 GFlops/s, 0.27 GPts/s]
Local performance:
  * multipass0<> ran in 0.09 s [OI=0.55, 16.70 GFlops/s, 0.32 GPts/s]
    + section0 ran in 0.03 s [24.62%] 
    + section1 ran in 0.07 s [75.36%] 
  * section2<<200,1,5,2,2,2>,<200,1,5,2,2,2>> ran in 0.01 s [OI=12.48, 0.13 GFlops/s, 0.01 GPts/s]
Performance[mode=advanced] arguments: {'nthreads': 48, 'nthreads_nonaffine': 48, 'x0_blk0_size': 8, 'y0_blk0_size': 8}
Operator `interpolator` generated in 0.30 s
  * lowering.Clusters: 0.16 s (53.4 %)
     * specializing.Clusters: 0.12 s (40.1 %)
        * cire: 0.08 s (26.7 %)
  * lowering.IET: 0.09 s (30.1 %)
     * specializing.IET: 0.07 s (23.4 %)
Flops reduction after symbolic optimization: [36 --> 21]
Allocating host memory for p0recre(6, 5) [120 B]
Operator `interpolator` fetched `/tmp/devito-jitcache-uid1000/3129fe9dfcd34d4a03a2a7eaab3ade05ff9aa4be.cpp` in 0.02 s from jit-cache
Operator `interpolator` ran in 0.01 s
Global performance: [OI=3.05, 0.01 GFlops/s]
Local performance:
  * section0<<6,5>,<6,5,2,2,2>> ran in 0.01 s [OI=3.05, 0.01 GFlops/s, 0.00 GPts/s]
Performance[mode=advanced] arguments: {'nthreads_nonaffine': 48}
[[70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]
 [70.95639 96.57933 94.03625 83.82484 73.3117 ]]

All six rows are identical because every receiver was assigned the same coordinates, (0.4, 0.4, 0.4). Below is the generated C code for the propagator:

#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;
#define MIN(a,b) (((a) < (b)) ? (a) : (b))

#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "omp.h"

struct dataobj
{
  void *restrict data;
  unsigned long * size;
  unsigned long * npsize;
  unsigned long * dsize;
  int * hsize;
  int * hofs;
  int * oofs;
  void * dmap;
} ;

struct profiler
{
  double multipass0;
  double section2;
  double section0;
  double section1;
} ;

extern "C" int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, const int nthreads, const int nthreads_nonaffine, struct profiler * timers);

int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, const int nthreads, const int nthreads_nonaffine, struct profiler * timers)
{
  float *restrict r1_vec __attribute__ ((aligned (64)));
  posix_memalign((void**)(&r1_vec),64,5*sizeof(float));

  float (*restrict frequencies) __attribute__ ((aligned (64))) = (float (*)) frequencies_vec->data;
  float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;
  float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
  float (*restrict r1) __attribute__ ((aligned (64))) = (float (*)) r1_vec;
  float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_coords_vec->size[1]]) src_coords_vec->data;

  /* Flush denormal numbers to zero in hardware */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

  float r2 = 1.0F/(h_x*h_x);
  float r3 = 1.0F/(h_y*h_y);
  float r4 = 1.0F/(h_z*h_z);

  for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))
  {
    START(multipass0)
    START(section0)
    #pragma omp parallel num_threads(nthreads)
    {
      #pragma omp for schedule(static,1)
      for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
      {
        r1[freq_dim] = sin(6.28318e-2F*time*frequencies[freq_dim]);
      }
    }
    STOP(section0,timers)

    START(section1)
    #pragma omp parallel num_threads(nthreads)
    {
      #pragma omp for collapse(2) schedule(dynamic,1)
      for (int x0_blk0 = x_m; x0_blk0 <= x_M; x0_blk0 += x0_blk0_size)
      {
        for (int y0_blk0 = y_m; y0_blk0 <= y_M; y0_blk0 += y0_blk0_size)
        {
          for (int x = x0_blk0; x <= MIN(x_M, x0_blk0 + x0_blk0_size - 1); x += 1)
          {
            for (int y = y0_blk0; y <= MIN(y_M, y0_blk0 + y0_blk0_size - 1); y += 1)
            {
              for (int z = z_m; z <= z_M; z += 1)
              {
                float r5 = 1.42361111100763e-5F*p0[t0][x + 8][y + 8][z + 8];
                p0[t2][x + 8][y + 8][z + 8] = r2*(r5 + 8.9285714284415e-9F*(p0[t0][x + 4][y + 8][z + 8] + p0[t0][x + 12][y + 8][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 5][y + 8][z + 8] + p0[t0][x + 11][y + 8][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 6][y + 8][z + 8] + p0[t0][x + 10][y + 8][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 7][y + 8][z + 8] + p0[t0][x + 9][y + 8][z + 8])) + r3*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 4][z + 8] + p0[t0][x + 8][y + 12][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 5][z + 8] + p0[t0][x + 8][y + 11][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 6][z + 8] + p0[t0][x + 8][y + 10][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 7][z + 8] + p0[t0][x + 8][y + 9][z + 8])) + r4*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 8][z + 4] + p0[t0][x + 8][y + 8][z + 12]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 8][z + 5] + p0[t0][x + 8][y + 8][z + 11]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 8][z + 6] + p0[t0][x + 8][y + 8][z + 10]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 8][z + 7] + p0[t0][x + 8][y + 8][z + 9])) + p0[t1][x + 8][y + 8][z + 8];

                #pragma omp simd aligned(p0,p0re:64)
                for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
                {
                  p0re[x + 8][y + 8][z + 8][freq_dim] = r1[freq_dim]*p0[t0][x + 8][y + 8][z + 8] + p0re[x + 8][y + 8][z + 8][freq_dim];
                }
              }
            }
          }
        }
      }
    }
    STOP(section1,timers)
    STOP(multipass0,timers)

    START(section2)
    #pragma omp parallel num_threads(nthreads_nonaffine)
    {
      int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(p_src_M - p_src_m + 1)/nthreads_nonaffine));
      #pragma omp for schedule(dynamic,chunk_size)
      for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)
      {
        for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
        {
          for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)
          {
            for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)
            {
              for (int rsrcz = 0; rsrcz <= 1; rsrcz += 1)
              {
                int posx = (int)(floor((-o_x + src_coords[p_src][0])/h_x));
                int posy = (int)(floor((-o_y + src_coords[p_src][1])/h_y));
                int posz = (int)(floor((-o_z + src_coords[p_src][2])/h_z));
                float px = -floor((-o_x + src_coords[p_src][0])/h_x) + (-o_x + src_coords[p_src][0])/h_x;
                float py = -floor((-o_y + src_coords[p_src][1])/h_y) + (-o_y + src_coords[p_src][1])/h_y;
                float pz = -floor((-o_z + src_coords[p_src][2])/h_z) + (-o_z + src_coords[p_src][2])/h_z;
                if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcz + posz >= z_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1 && rsrcz + posz <= z_M + 1)
                {
                  float r0 = (1.0F/100000.0F)*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*(rsrcz*pz + (1 - rsrcz)*(1 - pz))*sin(6.28318e-2F*time*frequencies[freq_dim]);
                  #pragma omp atomic update
                  p0[t2][rsrcx + posx + 8][rsrcy + posy + 8][rsrcz + posz + 8] += r0;
                }
              }
            }
          }
        }
      }
    }
    STOP(section2,timers)
  }

  free(r1_vec);

  return 0;
}

And the interpolator:

#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;

#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "omp.h"

struct dataobj
{
  void *restrict data;
  unsigned long * size;
  unsigned long * npsize;
  unsigned long * dsize;
  int * hsize;
  int * hofs;
  int * oofs;
  void * dmap;
} ;

struct profiler
{
  double section0;
} ;

extern "C" int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers);

int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers)
{
  float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
  float (*restrict p0recre)[p0recre_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_vec->size[1]]) p0recre_vec->data;
  float (*restrict p0recre_coords)[p0recre_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_coords_vec->size[1]]) p0recre_coords_vec->data;

  /* Flush denormal numbers to zero in hardware */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

  float r0 = 1.0F/h_x;
  float r1 = 1.0F/h_y;
  float r2 = 1.0F/h_z;

  START(section0)
  #pragma omp parallel num_threads(nthreads_nonaffine)
  {
    int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(recdim_M - recdim_m + 1)/nthreads_nonaffine));
    #pragma omp for schedule(dynamic,chunk_size)
    for (int recdim = recdim_m; recdim <= recdim_M; recdim += 1)
    {
      for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
      {
        float r3 = floor(r0*(-o_x + p0recre_coords[recdim][0]));
        int posx = (int)r3;
        float r4 = floor(r1*(-o_y + p0recre_coords[recdim][1]));
        int posy = (int)r4;
        float r5 = floor(r2*(-o_z + p0recre_coords[recdim][2]));
        int posz = (int)r5;
        float px = r0*(-o_x + p0recre_coords[recdim][0]) - r3;
        float py = r1*(-o_y + p0recre_coords[recdim][1]) - r4;
        float pz = r2*(-o_z + p0recre_coords[recdim][2]) - r5;
        float sum = 0.0F;

        for (int rp0recrex = 0; rp0recrex <= 1; rp0recrex += 1)
        {
          for (int rp0recrey = 0; rp0recrey <= 1; rp0recrey += 1)
          {
            for (int rp0recrez = 0; rp0recrez <= 1; rp0recrez += 1)
            {
              if (rp0recrex + posx >= x_m - 1 && rp0recrey + posy >= y_m - 1 && rp0recrez + posz >= z_m - 1 && rp0recrex + posx <= x_M + 1 && rp0recrey + posy <= y_M + 1 && rp0recrez + posz <= z_M + 1)
              {
                sum += (rp0recrex*px + (1 - rp0recrex)*(1 - px))*(rp0recrey*py + (1 - rp0recrey)*(1 - py))*(rp0recrez*pz + (1 - rp0recrez)*(1 - pz))*p0re[rp0recrex + posx + 8][rp0recrey + posy + 8][rp0recrez + posz + 8][freq_dim];
              }
            }
          }
        }

        p0recre[recdim][freq_dim] = sum;
      }
    }
  }
  STOP(section0,timers)

  return 0;
}

Now, when we run the same code with MPI, we get very different results:

cvx@cbox-lukedecker-extfwimpi:~$ DEVITO_LOGGING=DEBUG  DEVITO_MPI=1 OMP_NUM_THREADS=22 mpirun -n 2 -display-map  -mca btl vader,self -map-by numa:PE=24 python mfe.py 
libibverbs: Warning: couldn't open config directory '/usr/local/etc/libibverbs.d'.
 Data for JOB [51184,1] offset 0 Total slots allocated 48

 ========================   JOB MAP   ========================

 Data for node: cbox-lukedecker-extfwimpi       Num slots: 48   Max slots: 0    Num procs: 2
        Process OMPI jobid: [51184,1] App: 0 Process rank: 0 Bound: socket 0[core 0[hwt 0]], socket 0[core 1[hwt 0]], socket 0[core 2[hwt 0]], socket 0[core 3[hwt 0]], socket 0[core 4[hwt 0]], socket 0[core 5[hwt 0]], socket 0[core 6[hwt 0]], socket 0[core 7[hwt 0]], socket 0[core 8[hwt 0]], socket 0[core 9[hwt 0]], socket 0[core 10[hwt 0]], socket 0[core 11[hwt 0]], socket 0[core 12[hwt 0]], socket 0[core 13[hwt 0]], socket 0[core 14[hwt 0]], socket 0[core 15[hwt 0]], socket 0[core 16[hwt 0]], socket 0[core 17[hwt 0]], socket 0[core 18[hwt 0]], socket 0[core 19[hwt 0]], socket 0[core 20[hwt 0]], socket 0[core 21[hwt 0]], socket 0[core 22[hwt 0]], socket 0[core 23[hwt 0]]:[B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B][./././././././././././././././././././././././.]
        Process OMPI jobid: [51184,1] App: 0 Process rank: 1 Bound: socket 1[core 24[hwt 0]], socket 1[core 25[hwt 0]], socket 1[core 26[hwt 0]], socket 1[core 27[hwt 0]], socket 1[core 28[hwt 0]], socket 1[core 29[hwt 0]], socket 1[core 30[hwt 0]], socket 1[core 31[hwt 0]], socket 1[core 32[hwt 0]], socket 1[core 33[hwt 0]], socket 1[core 34[hwt 0]], socket 1[core 35[hwt 0]], socket 1[core 36[hwt 0]], socket 1[core 37[hwt 0]], socket 1[core 38[hwt 0]], socket 1[core 39[hwt 0]], socket 1[core 40[hwt 0]], socket 1[core 41[hwt 0]], socket 1[core 42[hwt 0]], socket 1[core 43[hwt 0]], socket 1[core 44[hwt 0]], socket 1[core 45[hwt 0]], socket 1[core 46[hwt 0]], socket 1[core 47[hwt 0]]:[./././././././././././././././././././././././.][B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B/B]

 =============================================================
libibverbs: Warning: couldn't open config directory '/usr/local/etc/libibverbs.d'.
libibverbs: Warning: couldn't open config directory '/usr/local/etc/libibverbs.d'.
Allocating host memory for frequencies(5,) [20 B]
Allocating host memory for frequencies(5,) [20 B]
Allocating host memory for src_coords(0, 3) [0 B]
Allocating host memory for src_coords(1, 3) [12 B]
Allocating host memory for p0recre_coords(6, 3) [72 B]
Allocating host memory for p0recre_coords(6, 3) [72 B]
Operator `propagator` generated in 1.26 s
  * lowering.IET: 0.61 s (48.7 %)
     * specializing.IET: 0.51 s (40.8 %)
        * make_mpi: 0.28 s (22.4 %)
  * lowering.Clusters: 0.39 s (31.2 %)
Flops reduction after symbolic optimization: [152 --> 54]
Allocating host memory for p0(3, 41, 67, 67) [2 MB]
Operator `propagator` generated in 1.26 s
  * lowering.IET: 0.61 s (48.5 %)
     * specializing.IET: 0.51 s (40.6 %)
        * make_mpi: 0.28 s (22.3 %)
  * lowering.Clusters: 0.39 s (31.0 %)
Flops reduction after symbolic optimization: [152 --> 54]
Allocating host memory for p0(3, 42, 67, 67) [2 MB]
Allocating host memory for p0re(42, 67, 67, 5) [4 MB]
Allocating host memory for p0re(41, 67, 67, 5) [4 MB]
p_src_m=0 and p_src_M=-1 might cause no iterations along Dimension p_src
Operator `propagator` fetched `/tmp/devito-jitcache-uid1000/a653349b99b6968e90b8c82ca727c2089fdab675.cpp` in 0.13 s from jit-cache
Operator `propagator` fetched `/tmp/devito-jitcache-uid1000/a653349b99b6968e90b8c82ca727c2089fdab675.cpp` in 0.21 s from jit-cache
Operator `propagator` ran in 0.05 s
Operator `propagator` ran in 0.05 s
Global performance: [OI=0.52, 28.14 GFlops/s, 0.54 GPts/s]
Global performance: [OI=0.52, 28.14 GFlops/s, 0.54 GPts/s]
Local performance:
Local performance:
  * multipass0[rank0]<> ran in 0.04 s [OI=0.52, 18.29 GFlops/s, 0.35 GPts/s]
  * multipass0[rank0]<> ran in 0.04 s [OI=0.52, 18.29 GFlops/s, 0.35 GPts/s]
    + section0 ran in 0.01 s [9.28%] 
    + section0 ran in 0.01 s [9.28%] 
    + section1 ran in 0.04 s [90.55%] 
    + section1 ran in 0.04 s [90.55%] 
  * multipass0[rank1]<> ran in 0.04 s [OI=0.52, 17.60 GFlops/s, 0.34 GPts/s]
  * multipass0[rank1]<> ran in 0.04 s [OI=0.52, 17.60 GFlops/s, 0.34 GPts/s]
    + section0 ran in 0.01 s [9.29%] 
    + section0 ran in 0.01 s [9.29%] 
    + section1 ran in 0.04 s [90.65%] 
    + section1 ran in 0.04 s [90.65%] 
  * section2[rank1]<<200,1,5,2,2,2>,<200,1,5,2,2,2>> ran in 0.01 s [OI=12.48, 0.69 GFlops/s, 0.01 GPts/s]
  * section2[rank1]<<200,1,5,2,2,2>,<200,1,5,2,2,2>> ran in 0.01 s [OI=12.48, 0.69 GFlops/s, 0.01 GPts/s]
Performance[mode=advanced] arguments: {'nthreads': 22, 'nthreads_nonaffine': 22, 'x0_blk0_size': 8, 'y0_blk0_size': 8}
Performance[mode=advanced] arguments: {'nthreads': 22, 'nthreads_nonaffine': 22, 'x0_blk0_size': 8, 'y0_blk0_size': 8}
Operator `interpolator` generated in 0.27 s
  * lowering.Clusters: 0.11 s (40.8 %)
     * specializing.Clusters: 0.08 s (29.7 %)
  * lowering.IET: 0.11 s (40.8 %)
     * specializing.IET: 0.08 s (29.7 %)
Flops reduction after symbolic optimization: [36 --> 21]
Operator `interpolator` generated in 0.27 s
  * lowering.Clusters: 0.11 s (40.8 %)
     * specializing.Clusters: 0.08 s (29.7 %)
  * lowering.IET: 0.11 s (40.8 %)
     * specializing.IET: 0.08 s (29.7 %)
Flops reduction after symbolic optimization: [36 --> 21]
Allocating host memory for p0recre(6, 5) [120 B]
Allocating host memory for p0recre(6, 5) [120 B]
recdim_m=0 and recdim_M=-1 might cause no iterations along Dimension recdim
recdim_m=0 and recdim_M=-1 might cause no iterations along Dimension recdim
Operator `interpolator` fetched `/tmp/devito-jitcache-uid1000/c3edbe514ed93e1118c8a9b2ee38bcf004de8df5.cpp` in 0.02 s from jit-cache
Operator `interpolator` fetched `/tmp/devito-jitcache-uid1000/c3edbe514ed93e1118c8a9b2ee38bcf004de8df5.cpp` in 0.02 s from jit-cache
Operator `interpolator` ran in 0.01 s
Operator `interpolator` ran in 0.01 s
Global performance: [OI=3.05, 0.01 GFlops/s]
Global performance: [OI=3.05, 0.01 GFlops/s]
Local performance:
Local performance:
  * section1[rank0]<<12,5>,<12,5,2,2,2>> ran in 0.01 s [OI=3.05, 0.04 GFlops/s, 0.00 GPts/s]
  * section1[rank0]<<12,5>,<12,5,2,2,2>> ran in 0.01 s [OI=3.05, 0.04 GFlops/s, 0.00 GPts/s]
Performance[mode=advanced] arguments: {'nthreads_nonaffine': 22}
Performance[mode=advanced] arguments: {'nthreads_nonaffine': 22}
[[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]]
[[7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [7.8738990e-14 1.2676886e-13 1.4294368e-13 1.4095608e-13 1.3165730e-13]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]]
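
Note the recdim_m=0 and recdim_M=-1 warnings above: the receiver dimension apparently receives an empty iteration range, rows 4-6 of p0recre are never written, and rows 1-3 hold values around 1e-13 instead of the serial values around 70-97. To narrow down whether the wavefield itself or only the sparse interpolation diverges under MPI, one can compare norms between the two runs; a minimal sketch, assuming devito.norm (which reduces over all ranks for dense functions), appended to mfe.py after the apply calls:

# If norm(p0) and norm(p0re) match the serial run but p0recre does not,
# the defect is confined to the sparse interpolation path.
print("norm(p0)   =", devito.norm(p0))
print("norm(p0re) =", devito.norm(p0re))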

Here is the generated MPI propagator code:

#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;
#define MIN(a,b) (((a) < (b)) ? (a) : (b))

#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "mpi.h"
#include "omp.h"

struct dataobj
{
  void *restrict data;
  unsigned long * size;
  unsigned long * npsize;
  unsigned long * dsize;
  int * hsize;
  int * hofs;
  int * oofs;
  void * dmap;
} ;

struct neighborhood
{
  int lll, llc, llr, lcl, lcc, lcr, lrl, lrc, lrr;
  int cll, clc, clr, ccl, ccc, ccr, crl, crc, crr;
  int rll, rlc, rlr, rcl, rcc, rcr, rrl, rrc, rrr;
} ;

struct profiler
{
  double multipass0;
  double section2;
  double section0;
  double section1;
} ;

extern "C" int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, MPI_Comm comm, struct neighborhood * nb, const int nthreads, const int nthreads_nonaffine, struct profiler * timers);

static void sendrecv0(struct dataobj *restrict p0_vec, const int x_size, const int y_size, const int z_size, int ogtime, int ogx, int ogy, int ogz, int ostime, int osx, int osy, int osz, int fromrank, int torank, MPI_Comm comm, const int nthreads);
static void haloupdate0(struct dataobj *restrict p0_vec, MPI_Comm comm, struct neighborhood * nb, int otime, const int nthreads);
static void gather0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads);
static void scatter0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads);

int propagator(struct dataobj *restrict frequencies_vec, struct dataobj *restrict p0_vec, struct dataobj *restrict p0re_vec, struct dataobj *restrict src_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int p_src_M, const int p_src_m, const int time_M, const int time_m, const int x0_blk0_size, const int y0_blk0_size, MPI_Comm comm, struct neighborhood * nb, const int nthreads, const int nthreads_nonaffine, struct profiler * timers)
{
  float *restrict r1_vec __attribute__ ((aligned (64)));
  posix_memalign((void**)(&r1_vec),64,5*sizeof(float));

  float (*restrict frequencies) __attribute__ ((aligned (64))) = (float (*)) frequencies_vec->data;
  float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;
  float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
  float (*restrict r1) __attribute__ ((aligned (64))) = (float (*)) r1_vec;
  float (*restrict src_coords)[src_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[src_coords_vec->size[1]]) src_coords_vec->data;

  /* Flush denormal numbers to zero in hardware */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

  float r2 = 1.0F/(h_x*h_x);
  float r3 = 1.0F/(h_y*h_y);
  float r4 = 1.0F/(h_z*h_z);

  for (int time = time_m, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3); time <= time_M; time += 1, t0 = (time)%(3), t1 = (time + 2)%(3), t2 = (time + 1)%(3))
  {
    START(multipass0)
    START(section0)
    #pragma omp parallel num_threads(nthreads)
    {
      #pragma omp for schedule(static,1)
      for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
      {
        r1[freq_dim] = sin(6.28318e-2F*time*frequencies[freq_dim]);
      }
    }
    STOP(section0,timers)

    START(section1)
    haloupdate0(p0_vec,comm,nb,t0,nthreads);
    #pragma omp parallel num_threads(nthreads)
    {
      #pragma omp for collapse(2) schedule(dynamic,1)
      for (int x0_blk0 = x_m; x0_blk0 <= x_M; x0_blk0 += x0_blk0_size)
      {
        for (int y0_blk0 = y_m; y0_blk0 <= y_M; y0_blk0 += y0_blk0_size)
        {
          for (int x = x0_blk0; x <= MIN(x_M, x0_blk0 + x0_blk0_size - 1); x += 1)
          {
            for (int y = y0_blk0; y <= MIN(y_M, y0_blk0 + y0_blk0_size - 1); y += 1)
            {
              for (int z = z_m; z <= z_M; z += 1)
              {
                float r5 = 1.42361111100763e-5F*p0[t0][x + 8][y + 8][z + 8];
                p0[t2][x + 8][y + 8][z + 8] = r2*(r5 + 8.9285714284415e-9F*(p0[t0][x + 4][y + 8][z + 8] + p0[t0][x + 12][y + 8][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 5][y + 8][z + 8] + p0[t0][x + 11][y + 8][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 6][y + 8][z + 8] + p0[t0][x + 10][y + 8][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 7][y + 8][z + 8] + p0[t0][x + 9][y + 8][z + 8])) + r3*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 4][z + 8] + p0[t0][x + 8][y + 12][z + 8]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 5][z + 8] + p0[t0][x + 8][y + 11][z + 8]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 6][z + 8] + p0[t0][x + 8][y + 10][z + 8]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 7][z + 8] + p0[t0][x + 8][y + 9][z + 8])) + r4*(r5 + 8.9285714284415e-9F*(p0[t0][x + 8][y + 8][z + 4] + p0[t0][x + 8][y + 8][z + 12]) + (-1.26984126982279e-7F)*(p0[t0][x + 8][y + 8][z + 5] + p0[t0][x + 8][y + 8][z + 11]) + 1.00000000005821e-6F*(p0[t0][x + 8][y + 8][z + 6] + p0[t0][x + 8][y + 8][z + 10]) + (-8.00000000046566e-6F)*(p0[t0][x + 8][y + 8][z + 7] + p0[t0][x + 8][y + 8][z + 9])) + p0[t1][x + 8][y + 8][z + 8];

                #pragma omp simd aligned(p0,p0re:64)
                for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
                {
                  p0re[x + 8][y + 8][z + 8][freq_dim] = r1[freq_dim]*p0[t0][x + 8][y + 8][z + 8] + p0re[x + 8][y + 8][z + 8][freq_dim];
                }
              }
            }
          }
        }
      }
    }
    STOP(section1,timers)
    STOP(multipass0,timers)

    START(section2)
    #pragma omp parallel num_threads(nthreads_nonaffine)
    {
      int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(p_src_M - p_src_m + 1)/nthreads_nonaffine));
      #pragma omp for schedule(dynamic,chunk_size)
      for (int p_src = p_src_m; p_src <= p_src_M; p_src += 1)
      {
        for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
        {
          for (int rsrcx = 0; rsrcx <= 1; rsrcx += 1)
          {
            for (int rsrcy = 0; rsrcy <= 1; rsrcy += 1)
            {
              for (int rsrcz = 0; rsrcz <= 1; rsrcz += 1)
              {
                int posx = (int)(floor((-o_x + src_coords[p_src][0])/h_x));
                int posy = (int)(floor((-o_y + src_coords[p_src][1])/h_y));
                int posz = (int)(floor((-o_z + src_coords[p_src][2])/h_z));
                float px = -floor((-o_x + src_coords[p_src][0])/h_x) + (-o_x + src_coords[p_src][0])/h_x;
                float py = -floor((-o_y + src_coords[p_src][1])/h_y) + (-o_y + src_coords[p_src][1])/h_y;
                float pz = -floor((-o_z + src_coords[p_src][2])/h_z) + (-o_z + src_coords[p_src][2])/h_z;
                if (rsrcx + posx >= x_m - 1 && rsrcy + posy >= y_m - 1 && rsrcz + posz >= z_m - 1 && rsrcx + posx <= x_M + 1 && rsrcy + posy <= y_M + 1 && rsrcz + posz <= z_M + 1)
                {
                  float r0 = (1.0F/100000.0F)*(rsrcx*px + (1 - rsrcx)*(1 - px))*(rsrcy*py + (1 - rsrcy)*(1 - py))*(rsrcz*pz + (1 - rsrcz)*(1 - pz))*sin(6.28318e-2F*time*frequencies[freq_dim]);
                  #pragma omp atomic update
                  p0[t2][rsrcx + posx + 8][rsrcy + posy + 8][rsrcz + posz + 8] += r0;
                }
              }
            }
          }
        }
      }
    }
    STOP(section2,timers)
  }

  free(r1_vec);

  return 0;
}

static void sendrecv0(struct dataobj *restrict p0_vec, const int x_size, const int y_size, const int z_size, int ogtime, int ogx, int ogy, int ogz, int ostime, int osx, int osy, int osz, int fromrank, int torank, MPI_Comm comm, const int nthreads)
{
  float *restrict bufg_vec __attribute__ ((aligned (64)));
  posix_memalign((void**)(&bufg_vec),64,x_size*y_size*z_size*sizeof(float));
  float *restrict bufs_vec __attribute__ ((aligned (64)));
  posix_memalign((void**)(&bufs_vec),64,x_size*y_size*z_size*sizeof(float));

  MPI_Request rrecv;
  MPI_Request rsend;

  MPI_Irecv(bufs_vec,x_size*y_size*z_size,MPI_FLOAT,fromrank,13,comm,&(rrecv));
  if (torank != MPI_PROC_NULL)
  {
    gather0(bufg_vec,x_size,y_size,z_size,p0_vec,ogtime,ogx,ogy,ogz,nthreads);
  }
  MPI_Isend(bufg_vec,x_size*y_size*z_size,MPI_FLOAT,torank,13,comm,&(rsend));
  MPI_Wait(&(rsend),MPI_STATUS_IGNORE);
  MPI_Wait(&(rrecv),MPI_STATUS_IGNORE);
  if (fromrank != MPI_PROC_NULL)
  {
    scatter0(bufs_vec,x_size,y_size,z_size,p0_vec,ostime,osx,osy,osz,nthreads);
  }

  free(bufg_vec);
  free(bufs_vec);
}

static void haloupdate0(struct dataobj *restrict p0_vec, MPI_Comm comm, struct neighborhood * nb, int otime, const int nthreads)
{
  sendrecv0(p0_vec,p0_vec->hsize[3],p0_vec->npsize[2],p0_vec->npsize[3],otime,p0_vec->oofs[2],p0_vec->hofs[4],p0_vec->hofs[6],otime,p0_vec->hofs[3],p0_vec->hofs[4],p0_vec->hofs[6],nb->rcc,nb->lcc,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->hsize[2],p0_vec->npsize[2],p0_vec->npsize[3],otime,p0_vec->oofs[3],p0_vec->hofs[4],p0_vec->hofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[6],nb->lcc,nb->rcc,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->hsize[5],p0_vec->npsize[3],otime,p0_vec->hofs[2],p0_vec->oofs[4],p0_vec->hofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[5],p0_vec->hofs[6],nb->crc,nb->clc,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->hsize[4],p0_vec->npsize[3],otime,p0_vec->hofs[2],p0_vec->oofs[5],p0_vec->hofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[6],nb->clc,nb->crc,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->npsize[2],p0_vec->hsize[7],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->oofs[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[7],nb->ccr,nb->ccl,comm,nthreads);
  sendrecv0(p0_vec,p0_vec->npsize[1],p0_vec->npsize[2],p0_vec->hsize[6],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->oofs[7],otime,p0_vec->hofs[2],p0_vec->hofs[4],p0_vec->hofs[6],nb->ccl,nb->ccr,comm,nthreads);
}

static void gather0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads)
{
  float (*restrict buf)[bx_size][by_size][bz_size] __attribute__ ((aligned (64))) = (float (*)[bx_size][by_size][bz_size]) buf_vec;
  float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;

  const int x_m = 0;
  const int y_m = 0;
  const int z_m = 0;
  const int x_M = bx_size - 1;
  const int y_M = by_size - 1;
  const int z_M = bz_size - 1;
  #pragma omp parallel num_threads(nthreads)
  {
    #pragma omp for collapse(2) schedule(static,1)
    for (int x = x_m; x <= x_M; x += 1)
    {
      for (int y = y_m; y <= y_M; y += 1)
      {
        #pragma omp simd aligned(p0:64)
        for (int z = z_m; z <= z_M; z += 1)
        {
          buf[0][x][y][z] = p0[otime][x + ox][y + oy][z + oz];
        }
      }
    }
  }
}

static void scatter0(float *restrict buf_vec, int bx_size, int by_size, int bz_size, struct dataobj *restrict p0_vec, const int otime, const int ox, const int oy, const int oz, const int nthreads)
{
  float (*restrict buf)[bx_size][by_size][bz_size] __attribute__ ((aligned (64))) = (float (*)[bx_size][by_size][bz_size]) buf_vec;
  float (*restrict p0)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0_vec->size[1]][p0_vec->size[2]][p0_vec->size[3]]) p0_vec->data;

  const int x_m = 0;
  const int y_m = 0;
  const int z_m = 0;
  const int x_M = bx_size - 1;
  const int y_M = by_size - 1;
  const int z_M = bz_size - 1;
  #pragma omp parallel num_threads(nthreads)
  {
    #pragma omp for collapse(2) schedule(static,1)
    for (int x = x_m; x <= x_M; x += 1)
    {
      for (int y = y_m; y <= y_M; y += 1)
      {
        #pragma omp simd aligned(p0:64)
        for (int z = z_m; z <= z_M; z += 1)
        {
          p0[otime][x + ox][y + oy][z + oz] = buf[0][x][y][z];
        }
      }
    }
  }
}

And the generated MPI interpolator code:

#define _POSIX_C_SOURCE 200809L
#define START(S) struct timeval start_ ## S , end_ ## S ; gettimeofday(&start_ ## S , NULL);
#define STOP(S,T) gettimeofday(&end_ ## S, NULL); T->S += (double)(end_ ## S .tv_sec-start_ ## S.tv_sec)+(double)(end_ ## S .tv_usec-start_ ## S .tv_usec)/1000000;

#include "stdlib.h"
#include "math.h"
#include "sys/time.h"
#include "xmmintrin.h"
#include "pmmintrin.h"
#include "mpi.h"
#include "omp.h"

struct dataobj
{
  void *restrict data;
  unsigned long * size;
  unsigned long * npsize;
  unsigned long * dsize;
  int * hsize;
  int * hofs;
  int * oofs;
  void * dmap;
} ;

struct profiler
{
  double section0;
  double section1;
} ;

extern "C" int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers);

int interpolator(struct dataobj *restrict p0re_vec, struct dataobj *restrict p0recre_vec, struct dataobj *restrict p0recre_coords_vec, const int x_M, const int x_m, const int y_M, const int y_m, const int z_M, const int z_m, const int freq_dim_M, const int freq_dim_m, const float h_x, const float h_y, const float h_z, const float o_x, const float o_y, const float o_z, const int recdim_M, const int recdim_m, const int nthreads_nonaffine, struct profiler * timers)
{
  float (*restrict p0re)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]] __attribute__ ((aligned (64))) = (float (*)[p0re_vec->size[1]][p0re_vec->size[2]][p0re_vec->size[3]]) p0re_vec->data;
  float (*restrict p0recre)[p0recre_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_vec->size[1]]) p0recre_vec->data;
  float (*restrict p0recre_coords)[p0recre_coords_vec->size[1]] __attribute__ ((aligned (64))) = (float (*)[p0recre_coords_vec->size[1]]) p0recre_coords_vec->data;

  /* Flush denormal numbers to zero in hardware */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);

  float r0 = 1.0F/h_x;
  float r1 = 1.0F/h_y;
  float r2 = 1.0F/h_z;
  START(section0)
  for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
  {
  }
  STOP(section0,timers)

  START(section1)
  #pragma omp parallel num_threads(nthreads_nonaffine)
  {
    int chunk_size = (int)(fmax(1, (1.0F/3.0F)*(recdim_M - recdim_m + 1)/nthreads_nonaffine));
    #pragma omp for schedule(dynamic,chunk_size)
    for (int recdim = recdim_m; recdim <= recdim_M; recdim += 1)
    {
      for (int freq_dim = freq_dim_m; freq_dim <= freq_dim_M; freq_dim += 1)
      {
        float r3 = floor(r0*(-o_x + p0recre_coords[recdim][0]));
        int posx = (int)r3;
        float r4 = floor(r1*(-o_y + p0recre_coords[recdim][1]));
        int posy = (int)r4;
        float r5 = floor(r2*(-o_z + p0recre_coords[recdim][2]));
        int posz = (int)r5;
        float px = r0*(-o_x + p0recre_coords[recdim][0]) - r3;
        float py = r1*(-o_y + p0recre_coords[recdim][1]) - r4;
        float pz = r2*(-o_z + p0recre_coords[recdim][2]) - r5;
        float sum = 0.0F;

        for (int rp0recrex = 0; rp0recrex <= 1; rp0recrex += 1)
        {
          for (int rp0recrey = 0; rp0recrey <= 1; rp0recrey += 1)
          {
            for (int rp0recrez = 0; rp0recrez <= 1; rp0recrez += 1)
            {
              if (rp0recrex + posx >= x_m - 1 && rp0recrey + posy >= y_m - 1 && rp0recrez + posz >= z_m - 1 && rp0recrex + posx <= x_M + 1 && rp0recrey + posy <= y_M + 1 && rp0recrez + posz <= z_M + 1)
              {
                sum += (rp0recrex*px + (1 - rp0recrex)*(1 - px))*(rp0recrey*py + (1 - rp0recrey)*(1 - py))*(rp0recrez*pz + (1 - rp0recrez)*(1 - pz))*p0re[rp0recrex + posx + 8][rp0recrey + posy + 8][rp0recrez + posz + 8][freq_dim];
              }
            }
          }
        }

        p0recre[recdim][freq_dim] = sum;
      }
    }
  }
  STOP(section1,timers)

  return 0;
}