FEniCS / ffcx

Next generation FEniCS Form Compiler for finite element forms
https://fenicsproject.org
Other
145 stars 38 forks source link

Store coefficient values at quadrature points for kernels #565

Open IgorBaratta opened 1 year ago

IgorBaratta commented 1 year ago

Except for few modifications the code for assembling a vector looks like:

  T w1[3] = {0};
  T w0[1] = {0};
  for (int ip = 0; ip < Np; ip++)
  {
    // vectorized across
    for (int ic = 0; ic < Nc; ++ic)
    {
      w1[0] += w[4 + ic] * D0[ip][ic];
      w1[1] += w[4 + ic] * D1[ip][ic];
      w1[2] += w[4 + ic] * D2[ip][ic];
    }
    for (int ic = 0; ic < 4; ++ic)
      w0[0] += w[ic] * Phi[ip][ic];

    // Not vectorized
    T fw[3] = {0};
    fw[0] = (G[0][0] * w1[0] + G[0][1] * w1[1] + G[0][2] * w1[2]) * weights[ip];
    fw[1] = (G[1][0] * w1[0] + G[1][1] * w1[1] + G[1][2] * w1[2]) * weights[ip];
    fw[2] = (G[2][0] * w1[0] + G[2][1] * w1[1] + G[2][2] * w1[2]) * weights[ip];

    for (int i = 0; i < Nc; ++i)
      A[i] += fw[0 * Np + ip] * D0[ip][i] + fw[1 * Np + ip] * D1[ip][i] + fw[2 * Np + ip] * D2[ip][i];
  }

The application of pull-back and geometric information is not vectorized. We should consider storing more data at quadrature points to enable vectorization across quadrature points.

    T w1[3 * Np] = {0};
    T w0[Np] = {0};
    for (int ip = 0; ip < Np; ip++)
    {
      for (int ic = 0; ic < Nc; ++ic)
      {
        w1[0 * Np + ip] += w[4 + ic] * D0[ip][ic];
        w1[1 * Np + ip] += w[4 + ic] * D1[ip][ic];
        w1[2 * Np + ip] += w[4 + ic] * D2[ip][ic];
      }
      for (int ic = 0; ic < 4; ++ic)
        w0[ip] += w[ic] * Phi[ip][ic];
    }

    T fw[3 * Np] = {0};
     // Vectorized across quadrature points
    for (int ip = 0; ip < Np; ip++)
    {
      fw[0 * Np + ip] = (G[0][0] * w1[ip] + G[0][1] * w1[Np + ip] + G[0][2] * w1[2 * Np + ip]) * weights[ip ];
      fw[1 * Np + ip] = (G[1][0] * w1[ip] + G[1][1] * w1[Np + ip] + G[1][2] * w1[2 * Np + ip]) * weights[ip ];
      fw[2 * Np + ip] = (G[2][0] * w1[ip] + G[2][1] * w1[Np + ip] + G[2][2] * w1[2 * Np + ip]) * weights[ip ];
    }
    for (int ip = 0; ip < Np; ip++)
    {
      for (int i = 0; i < Nc; ++i)
        A[i] += fw[0 * Np + ip] * D0[ip][i] + fw[1 * Np + ip] * D1[ip][i] + fw[2 * Np + ip] * D2[ip][i];
    }