Open kaschau opened 1 year ago
https://kokkos.github.io/kokkos-core-wiki/API/core/spaces/partition_space.html
christian trott suggests using concurrent exec space instances instead
diff --git a/src/compute/advFlux/advFlux.hpp b/src/compute/advFlux/advFlux.hpp
index 0af10f8..1a2c3f2 100644
--- a/src/compute/advFlux/advFlux.hpp
+++ b/src/compute/advFlux/advFlux.hpp
@@ -2,10 +2,11 @@
#define __advFlux_H__
#include "block_.hpp"
+#include <vector>
// ./advFlux
// |------> secondOrderKEEP
-void secondOrderKEEP(block_ &b);
+void secondOrderKEEP(std::vector<block_> &mb);
// |------> centralDifference
void centralDifference(block_ &b);
// |------> fourthOrderKEEP
diff --git a/src/compute/advFlux/bindingsAdvFlux.cpp b/src/compute/advFlux/bindingsAdvFlux.cpp
index caf7e67..637d0c1 100644
--- a/src/compute/advFlux/bindingsAdvFlux.cpp
+++ b/src/compute/advFlux/bindingsAdvFlux.cpp
@@ -10,7 +10,7 @@ void bindAdvFlux(py::module_ &m) {
// |----> secondOrderKEEP.cpp
advFlux.def("secondOrderKEEP", &secondOrderKEEP,
"Compute centeral difference euler fluxes via second order KEEP",
- py::arg("block_ object"));
+ py::arg("mb"));
// |----> centeredDifference.cpp
advFlux.def("centralDifference", ¢ralDifference,
"Compute central difference euler fluxes",
diff --git a/src/compute/advFlux/secondOrderKEEP.cpp b/src/compute/advFlux/secondOrderKEEP.cpp
index db9d25f..52cee4b 100644
--- a/src/compute/advFlux/secondOrderKEEP.cpp
+++ b/src/compute/advFlux/secondOrderKEEP.cpp
@@ -2,77 +2,96 @@
#include "block_.hpp"
#include "kokkosTypes.hpp"
#include "thtrdat_.hpp"
-
-void secondOrderKEEP(block_ &b) {
+#include <vector>
+
+void secondOrderKEEP(std::vector<block_> &mb) {
+ int nblks = mb.size();
+ std::vector<double> weight(nblks);
+ double sum = 0;
+ for (int nblki = 0; nblki < nblks; nblki++) {
+ double w = static_cast<double>(mb[nblki].ni * mb[nblki].nj * mb[nblki].nk);
+ weight[nblki] = w;
+ sum += w;
+ };
+ for (int nblki = 0; nblki < nblks; nblki++) {
+ weight[nblki] /= sum;
+ };
+ auto instances = Kokkos::Experimental::partition_space(execSpace(), weight);
//-------------------------------------------------------------------------------------------|
// i flux face range
//-------------------------------------------------------------------------------------------|
- MDRange3 range_i({b.ng, b.ng, b.ng},
- {b.ni + b.ng, b.nj + b.ng - 1, b.nk + b.ng - 1});
- Kokkos::parallel_for(
- "2nd order KEEP i face conv fluxes", range_i,
- KOKKOS_LAMBDA(const int i, const int j, const int k) {
- double U;
- double uf;
- double vf;
- double wf;
- double pf;
-
- // Compute face normal volume flux vector
- uf = 0.5 * (b.q(i, j, k, 1) + b.q(i - 1, j, k, 1));
- vf = 0.5 * (b.q(i, j, k, 2) + b.q(i - 1, j, k, 2));
- wf = 0.5 * (b.q(i, j, k, 3) + b.q(i - 1, j, k, 3));
-
- U = b.isx(i, j, k) * uf + b.isy(i, j, k) * vf + b.isz(i, j, k) * wf;
-
- pf = 0.5 * (b.q(i, j, k, 0) + b.q(i - 1, j, k, 0));
-
- // Compute fluxes
- double rho;
- rho = 0.5 * (b.Q(i, j, k, 0) + b.Q(i - 1, j, k, 0));
-
- // Continuity rho*Ui
- b.iF(i, j, k, 0) = rho * U;
-
- // x momentum rho*u*Ui+ p*Ax
- b.iF(i, j, k, 1) = rho * uf * U + pf * b.isx(i, j, k);
-
- // y momentum rho*v*Ui+ p*Ay
- b.iF(i, j, k, 2) = rho * vf * U + pf * b.isy(i, j, k);
-
- // w momentum rho*w*Ui+ p*Az
- b.iF(i, j, k, 3) = rho * wf * U + pf * b.isz(i, j, k);
-
- // Total energy (rhoE+ p)*Ui)
- double e;
- double em;
-
- e = b.qh(i, j, k, 4);
- em = b.qh(i - 1, j, k, 4);
-
- b.iF(i, j, k, 4) =
- ((0.5 * (e + em) + rho * 0.5 *
- (b.q(i, j, k, 1) * b.q(i - 1, j, k, 1) +
- b.q(i, j, k, 2) * b.q(i - 1, j, k, 2) +
- b.q(i, j, k, 3) * b.q(i - 1, j, k, 3)))) *
- U;
-
- b.iF(i, j, k, 4) +=
- 0.5 * (b.q(i - 1, j, k, 0) * (b.q(i, j, k, 1) * b.isx(i, j, k) +
- b.q(i, j, k, 2) * b.isy(i, j, k) +
- b.q(i, j, k, 3) * b.isz(i, j, k)) +
- b.q(i, j, k, 0) * (b.q(i - 1, j, k, 1) * b.isx(i, j, k) +
- b.q(i - 1, j, k, 2) * b.isy(i, j, k) +
- b.q(i - 1, j, k, 3) * b.isz(i, j, k)));
- // Species
- for (int n = 0; n < b.ne - 5; n++) {
- b.iF(i, j, k, 5 + n) =
- 0.5 * (b.Q(i, j, k, 5 + n) + b.Q(i - 1, j, k, 5 + n)) * U;
- }
- });
-
- //-------------------------------------------------------------------------------------------|
+ for (int nblki = 0; nblki < nblks; nblki++) {
+ block_ b = mb[nblki];
+ MDRange3 range_i(instances[nblki], {b.ng, b.ng, b.ng},
+ {b.ni + b.ng, b.nj + b.ng - 1, b.nk + b.ng - 1});
+ Kokkos::parallel_for(
+ "2nd order KEEP i face conv fluxes", range_i,
+ KOKKOS_LAMBDA(const int i, const int j, const int k) {
+ double U;
+ double uf;
+ double vf;
+ double wf;
+ double pf;
+
+ // Compute face normal volume flux vector
+ uf = 0.5 * (b.q(i, j, k, 1) + b.q(i - 1, j, k, 1));
+ vf = 0.5 * (b.q(i, j, k, 2) + b.q(i - 1, j, k, 2));
+ wf = 0.5 * (b.q(i, j, k, 3) + b.q(i - 1, j, k, 3));
+
+ U = b.isx(i, j, k) * uf + b.isy(i, j, k) * vf + b.isz(i, j, k) * wf;
+
+ pf = 0.5 * (b.q(i, j, k, 0) + b.q(i - 1, j, k, 0));
+
+ // Compute fluxes
+ double rho;
+ rho = 0.5 * (b.Q(i, j, k, 0) + b.Q(i - 1, j, k, 0));
+
+ // Continuity rho*Ui
+ b.iF(i, j, k, 0) = rho * U;
+
+ // x momentum rho*u*Ui+ p*Ax
+ b.iF(i, j, k, 1) = rho * uf * U + pf * b.isx(i, j, k);
+
+ // y momentum rho*v*Ui+ p*Ay
+ b.iF(i, j, k, 2) = rho * vf * U + pf * b.isy(i, j, k);
+
+ // w momentum rho*w*Ui+ p*Az
+ b.iF(i, j, k, 3) = rho * wf * U + pf * b.isz(i, j, k);
+
+ // Total energy (rhoE+ p)*Ui)
+ double e;
+ double em;
+
+ e = b.qh(i, j, k, 4);
+ em = b.qh(i - 1, j, k, 4);
+
+ b.iF(i, j, k, 4) =
+ ((0.5 * (e + em) + rho * 0.5 *
+ (b.q(i, j, k, 1) * b.q(i - 1, j, k, 1) +
+ b.q(i, j, k, 2) * b.q(i - 1, j, k, 2) +
+ b.q(i, j, k, 3) * b.q(i - 1, j, k, 3)))) *
+ U;
+
+ b.iF(i, j, k, 4) +=
+ 0.5 * (b.q(i - 1, j, k, 0) * (b.q(i, j, k, 1) * b.isx(i, j, k) +
+ b.q(i, j, k, 2) * b.isy(i, j, k) +
+ b.q(i, j, k, 3) * b.isz(i, j, k)) +
+ b.q(i, j, k, 0) * (b.q(i - 1, j, k, 1) * b.isx(i, j, k) +
+ b.q(i - 1, j, k, 2) * b.isy(i, j, k) +
+ b.q(i - 1, j, k, 3) * b.isz(i, j, k)));
+ // Species
+ for (int n = 0; n < b.ne - 5; n++) {
+ b.iF(i, j, k, 5 + n) =
+ 0.5 * (b.Q(i, j, k, 5 + n) + b.Q(i - 1, j, k, 5 + n)) * U;
+ }
+ });
+ }
+ for (int nblki = 0; nblki < nblks; nblki++) {
+ instance[0].fence();
+ };
+
+ // //-------------------------------------------------------------------------------------------|
// j flux face range
//-------------------------------------------------------------------------------------------|
MDRange3 range_j({b.ng, b.ng, b.ng},
https://github.com/kokkos/kokkos/pull/5238
Parallelize a group of structured blocks.... and much performance is gained