Closed wence- closed 8 years ago
Uhmm...
Does this systematically appear at least?
Can you try commenting out self._min_temporaries()
in optimizer.py
and see what happens?
Hmm, sorry, I can't recall exactly what I was using to make this problem. I will keep an eye out and try that. That looks like a likely suspect though.
That helps, but doesn't solve the problem:
diff -u /data/lmitche1/pyop2-cache/mismatching-kernels/src-rank0.c /data/lmitche1/pyop2-cache/mismatching-kernels/src-rank1.c
--- /data/lmitche1/pyop2-cache/mismatching-kernels/src-rank0.c 2016-09-13 13:45:58.453774754 +0100
+++ /data/lmitche1/pyop2-cache/mismatching-kernels/src-rank1.c 2016-09-13 13:45:58.453774754 +0100
@@ -617,9 +617,9 @@
ip_k_84_1_0[j] = (k_84_0_0[j] + (t237[j] * c_84_0_1)) + k_84_0_1[j];
ip_k_84_1_1[j] = (k_84_0_2[j] + ((t245[j] * c_84_0_4) + (t240[j] * c_84_0_5))) + ((t234[j] * c_84_0_1) + k_84_0_3[j]);
ip_k_84_1_2[j] = ((k_84_0_4[j] + (t231[j] * c_84_0_1)) + ((t238[j] * c_84_0_5) + k_84_0_5[j])) + k_84_0_6[j];
- ip_j_84_1_0[j] = j_85_0_1[j] + (t133[facet[0]][ip][j] * c_84_0_4);
+ ip_j_84_1_0[j] = j_84_0_0[j] + (t133[facet[0]][ip][j] * c_84_0_4);
ip_j_84_1_1[j] = j_84_0_1[j] + (t135[facet[0]][ip][j] * c_84_0_4);
- ip_j_84_1_2[j] = j_85_0_0[j] + (t133[facet[0]][ip][j] * c_84_0_5);
+ ip_j_84_1_2[j] = j_84_0_2[j] + (t133[facet[0]][ip][j] * c_84_0_5);
ip_k_85_0_0[j] = t219[j] * -1;
ip_k_85_0_1[j] = t217[j] * -1;
ip_k_85_0_2[j] = t134[facet[1]][ip][j] * t204;
@@ -647,7 +647,7 @@
ip_k_86_1_2[j] = (k_86_0_4[j] + (t238[j] * c_86_0_2)) + k_86_0_5[j];
ip_j_86_1_0[j] = j_86_0_0[j] + (t134[facet[1]][ip][j] * c_86_0_1);
ip_j_86_1_1[j] = j_86_0_1[j] + (t135[facet[1]][ip][j] * c_86_0_3);
- ip_j_86_1_2[j] = j_87_0_0[j] + (t135[facet[1]][ip][j] * c_86_0_1);
+ ip_j_86_1_2[j] = j_86_0_2[j] + (t135[facet[1]][ip][j] * c_86_0_1);
ip_k_87_0_0[j] = ip_k_85_0_6[j];
ip_k_87_0_1[j] = ip_k_85_0_8[j];
ip_k_87_0_2[j] = t226[j] * -1;
Diff finished. Tue Sep 13 13:57:56 2016
Example code that reproduces
from firedrake import *
mesh = UnitCubeMesh(10, 10, 10)
alpha = 340.27690917706
V = VectorFunctionSpace(mesh, "DG", 1)
Q = FunctionSpace(mesh, "CG", 2)
bcs_u_inlet = Function(V)
bcs_p_outlet = Constant(0)
W = V*Q
nu_bg = Constant(10)
y_star_plus = Constant(10)
Dt = Constant(1)
body_forces = Constant((0, 0, 0))
V_sca = FunctionSpace(mesh, "CG", 1)
u_tau = Function(V_sca)
u_adv = Function(VectorFunctionSpace(mesh, "CG", 1))
u_star = Function(V)
w0 = Function(W)
u0, p0 = w0.split()
w1 = Function(W)
u1, p1 = w1.split()
nu_T0 = Function(FunctionSpace(mesh, "DG", 0))
def tensor_jump(vector_field, normal):
""" Equivalent of jump(scalar, n) for vectorial quantities. The
inbuilt firedrake jump(vector, n) is as below but uses inner - i.e.
returning a scalar... this here returns a vector.
"""
return outer(vector_field('+'), normal('+')) \
+ outer(vector_field('-'), normal('-'))
def get_form(u0, u_adv, u_star, p1,
nu_T0=Constant(0), u_tau=None):
""" The form for the discontinuous galerkin discretisation of the
equations for the coupled velocity / pressure solve usign schur
complement pressure projection.
:var u0: Velocity field
:type k1: :class:`Function`
:var nu_T0: The turbulent viscosity
:type nu_T0: :class:`Function`
:var u_tau: The friction velocity
:type u_tau: :class:`Function`
"""
def tensor_jump(vector_field, normal):
""" Equivalent of jump(scalar, n) for vectorial quantities. The
inbuilt firedrake jump(vector, n) is as below but uses inner - i.e.
returning a scalar... this here returns a vector.
"""
return outer(vector_field('+'), normal('+')) \
+ outer(vector_field('-'), normal('-'))
n = FacetNormal(mesh)
h = CellSize(mesh)
# Define the test and trial functions
u, p = TrialFunctions(W)
v, q = TestFunctions(W)
# Test and trial functions for the momentum parts
w = TrialFunction(V)
r = TestFunction(V)
# For DG methods we have k at element edge, k_hat. The conditional
# ensures we upwind - i.e. take upstream values for the edge term.
s = 0.5 * sign(dot(n('-'), u_adv)) + 0.5
u_hat = s * w('-') + (1.0 - s) * w('+')
# Define free slip condition start with wall-shear stress
nu_up = (nu_bg + nu_T0)
tau = nu_up * (nabla_grad(w) + nabla_grad(w).T)
tau_rem = nu_up * (nabla_grad(u - u_star) + nabla_grad(u - u_star).T)
t_w = - u_tau / y_star_plus * w
t_w_rem = - u_tau / y_star_plus * (u - u_star)
# For this method we split the pressure term out from the first equation
# and the advection/diffusion terms out of the second. The first
# equation we shall call Mom (for momentum), the second we shall call
# Rem (for remainder terms).
# Domain integrals
# Transient
transient_mom = (1 / Dt) * inner(w - u0, r)
Mom = transient_mom * dx
transient_rem = (1 / Dt) * inner(u - u_star, v)
Rem = transient_rem * dx
# Convection
convection_vol = inner(w, dot(u_adv, nabla_grad(r)))
Mom -= convection_vol * dx
# Diffusion
diffusion_vol = inner(grad(r), tau)
Mom += diffusion_vol * dx
diffusion_rem = inner(grad(v), tau_rem)
# Pressure
pressure_mom = dot(grad(p1), r)
Mom += pressure_mom * dx
pressure_rem = dot(grad(p - p1), v)
Rem += pressure_rem * dx
# Forcing
f = body_forces
forcing = inner(f, r)
Mom -= forcing * dx
# Continuity
continuity = dot(u, grad(q))
Rem -= continuity * dx
# Jump terms
# Convection
convection_edge = dot(u_hat, dot(tensor_jump(r, n), u_adv))
Mom += convection_edge * dS
# Diffusion
tau_jump = avg(nu_up) * (tensor_jump(w, n) + tensor_jump(w, n).T)
diffusion_edge = - inner(avg(grad(r)), tau_jump) \
- inner(tensor_jump(r, n), avg(tau)) \
+ (alpha / avg(h)) \
* inner(tensor_jump(r, n), tau_jump)
Mom += diffusion_edge * dS
tau_jump_rem = avg(nu_up) * (tensor_jump(u - u_star, n) \
+ tensor_jump(u - u_star, n).T)
diffusion_edge_rem = - inner(avg(grad(v)), tau_jump_rem) \
- inner(tensor_jump(v, n), avg(tau_rem)) \
+ (alpha / avg(h)) \
* inner(tensor_jump(v, n), tau_jump_rem)
# Inlet terms
inlets = (1, )
# Convection
convection_inlet = dot(bcs_u_inlet, r) * dot(n, u_adv)
for bnd_id in inlets: Mom += convection_inlet * ds(bnd_id)
# Diffusion
tau_jump_inlet = nu_up * 2 * sym(outer(w, n) - outer(bcs_u_inlet, n))
diffusion_inlet = - inner(grad(r), tau_jump_inlet) \
- inner(outer(r, n), tau) \
+ (alpha / h) * inner(outer(r, n), tau_jump_inlet)
for bnd_id in inlets: Mom += diffusion_inlet * ds(bnd_id)
tau_jump_inlet_rem = nu_up * 2 * sym(outer(u - u_star, n))
diffusion_inlet_rem = - inner(grad(v), tau_jump_inlet_rem) \
- inner(outer(v, n), tau_rem) \
+ (alpha / h) \
* inner(outer(v, n), tau_jump_inlet_rem)
# Continuity
continuity_inlet = q * dot(bcs_u_inlet, n)
for bnd_id in inlets: Rem += continuity_inlet * ds(bnd_id)
# Outlet terms
outlets = (2, )
# Convection
convection_outlet = dot(w, r) * dot(n, u_adv)
for bnd_id in outlets: Mom += convection_outlet * ds(bnd_id)
# Pressure
pressure_outlet_mom = (bcs_p_outlet - p1) * dot(n, r)
for bnd_id in outlets: Mom += pressure_outlet_mom * ds(bnd_id)
pressure_outlet_rem = (p - p1) * dot(n, v)
for bnd_id in outlets: Rem -= pressure_outlet_rem * ds(bnd_id)
# Continuity
continuity_outlet = q * dot(u, n)
for bnd_id in outlets: Rem += continuity_outlet * ds(bnd_id)
# Wall terms
walls = (3, 4)
# Convection
convection_wall = dot(w, r) * dot(n, u_adv)
for bnd_id in walls: Mom += convection_wall * ds(bnd_id)
# Diffusion
tau_jump_wall = dot(w, n) * outer(n, n)
diffusion_wall = - inner(grad(r), tau_jump_wall) \
- inner(dot(r, n), dot(n, dot(tau, n))) \
- inner(t_w, r) \
+ (alpha / h) * inner(outer(r, n), tau_jump_wall)
for bnd_id in walls: Mom += diffusion_wall * ds(bnd_id)
tau_jump_wall_rem = dot(u - u_star, n) * outer(n, n)
diffusion_wall_rem = - inner(grad(v), tau_jump_wall_rem) \
- inner(dot(v, n), dot(n, dot(tau_rem, n))) \
- inner(t_w_rem, v) \
+ (alpha / h) \
* inner(outer(v, n), tau_jump_wall_rem)
# Free-slip boundaries
fs_bounds = (5, 6)
# Convection
convection_fsb = dot(w, r) * dot(n, u_adv)
for bnd_id in fs_bounds: Mom += convection_fsb * ds(bnd_id)
# Diffusion
tau_jump_fsb = dot(w, n) * outer(n, n)
diffusion_fsb = - inner(grad(r), tau_jump_fsb) \
- inner(dot(r, n), dot(n, dot(tau, n))) \
+ (alpha / h) * inner(outer(r, n), tau_jump_fsb)
for bnd_id in fs_bounds: Mom += diffusion_fsb * ds(bnd_id)
tau_jump_fsb_rem = dot(u - u_star, n) * outer(n, n)
diffusion_fsb_rem = - inner(grad(v), tau_jump_fsb_rem) \
- inner(dot(v, n), dot(n, dot(tau_rem, n))) \
+ (alpha / h) \
* inner(outer(v, n), tau_jump_fsb_rem)
# Seperate the sides of the momentum equation
a_mom = lhs(Mom)
L_mom = rhs(Mom)
# Seperate the sides of the remainder equation
a_rem = lhs(Rem)
L_rem = rhs(Rem)
return a_mom, L_mom, a_rem, L_rem
a_mom, _, _, _ = get_form(u0, u_adv, u_star, p1, nu_T0, u_tau)
assemble(a_mom).M
Slightly reduced example:
from firedrake import *
mesh = UnitCubeMesh(2, 2, 2)
alpha = 340.27690917706
V = VectorFunctionSpace(mesh, "DG", 1)
Q = FunctionSpace(mesh, "CG", 2)
bcs_u_inlet = Function(V)
bcs_p_outlet = Constant(0)
W = V*Q
nu_bg = Constant(10)
y_star_plus = Constant(10)
Dt = Constant(1)
body_forces = Constant((0, 0, 0))
V_sca = FunctionSpace(mesh, "CG", 1)
u_tau = Function(V_sca)
u_adv = Function(VectorFunctionSpace(mesh, "CG", 1))
u_star = Function(V)
w0 = Function(W)
u0, p0 = w0.split()
w1 = Function(W)
u1, p1 = w1.split()
nu_T0 = Function(FunctionSpace(mesh, "DG", 0))
def tensor_jump(vector_field, normal):
""" Equivalent of jump(scalar, n) for vectorial quantities. The
inbuilt firedrake jump(vector, n) is as below but uses inner - i.e.
returning a scalar... this here returns a vector.
"""
return outer(vector_field('+'), normal('+')) \
+ outer(vector_field('-'), normal('-'))
def get_form(u0, u_adv, u_star, p1,
nu_T0=Constant(0), u_tau=None):
""" The form for the discontinuous galerkin discretisation of the
equations for the coupled velocity / pressure solve usign schur
complement pressure projection.
:var u0: Velocity field
:type k1: :class:`Function`
:var nu_T0: The turbulent viscosity
:type nu_T0: :class:`Function`
:var u_tau: The friction velocity
:type u_tau: :class:`Function`
"""
def tensor_jump(vector_field, normal):
""" Equivalent of jump(scalar, n) for vectorial quantities. The
inbuilt firedrake jump(vector, n) is as below but uses inner - i.e.
returning a scalar... this here returns a vector.
"""
return outer(vector_field('+'), normal('+')) \
+ outer(vector_field('-'), normal('-'))
n = FacetNormal(mesh)
h = CellSize(mesh)
# Define the test and trial functions
u, p = TrialFunctions(W)
v, q = TestFunctions(W)
# Test and trial functions for the momentum parts
w = TrialFunction(V)
r = TestFunction(V)
# For DG methods we have k at element edge, k_hat. The conditional
# ensures we upwind - i.e. take upstream values for the edge term.
s = 0.5 * sign(dot(n('-'), u_adv)) + 0.5
u_hat = s * w('-') + (1.0 - s) * w('+')
# Define free slip condition start with wall-shear stress
nu_up = (nu_bg + nu_T0)
tau = nu_up * (nabla_grad(w) + nabla_grad(w).T)
# For this method we split the pressure term out from the first equation
# and the advection/diffusion terms out of the second. The first
# equation we shall call Mom (for momentum), the second we shall call
# Rem (for remainder terms).
# Domain integrals
# Transient
Mom = 0
# Convection
convection_edge = dot(u_hat, dot(tensor_jump(r, n), u_adv))
Mom += convection_edge * dS
# Diffusion
tau_jump = avg(nu_up) * (tensor_jump(w, n) + tensor_jump(w, n).T)
diffusion_edge = - inner(avg(grad(r)), tau_jump) \
- inner(tensor_jump(r, n), avg(tau)) \
+ (alpha / avg(h)) \
* inner(tensor_jump(r, n), tau_jump)
Mom += diffusion_edge * dS
return lhs(Mom) # , L_mom, a_rem, L_rem
a_mom = get_form(u0, u_adv, u_star, p1, nu_T0, u_tau)
assemble(a_mom).M
ugh, painful. I can't reproduce it.
This is supposed to produce a single kernel and terminate, right?
How many processes did you use to run this?
btw, are you sure you haven't accidentally cut anything from the last diff
that you pasted ?
Could you try commenting out lines [142,152] in optimizer.py
?
If that fixed the problem, could you then uncomment them, and only comment out line 188 (self._simplify(...)) in scheduler.py
?
I used 16 processes. It doesn't always happen, you have to run firedrake-clean in between each run.
Yes, if I comment out lines 142-152 then the problem goes away. Similarly if I comment out line 188 in scheduler.py only. I did a little bit more digging, this is what I found. SSALoopMerger.merge
computes the loops to merge and turns it into merged_loops
. These are then sent to SSALoopMerger._simplify
. But, the loops found to merge are not guaranteed to come out in the same order on all processes. So we have on (say) rank 0 an ordering [A, B, C]
and on rank 1 an ordering [B, A, C]
. The temporary replacement walks the loops in the order it sees them and for each statement in the loop builds a dict: {rvalue: lvalue}, now if we see a new statement whose rvalue is already found in the dict, we just reuse the temporary. But now imagine that A
and B
both have statements whose rvalue
is the same, but which write to different lvalues:
loop A:
lvalueA = rvalue;
loop B:
lvalueB = rvalue;
use_lvalueB;
If I visit loop A first, this is transformed into:
loop A:
lvalueA = rvalue;
loop B:
use_lvalueA;
And vice versa if loop B is visited first.
Concretely this is what I see happening in this example. I've pasted below the order in which the merged loops are visited on two ranks that disagree. And then the resulting temporaries-replaced loops (with diff).
On rank 0:
for (int j = 0; j < 12; j += 1)
{
ip_k_84_0_0[j] = t240[j] * -1;
ip_k_84_0_1[j] = t234[j] * -1;
ip_k_84_0_2[j] = t245[j] * -1;
ip_k_84_0_3[j] = t133[facet[0]][ip][j] * t205;
ip_k_84_0_4[j] = t237[j] * -1;
ip_k_84_0_5[j] = t231[j] * -1;
ip_k_84_0_6[j] = t238[j] * -1;
ip_k_84_1_0[j] = (k_84_0_0[j] + (t237[j] * c_84_0_1)) + k_84_0_1[j];
ip_k_84_1_1[j] = (k_84_0_2[j] + ((t245[j] * c_84_0_4) + (t240[j] * c_84_0_5))) + ((t234[j] * c_84_0_1) + k_84_0_3[j]);
ip_k_84_1_2[j] = ((k_84_0_4[j] + (t231[j] * c_84_0_1)) + ((t238[j] * c_84_0_5) + k_84_0_5[j])) + k_84_0_6[j];
ip_j_84_1_0[j] = j_84_0_0[j] + (t133[facet[0]][ip][j] * c_84_0_4);
ip_j_84_1_1[j] = j_84_0_1[j] + (t135[facet[0]][ip][j] * c_84_0_4);
ip_j_84_1_2[j] = j_84_0_2[j] + (t133[facet[0]][ip][j] * c_84_0_5);
ip_k_85_0_0[j] = t219[j] * -1;
ip_k_85_0_1[j] = t217[j] * -1;
ip_k_85_0_2[j] = t134[facet[1]][ip][j] * t204;
ip_k_85_0_3[j] = t223[j] * -1;
ip_k_85_0_4[j] = t224[j] * -1;
ip_k_85_0_5[j] = t213[j] * -1;
ip_k_85_0_6[j] = t133[facet[1]][ip][j] * t204;
ip_k_85_0_7[j] = t216[j] * -1;
ip_k_85_0_8[j] = t210[j] * -1;
ip_k_85_1_0[j] = (k_85_0_0[j] + (t216[j] * c_85_0_1)) + k_85_0_1[j];
ip_k_85_1_1[j] = (k_85_0_2[j] + ((t224[j] * c_85_0_5) + (t219[j] * c_85_0_4))) + ((t213[j] * c_85_0_1) + k_85_0_3[j]);
ip_k_85_1_2[j] = (k_85_0_4[j] + ((t223[j] * c_85_0_5) + (t217[j] * c_85_0_4))) + ((t210[j] * c_85_0_1) + k_85_0_5[j]);
ip_j_85_1_0[j] = j_85_0_0[j] + (t133[facet[0]][ip][j] * c_85_0_4);
ip_j_85_1_1[j] = j_85_0_1[j] + (t133[facet[0]][ip][j] * c_85_0_5);
ip_k_86_0_0[j] = t133[facet[0]][ip][j] * t205;
ip_k_86_0_1[j] = t247[j] * -1;
ip_k_86_0_2[j] = t240[j] * -1;
ip_k_86_0_3[j] = t134[facet[0]][ip][j] * t205;
ip_k_86_0_4[j] = t238[j] * -1;
ip_k_86_0_5[j] = t237[j] * -1;
ip_k_86_0_6[j] = t243[j] * -1;
ip_k_86_0_7[j] = t234[j] * -1;
ip_k_86_1_0[j] = (k_86_0_0[j] + ((t247[j] * c_86_0_1) + (t243[j] * c_86_0_2))) + ((t237[j] * c_86_0_3) + k_86_0_1[j]);
ip_k_86_1_1[j] = (k_86_0_2[j] + (t240[j] * c_86_0_2)) + ((t234[j] * c_86_0_3) + k_86_0_3[j]);
ip_k_86_1_2[j] = (k_86_0_4[j] + (t238[j] * c_86_0_2)) + k_86_0_5[j];
ip_j_86_1_0[j] = j_86_0_0[j] + (t134[facet[1]][ip][j] * c_86_0_1);
ip_j_86_1_1[j] = j_86_0_1[j] + (t135[facet[1]][ip][j] * c_86_0_3);
ip_j_86_1_2[j] = j_86_0_2[j] + (t135[facet[1]][ip][j] * c_86_0_1);
ip_k_87_0_0[j] = t133[facet[1]][ip][j] * t204;
ip_k_87_0_1[j] = t210[j] * -1;
ip_k_87_0_2[j] = t226[j] * -1;
ip_k_87_0_3[j] = t224[j] * -1;
ip_k_87_0_4[j] = t219[j] * -1;
ip_k_87_0_5[j] = t134[facet[1]][ip][j] * t204;
ip_k_87_0_6[j] = t217[j] * -1;
ip_k_87_0_7[j] = t216[j] * -1;
ip_k_87_0_8[j] = t222[j] * -1;
ip_k_87_0_9[j] = t213[j] * -1;
ip_k_87_1_0[j] = (k_87_0_0[j] + ((t222[j] * c_87_0_1) + (t226[j] * c_87_0_2))) + ((t216[j] * c_87_0_3) + k_87_0_1[j]);
ip_k_87_1_1[j] = (k_87_0_2[j] + ((t224[j] * c_87_0_2) + (t219[j] * c_87_0_1))) + ((t213[j] * c_87_0_3) + k_87_0_3[j]);
ip_k_87_1_2[j] = (k_87_0_4[j] + (t217[j] * c_87_0_1)) + ((t210[j] * c_87_0_3) + k_87_0_5[j]);
ip_j_87_1_0[j] = j_87_0_0[j] + (t135[facet[1]][ip][j] * c_87_0_2);
}
for (int j = 0; j < 12; j += 1)
{
k_84_0_0[j] = t180[j] * c_84_0_0;
k_84_0_1[j] = (t176[j] * c_84_0_2) + (t170[j] * c_84_0_3);
k_84_0_2[j] = t178[j] * c_84_0_0;
k_84_0_3[j] = (t167[j] * c_84_0_3) + (t173[j] * c_84_0_2);
k_84_0_4[j] = t177[j] * c_84_0_0;
k_84_0_5[j] = t171[j] * c_84_0_2;
k_84_0_6[j] = t164[j] * c_84_0_3;
j_84_0_0[j] = t198[j] * -1;
j_84_0_1[j] = t196[j] * -1;
j_84_0_2[j] = t195[j] * -1;
k_85_0_0[j] = t162[j] * c_85_0_0;
k_85_0_1[j] = (t158[j] * c_85_0_2) + (t152[j] * c_85_0_3);
k_85_0_2[j] = t160[j] * c_85_0_0;
k_85_0_3[j] = (t149[j] * c_85_0_3) + (t155[j] * c_85_0_2);
k_85_0_4[j] = t159[j] * c_85_0_0;
k_85_0_5[j] = (t146[j] * c_85_0_3) + (t153[j] * c_85_0_2);
j_85_0_0[j] = t195[j] * -1;
j_85_0_1[j] = t198[j] * -1;
k_86_0_0[j] = t180[j] * c_86_0_0;
k_86_0_1[j] = (t176[j] * c_86_0_4) + (t170[j] * c_86_0_5);
k_86_0_2[j] = t178[j] * c_86_0_0;
k_86_0_3[j] = (t167[j] * c_86_0_5) + (t173[j] * c_86_0_4);
k_86_0_4[j] = t177[j] * c_86_0_0;
k_86_0_5[j] = (t171[j] * c_86_0_4) + (t164[j] * c_86_0_5);
j_86_0_0[j] = t188[j] * -1;
j_86_0_1[j] = t181[j] * -1;
j_86_0_2[j] = t187[j] * -1;
k_87_0_0[j] = t162[j] * c_87_0_0;
k_87_0_1[j] = (t158[j] * c_87_0_4) + (t152[j] * c_87_0_5);
k_87_0_2[j] = t160[j] * c_87_0_0;
k_87_0_3[j] = (t149[j] * c_87_0_5) + (t155[j] * c_87_0_4);
k_87_0_4[j] = t159[j] * c_87_0_0;
k_87_0_5[j] = (t146[j] * c_87_0_5) + (t153[j] * c_87_0_4);
j_87_0_0[j] = t187[j] * -1;
}
for (int i = 0; i < 4; i += 1)
{
w_1[0][i+8] = w_1_[i+16][0];
w_1[1][i+8] = w_1_[i+20][0];
}
On rank 1:
for (int j = 0; j < 12; j += 1)
{
k_84_0_0[j] = t180[j] * c_84_0_0;
k_84_0_1[j] = (t176[j] * c_84_0_2) + (t170[j] * c_84_0_3);
k_84_0_2[j] = t178[j] * c_84_0_0;
k_84_0_3[j] = (t167[j] * c_84_0_3) + (t173[j] * c_84_0_2);
k_84_0_4[j] = t177[j] * c_84_0_0;
k_84_0_5[j] = t171[j] * c_84_0_2;
k_84_0_6[j] = t164[j] * c_84_0_3;
j_84_0_0[j] = t198[j] * -1;
j_84_0_1[j] = t196[j] * -1;
j_84_0_2[j] = t195[j] * -1;
k_85_0_0[j] = t162[j] * c_85_0_0;
k_85_0_1[j] = (t158[j] * c_85_0_2) + (t152[j] * c_85_0_3);
k_85_0_2[j] = t160[j] * c_85_0_0;
k_85_0_3[j] = (t149[j] * c_85_0_3) + (t155[j] * c_85_0_2);
k_85_0_4[j] = t159[j] * c_85_0_0;
k_85_0_5[j] = (t146[j] * c_85_0_3) + (t153[j] * c_85_0_2);
j_85_0_0[j] = t195[j] * -1;
j_85_0_1[j] = t198[j] * -1;
k_86_0_0[j] = t180[j] * c_86_0_0;
k_86_0_1[j] = (t176[j] * c_86_0_4) + (t170[j] * c_86_0_5);
k_86_0_2[j] = t178[j] * c_86_0_0;
k_86_0_3[j] = (t167[j] * c_86_0_5) + (t173[j] * c_86_0_4);
k_86_0_4[j] = t177[j] * c_86_0_0;
k_86_0_5[j] = (t171[j] * c_86_0_4) + (t164[j] * c_86_0_5);
j_86_0_0[j] = t188[j] * -1;
j_86_0_1[j] = t181[j] * -1;
j_86_0_2[j] = t187[j] * -1;
k_87_0_0[j] = t162[j] * c_87_0_0;
k_87_0_1[j] = (t158[j] * c_87_0_4) + (t152[j] * c_87_0_5);
k_87_0_2[j] = t160[j] * c_87_0_0;
k_87_0_3[j] = (t149[j] * c_87_0_5) + (t155[j] * c_87_0_4);
k_87_0_4[j] = t159[j] * c_87_0_0;
k_87_0_5[j] = (t146[j] * c_87_0_5) + (t153[j] * c_87_0_4);
j_87_0_0[j] = t187[j] * -1;
}
for (int j = 0; j < 12; j += 1)
{
ip_k_84_0_0[j] = t240[j] * -1;
ip_k_84_0_1[j] = t234[j] * -1;
ip_k_84_0_2[j] = t245[j] * -1;
ip_k_84_0_3[j] = t133[facet[0]][ip][j] * t205;
ip_k_84_0_4[j] = t237[j] * -1;
ip_k_84_0_5[j] = t231[j] * -1;
ip_k_84_0_6[j] = t238[j] * -1;
ip_k_84_1_0[j] = (k_84_0_0[j] + (t237[j] * c_84_0_1)) + k_84_0_1[j];
ip_k_84_1_1[j] = (k_84_0_2[j] + ((t245[j] * c_84_0_4) + (t240[j] * c_84_0_5))) + ((t234[j] * c_84_0_1) + k_84_0_3[j]);
ip_k_84_1_2[j] = ((k_84_0_4[j] + (t231[j] * c_84_0_1)) + ((t238[j] * c_84_0_5) + k_84_0_5[j])) + k_84_0_6[j];
ip_j_84_1_0[j] = j_84_0_0[j] + (t133[facet[0]][ip][j] * c_84_0_4);
ip_j_84_1_1[j] = j_84_0_1[j] + (t135[facet[0]][ip][j] * c_84_0_4);
ip_j_84_1_2[j] = j_84_0_2[j] + (t133[facet[0]][ip][j] * c_84_0_5);
ip_k_85_0_0[j] = t219[j] * -1;
ip_k_85_0_1[j] = t217[j] * -1;
ip_k_85_0_2[j] = t134[facet[1]][ip][j] * t204;
ip_k_85_0_3[j] = t223[j] * -1;
ip_k_85_0_4[j] = t224[j] * -1;
ip_k_85_0_5[j] = t213[j] * -1;
ip_k_85_0_6[j] = t133[facet[1]][ip][j] * t204;
ip_k_85_0_7[j] = t216[j] * -1;
ip_k_85_0_8[j] = t210[j] * -1;
ip_k_85_1_0[j] = (k_85_0_0[j] + (t216[j] * c_85_0_1)) + k_85_0_1[j];
ip_k_85_1_1[j] = (k_85_0_2[j] + ((t224[j] * c_85_0_5) + (t219[j] * c_85_0_4))) + ((t213[j] * c_85_0_1) + k_85_0_3[j]);
ip_k_85_1_2[j] = (k_85_0_4[j] + ((t223[j] * c_85_0_5) + (t217[j] * c_85_0_4))) + ((t210[j] * c_85_0_1) + k_85_0_5[j]);
ip_j_85_1_0[j] = j_85_0_0[j] + (t133[facet[0]][ip][j] * c_85_0_4);
ip_j_85_1_1[j] = j_85_0_1[j] + (t133[facet[0]][ip][j] * c_85_0_5);
ip_k_86_0_0[j] = t133[facet[0]][ip][j] * t205;
ip_k_86_0_1[j] = t247[j] * -1;
ip_k_86_0_2[j] = t240[j] * -1;
ip_k_86_0_3[j] = t134[facet[0]][ip][j] * t205;
ip_k_86_0_4[j] = t238[j] * -1;
ip_k_86_0_5[j] = t237[j] * -1;
ip_k_86_0_6[j] = t243[j] * -1;
ip_k_86_0_7[j] = t234[j] * -1;
ip_k_86_1_0[j] = (k_86_0_0[j] + ((t247[j] * c_86_0_1) + (t243[j] * c_86_0_2))) + ((t237[j] * c_86_0_3) + k_86_0_1[j]);
ip_k_86_1_1[j] = (k_86_0_2[j] + (t240[j] * c_86_0_2)) + ((t234[j] * c_86_0_3) + k_86_0_3[j]);
ip_k_86_1_2[j] = (k_86_0_4[j] + (t238[j] * c_86_0_2)) + k_86_0_5[j];
ip_j_86_1_0[j] = j_86_0_0[j] + (t134[facet[1]][ip][j] * c_86_0_1);
ip_j_86_1_1[j] = j_86_0_1[j] + (t135[facet[1]][ip][j] * c_86_0_3);
ip_j_86_1_2[j] = j_86_0_2[j] + (t135[facet[1]][ip][j] * c_86_0_1);
ip_k_87_0_0[j] = t133[facet[1]][ip][j] * t204;
ip_k_87_0_1[j] = t210[j] * -1;
ip_k_87_0_2[j] = t226[j] * -1;
ip_k_87_0_3[j] = t224[j] * -1;
ip_k_87_0_4[j] = t219[j] * -1;
ip_k_87_0_5[j] = t134[facet[1]][ip][j] * t204;
ip_k_87_0_6[j] = t217[j] * -1;
ip_k_87_0_7[j] = t216[j] * -1;
ip_k_87_0_8[j] = t222[j] * -1;
ip_k_87_0_9[j] = t213[j] * -1;
ip_k_87_1_0[j] = (k_87_0_0[j] + ((t222[j] * c_87_0_1) + (t226[j] * c_87_0_2))) + ((t216[j] * c_87_0_3) + k_87_0_1[j]);
ip_k_87_1_1[j] = (k_87_0_2[j] + ((t224[j] * c_87_0_2) + (t219[j] * c_87_0_1))) + ((t213[j] * c_87_0_3) + k_87_0_3[j]);
ip_k_87_1_2[j] = (k_87_0_4[j] + (t217[j] * c_87_0_1)) + ((t210[j] * c_87_0_3) + k_87_0_5[j]);
ip_j_87_1_0[j] = j_87_0_0[j] + (t135[facet[1]][ip][j] * c_87_0_2);
}
for (int i = 0; i < 4; i += 1)
{
w_1[0][i+8] = w_1_[i+16][0];
w_1[1][i+8] = w_1_[i+20][0];
}
The resulting difference between the loops:
@@ -10,9 +10,9 @@
ip_k_84_1_0[j] = (k_84_0_0[j] + (t237[j] * c_84_0_1)) + k_84_0_1[j];
ip_k_84_1_1[j] = (k_84_0_2[j] + ((t245[j] * c_84_0_4) + (t240[j] * c_84_0_5))) + ((t234[j] * c_84_0_1) + k_84_0_3[j]);
ip_k_84_1_2[j] = ((k_84_0_4[j] + (t231[j] * c_84_0_1)) + ((t238[j] * c_84_0_5) + k_84_0_5[j])) + k_84_0_6[j];
- ip_j_84_1_0[j] = j_84_0_0[j] + (t133[facet[0]][ip][j] * c_84_0_4);
+ ip_j_84_1_0[j] = j_85_0_1[j] + (t133[facet[0]][ip][j] * c_84_0_4);
ip_j_84_1_1[j] = j_84_0_1[j] + (t135[facet[0]][ip][j] * c_84_0_4);
- ip_j_84_1_2[j] = j_84_0_2[j] + (t133[facet[0]][ip][j] * c_84_0_5);
+ ip_j_84_1_2[j] = j_85_0_0[j] + (t133[facet[0]][ip][j] * c_84_0_5);
ip_k_85_0_0[j] = t219[j] * -1;
ip_k_85_0_1[j] = t217[j] * -1;
ip_k_85_0_2[j] = t134[facet[1]][ip][j] * t204;
@@ -40,7 +40,7 @@
ip_k_86_1_2[j] = (k_86_0_4[j] + (t238[j] * c_86_0_2)) + k_86_0_5[j];
ip_j_86_1_0[j] = j_86_0_0[j] + (t134[facet[1]][ip][j] * c_86_0_1);
ip_j_86_1_1[j] = j_86_0_1[j] + (t135[facet[1]][ip][j] * c_86_0_3);
- ip_j_86_1_2[j] = j_86_0_2[j] + (t135[facet[1]][ip][j] * c_86_0_1);
+ ip_j_86_1_2[j] = j_87_0_0[j] + (t135[facet[1]][ip][j] * c_86_0_1);
ip_k_87_0_0[j] = ip_k_85_0_6[j];
ip_k_87_0_1[j] = ip_k_85_0_8[j];
ip_k_87_0_2[j] = t226[j] * -1;
Yes, that's what I thought although I'm still unable to reproduce the bug. That's why I pointed you to those lines.
Can you replace these two lines:
found_nests = defaultdict(list)
---> OrderedDict()
found_nests[key].append(loops[-1])
---> found_nests.setdefault(key, []).append(loops[-1])
and see if the problem goes away?
Sorry I can't test it myself
That seems to work.
Right, thanks, fix in #92
In a form, I don't know what right now, I get different code generated across MPI ranks:
Here are two kernels:
and
Thoughts on where this might be breaking?