Open will-saunders-ukaea opened 1 month ago
Another backtrace example:
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/interpolation.py", line 837, in _interpolate
assembled_interpolator = self.frozen_assembled_interpolator
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'SameMeshInterpolator' object has no attribute 'frozen_assembled_interpolator'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/global_kernel.py", line 349, in __call__
func = self._func_cache[key]
~~~~~~~~~~~~~~~~^^^^^
KeyError: 125673324463536
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/js0259/git-ukaea/NESODrake-TwoStream/examples/prototype.py", line 164, in <module>
out_rho.write(rho, poisson_rhs)
File "petsc4py/PETSc/Log.pyx", line 188, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "petsc4py/PETSc/Log.pyx", line 189, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/output/vtk_output.py", line 648, in write
vtu = self._write_vtu(*functions)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/output/vtk_output.py", line 531, in _write_vtu
functions = tuple(self._prepare_output(f, max_elem)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/output/vtk_output.py", line 531, in <genexpr>
functions = tuple(self._prepare_output(f, max_elem)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/pyadjoint/pyadjoint/tape.py", line 110, in wrapper
return function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/output/vtk_output.py", line 486, in _prepare_output
output.interpolate(function)
File "petsc4py/PETSc/Log.pyx", line 188, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "petsc4py/PETSc/Log.pyx", line 189, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/function.py", line 413, in interpolate
return assemble(interp, tensor=self, ad_block_tag=ad_block_tag)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "petsc4py/PETSc/Log.pyx", line 188, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "petsc4py/PETSc/Log.pyx", line 189, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/adjoint_utils/assembly.py", line 30, in wrapper
output = assemble(form, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py", line 133, in assemble
return get_assembler(expr, *args, **kwargs).assemble(tensor=tensor)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py", line 381, in assemble
result = BaseFormAssembler.base_form_postorder_traversal(self._form, visitor, visited)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py", line 603, in base_form_postorder_traversal
visited[e] = visitor(e, *(visited[arg] for arg in operands))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py", line 377, in visitor
return self.base_form_assembly_visitor(e, t, *operands)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py", line 550, in base_form_assembly_visitor
return firedrake.Interpolator(expression, tensor, **interp_data)._interpolate(default_missing_val=default_missing_val)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "petsc4py/PETSc/Log.pyx", line 188, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "petsc4py/PETSc/Log.pyx", line 189, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/interpolation.py", line 840, in _interpolate
assembled_interpolator = self.callable()
^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/interpolation.py", line 1005, in callable
l()
File "/home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/parloop.py", line 241, in compute
self()
File "petsc4py/PETSc/Log.pyx", line 188, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "petsc4py/PETSc/Log.pyx", line 189, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "/home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/parloop.py", line 251, in __call__
self._compute(self.iterset.core_part)
File "/home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/parloop.py", line 232, in _compute
self.global_kernel(self.comm, part.offset, part.offset+part.size, *self.arglist)
File "/home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/global_kernel.py", line 351, in __call__
func = self.compile(comm)
^^^^^^^^^^^^^^^^^^
File "petsc4py/PETSc/Log.pyx", line 188, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "petsc4py/PETSc/Log.pyx", line 189, in petsc4py.PETSc.Log.EventDecorator.decorator.wrapped_func
File "/home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/global_kernel.py", line 422, in compile
return compilation.load(self, extension, self.name,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/compilation.py", line 596, in load
dll = compiler(cppargs, ldargs, cpp=cpp, comm=comm).get_so(code, extension)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/compilation.py", line 339, in get_so
raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
pyop2.exceptions.CompilationError: Generated code differs across ranks (see output in /home/js0259/venvs/nesodrake-2024_08_12/.cache/pyop2/mismatching-kernels)
Abort(1) on node 1 (rank 1 in comm 496): application called MPI_Abort(PYOP2_COMM_WORLD, 1) - process 1
Can you share the differing bits of generated code?
This looks suspiciously similar to a parallel cache coherency issue I ran into recently (noted in these minutes). Are you running your code over multiple communicators, or just COMM_WORLD?
Also it would be good to see the differing code as Connor says: Take a look in /home/js0259/venvs/nesodrake-2024_08_12/.cache/pyop2/mismatching-kernels
On the Firedrake side there is just COMM_WORLD. We have some communicators that we create/destroy in C++, but these are never passed to Firedrake.
The mismatched files are hopefully attached: mismatching-kernels.zip
I think this is the call stack for these files:
[20] > /home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/compilation.py(340)get_so()
-> raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
(Pdb++) bt
[0] /home/js0259/git-ukaea/NESODrake-TwoStream/examples/prototype.py(164)<module>()
-> out_rho.write(rho, poisson_rhs)
[1] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/output/vtk_output.py(648)write()
-> vtu = self._write_vtu(*functions)
[2] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/output/vtk_output.py(529)_write_vtu()
-> coordinates = self._prepare_output(mesh.coordinates, max_elem)
[3] /home/js0259/venvs/nesodrake-2024_08_12/src/pyadjoint/pyadjoint/tape.py(110)wrapper()
-> return function(*args, **kwargs)
[4] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/output/vtk_output.py(486)_prepare_output()
-> output.interpolate(function)
[5] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/function.py(413)interpolate()
-> return assemble(interp, tensor=self, ad_block_tag=ad_block_tag)
[6] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/adjoint_utils/assembly.py(30)wrapper()
-> output = assemble(form, *args, **kwargs)
[7] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py(133)assemble()
-> return get_assembler(expr, *args, **kwargs).assemble(tensor=tensor)
[8] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py(381)assemble()
-> result = BaseFormAssembler.base_form_postorder_traversal(self._form, visitor, visited)
[9] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py(603)base_form_postorder_traversal()
-> visited[e] = visitor(e, *(visited[arg] for arg in operands))
[10] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py(377)visitor()
-> return self.base_form_assembly_visitor(e, t, *operands)
[11] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/assemble.py(550)base_form_assembly_visitor()
-> return firedrake.Interpolator(expression, tensor, **interp_data)._interpolate(default_missing_val=default_missing_val)
[12] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/interpolation.py(840)_interpolate()
-> assembled_interpolator = self.callable()
[13] /home/js0259/venvs/nesodrake-2024_08_12/src/firedrake/firedrake/interpolation.py(1005)callable()
-> l()
[14] /home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/parloop.py(241)compute()
-> self()
[15] /home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/parloop.py(251)__call__()
-> self._compute(self.iterset.core_part)
[16] /home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/parloop.py(232)_compute()
-> self.global_kernel(self.comm, part.offset, part.offset+part.size, *self.arglist)
[17] /home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/global_kernel.py(351)__call__()
-> func = self.compile(comm)
[18] /home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/global_kernel.py(422)compile()
-> return compilation.load(self, extension, self.name,
[19] /home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/compilation.py(597)load()
-> dll = compiler(cppargs, ldargs, cpp=cpp, comm=comm).get_so(code, extension)
[20] > /home/js0259/venvs/nesodrake-2024_08_12/src/PyOP2/pyop2/compilation.py(340)get_so()
-> raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
(Pdb++)
At the interpolation call [11], one rank is trying to interpolate a scalar-valued function and the other rank a vector-valued function. I do have vtu output calls for both a vector-valued function and a scalar-valued function, but both backtraces are from the scalar-valued write call.
# One rank:
(Pdb++) p expression
Coefficient(WithGeometry(FunctionSpace(<firedrake.mesh.MeshTopology object at 0x728e9c1041d0>, VectorElement(FiniteElement('Discontinuous Lagrange', triangle, 1, variant='equispaced'), dim=2, variant='equispaced'), name=None), Mesh(VectorElement(FiniteElement('Discontinuous Lagrange', triangle, 1, variant='equispaced'), dim=2, variant='equispaced'), 14)), 53)
(Pdb++)
# The other rank:
(Pdb++) p expression
Coefficient(WithGeometry(FunctionSpace(<firedrake.mesh.MeshTopology object at 0x734d44054410>, FiniteElement('Discontinuous Lagrange', triangle, 1), name=None), Mesh(VectorElement(FiniteElement('Discontinuous Lagrange', triangle, 1, variant='equispaced'), dim=2, variant='equispaced'), 14)), 36)
One rank, the "vector" one, is on line 529 of firedrake/output/vtk_output.py, processing the mesh coordinates, and the second rank, the "scalar" one, has progressed to line 531 to process the actual function to write.
I've managed to strip out all our particle gubbins to leave just Firedrake calls which still fail; see below. It's still quite involved, so I will continue to prune what I can.
Edit: Instructions. To get this to fail, I:
1) run firedrake-clean
2) run OMP_NUM_THREADS=1 mpiexec -n 2 python mfe.py
from firedrake import *
from firedrake.__future__ import interpolate, Interpolator
import sys
import time
from petsc4py import PETSc; PETSc.Sys.popErrorHandler()
if __name__ == "__main__":
    # Minimal failing example: builds a mixed BDM x DG Poisson problem on a
    # periodic triangular mesh, solves it once and writes two DG fields to VTK.
    #
    # NOTE(review): the indentation of this script was lost when it was pasted;
    # the block structure below is reconstructed. Statement order is kept
    # exactly as posted because the reported failure is order-sensitive.

    # integrator = boris
    num_steps = 1
    # The next three counters, num_particles and dt are not referenced anywhere
    # below; presumably left over from the larger prototype this was pruned from.
    num_print_steps = 1
    num_write_steps = 1
    num_energy_steps = 1
    num_cells_y = 3
    num_cells_x = 100
    num_particles = 1
    dt = 0.001
    p = 1
    mesh_width = 0.01

    # Triangular (quadrilateral=False) periodic mesh of extent 1.0 x mesh_width.
    mesh = PeriodicRectangleMesh(num_cells_x, num_cells_y, 1.0, mesh_width, quadrilateral=False)
    BDM = FunctionSpace(mesh, "BDM", p + 1)
    DG = FunctionSpace(mesh, "DG", p)
    W = BDM * DG

    # This E is rebound further down to the first subfunction of w.
    E = Function(BDM)
    rho = Function(DG, name="rho")
    neutralising_field = Function(DG, name="neutralising_field")
    poisson_rhs = Function(DG, name="poisson_rhs")
    net_charge_density = 105.2
    neutralising_field.interpolate(net_charge_density)
    poisson_rhs.interpolate(neutralising_field - rho)

    # weak form
    sigma, u = TrialFunctions(W)
    tau, v = TestFunctions(W)
    a = (dot(sigma, tau) + div(tau) * u + div(sigma) * v) * dx
    L = poisson_rhs * v * dx
    w = Function(W)
    E, phi = w.subfunctions

    # Passed as `nullspace` to solve() below: W.sub(0) for the first component
    # and a constant basis for the second.
    nullspace = MixedVectorSpaceBasis(W, [W.sub(0), VectorSpaceBasis(constant=True)])

    poisson_rhs.interpolate(neutralising_field - rho)
    rho_integral = assemble(rho * dx)
    rhs_integral = assemble(poisson_rhs * dx)
    # Only rank 0 prints, so the integrals appear once in a parallel run.
    if mpi.COMM_WORLD.rank == 0:
        print("rho_integral:", rho_integral)
        print("rhs_integral:", rhs_integral)

    out_rho = VTKFile("rho.pvd")
    # out_E is created but never written to in this script.
    out_E = VTKFile("E.pvd")

    for stepx in range(num_steps):
        poisson_rhs.interpolate(neutralising_field - rho)
        solve(a == L, w, bcs=[], nullspace=nullspace)
        # NOTE(review): assumed to sit inside the loop; with num_steps = 1 this
        # is equivalent to writing once after the loop.
        out_rho.write(rho, poisson_rhs)
I haven't seen it fail if I comment out the solve call.
not the issue here but you can use PETSc.Sys.Print to ensure only one print statement in parallel.
not the issue here but you can use PETSc.Sys.Print to ensure only one print statement in parallel.
Oh nice, thanks.
Describe the bug Running with multiple ranks intermittently causes errors to be thrown like:
I have a Firedrake install from 17/07/2024 which does not exhibit the issue and an install from 12/08/2024 which does.
Steps to Reproduce Steps to reproduce the behavior: WIP MFE - Because the error is intermittent, obtaining a backtrace that isolates the behaviour well enough to build an MFE around it is still a work in progress.
I have had the most success reproducing the error by: 1) cleaning the cache, 2) launching the code sequentially (this runs), and 3) launching it with 2 ranks.
Working on MFE.
Expected behaviour Runs without error.
Error message
Environment:
firedrake-status