idaholab / moose

Multiphysics Object Oriented Simulation Environment
https://www.mooseframework.org
GNU Lesser General Public License v2.1
1.76k stars 1.05k forks source link

Floating Point Exception in Debug after Update #28350

Open GiudGiud opened 2 months ago

GiudGiud commented 2 months ago

Bug description

A floating point exception is generated and then caught by the FPExceptionGuard as intended. However, we don't have the location of the exception even with the necessary breakpoint set, which is not as intended.

How to reproduce

On a Linux machine, use this input:

[GlobalParams]
displacements = 'disp_x disp_y disp_z'
use_displaced_mesh = false
[]

Box1_inactive_name = 'Box1_inactive'
Box1Hull_LowerDElements_inactive_name = 'Box1Hull_LowerDElements_inactive'
inactive_domain_block_names = '${Box1_inactive_name} ${Box1Hull_LowerDElements_inactive_name}'

[Problem]
solve = true
kernel_coverage_check = SKIP_LIST
kernel_coverage_block_list = '${inactive_domain_block_names}'
material_coverage_check = SKIP_LIST
material_coverage_block_list = '${inactive_domain_block_names}'
[]

[Mesh]
[BaseMesh]
  type = GeneratedMeshGenerator
  subdomain_name = 'BaseMesh'
  elem_type = 'HEX8'
  dim = 3
  nx = 6
  ny = 6
  nz = 2
  xmin = -3
  xmax = +3
  ymin = -3
  ymax = +3
  zmin = -2
  zmax = +2
[]

[Box1]
  type = SubdomainBoundingBoxGenerator
  block_name = 'Box1'
  input = "BaseMesh"
  block_id = 1
  location = "INSIDE"
  bottom_left = "-1.0 -1.0 -0"
  top_right = "+1.0 +1.0 +2"
[]

[Box1Hull_Sideset]
  type = SideSetsAroundSubdomainGenerator
  input = 'Box1'
  block = 'Box1'
  include_only_external_sides = true
  new_boundary = 'Box1Hull_Sideset'
[]

[Box1Hull_LowerDElements]
  type = LowerDBlockFromSidesetGenerator
  input = 'Box1Hull_Sideset'
  sidesets = 'Box1Hull_Sideset'
  new_block_name = 'Box1Hull_LowerDElements'
[]

add_subdomain_names = ${inactive_domain_block_names}
[]

[Physics]
[SolidMechanics]
  [QuasiStatic]
    [all]
      strain = SMALL
      incremental = true
      add_variables = true
      generate_output = 'stress_xx stress_yy stress_zz'
      block = 'BaseMesh Box1'
    []
  []
[]
[]

# ===== Kernels for Shell-Elements
[Kernels]
[shell_disp_x]
  type = ADStressDivergenceShell
  block = 'Box1Hull_LowerDElements'
  component = 0
  variable = disp_x
  through_thickness_order = SECOND
[]
[shell_disp_y]
  type = ADStressDivergenceShell
  block = 'Box1Hull_LowerDElements'
  component = 1
  variable = disp_y
  through_thickness_order = SECOND
[]
[shell_disp_z]
  type = ADStressDivergenceShell
  block = 'Box1Hull_LowerDElements'
  component = 2
  variable = disp_z
  through_thickness_order = SECOND
[]
[shell_rot_x]
  type = ADStressDivergenceShell
  block = 'Box1Hull_LowerDElements'
  component = 3
  variable = rot_x
  through_thickness_order = SECOND
[]
[shell_rot_y]
  type = ADStressDivergenceShell
  block = 'Box1Hull_LowerDElements'
  component = 4
  variable = rot_y
  through_thickness_order = SECOND
[]
[]

# ===== Dummy-Kernels for Inactive Domain =====
[Kernels]
[./donothing1]
  type = MatDiffusion
  block = '${inactive_domain_block_names}'
  variable = disp_z
  diffusivity = 1e-7
[]
[./donothing2]
  type = MatDiffusion
  block = '${inactive_domain_block_names}'
  variable = disp_x
  diffusivity = 1e-7
[]
[./donothing3]
  type = MatDiffusion
  block = '${inactive_domain_block_names}'
  variable = disp_y
  diffusivity = 1e-7
[]
[]

[Variables]
[./rot_x]
  order = FIRST
  family = LAGRANGE
  block = 'Box1Hull_LowerDElements'
[]
[./rot_y]
  order = FIRST
  family = LAGRANGE
  block = 'Box1Hull_LowerDElements'
[]
[]

[AuxVariables]
[dummy]
  type = MooseVariableFVReal
[]
[]

# fix the lower model boundary in y and z direction
[BCs]
[./back_fix_y]
  type = DirichletBC
  variable = disp_y
  boundary = 'back'
  value = 0.0
[]
[./back_fix_z]
  type = DirichletBC
  variable = disp_z
  boundary = 'back'
  value = 0.0
[]
[]

# fix the left model boundary in x direction
[BCs]
[./left_fix_x]
  type = DirichletBC
  variable = disp_x
  boundary = 'left'
  value = 0.0
[]
[]

# put some pressure on the right model boundary
[BCs]
[./right_Dirichlet]
  type = FunctionDirichletBC
  variable = disp_x
  boundary = 'right'
  function = right_pressure_function
[]
[]
[Functions]
[right_pressure_function]
  type = ParsedFunction
  expression = '-0.001 * t'
[]
[]

# Material: Volume Elements
[Materials]

[elasticity_tensor]
  type = ComputeIsotropicElasticityTensor
  block = 'BaseMesh Box1'
  youngs_modulus = 1e6
  poissons_ratio = 0.25
[]

[stress]
  type = ComputeFiniteStrainElasticStress
  block = 'BaseMesh Box1'
[]

[]

# Material: Shell Elements
[Materials]
[shell_elasticity]
  type = ADComputeIsotropicElasticityTensorShell
  block = 'Box1Hull_LowerDElements'
  youngs_modulus = 1e9
  poissons_ratio = 0.3
  through_thickness_order = SECOND
[]
[shell_strain]
  type = ADComputeIncrementalShellStrain
  block = 'Box1Hull_LowerDElements'
  displacements = 'disp_x disp_y disp_z'
  rotations = 'rot_x rot_y'
  thickness = 1.0
  through_thickness_order = SECOND
[]
[shell_stress]
  type = ADComputeShellStress
  block = 'Box1Hull_LowerDElements'
  through_thickness_order = SECOND
[]
[]

# move elements between subdomains back and forth
[UserObjects]
[GlobalSubdomainModifier]
  type = TimedSubdomainModifier
  times = '0.2 0.4'
  blocks_from = 'Box1 Box1Hull_LowerDElements'
  blocks_to = 'Box1_inactive Box1Hull_LowerDElements_inactive'
  execute_on = 'INITIAL TIMESTEP_BEGIN'
[]
[]

[Preconditioning]
[./SMP]
  type = SMP
  full = true
[]
[]

[Executioner]
type = Transient

end_time = 1.0
dtmin = 0.001
[TimeSteppers]
  [BlockEventTimeStepper]
    type = TimeSequenceStepper
    time_sequence = '0.05 0.1 0.2 0.4 1.0'
  []
[]

solve_type = 'PJFNK'
petsc_options = '-snes_converged_reason'
petsc_options_iname = '-pc_type -pc_factor_mat_solver_package'
petsc_options_value = ' lu       mumps'

l_tol = 1E-10
l_max_its = 20

nl_abs_tol = 1E-10
nl_rel_tol = 1e-8
nl_max_its = 20

[]

[Outputs]
exodus = true
[]

Impact

Cannot simulate in debug, problem for debugging and developing

Discussed in https://github.com/idaholab/moose/discussions/28252

Originally posted by **Flolaffel** July 29, 2024 ### Check these boxes if you have followed the posting rules. - [X] Q&A General is the most appropriate section for my question - [X] I have consulted the posting Guidelines on the Discussions front page - [X] I have searched the Discussions forum and my question has not been asked before - [X] I have searched the MOOSE website and the documentation does not answer my question - [X] I have formatted my post following the posting guidelines (screenshots as a last resort, triple back quotes around pasted text) ### Question Hello, I updated MOOSE and Conda today and now I can't use the debug build anymore. I followed these steps from the [New Users](https://mooseframework.inl.gov/getting_started/new_users.html) page to update. ``` cd ~/projects/moose git fetch origin git rebase origin/master ``` ``` conda activate base conda env remove -n moose conda create -n moose moose-dev=2024.07.19 conda activate moose ``` ``` cd ~/projects/YourAppName make clobberall make ``` The input doesn't matter. Top opt build is working fine. But as soon as I try to debug, this happens: ``` Time Step 0, time = 0 Time Step 1, time = 0.05, dt = 0.05 Floating point exception signaled (invalid floating point operation)! libMesh terminating: To track this down, compile in debug mode, then in gdb do: break libmesh_handleFPE run ... 
bt Stack frames: 35 0: libMesh::print_trace(std::ostream&) 1: libMesh::MacroFunctions::report_error(char const*, int, char const*, char const*, std::ostream&) 2: /home/.local/ruef/miniforge/envs/moose/libmesh/lib/libmesh_dbg.so.0(+0x222e6e3) [0x7fb88922e6e3] 3: /lib/x86_64-linux-gnu/libc.so.6(+0x3bf90) [0x7fb87f6ccf90] 4: fedisableexcept 5: libMesh::enableFPE(bool) 6: /home/.local/ruef/projects/moose/framework/libmoose-dbg.so.0(+0x20421ec) [0x7fb88ee421ec] 7: NonlinearSystemBase::computeResidualTags(std::set, std::allocator > const&) 8: FEProblemBase::computeResidualTags(std::set, std::allocator > const&) 9: FEProblemBase::computeResidualInternal(libMesh::NumericVector const&, libMesh::NumericVector&, std::set, std::allocator > const&) 10: FEProblemBase::computeResidual(libMesh::NumericVector const&, libMesh::NumericVector&, unsigned int) 11: FEProblemBase::computeResidualSys(libMesh::NonlinearImplicitSystem&, libMesh::NumericVector const&, libMesh::NumericVector&) 12: ComputeResidualFunctor::residual(libMesh::NumericVector const&, libMesh::NumericVector&, libMesh::NonlinearImplicitSystem&) 13: libmesh_petsc_snes_residual 14: SNESComputeFunction 15: /home/.local/ruef/miniforge/envs/moose/petsc/lib/libpetsc.so.3.20(+0xf6937f) [0x7fb88316937f] 16: SNESSolve 17: libMesh::PetscNonlinearSolver::solve(libMesh::SparseMatrix&, libMesh::NumericVector&, libMesh::NumericVector&, double, unsigned int) 18: libMesh::NonlinearImplicitSystem::solve() 19: TimeIntegrator::solve() 20: NonlinearSystem::solve() 21: FEProblemBase::solve(unsigned int) 22: FEProblemSolve::solve() 23: FixedPointSolve::solveStep(double&, double&, std::set, std::allocator > const&) 24: FixedPointSolve::solve() 25: TimeStepper::step() 26: Transient::takeStep(double) 27: Transient::execute() 28: MooseApp::executeExecutioner() 29: MooseApp::run() 30: ./simp-dbg(+0x3e80) [0x5582b63c4e80] 31: main 32: /lib/x86_64-linux-gnu/libc.so.6(+0x2718a) [0x7fb87f6b818a] 33: __libc_start_main 34: ./simp-dbg(+0x3079) 
[0x5582b63c4079] [0] ../src/base/libmesh.C, line 139, compiled Jul 19 2024 at 13:27:54 -------------------------------------------------------------------------- MPI_ABORT was invoked on rank 0 in communicator MPI_COMM_WORLD with errorcode 1. NOTE: invoking MPI_ABORT causes Open MPI to kill all MPI processes. You may or may not see output from other processes, depending on exactly when Open MPI kills them. -------------------------------------------------------------------------- ``` What should I do?
pbehne commented 2 months ago

To summarize the above, the issue only occurs in dbg builds on linux.

The issue seems to be a bug in HDF5 when creating exodus files. The function where the FPE is occurring is HDF5's H5Eset_auto2. The backtrace is listed below. While stepping through code in gdb, the following command has been useful in determining whether a FPE has occurred yet: call (int)fetestexcept(FE_DIVBYZERO | FE_INVALID) This command returns 1 if a FPE has occurred and 0 otherwise.

The version of HDF5 in the conda environments I used is 1.14.3. I created a conda environment using moose-dev hdf5=1.12.1, and the input ran without crashing. This indicates a bug introduced in newer versions of HDF5. Tagging @milljm so he is aware there seems to be a bug in the version of hdf5 shipped in our conda packages.

I have not pinpointed the bug within H5Eset_auto2 because I cannot step into it using gdb due to HDF5 not being compiled with debugging symbols. However, @roystgnr and I were able to reproduce the bug in libMesh's unit test suite. Roy will build a more recent version of HDF5 to see if he can further pinpoint the issue. If now, I will build HDF5 myself and point petsc to use it. Then I should be able to get further with the debugger.

Thanks to @roystgnr, @milljm, and @lindsayad for their help!

Backtrace

#0  0x00007fffdf6a2c70 in H5Eset_auto2 () from /home/behnpa/miniforge/envs/moose_with_libmesh_petsc/lib/libhdf5.so.310
#1  0x00007fffe81a5958 in set_auto (func=0x0, client_data=0x0) at ../../../../../contrib/netcdf/netcdf-c-4.6.2/libhdf5/hdf5internal.c:67
#2  0x00007fffe81a596d in nc4_hdf5_initialize () at ../../../../../contrib/netcdf/netcdf-c-4.6.2/libhdf5/hdf5internal.c:78
#3  0x00007fffe81aff4d in NC4_initialize () at ../../../../../contrib/netcdf/netcdf-c-4.6.2/libsrc4/nc4dispatch.c:139
#4  0x00007fffe8145615 in nc_initialize () at ../../../../../contrib/netcdf/netcdf-c-4.6.2/liblib/nc_initialize.c:91
#5  0x00007fffe8149d2d in NC_create (path0=0x7fffffff8d00 "input_out.e", cmode=768, initialsz=0, basepe=0, chunksizehintp=0x0, 
    useparallel=0, parameters=0x0, ncidp=0x7fffffff890c) at ../../../../../contrib/netcdf/netcdf-c-4.6.2/libdispatch/dfile.c:2036
#6  0x00007fffe81492f2 in nc__create (path=0x7fffffff8d00 "input_out.e", cmode=768, initialsz=0, chunksizehintp=0x0, ncidp=0x7fffffff890c)
    at ../../../../../contrib/netcdf/netcdf-c-4.6.2/libdispatch/dfile.c:629
#7  0x00007fffe81492ab in nc_create (path=0x7fffffff8d00 "input_out.e", cmode=768, ncidp=0x7fffffff890c)
    at ../../../../../contrib/netcdf/netcdf-c-4.6.2/libdispatch/dfile.c:556
#8  0x00007fffeb60acc1 in ex_create_int (path=0x7fffffff8d00 "input_out.e", cmode=8, comp_ws=0x7fffffff8acc, io_ws=0x7fffffff8ac8, 
    run_version=811) at ../../../../../contrib/exodusii/v8.11/exodus/src/ex_create.c:155
#9  0x00007fffeab76a3c in libMesh::ExodusII_IO_Helper::create (this=0x555556171a70, filename=...) at ../src/mesh/exodusII_io_helper.C:2183
#10 0x00007fffeab3f032 in libMesh::ExodusII_IO::write_nodal_data_common (this=0x5555560c21e0, fname=..., names=..., continuous=true)
    at ../src/mesh/exodusII_io.C:2300
#11 0x00007fffeab3c1e3 in libMesh::ExodusII_IO::write_nodal_data (this=0x5555560c21e0, fname=..., soln=..., names=...)
    at ../src/mesh/exodusII_io.C:1824
#12 0x00007fffeae62196 in libMesh::MeshOutput<libMesh::MeshBase>::write_equation_systems (this=0x5555560c2220, fname=..., es=..., 
    system_names=0x0) at ../src/mesh/mesh_output.C:82
#13 0x00007fffeab3d0df in libMesh::ExodusII_IO::write_timestep (this=0x5555560c21e0, fname=..., es=..., timestep=1, time=0, 
    system_names=0x0) at ../src/mesh/exodusII_io.C:2000
#14 0x00007ffff5cd9945 in Exodus::outputNodalVariables (this=0x555555fd9730)
    at /data/behnpa/projects/moose_libmesh_test/framework/src/outputs/Exodus.C:321
#15 0x00007ffff5cbb726 in AdvancedOutput::output (this=0x555555fd9730)
    at /data/behnpa/projects/moose_libmesh_test/framework/src/outputs/AdvancedOutput.C:286
#16 0x00007ffff5cda3c4 in Exodus::output (this=0x555555fd9730)
    at /data/behnpa/projects/moose_libmesh_test/framework/src/outputs/Exodus.C:454
#17 0x00007ffff5cea0e3 in OversampleOutput::outputStep (this=0x555555fd9730, type=...)
    at /data/behnpa/projects/moose_libmesh_test/framework/src/outputs/OversampleOutput.C:100
#18 0x00007ffff5ce79ba in OutputWarehouse::outputStep (this=0x55555594ac10, type=...)
    at /data/behnpa/projects/moose_libmesh_test/framework/src/outputs/OutputWarehouse.C:157
#19 0x00007ffff54af5d6 in FEProblemBase::outputStep (this=0x555555d3c7d0, type=...)
    at /data/behnpa/projects/moose_libmesh_test/framework/src/problems/FEProblemBase.C:6291
#20 0x00007ffff431ab95 in Transient::preExecute (this=0x555555da1000)
    at /data/behnpa/projects/moose_libmesh_test/framework/src/executioners/Transient.C:254
#21 0x00007ffff431ad08 in Transient::execute (this=0x555555da1000)
    at /data/behnpa/projects/moose_libmesh_test/framework/src/executioners/Transient.C:283
#22 0x00007ffff46eaa13 in MooseApp::executeExecutioner (this=0x55555594a370)
    at /data/behnpa/projects/moose_libmesh_test/framework/src/base/MooseApp.C:1172
#23 0x00007ffff46f1522 in MooseApp::run (this=0x55555594a370) at /data/behnpa/projects/moose_libmesh_test/framework/src/base/MooseApp.C:1554
#24 0x000055555556350d in Moose::main<SolidMechanicsTestApp> (argc=3, argv=0x7fffffffb8a8)
    at /data/behnpa/projects/moose_libmesh_test/framework/build/header_symlinks/MooseMain.h:47
#25 0x0000555555562407 in main (argc=3, argv=0x7fffffffb8a8)
    at /data/behnpa/projects/moose_libmesh_test/modules/solid_mechanics/src/main.C:17
milljm commented 2 months ago

Pinning HDF5 to 1.12.1 will be hella fun (not).

IIRC it was a requirement to bump HDF5, when we found it necessary to bump MPICH to 4.x, because we found it necessary to bump MPICH due to wanting to support Python 3.11 =D

roystgnr commented 2 months ago

Copying the important bits from slack:

1.14.2 appears to be fine, and at https://github.com/HDFGroup/hdf5/issues/4381 I see they fixed the issue with 1.14.3 and the fix made it into 1.14.4. Testing the latest release (1.14.4.3) seems to confirm that for me. So we ought to be able to get away with either a tiny upgrade or a tiny downgrade, no need to back off all the way to 1.12.

If we can't manage any version change, I think I can get a small (the ifdefs and comments will be longer than the code...) workaround in at the libMesh level; just let me know.

milljm commented 2 months ago

I'll see about getting that in with this PR: https://github.com/idaholab/moose/pull/28399