Exawind / nalu-wind

Solver for wind farm simulations targeting exascale computational platforms
https://exawind.github.io/nalu-wind/
Other
124 stars 85 forks source link

SIGSTOP on CPU in GeometryInteriorAlg #1321

Closed marchdf closed 1 month ago

marchdf commented 1 month ago

I am hitting the following SIGSTOP in a CPU build (lldb backtrace):

(lldb) bt
* thread #1, name = 'naluX', stop reason = signal SIGSTOP
    frame #0: 0x0000555558d34fb4 naluX`sierra::nalu::nalu_ngp::impl::ElemFieldOp<stk::mesh::HostMesh, stk::mesh::HostField<double, stk::mesh::DefaultNgpFieldSyncDebugger>, sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>>::Ops::operator+=(this=0x00007fffe96d42c0, val=0x000052d002029860) const at NgpFieldOps.h:296:24
    frame #1: 0x0000555558d34a01 naluX`sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume(this=0x00007fffe987b608, edata=0x00007fffe9b76020)::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&)::operator()(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&) const at GeometryInteriorAlg.C:106:30
    frame #2: 0x0000555558d344d8 naluX`void sierra::nalu::nalu_ngp::run_elem_algorithm<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager, sierra::nalu::ElemDataRequests, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&)>(this=0x00007fffe9b77210, bktIndex=0x00007fffe94291e0)::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&))::'lambda'(Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial> const&)::operator()(Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial> const&) const::'lambda'(unsigned long const&)::operator()(unsigned long const&) const at NgpLoopUtils.h:388:11
    frame #3: 0x0000555558d33ae2 naluX`void Kokkos::parallel_for<unsigned long, void sierra::nalu::nalu_ngp::run_elem_algorithm<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager, sierra::nalu::ElemDataRequests, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&)>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>, sierra::nalu::nalu_ngp::MeshInfo<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager> const&, stk::topology::rank_t, sierra::nalu::ElemDataRequests const&, stk::mesh::Selector const&, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&))::'lambda'(Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial> const&)::operator()(Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial> const&) const::'lambda'(unsigned long const&), Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial>>(loop_boundaries=0x00007fffe9b771d0, closure=0x00007fffe9b77210, (null)=0x0000000000000000) at Kokkos_HostThreadTeam.hpp:742:5
    frame #4: 0x0000555558d337a2 naluX`void sierra::nalu::nalu_ngp::run_elem_algorithm<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager, sierra::nalu::ElemDataRequests, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&)>(this=0x00007fffe987b490, team=0x00007fffe95bf1a0)::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&))::'lambda'(Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial> const&)::operator()(Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial> const&) const at NgpLoopUtils.h:366:7
    frame #5: 0x0000555558d330c3 naluX`std::enable_if<std::is_void<stk::mesh::HostMesh>::value, void>::type Kokkos::Impl::ParallelFor<void sierra::nalu::nalu_ngp::run_elem_algorithm<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager, sierra::nalu::ElemDataRequests, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&)>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>, sierra::nalu::nalu_ngp::MeshInfo<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager> const&, stk::topology::rank_t, sierra::nalu::ElemDataRequests const&, stk::mesh::Selector const&, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&))::'lambda'(Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial> const&), Kokkos::TeamPolicy<Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic>>, Kokkos::Serial>::exec<void>(this=0x00007fffe987b490, data=0x000050f0000017d8) const at Kokkos_Serial_Parallel_Team.hpp:229:7
    frame #6: 0x0000555558d32952 naluX`Kokkos::Impl::ParallelFor<void sierra::nalu::nalu_ngp::run_elem_algorithm<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager, sierra::nalu::ElemDataRequests, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&)>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>, sierra::nalu::nalu_ngp::MeshInfo<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager> const&, stk::topology::rank_t, sierra::nalu::ElemDataRequests const&, stk::mesh::Selector const&, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&))::'lambda'(Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial> const&), Kokkos::TeamPolicy<Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic>>, Kokkos::Serial>::execute(this=0x00007fffe987b490) const at Kokkos_Serial_Parallel_Team.hpp:257:20
    frame #7: 0x0000555558d31f5c naluX`void Kokkos::parallel_for<Kokkos::TeamPolicy<Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic>>, void sierra::nalu::nalu_ngp::run_elem_algorithm<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager, sierra::nalu::ElemDataRequests, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&)>(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>, sierra::nalu::nalu_ngp::MeshInfo<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager> const&, stk::topology::rank_t, sierra::nalu::ElemDataRequests const&, stk::mesh::Selector const&, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&))::'lambda'(Kokkos::Impl::HostThreadTeamMember<Kokkos::Serial> const&), void>(str=error: summary string parsing error, policy=0x00007fffe9999a40, functor=0x00007fffe9999a90) at Kokkos_Parallel.hpp:144:11
    frame #8: 0x0000555558d1327e naluX`void sierra::nalu::nalu_ngp::run_elem_algorithm<stk::mesh::HostMesh, sierra::nalu::nalu_ngp::FieldManager, sierra::nalu::ElemDataRequests, sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume()::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&)>(algName=error: summary string parsing error, meshInfo=0x0000503000da4b50, rank=ELEM_RANK, dataReqs=0x000051300002caa0, sel=0x00007fffe9876550, algorithm=(unnamed class) @ 0x00007fffe9876750) at NgpLoopUtils.h:356:3
  * frame #9: 0x0000555558d110db naluX`sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume(this=0x000051300002ca40) at GeometryInteriorAlg.C:94:3
    frame #10: 0x0000555558d0fb47 naluX`sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::execute(this=0x000051300002ca40) at GeometryInteriorAlg.C:64:3
    frame #11: 0x0000555558f1739b naluX`sierra::nalu::NgpAlgDriver::execute(this=0x0000508000029d20) at NgpAlgDriver.C:39:16
    frame #12: 0x0000555556bea79d naluX`sierra::nalu::Realm::compute_geometry(this=0x000051c000003880) at Realm.C:2602:23
    frame #13: 0x0000555556bd3e19 naluX`sierra::nalu::Realm::initialize_prolog(this=0x000051c000003880) at Realm.C:544:3
    frame #14: 0x0000555556c8bb8c naluX`sierra::nalu::Realms::initialize_prolog(this=0x00005030000a66f0) at Realms.C:75:12
    frame #15: 0x00005555564466c2 naluX`sierra::nalu::Simulation::initialize(this=0x00007fffe9b00fa0) at Simulation.C:184:12
    frame #16: 0x000055555641be12 naluX`main(argc=5, argv=0x00007fffffff2c58) at nalu.C:197:9
    frame #17: 0x00007fffee6217e5 libc.so.6`__libc_start_main + 229
    frame #18: 0x000055555633d92e naluX`_start + 46

It happens on line 106 of GeometryInteriorAlg.C (frame #1), inside the `operator+=` at NgpFieldOps.h:296 (frame #0):

(lldb) frame s 0
frame #0: 0x0000555558d34fb4 naluX`sierra::nalu::nalu_ngp::impl::ElemFieldOp<stk::mesh::HostMesh, stk::mesh::HostField<double, stk::mesh::DefaultNgpFieldSyncDebugger>, sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>>::Ops::operator+=(this=0x00007fffe96d42c0, val=0x000052d002029860) const at NgpFieldOps.h:296:24
   293  #ifdef STK_SIMD_NONE
   294        fld.get(einfo[0].meshIdx, ic) += stk::simd::get_data(val, 0);
   295  #else
-> 296        for (int is = 0; is < edata_.numSimdElems; ++is) {
   297          fld.get(einfo[is].meshIdx, ic) += stk::simd::get_data(val, is);
   298        }
   299  #endif
(lldb) frame s 1
frame #1: 0x0000555558d34a01 naluX`sierra::nalu::GeometryInteriorAlg<sierra::nalu::AlgTraitsHex8>::impl_compute_dual_nodal_volume(this=0x00007fffe987b608, edata=0x00007fffe9b76020)::'lambda'(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&)::operator()(sierra::nalu::nalu_ngp::ElemSimdData<stk::mesh::HostMesh>&) const at GeometryInteriorAlg.C:106:30
   103        for (int ip = 0; ip < AlgTraits::numScvIp_; ++ip) {
   104          const auto nn = ipNodeMap[ip];
   105          dnvOps(edata, nn, 0) += v_scv_vol(ip);
-> 106          elemVolOps(edata, 0) += v_scv_vol(ip);
   107        }
   108      });
   109

The definition of `elemVolOps` looks OK to me, I think:

const auto dnvOps = nalu_ngp::simd_elem_nodal_field_updater(ngpMesh, dualVol);
const auto elemVolOps = nalu_ngp::simd_elem_field_updater(ngpMesh, elemVol);

Maybe `+=` is not allowed on that data type?

@alanw0 and @overfelt do you have any thoughts on this one?

marchdf commented 1 month ago

I have to say I am a bit surprised to end up on line 296 (the SIMD loop in the `#else` branch) instead of line 294 (the `STK_SIMD_NONE` branch) in NgpFieldOps.h. I thought we turned off SIMD here: https://github.com/spack/spack/blob/5ac2b8a178a47d9073918dc6014d41f48f393a29/var/spack/repos/builtin/packages/nalu-wind/package.py#L145

marchdf commented 1 month ago

The PR linked above proposes a fix. It looks like the `#ifdef`s were wrong. I am testing it out now.