STEllAR-GROUP / octotiger

Astrophysics program simulating the evolution of star systems based on the fast multipole method on adaptive Octrees
http://octotiger.stellar-group.org/
Boost Software License 1.0
47 stars 17 forks source link

Octo-Tiger segfaults during refinement #380

Closed diehlpk closed 2 years ago

diehlpk commented 2 years ago

Octo-Tiger crashes for level 11 and level 12 at the following step

OMEGA = 9.682662e-01, output_dt = 4.000000e-02
0.000000e+00 4.000000e-02
dwd step...
1 1.717497e-03 1.717497e-03 2.112086e+00 1.662994e-03
2 1.000000e+99 1.000000e+99 2.042667e+00 9.682662e+98
3 2.000000e+99 1.000000e+99 2.056738e+00 1.936532e+99
4 3.000000e+99 1.000000e+99 2.032418e+00 2.904799e+99
5 4.000000e+99 1.000000e+99 2.008762e+00 3.873065e+99
6 5.000000e+99 1.000000e+99 2.098139e+00 4.841331e+99
7 6.000000e+99 1.000000e+99 2.059668e+00 5.809597e+99
8 7.000000e+99 1.000000e+99 1.945080e+00 6.777864e+99
diagnostics...
New Omega = 9.682662e-01
-----------------------------------------------
checking for refinement
[h35n17:2369574] *** Process received signal ***
[h35n17:2369574] Signal: Segmentation fault (11)
[h35n17:2369574] Signal code: Address not mapped (1)
[h35n17:2369574] Failing at address: 0x28
[h35n17:2369574] [ 0] linux-vdso64.so.1(__kernel_sigtramp_rt64+0x0)[0x2000000504d8]
[h35n17:2369574] [ 1] ERROR: One or more process (first noticed rank 0) terminated with signal 11 (core dumped)

with the following error

Program terminated with signal SIGSEGV, Segmentation fault.
#0 0x000020000061372c in node_server::<lambda()>::operator() (__closure=0x20003ac356c0) at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp:394
394                                            const auto vr = sqrt(sqr(dt_.ur[sx_i]) + sqr(dt_.ur[sy_i]) + sqr(dt_.ur[sz_i])) / dt_.ur[0];

and this is the back trace using the core file

#0 0x000020000061372c in node_server::<lambda()>::operator() (__closure=0x20003ac356c0) at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp:394
#1 hpx::util::detail::callable_vtable<void()>::_invoke<node_server::execute_solver(bool, node_count_type)::<lambda()> >(void *) (f=0x20003ac356c0)
   at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/functional/detail/vtable/callable_vtable.hpp:93
#2 0x0000200000646764 in hpx::util::detail::basic_function<void (), false, false>::operator()() const (this=0x20003c44aa50) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/functional/detail/basic_function.hpp:225
#3 hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::invoke_helper(std::integral_constant<bool, true>) (this=0x20003c44aa50)
   at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/executors/service_executors.hpp:186
#4 hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::invoke() (this=0x20003c44aa50)
   at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/executors/service_executors.hpp:169
#5 0x00002000006462a8 in hpx::util::detail::invoke_mem_fun<void (), hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> >::operator()<std::shared_ptr<hpx::paralle
l::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> >&>(std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_func
tion<void (), false>, void> >&) const (t1=<synthetic pointer>..., this=<optimized out>) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/functional/detail/invoke.hpp:78
#6 hpx::util::detail::bound_front<void (hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsigned long, 0ul>, std::shared_ptr<hpx::par
allel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> > >::operator()<>() & (this=<synthetic pointer>)
   at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/functional/bind_front.hpp:78
#7 boost::asio::asio_handler_invoke<hpx::util::detail::bound_front<void (hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsigned lon
g, 0ul>, std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> > > >(hpx::util::detail::bound_front<void (hpx::parallel::execution::detail::service_e
xecutor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsigned long, 0ul>, std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::
unique_function<void (), false>, void> > >&, ...) (function=<synthetic pointer>...) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/boost/include/boost/asio/handler_invoke_hook.hpp:69
#8 boost_asio_handler_invoke_helpers::invoke<hpx::util::detail::bound_front<void (hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<uns
igned long, 0ul>, std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> > >, hpx::util::detail::bound_front<void (hpx::parallel::execution::detail::s
ervice_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsigned long, 0ul>, std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx
::util::unique_function<void (), false>, void> > > >(hpx::util::detail::bound_front<void (hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pac
k_c<unsigned long, 0ul>, std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> > >&, hpx::util::detail::bound_front<void (hpx::parallel::execution::d
etail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsigned long, 0ul>, std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_he
lper<hpx::util::unique_function<void (), false>, void> > >&) (context=<synthetic pointer>..., function=<synthetic pointer>...) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/boost/include/boost/asio/detail/handler_invoke_helpers.hpp:37
#9 boost::asio::detail::handler_work<hpx::util::detail::bound_front<void (hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsigned lo
ng, 0ul>, std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> > >, boost::asio::system_executor, boost::asio::system_executor>::complete<hpx::util:
:detail::bound_front<void (hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsigned long, 0ul>, std::shared_ptr<hpx::parallel::executi
on::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> > > >(hpx::util::detail::bound_front<void (hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util
::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsigned long, 0ul>, std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> > >&, hp
x::util::detail::bound_front<void (hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsigned long, 0ul>, std::shared_ptr<hpx::parallel:
:execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> > >&) (this=<synthetic pointer>, handler=<synthetic pointer>..., function=<synthetic pointer>...)
   at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/boost/include/boost/asio/detail/handler_work.hpp:100
#10 boost::asio::detail::completion_handler<hpx::util::detail::bound_front<void (hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void>::*)(), hpx::util::pack_c<unsig
ned long, 0ul>, std::shared_ptr<hpx::parallel::execution::detail::service_executor::async_execute_wrapper_helper<hpx::util::unique_function<void (), false>, void> > > >::do_complete(void*, boost::asio::detail::scheduler_operation*, boost::sys
tem::error_code const&, unsigned long) (owner=0x20001e45eb00, base=0x20003ba16f80) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/boost/include/boost/asio/detail/completion_handler.hpp:70
#11 0x00002000028d29fc in boost::asio::detail::scheduler_operation::complete (bytes_transferred=<optimized out>, ec=..., owner=<optimized out>, this=<optimized out>)
   at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/boost/include/boost/asio/detail/scheduler_operation.hpp:40
#12 boost::asio::detail::scheduler::do_run_one (ec=..., this_thread=..., lock=..., this=0x20001e45eb00) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/boost/include/boost/asio/detail/impl/scheduler.ipp:447
#13 boost::asio::detail::scheduler::run (this=0x20001e45eb00, ec=...) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/boost/include/boost/asio/detail/impl/scheduler.ipp:200
#14 0x00002000028fb6f8 in boost::asio::io_context::run (this=<optimized out>) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/boost/include/boost/asio/impl/io_context.ipp:63
#15 hpx::util::io_service_pool::thread_run (this=0x20001ea61348, index=1, startup=<optimized out>) at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/io_service/src/io_service_pool.cpp:99
#16 0x00002000028fdc00 in std::__invoke_impl<void, void (hpx::util::io_service_pool::*)(unsigned long, hpx::util::barrier*), hpx::util::io_service_pool*, unsigned long, hpx::util::barrier*> (__t=<optimized out>, __f=<optimized out>)
   at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/invoke.h:89
#17 std::__invoke<void (hpx::util::io_service_pool::*)(unsigned long, hpx::util::barrier*), hpx::util::io_service_pool*, unsigned long, hpx::util::barrier*> (__fn=<optimized out>)
   at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/invoke.h:95
#18 std::thread::_Invoker<std::tuple<void (hpx::util::io_service_pool::*)(unsigned long, hpx::util::barrier*), hpx::util::io_service_pool*, unsigned long, hpx::util::barrier*> >::_M_invoke<0ul, 1ul, 2ul, 3ul> (this=<optimized out>)
   at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/thread:244
#19 std::thread::_Invoker<std::tuple<void (hpx::util::io_service_pool::*)(unsigned long, hpx::util::barrier*), hpx::util::io_service_pool*, unsigned long, hpx::util::barrier*> >::operator() (this=<optimized out>)
   at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/thread:251
--Type <RET> for more, q to quit, c to continue without paging--
#20 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (hpx::util::io_service_pool::*)(unsigned long, hpx::util::barrier*), hpx::util::io_service_pool*, unsigned long, hpx::util::barrier*> > >::_M_run (this=<optimized out>)
   at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/thread:195
#21 0x0000200003e399c0 in std::execute_native_thread_routine (__p=<optimized out>) at /gpfs/alpine/scratch/belhorn/stf007/builds/gcc-build-9.3.0-3/gcc-9.3.0/libstdc++-v3/src/c++11/thread.cc:80
#22 0x0000200003c68ae0 in start_thread () from /lib64/power9/libpthread.so.0
#23 0x000020000425e7c8 in clone () from /lib64/power9/libc.so.6
(gdb)
shibersag commented 2 years ago

When restarting from the files using the benchmark version on QB, I get the following output, as expected:

Normalized constants 222
1.988973e+33 3.447893e+09 1.757393e+01 1.000000e+00
A = 3.213504e+01 | B = 4.043302e+01 | G = 1.000000e+00 | kb = 1.803381e-66 | c = 1.528044e+02 | mh = 8.412883e-58 | sigma = 1.547390e-34 | h = 4.924802e-78
solving gravity------------
...done
Starting run...
Solving gravity

checking for refinement
regridding
Regridded tree in 0.004153 seconds
rebalancing 5609 nodes with 4908 leaves
Rebalanced tree in 0.006956 seconds
forming tree connections
4280 amr boundaries
Formed tree in 0.510611 seconds
solving gravity
regrid done in 1.637393 seconds

OMEGA = 9.682662e-01, output_dt = 4.000000e-02
0.000000e+00 4.000000e-02
dwd step...
1 2.059591e-03 2.059591e-03 8.246676e+00 1.994232e-03
2 4.118950e-03 2.059358e-03 7.942974e+00 3.988240e-03
3 6.176963e-03 2.058014e-03 8.200184e+00 5.980945e-03
4 8.234463e-03 2.057500e-03 8.039265e+00 7.973152e-03
5 1.029151e-02 2.057048e-03 7.992803e+00 9.964922e-03
6 1.234807e-02 2.056558e-03 8.140656e+00 1.195622e-02
7 1.440416e-02 2.056087e-03 8.079330e+00 1.394706e-02
8 1.645979e-02 2.055630e-03 8.044502e+00 1.593746e-02

dmarce1 commented 2 years ago

Oops, sorry Sagiv, I didn't mean to delete your comment; I was trying to delete mine. Is this the Kokkos version that is crashing? dt.ul and dt.ur are vectors, and they must not be sized correctly. I'm having trouble finding out why, though; I'm working on it. I thought I had it fixed, but I was wrong.

dmarce1 commented 2 years ago

In node_server_actions_3.cpp we have, beginning on line 715:

        dt.dt = 1.0e+99;
        for (const auto &this_dt : dts) {
            if (this_dt.dt < dt.dt) {
                dt = this_dt;
            }
        }

The condition this_dt.dt < dt.dt must never be true, which explains the 1.0e+99 values as well as why the vectors in dt are empty. This means the time-step itself is not being computed correctly (it is never less than 1.0e+99). Not sure why yet.

dmarce1 commented 2 years ago

What I think is happening is that in hydro_kokkos_kernel.hpp, on either line 460 or line 529, ts.a is being set to zero or to some very small value. Gregor, can you take a look?

G-071 commented 2 years ago

I can take a look at it! @diehlpk what were the exact runtime parameters you used, so that I can reproduce the issue? If you have it handy, the entire output log might be useful as well.

diehlpk commented 2 years ago

@dmarce1

OMEGA = 9.687093e-01, output_dt = 4.000000e-02
3.736165e+01 4.000000e-02
dwd step...
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Signal speed error in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
diehlpk commented 2 years ago

@dmarce1


Signal speed low in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()
Segmentation fault (core dumped)
diehlpk commented 2 years ago

@dmarce1

regrid done in 53.657513 seconds
---------------------------------------
OMEGA = 9.687093e-01, output_dt = 4.000000e-02
3.736165e+01 4.000000e-02
dwd step...
Signal speed low in /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/node_server_actions_3.cpp on line 718
Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()
terminate called after throwing an instance of 'std::runtime_error'
  what():  cuda_pinned_allocator failed due to cudaFreeHost failure : driver shutting down
terminate called recursively
terminate called recursively
terminate called recursively
terminate called recursively
terminate called recursively
diehlpk commented 2 years ago

@dmarce1 It seems the latest push addressed the bug

Done!
Normalized constants 222
1.988920e+33 3.488878e+09 1.788846e+01 1.000000e+00
A = 3.369228e+01 | B = 4.189325e+01 | G = 1.000000e+00 | kb = 1.824916e-66 | c = 1.537120e+02 | mh = 8.413107e-58 | sigma = 1.632012e-34 | h = 4.895986e-78
solving gravity------------
...done
Start execution the solver...
Starting run...
Solving gravity
-----------------------------------------------
checking for refinement
regridding
Regridded tree in 0.082719 seconds
rebalancing 8393 nodes with 7344 leaves
Rebalanced tree in 0.209994 seconds
forming tree connections
5960 amr boundaries
Formed tree in 1.916525 seconds
solving gravity
regrid done in 51.420726 seconds
---------------------------------------
OMEGA = 9.687093e-01, output_dt = 4.000000e-02
3.736165e+01 4.000000e-02
dwd step...
1 2.423346e+02 1.663152e-03 2.999958e+02 2.347518e+02
2 2.423363e+02 1.662655e-03 2.977107e+02 2.347534e+02
3 2.423379e+02 1.662134e-03 2.979868e+02 2.347550e+02
diehlpk commented 2 years ago

New error message

(gdb) 
#0  grid::set (this=0x200024ad0c90, name=..., data=<optimized out>, version=<optimized out>)
    at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_vector.h:1040
#1  0x00002000007a9828 in node_server::node_server (this=0x200024aa6000, loc=...) at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_vector.h:1165
#2  0x00002000007c6ef8 in hpx::components::detail_adl_barrier::init<hpx::traits::construct_without_back_ptr>::call_new<node_server, hpx::components::managed_component<node_server, hpx::components::detail::this_type>, node_location&> (this_=0x200024d80090, component=@0x200024d80090: 0x0)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/runtime/components/server/managed_component_base.hpp:72
#3  hpx::components::managed_component<node_server, hpx::components::detail::this_type>::managed_component<node_location&> (this=0x200024d80090)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/runtime/components/server/managed_component_base.hpp:358
#4  hpx::components::server::create<hpx::components::managed_component<node_server, hpx::components::detail::this_type>, node_location&> ()
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/runtime/components/server/create_component.hpp:45
#5  0x00002000007ade9c in hpx::components::detail::local_new_component<node_server>::call<node_location&> ()
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/runtime/components/new.hpp:327
#6  hpx::components::new_<node_server, node_location&> (locality=...) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/runtime/components/new.hpp:421
#7  node_server::<lambda()>::operator() (__closure=0x200024bc0e00) at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/octotiger-kokkos/src/io/silo_in.cpp:174
#8  hpx::lcos::local::detail::task_object<void, node_server::node_server(const node_location&)::<lambda()>, void, hpx::lcos::detail::task_base<void> >::do_run_impl (
    this=0x200024bc0d80) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/futures/futures_factory.hpp:103
#9  hpx::lcos::local::detail::task_object<void, node_server::node_server(const node_location&)::<lambda()>, void, hpx::lcos::detail::task_base<void> >::do_run(void) (
    this=0x200024bc0d80) at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/futures/futures_factory.hpp:72
#10 0x000000001004e2ac in hpx::lcos::detail::task_base<void>::run_impl (this_=...)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/futures/detail/future_data.hpp:914
#11 0x000000001004e454 in hpx::util::detail::deferred<void (*)(hpx::memory::intrusive_ptr<hpx::lcos::detail::task_base<void> >), hpx::util::pack_c<unsigned long, 0ul>, hpx::memory::intrusive_ptr<hpx::lcos::detail::task_base<void> > >::operator() (this=<optimized out>)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/memory/intrusive_ptr.hpp:89
#12 hpx::threads::detail::thread_function_nullary<hpx::util::detail::deferred<void (*)(hpx::memory::intrusive_ptr<hpx::lcos::detail::task_base<void> >), hpx::util::pack_c<unsigned long, 0ul>, hpx::memory::intrusive_ptr<hpx::lcos::detail::task_base<void> > > >::operator() (this=<optimized out>)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/threading_base/register_thread.hpp:78
#13 hpx::util::detail::callable_vtable<std::pair<hpx::threads::thread_state_enum, hpx::threads::thread_id> (hpx::threads::thread_state_ex_enum)>::_invoke<hpx::threads::detail::thread_function_nullary<hpx::util::detail::deferred<void (*)(hpx::memory::intrusive_ptr<hpx::lcos::detail::task_base<void> >), hpx::util::pack_c<unsigned long, 0ul>, hpx::memory::intrusive_ptr<hpx::lcos::detail::task_base<void> > > > >(void*, hpx::threads::thread_state_ex_enum&&) (f=<optimized out>, vs#0=<optimized out>)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/include/hpx/functional/detail/vtable/callable_vtable.hpp:93
#14 0x0000200002c1d3d0 in hpx::util::detail::basic_function<std::pair<hpx::threads::thread_state_enum, hpx::threads::thread_id> (hpx::threads::thread_state_ex_enum), false, false>::operator()(hpx::threads::thread_state_ex_enum) const (vs#0=<optimized out>, this=0x2000203931f0)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/functional/include/hpx/functional/detail/basic_function.hpp:225
#15 hpx::threads::coroutines::detail::coroutine_impl::operator() (this=0x200020392650)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/coroutines/src/detail/coroutine_impl.cpp:74
#16 0x00002000038e3608 in hpx::threads::coroutines::detail::posix::trampoline<hpx::threads::coroutines::detail::coroutine_impl> (fun=<optimized out>)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/coroutines/include/hpx/coroutines/detail/posix_utility.hpp:261
#17 0x0000200004177ffc in makecontext () from /lib64/power9/libc.so.6
dmarce1 commented 2 years ago

@G-071 I have tracked what may be causing this bug to the flux kernel. (It is a bug nonetheless, even if it isn't the cause of this particular bug.) Non-NaN values are passed into the flux kernel in U, but it returns NaNs in F. The branch trap_flux_bug contains traps to catch the NaNs. I am sending you an email with the restart files attached. You will need to change the restart_filename in dwd.ini to match your local directory. You will also need to disable fast-math in config.sh when building octotiger, or else the NaNs will not be trapped (they are quiet NaNs). I think Debug mode is also required.

The commandline I am using for the build is ./build-all.sh Debug with-CC with-cuda with-mpi without-papi without-apex with-kokkos with-simd without-hpx-backend-multipole without-hpx-backend-monopole without-hpx-cuda-polling octotiger

The commandline for octotiger I am using is ./build/octotiger/build/octotiger --config_file=dwd.ini --monopole_host_kernel_type=KOKKOS --multipole_host_kernel_type=KOKKOS --monopole_device_kernel_type=KOKKOS_CUDA --multipole_device_kernel_type=KOKKOS_CUDA --hydro_device_kernel_type=KOKKOS_CUDA --hydro_host_kernel_type=KOKKOS --cuda_streams_per_gpu=128 --cuda_buffer_capacity=2

G-071 commented 2 years ago

@dmarce1 I have just pushed some commits to your trap_fluxbug branch! There was indeed an error in the flux kernel in one of the if branches. From what I can see, it got triggered by the parameter --eos=WD in dwd.ini, which causes `physics::A` to be different from 0.0; see https://github.com/STEllAR-GROUP/octotiger/blob/c877a4a10337b323c04339642ac17ecd1bc0f9e8/src/grid.cpp#L1820

I think I found and addressed everything wrong in the flux kernel with A_!=0 in 6be12970ffa6b4be7f3dce40baec099721fe708a and 247a2a93557dd695fab2091916036a123a9221fb.

There was also a bug in the NaN checks for dudt themselves (out-of-bounds addressing), I fixed that in 247a2a93557dd695fab2091916036a123a9221fb.

I also added some quick tests for --eos=WD in 5f9c4600d4740002e7d11130e9140d02de548a50. However, it is essentially just a rotating_star with that parameter set! Can you think of a better scenario we could use here? Maybe a smaller dwd scenario (the one you gave would be too large for it)?

So far, I have only run it on my development machine. There, the dwd scenario now runs for over 100 time-steps without any crashes. Can you pull the branch, rebuild, and see if it works for you as well?

One more thing: It is best to build with --with-hpx-cuda-polling! There is a bug in the HPX version we use where the performance of the callback executors degrades over time (see https://github.com/STEllAR-GROUP/hpx/issues/5366). It seems to be fixed on master, but until we upgrade the HPX dependency we should stay with the polling executors (they're faster on most machines anyway). You need to rebuild both Kokkos and octotiger with that argument!

diehlpk commented 2 years ago

@G-071 I tested your patch on Summit and now we get


A = 3.369228e+01 | B = 4.189325e+01 | G = 1.000000e+00 | kb = 1.824875e-66 | c = 1.537113e+02 | mh = 8.412995e-58 | sigma = 1.631990e-34 | h = 4.895877e-78
Initializing cell_geometry 3 16 22
Initializing cell_geometry 3 16 22
Initializing cell_geometry 3 16 22
Initializing cell_geometry 3 16 22
Initializing cell_geometry 3 16 22
[f14n02:584856] *** Process received signal ***
[f14n02:584856] Signal: Segmentation fault (11)
[f14n02:584856] Signal code: Invalid permissions (2)
[f14n02:584856] Failing at address: 0x200020210000
[f14n02:584856] [ 0] linux-vdso64.so.1(__kernel_sigtramp_rt64+0x0)[0x2000000504d8]
[f14n02:584856] [ 1] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN4grid3setENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPdi+0xc0)[0x2000004baa40]
[f14n02:584856] [ 2] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN11node_serverC2ERK13node_location+0xfa8)[0x2000007a98d8]
[f14n02:584856] [ 3] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx10components6server6createINS0_17managed_componentI11node_serverNS0_6detail9this_typeEEEJR13node_locationEEENS_6naming8gid_typeEDpOT0_+0x98)[0x2000007c6fa8]
[f14n02:584856] [ 4] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(+0x6cdf4c)[0x2000007adf4c]
[f14n02:584856] [ 5] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4lcos6detail9task_baseIvE8run_implENS_6memory13intrusive_ptrIS3_EE+0x2c)[0x1004e2ac]
[f14n02:584856] [ 6] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4util6detail15callable_vtableIFSt4pairINS_7threads17thread_state_enumENS4_9thread_idEENS4_20thread_state_ex_enumEEE7_invokeINS4_6detail23thread_function_nullaryINS1_8deferredIPFvNS_6memory13intrusive_ptrINS_4lcos6detail9task_baseIvEEEEENS0_6pack_cImJLm0EEEEJSL_EEEEEEES7_PvOS8_+0x44)[0x1004e454]
[f14n02:584856] [ 7] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(_ZN3hpx7threads10coroutines6detail14coroutine_implclEv+0x130)[0x200002c1d3d0]
[f14n02:584856] [ 8] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx.so.1(+0x8d3608)[0x2000038e3608]
[f14n02:584856] [ 9] /lib64/power9/libc.so.6(makecontext+0xd8)[0x200004177ffc]
[f14n02:584856] [10] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(+0xbcb58)[0x200002c1cb58]
[f14n02:584856] *** End of error message ***
[f14n02:584859] *** Process received signal ***
[f14n02:584859] Signal: Segmentation fault (11)
[f14n02:584859] Signal code: Invalid permissions (2)
[f14n02:584859] Failing at address: 0x20001e400000
[f14n02:584859] [ 0] linux-vdso64.so.1(__kernel_sigtramp_rt64+0x0)[0x2000000504d8]
[f14n02:584859] [ 1] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN4grid3setENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPdi+0xc0)[0x2000004baa40]
[f14n02:584859] [ 2] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN11node_serverC2ERK13node_location+0xfa8)[0x2000007a98d8]
[f14n02:584859] [ 3] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx10components6server6createINS0_17managed_componentI11node_serverNS0_6detail9this_typeEEEJR13node_locationEEENS_6naming8gid_typeEDpOT0_+0x98)[0x2000007c6fa8]
[f14n02:584859] [ 4] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(+0x6cdf4c)[0x2000007adf4c]
[f14n02:584859] [ 5] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4lcos6detail9task_baseIvE8run_implENS_6memory13intrusive_ptrIS3_EE+0x2c)[0x1004e2ac]
[f14n02:584859] [ 6] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4util6detail15callable_vtableIFSt4pairINS_7threads17thread_state_enumENS4_9thread_idEENS4_20thread_state_ex_enumEEE7_invokeINS4_6detail23thread_function_nullaryINS1_8deferredIPFvNS_6memory13intrusive_ptrINS_4lcos6detail9task_baseIvEEEEENS0_6pack_cImJLm0EEEEJSL_EEEEEEES7_PvOS8_+0x44)[0x1004e454]
[f14n02:584859] [ 7] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(_ZN3hpx7threads10coroutines6detail14coroutine_implclEv+0x130)[0x200002c1d3d0]
[f14n02:584859] [ 8] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx.so.1(+0x8d3608)[0x2000038e3608]
[f14n02:584859] [ 9] /lib64/power9/libc.so.6(makecontext+0xd8)[0x200004177ffc]
[f14n02:584859] [10] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(+0xbcb58)[0x200002c1cb58]
[f14n02:584859] *** End of error message ***
[f14n02:584860] *** Process received signal ***
[f14n02:584860] Signal: Segmentation fault (11)
[f14n02:584860] Signal code: Invalid permissions (2)
[f14n02:584860] Failing at address: 0x20001ea20000
[f14n02:584860] [ 0] linux-vdso64.so.1(__kernel_sigtramp_rt64+0x0)[0x2000000504d8]
[f14n02:584860] [ 1] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN4grid3setENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPdi+0xc0)[0x2000004baa40]
[f14n02:584860] [ 2] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN11node_serverC2ERK13node_location+0xfa8)[0x2000007a98d8]
[f14n02:584860] [ 3] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx10components6server6createINS0_17managed_componentI11node_serverNS0_6detail9this_typeEEEJR13node_locationEEENS_6naming8gid_typeEDpOT0_+0x98)[0x2000007c6fa8]
[f14n02:584860] [ 4] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(+0x6cdf4c)[0x2000007adf4c]
[f14n02:584860] [ 5] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4lcos6detail9task_baseIvE8run_implENS_6memory13intrusive_ptrIS3_EE+0x2c)[0x1004e2ac]
[f14n02:584860] [ 6] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4util6detail15callable_vtableIFSt4pairINS_7threads17thread_state_enumENS4_9thread_idEENS4_20thread_state_ex_enumEEE7_invokeINS4_6detail23thread_function_nullaryINS1_8deferredIPFvNS_6memory13intrusive_ptrINS_4lcos6detail9task_baseIvEEEEENS0_6pack_cImJLm0EEEEJSL_EEEEEEES7_PvOS8_+0x44)[0x1004e454]
[f14n02:584860] [ 7] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(_ZN3hpx7threads10coroutines6detail14coroutine_implclEv+0x130)[0x200002c1d3d0]
[f14n02:584860] [ 8] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx.so.1(+0x8d3608)[0x2000038e3608]
[f14n02:584860] [ 9] /lib64/power9/libc.so.6(makecontext+0xd8)[0x200004177ffc]
[f14n02:584860] [10] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(+0xbcb58)[0x200002c1cb58]
[f14n02:584860] *** End of error message ***
[f14n02:584855] *** Process received signal ***
[f14n02:584855] Signal: Segmentation fault (11)
[f14n02:584855] Signal code: Invalid permissions (2)
[f14n02:584855] Failing at address: 0x20001e030000
[f14n02:584855] [ 0] linux-vdso64.so.1(__kernel_sigtramp_rt64+0x0)[0x2000000504d8]
[f14n02:584855] [ 1] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN4grid3setENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPdi+0xc0)[0x2000004baa40]
[f14n02:584855] [ 2] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN11node_serverC2ERK13node_location+0xfa8)[0x2000007a98d8]
[f14n02:584855] [ 3] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx10components6server6createINS0_17managed_componentI11node_serverNS0_6detail9this_typeEEEJR13node_locationEEENS_6naming8gid_typeEDpOT0_+0x98)[0x2000007c6fa8]
[f14n02:584855] [ 4] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(+0x6cdf4c)[0x2000007adf4c]
[f14n02:584855] [ 5] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4lcos6detail9task_baseIvE8run_implENS_6memory13intrusive_ptrIS3_EE+0x2c)[0x1004e2ac]
[f14n02:584855] [ 6] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4util6detail15callable_vtableIFSt4pairINS_7threads17thread_state_enumENS4_9thread_idEENS4_20thread_state_ex_enumEEE7_invokeINS4_6detail23thread_function_nullaryINS1_8deferredIPFvNS_6memory13intrusive_ptrINS_4lcos6detail9task_baseIvEEEEENS0_6pack_cImJLm0EEEEJSL_EEEEEEES7_PvOS8_+0x44)[0x1004e454]
[f14n02:584855] [ 7] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(_ZN3hpx7threads10coroutines6detail14coroutine_implclEv+0x130)[0x200002c1d3d0]
[f14n02:584855] [ 8] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(+0xbcab8)[0x200002c1cab8]
[f14n02:584855] [ 9] /lib64/power9/libc.so.6(makecontext+0xd8)[0x200004177ffc]
[f14n02:584855] [10] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(+0xbcb58)[0x200002c1cb58]
[f14n02:584855] *** End of error message ***
[f14n02:584858] *** Process received signal ***
[f14n02:584858] Signal: Segmentation fault (11)
[f14n02:584858] Signal code: Invalid permissions (2)
[f14n02:584858] Failing at address: 0x20001f610000
[f14n02:584858] [ 0] linux-vdso64.so.1(__kernel_sigtramp_rt64+0x0)[0x2000000504d8]
[f14n02:584858] [ 1] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN4grid3setENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPdi+0xc0)[0x2000004baa40]
[f14n02:584858] [ 2] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN11node_serverC2ERK13node_location+0xfa8)[0x2000007a98d8]
[f14n02:584858] [ 3] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx10components6server6createINS0_17managed_componentI11node_serverNS0_6detail9this_typeEEEJR13node_locationEEENS_6naming8gid_typeEDpOT0_+0x98)[0x2000007c6fa8]
[f14n02:584858] [ 4] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(+0x6cdf4c)[0x2000007adf4c]
[f14n02:584858] [ 5] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4lcos6detail9task_baseIvE8run_implENS_6memory13intrusive_ptrIS3_EE+0x2c)[0x1004e2ac]
[f14n02:584858] [ 6] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4util6detail15callable_vtableIFSt4pairINS_7threads17thread_state_enumENS4_9thread_idEENS4_20thread_state_ex_enumEEE7_invokeINS4_6detail23thread_function_nullaryINS1_8deferredIPFvNS_6memory13intrusive_ptrINS_4lcos6detail9task_baseIvEEEEENS0_6pack_cImJLm0EEEEJSL_EEEEEEES7_PvOS8_+0x44)[0x1004e454]
[f14n02:584858] [ 7] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(_ZN3hpx7threads10coroutines6detail14coroutine_implclEv+0x130)[0x200002c1d3d0]
[f14n02:584858] [ 8] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx.so.1(+0x8d3608)[0x2000038e3608]
[f14n02:584858] [ 9] /lib64/power9/libc.so.6(makecontext+0xd8)[0x200004177ffc]
[f14n02:584858] [10] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(+0xbcb58)[0x200002c1cb58]
[f14n02:584858] *** End of error message ***
[f14n02:584857] *** Process received signal ***
[f14n02:584857] Signal: Segmentation fault (11)
[f14n02:584857] Signal code: Invalid permissions (2)
[f14n02:584857] Failing at address: 0x20001e810000
[f14n02:584857] [ 0] linux-vdso64.so.1(__kernel_sigtramp_rt64+0x0)[0x2000000504d8]
[f14n02:584857] [ 1] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN4grid3setENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEPdi+0xc0)[0x2000004baa40]
[f14n02:584857] [ 2] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN11node_serverC2ERK13node_location+0xfa8)[0x2000007a98d8]
[f14n02:584857] [ 3] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx10components6server6createINS0_17managed_componentI11node_serverNS0_6detail9this_typeEEEJR13node_locationEEENS_6naming8gid_typeEDpOT0_+0x98)[0x2000007c6fa8]
[f14n02:584857] [ 4] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(+0x6cdf4c)[0x2000007adf4c]
[f14n02:584857] [ 5] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4lcos6detail9task_baseIvE8run_implENS_6memory13intrusive_ptrIS3_EE+0x2c)[0x1004e2ac]
[f14n02:584857] [ 6] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/octotiger(_ZN3hpx4util6detail15callable_vtableIFSt4pairINS_7threads17thread_state_enumENS4_9thread_idEENS4_20thread_state_ex_enumEEE7_invokeINS4_6detail23thread_function_nullaryINS1_8deferredIPFvNS_6memory13intrusive_ptrINS_4lcos6detail9task_baseIvEEEEENS0_6pack_cImJLm0EEEEJSL_EEEEEEES7_PvOS8_+0x44)[0x1004e454]
[f14n02:584857] [ 7] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(_ZN3hpx7threads10coroutines6detail14coroutine_implclEv+0x130)[0x200002c1d3d0]
[f14n02:584857] [ 8] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx.so.1(+0x8d3608)[0x2000038e3608]
[f14n02:584857] [ 9] /lib64/power9/libc.so.6(makecontext+0xd8)[0x200004177ffc]
[f14n02:584857] [10] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(+0xbcb58)[0x200002c1cb58]
[f14n02:584857] *** End of error message ***
ERROR:  One or more process (first noticed rank 1) terminated with signal 11 (core dumped)
diehlpk commented 2 years ago

It seems that Gregor's patch solved the problem and the code is running.

@G-071 Can you please do a PR?

diehlpk commented 2 years ago

Ok, I think Gregor's patch fixed the problem only partially: for some runs, the code still crashes with

Solving gravity
-----------------------------------------------
checking for refinement
regridding
Regridded tree in 0.023345 seconds
rebalancing 89641 nodes with 78436 leaves
Rebalanced tree in 0.036139 seconds
forming tree connections
[h26n01:657875] *** Process received signal ***
[h26n01:657875] Signal: Segmentation fault (11)
[h26n01:657875] Signal code: Address not mapped (1)
[h26n01:657875] Failing at address: 0x200a199e3eb8
[h26n01:657875] [ 0] linux-vdso64.so.1(__kernel_sigtramp_rt64+0x0)[0x2000000504d8]
[h26n01:657875] [ 1] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/jemalloc/lib/libjemalloc.so.2(+0x18e48)[0x200002db8e48]
[h26n01:657875] [ 2] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/jemalloc/lib/libjemalloc.so.2(+0x753dc)[0x200002e153dc]
[h26n01:657875] [ 3] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/jemalloc/lib/libjemalloc.so.2(malloc+0x7ec)[0x200002dad47c]
[h26n01:657875] [ 4] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/jemalloc/lib/libjemalloc.so.2(_Znwm+0x20)[0x200002e19fd0]
[h26n01:657875] [ 5] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx4lcos6detail12promise_baseINS_6naming7id_typeES4_NS1_12promise_dataIS4_EEE17init_shared_stateEv+0x244)[0x2000006124a4]
[h26n01:657875] [ 6] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx6detail17async_remote_implIN11node_server23get_child_client_actionEJRKN3geo6octantEEEENS_4lcos6futureINS_6traits14extract_actionIT_vE4type17local_result_typeEEENS0_12async_policyERKNS_6naming7id_typeEONSI_7addressEDpOT0_+0x19c)[0x20000062f9bc]
[h26n01:657875] [ 7] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx6detail10async_implIN11node_server23get_child_client_actionERKNS0_12async_policyEJRKN3geo6octantEEEENS_4lcos6futureINS_6traits14extract_actionIT_vE4type17local_result_typeEEEOT0_RKNS_6naming7id_typeEDpOT1_+0xb8)[0x200000630528]
[h26n01:657875] [ 8] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN11node_client16get_child_clientERK13node_locationRKN3geo6octantE+0x170)[0x2000005c3fa0]
[h26n01:657875] [ 9] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN11node_server9form_treeEN3hpx6naming7id_typeES2_St6vectorIS2_SaIS2_EE+0x43c)[0x2000005c595c]
[h26n01:657875] [10] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx4lcos5local6detail11task_objectIiNS_4util6detail8deferredINS_6detail14action_invokerIN11node_server16form_tree_actionEEENS4_6pack_cImJLm0ELm1ELm2ELm3ELm4EEEEJmiNS_6naming7id_typeESF_St6vectorISF_SaISF_EEEEEvNS0_6detail9task_baseIiEEE6do_runEv+0xd4)[0x20000063a424]
[h26n01:657875] [11] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx4lcos6detail9task_baseIiE8run_implENS_6memory13intrusive_ptrIS3_EE+0x2c)[0x2000005d09fc]
[h26n01:657875] [12] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/octotiger/build-kokkos/libhpx_octolib.so(_ZN3hpx4util6detail15callable_vtableIFSt4pairINS_7threads17thread_state_enumENS4_9thread_idEENS4_20thread_state_ex_enumEEE7_invokeINS4_6detail23thread_function_nullaryINS1_8deferredIPFvNS_6memory13intrusive_ptrINS_4lcos6detail9task_baseIiEEEEENS0_6pack_cImJLm0EEEEJSL_EEEEEEES7_PvOS8_+0x44)[0x2000005d0b04]
[h26n01:657875] [13] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(_ZN3hpx7threads10coroutines6detail14coroutine_implclEv+0x130)[0x200002c1d3d0]
[h26n01:657875] [14] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx.so.1(+0x8d3608)[0x2000038e3608]
[h26n01:657875] [15] /lib64/power9/libc.so.6(makecontext+0xd8)[0x200004177ffc]
[h26n01:657875] [16] /ccs/proj/cph102/diehl/PowerTigerKokkos/build/hpx/lib64/libhpx_core.so(+0xbcb58)[0x200002c1cb58]
[h26n01:657875] *** End of error message ***
ERROR:  One or more process (first noticed rank 11) terminated with signal 11 (core dumped)

------------------------------------------------------------
Sender: LSF System <lsfadmin@batch2>
Subject: Job 1634742: <octo-v1309_no_apex-cuda-2> in cluster <summit> Exited

Job <octo-v1309_no_apex-cuda-2> was submitted from host <login3> by user <diehlpk> in cluster <summit> at Thu Nov 18 15:00:09 2021
Job was executed on host(s) <1*batch2>, in queue <batch>, as user <diehlpk> in cluster <summit> at Thu Nov 18 15:01:26 2021
                            <42*h25n18>
                            <42*h26n01>
</ccs/home/diehlpk> was used as the home directory.
</gpfs/alpine/cph102/proj-shared/diehl/sagiv/close/level11/2> was used as the working directory.
Started at Thu Nov 18 15:01:26 2021
Terminated at Thu Nov 18 15:04:54 2021
Results reported at Thu Nov 18 15:04:54 2021

The output (if any) is above this job summary.
diehlpk commented 2 years ago

@dmarce1 Can you please have a look?

diehlpk commented 2 years ago

Here is the bt

(gdb) bt
#0  0x0000200002db8e48 in ffs_lu (bitmap=<optimized out>) at include/jemalloc/internal/bit_util.h:22
#1  bitmap_sfu (binfo=<optimized out>, bitmap=0x2003c14fbec0) at include/jemalloc/internal/bitmap.h:315
#2  arena_slab_reg_alloc (bin_info=<optimized out>, slab=0x2003c14fbe80) at src/arena.c:232
#3  je_arena_tcache_fill_small (tsdn=0x200020c2b130, arena=0x200022200c80, tcache=<optimized out>, tbin=0x200020c2b330, binind=<optimized out>, 
    prof_accumbytes=<optimized out>) at src/arena.c:1268
#4  0x0000200002e153dc in je_tcache_alloc_small_hard (tsdn=<optimized out>, arena=<optimized out>, tcache=<optimized out>, tbin=<optimized out>, 
    binind=<optimized out>, tcache_success=0x200020c22670) at src/tcache.c:93
#5  0x0000200002dad47c in tcache_alloc_small (slow_path=false, zero=false, binind=<optimized out>, size=<optimized out>, tcache=0x200020c2b2f0, 
    arena=0x200022200c80, tsd=0x200020c2b130) at include/jemalloc/internal/tsd.h:138
#6  arena_malloc (slow_path=false, tcache=0x200020c2b2f0, zero=false, ind=<optimized out>, size=<optimized out>, arena=0x0, tsdn=0x200020c2b130)
    at include/jemalloc/internal/arena_inlines_b.h:94
#7  iallocztm (slow_path=false, arena=0x0, is_internal=false, tcache=0x200020c2b2f0, zero=false, ind=<optimized out>, size=<optimized out>, 
    tsdn=0x200020c2b130) at include/jemalloc/internal/jemalloc_internal_inlines_c.h:53
#8  imalloc_no_sample (ind=<optimized out>, usize=32, size=<optimized out>, tsd=0x200020c2b130, dopts=<synthetic pointer>, 
    sopts=<synthetic pointer>) at src/jemalloc.c:1709
#9  imalloc_body (tsd=0x200020c2b130, dopts=<synthetic pointer>, sopts=<synthetic pointer>) at src/jemalloc.c:1905
#10 imalloc (dopts=<synthetic pointer>, sopts=<synthetic pointer>) at src/jemalloc.c:2005
#11 malloc (size=<optimized out>) at src/jemalloc.c:2038
#12 0x0000200002e19fd0 in newImpl<false> (size=<optimized out>) at src/jemalloc_cpp.cpp:77
#13 operator new (size=<optimized out>) at src/jemalloc_cpp.cpp:87
#14 0x0000200002cc1e60 in __gnu_cxx::new_allocator<std::_List_node<hpx::threads::thread_id> >::allocate (this=0x2000218807a0, __n=1)
    at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/ext/new_allocator.h:102
#15 std::allocator_traits<std::allocator<std::_List_node<hpx::threads::thread_id> > >::allocate (__a=..., __n=1)
    at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/alloc_traits.h:444
#16 std::__cxx11::_List_base<hpx::threads::thread_id, std::allocator<hpx::threads::thread_id> >::_M_get_node (this=0x2000218807a0)
    at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_list.h:438
#17 std::__cxx11::list<hpx::threads::thread_id, std::allocator<hpx::threads::thread_id> >::_M_create_node<hpx::threads::thread_id const&> (
    this=0x2000218807a0) at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_list.h:630
#18 std::__cxx11::list<hpx::threads::thread_id, std::allocator<hpx::threads::thread_id> >::_M_insert<hpx::threads::thread_id const&> (
    __position=..., this=0x2000218807a0) at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_list.h:1907
#19 std::__cxx11::list<hpx::threads::thread_id, std::allocator<hpx::threads::thread_id> >::push_front (__x=<synthetic pointer>..., 
    this=0x2000218807a0) at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/stl_list.h:1158
#20 hpx::threads::policies::thread_queue<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo>::recycle_thread (thrd=..., this=0x200021880500)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/schedulers/include/hpx/schedulers/thread_queue.hpp:333
#21 hpx::threads::policies::thread_queue<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo>::cleanup_terminated_locked (delete_all=false, this=<optimized out>)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/schedulers/include/hpx/schedulers/thread_queue.hpp:422
#22 hpx::threads::policies::thread_queue<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo>::cleanup_terminated (this=0x200021880500, delete_all=<optimized out>)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/schedulers/include/hpx/schedulers/thread_queue.hpp:442
#23 0x0000200002cd7e8c in hpx::threads::policies::thread_queue<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo>::destroy_thread (thrd=<optimized out>, this=0x200021880500)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/coroutines/include/hpx/coroutines/thread_enums.hpp:201
#24 hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo>::destroy_thread (thrd=<optimized out>, this=<optimized out>)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/schedulers/include/hpx/schedulers/local_priority_queue_scheduler.hpp:727
#25 hpx::threads::detail::scheduling_loop<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> > (num_thread=5, scheduler=..., counters=..., params=...)
    at /ccs/proj/cph102/diehl/PowerTigerKokkos/src/hpx/libs/core/thread_pools/include/hpx/thread_pools/scheduling_loop.hpp:851
#26 0x0000200002cd8c84 in hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >::thread_func (this=0x200005943020, 
    thread_num=5, global_thread_num=5, startup=...) at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/unique_ptr.h:360
#27 0x0000200002c84494 in std::__invoke_impl<void, void (hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >::*)(unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier>), hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >*, unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier> > (__t=<error reading variable: value has been optimized out>, 
    __f=<error reading variable: value has been optimized out>)
    at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/shared_ptr_base.h:756
#28 std::__invoke<void (hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >::*)(unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier>), hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >*, unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier> > (__fn=<error reading variable: value has been optimized out>)
    at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/bits/invoke.h:95
#29 std::thread::_Invoker<std::tuple<void (hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >::*)(unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier>), hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >*, unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier> > >::_M_invoke<0ul, 1ul, 2ul, 3ul, 4ul> (this=<error reading variable: value has been optimized out>)
    at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/thread:244
#30 std::thread::_Invoker<std::tuple<void (hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >::*)(unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier>), hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >*, unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier> > >::operator() (this=<error reading variable: value has been optimized out>)
    at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/thread:251
--Type <RET> for more, q to quit, c to continue without paging--
#31 std::thread::_State_impl<std::thread::_Invoker<std::tuple<void (hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >::*)(unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier>), hpx::threads::detail::scheduled_thread_pool<hpx::threads::policies::local_priority_queue_scheduler<std::mutex, hpx::threads::policies::lockfree_lifo, hpx::threads::policies::lockfree_fifo, hpx::threads::policies::lockfree_lifo> >*, unsigned long, unsigned long, std::shared_ptr<hpx::util::barrier> > > >::_M_run (this=<optimized out>)
    at /autofs/nccs-svm1_sw/summit/gcc/9.3.0-2/include/c++/9.3.0/thread:195
#32 0x0000200003e299c0 in std::execute_native_thread_routine (__p=<optimized out>)
    at /gpfs/alpine/scratch/belhorn/stf007/builds/gcc-build-9.3.0-3/gcc-9.3.0/libstdc++-v3/src/c++11/thread.cc:80
#33 0x0000200003c58ae0 in start_thread () from /lib64/power9/libpthread.so.0
#34 0x000020000424e7c8 in clone () from /lib64/power9/libc.so.6
diehlpk commented 2 years ago

thread apply all bt

[Uploading gdb.txt…]()

diehlpk commented 2 years ago

I will close the issue since we do not run on Summit anymore.