n-west / bliss

BSD 3-Clause "New" or "Revised" License
4 stars 2 forks source link

Crash when requesting hits on cuda #44

Closed n-west closed 7 months ago

n-west commented 7 months ago

I'm not sure if it's just triggering execution or some other effect, but requesting hits() without first setting the device to cpu causes a crash, even if every operation in the compute graph can happen on a gpu. Here's a backtrace:

(cuda-gdb) bt
#0  0x00007ffff0c66fd0 in cudbgReportDriverInternalError () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#1  0x00007ffff0c6b800 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#2  0x00007ffff0fad6f9 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#3  0x00007ffff0c6b84a in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#4  0x00007ffff0c6c1d6 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#5  0x00007ffff0f7a492 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#6  0x00007ffff0d65aa3 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#7  0x0000555555cbd863 in __cudart516 ()
#8  0x0000555555cbd928 in __cudart1336 ()
#9  0x00007ffff7bc8a99 in __pthread_once_slow () from /lib/x86_64-linux-gnu/libpthread.so.0
#10 0x0000555555d0bec9 in __cudart1612 ()
#11 0x0000555555cb44c7 in __cudart514 ()
#12 0x0000555555cde6e0 in cudaSetDevice ()
#13 0x00005555555d881b in bland::detail::blandDLTensor::blandDLTensor (this=0x7fffffff87f0, shape=..., dtype=..., device=..., strides=...) at /datax/scratch/nwest/Projects/bliss/bland/bland/bland_tensor_internals.cpp:93
#14 0x00005555555db767 in bland::ndarray::ndarray (this=0x7fffffff87f0, dims=..., dtype=..., device=...) at /datax/scratch/nwest/Projects/bliss/bland/bland/ndarray.cpp:124
#15 0x00005555555ec54e in bland::to (src=..., dest_dev=...) at /datax/scratch/nwest/Projects/bliss/bland/bland/ops/ops.cpp:52
#16 0x00005555555dbb3d in bland::ndarray::to (this=0x7fffffff8b00, dest=...) at /datax/scratch/nwest/Projects/bliss/bland/bland/ndarray.cpp:257
#17 0x00005555555e9b7c in bland::ndarray_deferred::operator bland::ndarray (this=0x7fffffff8cb0) at /datax/scratch/nwest/Projects/bliss/bland/bland/ndarray_deferred.cpp:47
#18 0x00005555555a8ff9 in operator() (__closure=0x5555577a0db0) at /datax/scratch/nwest/Projects/bliss/bliss/flaggers/spectral_kurtosis.cpp:35
#19 0x00005555555a9b84 in std::__invoke_impl<bland::ndarray, bliss::flag_spectral_kurtosis(bliss::coarse_channel, float, float)::<lambda()>&>(std::__invoke_other, struct {...} &) (__f=...)
    at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/invoke.h:61
#20 0x00005555555a9a68 in std::__invoke_r<bland::ndarray, bliss::flag_spectral_kurtosis(bliss::coarse_channel, float, float)::<lambda()>&>(struct {...} &) (__fn=...) at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/invoke.h:116
#21 0x00005555555a98ff in std::_Function_handler<bland::ndarray(), bliss::flag_spectral_kurtosis(bliss::coarse_channel, float, float)::<lambda()> >::_M_invoke(const std::_Any_data &) (__functor=...)
    at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/std_function.h:291
--Type <RET> for more, q to quit, c to continue without paging--
#22 0x00005555555e9eb1 in std::function<bland::ndarray ()>::operator()() const (this=0x5555577a1b90) at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/std_function.h:590
#23 0x00005555555e9ae0 in bland::ndarray_deferred::operator bland::ndarray (this=0x7fffffff8fa0) at /datax/scratch/nwest/Projects/bliss/bland/bland/ndarray_deferred.cpp:41
#24 0x0000555555598ef7 in operator() (__closure=0x5555577a1da0) at /datax/scratch/nwest/Projects/bliss/bliss/drift_search/integrate_drifts.cpp:43
#25 0x000055555559a376 in std::__invoke_impl<bliss::frequency_drift_plane, bliss::integrate_drifts(bliss::coarse_channel, bliss::integrate_drifts_options)::<lambda()>&>(std::__invoke_other, struct {...} &) (__f=...)
    at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/invoke.h:61
#26 0x0000555555599ffa in std::__invoke_r<bliss::frequency_drift_plane, bliss::integrate_drifts(bliss::coarse_channel, bliss::integrate_drifts_options)::<lambda()>&>(struct {...} &) (__fn=...)
    at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/invoke.h:116
#27 0x0000555555599bde in std::_Function_handler<bliss::frequency_drift_plane(), bliss::integrate_drifts(bliss::coarse_channel, bliss::integrate_drifts_options)::<lambda()> >::_M_invoke(const std::_Any_data &) (__functor=...)
    at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/std_function.h:291
#28 0x0000555555593b25 in std::function<bliss::frequency_drift_plane ()>::operator()() const (this=0x5555577a20f0) at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/std_function.h:590
#29 0x0000555555592566 in bliss::coarse_channel::integrated_drift_plane (this=0x7fffffffa930) at /datax/scratch/nwest/Projects/bliss/bliss/core/coarse_channel.cpp:279
#30 0x000055555559f4c3 in bliss::protohit_search (dedrifted_coarse_channel=..., options=...) at /datax/scratch/nwest/Projects/bliss/bliss/drift_search/protohit_search.cpp:30
#31 0x000055555559cd5d in bliss::hit_search[abi:cxx11](bliss::coarse_channel, bliss::hit_search_options) (dedrifted_scan=..., options=...) at /datax/scratch/nwest/Projects/bliss/bliss/drift_search/hit_search.cpp:19
#32 0x000055555559d2a1 in operator() (__closure=0x5555577acff0) at /datax/scratch/nwest/Projects/bliss/bliss/drift_search/hit_search.cpp:69
#33 0x000055555559dbb4 in std::__invoke_impl<std::__cxx11::list<bliss::hit>, bliss::hit_search(bliss::scan, bliss::hit_search_options)::<lambda()>&>(std::__invoke_other, struct {...} &) (__f=...)
    at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/invoke.h:61
#34 0x000055555559daa5 in std::__invoke_r<std::__cxx11::list<bliss::hit>, bliss::hit_search(bliss::scan, bliss::hit_search_options)::<lambda()>&>(struct {...} &) (__fn=...) at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/invoke.h:116
#35 0x000055555559d909 in std::_Function_handler<std::__cxx11::list<bliss::hit, std::allocator<bliss::hit> >(), bliss::hit_search(bliss::scan, bliss::hit_search_options)::<lambda()> >::_M_invoke(const std::_Any_data &) (__functor=...)
    at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/std_function.h:291
#36 0x0000555555593795 in std::function<std::__cxx11::list<bliss::hit, std::allocator<bliss::hit> > ()>::operator()() const (this=0x5555577acfc0) at /mnt_home2/nwest/.conda/envs/bliss-dev/x86_64-conda-linux-gnu/include/c++/11.4.0/bits/std_function.h:590
#37 0x0000555555592041 in bliss::coarse_channel::hits[abi:cxx11]() const (this=0x555557785320) at /datax/scratch/nwest/Projects/bliss/bliss/core/coarse_channel.cpp:132
#38 0x0000555555589023 in bliss::scan::hits[abi:cxx11]() (this=0x7fffffffba20) at /datax/scratch/nwest/Projects/bliss/bliss/core/scan.cpp:246
#39 0x000055555556b3be in main (argc=1, argv=0x7fffffffdf28) at /datax/scratch/nwest/Projects/bliss/bliss/justrun.cpp:63
n-west commented 7 months ago

Fixed in 15ea6e25f731ad8826e67996245ad769775efb8d