PaddlePaddle / Anakin

A high-performance, cross-platform inference engine. You can run Anakin on x86 CPU, ARM, NVIDIA GPU, AMD GPU, Bitmain, and Cambricon devices.
https://anakin.baidu.com/
Apache License 2.0

SIGFPE error while running Anakin on AMD GPU #516

Open · avinashcpandey opened 5 years ago

avinashcpandey commented 5 years ago

I am running Anakin on an AMD GPU, built from the git repo https://github.com/ROCmSoftwarePlatform/Anakin, branch AMD_master_upstream.

Now, whenever I run any model, I see the floating-point (SIGFPE) error below.

With gdb:

Thread 1 "benchmark" received signal SIGFPE, Arithmetic exception. 0x00001555527dc6b6 in miopen::solver::ConvBinWinograd3x3U::IsApplicable(miopen::ConvolutionContext const&) const () from /home/amd/avinash/Anakin_GPU/rocm-Anakin/Anakin/output/miopen/lib/libMIOpen.so.1 (gdb)

On the console:

logger caught a signal: SIGFPE
FTL| 11:21:19.00891| 3.844s| main_thread| :0] Signal caught: SIGFPE
Signal caught: fatal error: stack trace:
19 0x4cb3a9 _start + 41
18 0x7fe38ed7d830 __libc_start_main + 240
17 0x4cd592 main + 1691
16 0x4d0c7d anakin::test::EngineTest::run_all(char const) + 1243
15 0x4d7ff6 std::function<void ()>::operator()() const + 50
14 0x4dfa48 std::_Function_handler<void (), std::_Bind<std::_Mem_fn<void (NetTest_net_execute_base_test::)()> (NetTest_net_execute_base_test)> >::_M_invoke(std::_Any_data const&) + 32
13 0x4e4ca9 void std::_Bind<std::_Mem_fn<void (NetTest_net_execute_base_test::)()> (NetTest_net_execute_base_test)>::operator()<, void>() + 57
12 0x4e91bf void std::_Bind<std::_Mem_fn<void (NetTest_net_execute_base_test::)()> (NetTest_net_execute_base_test)>::call<void, , 0ul>(std::tuple<>&&, std::_Index_tuple<0ul>) + 87
11 0x4ec335 void std::_Mem_fn_base<void (NetTest_net_execute_base_test::)(), true>::operator()<, void>(NetTest_net_execute_base_test&) const + 101
10 0x4cc25a NetTest_net_execute_base_test::net_execute_base_test() + 2076
9 0x7fe390a563f1 anakin::Net<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2>, (anakin::Precision)0, (anakin::OpRunType)1>::Net(anakin::graph::Graph<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2>, (anakin::Precision)0>&, bool) + 225
8 0x7fe390a59e3a anakin::Net<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2>, (anakin::Precision)0, (anakin::OpRunType)1>::init(anakin::graph::Graph<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2>, (anakin::Precision)0>&) + 7540
7 0x7fe390c6420d anakin::ops::ConvReluHelper<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2>, (anakin::Precision)0>::Init(anakin::saber::Context<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >&, std::vector<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >, std::allocator<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >> > const&, std::vector<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >, std::allocator<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >> >&) + 571
6 0x7fe390b9b0cd anakin::saber::BaseFunc<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2>, (anakin::saber::DataType)1, anakin::saber::ImplBase, anakin::saber::ConvParam>::init(std::vector<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >, std::allocator<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >> > const&, std::vector<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >, std::allocator<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >> >&, anakin::saber::ConvParam<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >&, anakin::saber::SaberImplStrategy, anakin::saber::ImplEnum, anakin::saber::Context<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >&) + 739
5 0x7fe38f8d979a anakin::saber::SaberConv2D<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2>, (anakin::saber::DataType)1>::init(std::vector<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >, std::allocator<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >> > const&, std::vector<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >, std::allocator<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >> >&, anakin::saber::ConvParam<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >&, anakin::saber::Context<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >&) + 76
4 0x7fe38f8db3e0 anakin::saber::SaberConv2D<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2>, (anakin::saber::DataType)1>::create(std::vector<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >, std::allocator<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >> > const&, std::vector<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >, std::allocator<anakin::saber::Tensor<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >*> >&, anakin::saber::ConvParam<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >&, anakin::saber::Context<anakin::saber::TargetType<(anakin::saber::TargetTypeEnum)2> >&) + 7236
3 0x7fe38f8fa92a _ZN6miopen6solver17SearchForSolutionIINS0_19ConvBinWinograd3x3UENS0_22ConvOclDirectFwd1x1AMDENS0_19ConvOclDirectFwdGenENS0_19ConvOclDirectFwd3x3ENS0_19ConvOclDirectFwd1x1ENS0_16ConvOclDirectFwdEENS_18ConvolutionContextENS_2DbEEENSt11common_typeIIDpDTcl12FindSolutiontlT_Efp_fp0_EEEE4typeERKT0T1 + 122
2 0x7fe38f8ff2ef void miopen::solver::ExpandSearch<miopen::ConvolutionContext, miopen::solver::ConvSolution, miopen::Db, bool, miopen::solver::ConvBinWinograd3x3U, miopen::solver::ConvOclDirectFwd1x1AMD, miopen::solver::ConvOclDirectFwdGen, miopen::solver::ConvOclDirectFwd3x3, miopen::solver::ConvOclDirectFwd1x1, miopen::solver::ConvOclDirectFwd>(miopen::ConvolutionContext const&, miopen::Db&, miopen::solver::ConvSolution&, bool&, miopen::solver::ConvBinWinograd3x3U&&, miopen::solver::ConvOclDirectFwd1x1AMD&&, miopen::solver::ConvOclDirectFwdGen&&, miopen::solver::ConvOclDirectFwd3x3&&, miopen::solver::ConvOclDirectFwd1x1&&, miopen::solver::ConvOclDirectFwd&&) + 63
1 0x7fe38f9009c3 void miopen::solver::ExpandSearch<miopen::ConvolutionContext, miopen::solver::ConvSolution, miopen::Db, bool, miopen::solver::ConvBinWinograd3x3U>(miopen::ConvolutionContext const&, miopen::Db&, miopen::solver::ConvSolution&, bool&, miopen::solver::ConvBinWinograd3x3U&) + 111
0 0x7fe38dce56b6 miopen::solver::ConvBinWinograd3x3U::IsApplicable(miopen::ConvolutionContext const&) const + 598
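
As an aside on how the console output above is produced: the "logger caught a signal: SIGFPE" line indicates that a signal handler intercepts the fault and dumps a numbered stack trace before the process exits. The sketch below shows the general pattern using glibc's backtrace facilities; it is an illustration only, not Anakin's actual logger implementation.

```cpp
// Rough sketch (not Anakin's logger): printing a stack trace from a SIGFPE
// handler with glibc's backtrace()/backtrace_symbols_fd().
#include <csignal>
#include <cstdlib>
#include <execinfo.h>
#include <unistd.h>

extern "C" void fpe_handler(int /*sig*/) {
    // write() is async-signal-safe; backtrace_symbols_fd() writes straight to
    // the fd and avoids malloc (unlike backtrace_symbols()).
    const char msg[] = "logger caught a signal: SIGFPE\n";
    write(STDERR_FILENO, msg, sizeof(msg) - 1);

    void* frames[32];
    int n = backtrace(frames, 32);                  // collect return addresses
    backtrace_symbols_fd(frames, n, STDERR_FILENO); // one line per frame
    _exit(EXIT_FAILURE);                            // never return into the faulting instruction
}

int main() {
    std::signal(SIGFPE, fpe_handler);

    volatile int zero = 0;
    volatile int r = 1 / zero; // integer division by zero -> SIGFPE -> handler above
    (void)r;
    return 0;
}
```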