I have 330 hours of data and I want to fork training from a previous model with it, but I get the error below.
terminate called after throwing an instance of 'std::invalid_argument'
what(): CUDNN_STATUS_BAD_PARAM
*** Aborted at 1615368589 (unix time) try "date -d @1615368589" if you are using GNU date ***
terminate called after throwing an instance of 'std::invalid_argument'
what(): CUDNN_STATUS_BAD_PARAM
*** Aborted at 1615368589 (unix time) try "date -d @1615368589" if you are using GNU date ***
PC: @ 0x7f5d86a1de97 gsignal
*** SIGABRT (@0xb0c7) received by PID 45255 (TID 0x7f5dc8229380) from PID 45255; stack trace: ***
PC: @ 0x7fec48027e97 gsignal
*** SIGABRT (@0xb0c8) received by PID 45256 (TID 0x7fec89833380) from PID 45256; stack trace: ***
@ 0x7f5d87f8d890 (unknown)
PC: @ 0x7f22a40f6ca2 cuda::evalNodes<>()
PC: @ 0x7f5b68563ca2 cuda::evalNodes<>()
*** SIGFPE (@0x7f22a40f6ca2) received by PID 45251 (TID 0x7f22c5386380) from PID 18446744072167058594; stack trace: ***
@ 0x7fec49597890 (unknown)
PC: @ 0x7f3abd22dca2 cuda::evalNodes<>()
PC: @ 0x7f02272a5ca2 cuda::evalNodes<>()
PC: @ 0x7f030ebe2ca2 cuda::evalNodes<>()
*** SIGFPE (@0x7f5b68563ca2) received by PID 45249 (TID 0x7f5b897f3380) from PID 1750482082; stack trace: ***
@ 0x7f5d86a1de97 gsignal
PC: @ 0x7fa0cf3c1ca2 cuda::evalNodes<>()
*** SIGFPE (@0x7f3abd22dca2) received by PID 45253 (TID 0x7f3ade4bd380) from PID 18446744072587762850; stack trace: ***
*** SIGFPE (@0x7f030ebe2ca2) received by PID 45250 (TID 0x7f032fe72380) from PID 247344290; stack trace: ***
*** SIGFPE (@0x7f02272a5ca2) received by PID 45252 (TID 0x7f0248535380) from PID 657087650; stack trace: ***
*** SIGFPE (@0x7fa0cf3c1ca2) received by PID 45254 (TID 0x7fa0f0651380) from PID 18446744072891407522; stack trace: ***
@ 0x7fec48027e97 gsignal
@ 0x7f22850ea890 (unknown)
@ 0x7f5d86a1f801 abort
@ 0x7f5b49557890 (unknown)
@ 0x7f3a9e221890 (unknown)
@ 0x7f0208299890 (unknown)
@ 0x7f02efbd6890 (unknown)
@ 0x7fa0b03b5890 (unknown)
@ 0x7fec48029801 abort
@ 0x7f5d87641957 (unknown)
@ 0x7fec48c4b957 (unknown)
@ 0x7f5d87647ab6 (unknown)
@ 0x7fec48c51ab6 (unknown)
@ 0x7f5d87647af1 std::terminate()
@ 0x7fec48c51af1 std::terminate()
@ 0x7f5d87647d24 __cxa_throw
@ 0x7f22a40f6ca2 cuda::evalNodes<>()
@ 0x7fec48c51d24 __cxa_throw
@ 0x5623b8f75b33 fl::cudnnCheckErr()
@ 0x7f5b68563ca2 cuda::evalNodes<>()
@ 0x7f3abd22dca2 cuda::evalNodes<>()
@ 0x7fa0cf3c1ca2 cuda::evalNodes<>()
@ 0x7f02272a5ca2 cuda::evalNodes<>()
@ 0x7f030ebe2ca2 cuda::evalNodes<>()
@ 0x5610bd8e5b33 fl::cudnnCheckErr()
@ 0x5623b8f74216 fl::conv2d()
@ 0x5610bd8e4216 fl::conv2d()
@ 0x5623b8f1e661 fl::Conv2D::forward()
@ 0x5623b8f3248e fl::UnaryModule::forward()
@ 0x5610bd88e661 fl::Conv2D::forward()
@ 0x5610bd8a248e fl::UnaryModule::forward()
@ 0x5623b8f1c55a fl::Sequential::forward()
@ 0x5623b8c4da93 _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x5610bd88c55a fl::Sequential::forward()
@ 0x5623b8be13f8 main
@ 0x5610bd5bda93 _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x5610bd5513f8 main
@ 0x7f5d86a00b97 __libc_start_main
@ 0x7f22a40f7858 cuda::evalNodes<>()
@ 0x5623b8c47d6a _start
@ 0x7fec4800ab97 __libc_start_main
@ 0x7f5b68564858 cuda::evalNodes<>()
@ 0x7f3abd22e858 cuda::evalNodes<>()
@ 0x7fa0cf3c2858 cuda::evalNodes<>()
@ 0x7f02272a6858 cuda::evalNodes<>()
@ 0x7f030ebe3858 cuda::evalNodes<>()
@ 0x5610bd5b7d6a _start
@ 0x7f22a3ca536b cuda::Array<>::eval()
@ 0x7f0226e5436b cuda::Array<>::eval()
@ 0x7f030e79136b cuda::Array<>::eval()
@ 0x7f5b6811236b cuda::Array<>::eval()
@ 0x7f3abcddc36b cuda::Array<>::eval()
@ 0x7fa0cef7036b cuda::Array<>::eval()
@ 0x7f22a50c45b1 _ZN4cuda10reduce_allIL7af_op_t5EccEET1_RKNS_5ArrayIT0_EEbd
@ 0x7f02282735b1 _ZN4cuda10reduce_allIL7af_op_t5EccEET1_RKNS_5ArrayIT0_EEbd
@ 0x7f030fbb05b1 _ZN4cuda10reduce_allIL7af_op_t5EccEET1_RKNS_5ArrayIT0_EEbd
@ 0x7f5b695315b1 _ZN4cuda10reduce_allIL7af_op_t5EccEET1_RKNS_5ArrayIT0_EEbd
@ 0x7f3abe1fb5b1 _ZN4cuda10reduce_allIL7af_op_t5EccEET1_RKNS_5ArrayIT0_EEbd
@ 0x7fa0d038f5b1 _ZN4cuda10reduce_allIL7af_op_t5EccEET1_RKNS_5ArrayIT0_EEbd
@ 0x7f22a4a87c03 af_any_true_all
@ 0x7f0227c36c03 af_any_true_all
@ 0x7f030f573c03 af_any_true_all
@ 0x7f5b68ef4c03 af_any_true_all
@ 0x7f3abdbbec03 af_any_true_all
@ 0x7fa0cfd52c03 af_any_true_all
@ 0x7f22a4c5e609 af::anyTrue<>()
@ 0x560a6c347774 _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x560a6c2db3f8 main
@ 0x7f0227e0d609 af::anyTrue<>()
@ 0x5576be14c774 _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x7f030f74a609 af::anyTrue<>()
@ 0x55d0aeecd774 _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x5576be0e03f8 main
@ 0x7f2283b5db97 __libc_start_main
@ 0x55d0aee613f8 main
@ 0x560a6c341d6a _start
@ 0x7f3abdd95609 af::anyTrue<>()
@ 0x564aa2305774 _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x7f5b690cb609 af::anyTrue<>()
@ 0x7fa0cff29609 af::anyTrue<>()
@ 0x7f0206d0cb97 __libc_start_main
@ 0x5634c39bd774 _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x560413cb0774 _ZZ4mainENKUlSt10shared_ptrIN2fl6ModuleEES_IN3w2l17SequenceCriterionEES_INS3_10W2lDatasetEES_INS0_19FirstOrderOptimizerEES9_ddblE3_clES2_S5_S7_S9_S9_ddbl
@ 0x7f02ee649b97 __libc_start_main
@ 0x5576be146d6a _start
@ 0x560413c443f8 main
@ 0x564aa22993f8 main
@ 0x5634c39513f8 main
@ 0x55d0aeec7d6a _start
@ 0x7f3a9cc94b97 __libc_start_main
@ 0x7fa0aee28b97 __libc_start_main
@ 0x564aa22ffd6a _start
@ 0x7f5b47fcab97 __libc_start_main
@ 0x560413caad6a _start
@ 0x5634c39b7d6a _start
--------------------------------------------------------------------------
mpirun noticed that process rank 4 with PID 0 on node 3f1be4751a1c exited on signal 8 (Floating point exception).
--------------------------------------------------------------------------
Platform and Hardware
I train in the w2l container with 8x Tesla V100 GPUs.
Additional Context
I start training with the following command:
mpirun --allow-run-as-root -n 8 /root/new/build/Train fork /root/data/GM_am.bin -enable_distributed true --flagsfile=/root/data/0/fork_test.cfg
Bug Description
Dear all,
I have 330 hours of data and I want to fork training from a previous model with it, but I get the error below.
Platform and Hardware
I train in the w2l container with 8x Tesla V100 GPUs.
Additional Context
I start training with the following command:
mpirun --allow-run-as-root -n 8 /root/new/build/Train fork /root/data/GM_am.bin -enable_distributed true --flagsfile=/root/data/0/fork_test.cfg
Here is my config: