Status: Open — xiezipeng-ML opened this issue 1 year ago
# 2卡数据并行 [04/19 03:32:49 lb.models.utils.graph_base]: Start compling the train graph which may take some time. Please wait for a moment ... F20230419 03:32:54.952033 3055036 shape.cpp:30] Check failed: index < tp()->NumAxes() (0 vs. 0) Shape: () visit index: 0 > num_axes: 0 *** Check failure stack trace: *** @ 0x7f90eff21c9a google::LogMessage::Fail() @ 0x7f90eff24bd1 google::LogMessage::SendToLog() @ 0x7f90eff217c9 google::LogMessage::Flush() @ 0x7f90eff254b9 google::LogMessageFatal::~LogMessageFatal() @ 0x7f90e3d1feac oneflow::ConstShapeMixIn<>::At() @ 0x7f90e868117e oneflow::SubTskGphBuilderUtil::HasEmptySliceIfSplit() @ 0x7f90e867ea77 oneflow::SliceBoxingSubTskGphBuilder::Build() @ 0x7f90e8659811 oneflow::ChainSubTskGphBuilder::Build() @ 0x7f90e866ee3a oneflow::FlatSubTskGphBuilder::Build() @ 0x7f90e866ddad oneflow::DispatchHierarchicalSubTskGphBuilder::Build() @ 0x7f90e870f232 oneflow::TaskGraph::BldSubTskGphByBoxing() @ 0x7f90e87112e3 oneflow::TaskGraph::TaskGraph() @ 0x7f90e87a6563 oneflow::Compiler::Compile() @ 0x7f90e7ce1131 oneflow::NNGraph::CompilePlanForRuntime() @ 0x7f91c13179a3 (unknown) @ 0x7f91c118087a (unknown) @ 0x507f97 cfunction_call @ 0x4f0f1c _PyObject_MakeTpCall @ 0x505ed0 method_vectorcall @ 0x4ecebf _PyEval_EvalFrameDefault @ 0x4f8923 function_code_fastcall @ 0x505bb1 method_vectorcall @ 0x4ecebf _PyEval_EvalFrameDefault @ 0x4e729a _PyEval_EvalCode @ 0x505cbd method_vectorcall @ 0x5063f4 PyObject_Call @ 0x4ec77b _PyEval_EvalFrameDefault @ 0x4e729a _PyEval_EvalCode @ 0x4f8645 _PyFunction_Vectorcall @ 0x4f07f5 _PyObject_FastCallDictTstate @ 0x503616 _PyObject_Call_Prepend @ 0x5cc3f3 slot_tp_call F20230419 03:32:55.400154 3055037 rpc_client.cpp:49] Check failed: stub->CallMethod<ctrl_method>(&client_ctx, request_, &response_).error_code() == grpc::StatusCode::OK (14 vs. 
0) *** Check failure stack trace: *** @ 0x7f585adf0c9a google::LogMessage::Fail() @ 0x7f585adf3bd1 google::LogMessage::SendToLog() @ 0x7f585adf07c9 google::LogMessage::Flush() @ 0x7f585adf44b9 google::LogMessageFatal::~LogMessageFatal() @ 0x7f584ec1d381 oneflow::RpcClient::PullKV() @ 0x7f584ec1d514 oneflow::RpcClient::PullKV() @ 0x7f5852bb066e oneflow::NNGraph::CompilePlanForRuntime() @ 0x7f592c1e69a3 (unknown) @ 0x7f592c04f87a (unknown) @ 0x507f97 cfunction_call @ 0x4f0f1c _PyObject_MakeTpCall @ 0x505ed0 method_vectorcall @ 0x4ecebf _PyEval_EvalFrameDefault @ 0x4f8923 function_code_fastcall @ 0x505bb1 method_vectorcall @ 0x4ecebf _PyEval_EvalFrameDefault @ 0x4e729a _PyEval_EvalCode @ 0x505cbd method_vectorcall @ 0x5063f4 PyObject_Call @ 0x4ec77b _PyEval_EvalFrameDefault @ 0x4e729a _PyEval_EvalCode @ 0x4f8645 _PyFunction_Vectorcall @ 0x4f07f5 _PyObject_FastCallDictTstate @ 0x503616 _PyObject_Call_Prepend @ 0x5cc3f3 slot_tp_call @ 0x506498 PyObject_Call @ 0x4ec77b _PyEval_EvalFrameDefault @ 0x4e729a _PyEval_EvalCode @ 0x4f8645 _PyFunction_Vectorcall @ 0x4e882b _PyEval_EvalFrameDefault @ 0x4f8923 function_code_fastcall @ 0x4e882b _PyEval_EvalFrameDefault Killing subprocess 3055036 Killing subprocess 3055037 Traceback (most recent call last): File "/home/xiezipeng/anaconda3/envs/oneflow-dev-gcc9/lib/python3.9/runpy.py", line 197, in _run_module_as_main return _run_code(code, main_globals, None, File "/home/xiezipeng/anaconda3/envs/oneflow-dev-gcc9/lib/python3.9/runpy.py", line 87, in _run_code exec(code, run_globals) File "/home/xiezipeng/anaconda3/envs/oneflow-dev-gcc9/lib/python3.9/site-packages/oneflow/distributed/launch.py", line 240, in <module> main() File "/home/xiezipeng/anaconda3/envs/oneflow-dev-gcc9/lib/python3.9/site-packages/oneflow/distributed/launch.py", line 228, in main sigkill_handler(signal.SIGTERM, None) File "/home/xiezipeng/anaconda3/envs/oneflow-dev-gcc9/lib/python3.9/site-packages/oneflow/distributed/launch.py", line 196, in sigkill_handler 
raise subprocess.CalledProcessError( subprocess.CalledProcessError: Command '['/home/xiezipeng/anaconda3/envs/oneflow-dev-gcc9/bin/python3', '-u', 'tools/train_net.py', '--config-file', 'projects/mock_gpt_train/configs/training.py']' died with <Signals.SIGABRT: 6>.