Open tanklandry opened 1 year ago
F20230228 07:15:23.032397 192538 rpc_client.cpp:40] Check failed: stub->CallMethod(&clientctx, request, &response_).error_code() == grpc::StatusCode::OK (14 vs. 0) Check failure stack trace: F20230228 07:15:23.032395 192276 ctrl_client.cpp:54] Check failed: rpcclient.GetStubAt(i)->CallMethod( &client_ctx, request, &response).error_code() == grpc::StatusCode::OK (14 vs. 0) Machine 1 lost F20230228 07:15:23.032667 192277 ctrl_client.cpp:54] Check failed: rpcclient.GetStubAt(i)->CallMethod( &client_ctx, request, &response).error_code() == grpc::StatusCode::OK (14 vs. 0) Machine 1 lost F20230228 07:15:23.032728 192278 ctrl_client.cpp:54] Check failed: rpcclient.GetStubAt(i)->CallMethod( &client_ctx, request, &response).error_code() == grpc::StatusCode::OK (14 vs. 0) Machine 1 lost Check failure stack trace: Check failure stack trace: F20230228 07:15:23.032753 192537 rpc_client.cpp:40] Check failed: stub->CallMethod(&clientctx, request, &response_).error_code() == grpc::StatusCode::OK (14 vs. 0) Check failure stack trace: Check failure stack trace: @ 0x7f4fcc0319ba google::LogMessage::Fail() @ 0x7f4fcc031ca2 google::LogMessage::SendToLog() @ 0x7f4fcc0319ba google::LogMessage::Fail() @ 0x7f67977169ba google::LogMessage::Fail() @ 0x7fa94f6f59ba google::LogMessage::Fail() @ 0x7fa94f6f59ba google::LogMessage::Fail() F20230228 07:15:23.032397 192538 rpc_client.cpp:40] Check failed: stub->CallMethod(&clientctx, request, &response_).error_code() == grpc::StatusCode::OK (14 vs. 
0) F20230228 07:15:23.061861 192518 io_event_poller.cpp:95] Check failed: !(cur_event->events & EPOLLERR) fd: 62: Resource temporarily unavailable [11] Check failure stack trace: @ 0x7f4fcc031527 google::LogMessage::Flush() @ 0x7f4fcc031ca2 google::LogMessage::SendToLog() @ 0x7f6797716ca2 google::LogMessage::SendToLog() @ 0x7fa94f6f5ca2 google::LogMessage::SendToLog() @ 0x7fa94f6f5ca2 google::LogMessage::SendToLog() @ 0x7f4fcc034099 google::LogMessageFatal::~LogMessageFatal() @ 0x7f4fcc031527 google::LogMessage::Flush() @ 0x7fa94f6f5527 google::LogMessage::Flush() @ 0x7f6797716527 google::LogMessage::Flush() @ 0x7f4fcc0319ba google::LogMessage::Fail() @ 0x7fa94f6f5527 google::LogMessage::Flush() @ 0x7f4fcc034099 google::LogMessageFatal::~LogMessageFatal() @ 0x7fa94f6f8099 google::LogMessageFatal::~LogMessageFatal() @ 0x7f6797719099 google::LogMessageFatal::~LogMessageFatal() @ 0x7f4fc1bb548f oneflow::RpcClient::PushKV() @ 0x7f4fc1ba0f55 _ZZN7oneflow14GrpcCtrlClientC4ERKNS_10ProcessCtxEENKUlvE_clEv @ 0x7fa945264f55 _ZZN7oneflow14GrpcCtrlClientC4ERKNS_10ProcessCtxEENKUlvE_clEv @ 0x7f4fcc031ca2 google::LogMessage::SendToLog() @ 0x7f678d285f55 _ZZN7oneflow14GrpcCtrlClientC4ERKNS_10ProcessCtxEENKUlvE_clEv @ 0x7fa94f6f8099 google::LogMessageFatal::~LogMessageFatal() @ 0x7f4fcc0463ff execute_native_thread_routine @ 0x7fa94f70a3ff execute_native_thread_routine @ 0x7f679772b3ff execute_native_thread_routine @ 0x7f4fcc031527 google::LogMessage::Flush() @ 0x7f50366ae6db start_thread @ 0x7f4fc1bb5530 oneflow::RpcClient::PushKV() @ 0x7fa9b9d726db start_thread @ 0x7f6801d936db start_thread @ 0x7f5035c3271f clone @ 0x7fa9b92f671f clone @ 0x7f680131771f clone Killing subprocess 192187 Killing subprocess 192188 Killing subprocess 192189 Killing subprocess 192190 Traceback (most recent call last): File "/data/lhy/torch/lib/python3.8/runpy.py", line 194, in _run_module_as_main return _run_code(code, main_globals, None, File "/data/lhy/torch/lib/python3.8/runpy.py", line 87, in 
_run_code exec(code, run_globals) File "/data/lhy/torch/lib/python3.8/site-packages/oneflow/distributed/launch.py", line 240, in main() File "/data/lhy/torch/lib/python3.8/site-packages/oneflow/distributed/launch.py", line 228, in main sigkill_handler(signal.SIGTERM, None) File "/data/lhy/torch/lib/python3.8/site-packages/oneflow/distributed/launch.py", line 196, in sigkill_handler raise subprocess.CalledProcessError( subprocess.CalledProcessError: Command '['/data/lhy/torch/bin/python3', '-u', 'demo.py']' died with <Signals.SIGBUS: 7>.
请帮忙打开如下环境变量:
然后执行时会产生一个 log 目录,请把 log 目录打包发出来看看
之前我遇到过 `Signals.SIGBUS: 7` 之类的报错,是由于端口号被占用引起的。
Signals.SIGBUS: 7
可以在你的运行脚本里面加上 `--master_port 12344` 之类的参数,换一个端口试试。
--master_port 12344
F20230228 07:15:23.032397 192538 rpc_client.cpp:40] Check failed: stub->CallMethod(&clientctx, request, &response_).error_code() == grpc::StatusCode::OK (14 vs. 0)
Check failure stack trace:
F20230228 07:15:23.032395 192276 ctrl_client.cpp:54] Check failed: rpcclient.GetStubAt(i)->CallMethod( &client_ctx, request, &response).error_code() == grpc::StatusCode::OK (14 vs. 0) Machine 1 lost
F20230228 07:15:23.032667 192277 ctrl_client.cpp:54] Check failed: rpcclient.GetStubAt(i)->CallMethod( &client_ctx, request, &response).error_code() == grpc::StatusCode::OK (14 vs. 0) Machine 1 lost
F20230228 07:15:23.032728 192278 ctrl_client.cpp:54] Check failed: rpcclient.GetStubAt(i)->CallMethod( &client_ctx, request, &response).error_code() == grpc::StatusCode::OK (14 vs. 0) Machine 1 lost
Check failure stack trace:
Check failure stack trace:
F20230228 07:15:23.032753 192537 rpc_client.cpp:40] Check failed: stub->CallMethod(&clientctx, request, &response_).error_code() == grpc::StatusCode::OK (14 vs. 0)
Check failure stack trace:
Check failure stack trace:
@ 0x7f4fcc0319ba google::LogMessage::Fail()
@ 0x7f4fcc031ca2 google::LogMessage::SendToLog()
@ 0x7f4fcc0319ba google::LogMessage::Fail()
@ 0x7f67977169ba google::LogMessage::Fail()
@ 0x7fa94f6f59ba google::LogMessage::Fail()
@ 0x7fa94f6f59ba google::LogMessage::Fail()
F20230228 07:15:23.032397 192538 rpc_client.cpp:40] Check failed: stub->CallMethod(&clientctx, request, &response_).error_code() == grpc::StatusCode::OK (14 vs. 0)
F20230228 07:15:23.061861 192518 io_event_poller.cpp:95] Check failed: !(cur_event->events & EPOLLERR) fd: 62: Resource temporarily unavailable [11]
Check failure stack trace:
@ 0x7f4fcc031527 google::LogMessage::Flush()
@ 0x7f4fcc031ca2 google::LogMessage::SendToLog()
@ 0x7f6797716ca2 google::LogMessage::SendToLog()
@ 0x7fa94f6f5ca2 google::LogMessage::SendToLog()
@ 0x7fa94f6f5ca2 google::LogMessage::SendToLog()
@ 0x7f4fcc034099 google::LogMessageFatal::~LogMessageFatal()
@ 0x7f4fcc031527 google::LogMessage::Flush()
@ 0x7fa94f6f5527 google::LogMessage::Flush()
@ 0x7f6797716527 google::LogMessage::Flush()
@ 0x7f4fcc0319ba google::LogMessage::Fail()
@ 0x7fa94f6f5527 google::LogMessage::Flush()
@ 0x7f4fcc034099 google::LogMessageFatal::~LogMessageFatal()
@ 0x7fa94f6f8099 google::LogMessageFatal::~LogMessageFatal()
@ 0x7f6797719099 google::LogMessageFatal::~LogMessageFatal()
@ 0x7f4fc1bb548f oneflow::RpcClient::PushKV()
@ 0x7f4fc1ba0f55 _ZZN7oneflow14GrpcCtrlClientC4ERKNS_10ProcessCtxEENKUlvE_clEv
@ 0x7fa945264f55 _ZZN7oneflow14GrpcCtrlClientC4ERKNS_10ProcessCtxEENKUlvE_clEv
@ 0x7f4fcc031ca2 google::LogMessage::SendToLog()
@ 0x7f678d285f55 _ZZN7oneflow14GrpcCtrlClientC4ERKNS_10ProcessCtxEENKUlvE_clEv
@ 0x7fa94f6f8099 google::LogMessageFatal::~LogMessageFatal()
@ 0x7f4fcc0463ff execute_native_thread_routine
@ 0x7fa94f70a3ff execute_native_thread_routine
@ 0x7f679772b3ff execute_native_thread_routine
@ 0x7f4fcc031527 google::LogMessage::Flush()
@ 0x7f50366ae6db start_thread
@ 0x7f4fc1bb5530 oneflow::RpcClient::PushKV()
@ 0x7fa9b9d726db start_thread
@ 0x7f6801d936db start_thread
@ 0x7f5035c3271f clone
@ 0x7fa9b92f671f clone
@ 0x7f680131771f clone
Killing subprocess 192187
Killing subprocess 192188
Killing subprocess 192189
Killing subprocess 192190
Traceback (most recent call last):
File "/data/lhy/torch/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/data/lhy/torch/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/data/lhy/torch/lib/python3.8/site-packages/oneflow/distributed/launch.py", line 240, in
main()
File "/data/lhy/torch/lib/python3.8/site-packages/oneflow/distributed/launch.py", line 228, in main
sigkill_handler(signal.SIGTERM, None)
File "/data/lhy/torch/lib/python3.8/site-packages/oneflow/distributed/launch.py", line 196, in sigkill_handler
raise subprocess.CalledProcessError(
subprocess.CalledProcessError: Command '['/data/lhy/torch/bin/python3', '-u', 'demo.py']' died with <Signals.SIGBUS: 7>.