alibaba / FastNN

FastNN provides distributed training examples that use EPL.
Apache License 2.0
81 stars 19 forks source link

bert示例运行报错 OP_REQUIRES failed at nccl_communicator.cc:116 : Internal: unhandled system error,请问如何解决? #16

Open gyr-kdgc opened 11 months ago

gyr-kdgc commented 11 months ago

2023-09-27 09:21:54.582250: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1351] Created TensorFlow device (/job:worker/replica:0/task:1/device:GPU:0 with 5211 MB memory) -> physical GPU (device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0000:0b:00.0, compute capability: 7.0) 2023-09-27 09:21:54.583242: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:258] Initialize GrpcChannelCache for job worker -> {0 -> 127.0.0.1:54368, 1 -> localhost:45069} 2023-09-27 09:21:54.587690: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:365] Started server with target: grpc://localhost:45069 2023-09-27 09:21:59.865339: W tensorflow/core/framework/op_kernel.cc:1651] OP_REQUIRES failed at nccl_communicator.cc:116 : Internal: unhandled system error 2023-09-27 09:21:59.865421: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Internal: unhandled system error [[{{node BROADCAST_0_broadcast_pool_group_0/1/EplNcclCommunicatorCreater}}]] 2023-09-27 09:21:59.865506: W tensorflow/core/framework/op_kernel.cc:1651] OP_REQUIRES failed at nccl_communicator.cc:116 : Internal: unhandled system error 2023-09-27 09:21:59.865408: W tensorflow/core/framework/op_kernel.cc:1651] OP_REQUIRES failed at nccl_communicator.cc:116 : Internal: unhandled system error ERROR:tensorflow:Error recorded from training_loop: From /job:worker/replica:0/task:1: unhandled system error [[node BROADCAST_0_broadcast_pool_group_0/1/EplNcclCommunicatorCreater (defined at /usr/local/lib/python3.8/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]

Original stack trace for 'BROADCAST_0_broadcast_pool_group_0/1/EplNcclCommunicatorCreater': File "run_squad_dp.py", line 33, in <module> tf.app.run() File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/platform/app.py", line 40, in run _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef) File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 312, in run _run_main(main, args) File "/usr/local/lib/python3.8/dist-packages/absl/app.py", line 258, in _run_main sys.exit(main(argv)) File "/home/epl/FastNN/bert/run_squad.py", line 1255, in main estimator.train(input_fn=train_input_fn, max_steps=num_train_steps, hooks=hooks) File "/usr/local/lib/python3.8/dist-packages/tensorflow_estimator/python/estimator/tpu/tpu_estimator.py", line 3025, in train return super(TPUEstimator, self).train( File "/usr/local/lib/python3.8/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 370, in train loss = self._train_model(input_fn, hooks, saving_listeners) File "/usr/local/lib/python3.8/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model return self._train_model_default(input_fn, hooks, saving_listeners) File "/usr/local/lib/python3.8/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1193, in _train_model_default return self._train_with_estimator_spec(estimator_spec, worker_hooks, File "/usr/local/lib/python3.8/dist-packages/tensorflow_estimator/python/estimator/estimator.py", line 1478, in _train_with_estimator_spec with training.MonitoredTrainingSession( File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 581, in MonitoredTrainingSession return MonitoredSession( File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1010, in __init__ super(MonitoredSession, self).__init__( File "/usr/local/lib/python3.8/dist-packages/epl/parallel/hooks.py", line 319, in __init__ res = fn(self, *args, 
**kwargs) File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 725, in __init__ self._sess = _RecoverableSession(self._coordinated_creator) File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1207, in __init__ _WrappedSession.__init__(self, self._create_session()) File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 1212, in _create_session return self._sess_creator.create_session() File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 878, in create_session self.tf_sess = self._session_creator.create_session() File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/monitored_session.py", line 639, in create_session return self._get_session_manager().prepare_session( File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/training/session_manager.py", line 296, in prepare_session sess.run(init_op, feed_dict=init_feed_dict) File "/usr/local/lib/python3.8/dist-packages/epl/parallel/hooks.py", line 453, in run assign_ops = _init_local_resources(self, fn) File "/usr/local/lib/python3.8/dist-packages/epl/parallel/hooks.py", line 416, in _init_local_resources assign_ops = broadcast_variables() File "/usr/local/lib/python3.8/dist-packages/epl/parallel/hooks.py", line 354, in broadcast_variables reduced_variables = comm.broadcast(bcast_variables) File "/usr/local/lib/python3.8/dist-packages/epl/communicators/collective_communicator.py", line 131, in broadcast comm_pool = self.get_or_create_comm(comm_name, comm_spec, communication_op=Communicator.BROADCAST) File "/usr/local/lib/python3.8/dist-packages/epl/communicators/collective_communicator.py", line 78, in get_or_create_comm comm = CommunicationPool(self.options.num_communicators, File "/usr/local/lib/python3.8/dist-packages/epl/communicators/communication_pool.py", line 41, in __init__ 
self._communicator_list = [ File "/usr/local/lib/python3.8/dist-packages/epl/communicators/communication_pool.py", line 42, in <listcomp> build_communicator('{}/group_{}'.format(comm_name, index), comm_spec) File "/usr/local/lib/python3.8/dist-packages/epl/communicators/options.py", line 309, in build_communicator return Communicator.create( File "/usr/local/lib/python3.8/dist-packages/epl/communicators/base.py", line 98, in create return impl(shared_name, devices=devices, **kwargs) File "/usr/local/lib/python3.8/dist-packages/epl/communicators/nccl.py", line 79, in __init__ ops.GraphKeys.LOCAL_RESOURCES, self.build_resource()) File "/usr/local/lib/python3.8/dist-packages/epl/communicators/nccl.py", line 101, in build_resource self._create_op = self._handle.create( File "/usr/local/lib/python3.8/dist-packages/epl/communicators/nccl_ops.py", line 165, in create return _ops.epl_nccl_communicator_creater( File "<string>", line 1007, in epl_nccl_communicator_creater File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/framework/op_def_library.py", line 792, in _apply_op_helper op = g.create_op(op_type_name, inputs, dtypes=None, name=scope, File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/util/deprecation.py", line 513, in new_func return func(*args, **kwargs) File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/framework/ops.py", line 3356, in create_op return self._create_op_internal(op_type, inputs, dtypes, input_types, name, File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/framework/ops.py", line 3418, in _create_op_internal ret = Operation( File "/usr/local/lib/python3.8/dist-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__ self._traceback = tf_stack.extract_stack()

E0927 09:21:59.979781 140333610264384 error_handling.py:75] Error recorded from training_loop: From /job:worker/replica:0/task:1: unhandled system error [[node BROADCAST_0_broadcast_pool_group_0/1/EplNcclCommunicatorCreater (defined at /usr/local/lib/python3.8/dist-packages/tensorflow_core/python/framework/ops.py:1748) ]]

Original stack trace for 'BROADCAST_0_broadcast_pool_group_0/1/EplNcclCommunicatorCreater':

SeaOfOcean commented 10 months ago

可以看看 NCCL_SOCKET_IFNAME 是否配置正确