alibaba / euler

A distributed graph deep learning framework.
Apache License 2.0
2.89k stars 559 forks source link

分布式报错 #266

Open MeliaLin opened 4 years ago

MeliaLin commented 4 years ago

Caused by op u'save/SaveV2_1', defined at:
  File "/usr/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/zhaog/euler/tf_euler/__main__.py", line 28, in <module>
    tf.app.run(run_loop.main)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 125, in run
    _sys.exit(main(argv))
  File "tf_euler/python/run_loop.py", line 403, in main
    run_distributed(flags_obj, run_network_embedding)
  File "tf_euler/python/run_loop.py", line 395, in run_distributed
    run(flags_obj, server.target, flags_obj.task_index == 0)
  File "tf_euler/python/run_loop.py", line 357, in run_network_embedding
    run_train(model, flags_obj, master, is_chief)
  File "tf_euler/python/run_loop.py", line 138, in run_train
    config=config) as sess:
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 504, in MonitoredTrainingSession
    stop_grace_period_secs=stop_grace_period_secs)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 921, in __init__
    stop_grace_period_secs=stop_grace_period_secs)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 643, in __init__
    self._sess = _RecoverableSession(self._coordinated_creator)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1107, in __init__
    _WrappedSession.__init__(self, self._create_session())
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 1112, in _create_session
    return self._sess_creator.create_session()
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 800, in create_session
    self.tf_sess = self._session_creator.create_session()
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 557, in create_session
    self._scaffold.finalize()
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/monitored_session.py", line 213, in finalize
    self._saver = training_saver._get_saver_or_default() # pylint: disable=protected-access
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 886, in _get_saver_or_default
    saver = Saver(sharded=True, allow_empty=True)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1102, in __init__
    self.build()
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1114, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 1151, in _build
    build_save=build_save, build_restore=build_restore)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 786, in _build_internal
    save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 369, in _AddShardedSaveOps
    return self._AddShardedSaveOpsForV2(filename_tensor, per_device)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 343, in _AddShardedSaveOpsForV2
    sharded_saves.append(self._AddSaveOps(sharded_filename, saveables))
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 284, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/training/saver.py", line 202, in save_op
    tensors)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1690, in save_v2
    shape_and_slices=shape_and_slices, tensors=tensors, name=name)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/home/zhaog/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

NotFoundError (see above for traceback): ckpt/model.ckpt-0_temp_75c97d512fc64129a5e87134602cf72f; No such file or directory
  [[node save/SaveV2_1 (defined at tf_euler/python/run_loop.py:138) = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_INT64, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:ps/replica:0/task:1/device:CPU:0"](save/ShardedFilename_1, save/SaveV2_1/tensor_names, save/SaveV2_1/shape_and_slices, beta1_power, beta2_power, global_step, supervisedgcn_1/dense_1/bias, supervisedgcn_1/dense_1/bias/Adam, supervisedgcn_1/dense_1/bias/Adam_1, supervisedgcn_1/gcnencoder_1/meanaggregator_1/dense_3/kernel, supervisedgcn_1/gcnencoder_1/meanaggregator_1/dense_3/kernel/Adam, supervisedgcn_1/gcnencoder_1/meanaggregator_1/dense_3/kernel/Adam_1, supervisedgcn_1/gcnencoder_1/meanaggregator_2/dense_5/kernel, supervisedgcn_1/gcnencoder_1/meanaggregator_2/dense_5/kernel/Adam, supervisedgcn_1/gcnencoder_1/meanaggregator_2/dense_5/kernel/Adam_1)]]
  [[{{node save/Identity_S135}} = _HostRecv[client_terminated=false, recv_device="/job:worker/replica:0/task:0/device:CPU:0", send_device="/job:ps/replica:0/task:1/device:CPU:0", send_device_incarnation=5790890598492272996, tensor_name="edge_82_save/Identity", tensor_type=DT_STRING, _device="/job:worker/replica:0/task:0/device:CPU:0"]()]]

ergouy commented 3 years ago

铁子,有没有安装教程?我按照官方安装完,总是缺一些东西。