DeepRec-AI / HybridBackend

A high-performance framework for training wide-and-deep recommender systems on heterogeneous clusters
Apache License 2.0
156 stars 30 forks source link

With feature_column bucket_size set to 6 and 8 GPUs in use, 'save/RestoreV2' fails on worker-5 and worker-6 #89

Open zhbhhb opened 1 year ago

zhbhhb commented 1 year ago

feature_column bucket_size is 6, use 8 gpus, then worker-5 and worker-6 'save/RestoreV2' failed; backtrace: Traceback (most recent call last): File "neg_feedback_multi.py", line 1252, in tf.app.run() File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/platform/app.py", line 40, in run _run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef) File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 308, in run _run_main(main, args) File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 254, in _run_main sys.exit(main(argv)) File "neg_feedback_multi.py", line 1235, in main model.run() File "neg_feedback_multi.py", line 1227, in run classifier.train_and_evaluate(train_spec, eval_spec) File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/estimator/estimator.py", line 276, in train_and_evaluate return executor.run() File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 640, in run getattr(self, task_to_run)() File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 650, in run_worker return self._start_distributed_training() File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 796, in _start_distributed_training saving_listeners=saving_listeners) File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/estimator/estimator.py", line 188, in train saving_listeners=saving_listeners) File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 370, in train loss = self._train_model(input_fn, hooks, saving_listeners) File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model return self._train_model_default(input_fn, hooks, saving_listeners) File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1195, in 
_train_model_default saving_listeners) File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1490, in _train_with_estimator_spec log_step_count_steps=log_step_count_steps) as mon_sess: File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/session.py", line 131, in HybridBackendMonitoredTrainingSession sess = fn(args, kwargs) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 678, in MonitoredTrainingSession stop_grace_period_secs=stop_grace_period_secs) File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/session.py", line 64, in init session_creator, hooks, should_recover=True, kwargs) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 827, in init self._sess = _RecoverableSession(self._coordinated_creator) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1309, in init _WrappedSession.init(self, self._create_session()) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1314, in _create_session return self._sess_creator.create_session() File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 980, in create_session self.tf_sess = self._session_creator.create_session() File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 733, in create_session self._scaffold.finalize() File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 252, in finalize self._saver.build() File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1059, in build self._build(self._filename, build_save=True, build_restore=True) File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/saver.py", 
line 258, in _build super()._build(args, *kwargs) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1137, in _build build_restore=build_restore) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 660, in _build_internal restore_sequentially, reshape) File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/saver.py", line 200, in _AddShardedRestoreOps filename_tensor, per_device, restore_sequentially, reshape) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 536, in _AddShardedRestoreOps name="restore_shard")) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 476, in _AddRestoreOps restore_sequentially) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 744, in bulk_restore return io_ops.restore_v2(filename_tensor, names, slices, dtypes) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_io_ops.py", line 2380, in restore_v2 name=name) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper op_def=op_def) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func return func(args, **kwargs) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3360, in create_op attrs, op_def, compute_device) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3429, in _create_op_internal op_def=op_def) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1773, in init control_input_ops) File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1613, in _create_c_op raise ValueError(str(e)) ValueError: Expected non-negative start and positive length but got start = 6, 
length = 0: string = 6,0:0,10 for 'save/RestoreV2' (op: 'RestoreV2') with input shapes: [], [382], [382] and with computed input tensors: input[2] = <144150 23 108114,18018:0,23 144150 23 108114,18018:0,23 195