With a feature_column bucket_size of 6 and training on 8 GPUs, creating the 'save/RestoreV2' op fails on worker-5 and worker-6.
Backtrace:
Traceback (most recent call last):
File "neg_feedback_multi.py", line 1252, in <module>
tf.app.run()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "neg_feedback_multi.py", line 1235, in main
model.run()
File "neg_feedback_multi.py", line 1227, in run
classifier.train_and_evaluate(train_spec, eval_spec)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/estimator/estimator.py", line 276, in train_and_evaluate
return executor.run()
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 640, in run
getattr(self, task_to_run)()
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 650, in run_worker
return self._start_distributed_training()
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 796, in _start_distributed_training
saving_listeners=saving_listeners)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/estimator/estimator.py", line 188, in train
saving_listeners=saving_listeners)
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1195, in _train_model_default
saving_listeners)
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1490, in _train_with_estimator_spec
log_step_count_steps=log_step_count_steps) as mon_sess:
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/session.py", line 131, in HybridBackendMonitoredTrainingSession
sess = fn(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 678, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/session.py", line 64, in __init__
session_creator, hooks, should_recover=True, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 827, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1309, in __init__
_WrappedSession.__init__(self, self._create_session())
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1314, in _create_session
return self._sess_creator.create_session()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 980, in create_session
self.tf_sess = self._session_creator.create_session()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 733, in create_session
self._scaffold.finalize()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 252, in finalize
self._saver.build()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1059, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/saver.py", line 258, in _build
super()._build(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1137, in _build
build_restore=build_restore)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 660, in _build_internal
restore_sequentially, reshape)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/saver.py", line 200, in _AddShardedRestoreOps
filename_tensor, per_device, restore_sequentially, reshape)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 536, in _AddShardedRestoreOps
name="restore_shard"))
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 476, in _AddRestoreOps
restore_sequentially)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 744, in bulk_restore
return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_io_ops.py", line 2380, in restore_v2
name=name)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3360, in create_op
attrs, op_def, compute_device)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3429, in _create_op_internal
op_def=op_def)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1773, in __init__
control_input_ops)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1613, in _create_c_op
raise ValueError(str(e))
ValueError: Expected non-negative start and positive length but got start = 6, length = 0: string = 6,0:0,10 for 'save/RestoreV2' (op: 'RestoreV2') with input shapes: [], [382], [382] and with computed input tensors: input[2] = <144150 23 108114,18018:0,23 144150 23 108114,18018:0,23 195
With a feature_column bucket_size of 6 and training on 8 GPUs, creating the 'save/RestoreV2' op fails on worker-5 and worker-6.
Backtrace:
Traceback (most recent call last):
File "neg_feedback_multi.py", line 1252, in <module>
tf.app.run()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 308, in run
_run_main(main, args)
File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 254, in _run_main
sys.exit(main(argv))
File "neg_feedback_multi.py", line 1235, in main
model.run()
File "neg_feedback_multi.py", line 1227, in run
classifier.train_and_evaluate(train_spec, eval_spec)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/estimator/estimator.py", line 276, in train_and_evaluate
return executor.run()
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 640, in run
getattr(self, task_to_run)()
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 650, in run_worker
return self._start_distributed_training()
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/training.py", line 796, in _start_distributed_training
saving_listeners=saving_listeners)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/estimator/estimator.py", line 188, in train
saving_listeners=saving_listeners)
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 370, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1161, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1195, in _train_model_default
saving_listeners)
File "/home/pai/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1490, in _train_with_estimator_spec
log_step_count_steps=log_step_count_steps) as mon_sess:
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/session.py", line 131, in HybridBackendMonitoredTrainingSession
sess = fn(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 678, in MonitoredTrainingSession
stop_grace_period_secs=stop_grace_period_secs)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/session.py", line 64, in __init__
session_creator, hooks, should_recover=True, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 827, in __init__
self._sess = _RecoverableSession(self._coordinated_creator)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1309, in __init__
_WrappedSession.__init__(self, self._create_session())
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1314, in _create_session
return self._sess_creator.create_session()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 980, in create_session
self.tf_sess = self._session_creator.create_session()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 733, in create_session
self._scaffold.finalize()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 252, in finalize
self._saver.build()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1059, in build
self._build(self._filename, build_save=True, build_restore=True)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/saver.py", line 258, in _build
super()._build(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 1137, in _build
build_restore=build_restore)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 660, in _build_internal
restore_sequentially, reshape)
File "/home/pai/lib/python3.6/site-packages/hybridbackend/tensorflow/training/saver.py", line 200, in _AddShardedRestoreOps
filename_tensor, per_device, restore_sequentially, reshape)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 536, in _AddShardedRestoreOps
name="restore_shard"))
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 476, in _AddRestoreOps
restore_sequentially)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/saver.py", line 744, in bulk_restore
return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_io_ops.py", line 2380, in restore_v2
name=name)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3360, in create_op
attrs, op_def, compute_device)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3429, in _create_op_internal
op_def=op_def)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1773, in __init__
control_input_ops)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1613, in _create_c_op
raise ValueError(str(e))
ValueError: Expected non-negative start and positive length but got start = 6, length = 0: string = 6,0:0,10 for 'save/RestoreV2' (op: 'RestoreV2') with input shapes: [], [382], [382] and with computed input tensors: input[2] = <144150 23 108114,18018:0,23 144150 23 108114,18018:0,23 195