Status: Closed (alachyankar closed this issue 6 years ago)
Using old model checkpoints from the slim version of the OD API caused problems with the graph expected by estimator.
Hi, I have the same problem here, and it seems that you have solved it. Could you share the steps you took to get past it? I followed a post that I think is good, and my training results were similar to the author's. However, when I prepared my own data and started training, this problem occurred. Any suggestions would be appreciated!
Got the same error when using custom .record files. Does anyone have a solution yet?
System information
Error comes after TF recognizes the GPUs and loads the checkpoint files.
training_1 | Traceback (most recent call last):
training_1 | File "/workspace/models/research/object_detection/model_main.py", line 86, in <module>
training_1 | tf.app.run()
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 125, in run
training_1 | _sys.exit(main(argv))
training_1 | File "/workspace/models/research/object_detection/model_main.py", line 82, in main
training_1 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 451, in train_and_evaluate
training_1 | return executor.run()
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 590, in run
training_1 | return self.run_local()
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 691, in run_local
training_1 | saving_listeners=saving_listeners)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 343, in train
training_1 | loss = self._train_model(input_fn, hooks, saving_listeners)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1129, in _train_model
training_1 | return self._train_model_default(input_fn, hooks, saving_listeners)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1163, in _train_model_default
training_1 | saving_listeners)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1371, in _train_with_estimator_spec
training_1 | _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 583, in run
training_1 | run_metadata=run_metadata)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1059, in run
training_1 | run_metadata=run_metadata)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1150, in run
training_1 | raise six.reraise(*original_exc_info)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1135, in run
training_1 | return self._sess.run(*args, **kwargs)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 1207, in run
training_1 | run_metadata=run_metadata)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/monitored_session.py", line 987, in run
training_1 | return self._sess.run(*args, **kwargs)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 887, in run
training_1 | run_metadata_ptr)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1110, in _run
training_1 | feed_dict_tensor, options, run_metadata)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1286, in _do_run
training_1 | run_metadata)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1308, in _do_call
training_1 | raise type(e)(node_def, op, message)
training_1 | tensorflow.python.framework.errors_impl.DataLossError: Attempted to pad to a smaller size than the input element.
training_1 | [[{{node IteratorGetNext}} = IteratorGetNext[output_shapes=[[32], [32,100], [32,100,4], [32,100,95], [32,100], [32,100], [32,100], [32,100], [32,300,300,3], [32], [32], [32], [32,3]], output_types=[DT_STRING, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT64, DT_INT64, DT_BOOL, DT_FLOAT, DT_FLOAT, DT_STRING, DT_INT32, DT_STRING, DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](IteratorV2)]]
training_1 | [[{{node Loss/Cumsum_20/_9585}} = _HostRecv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_13821_Loss/Cumsum_20", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]
training_1 |
training_1 | Caused by op u'IteratorGetNext', defined at:
training_1 | File "/workspace/models/research/object_detection/model_main.py", line 86, in <module>
training_1 | tf.app.run()
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 125, in run
training_1 | _sys.exit(main(argv))
training_1 | File "/workspace/models/research/object_detection/model_main.py", line 82, in main
training_1 | tf.estimator.train_and_evaluate(estimator, train_spec, eval_specs[0])
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 451, in train_and_evaluate
training_1 | return executor.run()
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 590, in run
training_1 | return self.run_local()
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/training.py", line 691, in run_local
training_1 | saving_listeners=saving_listeners)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 343, in train
training_1 | loss = self._train_model(input_fn, hooks, saving_listeners)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1129, in _train_model
training_1 | return self._train_model_default(input_fn, hooks, saving_listeners)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1156, in _train_model_default
training_1 | input_fn, model_fn_lib.ModeKeys.TRAIN))
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 995, in _get_features_and_labels_from_input_fn
training_1 | result = self._call_input_fn(input_fn, mode)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 1084, in _call_input_fn
training_1 | return input_fn(**kwargs)
training_1 | File "/workspace/models/research/object_detection/inputs.py", line 303, in _train_input_fn
training_1 | input_dict = dataset_util.make_initializable_iterator(dataset).get_next()
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 420, in get_next
training_1 | name=name)), self._output_types,
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_dataset_ops.py", line 2153, in iterator_get_next
training_1 | output_shapes=output_shapes, name=name)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
training_1 | op_def=op_def)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
training_1 | return func(*args, **kwargs)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 3260, in create_op
training_1 | op_def=op_def)
training_1 | File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1748, in __init__
training_1 | self._traceback = tf_stack.extract_stack()
training_1 |
training_1 | DataLossError (see above for traceback): Attempted to pad to a smaller size than the input element.
training_1 | [[{{node IteratorGetNext}} = IteratorGetNext[output_shapes=[[32], [32,100], [32,100,4], [32,100,95], [32,100], [32,100], [32,100], [32,100], [32,300,300,3], [32], [32], [32], [32,3]], output_types=[DT_STRING, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_INT64, DT_INT64, DT_BOOL, DT_FLOAT, DT_FLOAT, DT_STRING, DT_INT32, DT_STRING, DT_INT32], _device="/job:localhost/replica:0/task:0/device:CPU:0"](IteratorV2)]]
training_1 | [[{{node Loss/Cumsum_20/_9585}} = _HostRecv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_13821_Loss/Cumsum_20", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]
training_1 |