Hi,
I hope you can help me out here.
I keep getting resource allocation errors a few epochs into training:
2021-03-04 00:13:01.557810: epoch 9 of 55, step 40 of 2006, loss = 0.64, Top-1 = 0.81 Top-5 = 0.98 (366.8 examples/sec; 0.174 sec/batch)
Traceback (most recent call last):
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1356, in _do_call
return fn(*args)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1341, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1429, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[64,224,224,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
[[{{node batch}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "run.py", line 412, in <module>
main()
File "run.py", line 383, in main
do_train(sess, args)
File "run.py", line 126, in do_train
img,lbl = sess.run([images, labels], options= args.run_options, run_metadata= args.run_metadata)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 950, in run
run_metadata_ptr)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1173, in _run
feed_dict_tensor, options, run_metadata)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1370, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[64,224,224,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
[[node batch (defined at /scratch/cdhassel/MLWIC2_helper_files/data_loader.py:161) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
Original stack trace for 'batch':
File "run.py", line 412, in <module>
main()
File "run.py", line 383, in main
do_train(sess, args)
File "run.py", line 85, in do_train
images, labels, info = train_loader.load()
File "/scratch/cdhassel/MLWIC2_helper_files/data_loader.py", line 161, in load
allow_smaller_final_batch=True if not self.is_training else False)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/input.py", line 1021, in batch
name=name)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/input.py", line 790, in _batch
dequeued = queue.dequeue_many(batch_size, name=name)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/data_flow_ops.py", line 488, in dequeue_many
self._queue_ref, n=n, component_types=self._dtypes, name=name)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 3862, in queue_dequeue_many_v2
timeout_ms=timeout_ms, name=name)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
op_def=op_def)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3616, in create_op
op_def=op_def)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2005, in __init__
self._traceback = tf_stack.extract_stack()
The train function ran for 9.05553334885173 hours. The trained model is in AU_model01. Specify this directory as the log_dir when you use classify().
Hi, I hope you can help me out here. I keep getting resource allocation errors a few epochs into training:
2021-03-04 00:13:01.557810: epoch 9 of 55, step 40 of 2006, loss = 0.64, Top-1 = 0.81 Top-5 = 0.98 (366.8 examples/sec; 0.174 sec/batch) Traceback (most recent call last): File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1356, in _do_call return fn(*args) File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1341, in _run_fn options, feed_dict, fetch_list, target_list, run_metadata) File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1429, in _call_tf_sessionrun run_metadata) tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[64,224,224,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [[{{node batch}}]] Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "run.py", line 412, in <module>
main()
File "run.py", line 383, in main
do_train(sess, args)
File "run.py", line 126, in do_train
img,lbl = sess.run([images, labels], options= args.run_options, run_metadata= args.run_metadata)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 950, in run
run_metadata_ptr)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1173, in _run
feed_dict_tensor, options, run_metadata)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1350, in _do_run
run_metadata)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1370, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[64,224,224,3] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
[[node batch (defined at /scratch/cdhassel/MLWIC2_helper_files/data_loader.py:161) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
Original stack trace for 'batch': File "run.py", line 412, in <module>
main()
File "run.py", line 383, in main
do_train(sess, args)
File "run.py", line 85, in do_train
images, labels, info = train_loader.load()
File "/scratch/cdhassel/MLWIC2_helper_files/data_loader.py", line 161, in load
allow_smaller_final_batch=True if not self.is_training else False)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 324, in new_func
return func(*args, **kwargs)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/input.py", line 1021, in batch
name=name)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/training/input.py", line 790, in _batch
dequeued = queue.dequeue_many(batch_size, name=name)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/data_flow_ops.py", line 488, in dequeue_many
self._queue_ref, n=n, component_types=self._dtypes, name=name)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 3862, in queue_dequeue_many_v2
timeout_ms=timeout_ms, name=name)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
op_def=op_def)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3616, in create_op
op_def=op_def)
File "/home/cdhassel/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2005, in __init__
self._traceback = tf_stack.extract_stack()
The train function ran for 9.05553334885173 hours. The trained model is in AU_model01. Specify this directory as the log_dir when you use classify().