I am trying to run ./experiments/scripts/lov_color_2d_train.sh 0, but I get resource exhausted errors (an excerpt of the error message is at the end of this post). I tried setting gpu_options and reducing batch_size (from 128 down to 32, then 16, then 8) to no avail. Any suggestion of a workaround for the resource exhaustion error would be highly appreciated. Thanks. CC
config = tf.ConfigProto()
config.gpu_options.allocator_type = 'BFC'
config.gpu_options.per_process_gpu_memory_fraction = 0.85
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
Error Message:
2019-01-04 16:21:00.922232: I tensorflow/core/common_runtime/bfc_allocator.cc:678] Sum Total of in-use chunks: 4.02GiB
2019-01-04 16:21:00.922251: I tensorflow/core/common_runtime/bfc_allocator.cc:680] Stats:
Limit: 9955036364
InUse: 4319733760
MaxInUse: 4326713344
NumAllocs: 493
MaxAllocSize: 817692672
2019-01-04 14:16:00.791337: W tensorflow/core/common_runtime/bfc_allocator.cc:279] xxx*****xx
2019-01-04 14:16:00.828279: W tensorflow/core/framework/op_kernel.cc:1318] OP_REQUIRES failed at transpose_op.cc:199 : Resource exhausted: OOM when allocating tensor with shape[2,480,640,66] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
2019-01-04 14:16:00.828287: W tensorflow/core/framework/op_kernel.cc:1318] OP_REQUIRES failed at conv_grad_input_ops.cc:924 : Resource exhausted: OOM when allocating tensor with shape[2,64,480,640] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
2019-01-04 14:16:00.920932: W tensorflow/core/kernels/queue_base.cc:277] _0_fifo_queue: Skipping cancelled enqueue attempt with queue not closed
Traceback (most recent call last):
File "./tools/train_net.py", line 327, in
max_iters=args.max_iters)
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/fcn/train.py", line 557, in train_net
sw.train_model_vertex_pose(sess, train_op, loss, loss_cls, loss_vertex, loss_pose, learning_rate, max_iters, data_layer)
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/fcn/train.py", line 241, in train_model_vertex_pose
loss_value, loss_cls_value, loss_vertex_value, loss_posevalue, lr, = sess.run([loss, loss_cls, loss_vertex, loss_pose, learning_rate, train_op])
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 900, in run
run_metadata_ptr)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1135, in _run
feed_dict_tensor, options, run_metadata)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1316, in _do_run
run_metadata)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1335, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[2,480,640,66] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[Node: vertex_pred/BiasAdd-0-0-TransposeNCHWToNHWC-LayoutOptimizer = Transpose[T=DT_FLOAT, Tperm=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"](vertex_pred/BiasAdd, PermConstNCHWToNHWC-LayoutOptimizer)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
Exception in thread Thread-2:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 754, in run
self.target(*self.args, **self.__kwargs)
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/fcn/train.py", line 436, in load_and_enqueue
sess.run(net.enqueue_op, feed_dict=feed_dict)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 900, in run
run_metadata_ptr)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1135, in _run
feed_dict_tensor, options, run_metadata)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1316, in _do_run
run_metadata)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1335, in _do_call
raise type(e)(node_def, op, message)
CancelledError: Enqueue operation was cancelled
[[Node: fifo_queue_enqueue = QueueEnqueueV2[Tcomponents=[DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/device:CPU:0"](fifo_queue, _arg_Placeholder_0_0, _arg_Placeholder_1_0_1, _arg_Placeholder_2_0_2, _arg_Placeholder_3_0_3, _arg_Placeholder_4_0_4, _arg_Placeholder_5_0_5, _arg_Placeholder_6_0_6, _arg_Placeholder_7_0_7, _arg_Placeholder_8_0_8, _arg_Placeholder_9_0_9)]]
Caused by op u'fifo_queue_enqueue', defined at:
File "./tools/train_net.py", line 319, in
from networks.factory import get_network
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/networks/init.py", line 16, in
from . import factory
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/networks/factory.py", line 27, in
cfg.TRAIN.POSE_REG, cfg.TRAIN.ADAPT, cfg.TRAIN.TRAINABLE, cfg.IS_TRAIN)
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/networks/vgg16_convs.py", line 65, in init
self.enqueue_op = q.enqueue([self.data, self.gt_label_2d, self.keep_prob, self.vertex_targets, self.vertex_weights, self.poses, self.extents, self.meta_data, self.points, self.symmetry])
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/ops/data_flow_ops.py", line 346, in enqueue
self._queue_ref, vals, name=scope)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 3977, in queue_enqueue_v2
timeout_ms=timeout_ms, name=name)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
op_def=op_def)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1718, in init
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
I am trying to run ./experiments/scripts/lov_color_2d_train.sh 0, but I get resource exhausted errors (an excerpt of the error message is at the end of this post). I tried setting gpu_options and reducing batch_size (from 128 down to 32, then 16, then 8) to no avail. Any suggestion of a workaround for the resource exhaustion error would be highly appreciated. Thanks. CC
Error Message:
2019-01-04 16:21:00.922232: I tensorflow/core/common_runtime/bfc_allocator.cc:678] Sum Total of in-use chunks: 4.02GiB 2019-01-04 16:21:00.922251: I tensorflow/core/common_runtime/bfc_allocator.cc:680] Stats: Limit: 9955036364 InUse: 4319733760 MaxInUse: 4326713344 NumAllocs: 493 MaxAllocSize: 817692672
2019-01-04 14:16:00.791337: W tensorflow/core/common_runtime/bfc_allocator.cc:279] xxx*****xx 2019-01-04 14:16:00.828279: W tensorflow/core/framework/op_kernel.cc:1318] OP_REQUIRES failed at transpose_op.cc:199 : Resource exhausted: OOM when allocating tensor with shape[2,480,640,66] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc 2019-01-04 14:16:00.828287: W tensorflow/core/framework/op_kernel.cc:1318] OP_REQUIRES failed at conv_grad_input_ops.cc:924 : Resource exhausted: OOM when allocating tensor with shape[2,64,480,640] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc 2019-01-04 14:16:00.920932: W tensorflow/core/kernels/queue_base.cc:277] _0_fifo_queue: Skipping cancelled enqueue attempt with queue not closed Traceback (most recent call last): File "./tools/train_net.py", line 327, in
max_iters=args.max_iters)
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/fcn/train.py", line 557, in train_net
sw.train_model_vertex_pose(sess, train_op, loss, loss_cls, loss_vertex, loss_pose, learning_rate, max_iters, data_layer)
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/fcn/train.py", line 241, in train_model_vertex_pose
loss_value, loss_cls_value, loss_vertex_value, loss_posevalue, lr, = sess.run([loss, loss_cls, loss_vertex, loss_pose, learning_rate, train_op])
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 900, in run
run_metadata_ptr)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1135, in _run
feed_dict_tensor, options, run_metadata)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1316, in _do_run
run_metadata)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1335, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[2,480,640,66] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[Node: vertex_pred/BiasAdd-0-0-TransposeNCHWToNHWC-LayoutOptimizer = Transpose[T=DT_FLOAT, Tperm=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"](vertex_pred/BiasAdd, PermConstNCHWToNHWC-LayoutOptimizer)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
Exception in thread Thread-2: Traceback (most recent call last): File "/usr/lib/python2.7/threading.py", line 801, in __bootstrap_inner self.run() File "/usr/lib/python2.7/threading.py", line 754, in run self.target(*self.args, **self.__kwargs) File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/fcn/train.py", line 436, in load_and_enqueue sess.run(net.enqueue_op, feed_dict=feed_dict) File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 900, in run run_metadata_ptr) File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1135, in _run feed_dict_tensor, options, run_metadata) File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1316, in _do_run run_metadata) File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/client/session.py", line 1335, in _do_call raise type(e)(node_def, op, message) CancelledError: Enqueue operation was cancelled [[Node: fifo_queue_enqueue = QueueEnqueueV2[Tcomponents=[DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/device:CPU:0"](fifo_queue, _arg_Placeholder_0_0, _arg_Placeholder_1_0_1, _arg_Placeholder_2_0_2, _arg_Placeholder_3_0_3, _arg_Placeholder_4_0_4, _arg_Placeholder_5_0_5, _arg_Placeholder_6_0_6, _arg_Placeholder_7_0_7, _arg_Placeholder_8_0_8, _arg_Placeholder_9_0_9)]]
Caused by op u'fifo_queue_enqueue', defined at: File "./tools/train_net.py", line 319, in
from networks.factory import get_network
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/networks/init.py", line 16, in
from . import factory
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/networks/factory.py", line 27, in
cfg.TRAIN.POSE_REG, cfg.TRAIN.ADAPT, cfg.TRAIN.TRAINABLE, cfg.IS_TRAIN)
File "/home/chchien/DeepLearningPY2/PoseCNN/tools/../lib/networks/vgg16_convs.py", line 65, in init
self.enqueue_op = q.enqueue([self.data, self.gt_label_2d, self.keep_prob, self.vertex_targets, self.vertex_weights, self.poses, self.extents, self.meta_data, self.points, self.symmetry])
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/ops/data_flow_ops.py", line 346, in enqueue
self._queue_ref, vals, name=scope)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/ops/gen_data_flow_ops.py", line 3977, in queue_enqueue_v2
timeout_ms=timeout_ms, name=name)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
op_def=op_def)
File "/home/chchien/.local/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1718, in init
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
CancelledError (see above for traceback): Enqueue operation was cancelled [[Node: fifo_queue_enqueue = QueueEnqueueV2[Tcomponents=[DT_FLOAT, DT_INT32, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], timeout_ms=-1, _device="/job:localhost/replica:0/task:0/device:CPU:0"](fifo_queue, _arg_Placeholder_0_0, _arg_Placeholder_1_0_1, _arg_Placeholder_2_0_2, _arg_Placeholder_3_0_3, _arg_Placeholder_4_0_4, _arg_Placeholder_5_0_5, _arg_Placeholder_6_0_6, _arg_Placeholder_7_0_7, _arg_Placeholder_8_0_8, _arg_Placeholder_9_0_9)]]