tsinghua-rll / VoxelNet-tensorflow

A 3D object detection system for autonomous driving.
MIT License
453 stars 123 forks source link

W tensorflow/core/framework/op_kernel.cc:1192] Resource exhausted: OOM when allocating tensor with shape[1,128,12,402,354] #51

Open chunhuaqiushi1989 opened 5 years ago

chunhuaqiushi1989 commented 5 years ago

Hi my friend, when I run sudo python train.py, some errors happen, like this: 2018-11-20 10:13:15.537986: W tensorflow/core/common_runtime/bfc_allocator.cc:277] ____*****xxxxxx*__**x** 2018-11-20 10:13:15.538016: W tensorflow/core/framework/op_kernel.cc:1192] Resource exhausted: OOM when allocating tensor with shape[1,128,12,402,354] Traceback (most recent call last): File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1323, in _do_call return fn(*args) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1302, in _run_fn status, run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/errors_impl.py", line 473, in __exit__ c_api.TF_GetCode(self.status.status)) tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[1,128,12,402,354] [[Node: gpu0/MiddleAndRPN/conv1/Conv3D = Conv3D[T=DT_FLOAT, data_format="NDHWC", padding="VALID", strides=[1, 2, 1, 1, 1], _device="/job:localhost/replica:0/task:0/device:GPU:0"](gpu0/MiddleAndRPN/conv1/Pad, MiddleAndRPN_/conv1/kernel/read)]] [[Node: gpu0/MiddleAndRPN/Sum/_153 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_4878_gpu0/MiddleAndRPN/Sum", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "train.py", line 135, in <module> tf.app.run(main) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 48, in run _sys.exit(main(_sys.argv[:1] + flags_passthrough)) File "train.py", line 109, in main ret = model.train_step(sess, train_loader.load(), train=True, summary=is_summary) File "/home/liulei/tensorflow_workplace/VoxelNet-tensorflow/model/model.py", line 197, in train_step return session.run(output_feed, input_feed) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 889, in run run_metadata_ptr) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1120, in _run feed_dict_tensor, options, run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1317, in _do_run options, run_metadata) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/client/session.py", line 1336, in _do_call raise type(e)(node_def, op, message) tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[1,128,12,402,354] [[Node: gpu0/MiddleAndRPN/conv1/Conv3D = Conv3D[T=DT_FLOAT, data_format="NDHWC", padding="VALID", strides=[1, 2, 1, 1, 1], _device="/job:localhost/replica:0/task:0/device:GPU:0"](gpu0/MiddleAndRPN/conv1/Pad, MiddleAndRPN_/conv1/kernel/read)]] [[Node: gpu0/MiddleAndRPN/Sum/_153 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_4878_gpu0/MiddleAndRPN/Sum", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'gpu0/MiddleAndRPN/conv1/Conv3D', defined at: File "train.py", line 135, in <module> tf.app.run(main) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/platform/app.py", line 48, in run _sys.exit(main(_sys.argv[:1] + flags_passthrough)) File "train.py", line 73, in main avail_gpus=cfg.GPU_AVAILABLE.split(',') File "/home/liulei/tensorflow_workplace/VoxelNet-tensorflow/model/model.py", line 73, in __init__ input=feature.outputs, alpha=self.alpha, beta=self.beta, training=is_train) File "/home/liulei/tensorflow_workplace/VoxelNet-tensorflow/model/rpn.py", line 41, in __init__ (1, 1, 1), self.input, name='conv1') File "/home/liulei/tensorflow_workplace/VoxelNet-tensorflow/model/rpn.py", line 149, in ConvMD pad, Cout, k, strides=s, padding="valid", reuse=tf.AUTO_REUSE, name=scope) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/convolutional.py", line 809, in conv3d return layer.apply(inputs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 671, in apply return self.__call__(inputs, *args, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/base.py", line 575, in __call__ outputs = self.call(inputs, *args, **kwargs) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/layers/convolutional.py", line 167, in call outputs = self._convolution_op(inputs, self.kernel) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_ops.py", line 835, in __call__ return self.conv_op(inp, filter) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_ops.py", line 499, in __call__ return self.call(inp, filter) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/nn_ops.py", line 187, in __call__ name=self.name) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_nn_ops.py", line 847, in conv3d padding=padding, data_format=data_format, name=name) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper op_def=op_def) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 2956, in create_op op_def=op_def) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1470, in __init__ self._traceback = self._graph._extract_stack() # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1,128,12,402,354] [[Node: gpu0/MiddleAndRPN/conv1/Conv3D = Conv3D[T=DT_FLOAT, data_format="NDHWC", padding="VALID", strides=[1, 2, 1, 1, 1], _device="/job:localhost/replica:0/task:0/device:GPU:0"](gpu0/MiddleAndRPN/conv1/Pad, MiddleAndRPN_/conv1/kernel/read)]] [[Node: gpu0/MiddleAndRPN/Sum/_153 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_4878_gpu0/MiddleAndRPN/Sum", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]