I got the following error message when training on multiple GPUs...
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/u7801832/xmcenv/xmcgan_image_generation/xmcgan/main.py", line 70, in <module>
app.run(main)
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/absl/app.py", line 303, in run
_run_main(main, args)
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "/home/u7801832/xmcenv/xmcgan_image_generation/xmcgan/main.py", line 62, in main
train_utils.train(FLAGS.config, FLAGS.workdir)
File "/home/u7801832/xmcenv/xmcgan_image_generation/xmcgan/train_utils.py", line 421, in train
batch = jax.tree_map(np.asarray, next(train_iter))
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 761, in next
return self._next_internal()
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 744, in _next_internal
ret = gen_dataset_ops.iterator_get_next(
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 2728, in iterator_get_next
_ops.raise_from_not_ok_status(e, name)
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 6897, in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes at component 0: expected [7,16,17,768] but got [1,112,17,768]. [Op:IteratorGetNext]
I got the following error message when training on multiple GPUs...
Traceback (most recent call last):
File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/u7801832/xmcenv/xmcgan_image_generation/xmcgan/main.py", line 70, in <module>
app.run(main)
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/absl/app.py", line 303, in run
_run_main(main, args)
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "/home/u7801832/xmcenv/xmcgan_image_generation/xmcgan/main.py", line 62, in main
train_utils.train(FLAGS.config, FLAGS.workdir)
File "/home/u7801832/xmcenv/xmcgan_image_generation/xmcgan/train_utils.py", line 421, in train
batch = jax.tree_map(np.asarray, next(train_iter))
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 761, in next
return self._next_internal()
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 744, in _next_internal
ret = gen_dataset_ops.iterator_get_next(
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 2728, in iterator_get_next
_ops.raise_from_not_ok_status(e, name)
File "/home/u7801832/xmcenv/lib/python3.8/site-packages/tensorflow/python/framework/ops.py", line 6897, in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes at component 0: expected [7,16,17,768] but got [1,112,17,768]. [Op:IteratorGetNext]
The training script is as follows...
#!/bin/bash
CONFIG="xmcgan/configs/coco_xmc.py"
EXP_NAME=$1
WORKDIR="/work/u7801832/data2/" # CHANGEME
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6" python -m xmcgan.main \
  --config="$CONFIG" \
  --mode="train" \
  --workdir="$WORKDIR" \
Please help.
Thanks