ethz-asl / hfnet

From Coarse to Fine: Robust Hierarchical Localization at Large Scale with HF-Net (https://arxiv.org/abs/1812.03506)
MIT License
785 stars 185 forks

InvalidArgumentError #73

Closed: Amrmesi closed this issue 3 days ago

Amrmesi commented 4 days ago

When I enable the homographic augmentation in the hfnet train config file, training always fails with an InvalidArgumentError saying it expected 5 types but got 8. While debugging I found that the augmentation adds a valid_mask, a local descriptor map valid mask, and a dense scores valid mask to each element. I have no idea why it isn't working. This is the error:

OP_REQUIRES failed at iterator_ops.cc:1181 : Invalid argument: Number of components does not match: expected 5 types but got 8.
Traceback (most recent call last):
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1334, in _do_call
    return fn(*args)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1319, in _run_fn
    options, feed_dict, fetch_list, target_list, run_metadata)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1407, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Number of components does not match: expected 5 types but got 8.
  [[{{node IteratorFromStringHandleV2}} = IteratorFromStringHandleV2[output_shapes=[<unknown>, <unknown>, [?,480,640,1], <unknown>, [?]], output_types=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_STRING], _device="/job:localhost/replica:0/task:0/device:CPU:0"]]
  [[{{node MobilenetV2/expanded_conv_3/project/BatchNorm/gamma/read/_289}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_785_MobilenetV2/expanded_conv_3/project/BatchNorm/gamma/read", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "hfnet/train.py", line 82, in <module>
    _cli_train(config, output_dir)
  File "hfnet/train.py", line 66, in _cli_train
    train(config, config['train_iter'], output_dir)
  File "hfnet/train.py", line 32, in train
    keep_checkpoints=config.get('keep_checkpoints', 1))
  File "/home/amrmesi/hfnet/hfnet/models/base_model.py", line 310, in train
    feed_dict={self.handle: self.dataset_handles['training']})
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 929, in run
    run_metadata_ptr)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1152, in _run
    feed_dict_tensor, options, run_metadata)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
    run_metadata)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1348, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Number of components does not match: expected 5 types but got 8.
  [[node IteratorFromStringHandleV2 (defined at /home/amrmesi/hfnet/hfnet/models/base_model.py:267) = IteratorFromStringHandleV2[output_shapes=[<unknown>, <unknown>, [?,480,640,1], <unknown>, [?]], output_types=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_STRING], _device="/job:localhost/replica:0/task:0/device:CPU:0"]]
  [[{{node MobilenetV2/expanded_conv_3/project/BatchNorm/gamma/read/_289}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_785_MobilenetV2/expanded_conv_3/project/BatchNorm/gamma/read", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]

Caused by op 'IteratorFromStringHandleV2', defined at:
  File "hfnet/train.py", line 82, in <module>
    _cli_train(config, output_dir)
  File "hfnet/train.py", line 66, in _cli_train
    train(config, config['train_iter'], output_dir)
  File "hfnet/train.py", line 22, in train
    with _init_graph(config) as net:
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/contextlib.py", line 81, in __enter__
    return next(self.gen)
  File "hfnet/train.py", line 51, in _init_graph
    data=dataset.get_tf_datasets(), n_gpus=n_gpus, **config['model'])
  File "/home/amrmesi/hfnet/hfnet/models/base_model.py", line 125, in __init__
    self._build_graph()
  File "/home/amrmesi/hfnet/hfnet/models/base_model.py", line 267, in _build_graph
    self.handle, output_types, output_shapes)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/data/ops/iterator_ops.py", line 291, in from_string_handle
    sparse.as_dense_shapes(output_shapes, output_classes)))
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1982, in iterator_from_string_handle_v2
    output_types=output_types, output_shapes=output_shapes, name=name)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
    return func(*args, **kwargs)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
    op_def=op_def)
  File "/home/amrmesi/miniconda3/envs/hfnet/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
    self._traceback = tf_stack.extract_stack()

InvalidArgumentError (see above for traceback): Number of components does not match: expected 5 types but got 8.
  [[node IteratorFromStringHandleV2 (defined at /home/amrmesi/hfnet/hfnet/models/base_model.py:267) = IteratorFromStringHandleV2[output_shapes=[<unknown>, <unknown>, [?,480,640,1], <unknown>, [?]], output_types=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_STRING], _device="/job:localhost/replica:0/task:0/device:CPU:0"]]
  [[{{node MobilenetV2/expanded_conv_3/project/BatchNorm/gamma/read/_289}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_785_MobilenetV2/expanded_conv_3/project/BatchNorm/gamma/read", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]
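For reference, this error comes from tf.data.Iterator.from_string_handle: the iterator's element structure is fixed once from one set of output types and shapes, and every dataset handle fed to it must match that structure exactly. The failure mode can be reproduced with a tiny TF 1.x pipeline (a toy sketch, not HF-Net code; all names are made up):

import tensorflow as tf  # TF 1.x

# Two toy datasets whose elements have a different number of components
train = tf.data.Dataset.from_tensors(
    {'image': tf.zeros([4]), 'valid_mask': tf.ones([4])})  # 2 components
valid = tf.data.Dataset.from_tensors({'image': tf.zeros([4])})  # 1 component

handle = tf.placeholder(tf.string, shape=[])
# The iterator structure is fixed from the validation split (1 component)...
iterator = tf.data.Iterator.from_string_handle(
    handle, valid.output_types, valid.output_shapes)
next_element = iterator.get_next()

with tf.Session() as sess:
    train_handle = sess.run(train.make_one_shot_iterator().string_handle())
    # ...so feeding the training handle (2 components) raises
    # InvalidArgumentError: Number of components does not match.
    sess.run(next_element, feed_dict={handle: train_handle})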

Amrmesi commented 3 days ago

It turns out the problem was that the iterator built in _build_graph must handle the same output shapes and types for the training, validation, and test datasets. The augmentation was only applied to the training split, so its elements gained extra components (the valid masks) and no longer matched the structure the iterator was built with. I modified the _get_data function in distillation.py so that all splits produce the same shapes and types. Here is the updated function:

def _get_data(self, paths, split_name, **config):
    is_training = split_name == 'training'

    def _read_image(path):
        image = tf.read_file(path)
        image = tf.image.decode_jpeg(image, channels=3)
        return image

    def _create_npz_reader(keys):
        def _read_npz(keys, path):
            npz = np.load(path.decode('utf-8'))
            return [npz[k].astype(np.float32) for k in keys]
        return lambda x: _read_npz(keys, x)

    def _preprocess(image):
        if config['preprocessing']['resize']:
            image = tf.image.resize_images(
                    image, config['preprocessing']['resize'],
                    method=tf.image.ResizeMethod.BILINEAR)
        if config['preprocessing']['grayscale']:
            image = tf.image.rgb_to_grayscale(image)
        return image

    def _delete_keys(data):
        keys = ['keypoints']
        for k in keys:
            data.pop(k, None)
        return data

    # Create datasets for names and images
    # (map_parallel is the parallel-map helper hfnet patches onto tf.data.Dataset)
    names = tf.data.Dataset.from_tensor_slices(paths['names'])
    images = tf.data.Dataset.from_tensor_slices(paths['images'])
    images = images.map_parallel(_read_image)
    images = images.map_parallel(_preprocess)
    dataset = tf.data.Dataset.zip({'image': images, 'name': names})

    # Load targets
    if config['load_targets']:
        for i, target in enumerate(config['targets']):
            t = tf.data.Dataset.from_tensor_slices(paths[i])
            reader = _create_npz_reader(target['keys'])
            types = [tf.float32] * len(target['keys'])
            t = t.map_parallel(lambda p: tf.py_func(reader, [p], types))
            dataset = tf.data.Dataset.zip((dataset, t)).map(
                lambda da, de: {**da, **{k: de[j]
                                for j, k in enumerate(target['keys'])}})

        # tf.data.Dataset has no keys(); check the element structure instead
        if 'keypoints' in dataset.output_types:
            dataset = dataset.map(
                lambda d: {
                    **d, 'keypoints': tf.reshape(
                        d['keypoints'][:, ::-1], [-1, 2])})

    # Apply the augmentations to every split, not only to training, so that
    # all datasets expose the same output shapes and types to the iterator
    if split_name in ['training', 'validation', 'test']:
        if config['augmentation']['photometric']['enable']:
            dataset = dataset.map_parallel(
                lambda d: pipeline.photometric_augmentation(
                    d, **config['augmentation']['photometric']))
        if config['augmentation']['homographic']['enable']:
            dataset = dataset.map_parallel(
                lambda d: pipeline.homographic_augmentation(
                    d, **config['augmentation']['homographic']))
        # Debug output: verify that the structures match across splits
        print(f"Post-augmentation Dataset ({split_name}) output types:",
              dataset.output_types)
        print(f"Post-augmentation Dataset ({split_name}) output shapes:",
              dataset.output_shapes)

    # Carve out the validation split: validation keeps the first
    # validation_size elements, training skips them
    if split_name == 'validation':
        dataset = dataset.take(config['validation_size'])
    if split_name == 'training':
        dataset = dataset.skip(config['validation_size'])

    if config['cache_in_memory']:
        tf.logging.info('Caching dataset, first access will take some time')
        dataset = dataset.cache()

    if 'keypoints' in dataset.output_types:
        dataset = dataset.map_parallel(pipeline.add_keypoint_map)
    if config['for_batching']:
        dataset = dataset.map_parallel(_delete_keys)

    return dataset
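
As a quick sanity check after this change, the element structures of all splits can be compared before their handles are fed to the shared iterator. A sketch, assuming a hypothetical dict named datasets that maps split names to the tf.data.Dataset objects produced by _get_data:

# Hypothetical 'datasets' dict, e.g. {'training': ..., 'validation': ..., 'test': ...}
types = {name: d.output_types for name, d in datasets.items()}
for name, t in types.items():
    print(name, sorted(t.keys()))
# Every split must expose exactly the same set of components
assert len({tuple(sorted(t.keys())) for t in types.values()}) == 1, \
    'splits expose different components'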