tensorflow / datasets

TFDS is a collection of datasets ready to use with TensorFlow, Jax, ...
https://www.tensorflow.org/datasets
Apache License 2.0
4.3k stars 1.54k forks source link

Error while loading Dementiabank dataset #3322

Open cheulyop opened 3 years ago

cheulyop commented 3 years ago

Short description: Loading the Dementiabank dataset fails with an error, even though the manually downloaded files are in the correct directory.

Environment information

Reproduction instructions

import tensorflow_datasets as tfds

tfds.load('Dementiabank')

Link to logs

Downloading and preparing dataset Unknown size (download: Unknown size, generated: 17.71 GiB, total: 17.71 GiB) to /home/admin/tensorflow_datasets/dementiabank/1.0.0...
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~/projects/voice/trill/foo.py in 
      27 import tensorflow_datasets as tfds
      28 
----> 29 tfds.load('Dementiabank', batch_size=-1)

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/load.py in load(name, split, data_dir, batch_size, shuffle_files, download, as_supervised, decoders, read_config, with_info, builder_kwargs, download_and_prepare_kwargs, as_dataset_kwargs, try_gcs)
    331   if with_info:
    332     return ds, dbuilder.info
--> 333   return ds
    334 
    335 

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/dataset_builder.py in download_and_prepare(self, download_dir, download_config)
    437           # Old version of TF are not os.PathLike compatible
    438           with tf_compat.mock_gfile_pathlike():
--> 439             self._download_and_prepare(
    440                 dl_manager=dl_manager,
    441                 download_config=download_config,

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/dataset_builder.py in _download_and_prepare(self, dl_manager, download_config)
   1153           self.info.file_format].FILE_SUFFIX
   1154 
-> 1155       split_info_futures = [
   1156           split_builder.submit_split_generation(  # pylint: disable=g-complex-comprehension
   1157               split_name=split_name,

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/dataset_builder.py in <listcomp>(.0)
   1154 
   1155       split_info_futures = [
-> 1156           split_builder.submit_split_generation(  # pylint: disable=g-complex-comprehension
   1157               split_name=split_name,
   1158               generator=generator,

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/split_builder.py in submit_split_generation(self, split_name, generator, path, disable_shuffling)
    289     # `_build_from_xyz` method.
    290     if isinstance(generator, collections.abc.Iterable):
--> 291       return self._build_from_generator(**build_kwargs)
    292     else:  # Otherwise, beam required
    293       unknown_generator_type = TypeError(

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/split_builder.py in _build_from_generator(self, split_name, generator, path, disable_shuffling)
    359         example = self._features.encode_example(example)
    360       except Exception as e:  # pylint: disable=broad-except
--> 361         utils.reraise(e, prefix=f'Failed to encode example:\n{example}\n')
    362       writer.write(key, example)
    363     shard_lengths, total_size = writer.finalize()

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/split_builder.py in _build_from_generator(self, split_name, generator, path, disable_shuffling)
    357     ):
    358       try:
--> 359         example = self._features.encode_example(example)
    360       except Exception as e:  # pylint: disable=broad-except
    361         utils.reraise(e, prefix=f'Failed to encode example:\n{example}\n')

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/features/features_dict.py in encode_example(self, example_dict)
    196         utils.reraise(
    197             e, prefix=f'In <{feature.__class__.__name__}> with name "{k}":\n')
--> 198     return example
    199 
    200   def _flatten(self, x):

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/features/features_dict.py in encode_example(self, example_dict)
    194         example[k] = feature.encode_example(example_value)
    195       except Exception as e:  # pylint: disable=broad-except
--> 196         utils.reraise(
    197             e, prefix=f'In <{feature.__class__.__name__}> with name "{k}":\n')
    198     return example

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/features/audio_feature.py in encode_example(self, audio_or_path_or_fobj)
     95     else:
     96       return self._encode_file(audio_or_path_or_fobj, self._file_format)
---> 97 
     98   @property
     99   def sample_rate(self):

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/features/audio_feature.py in encode_example(self, audio_or_path_or_fobj)
     93         except Exception as e:  # pylint: disable=broad-except
     94           utils.reraise(e, prefix=f'Error for {filename}: ')
---> 95     else:
     96       return self._encode_file(audio_or_path_or_fobj, self._file_format)
     97 

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/features/audio_feature.py in _encode_file(self, fobj, file_format)
     83 
     84   def encode_example(self, audio_or_path_or_fobj):
---> 85     if isinstance(audio_or_path_or_fobj, (np.ndarray, list)):
     86       return audio_or_path_or_fobj
     87     elif isinstance(audio_or_path_or_fobj, type_utils.PathLikeCls):

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/features/feature.py in encode_example(self, example_data)
    697 
    698   def encode_example(self, example_data):
--> 699     """See base class for details."""
    700     np_dtype = np.dtype(self.dtype.as_numpy_dtype)
    701     if isinstance(example_data, tf.Tensor):

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow_datasets/core/utils/tf_utils.py in assert_shape_match(shape1, shape2)
    143   Args:
    144     shape1 (tuple): Static shape
--> 145     shape2 (tuple): Dynamic shape (can contain None)
    146   """
    147   shape1 = tf.TensorShape(shape1)

~/.miniconda3/envs/tf/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py in assert_same_rank(self, other)
    999     if self.rank is not None and other.rank is not None:
   1000       if self.rank != other.rank:
-> 1001         raise ValueError("Shapes %s and %s must have the same rank" %
   1002                          (self, other))
   1003 

ValueError: Failed to encode example:
{'audio': '/home/admin/tensorflow_datasets/downloads/manual/dementia/English/Pitt/Control/cookie/006-2.mp3', 'label': 'control', 'speaker_id': '006'}
In <Audio> with name "audio":
Error for /home/admin/tensorflow_datasets/downloads/manual/dementia/English/Pitt/Control/cookie/006-2.mp3: Shapes (1922688, 2) and (None,) must have the same rank

Expected behavior Dataset loads.

Additional context Add any other context about the problem here.

cheulyop commented 3 years ago

The error seems to occur while running the below lines. https://github.com/tensorflow/datasets/blob/c6745749bbbe3041e96176f51538085c77fcac4f/tensorflow_datasets/core/features/audio_feature.py#L78-L82

If the audio has more than one channel, audio_data is reshaped into a 2-D array (here (1922688, 2)), but the shape declared by the parent class, whose encode_example is called through super(), remains (None,), hence the rank mismatch.

I'm working around this by converting the audio to mono (1 channel) before the shape/rank check runs, but is there a better way, e.g., updating the shape declared by the parent class?

I also tried instantiating the base class tensorflow_datasets.core.features.Tensor and calling its encode_example directly, but that didn't help and produced a different error instead:

  File "/home/admin/.miniconda3/envs/tf/bin/tfds", line 10, in <module>
    sys.exit(launch_cli())
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/scripts/cli/main.py", line 126, in launch_cli
    app.run(main, flags_parser=_parse_flags)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/absl/app.py", line 303, in run
    _run_main(main, args)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/absl/app.py", line 251, in _run_main
    sys.exit(main(argv))
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/scripts/cli/main.py", line 121, in main
    args.subparser_fn(args)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/scripts/cli/build.py", line 199, in _build_datasets
    _download_and_prepare(args, builder)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/scripts/cli/build.py", line 355, in _download_and_prepare
    builder.download_and_prepare(
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/dataset_builder.py", line 439, in download_and_prepare
    self._download_and_prepare(
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/dataset_builder.py", line 1155, in _download_and_prepare
    split_info_futures = [
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/dataset_builder.py", line 1156, in <listcomp>
    split_builder.submit_split_generation(  # pylint: disable=g-complex-comprehension
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/split_builder.py", line 291, in submit_split_generation
    return self._build_from_generator(**build_kwargs)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/split_builder.py", line 362, in _build_from_generator
    writer.write(key, example)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/tfrecords_writer.py", line 273, in write
    serialized_example = self._serializer.serialize_example(example)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/example_serializer.py", line 52, in serialize_example
    example = _dict_to_tf_example(example, self._flat_example_specs)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/example_serializer.py", line 89, in _dict_to_tf_example
    example_dict = {
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/example_serializer.py", line 90, in <dictcomp>
    k: run_with_reraise(_item_to_tf_feature, k, item, tensor_info)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/example_serializer.py", line 70, in run_with_reraise
    utils.reraise(
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/example_serializer.py", line 68, in run_with_reraise
    return fn(example_data, tensor_info)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/example_serializer.py", line 129, in _item_to_tf_feature
    v = _item_to_np_array(item, shape=tensor_info.shape, dtype=tensor_info.dtype)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/example_serializer.py", line 120, in _item_to_np_array
    utils.assert_shape_match(item.shape, shape)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_datasets/core/utils/tf_utils.py", line 145, in assert_shape_match
    shape1.assert_same_rank(shape2)
  File "/home/admin/.miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow/python/framework/tensor_shape.py", line 1001, in assert_same_rank
    raise ValueError("Shapes %s and %s must have the same rank" %
ValueError: Error while serializing feature `audio`: `TensorInfo(shape=(None,), dtype=tf.int64)`: Shapes (1922688, 2) and (None,) must have the same rank

Any help would be much appreciated.

bagustris commented 3 years ago

Same here. Is there any workaround?