tensorflow / datasets

TFDS is a collection of datasets ready to use with TensorFlow, Jax, ...
https://www.tensorflow.org/datasets
Apache License 2.0
4.23k stars 1.52k forks source link

register local data into tfds and no dataset_info.json #4752

Closed lesjie-wen closed 1 year ago

lesjie-wen commented 1 year ago

I ran into a problem when trying to load my local data: the error says there is no dataset_info.json.

What I've tried so far: I am following the code from Song Yang's "solving inverse problems" project, and I have already downloaded the data. The dataset code follows the official documentation on how to build your own dataset and has the following root structure: --brats(2021) |--dummy_data |--__init__.py |--brats.py |--brats_test.py |--checksums.tsv. This is brats.py:

"""brats dataset."""

import tensorflow_datasets as tfds
import os
import SimpleITK as sitk
import numpy as np
import tensorflow as tf

# Human-readable dataset description; passed as `description` to
# `tfds.core.DatasetInfo` in `Brats._info`.
_DESCRIPTION = """
BraTS 2021
"""

# Reference list for the dataset; passed as `citation` to
# `tfds.core.DatasetInfo` in `Brats._info`.
_CITATION = """
[1] U.Baid, et al., The RSNA-ASNR-MICCAI BraTS 2021 Benchmark on Brain Tumor Segmentation and Radiogenomic Classification, arXiv:2107.02314, 2021.

[2] B. H. Menze, A. Jakab, S. Bauer, J. Kalpathy-Cramer, K. Farahani, J. Kirby, et al. "The Multimodal Brain Tumor Image Segmentation Benchmark (BRATS)", IEEE Transactions on Medical Imaging 34(10), 1993-2024 (2015) DOI: 10.1109/TMI.2014.2377694

[3] S. Bakas, H. Akbari, A. Sotiras, M. Bilello, M. Rozycki, J.S. Kirby, et al., "Advancing The Cancer Genome Atlas glioma MRI collections with expert segmentation labels and radiomic features", Nature Scientific Data, 4:170117 (2017) DOI: 10.1038/sdata.2017.117

[4] S. Bakas, H. Akbari, A. Sotiras, M. Bilello, M. Rozycki, J. Kirby, et al., "Segmentation Labels and Radiomic Features for the Pre-operative Scans of the TCGA-GBM collection", The Cancer Imaging Archive, 2017. DOI: 10.7937/K9/TCIA.2017.KLXWJJ1Q

[5] S. Bakas, H. Akbari, A. Sotiras, M. Bilello, M. Rozycki, J. Kirby, et al., "Segmentation Labels and Radiomic Features for the Pre-operative Scans of the TCGA-LGG collection", The Cancer Imaging Archive, 2017. DOI: 10.7937/K9/TCIA.2017.GJQ7R0EF
"""

class Brats(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for brats dataset.

  Every example is one 2D slice (taken along the first axis of a normalized
  volume) stored as a single-channel uint8 image, labelled with the name of
  the modality file it was read from ('t1', 't1ce', 't2' or 'flair').
  """

  VERSION = tfds.core.Version('1.0.0')
  RELEASE_NOTES = {
    '1.0.0': 'Initial release.',
  }

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata."""
    return tfds.core.DatasetInfo(
      builder=self,
      description=_DESCRIPTION,
      features=tfds.features.FeaturesDict({
        # One single-channel 240x240 slice per example.
        'image': tfds.features.Image(shape=(240, 240, 1)),
        # Modality the slice was read from.
        'label': tfds.features.ClassLabel(names=['t1', 't1ce', 't2', 'flair']),
      }),
      # Used when `as_supervised=True` is passed to `builder.as_dataset`.
      supervised_keys=('image', 'label'),  # Set to `None` to disable
      citation=_CITATION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns SplitGenerators.

    Args:
      dl_manager: TFDS download manager. It is not used: the data is read
        from a fixed local directory.

    Returns:
      A dict with a single 'train' split backed by `_generate_examples`.
    """
    # NOTE(review): TFDS stores a prepared dataset under
    # `<data_dir>/brats/1.0.0`. If the builder is created with
    # `data_dir='/home/data/Brats/training/'`, the raw-data directory below is
    # that very directory, and TFDS will expect a `dataset_info.json` in it
    # instead of generating the dataset. Keep the raw volumes somewhere else
    # (or use a different `data_dir`) -- confirm against the caller.
    train_path = '/home/data/Brats/training/brats/1.0.0'
    # `dl_manager.manual_dir` is deliberately not assigned: it is a property
    # of the download manager and its value was never read back here.
    return {
      'train': self._generate_examples(train_path),
    }

  def read_img(self, path):
    """Reads the image file at `path` with SimpleITK and returns it as a NumPy array."""
    return sitk.GetArrayFromImage(sitk.ReadImage(path))

  def get_bound(self, data, return_coord=False):
    """Computes the bounding box (z, y, x) of the non-background voxels.

    Background is every voxel equal to `data.min()`, i.e. the volume is
    assumed to be padded with its minimum value.

    Args:
      data: 3D array indexed as (z, y, x).
      return_coord: if True, also return the box coordinates.

    Returns:
      `indicator`, a boolean array shaped like `data` that is True outside the
      box, or `(z_start, z_end, y_start, y_end, x_start, x_end, indicator)`
      when `return_coord` is True.

    Raises:
      ValueError: if `data` is constant, so no voxel differs from the minimum.
    """
    # Shift so that the background value becomes exactly 0.
    data_0 = data - data.min()

    z, y, x = np.where(data_0)
    z_start, z_end = np.min(z), np.max(z)
    y_start, y_end = np.min(y), np.max(y)
    x_start, x_end = np.min(x), np.max(x)

    indicator = np.ones_like(data, dtype=bool)
    # NOTE(review): the `*_end` values are the last non-background indices but
    # slices are end-exclusive, so the last plane/row/column counts as
    # "outside". Kept as-is so the generated data does not change; confirm
    # whether `end + 1` was intended.
    indicator[z_start:z_end, y_start:y_end, x_start:x_end] = False
    if return_coord:
      return z_start, z_end, y_start, y_end, x_start, x_end, indicator
    return indicator

  def mri_data_norm(self, data, scale=6.0, return_v=False):
    """Clips outliers and rescales the foreground box of a volume to [0, 1].

    Args:
      data: 3D array indexed as (z, y, x); it is not modified.
      scale: intensities further than `scale` standard deviations from the
        mean of the box are clipped.
      return_v: if True, also return the normalization parameters.

    Returns:
      The normalized float volume (everything outside the box set to 0), or
      `(volume, [min_v, max_v, min_y, max_y, min_x, max_x])` when `return_v`
      is True.
    """
    # Work on a float copy so the division below is a true division.
    data = data.astype(float)

    # Box around the foreground; everything outside is background.
    min_z, max_z, min_y, max_y, min_x, max_x, indicator = self.get_bound(data, return_coord=True)
    box = (slice(min_z, max_z), slice(min_y, max_y), slice(min_x, max_x))
    crop_data = data[box].copy()

    # Clip outliers, never widening beyond the observed range.
    mean, std = np.mean(crop_data), np.std(crop_data)
    lower = max(mean - scale * std, crop_data.min())
    upper = min(mean + scale * std, crop_data.max())
    crop_data = np.clip(crop_data, lower, upper)

    # Normalize to [0, 1].
    min_v = crop_data.min()
    crop_data = crop_data - min_v
    max_v = crop_data.max() * 1.0
    if max_v > 0:
      # A constant crop has max_v == 0; dividing would fill it with NaNs.
      crop_data = crop_data / max_v

    data[box] = crop_data
    data[indicator] = 0

    if return_v:
      # Builtin `float` replaces the `np.float` alias, which newer NumPy
      # releases no longer provide.
      return data, [min_v, max_v, float(min_y), float(max_y), float(min_x), float(max_x)]
    return data

  def _generate_examples(self, path):
    """Yields examples.

    Args:
      path: directory holding one sub-folder per case; each sub-folder is
        expected to contain `<folder>_<domain>.nii.gz` for every domain.

    Yields:
      `(key, example)` pairs, where `key` is a running integer and `example`
      has an 'image' (uint8, H x W x 1) and a 'label' (the domain name).
    """
    domains = ['t1', 't1ce', 't2', 'flair']
    count = 0
    # Sorted so that the running keys do not depend on the listing order.
    for img_folder in sorted(tf.io.gfile.listdir(path)):
      case_name = os.path.split(img_folder)[-1]

      # The range of slices to export is derived from the t1 volume only.
      t1_path = os.path.join(path, img_folder, case_name + '_t1.nii.gz')
      z, _, _ = np.where(self.read_img(t1_path))
      # Drop 40 slices at the start and 25 at the end of the non-zero range
      # (presumably slices with little content -- TODO confirm).
      z_min = np.min(z) + 40
      z_max = np.max(z) - 25

      for domain in domains:
        img_path = os.path.join(path, img_folder, case_name + f'_{domain}.nii.gz')
        img_array = self.mri_data_norm(self.read_img(img_path), scale=6.0)
        for z_idx in range(z_min, z_max + 1):
          img = img_array[z_idx, ...]
          # Sanity check intensity values
          assert np.min(img) >= 0.0 and np.max(img) <= 1.0 and np.max(img) >= 0.1, (
            f'Unexpected intensity range in {img_path}, slice {z_idx}: '
            f'min={np.min(img)}, max={np.max(img)}')
          yield count, {
            'image': np.clip(img[..., None] * 255., 0.0, 255.).astype(np.uint8),
            'label': domain
          }
          count += 1
The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/lesjie/scoreInverseProblems_2/main.py", line 65, in <module>
    app.run(main)
  File "/root/miniconda3/envs/tf2.4/lib/python3.9/site-packages/absl/app.py", line 308, in run
    _run_main(main, args)
  File "/root/miniconda3/envs/tf2.4/lib/python3.9/site-packages/absl/app.py", line 254, in _run_main
    sys.exit(main(argv))
  File "/home/lesjie/scoreInverseProblems_2/main.py", line 54, in main
    run_lib.train(FLAGS.config, FLAGS.workdir)
  File "/home/lesjie/scoreInverseProblems_2/run_lib.py", line 77, in train
    train_ds, eval_ds, _ = datasets.get_dataset(config,
  File "/home/lesjie/scoreInverseProblems_2/datasets.py", line 122, in get_dataset
    dataset_builder = tfds.builder(config.data.dataset.replace('_', ''), data_dir='/home/data/Brats/training/')
  File "/root/miniconda3/envs/tf2.4/lib/python3.9/site-packages/tensorflow_datasets/core/load.py", line 177, in builder
    return cls(**builder_kwargs)  # pytype: disable=not-instantiable
  File "/root/miniconda3/envs/tf2.4/lib/python3.9/contextlib.py", line 137, in __exit__
    self.gen.throw(typ, value, traceback)
  File "/root/miniconda3/envs/tf2.4/lib/python3.9/site-packages/tensorflow_datasets/core/utils/py_utils.py", line 399, in try_reraise
    reraise(e, *args, **kwargs)
  File "/root/miniconda3/envs/tf2.4/lib/python3.9/site-packages/tensorflow_datasets/core/utils/py_utils.py", line 365, in reraise
    raise exception from e
FileNotFoundError: Failed to construct dataset brats: Try to load `DatasetInfo` from a directory which does not exist or does not contain `dataset_info.json`. Please delete the directory `/home/data/Brats/training/brats/1.0.0`  if you are trying to re-generate the dataset.

Process finished with exit code 1

It would be nice if... Actually, I am wondering: when registering my own local data, when and where is dataset_info.json created, and is it created automatically or must it be written manually?

Environment information (if applicable)

ch-pavan commented 1 year ago

It should be created automatically; remove the directory and try again. You could also write the DatasetInfo manually by following the method described in the DatasetInfo documentation.