KitwareMedical / HASI

High-throughput Applications for Skeletal Imaging
Apache License 2.0
6 stars 8 forks source link

ENH: Add test for initial OAI testing data #65

Closed thewtex closed 3 years ago

thewtex commented 3 years ago

Created with the script:

  #!/usr/bin/env python

  from pathlib import Path
  import os
  import subprocess
  import itk
  import pandas as pd
  import zarr
  from numcodecs import Blosc
  import json
  import numpy as np

  oai_data_root = '/mnt/cybertron/OAI'  # root of the local OAI image archive
  dess_file = '/home/matt/src/OAI_analysis/data/SEG_3D_DESS_all.csv'  # CSV listing the 3D DESS series

  # Work relative to the archive root so the per-time-point folders below resolve.
  os.chdir(oai_data_root)

  # Follow-up visits, expressed in months after the baseline scan.
  time_points = [0, 12, 18, 24, 30, 36, 48, 72, 96]
  # Baseline images live in their own folder; every later visit follows
  # the OAI<months>MonthImages naming pattern.
  time_point_folders = []
  for months in time_points:
      if months == 0:
          time_point_folders.append('OAIBaselineImages')
      else:
          time_point_folders.append(f'OAI{months}MonthImages')
  # Patients seen at each time point (filled in as acquisitions are processed).
  time_point_patients = dict()
  for months in time_points:
      time_point_patients[months] = set()

  # One row per 3D DESS series in the OAI release (Folder, SeriesDescription, ...).
  dess_df = pd.read_csv(dess_file)

  # We can use all images in dess_df in the future; for now a small fixed
  # cohort keeps the testing dataset manageable.
  target_patients = [9279291, 9298954, 9003380]

  output_prefix = Path('/home/matt/data/hasi-testing-data/knee')  # root of the generated test-data tree

  image_types = set()
  # Nested index {patient_id: {month_id: {image_dir_name: ipfs_cid}}},
  # dumped to index.json at the end of the script.
  data_index = {}
  # Structured dtype for one row of the flat index table. Fixed-width
  # unicode fields are spelled as 'U<n>' dtype strings directly: the
  # np.unicode_ alias (an alias of np.str_) was removed in NumPy 2.0,
  # while 'U<n>' is equivalent and portable across NumPy versions.
  data_dtypes = np.dtype([('Months', np.uint16),
                          ('PatientID', 'U64'),
                          ('Laterality', 'U8'),
                          ('Path', 'U1024'),
                          ('CID', 'U64'),
                          ('Type', 'U128'),
                          ('Name', 'U256')])
  # One single-row DataFrame per exported image, concatenated at the end.
  data_rows = []
  for patient in target_patients[:3]:
      # Reuse the module-level time_points list instead of re-hardcoding
      # the month values here (they previously had to be kept in sync by hand).
      for time_point_index, time_point in enumerate(time_points):
          folder = Path(time_point_folders[time_point_index]) / 'results'
          for study in folder.iterdir():
              if study.is_dir():
                  for patient_dir in study.iterdir():
                      if patient_dir.match(str(patient)):

                          # Select the DESS series rows belonging to this acquisition.
                          acquisition_id = patient_dir.relative_to(folder)
                          acquisition_dess = dess_df['Folder'].str.contains(str(acquisition_id))
                          acquisition_df = dess_df.loc[acquisition_dess, :]
                          dess_count = 0
                          for _, descr in acquisition_df.iterrows():
                              # Laterality is only encoded in the free-text series description.
                              is_left = descr['SeriesDescription'].find('LEFT') > -1
                              vol_folder = folder / descr['Folder']
                              image = itk.imread(str(vol_folder))
                              md = itk.MetaDataDictionary()
                              md['laterality'] = 'left' if is_left else 'right'
                              image.SetMetaDataDictionary(md)
                              # Pseudonymize the patient id: its IPFS CID stands in for the raw id.
                              patient_cid = subprocess.check_output(['ipfs', 'add', '--cid-version', '1', '--raw-leaves', '-Q'], input=str(patient).encode()).decode().strip()
                              month_id = f'Month-{time_point}'
                              patient_id = f'PatientID-{patient_cid}'
                              output_dir = output_prefix / patient_id / month_id
                              # exist_ok avoids the check-then-create race of a
                              # separate exists()/mkdir() pair.
                              output_dir.mkdir(parents=True, exist_ok=True)
                              data_index.setdefault(patient_id, {}).setdefault(month_id, {})

                              output_image_dir = output_dir / 'Images' / f'SAG_3D_DESS_{dess_count}.zarr'
                              print(output_image_dir)
                              image_da = itk.xarray_from_image(image)
                              name = output_image_dir.stem
                              image_ds = image_da.to_dataset(name=name)
                              store = zarr.DirectoryStore(output_image_dir)
                              chunk_size = 64
                              compressor = Blosc(cname='zstd', clevel=5, shuffle=Blosc.SHUFFLE)
                              image_ds.to_zarr(store, mode='w', compute=True,
                                               encoding={name: {'chunks': [chunk_size]*image.GetImageDimension(), 'compressor': compressor}})
                              # Pin the zarr store on IPFS and record its content id.
                              cid = subprocess.check_output(['ipfs', 'add', '-r', '--hidden', '-s', 'size-1000000',
                                                             '--raw-leaves', '--cid-version', '1', '-Q',
                                                             str(output_image_dir)]).decode().strip()
                              data_index[patient_id][month_id][str(output_image_dir.name)] = cid
                              # Build a one-record structured array (note the enclosing
                              # list) rather than a 0-d structured scalar: the DataFrame
                              # constructor maps the record's fields to columns directly,
                              # so no explicit index argument is needed.
                              # NOTE(review): the on-disk name uses 'SAG_3D_DESS_*' while
                              # the index 'Name' column uses 'SEG_3D_DESS_*' — confirm
                              # this mismatch is intentional.
                              data_row = np.array([(time_point,
                                                    patient_cid,
                                                    str(md['laterality']),
                                                    str(output_image_dir.relative_to(output_prefix)),
                                                    cid,
                                                    'Image',
                                                    f'SEG_3D_DESS_{dess_count}')],
                                                  dtype=data_dtypes)
                              data_rows.append(pd.DataFrame(data_row))

                              dess_count += 1

  # Persist the nested {patient: {month: {image: cid}}} index as JSON.
  index_json_path = output_prefix / 'index.json'
  with open(index_json_path, 'w') as json_file:
      json.dump(data_index, json_file)

  # Flatten the per-image rows into a single table and write it out in
  # both parquet (partitioned by patient and month) and CSV form.
  data_df = pd.concat(data_rows, ignore_index=True)
  parquet_path = output_prefix / 'index.parquet'
  data_df.to_parquet(parquet_path, partition_cols=['PatientID', 'Months'], engine='pyarrow')
  data_df.to_csv(output_prefix / 'index.csv')
HastingsGreer commented 3 years ago

Very exciting!