Closed — thewtex closed this issue 3 years ago.
Created with the script:
#!/usr/bin/env python
"""Export OAI SAG 3D DESS knee MRI volumes to Zarr stores and build an
IPFS-addressed index (JSON + Parquet + CSV) for a set of target patients.

Side effects: reads DICOM series from the OAI archive under
``oai_data_root``, writes one Zarr store per acquisition under
``output_prefix``, shells out to the ``ipfs`` CLI to content-address both
patient ids and image stores, and writes ``index.json`` /
``index.parquet`` / ``index.csv``.
"""
from pathlib import Path
import os
import subprocess
import json

import itk
import pandas as pd
import zarr
from numcodecs import Blosc
import numpy as np

oai_data_root = '/mnt/cybertron/OAI'
dess_file = '/home/matt/src/OAI_analysis/data/SEG_3D_DESS_all.csv'
# Time-point folders are addressed relative to the archive root.
os.chdir(oai_data_root)

# Follow-up visits, in months since baseline.
time_points = [0, 12, 18, 24, 30, 36, 48, 72, 96]
time_point_folders = ['OAIBaselineImages'] + \
    [f'OAI{m}MonthImages' for m in time_points[1:]]
time_point_patients = {tp: set() for tp in time_points}

dess_df = pd.read_csv(dess_file)

# We can use all images in dess_df in the future
target_patients = [9279291, 9298954, 9003380]

output_prefix = Path('/home/matt/data/hasi-testing-data/knee')

image_types = set()
data_index = {}
# np.str_ replaces the np.unicode_ alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 2.0; the dtype is identical.
data_dtypes = np.dtype([('Months', np.uint16),
                        ('PatientID', np.str_, 64),
                        ('Laterality', np.str_, 8),
                        ('Path', np.str_, 1024),
                        ('CID', np.str_, 64),
                        ('Type', np.str_, 128),
                        ('Name', np.str_, 256)])
data_rows = []

for patient in target_patients:
    # Reuse time_points rather than repeating the literal list.
    for time_point_index, time_point in enumerate(time_points):
        folder = Path(time_point_folders[time_point_index]) / 'results'
        for study in folder.iterdir():
            if not study.is_dir():
                continue
            for patient_dir in study.iterdir():
                if not patient_dir.match(str(patient)):
                    continue
                acquisition_id = patient_dir.relative_to(folder)
                acquisition_dess = dess_df['Folder'].str.contains(str(acquisition_id))
                acquisition_df = dess_df.loc[acquisition_dess, :]
                dess_count = 0
                for _, descr in acquisition_df.iterrows():
                    is_left = descr['SeriesDescription'].find('LEFT') > -1
                    vol_folder = folder / descr['Folder']
                    image = itk.imread(str(vol_folder))
                    md = itk.MetaDataDictionary()
                    md['laterality'] = 'left' if is_left else 'right'
                    image.SetMetaDataDictionary(md)

                    # Content-address the patient id so output paths do not
                    # expose the raw OAI patient identifier.
                    patient_cid = subprocess.check_output(
                        ['ipfs', 'add', '--cid-version', '1',
                         '--raw-leaves', '-Q'],
                        input=str(patient).encode()).decode().strip()
                    month_id = f'Month-{time_point}'
                    patient_id = f'PatientID-{patient_cid}'
                    output_dir = output_prefix / patient_id / month_id
                    # exist_ok avoids the check-then-create race of the
                    # original exists()/mkdir() pair.
                    output_dir.mkdir(parents=True, exist_ok=True)
                    # setdefault replaces the `if not key in dict` idiom.
                    data_index.setdefault(patient_id, {}).setdefault(month_id, {})

                    output_image_dir = output_dir / 'Images' / f'SAG_3D_DESS_{dess_count}.zarr'
                    print(output_image_dir)

                    image_da = itk.xarray_from_image(image)
                    name = output_image_dir.stem
                    image_ds = image_da.to_dataset(name=name)
                    store = zarr.DirectoryStore(output_image_dir)
                    chunk_size = 64
                    compressor = Blosc(cname='zstd', clevel=5,
                                       shuffle=Blosc.SHUFFLE)
                    image_ds.to_zarr(
                        store, mode='w', compute=True,
                        encoding={name: {'chunks': [chunk_size] * image.GetImageDimension(),
                                         'compressor': compressor}})

                    cid = subprocess.check_output(
                        ['ipfs', 'add', '-r', '--hidden', '-s', 'size-1000000',
                         '--raw-leaves', '--cid-version', '1', '-Q',
                         str(output_image_dir)]).decode().strip()
                    data_index[patient_id][month_id][str(output_image_dir.name)] = cid

                    # NOTE(review): the on-disk store is named 'SAG_3D_DESS_*'
                    # while the index row Name is 'SEG_3D_DESS_*' — one of the
                    # two looks like a typo; kept as-is to preserve outputs.
                    data_row = np.array(
                        (time_point, patient_cid, str(md['laterality']),
                         str(output_image_dir.relative_to(output_prefix)),
                         cid, 'Image', f'SEG_3D_DESS_{dess_count}'),
                        dtype=data_dtypes)
                    data_rows.append(pd.DataFrame(data_row, index=np.arange(1)))
                    dess_count += 1

data_index_json = output_prefix / 'index.json'
with open(data_index_json, 'w') as fp:
    json.dump(data_index, fp)

data_df = pd.concat(data_rows, ignore_index=True)
data_df.to_parquet(output_prefix / 'index.parquet',
                   partition_cols=['PatientID', 'Months'],
                   engine='pyarrow')
data_df.to_csv(output_prefix / 'index.csv')
Very exciting!
Created with the script: