Open shaojieguoECNU opened 14 hours ago
This notebook actually comes from a very old version of my EDA on AAPM data, thus may containing deprecated code. But don't worry, once you are familiar with the AAPM archive structure, the essential processing code just looks as simple as the following.
import os
import SimpleITK as sitk
def ima2arr(ima_path):
return sitk.GetArrayFromImage(sitk.ReadImage(ima_path))
def find_files_with_string(directory, strings='', ext=''):
"""
In `directory`, recursively find all files that contains specific string in `strings` and extension
as `ext`. Examples: `find_files_with_string('/path/to/data', strings=['ABC', '123'], exts='txt')`
will return a list of full paths for all `txt` files in `'/path/to/data'` with both `'ABC'` and `'123'`
contained in their file names. The output will look like:
[
'/path/to/data/12ea_ABC_123.txt',
'/path/to/data/subfolder/cea_123ABC_1ag.txt',
'/path/to/data/subfolder/subsubfolder/s_ABC00123.txt',
...
]
"""
matched_paths = []
if not isinstance(strings, (list, tuple)):
strings = [strings] # make it a list by default
if not os.path.exists(directory):
raise ValueError(f'Path does not exist: {directory}.')
for root, dirs, files in os.walk(directory):
for d in dirs[:]:
# Deal with extensions like `.nii.gz`. These files will be seen as directories by `os.walk`
if d.endswith('.nii.gz'):
dirs.remove(d) # Remove it from dirs, so we don’t recurse into it
files.append(d) # Treat it as a file
for file in files:
cond1 = all(s in file for s in strings)
cond2 = file.endswith(ext)
if cond1 and cond2:
matched_paths.append(os.path.join(root, file))
return sorted(matched_paths)
# We want 1mm B30 full dose data:
data_dir = 'path/to/your/aapm/Training_Image_Data/1mm B30'
ima_paths = find_files_with_string(data_dir, strings='FD', ext='.IMA')
save_dir = 'path/to/your/processed/dir' # define where to save the processed data
os.makedirs(save_dir, exist_ok=True)
for ima_path in ima_paths:
ima_file = ima_path.split('/')[-1] # if you are using Windows OS, the split string should be '\\' instead of '/'
npy_file = ima_file.replace('.IMA', '.npy')
npy_path = os.path.join(save_dir, npy_file)
arr = ima2arr(ima_path) # shape: [1, 512, 512], containing HU values
np.save(npy_path, arr)
In this way, you can see the converted files in save_dir
. For data normalization (important step before feeding data to deep learning models), you can either normalize the HU values (as defined below by normalize_hu
) or convert HU to attenuation coefficient (as defined below by hu2mu
). Generally, I prefer the former because it does not depend on the energy level which will affect mu_water
and mu_air
.
def normalize_hu(hu, min_hu=-1000, max_hu=2000):
if hasattr(hu, 'clip'):
hu = hu.clip(min_hu, max_hu)
return (hu - min_hu) / (max_hu - min_hu)
def hu2mu(hu, mu_water=0.192, mu_air=0.0):
# See https://solutioinsilico.com/medical-physics/labs/ct-hu-calculation.php for formula definition
return hu * (mu_water - mu_air) / 1000 + mu_water
Once you get all the .npy
files, you can customize your rule to split training/test data.
import matplotlib.pyplot as plt
hu = np.load(npy_path) # choose one of the file in the `save_dir`
hu = hu.squeeze() # AAPM data have additional dummy dimension
norm_hu = normalize_hu(hu)
print('Info of HU: ', hu.shape, hu.min(), hu.max())
print('Info of normalized HU: ', norm_hu.shape, norm_hu.min(), norm_hu.max())
plt.imshow(normalized_hu, cmap='gray')
when i run the process_aapm.ipynb, it said ModuleNotFoundError: No module named 'utilities.tomo_tools_astra