Open kcho opened 1 year ago
Beginning to build a framework that creates a report comparing DICOM header information against zip file names and run sheet information. One difficulty is that some subjects have missing or unexpected values for "PatientID" (participant id), "PatientName" (subject id), or "StudyDate" (acquisition date) in their DICOM headers. For example, it is not safe to assume that every DICOM file contains an acquisition date.
Please see /PHShome/nk582/dicom_id_info_dict.csv
import random
from pathlib import Path
import pydicom
def _pick_random_file_in_session(dicom_session_directory):
    """
    Pick a random file from a random non-empty dicom directory of a session.

    Dicom directories are tried in random order; each empty one is reported
    with a warning and skipped.

    Parameters:
    - dicom_session_directory: Path to a single ses-* directory.

    Returns:
    - Path to a randomly chosen file, or None when the session has no dicom
      directory that contains at least one file.
    """
    # /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011/*
    dicom_directories = sorted(
        d for d in dicom_session_directory.glob("*") if d.is_dir()
    )
    if not dicom_directories:
        print(f"Warning: No dicom_directories found in {dicom_session_directory}.")
        return None

    # random.sample over the full list yields the directories in random order
    # without repetition, so every directory is tried at most once.
    for random_dicom_directory in random.sample(
        dicom_directories, k=len(dicom_directories)
    ):
        # /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011/*/*
        files_in_random_dicom_directory = sorted(
            f for f in random_dicom_directory.iterdir() if f.is_file()
        )
        if files_in_random_dicom_directory:
            return random.choice(files_in_random_dicom_directory)
        print(f"Warning: No files found in {random_dicom_directory}.")

    return None


def get_random_dicom_path_per_session(dicom_root):
    """
    Get random dicom path for each session in root directory.

    1. Start:
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata ->
    2. Loop through each subject directory:
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345 ->
    3. Loop through each session directory:
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011 ->
    4. Checks to make sure at least one dicom directory exists:
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011/* ->
    5. Checks to make sure at least one file exists:
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011/*/*
    6. Finish:
       Selects random file from random dicom directory

    A warning is printed (and the entry skipped) whenever a level of the
    hierarchy is empty. Directory listings are sorted so that the order of
    the returned list does not depend on filesystem enumeration order.

    Parameters:
    - dicom_root: Dicom root directory.

    Returns:
    - List of random dicom paths (pathlib.Path), one for each subject/ses-*
      directory that contains at least one file. Empty list when no subject
      directory is found.
    """
    dicom_root = Path(dicom_root)
    random_dicom_path_per_session = []

    # /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345
    # Subject directories: two letters followed by any five characters.
    dicom_subject_directories = sorted(
        d for d in dicom_root.glob("[A-Za-z][A-Za-z]?????") if d.is_dir()
    )
    if not dicom_subject_directories:
        print(f"Warning: No dicom_subject_directories found in {dicom_root}.")
        return []

    for dicom_subject_directory in dicom_subject_directories:
        # /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011
        dicom_session_directories = sorted(
            d for d in dicom_subject_directory.glob("ses-*") if d.is_dir()
        )
        if not dicom_session_directories:
            # The globbed paths already include dicom_root, so print them
            # as-is instead of prefixing the root a second time.
            print(
                "Warning: No dicom_session_directories found in "
                f"{dicom_subject_directory}."
            )
            continue

        for dicom_session_directory in dicom_session_directories:
            random_file = _pick_random_file_in_session(dicom_session_directory)
            if random_file is not None:
                random_dicom_path_per_session.append(random_file)

    return random_dicom_path_per_session
def get_id_info_from_dicom(path_to_dicom):
    """
    Get info relevant to participant id from dicom.

    Reads the "PatientID", "PatientName" and "StudyDate" elements from the
    dicom header. Any element that is absent from the file is reported as
    None, so the outer key of the returned dictionary is None when
    "PatientID" is missing.

    Parameters:
    - path_to_dicom: Path to dicom.

    Returns:
    - Dictionary with following structure:
      {
          Participant ID: {
              "Subject": Value,
              "Session": Value,
              "Path": Value
          }
      }

    Raises:
    - Whatever pydicom.dcmread raises for an unreadable or invalid file;
      no exception is handled here.
    """
    # Load DICOM header only: the three elements read below live in the
    # header, so there is no need to read the pixel data as well.
    ds = pydicom.dcmread(path_to_dicom, stop_before_pixels=True)

    # Extract Participant ID
    patient_id = ds.get("PatientID", None)
    # Extract Subject ID
    subject_id = ds.get("PatientName", None)
    # Extract Session ID
    session_id = ds.get("StudyDate", None)

    return {
        patient_id: {
            "Subject": subject_id,
            "Session": session_id,
            "Path": str(path_to_dicom),  # Path of the DICOM file that was read
        }
    }
def get_dicom_id_info_dict(dicom_root):
    """
    Calls get_random_dicom_path_per_session and get_id_info_from_dicom to get
    info relevant to participant id from random dicoms for all sub-*/ses-*.

    The result is keyed by the participant id read from each dicom, so two
    dicoms reporting the same participant id (including a missing id, which
    is reported as None) cannot both be kept. The later one wins, and a
    warning is printed so that the dropped entry does not go unnoticed.

    Parameters:
    - dicom_root: Dicom root directory.

    Returns:
    - dicom_id_info_dict: Dictionary of dictionaries from get_id_info_from_dicom.
    """
    random_dicom_path_per_session = get_random_dicom_path_per_session(dicom_root)
    dicom_id_info_dict = {}
    for path in random_dicom_path_per_session:
        info = get_id_info_from_dicom(path)
        for patient_id, id_info in info.items():
            if patient_id in dicom_id_info_dict:
                replaced_path = dicom_id_info_dict[patient_id].get("Path")
                print(
                    f"Warning: PatientID {patient_id!r} from {path} replaces "
                    f"the entry previously read from {replaced_path}."
                )
            dicom_id_info_dict[patient_id] = id_info
    return dicom_id_info_dict
import csv
# Collect the id info of one random dicom per session under the MRI source
# data root.
dicom_id_info_dict = get_dicom_id_info_dict(
    dicom_root="/data/predict1/data_from_nda/MRI_ROOT/sourcedata"
)
def save_to_csv(data, filename):
    """
    Write the id info dictionary to a csv file.

    The file gets a header row ("PatientID", "Subject", "Session", "Path")
    followed by one row per entry of data. None values are written as empty
    fields by the csv module.

    Parameters:
    - data: Dictionary mapping participant id to a dictionary with the keys
      "Subject", "Session" and "Path".
    - filename: Path of the csv file to create (overwritten if it exists).

    Raises:
    - KeyError when an entry lacks one of the "Subject", "Session" or "Path"
      keys.
    """
    # Explicit UTF-8 so that non-ASCII names do not depend on the locale's
    # default encoding; newline="" is what the csv module requires.
    with open(filename, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["PatientID", "Subject", "Session", "Path"])
        writer.writerows(
            [patient_id, info["Subject"], info["Session"], info["Path"]]
            for patient_id, info in data.items()
        )
# Write the collected id info to a csv file in the current working directory.
save_to_csv(data=dicom_id_info_dict, filename="dicom_id_info_dict.csv")
DPACC currently depends on the date entered in the run sheet or in the filename when finding missing scans, performing BIDS conversion, etc. However, we need to start using the date in the DICOM headers as the true date of the scan and flag any run sheets and zip files whose dates do not match it.