Date in the zip filename and in the dicom header

kcho commented 1 year ago

DPACC currently depends on the date entered in the run sheet or in the zip filename when finding missing scans, running BIDS conversion, etc. However, we need to start treating the date in the dicom headers as the true date of the scan, and flag any run sheets and zip files whose dates do not match it.

nickckim commented 1 year ago

I am beginning to build a framework that creates a report comparing dicom information against zip file names and run sheet information. One difficulty is that some subjects have missing or unexpected values for "PatientID" (participant id), "PatientName" (subject id), or "StudyDate" (acquisition date) in their dicoms. In other words, it is not safe to assume that every dicom carries, for example, an acquisition date.

Please see /PHShome/nk582/dicom_id_info_dict.csv

nickckim commented 1 year ago

import random
from pathlib import Path

import pydicom

def _pick_random_file_in_session(dicom_session_directory):
    """
    Pick one random file from one random dicom directory of a session.

    Dicom directories are tried in random order until one containing at least
    one regular file is found. A warning is printed for every directory that
    turns out to hold no files.

    Parameters:
    - dicom_session_directory: Path object of a session directory.

    Returns:
    - Path of the selected file, or None if no file could be selected.
    """

    # <session>/*
    dicom_directories = sorted(
        d for d in dicom_session_directory.glob("*") if d.is_dir()
    )

    if not dicom_directories:
        print(f"Warning: No dicom_directories found in {dicom_session_directory}.")
        return None

    # Walking a shuffled list is the same as repeatedly drawing a random
    # directory and discarding it when it holds no files.
    random.shuffle(dicom_directories)

    for dicom_directory in dicom_directories:
        # <session>/*/*
        files_in_dicom_directory = sorted(
            f for f in dicom_directory.iterdir() if f.is_file()
        )

        if files_in_dicom_directory:
            return random.choice(files_in_dicom_directory)

        print(f"Warning: No files found in {dicom_directory}.")

    return None


def get_random_dicom_path_per_session(dicom_root):
    """
    Get random dicom path for each session in root directory.

    1. Start:
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata ->

    2. Loop through each subject directory (two letters followed by five
       characters):
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345 ->

    3. Loop through each session directory:
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011 ->

    4. Checks to make sure at least one dicom directory exists:
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011/* ->

    5. Checks to make sure at least one file exists:
       /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011/*/*

    6. Finish:
       Selects random file from random dicom directory

    A warning is printed, and the level is skipped, whenever a step finds
    nothing. Directories are visited in sorted order so that the result order
    is stable and a seeded `random` gives reproducible selections.

    Parameters:
    - dicom_root: Dicom root directory.

    Returns:
    - List with one random dicom path (Path object) for each session in which
      a file was found. Empty list if no subject directory exists.
    """

    dicom_root = Path(dicom_root)

    # /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345
    dicom_subject_directories = sorted(
        d for d in dicom_root.glob("[A-Za-z][A-Za-z]?????") if d.is_dir()
    )

    if not dicom_subject_directories:
        print(f"Warning: No dicom_subject_directories found in {dicom_root}.")
        return []

    random_dicom_path_per_session = []

    for dicom_subject_directory in dicom_subject_directories:
        # /data/predict1/data_from_nda/MRI_ROOT/sourcedata/AB12345/ses-190001011
        # Only directories count as sessions, consistent with the other levels.
        dicom_session_directories = sorted(
            d for d in dicom_subject_directory.glob("ses-*") if d.is_dir()
        )

        if not dicom_session_directories:
            # The globbed path already includes dicom_root, so it is printed
            # as is rather than being prefixed with its parents again.
            print(
                f"Warning: No dicom_session_directories found in {dicom_subject_directory}."
            )
            continue

        for dicom_session_directory in dicom_session_directories:
            random_dicom_path = _pick_random_file_in_session(dicom_session_directory)
            if random_dicom_path is not None:
                random_dicom_path_per_session.append(random_dicom_path)

    return random_dicom_path_per_session

def get_id_info_from_dicom(path_to_dicom):
    """
    Get info relevant to participant id from the header of a dicom.

    Only the header is read; pixel data is skipped because none of the
    extracted fields live there.

    Parameters:
    - path_to_dicom: Path to dicom.

    Returns:
    - Dictionary with a single entry keyed by the dicom's PatientID:
        {
            PatientID: {
                "Subject": PatientName,
                "Session": StudyDate,
                "Path": path to the dicom as a string
            }
        }
      PatientID, PatientName and StudyDate are each None when the
      corresponding tag is absent from the dicom.
    """

    # Stop before the pixel data: the three tags below are header fields, so
    # loading the image itself would only cost time and memory.
    ds = pydicom.dcmread(path_to_dicom, stop_before_pixels=True)

    # Participant ID
    patient_id = ds.get("PatientID", None)
    # Subject ID
    subject_id = ds.get("PatientName", None)
    # Acquisition date, used as the session identifier
    session_id = ds.get("StudyDate", None)

    return {
        patient_id: {
            "Subject": subject_id,
            "Session": session_id,
            "Path": str(path_to_dicom),
        }
    }

def get_dicom_id_info_dict(dicom_root):
    """
    Calls get_random_dicom_path_per_session and get_id_info_from_dicom to get
    info relevant to participant id from one random dicom per session.

    The result is keyed by PatientID, so it can hold only one entry per
    PatientID. When several dicoms report the same PatientID (for example
    several sessions of one participant, or several dicoms with no PatientID
    at all, which all share the key None), the entry read last is kept and a
    warning is printed for each entry that gets replaced.

    Parameters:
    - dicom_root: Dicom root directory.

    Returns:
    - dicom_id_info_dict: Dictionary of dictionaries from get_id_info_from_dicom.
    """

    random_dicom_path_per_session = get_random_dicom_path_per_session(dicom_root)
    dicom_id_info_dict = {}

    for path in random_dicom_path_per_session:
        info = get_id_info_from_dicom(path)

        for patient_id, id_info in info.items():
            # Make the overwrite visible instead of silently losing a session.
            if patient_id in dicom_id_info_dict:
                replaced_path = dicom_id_info_dict[patient_id]["Path"]
                kept_path = id_info["Path"]
                print(
                    f"Warning: PatientID {patient_id!r} found in more than one dicom; "
                    f"replacing entry from {replaced_path} with entry from {kept_path}."
                )
            dicom_id_info_dict[patient_id] = id_info

    return dicom_id_info_dict

nickckim commented 1 year ago

import csv

# Build the PatientID -> info dictionary for the MRI source-data root.
mri_sourcedata_root = "/data/predict1/data_from_nda/MRI_ROOT/sourcedata"
dicom_id_info_dict = get_dicom_id_info_dict(mri_sourcedata_root)

def save_to_csv(data, filename):
    """
    Write the dicom id info dictionary to a CSV file.

    The file is overwritten if it exists and is always written as UTF-8, so
    that names containing non-ASCII characters are stored the same way
    regardless of the locale of the machine running the script.

    Parameters:
    - data: Dictionary keyed by PatientID whose values are dictionaries with
      the keys "Subject", "Session" and "Path".
    - filename: Path of the CSV file to write.

    Returns:
    - None. One header row is written, followed by one row per entry in data.
    """
    with open(filename, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["PatientID", "Subject", "Session", "Path"])
        writer.writerows(
            [patient_id, info["Subject"], info["Session"], info["Path"]]
            for patient_id, info in data.items()
        )

# Write the report to a CSV file; the relative name resolves against the
# current working directory.
save_to_csv(data=dicom_id_info_dict, filename="dicom_id_info_dict.csv")

AMP-SCZ / qqc

Date in the zip filename and in the dicom header #56