nyukat / mammography_metarepository

Meta-repository of screening mammography classifiers
https://arxiv.org/abs/2108.04800
BSD 2-Clause "Simplified" License
64 stars 11 forks source link

.pkl generation #15

Closed cyh-0 closed 2 years ago

cyh-0 commented 2 years ago

Hi Jwitos,

I am wondering if it's possible for you to share the data.pkl generator file for the CMMD dataset?

Cheers

jwitos commented 2 years ago

Hi @Ch1kara, apologies again for the late reply. This snippet should generate the correct pkl file for the CMMD dataset:

import os
import pandas as pd
from pathlib import Path
import pydicom
import pickle

# Build the CMMD data list: one dict per study holding the image identifiers
# for each laterality/view plus per-breast cancer labels, then pickle the list.
df = pd.read_excel("/ChineseMammographyDatabase/CMMD_clinicaldata.xlsx")  # Path to clinical data excel file
cmmd_dir = "/ChineseMammographyDatabase/CMMD"  # Directory where your CMMD studies are stored
print(len(df.drop_duplicates('ID1')))  # sanity check: how many studies total; result should be 1,775

datalist = []
for d in sorted(os.listdir(cmmd_dir)):
    if d == 'D1-0951':
        continue  # This study fails at preprocessing for the GMIC model.

    study_dir = os.path.join(cmmd_dir, d)
    if not os.path.isdir(study_dir):
        # Stray files next to the study folders are not studies; without this
        # guard they would be appended as entries with no images.
        continue

    study_dict = {
        "L-MLO": [],
        "R-MLO": [],
        "L-CC": [],
        "R-CC": [],
        "horizontal_flip": "NO",
        "cancer_label": {
            "left_malignant": 0,
            "right_malignant": 0,
            "left_benign": 0,
            "right_benign": 0
        }
    }

    # Find all dicoms for the study and assign
    # to correct laterality / views.
    # Sorted so the order inside each view list does not depend on the
    # filesystem's directory enumeration order.
    for path in sorted(Path(study_dir).rglob('*.dcm')):
        # Only header tags are needed below, so skip loading the pixel data.
        ds = pydicom.dcmread(path.absolute(), stop_before_pixels=True)

        # Figure out mammo view (MLO versus CC)
        view_meaning = ds.ViewCodeSequence[0].CodeMeaning
        if view_meaning == 'medio-lateral oblique':
            mammo_view = 'MLO'
        elif view_meaning == 'cranio-caudal':
            mammo_view = 'CC'
        else:
            # Report the value read from the file; `mammo_view` is either
            # unbound or left over from a previous image at this point.
            raise ValueError(f"Unsupported mammo view {view_meaning}")

        lat_and_view = f"{ds.ImageLaterality}-{mammo_view}"  # get a key of Laterality-MammoView
        # `path.stem` drops exactly the ".dcm" suffix. `str.strip(".dcm")`
        # would instead remove any leading/trailing '.', 'd', 'c', 'm' chars.
        study_dict[lat_and_view].append(os.path.join(d, path.stem))  # assign image path to correct view

    # Find correct labels.
    # Only malignant findings are flagged here; the benign labels keep their
    # default value of 0.
    study_df = df[df.ID1 == d]
    for _, row in study_df.iterrows():
        if row.classification == 'Malignant':
            if row.LeftRight == 'L':
                study_dict['cancer_label']['left_malignant'] = 1
            elif row.LeftRight == 'R':
                study_dict['cancer_label']['right_malignant'] = 1
            else:
                raise ValueError(
                    f"Unexpected LeftRight value {row.LeftRight!r} for study {d}"
                )

    datalist.append(study_dict)

with open("cmmd_datalist.pkl", "wb") as f:
    pickle.dump(datalist, f)

HTH and feel free to reopen if you have any further questions

cyh-0 commented 2 years ago

Thanks again for the help! really appreciated!