Closed cyh-0 closed 2 years ago
Hi @Ch1kara, again, apologies for the late reply. This snippet should generate the correct pkl file for the CMMD dataset:
import os
import pandas as pd
from pathlib import Path
import pydicom
import pickle
# Build the datalist pickle for the CMMD dataset: one dict per study holding
# the image keys grouped by laterality/view plus per-breast cancer labels.
df = pd.read_excel("/ChineseMammographyDatabase/CMMD_clinicaldata.xlsx")  # Path to clinical data excel file
cmmd_dir = "/ChineseMammographyDatabase/CMMD"  # Directory where your CMMD studies are stored
print(len(df.drop_duplicates('ID1')))  # sanity check: how many studies total; result should be 1,775

datalist = []
for d in sorted(os.listdir(cmmd_dir)):
    if d == 'D1-0951':
        continue  # This study fails at preprocessing for the GMIC model.

    study_dict = {
        "L-MLO": [],
        "R-MLO": [],
        "L-CC": [],
        "R-CC": [],
        "horizontal_flip": "NO",
        "cancer_label": {
            "left_malignant": 0,
            "right_malignant": 0,
            "left_benign": 0,
            "right_benign": 0,
        },
    }
    study_dir = os.path.join(cmmd_dir, d)

    # Find all dicoms for the study and assign
    # to correct laterality / views
    for path in Path(study_dir).rglob('*.dcm'):
        ds = pydicom.dcmread(path.absolute())

        # Figure out mammo view (MLO versus CC)
        view_meaning = ds.ViewCodeSequence[0].CodeMeaning
        if view_meaning == 'medio-lateral oblique':
            mammo_view = 'MLO'
        elif view_meaning == 'cranio-caudal':
            mammo_view = 'CC'
        else:
            # Report the value read from the DICOM; `mammo_view` is either
            # unbound or left over from the previous image at this point.
            raise ValueError(f"Unsupported mammo view {view_meaning}")

        # Key of Laterality-MammoView; a laterality other than L/R raises
        # KeyError because study_dict only has the four keys above.
        lat_and_view = f"{ds.ImageLaterality}-{mammo_view}"

        # Store "<study>/<file name without extension>". `path.stem` drops only
        # the ".dcm" suffix, whereas str.strip(".dcm") would also eat leading or
        # trailing 'd', 'c', 'm' and '.' characters of the file name itself.
        study_dict[lat_and_view].append(os.path.join(d, path.stem))

    # Find correct labels. Only rows classified as 'Malignant' set a label;
    # the *_benign entries are left at 0 by this script.
    study_df = df[df.ID1 == d]
    for _, row in study_df.iterrows():
        if row.classification == 'Malignant':
            if row.LeftRight == 'L':
                study_dict['cancer_label']['left_malignant'] = 1
            elif row.LeftRight == 'R':
                study_dict['cancer_label']['right_malignant'] = 1
            else:
                raise ValueError(
                    f"Unexpected LeftRight value {row.LeftRight!r} for study {d}"
                )

    datalist.append(study_dict)

with open("cmmd_datalist.pkl", "wb") as f:
    pickle.dump(datalist, f)
HTH and feel free to reopen if you have any further questions
Thanks again for the help! really appreciated!
Hi Jwitos,
I am wondering if it's possible for you to share the data.pkl generator file for the CMMD dataset?
Cheers