Open dancoster opened 1 year ago
Use the table of significant pairs as input (= df_meds).
import numpy as np
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Names of the measurement columns, grouped into vital signs and lab panels.
# (The panel names below presumably stand for basic metabolic panel, complete
# blood count and coagulation -- confirm with the author.)
vital_signs = [
    'Heart Rate',
    'Respiratory rate',
    'Oxygen saturation',
    'Systolic blood pressure',
    'Diastolic blood pressure',
    'Temperature',
]
labs_bmp = [
    'Glucose',
    'Potassium',
    'Sodium',
    'Chloride',
    'Creatinine',
    'Blood urea nitrogen',
    'Bicarbonate',
    'Calcium',
    'Albumin',
    'Lactate dehydrogenase',
    'Magnesium',
    'Lactic acid',
]
labs_cbc = [
    'Hematocrit',
    'Hemoglobin',
    'Platelets',
    'White blood cell count',
    'Red blood cell count',
    'Mean corpuscular volume',
    'Lymphocytes',
    'Neutrophils',
]
labs_cauglation = [
    'Prothrombin time INR',
]
## Load files
# NOTE(review): joining res_path with ".csv" produces a path whose final
# component is literally ".csv", and data_path is not referenced anywhere else
# in the visible code -- confirm whether a real file name was intended here.
data_path = os.path.join(f"{res_path}", ".csv")
import numpy as np
# Params
n_patients = 25000       # maximum number of distinct admissions (HADM_ID) to keep
min_p_val = 1e-10        # pairs are kept only when 'BonferroniPvals' is below this
min_no_patients = 100    # pairs are kept only when 'No. of Patients' exceeds this
horizon = '4h'

# Load dataset
df_data = pd.read_csv(os.path.join(res_path, "hirid_extract_with_labels_48_all_parts.csv"))
df_data['CHARTTIME'] = pd.to_datetime(df_data['CHARTTIME'], utc=True)

# Take a random subset of at most n_patients distinct admissions.
# Sampling is done without replacement: the default (replace=True) can draw the
# same id several times, so the isin() filter below would silently keep fewer
# admissions than requested. The size is capped because sampling without
# replacement cannot return more ids than exist.
patient_ids = df_data.HADM_ID.unique()
subsample_ids = np.random.choice(
    patient_ids,
    size=min(n_patients, len(patient_ids)),
    replace=False,
)
df_data = df_data[df_data.HADM_ID.isin(subsample_ids)]

# Take only a subset of the columns.
# NOTE(review): feature_names is not defined in the visible code -- presumably
# the concatenation of the vital-sign and lab name lists; confirm.
df_data = df_data[
    ['HADM_ID', 'AGE', 'GENDER', 'EST_DISCHTIME', 'CHARTTIME', 'discharge_status', 'LABEL_48']
    + feature_names
]
# Load the lab/medication pairs and keep only those with more than
# min_no_patients patients and a 'BonferroniPvals' value below min_p_val.
df_meds = (
    pd.read_csv('temp_pairs.csv')
    .loc[
        lambda pairs: (pairs['No. of Patients'] > min_no_patients)
        & (pairs['BonferroniPvals'] < min_p_val)
    ]
)
#rename RBC
# df_meds = df_meds.replace('Red blood cell', 'Red blood cell count')
# hirid_parser.load_med()
# NOTE(review): bare attribute access whose value is discarded -- presumably a
# notebook cell kept only to display the pharma records. hirid_parser is not
# defined in the visible code; the commented-out load_med() call above suggests
# the records may need to be loaded first -- TODO confirm.
hirid_parser.pharma_records_with_name
# Medication name -> list of item ids.
# The keys are looked up by medication name and the ids are matched against the
# ITEMID column of the medication records when selecting events for a lab.
HIRID_MED_MAPPING = {
    "Insulin": [
        1000963, 1000379, 1000381, 1000724, 15,
    ],
    "Vancomycin": [
        189, 331,
    ],
    "KCL": [
        1000398, 1001063, 1000080,
    ],
    "Intravenous blood transfusion of packed cells": [
        1000100, 1000743,
    ],
    "Glucose": [
        1000022, 1000690, 1000689, 1000544,
        1000835, 1000746, 1000060, 1000567,
    ],
    "Magnesium Sulfate": [
        1000421,
    ],
    "Magnesiocard 5 mmol sachet": [
        1000420,
    ],
}
# Empty results table; only the column layout is fixed here.
df_results = pd.DataFrame(
    columns=[
        'lab_name',
        'imputer_type',
        'conversion_scheme',
        'is_prev_val',
        'RMSE',
        'nRMSE',
    ],
)
def extract_med_per_lab(lab_name, df_meds, inputevents_mv, med_mapping=None):
    """Collect the medication events that are paired with one lab measurement.

    Args:
        lab_name: Value matched against the 'Lab Name' column of ``df_meds``.
        df_meds: Table of lab/medication pairs with at least the columns
            'Lab Name' and 'Med Name'.
        inputevents_mv: Medication records with the columns 'ITEMID', 'LABEL',
            'HADM_ID', 'STARTTIME' and 'AMOUNT'. It is not modified.
        med_mapping: Optional dict from medication name to a list of item ids.
            Defaults to the module-level ``HIRID_MED_MAPPING``.

    Returns:
        A tuple ``(df_inputevents, df_med)``. ``df_inputevents`` holds the mean
        'AMOUNT' per ('ITEMID', 'LABEL', 'HADM_ID', 'STARTTIME_rounded'), where
        'STARTTIME_rounded' is the UTC start time rounded to the hour.
        ``df_med`` has one row per (medication, item id) for ``lab_name`` with
        the columns 'Lab Name', 'med_label' and 'ITEMID'.
    """
    if med_mapping is None:
        med_mapping = HIRID_MED_MAPPING

    # Create lab specific list of medications
    df_med = (
        df_meds.loc[df_meds['Lab Name'] == lab_name, ['Lab Name', 'Med Name']]
        .rename(columns={'Med Name': 'med_label'})
    )

    # Attach the item ids of each medication; names absent from the mapping get
    # None and are removed by dropna(), then each id gets its own row.
    df_med['ITEMID'] = df_med['med_label'].map(med_mapping.get)
    df_med = df_med.dropna().explode('ITEMID')

    # Pull the medication events relevant to lab_name. The explicit copy makes
    # sure the column added below is written to an independent frame and never
    # to a slice of the caller's data.
    item_ids = df_med['ITEMID'].tolist()
    df_inputevents = inputevents_mv[inputevents_mv.ITEMID.isin(item_ids)].copy()

    # Round start time to the hour ('h' replaces the deprecated 'H' alias)
    df_inputevents['STARTTIME_rounded'] = (
        pd.to_datetime(df_inputevents['STARTTIME'], utc=True).dt.round(freq='h')
    )

    # Take mean amount of drug per hour
    df_inputevents = (
        df_inputevents
        .groupby(['ITEMID', 'LABEL', 'HADM_ID', 'STARTTIME_rounded'])['AMOUNT']
        .mean()
        .reset_index()
    )
    return df_inputevents, df_med
def add_med_adminstrations_cols(df_inputevents, df_data, df_med):
    """Add one column per medication item holding the amount given at each chart time.

    Args:
        df_inputevents: Medication events with the columns 'ITEMID', 'LABEL',
            'HADM_ID', 'STARTTIME_rounded' and 'AMOUNT'.
        df_data: Measurements with at least 'HADM_ID' and 'CHARTTIME'; rows are
            matched to events on equal admission id and equal timestamp, so
            'CHARTTIME' must be comparable with 'STARTTIME_rounded'.
        df_med: Table whose 'ITEMID' column lists the item ids to add.

    Returns:
        A copy of ``df_data`` sorted by 'HADM_ID' and 'CHARTTIME' (ascending),
        with one extra column per item id that has at least one event. The
        column is named after the first 'LABEL' found for that item id and is
        NaN where no event matches.
    """
    # Temporarily rename the key so it does not collide with the events'
    # HADM_ID column during the merge.
    df_data = df_data.rename(columns={"HADM_ID": "subject_id"})

    # Add drug cols. dict.fromkeys de-duplicates the ids while keeping their
    # order, so a repeated id cannot create two identically named columns.
    for temp_item_id in dict.fromkeys(df_med.ITEMID):
        temp_df_inputevents = df_inputevents[df_inputevents.ITEMID == temp_item_id]
        if temp_df_inputevents.empty:
            # No event for this item id: there is no label to name a column
            # after (the lookup below would raise IndexError), so skip it.
            continue
        med_name = temp_df_inputevents['LABEL'].iloc[0]

        df_data = pd.merge(
            df_data,
            temp_df_inputevents[['HADM_ID', 'STARTTIME_rounded', 'AMOUNT']],
            how='left',
            left_on=['subject_id', 'CHARTTIME'],
            right_on=['HADM_ID', 'STARTTIME_rounded'],
        )
        df_data = df_data.drop(['HADM_ID', 'STARTTIME_rounded'], axis=1)
        df_data = df_data.rename(columns={"AMOUNT": med_name})

    # Restore the key name and sort chart times in ascending order per admission
    df_data = df_data.rename(columns={"subject_id": "HADM_ID"}).sort_values(['HADM_ID', 'CHARTTIME'])
    return df_data.copy()
# Loop over lab measurements
# for lab_name in ['Glucose']:
# The renamed pharma table is the same for every lab, so build it once instead
# of re-creating it on each iteration.
pharma_records = hirid_parser.pharma_records_with_name.rename(columns={"givendose": "AMOUNT"})
res = {}
for lab_name in ['Hematocrit', 'Glucose', 'Lactic acid', 'Magnesium', 'Hemoglobin']:
    # Extract the administrations of the drugs paired with this lab
    df_inputevents, df_med = extract_med_per_lab(lab_name, df_meds, pharma_records)
    # One frame per lab: the measurements plus one column per medication item
    df_lab_data = add_med_adminstrations_cols(df_inputevents, df_data, df_med)
    res[lab_name] = df_lab_data
import pickle as pk
# Use a context manager so the output file is flushed and closed even if
# pickling fails part-way through.
with open(os.path.join(res_path, "hirid_extract_imputed_data.pkl"), "wb") as out_file:
    pk.dump(res, out_file)
# The two bare expressions below only display values in a notebook; their
# results are discarded when this runs as a script.
res.keys()
res["Lactic acid"]
Add a mapping from the HiRID longitudinal columns to the MIMIC-Extract columns.
# Source variable name -> target feature name.
# A value of None presumably marks a variable without a counterpart among the
# target features -- confirm with the author.
#
# NOTE(review): several source variables share one target ('Calcium',
# 'Hemoglobin', 'Diastolic blood pressure', 'Systolic blood pressure'). Going
# by their names, the carboxy-/methemoglobin entries are ratios to total
# hemoglobin and the pulmonary-artery entries are a different pressure site
# than the arterial ones -- confirm that merging them is intended.
hirid_mapping = {
    'Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma': None,
    'Albumin [Mass/volume] in Serum or Plasma': 'Albumin',
    'Amylase [Enzymatic activity/volume] in Serum or Plasma': None,
    'Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma': None,
    'Bicarbonate [Moles/volume] in Arterial blood': 'Bicarbonate',
    'Bilirubin.direct [Mass/volume] in Serum or Plasma': None,
    'Bilirubin.total [Moles/volume] in Serum or Plasma': None,
    'Calcium [Moles/volume] in Blood': 'Calcium',
    'Calcium.ionized [Moles/volume] in Blood': 'Calcium',
    'Carboxyhemoglobin/Hemoglobin.total in Arterial blood': 'Hemoglobin',
    'Chloride [Moles/volume] in Blood': 'Chloride',
    'Core body temperature': 'Temperature',
    'Creatinine [Moles/volume] in Blood': 'Creatinine',
    'Diastolic arterial pressure': 'Diastolic blood pressure',
    'Glucose [Moles/volume] in Serum or Plasma': 'Glucose',
    'Heart rate': 'Heart Rate',
    'Hemoglobin [Mass/volume] in blood': 'Hemoglobin',
    'INR in Blood by Coagulation assay': 'Prothrombin time INR',
    'Lactate [Mass/volume] in blood': 'Lactic acid',
    'Lymphocytes [#/volume] in Blood': 'Lymphocytes',
    'Magnesium [Moles/volume] in Blood': 'Magnesium',
    'Methemoglobin/Hemoglobin.total in Arterial blood': 'Hemoglobin',
    'Neutrophils/100 leukocytes in Blood': 'Neutrophils',
    'Peripheral oxygen saturation': 'Oxygen saturation',
    'Platelets [#/volume] in Blood': 'Platelets',
    'Potassium [Moles/volume] in Blood': 'Potassium',
    'Pulmonary artery diastolic pressure': 'Diastolic blood pressure',
    'Pulmonary artery systolic pressure': 'Systolic blood pressure',
    'Respiratory rate': 'Respiratory rate',
    'Sodium [Moles/volume] in Blood': 'Sodium',
    'Systolic arterial pressure': 'Systolic blood pressure',
}
temp_funcs.txt