dancoster / DrugLab

Repository for the drug<>lab pair
1 stars 0 forks source link

Merge - HiRiD: Add 'extract_med_per_lab' and 'add_med_adminstrations_cols' functions. #39

Open dancoster opened 1 year ago

dancoster commented 1 year ago

temp_funcs.txt

dancoster commented 1 year ago

tempfuncs@.txt

dancoster commented 1 year ago

Use the table of significant pairs as input (=df_meds).

PavanReddy28 commented 1 year ago
import numpy as np
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Names of the measurement columns, grouped into four lists.
vital_signs = [
    'Heart Rate',
    'Respiratory rate',
    'Oxygen saturation',
    'Systolic blood pressure',
    'Diastolic blood pressure',
    'Temperature',
]
labs_bmp = [
    'Glucose',
    'Potassium',
    'Sodium',
    'Chloride',
    'Creatinine',
    'Blood urea nitrogen',
    'Bicarbonate',
    'Calcium',
    'Albumin',
    'Lactate dehydrogenase',
    'Magnesium',
    'Lactic acid',
]
labs_cbc = [
    'Hematocrit',
    'Hemoglobin',
    'Platelets',
    'White blood cell count',
    'Red blood cell count',
    'Mean corpuscular volume',
    'Lymphocytes',
    'Neutrophils',
]
# NOTE(review): the variable name is presumably a misspelling of "coagulation";
# it is kept unchanged so existing references keep working.
labs_cauglation = ['Prothrombin time INR']

## Load files
# NOTE(review): this joins res_path with a bare ".csv" file name and the result
# is not used in the visible code -- confirm whether it is still needed.
data_path =  os.path.join(f"{res_path}", ".csv")
import numpy as np

# Params
n_patients = 25000      # maximum number of distinct HADM_ID values to keep
min_p_val = 1e-10       # pairs must have a Bonferroni p-value below this
min_no_patients = 100   # pairs must have more than this many patients
horizon = '4h'

# Load dataset
df_data = pd.read_csv(os.path.join(res_path, "hirid_extract_with_labels_48_all_parts.csv"))
df_data['CHARTTIME'] = pd.to_datetime(df_data['CHARTTIME'], utc=True)

# Take a random subset of at most n_patients distinct admissions.
# replace=False is required: the default samples WITH replacement, which yields
# fewer than n_patients distinct ids. The size is clamped because sampling
# without replacement raises if more ids are requested than exist.
patient_ids = df_data.HADM_ID.unique()
subsample_ids = np.random.choice(
    patient_ids,
    size=min(n_patients, len(patient_ids)),
    replace=False,
)
df_data = df_data[df_data.HADM_ID.isin(subsample_ids)]

# Take only a subset of the columns
df_data = df_data[['HADM_ID', 'AGE', 'GENDER', 'EST_DISCHTIME','CHARTTIME', 'discharge_status', 'LABEL_48']+feature_names]

# Keep only pairs with more than min_no_patients patients and a significant
# difference (Bonferroni p-value below min_p_val).
df_meds = pd.read_csv('temp_pairs.csv')
df_meds = df_meds[df_meds['No. of Patients'] > min_no_patients]
df_meds = df_meds[(df_meds['BonferroniPvals'] < min_p_val)]

#rename RBC
# df_meds = df_meds.replace('Red blood cell', 'Red blood cell count')
# hirid_parser.load_med()
# Bare expression: only displays the table when run as a notebook cell.
hirid_parser.pharma_records_with_name

# Medication name -> list of ITEMID values.
# extract_med_per_lab() looks medication names up in this table and matches the
# resulting ids against the ITEMID column of the administration records.
# NOTE(review): the ids are presumably HiRID pharma ids -- confirm against the
# HiRID reference tables.
HIRID_MED_MAPPING = {
    "Insulin": [
        1000963, 1000379, 1000381, 1000724, 15,
    ],
    "Vancomycin": [
        189, 331,
    ],
    "KCL": [
        1000398, 1001063, 1000080,
    ],
    "Intravenous blood transfusion of packed cells": [
        1000100, 1000743,
    ],
    "Glucose": [
        1000022, 1000690, 1000689, 1000544,
        1000835, 1000746, 1000060, 1000567,
    ],
    "Magnesium Sulfate": [
        1000421,
    ],
    "Magnesiocard 5 mmol sachet": [
        1000420,
    ],
}
# Empty results table; only the column layout is defined here.
df_results = pd.DataFrame(
    columns=[
        'lab_name',
        'imputer_type',
        'conversion_scheme',
        'is_prev_val',
        'RMSE',
        'nRMSE',
    ],
)

def extract_med_per_lab(lab_name, df_meds, inputevents_mv, med_mapping=None):
    """Collect the hourly mean administrations of the drugs paired with a lab.

    Args:
        lab_name: Value matched against the 'Lab Name' column of ``df_meds``.
        df_meds: DataFrame of lab/drug pairs; must contain the columns
            'Lab Name' and 'Med Name'.
        inputevents_mv: DataFrame of drug administrations; must contain the
            columns 'ITEMID', 'LABEL', 'HADM_ID', 'STARTTIME' and 'AMOUNT'.
            It is not modified.
        med_mapping: Optional dict mapping a medication name to a list of
            ITEMID values. Defaults to the module-level ``HIRID_MED_MAPPING``.

    Returns:
        A tuple ``(df_inputevents, df_med)``:

        * ``df_inputevents`` has one row per
          (ITEMID, LABEL, HADM_ID, STARTTIME_rounded) holding the mean AMOUNT,
          where STARTTIME_rounded is STARTTIME parsed as UTC and rounded to
          the nearest hour.
        * ``df_med`` has the columns 'Lab Name', 'med_label' and 'ITEMID',
          one row per mapped item id; medications without a mapping entry are
          dropped.
    """
    if med_mapping is None:
        med_mapping = HIRID_MED_MAPPING

    # Create the lab-specific list of medications.
    df_med = (
        df_meds.loc[df_meds['Lab Name'] == lab_name, ['Lab Name', 'Med Name']]
        .rename(columns={'Med Name': 'med_label'})
    )

    # Attach the item ids of every medication; unmapped names become None and
    # are removed by dropna() before each id list is expanded to one row per id.
    df_med["ITEMID"] = df_med["med_label"].map(med_mapping.get)
    df_med = df_med.dropna().explode("ITEMID")

    # Pull the administrations relevant to lab_name. The explicit copy makes
    # the column assignment below act on an independent frame instead of on a
    # slice of the caller's data.
    df_inputevents = inputevents_mv[inputevents_mv.ITEMID.isin(df_med.ITEMID)].copy()

    # Round the start time to the hour ('h' replaces the deprecated 'H' alias).
    df_inputevents['STARTTIME_rounded'] = pd.to_datetime(
        df_inputevents['STARTTIME'], utc=True
    ).dt.round(freq='h')

    # Take the mean amount of each drug per admission and hour.
    df_inputevents = (
        df_inputevents
        .groupby(['ITEMID', 'LABEL', 'HADM_ID', 'STARTTIME_rounded'])['AMOUNT']
        .mean()
        .reset_index()
    )

    return df_inputevents, df_med

def add_med_adminstrations_cols(df_inputevents, df_data, df_med):
    """Add one column of administered amounts per medication item to df_data.

    Args:
        df_inputevents: DataFrame with the columns 'ITEMID', 'LABEL',
            'HADM_ID', 'STARTTIME_rounded' and 'AMOUNT' (the first value
            returned by ``extract_med_per_lab``).
        df_data: DataFrame with at least the columns 'HADM_ID' and
            'CHARTTIME'. It is not modified.
        df_med: DataFrame whose 'ITEMID' column lists the items to add.

    Returns:
        A new DataFrame sorted by HADM_ID and CHARTTIME (ascending) with, for
        each item id that has at least one row in ``df_inputevents``, a column
        named after that item's first LABEL. The column holds AMOUNT where
        (HADM_ID, CHARTTIME) equals (HADM_ID, STARTTIME_rounded) and NaN
        elsewhere.

    Item ids without any administration are skipped: there is no LABEL to name
    their column, and looking one up used to raise IndexError. Repeated item
    ids are processed once, so no duplicate columns are produced.
    """
    # Rename the key so it does not collide with the HADM_ID column of
    # df_inputevents during the merge.
    df_data = df_data.rename(columns={"HADM_ID": "subject_id"})

    # Add drug cols
    for temp_item_id in df_med.ITEMID.drop_duplicates():
        temp_df_inputevents = df_inputevents[df_inputevents.ITEMID == temp_item_id].reset_index()
        if temp_df_inputevents.empty:
            continue
        # NOTE(review): if one ITEMID carries several LABEL values, only the
        # first is used as the column name -- confirm labels are unique per id.
        med_name = temp_df_inputevents['LABEL'].iloc[0]

        df_data = pd.merge(
            df_data,
            temp_df_inputevents,
            how='left',
            left_on=['subject_id', 'CHARTTIME'],
            right_on=['HADM_ID', 'STARTTIME_rounded'],
        )
        # Keep only the AMOUNT column of the merged-in events.
        df_data = df_data.drop(['HADM_ID', 'STARTTIME_rounded', 'index', 'ITEMID', 'LABEL'], axis=1)
        df_data = df_data.rename(columns={"AMOUNT": med_name})

    # Restore the key name and sort by charttime (ascending) per admission.
    df_data = df_data.rename(columns={"subject_id": "HADM_ID"}).sort_values(['HADM_ID', 'CHARTTIME'])

    return df_data.copy()

# Loop over lab measurements and build one table per lab, keyed by lab name.
# for lab_name in ['Glucose']:
res = {}
# The renamed administration table does not depend on the lab, so build it once
# instead of on every iteration.
_pharma_records = hirid_parser.pharma_records_with_name.rename(columns={"givendose":"AMOUNT"})
for lab_name in ['Hematocrit', 'Glucose', 'Lactic acid', 'Magnesium', 'Hemoglobin']:
    # Extract administrations of the drugs paired with this lab
    df_inputevents,df_med = extract_med_per_lab(lab_name,df_meds,_pharma_records)
    df_lab_data = add_med_adminstrations_cols(df_inputevents,df_data,df_med)
    res[lab_name] = df_lab_data
import pickle as pk
# Use a context manager so the output file is flushed and closed even if
# pickling fails part-way.
with open(os.path.join(res_path, "hirid_extract_imputed_data.pkl"), "wb") as out_file:
    pk.dump(res, out_file)
# Bare expressions: these only display output when run as notebook cells.
res.keys()
res["Lactic acid"]
dancoster commented 1 year ago

Add a mapping between the HiRID longitudinal columns and the MIMIC-Extract columns.

PavanReddy28 commented 1 year ago
# Maps a HiRID longitudinal variable name to a target column name.
# A value of None means no target column is assigned (presumably because there
# is no counterpart -- confirm). Several HiRID variables map to the same
# target name.
hirid_mapping = {
    'Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma': None,
    'Albumin [Mass/volume] in Serum or Plasma': 'Albumin',
    'Amylase [Enzymatic activity/volume] in Serum or Plasma': None,
    'Aspartate aminotransferase [Enzymatic activity/volume] in Serum or Plasma': None,
    'Bicarbonate [Moles/volume] in Arterial blood': 'Bicarbonate',
    'Bilirubin.direct [Mass/volume] in Serum or Plasma': None,
    'Bilirubin.total [Moles/volume] in Serum or Plasma': None,
    'Calcium [Moles/volume] in Blood': 'Calcium',
    'Calcium.ionized [Moles/volume] in Blood': 'Calcium',
    # NOTE(review): this is a ratio to total hemoglobin, yet it shares the
    # 'Hemoglobin' target with the mass/volume measurement -- confirm intended.
    'Carboxyhemoglobin/Hemoglobin.total in Arterial blood': 'Hemoglobin',
    'Chloride [Moles/volume] in Blood': 'Chloride',
    'Core body temperature': 'Temperature',
    'Creatinine [Moles/volume] in Blood': 'Creatinine',
    'Diastolic arterial pressure': 'Diastolic blood pressure',
    'Glucose [Moles/volume] in Serum or Plasma': 'Glucose',
    'Heart rate': 'Heart Rate',
    'Hemoglobin [Mass/volume] in blood': 'Hemoglobin',
    'INR in Blood by Coagulation assay': 'Prothrombin time INR',
    'Lactate [Mass/volume] in blood': 'Lactic acid',
    'Lymphocytes [#/volume] in Blood': 'Lymphocytes',
    'Magnesium [Moles/volume] in Blood': 'Magnesium',
    # NOTE(review): ratio to total hemoglobin, same concern as above.
    'Methemoglobin/Hemoglobin.total in Arterial blood': 'Hemoglobin',
    'Neutrophils/100 leukocytes in Blood': 'Neutrophils',
    'Peripheral oxygen saturation': 'Oxygen saturation',
    'Platelets [#/volume] in Blood': 'Platelets',
    'Potassium [Moles/volume] in Blood': 'Potassium',
    # NOTE(review): pulmonary artery pressures share their targets with the
    # arterial pressures below/above -- confirm this merge is intended.
    'Pulmonary artery diastolic pressure': 'Diastolic blood pressure',
    'Pulmonary artery systolic pressure': 'Systolic blood pressure',
    'Respiratory rate': 'Respiratory rate',
    'Sodium [Moles/volume] in Blood': 'Sodium',
    'Systolic arterial pressure': 'Systolic blood pressure',
}