Closed Aylr closed 6 years ago
def random_date(rng=None):
    """Sketchy random date generator.

    Returns a random ``datetime.date`` in 2016-2017.

    ``np.random.randint`` has an *exclusive* upper bound, so the bounds are
    13 / 28 to actually cover months 1-12 and days 1-27 (27 keeps February
    safe).  The original used (1, 12) / (1, 27), which could never produce
    December or a day past the 26th.

    :param rng: optional ``np.random.RandomState`` for reproducibility;
        falls back to the global ``np.random`` when omitted (the original
        behavior).
    """
    rng = np.random if rng is None else rng
    return datetime.date(
        rng.randint(2016, 2018),
        rng.randint(1, 13),
        rng.randint(1, 28))


def fake_data(claims=10, n_patients=200):
    """Build a fake opioid-claims dataframe with ``claims`` rows.

    The seeded ``RandomState`` is now also threaded into ``random_date`` so
    the whole frame is reproducible (previously the dates came from the
    unseeded global generator).
    """
    rng = np.random.RandomState(0)
    df = pd.DataFrame({
        'patient_id': rng.choice(range(n_patients), size=claims, replace=True),
        'claim_date': pd.to_datetime([random_date(rng) for _ in range(claims)]),
        'drug': rng.choice(['oxycodone', 'morphine'], size=claims, replace=True),
        'qty': rng.choice([10, 30, 60, 90, 120], size=claims, replace=True),
        'strength': rng.choice([5, 10, 20, 30], size=claims, replace=True),
        'doses_per_day': rng.choice([1, 2, 3, 4], size=claims, replace=True),
        'overdose': rng.choice(['Y', 'N'], size=claims, replace=True, p=[0.1, 0.9]),
    })
    df['days_supply'] = df.qty / df.doses_per_day
    df['mg_total'] = df.qty * df.strength
    # Morphine-milligram-equivalent conversion factors.
    # NOTE(review): the 'hydrocodone' branch is dead with the drug list
    # above (only oxycodone/morphine are generated); kept in case more
    # drugs are added later.
    df['mme_total'] = np.where(
        df.drug == 'oxycodone', 1.7 * df.mg_total,
        np.where(df.drug == 'hydrocodone', 1.2 * df.mg_total, df.mg_total))
    df['mme_per_day'] = df.mme_total / df.days_supply
    return df[['patient_id', 'claim_date', 'drug', 'strength', 'qty',
               'doses_per_day', 'days_supply', 'mg_total', 'mme_total',
               'mme_per_day', 'overdose']]


df = fake_data(20000, 2000)
df.head()


def index_on_date_column(df, date_col):
    """Return a copy of ``df`` indexed on ``date_col``, newest first.

    The date column is intentionally kept as a regular column as well.
    """
    result = df.copy()
    result.index = result[date_col]
    return result.sort_index(ascending=False)


def calculate_claim_date_limits(
        df,
        patient_grain_col='patient_id',
        date_col='claim_date',
        verbose=False):
    """Calculate first and last claims for each patient.

    Adds ``first_<date_col>``, ``last_<date_col>`` and ``days_elapsed``
    (a Timedelta) columns to a copy of ``df``.
    """
    temp = df.copy()
    patient_ranges = temp.groupby(by=[patient_grain_col]).agg(
        {date_col: ['min', 'max']})
    first_label = 'first_{}'.format(date_col)
    last_label = 'last_{}'.format(date_col)
    # Map each claim row to its patient's overall first/last claim dates.
    temp[first_label] = temp[patient_grain_col].map(
        patient_ranges[date_col, 'min'])
    temp[last_label] = temp[patient_grain_col].map(
        patient_ranges[date_col, 'max'])
    temp['days_elapsed'] = temp[last_label] - temp[first_label]
    if verbose:
        print(patient_ranges)
    return temp


def reindex_on_last_claim_date(df, last_claim_col):
    """From time-indexed dataframe, make times relative.

    The new index is "time before the patient's last claim" (a Timedelta),
    so every patient's history ends at zero.
    """
    result = df.copy()
    result.index = result[last_claim_col] - result.index
    return result


def upsample(df, resolution='1D', patient_grain_col='patient_id',
             drug_col='drug'):
    """Upsample a dataframe to ``resolution`` (default 1 day) increments.

    Resamples within each (patient, drug) group, averaging numeric columns.
    """
    result = df.groupby(by=[patient_grain_col, drug_col]).resample(
        resolution).mean()
    # resample can re-materialize the grain as a column; drop the duplicate.
    if patient_grain_col in result.columns:
        result.drop(patient_grain_col, axis='columns', inplace=True)
    return result


def downsample(df, resolution='180D'):
    """Downsample a dataframe to ``resolution`` (default 180 day) bins.

    Sums the upsampled daily values within each bin, preserving the
    (patient, drug) grouping levels, and names the time level 'time'.
    """
    binned = df.unstack(level=[0, 1]).resample(resolution).sum().stack(
        level=[1, 2])
    result = binned.reset_index(level=[1, 2])
    result.index.rename('time', inplace=True)
    return result


def group_by_patient_drug_time(df, patient_grain_col='patient_id',
                               drug_col='drug'):
    """Pivot to one row per patient with (measure, drug, time) columns."""
    grouped = df.groupby([patient_grain_col, drug_col, 'time']).sum()
    unstacked = grouped.unstack(level=1).unstack()
    return unstacked


def concatenate_hierarchical_name(things):
    """Flatten out a hierarchical multiindex column name.

    Timedelta levels are rendered as their day count; everything else is
    joined verbatim with underscores.
    """
    tokens = []
    for x in things:
        # isinstance instead of type(...) == ... (idiomatic, and also
        # accepts Timedelta subclasses).
        if isinstance(x, pd.Timedelta):
            tokens.append(str(x.days))
        else:
            tokens.append(x)
    return '_'.join(tokens)


def flatten_hierarchical_column_names(df):
    """Flatten out a dataframe's hierarchical multiindex column names."""
    new_names = [concatenate_hierarchical_name(x) for x in df.columns.ravel()]
    print('flattened out {} new names'.format(len(new_names)))
    result = df.copy()
    result.columns = new_names
    return result


def debug_displayer(df, name, verbose):
    """Simple helper to show progress in pipeline.

    verbose=0 silent, verbose=1 row counts, verbose=2 rich preview.
    """
    if verbose == 0:
        return
    elif verbose == 1:
        print(
            'generated {} dataframe that contains {} rows'.format(
                name, len(df)))
    elif verbose == 2:
        try:
            display(df.head())  # IPython rich display when in a notebook
        except NameError:
            # Fix: `display` only exists in IPython; fall back to print so
            # the pipeline also runs from a plain interpreter.
            print(df.head())


def build_target_lookup(df, patient_grain_col='patient_id',
                        target_col='overdose'):
    """Build a target lookup dataframe from the original claims dataframe.

    One row per patient; 'Y' wins over 'N' via max() so any overdose claim
    marks the patient.
    """
    targets = df[[patient_grain_col, target_col]].reset_index(drop=True)
    return targets.groupby(patient_grain_col).max()


def map_target(
        features_df,
        raw_claims_df,
        patient_grain_col='patient_id',
        target_col='overdose'):
    """Join the target column to the features dataframe using the raw
    claims dataframe."""
    targets = build_target_lookup(raw_claims_df, patient_grain_col,
                                  target_col)
    temp = features_df.copy()
    # Surface the index as a column so .map() can do the lookup.
    temp[patient_grain_col] = temp.index.values
    temp[target_col] = temp[patient_grain_col].map(targets[target_col])
    temp.drop(patient_grain_col, axis='columns', inplace=True)
    return temp


def preprocessing_pipeline(df, patient_grain_col, date_col, target_col,
                           drug_col, verbose=0):
    """Run the full claims -> features pipeline.

    Steps: date-index -> per-patient claim limits -> relative time index ->
    daily upsample -> 180-day downsample -> pivot wide -> flatten column
    names -> attach target.  Missing bins are filled with 0.
    """
    step0 = index_on_date_column(df, date_col)
    debug_displayer(step0, 'step 0', verbose=verbose)

    step1 = calculate_claim_date_limits(
        step0, patient_grain_col=patient_grain_col, date_col=date_col)
    debug_displayer(step1, 'step 1', verbose=verbose)

    last_date_key = 'last_{}'.format(date_col)
    step2 = reindex_on_last_claim_date(step1, last_date_key)
    debug_displayer(step2, 'step 2', verbose=verbose)

    upsampled = upsample(step2, resolution='1D',
                         patient_grain_col=patient_grain_col,
                         drug_col=drug_col)
    debug_displayer(upsampled, 'upsampled', verbose=verbose)

    downsampled = downsample(upsampled, resolution='180D')
    debug_displayer(downsampled, 'downsampled', verbose=verbose)

    unstacked = group_by_patient_drug_time(
        downsampled, patient_grain_col=patient_grain_col, drug_col=drug_col)
    debug_displayer(unstacked, 'unstacked', verbose=verbose)

    flat = flatten_hierarchical_column_names(unstacked)
    debug_displayer(flat, 'flat', verbose=verbose)

    final = map_target(flat, df, patient_grain_col=patient_grain_col,
                       target_col=target_col)
    final.fillna(0, inplace=True)
    debug_displayer(final, 'final', verbose=verbose)
    return final


clean = preprocessing_pipeline(
    df,
    patient_grain_col='patient_id',
    date_col='claim_date',
    target_col='overdose',
    drug_col='drug',
    verbose=2)
# Fit a supervised classification model on the engineered feature frame;
# 'overdose' is the label column.
trainer = healthcareai.SupervisedModelTrainer(
    clean, 'overdose', 'classification', verbose=False)

# Train the random forest variant and keep the fitted model.
rf = trainer.random_forest()
# Move the patient grain out of the index into an ordinary column, then
# score the frame with the trained random forest.
new_data = clean.reset_index()
new_data.head()
rf.make_predictions(new_data)
# Inspect the trained model's grain column.  NOTE(review): judging by the
# traceback below ("cannot label index with a null key" raised from
# dataframe[self.grain_column]), this appears to be None — confirm.
rf.grain_column
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-32-b3fc7c3fdb9b> in <module>() ----> 1 rf.make_predictions(new_data) ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/healthcareai/trained_models/trained_supervised_model.py in make_predictions(self, dataframe) 183 184 result = pd.DataFrame({ --> 185 self.grain_column: dataframe[self.grain_column].values, 186 'Prediction': None, 187 'Probability': None, ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key) 2137 return self._getitem_multilevel(key) 2138 else: -> 2139 return self._getitem_column(key) 2140 2141 def _getitem_column(self, key): ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key) 2144 # get column 2145 if self.columns.is_unique: -> 2146 return self._get_item_cache(key) 2147 2148 # duplicate columns & possible reduce dimensionality ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item) 1840 res = cache.get(item) 1841 if res is None: -> 1842 values = self._data.get(item) 1843 res = self._box_item_values(item, values) 1844 cache[item] = res ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath) 3850 loc = indexer.item() 3851 else: -> 3852 raise ValueError("cannot label index with a null key") 3853 3854 return self.iget(loc, fastpath=fastpath) ValueError: cannot label index with a null key
Closing, dupe of #460
STR (steps to reproduce)
# Step to reproduce: evaluate the model's grain column attribute.
rf.grain_column
Stacktrace