HealthCatalyst / healthcareai-py

Python tools for healthcare machine learning
http://healthcare.ai
MIT License
309 stars 186 forks source link

Prediction fails if no grain column existed on training #470

Closed Aylr closed 6 years ago

Aylr commented 6 years ago

Steps to reproduce (STR)

  1. Make fake data
def random_date():
    """Return a pseudo-random ``datetime.date`` for fake claims data.

    Values are drawn from NumPy's global random state (``np.random``), so
    the output is only reproducible if the caller seeds ``np.random`` first.

    ``np.random.randint`` excludes its upper bound, so the ranges drawn are
    year 2016-2017, month 1-12 and day 1-26. Keeping the day below 29 means
    every (year, month, day) combination is a valid calendar date.
    """
    year = np.random.randint(2016, 2018)
    # The upper bound is exclusive: 13 is needed for December to be drawn.
    month = np.random.randint(1, 13)
    day = np.random.randint(1, 27)
    return datetime.date(year, month, day)

def fake_data(claims=10, n_patients=200):
    """Build a synthetic opioid-claims dataframe with ``claims`` rows.

    Categorical and numeric columns are sampled from a ``RandomState``
    seeded with 0. Claim dates come from ``random_date()``, which uses the
    global ``np.random`` state instead, so dates are not pinned by that seed.

    Args:
        claims: number of claim rows to generate.
        n_patients: patient ids are sampled (with replacement) from
            ``range(n_patients)``.

    Returns:
        A dataframe with the raw columns plus the derived ``days_supply``,
        ``mg_total``, ``mme_total`` and ``mme_per_day`` columns.
    """
    rng = np.random.RandomState(0)

    def sample(options, **kwargs):
        # One draw per claim, with replacement.
        return rng.choice(options, size=claims, replace=True, **kwargs)

    # The dict entries are evaluated top to bottom; keep this order so the
    # seeded generator hands out the same values to the same columns.
    columns = {
        'patient_id': sample(range(n_patients)),
        'claim_date': pd.to_datetime([random_date() for _ in range(claims)]),
        'drug': sample(['oxycodone', 'morphine']),
        'qty': sample([10, 30, 60, 90, 120]),
        'strength': sample([5, 10, 20, 30]),
        'doses_per_day': sample([1, 2, 3, 4]),
        'overdose': sample(['Y', 'N'], p=[0.1, 0.9]),
    }
    frame = pd.DataFrame(columns)

    frame['days_supply'] = frame['qty'] / frame['doses_per_day']
    frame['mg_total'] = frame['qty'] * frame['strength']

    # Morphine-equivalent conversion: 1.7x for oxycodone, 1.2x for
    # hydrocodone, 1x otherwise. Only oxycodone and morphine are sampled
    # above, so the hydrocodone factor never applies to this data.
    milligrams = frame['mg_total']
    not_oxycodone = np.where(frame['drug'] == 'hydrocodone',
                             1.2 * milligrams, milligrams)
    frame['mme_total'] = np.where(frame['drug'] == 'oxycodone',
                                  1.7 * milligrams, not_oxycodone)
    frame['mme_per_day'] = frame['mme_total'] / frame['days_supply']

    ordered_columns = [
        'patient_id', 'claim_date', 'drug', 'strength', 'qty',
        'doses_per_day', 'days_supply', 'mg_total', 'mme_total',
        'mme_per_day', 'overdose',
    ]
    return frame[ordered_columns]

# Generate 20,000 fake claims with patient ids sampled from range(2000),
# then preview the first rows.
df = fake_data(20000, 2000)
df.head()

def index_on_date_column(df, date_col):
    """Return a copy of ``df`` indexed by ``date_col``, newest first.

    The date column is kept as a regular column as well as becoming the
    index. The input dataframe is left untouched.
    """
    by_date = df.set_index(date_col, drop=False)
    return by_date.sort_index(ascending=False)

def calculate_claim_date_limits(
        df,
        patient_grain_col='patient_id',
        date_col='claim_date',
        verbose=False):
    """Annotate every claim with its patient's first and last claim dates.

    Adds three columns to a copy of ``df``: ``first_<date_col>``,
    ``last_<date_col>`` and ``days_elapsed`` (last minus first).

    Args:
        df: claims dataframe containing the patient and date columns.
        patient_grain_col: column identifying the patient.
        date_col: column holding the claim date.
        verbose: when truthy, print the per-patient min/max table.

    Returns:
        The annotated copy; ``df`` itself is not modified.
    """
    annotated = df.copy()

    # Per-patient (min, max) of the date column, with two-level columns.
    patient_ranges = annotated.groupby(by=[patient_grain_col]).agg(
        {date_col: ['min', 'max']})

    first_label = 'first_{}'.format(date_col)
    last_label = 'last_{}'.format(date_col)

    patient_keys = annotated[patient_grain_col]
    for label, statistic in ((first_label, 'min'), (last_label, 'max')):
        # Broadcast each patient's statistic back onto that patient's claims.
        annotated[label] = patient_keys.map(patient_ranges[date_col, statistic])

    annotated['days_elapsed'] = annotated[last_label] - annotated[first_label]

    if verbose:
        print(patient_ranges)

    return annotated

def reindex_on_last_claim_date(df, last_claim_col):
    """Replace the index with ``df[last_claim_col] - df.index``.

    For a date-indexed frame this turns each row's index into the time
    between that row and the value in ``last_claim_col``. A copy is
    returned; ``df`` is not modified.
    """
    relative = df.copy()
    time_before_last = relative[last_claim_col] - relative.index
    relative.index = time_before_last
    return relative

def upsample(df, resolution='1D', patient_grain_col='patient_id',
             drug_col='drug'):
    """Resample each (patient, drug) group to ``resolution`` using the mean.

    Args:
        df: time-indexed dataframe containing the patient and drug columns.
        resolution: resampling frequency string (default one day).
        patient_grain_col: column identifying the patient.
        drug_col: column identifying the drug.

    Returns:
        The resampled frame. If the patient column is present among the
        result's columns it is removed.
    """
    per_patient_drug = df.groupby(by=[patient_grain_col, drug_col])
    resampled = per_patient_drug.resample(resolution).mean()

    if patient_grain_col not in resampled.columns:
        return resampled

    # Averaging a patient id is meaningless, so the column is discarded.
    return resampled.drop(patient_grain_col, axis='columns')

def downsample(df, resolution='180D'):
    """Sum a three-level-indexed frame into ``resolution``-wide time bins.

    The first two index levels are moved into the columns so the remaining
    level can be resampled, the bins are summed, and the two levels are then
    stacked back and turned into ordinary columns.

    Args:
        df: dataframe whose index has the time level last (as produced by
            ``upsample``).
        resolution: bin width as a frequency string (default 180 days).

    Returns:
        A dataframe indexed by the bin label, with the index named ``time``.
    """
    # Pivot index levels 0 and 1 into the columns, leaving time as the index.
    wide = df.unstack(level=[0, 1])
    summed = wide.resample(resolution).sum()

    # Column levels 1 and 2 are the two levels pivoted out above.
    long_form = summed.stack(level=[1, 2])
    binned = long_form.reset_index(level=[1, 2])
    binned.index.name = 'time'
    return binned

def group_by_patient_drug_time(df, patient_grain_col='patient_id',
                               drug_col='drug'):
    """Pivot summed values into one row per patient.

    Rows are summed per (patient, drug, ``time``) and then the drug and
    time keys are moved into the columns, giving three-level column labels
    of (value column, drug, time).
    """
    keys = [patient_grain_col, drug_col, 'time']
    totals = df.groupby(keys).sum()

    # Level 1 is the drug; the second unstack moves the remaining last
    # level (time) into the columns as well.
    return totals.unstack(level=1).unstack()

def concatenate_hierarchical_name(things):
    """Flatten one hierarchical (multiindex) column label into a string.

    Args:
        things: iterable of label parts, e.g. ``('qty', 'morphine',
            Timedelta('180 days'))``.

    Returns:
        The parts joined with underscores. ``pd.Timedelta`` parts are
        rendered as their whole number of days; every other part is passed
        through ``str`` so non-string parts (e.g. ints) do not break the
        join.
    """
    tokens = [
        str(part.days) if isinstance(part, pd.Timedelta) else str(part)
        for part in things
    ]
    return '_'.join(tokens)

def flatten_hierarchical_column_names(df):
    """Return a copy of ``df`` whose column labels are flat strings.

    Each multiindex column label (a tuple) is collapsed with
    ``concatenate_hierarchical_name``. A label that is not a tuple is
    treated as a one-part label, so a plain string name is kept whole
    instead of being split into its characters.

    Prints how many names were produced. ``df`` is not modified.
    """
    # Iterating the column index yields one tuple per multiindex column,
    # so the deprecated Index.ravel() call is not needed.
    new_names = [
        concatenate_hierarchical_name(
            label if isinstance(label, tuple) else (label,))
        for label in df.columns
    ]

    print('flattened out {} new names'.format(len(new_names)))
    result = df.copy()
    result.columns = new_names

    return result

def debug_displayer(df, name, verbose):
    """Report progress for one pipeline stage according to ``verbose``.

    Args:
        df: the stage's output.
        name: label used in the printed message.
        verbose: 0 is silent, 1 prints the row count, 2 calls ``display``
            on ``df.head()``. Any other value does nothing.
    """
    if verbose == 0:
        return None

    if verbose == 1:
        message = 'generated {} dataframe that contains {} rows'.format(
            name, len(df))
        print(message)
        return None

    if verbose == 2:
        # `display` is not defined in this snippet; presumably the
        # IPython/Jupyter builtin (the trace below is from an IPython session).
        display(df.head())

    return None

def build_target_lookup(df, patient_grain_col='patient_id',
                        target_col='overdose'):
    """Return one target value per patient from the raw claims.

    The result is indexed by patient and has a single ``target_col`` column
    holding the maximum target value seen across that patient's claims.
    """
    wanted = [patient_grain_col, target_col]
    # Discard the incoming index so only the two selected columns take part
    # in the grouping.
    pairs = df[wanted].reset_index(drop=True)
    per_patient = pairs.groupby(patient_grain_col)
    return per_patient.max()

def map_target(
        features_df,
        raw_claims_df,
        patient_grain_col='patient_id',
        target_col='overdose'):
    """Attach the per-patient target to a features dataframe.

    Args:
        features_df: features whose index values identify the patient.
        raw_claims_df: original claims, used to build the target lookup.
        patient_grain_col: column identifying the patient in the claims.
        target_col: name of the target column to add.

    Returns:
        A copy of ``features_df`` with ``target_col`` added; index values
        with no entry in the lookup get a missing value.
    """
    lookup = build_target_lookup(
        raw_claims_df, patient_grain_col, target_col)[target_col]

    labelled = features_df.copy()
    # Expose the index as a scratch column so it can be mapped through the
    # lookup; the scratch column is dropped again before returning.
    labelled[patient_grain_col] = labelled.index.values
    labelled[target_col] = labelled[patient_grain_col].map(lookup)
    return labelled.drop(patient_grain_col, axis='columns')

def preprocessing_pipeline(df, patient_grain_col, date_col, target_col,
                           drug_col, verbose=0):
    """Turn raw claims into one feature row per patient, plus the target.

    Stages, in order: index on the claim date, add each patient's first and
    last claim dates, re-index relative to the last claim, resample to 1-day
    means, sum into 180-day bins, pivot to one row per patient, flatten the
    column names, attach the target, and fill missing values with 0.

    Args:
        df: raw claims dataframe.
        patient_grain_col: column identifying the patient.
        date_col: column holding the claim date.
        target_col: column holding the target.
        drug_col: column identifying the drug.
        verbose: passed to ``debug_displayer`` after every stage.

    Returns:
        The final features dataframe including ``target_col``.
    """
    date_indexed = index_on_date_column(df, date_col)
    debug_displayer(date_indexed, 'step 0', verbose=verbose)

    with_limits = calculate_claim_date_limits(
        date_indexed,
        patient_grain_col=patient_grain_col,
        date_col=date_col)
    debug_displayer(with_limits, 'step 1', verbose=verbose)

    # Name of the column added by calculate_claim_date_limits.
    last_date_key = 'last_{}'.format(date_col)
    relative = reindex_on_last_claim_date(with_limits, last_date_key)
    debug_displayer(relative, 'step 2', verbose=verbose)

    daily = upsample(
        relative,
        resolution='1D',
        patient_grain_col=patient_grain_col,
        drug_col=drug_col)
    debug_displayer(daily, 'upsampled', verbose=verbose)

    binned = downsample(daily, resolution='180D')
    debug_displayer(binned, 'downsampled', verbose=verbose)

    wide = group_by_patient_drug_time(
        binned,
        patient_grain_col=patient_grain_col,
        drug_col=drug_col)
    debug_displayer(wide, 'unstacked', verbose=verbose)

    flattened = flatten_hierarchical_column_names(wide)
    debug_displayer(flattened, 'flat', verbose=verbose)

    # The target is looked up from the raw claims, not an intermediate frame.
    labelled = map_target(
        flattened,
        df,
        patient_grain_col=patient_grain_col,
        target_col=target_col)
    final = labelled.fillna(0)
    debug_displayer(final, 'final', verbose=verbose)

    return final

# Run the whole feature pipeline on the fake claims. verbose=2 makes every
# stage display the head of its output.
clean = preprocessing_pipeline(
    df,
    patient_grain_col='patient_id',
    date_col='claim_date',
    target_col='overdose',
    drug_col='drug',
    verbose=2)
  2. Train. Note that the dataframe has no grain column, though its index does hold the patient id.
# Only the dataframe, the target column name and the model type are passed;
# no grain column is named here.
trainer = healthcareai.SupervisedModelTrainer(clean, 'overdose', 'classification', verbose=False)
rf = trainer.random_forest()
  3. Predict on new data
# reset_index() moves the index of `clean` into an ordinary column.
new_data = clean.reset_index()
new_data.head()

# This is the call that raises the ValueError shown in the stack trace below.
rf.make_predictions(new_data)
  4. Note that the trained model has no grain column:

# NOTE(review): presumably None here, which would match the "null key"
# error in the stack trace; the output is not shown, so confirm.
rf.grain_column

Stacktrace

---------------------------------------------------------------------------
  ValueError                                Traceback (most recent call last)
  <ipython-input-32-b3fc7c3fdb9b> in <module>()
  ----> 1 rf.make_predictions(new_data)

  ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/healthcareai/trained_models/trained_supervised_model.py in make_predictions(self, dataframe)
      183 
      184         result = pd.DataFrame({
  --> 185             self.grain_column: dataframe[self.grain_column].values,
      186             'Prediction': None,
      187             'Probability': None,

  ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
     2137             return self._getitem_multilevel(key)
     2138         else:
  -> 2139             return self._getitem_column(key)
     2140 
     2141     def _getitem_column(self, key):

  ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
     2144         # get column
     2145         if self.columns.is_unique:
  -> 2146             return self._get_item_cache(key)
     2147 
     2148         # duplicate columns & possible reduce dimensionality

  ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
     1840         res = cache.get(item)
     1841         if res is None:
  -> 1842             values = self._data.get(item)
     1843             res = self._box_item_values(item, values)
     1844             cache[item] = res

  ~/repos/ml.opioid.hso/.env/lib/python3.6/site-packages/pandas/core/internals.py in get(self, item, fastpath)
     3850                         loc = indexer.item()
     3851                     else:
  -> 3852                         raise ValueError("cannot label index with a null key")
     3853 
     3854             return self.iget(loc, fastpath=fastpath)

  ValueError: cannot label index with a null key
Aylr commented 6 years ago

Closing, dupe of #460