JakeColtman / bartpy

Bayesian Additive Regression Trees For Python
https://jakecoltman.github.io/bartpy/
MIT License

Model predicting NaN #56

Open ajoules opened 3 years ago

ajoules commented 3 years ago

Hi,

Thank you for bartpy!

My BART model is predicting NaN for some cases. Does anyone know why this happens, or how I can prevent it?

My data contain missing values, but to my knowledge BART can handle those. All of the observed values are finite.
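A quick way to confirm both of those points (just a sketch, assuming a dataframe df shaped like the one generated in the script below):

# sketch: how much of each numeric column is missing, and are the observed values finite?
numeric = df.select_dtypes('number')
print(numeric.isna().mean().sort_values(ascending=False).head())  # fraction of NaN per column
print(bool(np.isfinite(numeric.fillna(0)).all().all()))           # True -> no +/-inf among observed values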

Thank you!

Code: (Sorry for the lengthy data generation)

import pandas as pd
import numpy as np
import random
import bartpy
from bartpy.sklearnmodel import SklearnModel
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# simulate a df with 44 features and 9000 rows
# create binary vars and make a df
label=np.random.randint(2, size=9000)
df = pd.DataFrame({'label':label})
df['a']=np.random.randint(2, size=9000)

# create integers
df['b'] = np.random.randint(low=4, high=97, size=9000)
df['c'] = np.random.randint(low=0, high=1759, size=9000)
df['d'] = np.random.randint(low=0, high=5702, size=9000)
df['e'] = np.random.randint(low=0, high=7172, size=9000)

# create numerics
df['f'] = np.random.uniform(0, 908.56, 9000)
df['g'] = np.random.uniform(0,2508.78, 9000)
df['h'] = np.random.uniform(0,3757.56, 9000)
df['i'] = np.random.uniform(0,560.18, 9000)
df['j'] = np.random.uniform(0,1362.71, 9000)
df['k'] = np.random.uniform(0,2578.26, 9000)
df['l'] = np.random.uniform(175.07,997, 9000)
df['m'] = np.random.uniform(992.39,3972.81, 9000)
df['n'] = np.random.uniform(1787.24,5823.21, 9000)
df['o'] = np.random.uniform(-56,53, 9000)
df['p'] = np.random.uniform(-47,46, 9000)
df['q'] = np.random.uniform(-1089.03,1546.87, 9000)
df['r'] = np.random.uniform(-1599.14,898.79, 9000)
df['s'] = np.random.uniform(-2871.02,5329, 9000)
df['t'] = np.random.uniform(-4231.44,2481.55, 9000)
df['u'] = np.random.uniform(-3435.9,5824.22, 9000)
df['v'] = np.random.uniform(-5086.6,4548.43, 9000)
df['w'] = np.random.uniform(-406.57,907.91, 9000)
df['x'] = np.random.uniform(-834.82,840.27, 9000)
df['y'] = np.random.uniform(-549.2,2506.29, 9000)
df['z'] = np.random.uniform(-1547.2,2434.18, 9000)
df['aa'] = np.random.uniform(-426.6,3636.17, 9000)
df['bb'] = np.random.uniform(-2819.8,3390, 9000)
df['cc'] = np.random.uniform(-266.75,527.81, 9000)
df['dd'] = np.random.uniform(-778.64,527.81, 9000)
df['ee'] = np.random.uniform(-476.09,1358.32, 9000)
df['ff'] = np.random.uniform(-1890.91,919.3, 9000)
df['gg'] = np.random.uniform(-1633.23,2577.01, 9000)
df['hh'] = np.random.uniform(-2427.93,2078.78, 9000)
df['ii'] = np.random.uniform(-339.67,518.32, 9000)
df['jj'] = np.random.uniform(-528.07,412, 9000)
df['kk'] = np.random.uniform(-1460.23,1610.58, 9000)
df['ll'] = np.random.uniform(-1984.08,1127.82, 9000)
df['mm'] = np.random.uniform(-2153.38,2402.24, 9000)
df['nn'] = np.random.uniform(-2311.27,1809.37, 9000)
df['oo'] = np.random.uniform(16,92, 9000)
df['pp'] = np.random.uniform(4,24, 9000)
df['qq'] = np.random.uniform(4,80, 9000)
df['rr'] = np.random.uniform(0,1, 9000)

# add missing values to the float columns
# select only the float columns to apply the missingness to
cols_list = df.select_dtypes('float64').columns.tolist()

# randomly set 2% of each column to NaN
for col in cols_list:
    df.loc[df.sample(frac=0.02).index, col] = np.nan

# 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(df.drop(['label'], axis=1), df['label'], train_size=0.7, random_state=99)

# Modelling
model = SklearnModel(n_jobs=30)
model.fit(X_train, y_train)

# Predictions
y_predictions = model.predict(X_test)
np.isnan(y_predictions).sum()  # count of NaN predictions
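
One way to narrow this down (a sketch, assuming the script above has been run; SimpleImputer is only an illustration of a possible pre-fit workaround, not something bartpy itself requires):

# sketch: do the NaN predictions line up with test rows that contain missing features?
nan_pred = np.isnan(y_predictions)
rows_with_missing = X_test.isna().any(axis=1).to_numpy()
print(pd.crosstab(rows_with_missing, nan_pred))

# possible workaround: impute the missing values before fitting
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X_train_imp = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_imp = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)
model_imp = SklearnModel(n_jobs=30)
model_imp.fit(X_train_imp, y_train)
print(np.isnan(model_imp.predict(X_test_imp)).sum())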