Open jkbngl opened 5 years ago
TODO:
# Impute missing values in every numeric column of `data` with that column's
# own mean, and report the columns that are not numeric.
from pandas.api.types import is_numeric_dtype

# For features with strings in them: entries that cannot be parsed as numbers
# become NaN (not 0), so they are imputed below like any other missing value
# instead of dragging the column mean down.
data['white_blood_cell_count'] = pd.to_numeric(
    data['white_blood_cell_count'], errors='coerce'
)

# Then for the rest (white_blood_cell_count is numeric now, so the loop
# imputes it with its own mean rather than the mean of another column).
for feature in data.columns.tolist():
    if is_numeric_dtype(data[feature]):
        # Series.mean() skips NaN, so the fill value is the mean of the
        # observed values only. Genuine zeros are left untouched.
        data[feature] = data[feature].fillna(data[feature].mean())
    else:
        print(feature + " NON NUMERIC")

data.head()
# Count the missing values in each column of df, largest count first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)
# Run a 2-component t-SNE on pca_result_50 and report the wall-clock time.
time_start = time.time()

# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` -- confirm against the installed version.
tsne = TSNE(
    n_components=2,
    perplexity=40,
    n_iter=300,
    verbose=0,
)
tsne_pca_results = tsne.fit_transform(pca_result_50)

print(
    't-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start)
)
[out] t-SNE done! Time elapsed: 42.01495909690857 seconds
# Store the two t-SNE-on-PCA components next to the other embeddings, then
# draw three side-by-side scatter plots, each coloured by the label column
# "y": the first on the pca-* columns, the second on the tsne-2d-* columns,
# the third on the tsne-pca50-* columns written just below.
df_subset['tsne-pca50-one'] = tsne_pca_results[:, 0]
df_subset['tsne-pca50-two'] = tsne_pca_results[:, 1]

plt.figure(figsize=(16, 4))

ax1 = plt.subplot(1, 3, 1)
sns.scatterplot(
    x="pca-one", y="pca-two",
    hue="y",
    palette=sns.color_palette("hls", 10),
    data=df_subset,
    legend="full",
    alpha=0.3,
    ax=ax1,
)

# The tsne-2d-* columns are not created in this snippet; they must already
# exist in df_subset.
ax2 = plt.subplot(1, 3, 2)
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="y",
    palette=sns.color_palette("hls", 10),
    data=df_subset,
    legend="full",
    alpha=0.3,
    ax=ax2,
)

ax3 = plt.subplot(1, 3, 3)
sns.scatterplot(
    x="tsne-pca50-one", y="tsne-pca50-two",
    hue="y",
    palette=sns.color_palette("hls", 10),
    data=df_subset,
    legend="full",
    alpha=0.3,
    ax=ax3,
)
- [ ] try Cox Proportional Hazards to predict survival / infection / getting the disease as well:
  https://towardsdatascience.com/churn-prediction-and-prevention-in-python-2d454e5fd9a5
- [x] add label to 6d plot
- [x] classification with age:
``` python
# Overlay one semi-transparent age histogram per gender on the same axes.
for gender in ('Male', 'Female'):
    gender_rows = customers[customers['Gender'] == gender]
    plt.hist('Age', data=gender_rows, alpha=0.5, label=gender)

plt.title('Distribution of Age by Gender')
plt.xlabel('Age')
# Trailing semicolon kept from the original; presumably there to suppress the
# echoed return value when this runs as the last line of a notebook cell.
plt.legend();
```
https://www.kaggle.com/akshayksingh/kidney-disease-dataset