sonalmogra28 / number-plate-detection

0 stars 0 forks source link

Hospital Cost Prediction #1

Open sonalmogra28 opened 11 months ago

sonalmogra28 commented 11 months ago

# -*- coding: utf-8 -*-

"""HospitalCost.ipynb

Automatically generated by Colaboratory.

Original file is located at https://colab.research.google.com/drive/1EwJgHFGPXWDp2T-wobAg5p62IvKoPG6e """

# Import dependencies.
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from google.colab import drive

# Mount Google Drive in the Colab runtime, then read the dataset stored there.
drive.mount('/content/drive')
maindf = pd.read_csv('/content/drive/MyDrive/HospitalCost.csv')

# (rows, columns) of the raw data.
maindf.shape

maindf

# Which categories are included in our data set?
maindf.columns

# The function "describe" will present characteristics of all numerical categories.
maindf.describe()

"""# Data Analysis

Age Group

"""

# Summary statistics of the age column.
maindf["age"].describe()

# Number of records per age, computed once and drawn twice: first as a bar
# chart, then as a scatter plot of age against the record count.
records_per_age = maindf.reset_index().groupby('age')['index'].count()
records_per_age.plot(kind='bar', figsize=(10, 10))
records_per_age.reset_index().plot(kind='scatter', x='age', y='index')

"""# Scatter Plot showing Age/Charges"""

# Scatter plot of charges against age, matching the "Age/Charges" heading of
# this section.
maindf[['age', 'charges']].plot(kind='scatter', x='age', y='charges', c='DarkBlue', figsize=(10, 10))

# Box plot: a graphical way to show minimum, maximum, median, and two quartiles.
maindf.boxplot(column=['age', 'bmi', 'children'])

"""# Data Analysis on Gender"""

# Summary of the gender column, followed by a bar chart of how often each
# value occurs.
gender = maindf['sex']
gender.describe()

gender.value_counts().plot.bar()

"""# Analysis between Sex & Charges"""

# Work on a copy so the helper columns never touch maindf
# (this also avoids SettingWithCopyWarning).
temp = maindf[['sex', 'charges']].copy()

# Split charges into one column per gender; rows belonging to the other
# gender stay NaN. Every value other than 'male' is counted as female.
is_male = temp['sex'] == 'male'
temp['charges_male'] = temp['charges'].where(is_male)
temp['charges_female'] = temp['charges'].where(~is_male)

temp = temp.sort_values('charges', ascending=False)

# Plotting the charges for male and female separately
temp[['sex', 'charges_male', 'charges_female']].set_index('sex').plot(figsize=(7, 7), subplots=True)

temp['charges_male'].describe()

temp['charges_female'].describe()

# Numeric encoding of gender on a fresh copy: 'male' -> 1, anything else -> 2.
temp = maindf.copy()
temp['sex'] = np.where(temp['sex'] == 'male', 1, 2)

temp['sex'].unique()

temp[['sex', 'charges']].plot.scatter(x='sex', y='charges')

"""# BMI Analysis"""

# Distribution summary of BMI, then BMI plotted against charges.
bmi = maindf['bmi']
bmi.describe()

maindf[['bmi', 'charges']].plot.scatter(x='bmi', y='charges')

"""# Analysis with Children"""

maindf.columns

# Summary statistics for the number of children per record.
children = maindf['children']
children.describe()

# Records per child count, with the x tick labels kept horizontal.
children.value_counts().plot.bar(rot=0)

# Child count plotted against charges.
maindf[['children', 'charges']].plot.scatter(x='children', y='charges')

"""# Analysis on Smokers"""

maindf['smoker'].describe()

# Encode the smoker flag in place as integers: 'no' -> 0, every other value -> 1.
maindf['smoker'] = np.where(maindf['smoker'] == 'no', 0, 1)

maindf[['smoker', 'charges']].plot.scatter(x='smoker', y='charges')

"""# Regional Analysis"""

maindf['region'].value_counts().plot(kind='bar')

# Encode each region as an integer, numbered in order of first appearance.
# pd.factorize returns the integer codes together with the distinct labels,
# so no string values have to be written into a float (NaN) helper column
# and converted back afterwards.
temp = maindf[['region', 'charges']].copy()
region_codes, region_labels = pd.factorize(maindf['region'])

# Print the legend that maps each region name to its code.
for q, region in enumerate(region_labels):
    print(region, ' = ', q)

temp['region'] = region_codes
temp[['region', 'charges']].plot(kind='scatter', x='region', y='charges')

"""# Correlating each variables with charges"""

# Reload the raw data so every categorical column starts from its original
# string values before being encoded below.
maindf = pd.read_csv('/content/drive/MyDrive/HospitalCost.csv')

temp = maindf.copy()

# Encode the categorical columns as integers so corr() can include them.
temp['sex'] = np.where(temp['sex'] == 'male', 1, 2)      # 'male' -> 1, otherwise 2
temp['smoker'] = np.where(temp['smoker'] == 'no', 0, 1)  # 'no' -> 0, otherwise 1
# Regions are numbered in order of first appearance.
temp['region'] = pd.factorize(temp['region'])[0]

maindf = temp.copy()

maindf.columns

# Correlation of every other column with charges, strongest first. The
# self-correlation of charges is dropped by label rather than by position.
maindf.corr()['charges'].drop('charges').sort_values(ascending=False)

"""# Findings

Finding correlations between different categories, e.g. bmi vs charges, smokers vs charges, children vs charges:
It seems smoker has the highest correlation with charges, and the same pattern can be seen in the smoker vs charges visuals.
What categories contributed more to the health cost, and if so, why? Categorical columns: smoker, children, region. Yes, smoker contributes the most to the health cost, and rightly so, because smokers are more vulnerable to lung cancer and other types of cancer, so they spend more on insurance.
What are the effects on medical/health cost of gender (male vs female), cigarette usage, and BMI? Effect of sex on health cost: gender does not contribute to charges at all, meaning gender has no effect on charges. Whether you are male or female, charges are unrelated to it.
Effect of cigarettes (smokers) on health cost: As expected, smokers are more vulnerable to health issues, so they have higher charges than non-smokers. Effect of BMI on health cost: As shown in the 'bmi vs charges' visuals above, the response of charges is mixed: low BMIs have high charges and high BMIs also have high charges, and the correlation is also close to 0, so we can say that BMI has no effect on health cost.

Machine Learning Part

Purpose: To determine the effect of several factors on insurance costs & predict the cost of health insurance based on factors that influence it. """

# Start the modelling part from a fresh copy of the raw data.
maindf = pd.read_csv('/content/drive/MyDrive/HospitalCost.csv')
maindf.shape

"""From the data analysis part we can see 3 factors are most influencing for the charge colum: a) smoker b) age c) bmi All 3 columns are responsible for the increase or decrease in price

Converting categorical column "Smoker" into integer for ML algorithm input

"""

# 'no' becomes 0 and every other value becomes 1.
maindf['smoker'] = np.where(maindf['smoker'] == 'no', 0, 1)

maindf.dtypes

"""### Now we do have columns in integer and floating points"""

# Target vector and the three selected feature columns.
y = maindf['charges']
x = maindf[['smoker', 'age', 'bmi']]
x

"""### Normalizing inut columns age and bmi"""

# Scale age and bmi by their column maximum so the largest value becomes 1.
# assign() returns a new DataFrame and replaces each column wholesale, so the
# float results are never written into the existing column's dtype the way an
# in-place `x.loc[:, 'age'] = ...` write would attempt.
# NOTE(review): the maxima are taken over the full data set, before the
# train/test split further down -- confirm this is intended.
x = x.assign(
    age=x['age'] / x['age'].max(),
    bmi=x['bmi'] / x['bmi'].max(),
)

"""### Train test split to judge how well our model is performing"""

from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42)

# Finally, we can proceed with the procedure of importing and fitting the
# Linear Regression model.
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train, y_train)

print(lm.intercept_)

# Fitted attributes of scikit-learn estimators end with an underscore
# (coef_); one coefficient per feature column, labelled by column name.
coeff_df = pd.DataFrame(lm.coef_, x.columns, columns=['Coefficient'])

print(coeff_df)

predictions = lm.predict(X_test)
print("Predicted medical costs values:", predictions)

"""Graphical comparison of expected values of our analysis"""

# Actual charges on the x axis against predicted charges on the y axis.
plt.scatter(y_test, predictions)

"""## Finally, let’s calculate and print mean absolute error (MAE) and mean squared error (MSE) for our predictions"""

from sklearn import metrics

# Errors of the linear model's predictions on the held-out rows.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
print("MAE", mae)
print("MSE", mse)

"""### Decision Tree Regressor"""

from math import sqrt

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Baseline decision tree with default settings, evaluated on the same
# X_train / y_train / X_test / y_test split as the linear model.
# The unrelated load_diabetes sample data is deliberately not loaded here:
# unpacking it into `X, y` replaced the hospital-cost target `y`.
regressor = DecisionTreeRegressor()

# 10-fold cross-validation on the training rows; keep the scores visible
# instead of discarding them.
cv_scores = cross_val_score(regressor, X_train, y_train, cv=10)
print("Cross-validation scores:", cv_scores)

regressor = regressor.fit(X_train, y_train)

# mean_squared_error returns the MSE by default, so its square root is the
# RMSE on the held-out rows.
y_pred = regressor.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

"""THE Root Mean Square Error (RMSE) SCORE: 9781 is of base model difference between a statistical model’s predicted values and the actual values"""

# Feature names in the first row, their importances in the fitted tree in
# the second (fitted attribute: feature_importances_).
print(pd.DataFrame(data=[x.columns, regressor.feature_importances_]))

"""The model is also showing the data analysis part to be strong by suggesting the smoker columns are the most important (61%) of all three variables above.

HYPER PARAMETER TUNING : finding the best settings for a machine learning model

"""

# Hand-picked hyper-parameters for the tuned decision tree.
tuned_tree_params = dict(
    criterion='friedman_mse',
    splitter='best',
    max_depth=10,
    min_samples_split=50,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.001,
    max_features=2,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    ccp_alpha=0.01,
)
final_regressor = DecisionTreeRegressor(**tuned_tree_params)

# 10-fold cross-validation on the training rows (the scores are not stored),
# followed by the final fit.
cross_val_score(final_regressor, X_train, y_train, cv=10)
final_regressor = final_regressor.fit(X_train, y_train)

from sklearn.metrics import mean_squared_error
from math import sqrt

# Square root of the mean squared error on the held-out rows.
y_pred = final_regressor.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
sqrt_mean_squared_error = sqrt(test_mse)

"""The model looks better after hyperparameter tuning. So, lets try XGboost regressor and see if this will produce a better result than this."""

from math import sqrt

from sklearn import ensemble
from sklearn.metrics import mean_squared_error

# NOTE: this is scikit-learn's GradientBoostingRegressor, not the XGBoost
# library.
regressor = ensemble.GradientBoostingRegressor(
    loss='squared_error',
    learning_rate=0.01,
    n_estimators=70,
    subsample=1.0,
    criterion='friedman_mse',
    min_samples_split=50,
    min_samples_leaf=5,
    min_weight_fraction_leaf=0.0,
    max_depth=10,
    min_impurity_decrease=0.0,
    init=None,
    random_state=None,
    max_features=None,
    alpha=0.9,
    verbose=0,
    max_leaf_nodes=None,
    warm_start=False,
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=0.0001,
    ccp_alpha=0.01,
)

# 10-fold cross-validation on the training rows; keep the scores visible
# instead of discarding them.
cv_scores = cross_val_score(regressor, X_train, y_train, cv=10)
print("Cross-validation scores:", cv_scores)

regressor = regressor.fit(X_train, y_train)

# mean_squared_error returns the MSE by default, so its square root is the
# RMSE on the held-out rows.
y_pred = regressor.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)