mohanbabu27 commented 5 years ago

I have tried my level best to run the code from CH1. I copied it as is, but I get a different error every time. I fixed many errors, but I am stuck at this one. Can one of you help, please? KeyError: "['GDP per capita'] not in index"

Code below

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model

# Read the two CSV inputs; ',' is treated as the thousands separator in both.
# NOTE(review): gdp_per_capita.csv is parsed as comma-delimited -- confirm the
# file is not tab-separated, otherwise its columns will not be split correctly.
oecd_bli = pd.read_csv("OECD_BLI_2017.csv", thousands=',')
gdp_per_capita = pd.read_csv(
    "gdp_per_capita.csv",
    thousands=',',
    delimiter=',',
    encoding='latin1',
    na_values='n/a',  # the literal string 'n/a' is read as a missing value
)

#prepare the data
def prepare_country_stats(oecd_bli, gdp_per_capita):
    """Join life-satisfaction and GDP-per-capita figures by country.

    Neither input frame is modified, so the function can be called
    repeatedly with the same arguments.

    Args:
        oecd_bli: long-format DataFrame with at least the columns
            "Country", "INEQUALITY", "Indicator" and "Value".
        gdp_per_capita: DataFrame with a "Country" column and the GDP
            figures in a column named "2017" (a column already named
            "GDP per capita" is accepted as well).

    Returns:
        DataFrame indexed by country with exactly the columns
        "GDP per capita" and "Life satisfaction", containing only the
        countries present in both inputs.

    Raises:
        KeyError: if ``gdp_per_capita`` has neither a "2017" nor a
            "GDP per capita" column, or if another required column is
            missing.
    """
    # Fail early with an actionable message instead of the opaque
    # "['GDP per capita'] not in index" raised by the final selection.
    if not {"2017", "GDP per capita"} & set(gdp_per_capita.columns):
        raise KeyError(
            "gdp_per_capita has no '2017' column to use as 'GDP per capita'; "
            f"columns found: {list(gdp_per_capita.columns)}. "
            "Check the file's delimiter and the year it covers."
        )

    # Keep only the rows whose INEQUALITY is "TOT", then spread the
    # indicators into one column each, indexed by country.
    totals = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
    indicators = totals.pivot(index="Country", columns="Indicator",
                              values="Value")

    # rename()/set_index() return new frames here; using inplace=True would
    # alter the caller's DataFrame and break a second call.
    gdp = gdp_per_capita.rename(columns={"2017": "GDP per capita"})
    gdp = gdp.set_index("Country")

    # Default inner join on the country index.
    full_country_stats = pd.merge(left=indicators, right=gdp,
                                  left_index=True, right_index=True)
    return full_country_stats[["GDP per capita", "Life satisfaction"]]

country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
# NOTE(review): a row filter (remove_indices = [0, 1, 6, 8, 33, 34, 35]) was
# commented out here, so every merged row is used below.
country_stats.to_csv('country_stats.csv', encoding='utf-8')

# np.c_ turns each column into an (n_rows, 1) array, the 2-D layout that
# fit() is given below.
X = np.c_[country_stats["GDP per capita"]]
Y = np.c_[country_stats["Life satisfaction"]]

#Visualize the data
# Keep the returned axes so the regression line is drawn on this exact plot
# rather than on whatever the current pyplot axes happen to be.
ax = country_stats.plot(kind='scatter', x='GDP per capita',
                        y='Life satisfaction')

#Select a linear model
lin_reg_model = sklearn.linear_model.LinearRegression()

#Train the model
lin_reg_model.fit(X, Y)

#plot Regression model
# Y is 2-D, so intercept_ is indexed once and coef_ twice to get scalars.
t0, t1 = lin_reg_model.intercept_[0], lin_reg_model.coef_[0][0]
# Use a separate name for the plotting grid so the training matrix X is not
# overwritten after fitting.
gdp_grid = np.linspace(0, 110000, 1000)
ax.plot(gdp_grid, t0 + t1 * gdp_grid, "k")
plt.show()

#Make a prediction for Cyprus
X_new = [[22587]]
print(lin_reg_model.predict(X_new))

ageron commented 5 years ago

Hi @mohanbabu27 , Could you please indicate the line that causes the exception? Also, I see a few differences with the code in the notebook:

- The delimiter you use when loading the gdp_per_capita.csv file is "," instead of "\t", so please make sure the file is indeed comma-separated rather than tab-separated.
- You are using data from a different year (2017 instead of 2015), so please make sure the format is exactly the same, or use the same file as I did (it is in the datasets/lifesat folder).
- The end of the prepare_country_stats() function is different (but if I remember correctly, I just got rid of a few countries that did not follow the trend, to illustrate that a model will end up biased if the data is biased).

Hope this helps.

ghost commented 5 years ago

Try this ...

import pandas as pd
import numpy as np
import sklearn.linear_model
import sklearn.neighbors
import matplotlib.pyplot as plt

# Load the data
bli = pd.read_csv("BLI2015.csv", thousands=',')
gdp = pd.read_csv("GDP.csv", thousands=',', delimiter=',',
                  encoding='latin1', na_values="n/a")

# Prepare the data
# Keep only the "TOT" rows of the "SW_LIFS" indicator.
bli = bli[(bli["INEQUALITY"] == "TOT") & (bli["INDICATOR"] == "SW_LIFS")]

# One value column per table, both indexed by country under the name 'Pais'.
nbli = (
    bli[["Country", "Value"]]
    .rename(columns={"Country": "Pais", "Value": "Satisfaccion"})
    .set_index("Pais")
)
ngdp = (
    gdp[["Country", "2015"]]
    .rename(columns={"Country": "Pais", "2015": "Renta"})
    .set_index("Pais")
)
# Strip any remaining thousands separators and force a numeric dtype;
# values that still cannot be parsed become NaN.
ngdp = ngdp.apply(
    lambda col: pd.to_numeric(col.astype(str).str.replace(',', ''),
                              errors='coerce')
)

country_stats = pd.merge(left=nbli, right=ngdp,
                         left_index=True, right_index=True)

# Drop rows with missing values (e.g. the NaN produced by the coercion
# above) so they never reach model.fit(), then order by GDP.
country_stats = country_stats.dropna().sort_values(by="Renta")

# Visualize the data
country_stats.plot(kind='scatter', x="Renta", y='Satisfaccion')
plt.show()

X = np.c_[country_stats["Renta"]]
y = np.c_[country_stats["Satisfaccion"]]

# Select a model. Only one estimator is instantiated; to try the linear
# alternative, use sklearn.linear_model.LinearRegression() here instead.
model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)

# Train the model
model.fit(X, y)

# Make a prediction for Cyprus
X_new = [[22587]]  # Cyprus' GDP per capita
print(model.predict(X_new))

ageron / handson-ml

CH1: KeyError: "['GDP per capita'] not in index" #375

Code below