ageron / handson-ml

⛔️ DEPRECATED – See https://github.com/ageron/handson-ml3 instead.
Apache License 2.0
25.2k stars 12.91k forks source link

CH1: KeyError: "['GDP per capita'] not in index" #375

Open mohanbabu27 opened 5 years ago

mohanbabu27 commented 5 years ago

I have tried my level best to run the code from CH1. I copied it as it is, but I get different errors every time. I fixed many errors but am stuck on this one. Can one of you help, please? KeyError: "['GDP per capita'] not in index"

Code below

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model

# Read both raw CSV inputs; "," is treated as the thousands separator in each.
oecd_bli = pd.read_csv("OECD_BLI_2017.csv", thousands=',')
# The GDP file is decoded as latin1 and its "n/a" cells are read as missing.
gdp_per_capita = pd.read_csv(
    "gdp_per_capita.csv",
    thousands=',',
    delimiter=',',
    encoding='latin1',
    na_values='n/a',
)

#prepare the data
def prepare_country_stats(oecd_bli, gdp_per_capita, year="2017"):
    """Join life-satisfaction and GDP-per-capita figures by country.

    Args:
        oecd_bli: DataFrame with at least the columns "INEQUALITY",
            "Country", "Indicator" and "Value". Only rows whose
            "INEQUALITY" equals "TOT" are used.
        gdp_per_capita: DataFrame with a "Country" column and a column
            named after ``year`` holding the GDP figures. It is NOT
            modified; the function works on a renamed, re-indexed copy.
        year: Name of the ``gdp_per_capita`` column to use as
            "GDP per capita". Defaults to "2017".

    Returns:
        DataFrame indexed by country with exactly the columns
        "GDP per capita" and "Life satisfaction", restricted to the
        countries present in both inputs.

    Raises:
        KeyError: If ``gdp_per_capita`` has neither a ``year`` column nor
            an existing "GDP per capita" column.
    """
    year = str(year)
    gdp_columns = gdp_per_capita.columns
    # Fail early with an actionable message instead of the opaque
    # "['GDP per capita'] not in index" raised by the final selection.
    if year not in gdp_columns and "GDP per capita" not in gdp_columns:
        raise KeyError(
            "gdp_per_capita has no column %r to use as 'GDP per capita'; "
            "available columns: %s" % (year, list(gdp_columns))
        )

    # One row per country, one column per indicator.
    oecd_bli = oecd_bli[oecd_bli["INEQUALITY"] == "TOT"]
    oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator",
                              values="Value")

    # rename()/set_index() without inplace=True return new frames, so the
    # caller's DataFrame is left untouched and the function can be re-run.
    gdp_by_country = gdp_per_capita.rename(columns={year: "GDP per capita"})
    gdp_by_country = gdp_by_country.set_index("Country")

    full_country_stats = pd.merge(left=oecd_bli, right=gdp_by_country,
                                  left_index=True, right_index=True)
    return full_country_stats[["GDP per capita", 'Life satisfaction']]

country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
# Disabled step kept for reference: remove_indices = [0, 1, 6, 8, 33, 34, 35]
country_stats.to_csv('country_stats.csv', encoding='utf-8')

# np.c_ turns each column into a 2-D column vector, which is why the fitted
# intercept_/coef_ are indexed as [0] and [0][0] below.
X = np.c_[country_stats["GDP per capita"]]
Y = np.c_[country_stats["Life satisfaction"]]

# Visualize the data
country_stats.plot(kind='scatter', x='GDP per capita', y='Life satisfaction')

# Select a linear model
lin_reg_model = sklearn.linear_model.LinearRegression()

# Train the model
lin_reg_model.fit(X, Y)

# Plot the regression line. The plotting grid gets its own name so that the
# training matrix X is not overwritten and stays usable afterwards.
t0, t1 = lin_reg_model.intercept_[0], lin_reg_model.coef_[0][0]
X_line = np.linspace(0, 110000, 1000)
plt.plot(X_line, t0 + t1 * X_line, "k")
plt.show()

# Make a prediction for Cyprus
X_new = [[22587]]
print(lin_reg_model.predict(X_new))
ageron commented 5 years ago

Hi @mohanbabu27, could you please indicate the line that causes the exception? Also, I see a few differences with the code in the notebook:

Hope this helps.

ghost commented 5 years ago

Try this ...

import pandas as pd
import numpy as np
import sklearn.linear_model
import sklearn.neighbors
import matplotlib.pyplot as plt

# Load the data
bli = pd.read_csv("BLI2015.csv", thousands=',')

gdp = pd.read_csv("GDP.csv", thousands=',', delimiter=',',
                  encoding='latin1', na_values="n/a")

# Prepare the data

# Keep only the "TOT" rows of the "SW_LIFS" indicator.
bli = bli[bli["INEQUALITY"] == "TOT"]
bli = bli[bli["INDICATOR"] == "SW_LIFS"]

# Country -> satisfaction value, indexed by country ("Pais").
nbli = bli[["Country", "Value"]].rename(
    columns={"Country": "Pais", "Value": "Satisfaccion"})
nbli = nbli.set_index("Pais")

# Country -> 2015 GDP figure ("Renta"), indexed by country.
ngdp = gdp[["Country", "2015"]].rename(
    columns={"Country": "Pais", "2015": "Renta"})
ngdp = ngdp.set_index("Pais")
# Force the figures to numbers: strip any remaining "," separators and turn
# anything that still cannot be parsed into NaN.
ngdp = ngdp.apply(
    lambda col: pd.to_numeric(col.astype(str).str.replace(',', ''),
                              errors='coerce'))

# Index-on-index merge: keeps the countries present in both tables.
country_stats = pd.merge(left=nbli, right=ngdp,
                         left_index=True, right_index=True)

country_stats.sort_values(by="Renta", inplace=True)

# Visualize the data
country_stats.plot(kind='scatter', x="Renta", y='Satisfaccion')
plt.show()

X = np.c_[country_stats["Renta"]]
y = np.c_[country_stats["Satisfaccion"]]

# Select a model: k-nearest-neighbours regression with 3 neighbours.
# The original also built a LinearRegression here and immediately overwrote
# it; to use the linear model instead, replace the line below with:
#     model = sklearn.linear_model.LinearRegression()
model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)

# Train the model
model.fit(X, y)

# Make a prediction for Cyprus
X_new = [[22587]]  # Cyprus' GDP per capita
print(model.predict(X_new))