nubank / fklearn

fklearn: Functional Machine Learning
Apache License 2.0
1.51k stars 165 forks source link

An example with categorical features #51

Open prcastro opened 5 years ago

prcastro commented 5 years ago

Have a tutorial for dealing with categorical features in a machine learning problem, including the usage of tools inside fklearn.training.transformation.

victor-ab commented 5 years ago

@prcastro, is there a way to include a transformation tool like the onehot_categorizer inside a pipeline? Since it generates new columns, it would otherwise break the pipeline.

vultor33 commented 5 years ago

I am not from Nubank, I just did this example for myself and shared it here.

# This is not an official example, it has no warranty of any kind.

import pandas as pd

# Data was obtained at: https://www.kaggle.com/c/titanic/data
# (it was also attached to the comment in fklearn issue #51).
DATA_FILE = 'titanic-train.txt'

# Read every column as a string, then convert the numeric columns in a
# single pass. astype with a dtype mapping is equivalent to the original
# per-column `.loc` assignments but avoids five chained-assignment calls.
data = pd.read_csv(DATA_FILE, delimiter=',', dtype=str)
data = data.astype({
    'Age': float,     # has missing values, so it must be float
    'Fare': float,
    'Parch': int,
    'Pclass': int,
    'SibSp': int,
})

# Identifier-like columns that carry no predictive signal.
AUXILIARY = ['PassengerId', 'Name', 'Cabin', 'Ticket']
TARGET = ['Survived']
# Everything that is neither auxiliary nor the target is a candidate feature.
FEATURES = set(data.columns) - set(AUXILIARY) - set(TARGET)

from fklearn.training.transformation import onehot_categorizer

# ONE HOT ENCODER DEFINITION
# fklearn learners return (prediction_fn, transformed_data, log) when called.
my_onehotencoder = onehot_categorizer(columns_to_categorize=['Embarked', 'Sex'])

# Apply the encoder once up front to discover the names of the generated
# dummy columns (see issue #68 for why the feature names are needed).
_, data_after_enconding, _ = my_onehotencoder(data)

# Derive the feature list from the encoded frame instead of hard-coding it:
# every column except the identifiers and the target is a model feature.
# This picks up the generated 'Embarked==*' / 'Sex==*' dummy columns too,
# and stays correct if the dummy naming ever changes.
NEW_FEATURES = [col for col in data_after_enconding.columns
                if col not in set(AUXILIARY) | set(TARGET)]

from fklearn.training.imputation import imputer
from fklearn.training.transformation import standard_scaler
from fklearn.training.classification import xgb_classification_learner
# NOTE: the duplicate onehot_categorizer import was dropped here — it is
# already imported above where the encoder is defined.

# SOME OTHER TRANSFORMATIONS
# Fill missing numeric values (e.g. Age has NaNs) with the column median.
my_imputer = imputer(columns_to_impute=NEW_FEATURES, impute_strategy='median')
# Standardize the features to zero mean / unit variance.
my_scaler = standard_scaler(columns_to_scale=NEW_FEATURES)

# MODEL DEFINITION
# xgb_classification_learner takes the feature column names and the name of
# the (single) target column.
my_model = xgb_classification_learner(features=NEW_FEATURES,
                                      target=TARGET[0])

from fklearn.training.pipeline import build_pipeline

# PIPELINE DEFINITION
# Chain encoder -> imputer -> scaler -> model into one composite learner.
my_learner = build_pipeline(my_onehotencoder, my_imputer, my_scaler, my_model)

# TRAINING: calling the learner fits every step in order and returns the
# prediction function, the transformed training data, and the fit logs.
prediction_function, data_trained, logs = my_learner(data)

# EVALUATION
from sklearn.metrics import accuracy_score

# Threshold predicted probabilities at 0.5. Labels are kept as the strings
# '1'/'0' because the raw 'Survived' column was read as str.
Survived_prediction = ['1' if p > 0.5 else '0' for p in data_trained.prediction]

# accuracy_score's signature is (y_true, y_pred); accuracy itself is
# symmetric, but passing ground truth first is the documented order and
# matters if this is ever changed to an asymmetric metric.
print('Train accuracy:  ', accuracy_score(data_trained.Survived, Survived_prediction))

DATASET

titanic-train.txt