onnx / sklearn-onnx

Convert scikit-learn models and pipelines to ONNX
Apache License 2.0
523 stars 98 forks source link

Unable to create onnx pipeline with multiple classifiers #1086

Open achyuta26 opened 2 months ago

achyuta26 commented 2 months ago

I'm trying to create a pipeline with 3 models. I wanted to try out a POC with a toy dataset simulating 3 classifiers that I've actually trained. So I picked up the iris dataset and broke it into 3 datasets, one for each model and class. Of the 3 models, the first two are LightGBM classifiers and the 3rd is a RandomForest classifier. The objective is to take 4 features and have 3 binary classifiers send their predictions through a pipeline.

This is the code snippet I'm trying to run to generate a pipeline in ONNX format:

import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score,\
classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
# import joblib

from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from skl2onnx.proto import onnx_proto

# Load the iris dataset as a DataFrame and rename the four measurement
# columns to identifier-friendly names.
data = load_iris(as_frame=True)
df = data["frame"]
df.rename(
    columns={
        "sepal length (cm)": "sepal_length",
        "sepal width (cm)": "sepal_width",
        "petal length (cm)": "petal_length",
        "petal width (cm)": "petal_width",
    },
    inplace=True,
)

def create_target_column(df, target):
    """Return a 0/1 array marking the rows whose ``target`` equals *target*.

    Args:
        df: table-like object with a ``'target'`` column.
        target: the class value to flag.

    Returns:
        Array holding 1 where ``df['target'] == target`` and 0 elsewhere.
    """
    is_match = df['target'] == target
    return np.where(is_match, 1, 0)

# Iris target codes: 0 = setosa, 1 = versicolor, 2 = virginica.
# Add one binary indicator column per species; all three are computed from
# the unchanged 'target' column before being assigned.
df["is_setosa"], df["is_versicolor"], df["is_virginica"] = (
    create_target_column(df, species_code) for species_code in (0, 1, 2)
)

# Feature columns are everything except the trailing four columns.
features = df.columns[:-4]

class TrainModel(object):
    """Train and evaluate one binary classifier on pre-split data.

    The instance stores the train / test / validation splits and the target
    name. ``train`` fits either a LightGBM booster or a RandomForest, predicts
    on the validation split and computes a few summary metrics.
    """

    def __init__(self,X_train,X_test,X_valid,y_train,y_test,y_valid,target_name):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.X_valid = X_valid
        self.y_valid = y_valid
        self.target_name = target_name

    def train_rf_model(self):
        """Fit and return a 100-tree RandomForestClassifier on the training split."""
        # NOTE(review): `seed` is not defined in the visible part of this
        # script; it must exist at module level before this is called.
        rf = RandomForestClassifier(n_estimators=100,random_state=seed)
        rf.fit(self.X_train, self.y_train)
        return rf

    def train_lgbm_model(self):
        """Train and return a binary LightGBM booster.

        The test split is used as the validation set. Its per-round metrics
        are recorded in ``self.evals_result`` under the name ``"valid"``.
        """
        train_set = lgb.Dataset(self.X_train, label=self.y_train)
        # Validation data has to be a Dataset, not a raw (X, y) tuple.
        valid_set = lgb.Dataset(self.X_test, label=self.y_test, reference=train_set)
        # NOTE(review): `seed` is not defined in the visible part of this
        # script; it must exist at module level before this is called.
        params = {
            'random_state': seed,
            'n_estimators': 100,
            'learning_rate': 0.001,
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': {'binary_logloss', 'auc'},
            'tree_learner': 'data',
        }
        self.evals_result = {}
        # valid_sets is passed by keyword: the third positional parameter of
        # lgb.train is num_boost_round, not the validation data. Evaluation
        # history is collected through a callback rather than the
        # `evals_result=` keyword, which newer LightGBM releases no longer accept.
        clf = lgb.train(
            params,
            train_set,
            valid_sets=[valid_set],
            valid_names=["valid"],
            callbacks=[
                lgb.record_evaluation(self.evals_result),
                # Per-round learning-rate schedule; presumably takes
                # precedence over the static 'learning_rate' above — confirm.
                lgb.reset_parameter(learning_rate=lambda x: 0.95 ** x * 0.1),
            ],
        )
        return clf

    def predict(self,model,df=None):
        """Predict with *model* and threshold the scores at 0.5.

        Args:
            model: object exposing ``predict(data)``.
            df: data to score; the stored validation split is used when None.

        Returns:
            Tuple ``(y_probs, y_pred)``: the raw ``model.predict`` output and
            the integer array ``y_probs > 0.5``.
        """
        data = self.X_valid if df is None else df
        y_probs = model.predict(data)
        y_pred = (y_probs > 0.5).astype("int")
        return y_probs, y_pred

    def compute_metrics(self,y_pred,y_truth=None):
        """Return a dict of metrics comparing *y_pred* with the ground truth.

        Args:
            y_pred: predicted labels.
            y_truth: true labels; the stored validation labels are used when None.

        Returns:
            Dict with keys ``'lob'`` (the target name),
            ``'classification_report'``, ``'roc_auc_score'`` and
            ``'accuracy_score'``.
        """
        if y_truth is None:
            y_truth = self.y_valid
        return {
            'lob': self.target_name,
            'classification_report': classification_report(y_truth,y_pred),
            'roc_auc_score': roc_auc_score(y_truth,y_pred),
            'accuracy_score': accuracy_score(y_truth,y_pred),
        }

    def print_metrics(self,metrics):
        """Print the report, ROC AUC and accuracy entries of *metrics*."""
        print("classification_report:\n",metrics['classification_report'],"\n\n")
        print("roc_auc_score:\n",metrics["roc_auc_score"],"\n\n")
        print("accuracy_score:\n",metrics["accuracy_score"],"\n\n")

    def train(self,model_name='lgbm'):
        """Train the requested model and evaluate it on the validation split.

        Args:
            model_name: ``'lgbm'`` for LightGBM or ``'rf'`` for RandomForest.

        Returns:
            Tuple ``(model, metrics, y_pred, y_logs)`` where ``y_logs`` is the
            raw prediction output and ``y_pred`` the thresholded labels.

        Raises:
            ValueError: if *model_name* is not a supported keyword. Returning
                the message as a string would break callers that unpack the
                four-element result.
        """
        if model_name == 'lgbm':
            model = self.train_lgbm_model()
        elif model_name == 'rf':
            model = self.train_rf_model()
        else:
            raise ValueError(
                "Supported model keywords are: [LightGBM: 'lgbm', RandomForest: 'rf']"
            )
        y_logs, y_pred = self.predict(model)
        metrics = self.compute_metrics(y_pred)
        return model, metrics, y_pred, y_logs

def train_model_per_target(target_name, model_name='lgbm'):
    """Split the data for one target column and train a model on it.

    The ``is_<target_name>`` column of the module-level ``df`` is the label.
    The held-out 20% is used both as the test split and as the validation split.

    Args:
        target_name: species suffix of the indicator column, e.g. ``'setosa'``.
        model_name: model keyword forwarded to ``TrainModel.train``.

    Returns:
        Tuple ``(model, X_train, X_test, y_train, y_test)``.
    """
    label_column = f'is_{target_name}'
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[label_column], test_size=0.2
    )
    trainer = TrainModel(
        X_train, X_test, X_test,
        y_train, y_test, y_test,
        target_name,
    )
    model, _metrics, _labels, _scores = trainer.train(model_name)
    return model, X_train, X_test, y_train, y_test

# Train one binary model per species: LightGBM (the default) for setosa and
# versicolor, RandomForest for virginica.
(
    setosa_model,
    setosa_X_train,
    setosa_X_test,
    setosa_y_train,
    setosa_y_test,
) = train_model_per_target(target_name='setosa')

(
    versicolor_model,
    versicolor_X_train,
    versicolor_X_test,
    versicolor_y_train,
    versicolor_y_test,
) = train_model_per_target(target_name='versicolor')

(
    virginica_model,
    virginica_X_train,
    virginica_X_test,
    virginica_y_train,
    virginica_y_test,
) = train_model_per_target(target_name='virginica', model_name='rf')

class SetosaPredictionModel(BaseEstimator, TransformerMixin):
    """Pipeline step that appends a binary ``setosa_pred`` column.

    A LightGBM booster is loaded from a model file when the step is
    constructed. ``transform`` scores the four feature columns with it.
    """

    def __init__(self):
        self.features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        # Plain filesystem path; the markdown link syntax that had leaked into
        # the string made it unusable as a file name.
        self.model = lgb.Booster(model_file="/Workspace/Users/Achyuta.Jha@amexgbt.com/vta_poc_models/setosa_lgbm_model.txt")

    def fit(self, X, y=None):
        """Do nothing (the booster is loaded pre-trained) and return ``self``.

        Returning the estimator keeps ``fit(X, y).transform(X)`` chaining
        working, which returning ``None`` would break.
        """
        return self

    def transform(self, X):
        """Return a copy of *X* with a 0/1 ``setosa_pred`` column added.

        Args:
            X: frame-like object containing the columns in ``self.features``.

        Returns:
            Copy of *X* with ``setosa_pred`` set to 1 where the booster's
            output exceeds 0.5, else 0.
        """
        # Work on a copy so the caller's frame is not modified in place.
        X = X.copy()
        y_probs = self.model.predict(X[self.features])
        X['setosa_pred'] = (y_probs > 0.5).astype("int")
        return X

class VersicolorPredictionModel(BaseEstimator, TransformerMixin):
    """Pipeline step that appends a binary ``versicolor_pred`` column.

    A LightGBM booster is loaded from a model file when the step is
    constructed. ``transform`` scores the four feature columns with it.
    """

    def __init__(self):
        self.features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        # Plain filesystem path; the markdown link syntax that had leaked into
        # the string made it unusable as a file name.
        self.model = lgb.Booster(model_file="/Workspace/Users/Achyuta.Jha@amexgbt.com/vta_poc_models/versicolor_lgbm_model.txt")

    def fit(self, X, y=None):
        """Do nothing (the booster is loaded pre-trained) and return ``self``.

        Returning the estimator keeps ``fit(X, y).transform(X)`` chaining
        working, which returning ``None`` would break.
        """
        return self

    def transform(self, X):
        """Return a copy of *X* with a 0/1 ``versicolor_pred`` column added.

        Args:
            X: frame-like object containing the columns in ``self.features``.

        Returns:
            Copy of *X* with ``versicolor_pred`` set to 1 where the booster's
            output exceeds 0.5, else 0.
        """
        # Work on a copy so the caller's frame is not modified in place.
        X = X.copy()
        y_probs = self.model.predict(X[self.features])
        X['versicolor_pred'] = (y_probs > 0.5).astype("int")
        return X

class VirginicaPredictionModel(BaseEstimator, TransformerMixin):
    """Final pipeline step that gathers the three per-species predictions.

    A model is loaded with ``joblib`` when the step is constructed.
    ``predict`` combines its output with the ``setosa_pred`` and
    ``versicolor_pred`` columns already present in the input.
    """

    def __init__(self):
        self.features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
        # NOTE(review): the `import joblib` line is commented out in the
        # visible imports; make sure joblib is in scope before instantiating.
        # Plain filesystem path; the markdown link syntax that had leaked into
        # the string made it unusable as a file name.
        self.model = joblib.load("/Workspace/Users/Achyuta.Jha@amexgbt.com/vta_poc_models/virginica_rf_model.joblib")

    def fit(self, X, y=None):
        """Do nothing (the model is loaded pre-trained) and return ``self``.

        Returning the estimator keeps method chaining working, which
        returning ``None`` would break.
        """
        return self

    def predict(self, X):
        """Return the three predictions as a list of arrays.

        Args:
            X: frame-like object holding ``setosa_pred``, ``versicolor_pred``
                and the columns in ``self.features``.

        Returns:
            List ``[setosa_pred values, versicolor_pred values, model output]``.
        """
        return [
            X['setosa_pred'].values,
            X['versicolor_pred'].values,
            self.model.predict(X[self.features]),
        ]

# Chain the three custom steps: the two transformers each add a prediction
# column, and the last step collects all three predictions.
pipeline = Pipeline(
    steps=[
        ('setosa_prediction_model', SetosaPredictionModel()),
        ('versicolor_prediction_model', VersicolorPredictionModel()),
        ('virginica_prediction_model', VirginicaPredictionModel()),
    ]
)

def convert_setosa_model(scope, operator, container):
    """Converter callback that adds the graph nodes for ``SetosaPredictionModel``.

    Four nodes are added to *container*, in this order: ``LgbmPredict`` on the
    operator's first input, ``ArrayFeatureExtractor`` with index 1,
    ``GreaterOrEqual``, and a ``Cast`` to INT64 on the operator's first output.

    Args:
        scope: supplies unique operator and variable names.
        operator: operator being converted; its first input and first output
            are used.
        container: receives the nodes through ``add_node``.
    """
    outputs = operator.outputs
    name = scope.get_unique_operator_name('SetosaModel')
    input_name = operator.inputs[0].full_name

    probabilities_name = scope.get_unique_variable_name(name + '_probabilities')
    probability_name = scope.get_unique_variable_name(name + '_probability')

    # The domain must be the bare string 'ai.onnx.ml'; markdown link syntax had
    # leaked into these literals.
    # NOTE(review): 'LgbmPredict' does not look like an operator defined in
    # the ai.onnx.ml domain — confirm against the ONNX operator list.
    container.add_node('LgbmPredict', input_name, probabilities_name,
                       op_domain='ai.onnx.ml', name=name)

    container.add_node('ArrayFeatureExtractor', probabilities_name, probability_name,
                       op_domain='ai.onnx.ml',
                       name=scope.get_unique_operator_name('ArrayFeatureExtractor'),
                       attr={'indices': [1]})

    # NOTE(review): only one input is passed and no threshold appears
    # anywhere; a comparison node presumably needs a second operand — confirm.
    container.add_node('GreaterOrEqual', probability_name, outputs[0].full_name,
                       op_domain='ai.onnx.ml', name=scope.get_unique_operator_name('GreaterOrEqual'),
                       attr={})
    # NOTE(review): this node reads and writes the same variable name —
    # confirm that this is valid in the generated graph.
    container.add_node('Cast', outputs[0].full_name, outputs[0].full_name,
                       op_domain='ai.onnx.ml',
                       name=scope.get_unique_operator_name('Cast'),
                       attr={'to': onnx_proto.TensorProto.INT64})

    # Set the number of classes
    # NOTE(review): indexing target_opset with [-1] and calling set_onnx_attr
    # on the result is not an API I can confirm — verify against skl2onnx.
    operator.target_opset[-1].set_onnx_attr("n_classes", 2)

# Register the Setosa step with skl2onnx: alias, shape calculator, converter
# callback and the accepted conversion options.
update_registered_converter(
    SetosaPredictionModel,
    'SetosaLGBMClassifier',
    calculate_linear_classifier_output_shapes,
    convert_setosa_model,
    options={
        'nocl': [True, False],
        'zipmap': [True, False, 'columns'],
    },
)

def convert_versicolor_model(scope, operator, container):
    """Converter callback that adds the graph nodes for ``VersicolorPredictionModel``.

    Four nodes are added to *container*, in this order: ``LgbmPredict`` on the
    operator's first input, ``ArrayFeatureExtractor`` with index 1,
    ``GreaterOrEqual``, and a ``Cast`` to INT64 on the operator's first output.

    Args:
        scope: supplies unique operator and variable names.
        operator: operator being converted; its first input and first output
            are used.
        container: receives the nodes through ``add_node``.
    """
    outputs = operator.outputs
    # The name prefix matches this model; it previously reused 'SetosaModel'.
    name = scope.get_unique_operator_name('VersicolorModel')
    input_name = operator.inputs[0].full_name

    probabilities_name = scope.get_unique_variable_name(name + '_probabilities')
    probability_name = scope.get_unique_variable_name(name + '_probability')

    # The domain must be the bare string 'ai.onnx.ml'; markdown link syntax had
    # leaked into these literals.
    # NOTE(review): 'LgbmPredict' does not look like an operator defined in
    # the ai.onnx.ml domain — confirm against the ONNX operator list.
    container.add_node('LgbmPredict', input_name, probabilities_name,
                       op_domain='ai.onnx.ml', name=name)

    container.add_node('ArrayFeatureExtractor', probabilities_name, probability_name,
                       op_domain='ai.onnx.ml',
                       name=scope.get_unique_operator_name('ArrayFeatureExtractor'),
                       attr={'indices': [1]})

    # NOTE(review): only one input is passed and no threshold appears
    # anywhere; a comparison node presumably needs a second operand — confirm.
    container.add_node('GreaterOrEqual', probability_name, outputs[0].full_name,
                       op_domain='ai.onnx.ml', name=scope.get_unique_operator_name('GreaterOrEqual'),
                       attr={})
    # NOTE(review): this node reads and writes the same variable name —
    # confirm that this is valid in the generated graph.
    container.add_node('Cast', outputs[0].full_name, outputs[0].full_name,
                       op_domain='ai.onnx.ml',
                       name=scope.get_unique_operator_name('Cast'),
                       attr={'to': onnx_proto.TensorProto.INT64})

    # Set the number of classes
    # NOTE(review): indexing target_opset with [-1] and calling set_onnx_attr
    # on the result is not an API I can confirm — verify against skl2onnx.
    operator.target_opset[-1].set_onnx_attr("n_classes", 2)

# Register the Versicolor step with skl2onnx: alias, shape calculator,
# converter callback and the accepted conversion options.
update_registered_converter(
    VersicolorPredictionModel,
    'VersicolorLGBMClassifier',
    calculate_linear_classifier_output_shapes,
    convert_versicolor_model,
    options={
        'nocl': [True, False],
        'zipmap': [True, False, 'columns'],
    },
)

def convert_virginica_model(scope, operator, container):
    """Converter callback that adds the graph nodes for ``VirginicaPredictionModel``.

    Five nodes are added to *container*, in this order: ``Cast`` to FLOAT,
    ``ArrayFeatureExtractor`` with indices 0-3, ``Reshape``, ``Identity`` and
    ``SklToOnnxLinearClassifier`` writing to the operator's first output.

    Args:
        scope: supplies unique operator and variable names.
        operator: operator being converted; its first input and first output
            are used.
        container: receives the nodes through ``add_node``.
    """
    outputs = operator.outputs
    name = scope.get_unique_operator_name('VirginicaModel')
    input_name = operator.inputs[0].full_name

    output_name = scope.get_unique_variable_name('output')

    # Convert input to a float tensor
    container.add_node('Cast', input_name, input_name + '_casted', op_version=9,
                       to=onnx_proto.TensorProto.FLOAT)

    # Extract relevant features
    # The domain must be the bare string 'ai.onnx.ml'; markdown link syntax had
    # leaked into these literals.
    container.add_node('ArrayFeatureExtractor', input_name + '_casted', input_name + '_extracted',
                       op_domain='ai.onnx.ml', name=scope.get_unique_operator_name('ArrayFeatureExtractor'),
                       attr={'indices': [0, 1, 2, 3]})

    # Reshape to a single row
    # NOTE(review): the target shape is given as an attribute and fixed to
    # one row — confirm this matches the Reshape version in use and the
    # [None, 4] input declared for the conversion.
    container.add_node('Reshape', input_name + '_extracted', input_name + '_reshaped',
                       op_domain='ai.onnx.ml', name=scope.get_unique_operator_name('Reshape'),
                       attr={'shape': {'dims': [1, 4]}})

    # Convert to tensor
    container.add_node('Identity', input_name + '_reshaped', output_name,
                       op_domain='ai.onnx.ml', name=scope.get_unique_operator_name('Identity'))

    # Perform prediction
    # NOTE(review): 'SklToOnnxLinearClassifier' does not look like a defined
    # ONNX operator, `operator.raw_operator_` differs from the `raw_operator`
    # attribute used by the sibling converters, and the shape calculator's
    # return value is unpacked as keyword arguments — confirm all three.
    container.add_node('SklToOnnxLinearClassifier', output_name, outputs[0].full_name,
                       op_domain='ai.onnx.ml', name=name,
                       **calculate_linear_classifier_output_shapes(operator.raw_operator_, operator.target_opset))

    # Set the number of classes
    # NOTE(review): indexing target_opset with [-1] and calling set_onnx_attr
    # on the result is not an API I can confirm — verify against skl2onnx.
    operator.target_opset[-1].set_onnx_attr("n_classes", 2)

# Register the Virginica step with skl2onnx: alias, shape calculator,
# converter callback and the accepted conversion options.
update_registered_converter(
    VirginicaPredictionModel,
    'VirginicaRFClassifier',
    calculate_linear_classifier_output_shapes,
    convert_virginica_model,
    options={
        'nocl': [True, False],
        'zipmap': [True, False, 'columns'],
    },
)

# Convert the whole pipeline to ONNX, declaring one float input with four
# columns and pinning the opset of the default and ai.onnx.ml domains.
model_onnx = convert_sklearn(
    pipeline,
    name='pipeline_lgbrf',
    initial_types=[('input', FloatTensorType([None, 4]))],
    target_opset={'': 12, 'ai.onnx.ml': 2},
)

While running the last command for model_onnx it returns this error: RuntimeError: No known ways to retrieve the number of classes for class <class '__main__.SetosaPredictionModel'>.

Library versions:

I've come across some blogs trying to convert the pipeline but I need to have it for a custom model class. TIA!

xadupre commented 1 month ago

The full stack trace is not given, but I assume the error comes from the function calculate_linear_classifier_output_shapes, which tries to infer the number of output columns. You can either create your own function to calculate the output shape, or modify your model so that the function _infer_linear_classifier_output_types(operator) is able to guess the number of classes.