Got input with wrong type during conversion when using pipeline

onnx / sklearn-onnx

Convert scikit-learn models and pipelines to ONNX

Apache License 2.0

538 stars 99 forks source link

I am trying to convert a regression model that needs preprocessing over numerical and categorical features. The categorical branch converts fine, but the conversion fails on the numerical branch. The code is the following:

# %%
import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.pipeline import Pipeline

import skl2onnx
from skl2onnx.common.data_types import StringTensorType, Int32TensorType, FloatTensorType

from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
from mlprodict.onnxrt import OnnxInference

import onnxruntime as rt

# Location of the training data set (CSV) used by this reproduction.
path = "./datasets/unbalanced_unique.csv"

# %%
# Load the data; `head()` only previews the first rows in an interactive cell.
df_train = pd.read_csv(path)
df_train.head()

# %%
# Feature matrix: seven columns converted to a single NumPy array. The column
# order here fixes the positional indices used by the ColumnTransformer below.
# NOTE(review): the selection mixes string and numeric columns, so the array
# is presumably of dtype object — confirm.
X = df_train[["rows", "query_type", "query_size", "unique", "mode", "threads", "backend"]].to_numpy()
# Regression target.
y = df_train["execution_mean"].to_numpy()

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# %%
# Positional indices into X: rows, query_size, unique, threads.
numeric_features = [0, 2, 3, 5]
# Positional indices into X: query_type, mode, backend.
categorical_features = [1, 4, 6]

# Numeric branch: standard scaling only.
numeric_transformer = Pipeline(steps=[
    ('scaler', sklearn.preprocessing.StandardScaler())
])

# Categorical branch: one-hot encoding; categories unseen during fit are
# ignored at transform time instead of raising.
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output` — confirm against the installed version.
categorical_transformer = Pipeline(steps=[
    ('onehot', sklearn.preprocessing.OneHotEncoder(sparse=True, handle_unknown='ignore')),
])

# Route each group of columns to its own branch.
preprocessor = sklearn.compose.ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
])

# %%
# Full model: preprocessing followed by a gradient-boosting regressor.
reg = GradientBoostingRegressor()
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('reg', reg)
])
pipeline.fit(X_train, y_train)

# %%
# One declared ONNX input per feature column, each with shape [1, 1], listed
# in the same order as the columns of X.
# NOTE(review): the numeric inputs are declared with mixed element types
# (Int32 for rows/query_size/threads, Float for unique). The reported error
# concerns the merged numeric columns reaching the scaler with a type it does
# not accept; presumably declaring all numeric inputs with a type supported by
# the scaler converter avoids it — confirm against the skl2onnx documentation.
initial_type = [
    ("rows", Int32TensorType([1,1])),
    ("query_type", StringTensorType([1,1])),
    ("query_size", Int32TensorType([1,1])),
    ("unique", FloatTensorType([1,1])),
    ("mode", StringTensorType([1,1])),
    ("threads", Int32TensorType([1,1])),
    ("backend", StringTensorType([1,1])),
]

# %%
# Convert the fitted pipeline to ONNX (this is the call the report is about)
# and write the serialized model to disk.
onx = skl2onnx.convert_sklearn(pipeline, initial_types=initial_type)
with open("./out/regression.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The error message is:

RuntimeError: Operator SklearnScaler (type: SklearnScaler) got an input merged_columns with a wrong type <…>. Only [<…>, <…>, <…>] are allowed

(The type names inside the angle brackets were lost when this page was extracted.)

import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
import skl2onnx
from skl2onnx.common.data_types import (
    StringTensorType,
    Int32TensorType,
    FloatTensorType,
)
from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
from onnx.reference import ReferenceEvaluator
import onnxruntime as rt

# path = "./datasets/unbalanced_unique.csv"

# %%
# df_train = pd.read_csv(path)
# Two inline rows stand in for the CSV so the script runs without the data set.
df_train = pd.DataFrame(
    [
        {
            "rows": 5,
            "query_type": "A",
            "query_size": 4,
            "unique": 1,
            "mode": "E",
            "threads": 5,
            "backend": "ZZ",
            "execution_mean": 5.5,
        },
        {
            "rows": 4,
            "query_type": "B",
            "query_size": 2,
            "unique": 0,
            "mode": "FF",
            "threads": 5,
            "backend": "WWW",
            "execution_mean": 4.5,
        },
    ]
)
df_train.head()

# %%
# Feature matrix (column order fixes the positional indices below) and target.
X = df_train[
    ["rows", "query_type", "query_size", "unique", "mode", "threads", "backend"]
].to_numpy()
y = df_train["execution_mean"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# %%
# Positional indices into X for each preprocessing branch.
numeric_features = [0, 2, 3, 5]
categorical_features = [1, 4, 6]

numeric_transformer = Pipeline(
    steps=[("scaler", sklearn.preprocessing.StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[
        (
            "onehot",
            sklearn.preprocessing.OneHotEncoder(sparse=True, handle_unknown="ignore"),
        ),
    ]
)

preprocessor = sklearn.compose.ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# %%
# Preprocessing followed by the regressor, fitted on the training split.
reg = GradientBoostingRegressor()
pipeline = Pipeline([("preprocess", preprocessor), ("reg", reg)])
pipeline.fit(X_train, y_train)

# %%
# One declared ONNX input per feature column, in the same order as X.
initial_type = [
    ("rows", Int32TensorType([1, 1])),
    ("query_type", StringTensorType([1, 1])),
    ("query_size", Int32TensorType([1, 1])),
    ("unique", FloatTensorType([1, 1])),
    ("mode", StringTensorType([1, 1])),
    ("threads", Int32TensorType([1, 1])),
    ("backend", StringTensorType([1, 1])),
]

# %%
# Convert the fitted pipeline and write the serialized model to disk.
onx = skl2onnx.convert_sklearn(pipeline, initial_types=initial_type)
with open("regression.onnx", "wb") as f:
    f.write(onx.SerializeToString())

onnx / sklearn-onnx

Got input with wrong type during conversion when using pipeline #982