onnx / sklearn-onnx

Convert scikit-learn models and pipelines to ONNX
Apache License 2.0
538 stars 99 forks source link

Got input with wrong type during conversion when using pipeline #982

Open reuseman opened 1 year ago

reuseman commented 1 year ago

I am trying to convert a regression pipeline that preprocesses both numerical and categorical features. The categorical branch converts fine, but the conversion fails on the numerical branch (the scaler). The code is the following:

# %%
import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.pipeline import Pipeline

import skl2onnx
from skl2onnx.common.data_types import StringTensorType, Int32TensorType, FloatTensorType

from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
from mlprodict.onnxrt import OnnxInference

import onnxruntime as rt

# Location of the training data (relative to the working directory).
path = "./datasets/unbalanced_unique.csv"

# %%
# Load the CSV; the trailing expression only shows a preview when the cell is
# run interactively.
df_train = pd.read_csv(path)
df_train.head()

# %%
# Feature matrix and target as NumPy arrays. The order of `feature_columns`
# fixes the positional column indices used by the later cells.
feature_columns = [
    "rows",
    "query_type",
    "query_size",
    "unique",
    "mode",
    "threads",
    "backend",
]
X = df_train[feature_columns].to_numpy()
y = df_train["execution_mean"].to_numpy()

# Hold out 20% of the rows; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# %%
# Load the submodules explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.preprocessing` and `sklearn.compose` are available as
# attributes of the package.
import sklearn.compose
import sklearn.preprocessing

# Positional indices into X (a NumPy array, so columns are addressed by position).
numeric_features = [0, 2, 3, 5]     # rows, query_size, unique, threads
categorical_features = [1, 4, 6]    # query_type, mode, backend

# Numeric columns are standardized.
numeric_transformer = Pipeline(steps=[
    ('scaler', sklearn.preprocessing.StandardScaler())
])

# Categorical columns are one-hot encoded. `sparse=True` is intentionally not
# passed: that keyword was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output`) and removed in 1.4, and sparse output is the default under
# either name, so omitting it behaves the same on every version.
categorical_transformer = Pipeline(steps=[
    ('onehot', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its transformer.
preprocessor = sklearn.compose.ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# %%
# Chain the column preprocessing with a default gradient-boosting regressor
# and fit the whole pipeline on the training split.
reg = GradientBoostingRegressor()
pipeline = Pipeline(
    steps=[
        ('preprocess', preprocessor),
        ('reg', reg),
    ]
)
pipeline.fit(X=X_train, y=y_train)

# %%
# Declared ONNX inputs: one (name, tensor type) pair per feature column, in
# the same order as the columns of X. Every input is given the shape [1, 1].
_input_spec = (
    ("rows", Int32TensorType),
    ("query_type", StringTensorType),
    ("query_size", Int32TensorType),
    ("unique", FloatTensorType),
    ("mode", StringTensorType),
    ("threads", Int32TensorType),
    ("backend", StringTensorType),
)
initial_type = [
    (input_name, tensor_cls([1, 1])) for input_name, tensor_cls in _input_spec
]

# %%
# Convert the fitted pipeline to ONNX and write the serialized model to disk.
from pathlib import Path

onx = skl2onnx.convert_sklearn(pipeline, initial_types=initial_type)

# Create the output directory first: opening a file inside a directory that
# does not exist raises FileNotFoundError.
output_path = Path("./out/regression.onnx")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "wb") as f:
    f.write(onx.SerializeToString())

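The error message is:

RuntimeError: Operator SklearnScaler (type: SklearnScaler) got an input merged_columns with a wrong type <…>. Only [<…>, <…>, <…>] are allowed

(Note: the type names in the message, apparently printed in angle brackets as `<class '…'>`, were lost when this page was extracted as plain text; `<…>` marks each missing name.)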
xadupre commented 1 year ago

I tried with a dummy dataset and it works. Maybe pandas inferred a different type for one of the columns, for example because a row in the CSV is misaligned, or for some other reason.

import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.pipeline import Pipeline

import skl2onnx
from skl2onnx.common.data_types import (
    StringTensorType,
    Int32TensorType,
    FloatTensorType,
)

from pyquickhelper.helpgen.graphviz_helper import plot_graphviz
from onnx.reference import ReferenceEvaluator

import onnxruntime as rt

# The CSV from the original report is not used here:
# path = "./datasets/unbalanced_unique.csv"

# %%
# df_train = pd.read_csv(path)
# Instead, a two-row stand-in frame is built inline, one entry per column.
df_train = pd.DataFrame(
    {
        "rows": [5, 4],
        "query_type": ["A", "B"],
        "query_size": [4, 2],
        "unique": [1, 0],
        "mode": ["E", "FF"],
        "threads": [5, 5],
        "backend": ["ZZ", "WWW"],
        "execution_mean": [5.5, 4.5],
    }
)
# Preview only; has an effect when the cell is run interactively.
df_train.head()

# %%
# Feature matrix and target as NumPy arrays. The order of `feature_columns`
# fixes the positional column indices used by the later cells.
feature_columns = [
    "rows",
    "query_type",
    "query_size",
    "unique",
    "mode",
    "threads",
    "backend",
]
X = df_train[feature_columns].to_numpy()
y = df_train["execution_mean"].to_numpy()

# Hold out 20% of the rows; the seed is fixed so the split is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# %%
# Load the submodules explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.preprocessing` and `sklearn.compose` are available as
# attributes of the package.
import sklearn.compose
import sklearn.preprocessing

# Positional indices into X (a NumPy array, so columns are addressed by position).
numeric_features = [0, 2, 3, 5]  # rows, query_size, unique, threads
categorical_features = [1, 4, 6]  # query_type, mode, backend

# Numeric columns are standardized.
numeric_transformer = Pipeline(
    steps=[("scaler", sklearn.preprocessing.StandardScaler())]
)

# Categorical columns are one-hot encoded. `sparse=True` is intentionally not
# passed: that keyword was deprecated in scikit-learn 1.2 (renamed to
# `sparse_output`) and removed in 1.4, and sparse output is the default under
# either name, so omitting it behaves the same on every version.
categorical_transformer = Pipeline(
    steps=[
        (
            "onehot",
            sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore"),
        ),
    ]
)

# Route each group of columns to its transformer.
preprocessor = sklearn.compose.ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# %%
# Preprocessing followed by a default gradient-boosting regressor, fitted on
# the training split.
reg = GradientBoostingRegressor()
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("reg", reg),
    ]
)
pipeline.fit(X=X_train, y=y_train)

# %%
# Declared ONNX inputs, in the same order as the columns of X. Names and
# tensor types are listed side by side and paired up below; every input is
# given the shape [1, 1].
input_names = [
    "rows",
    "query_type",
    "query_size",
    "unique",
    "mode",
    "threads",
    "backend",
]
input_tensor_types = [
    Int32TensorType,
    StringTensorType,
    Int32TensorType,
    FloatTensorType,
    StringTensorType,
    Int32TensorType,
    StringTensorType,
]
initial_type = [
    (input_name, make_tensor([1, 1]))
    for input_name, make_tensor in zip(input_names, input_tensor_types)
]

# %%
# Convert the fitted pipeline to ONNX, then write the serialized model into
# the current working directory.
onx = skl2onnx.convert_sklearn(pipeline, initial_types=initial_type)
with open("regression.onnx", mode="wb") as model_file:
    model_bytes = onx.SerializeToString()
    model_file.write(model_bytes)