setfit example does not work

geraldstanje commented 5 months ago

Hi,

I took the following example and enabled the training part in the code: https://github.com/huggingface/setfit/blob/main/notebooks/setfit-onnx-optimum.ipynb

But example gives an error:

Traceback (most recent call last):
  File "/teamspace/studios/this_studio/setfit_test/setfit-onnx-optimum-example.py", line 205, in <module>
    main()
  File "/teamspace/studios/this_studio/setfit_test/setfit-onnx-optimum-example.py", line 190, in main
    onnx_setfit_model(test_dataset["text"][:2])
  File "/teamspace/studios/this_studio/setfit_test/setfit-onnx-optimum-example.py", line 126, in __call__
    return self.predict(inputs)
  File "/teamspace/studios/this_studio/setfit_test/setfit-onnx-optimum-example.py", line 123, in predict
    return self.model_head.predict(embeddings)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/sklearn/linear_model/_base.py", line 451, in predict
    scores = self.decision_function(X)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/sklearn/linear_model/_base.py", line 432, in decision_function
    X = self._validate_data(X, accept_sparse="csr", reset=False)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/sklearn/base.py", line 605, in _validate_data
    out = check_array(X, input_name="X", **check_params)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/sklearn/utils/validation.py", line 915, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/torch/_tensor.py", line 1062, in __array__
    return self.numpy()
TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

it looks like the function predict in OnnxSetFitModel needs to change as follows - does it mean it runs on the cpu and not gpu? does mean_pooling also run on the cpu?

    def predict(self, inputs):
        encoded_inputs = self.tokenizer(
            inputs, padding=True, truncation=True, return_tensors="pt"
        ).to(self.ort_model.device)

        outputs = self.ort_model(**encoded_inputs)
        embeddings = mean_pooling(
            outputs["last_hidden_state"], encoded_inputs["attention_mask"]
        )

        if embeddings.is_cuda:
            embeddings = embeddings.cpu()

        embeddings_np = embeddings.numpy()
        return self.model_head.predict(embeddings_np)

Code:

from pathlib import Path
from time import perf_counter

import evaluate
import numpy as np
import torch
from tqdm.auto import tqdm
import os

import matplotlib.pyplot as plt
import pandas as pd

from setfit import SetFitModel
from setfit import SetFitModel, Trainer, TrainingArguments

from datasets import load_dataset
from setfit.exporters.utils import mean_pooling
from optimum.onnxruntime import ORTModelForFeatureExtraction, AutoOptimizationConfig, ORTOptimizer
from transformers import AutoTokenizer

metric = evaluate.load("accuracy")

class PerformanceBenchmark:
    def __init__(self, model, dataset, optim_type):
        self.model = model
        self.dataset = dataset
        self.optim_type = optim_type

    def compute_accuracy(self):
        preds = self.model.predict(self.dataset["text"])
        labels = self.dataset["label"]
        accuracy = metric.compute(predictions=preds, references=labels)
        print(f"Accuracy on test set - {accuracy['accuracy']:.3f}")
        return accuracy

    def compute_size(self):
        state_dict = self.model.model_body.state_dict()
        tmp_path = Path("model.pt")
        torch.save(state_dict, tmp_path)
        # Calculate size in megabytes
        size_mb = Path(tmp_path).stat().st_size / (1024 * 1024)
        # Delete temporary file
        tmp_path.unlink()
        print(f"Model size (MB) - {size_mb:.2f}")
        return {"size_mb": size_mb}

    def time_model(self, query="that loves its characters and communicates something rather beautiful about human nature"):
        latencies = []
        # Warmup
        for _ in range(10):
            _ = self.model([query])
        # Timed run
        for _ in range(100):
            start_time = perf_counter()
            _ = self.model([query])
            latency = perf_counter() - start_time
            latencies.append(latency)
        # Compute run statistics
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        print(rf"Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f}")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.compute_accuracy())
        metrics[self.optim_type].update(self.time_model())
        return metrics

def plot_metrics(perf_metrics):
    df = pd.DataFrame.from_dict(perf_metrics, orient="index")

    for idx in df.index:
        df_opt = df.loc[idx]
        plt.errorbar(
            df_opt["time_avg_ms"],
            df_opt["accuracy"] * 100,
            xerr=df_opt["time_std_ms"],
            fmt="o",
            alpha=0.5,
            ms=df_opt["size_mb"] / 15,
            label=idx,
            capsize=5,
            capthick=1,
        )

    legend = plt.legend(loc="lower right")

    plt.ylim(63, 95)
    # Use the slowest model to define the x-axis range
    xlim = max([metrics["time_avg_ms"] for metrics in perf_metrics.values()]) * 1.2
    plt.xlim(0, xlim)
    plt.ylabel("Accuracy (%)")
    plt.xlabel("Average latency with batch_size=1 (ms)")
    plt.show()

class OnnxPerformanceBenchmark(PerformanceBenchmark):
    def __init__(self, *args, model_path, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_path = model_path

    def compute_size(self):
        size_mb = Path(self.model_path).stat().st_size / (1024 * 1024)
        print(f"Model size (MB) - {size_mb:.2f}")
        return {"size_mb": size_mb}

class OnnxSetFitModel:
    def __init__(self, ort_model, tokenizer, model_head):
        self.ort_model = ort_model
        self.tokenizer = tokenizer
        self.model_head = model_head

    def predict(self, inputs):
        encoded_inputs = self.tokenizer(
            inputs, padding=True, truncation=True, return_tensors="pt"
        ).to(self.ort_model.device)

        outputs = self.ort_model(**encoded_inputs)
        embeddings = mean_pooling(
            outputs["last_hidden_state"], encoded_inputs["attention_mask"]
        )
        return self.model_head.predict(embeddings)

    def __call__(self, inputs):
        return self.predict(inputs)

def main():
    # Set the TOKENIZERS_PARALLELISM environment variable
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'

    dataset = load_dataset("SetFit/sst2")
    #dataset
    train_dataset = dataset["train"]
    test_dataset = dataset["validation"]

    # Evaluate the uploaded model!
    #model = SetFitModel.from_pretrained("dkorat/bge-small-en-v1.5_setfit-sst2-english")
    #pb = PerformanceBenchmark(model=model, dataset=test_dataset, optim_type="bge-small (PyTorch)")
    #perf_metrics = pb.run_benchmark()

    # Fine-tune the base model and Evaluate!
    # Load pretrained model from the Hub
    model = SetFitModel.from_pretrained(
        "sentence-transformers/all-MiniLM-L6-v2" #"BAAI/bge-small-en-v1.5"
    )
    args = TrainingArguments(num_iterations=20)

    # Create trainer
    small_trainer = Trainer(
        model=model, args=args, train_dataset=train_dataset
    )
    # Train!
    small_trainer.train()

    # Save and push the model to the Hub (change the model name accordingly)
    model.save_pretrained("setfit-test-model-example")

    # Evaluate!
    pb = PerformanceBenchmark(
        model=small_trainer.model, dataset=test_dataset, optim_type="bge-small (base)"
    )
    perf_metrics = pb.run_benchmark()

    plot_metrics(perf_metrics)

    #!pip install optimum[onnxruntime-gpu] -qqq

    # Load a PyTorch model and export it to the ONNX format
    ort_model = ORTModelForFeatureExtraction.from_pretrained(
        "setfit-test-model-example", #"dkorat/bge-small-en-v1.5_setfit-sst2-english",
        export=True,
        provider="CUDAExecutionProvider",
    )

    # Create the optimizer
    optimizer = ORTOptimizer.from_pretrained(ort_model)

    # Optimize using the appropriate optimization strategy
    opt_model_path = optimizer.optimize(save_dir="bge_auto_opt_O2", optimization_config=AutoOptimizationConfig.O2())

    # Load the optimized ONNX model
    ort_model = ORTModelForFeatureExtraction.from_pretrained(opt_model_path, provider="CUDAExecutionProvider")

    # Load the optimized ONNX model
    tokenizer = AutoTokenizer.from_pretrained(opt_model_path, model_max_length=512)
    onnx_setfit_model = OnnxSetFitModel(ort_model, tokenizer, model.model_head)

    # Perform inference
    onnx_setfit_model(test_dataset["text"][:2])

    pb = OnnxPerformanceBenchmark(
        onnx_setfit_model,
        test_dataset,
        "bge-small (optimum ONNX)",
        model_path="bge_auto_opt_O2/model_optimized.onnx",
    )
    perf_metrics.update(pb.run_benchmark())

    plot_metrics(perf_metrics)

    print(f"Speedup: {perf_metrics['bge-small (PyTorch)']['time_avg_ms'] / perf_metrics['bge-small (optimum ONNX)']['time_avg_ms']:.2f}x")

if __name__ == "__main__":
    main()

Logs: setfit_test.txt

geraldstanje commented 5 months ago

@MosheWasserb

according to https://github.com/huggingface/setfit/pull/435/files and https://github.com/huggingface/setfit/blob/main/docs/source/en/tutorials/onnx.mdx there should be the following in the notebook:

self.model_head.predict(embeddings.cpu())

but https://raw.githubusercontent.com/huggingface/setfit/main/notebooks/setfit-onnx-optimum.ipynb uses:

self.model_head.predict(embeddings)

is that a bug?

geraldstanje commented 4 months ago

any infos team?

kitrakrev commented 1 month ago

Can I work on this?

huggingface / setfit

setfit example does not work #532