bentoml / BentoML

The easiest way to serve AI apps and models - Build reliable Inference APIs, LLM apps, Multi-model chains, RAG service, and much more!
https://bentoml.com
Apache License 2.0

bug: bentoml not support gpu in mlflow? #4492

Open BangDaeng opened 8 months ago

BangDaeng commented 8 months ago

Describe the bug

I save the model with MLflow (Sentence Transformers) and then import it into BentoML:

def save_model_to_mlflow(self, version):
    # Infer the model signature from sample input/output data
    signature = mlflow.models.infer_signature(
        self.input_data, self.output_data
    )
    # Log the sentence-transformers model and register it in the MLflow model registry
    model_info: mlflow.models.model.ModelInfo = (
        mlflow.sentence_transformers.log_model(
            model=self.model,
            artifact_path=self.model_name,
            signature=signature,
            registered_model_name=self.model_name,
        )
    )

    self.mlflow_helper.update_model_description(
        self.model_name, model_info.run_id, "test sentence bert model"
    )
    self.mlflow_helper.update_model_tag(
        self.model_name,
        model_info.run_id,
        {"ct": "true", "model_version": version},
    )

# Import the registered MLflow model into the BentoML model store
bentoml.mlflow.import_model(
    tag,
    model_uri=version.source,
    signatures={"predict": {"batchable": batchable}},
)
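
For reference, tag, version, and batchable in the snippet above come from surrounding code that is not shown. A minimal sketch of how such values could be obtained from the MLflow model registry (the names and the lookup below are illustrative, not my exact code):

from mlflow.tracking import MlflowClient

MODEL_NAME = "test_sentence_bert"   # illustrative registered model name

client = MlflowClient()
# Take the most recently registered version of the model (illustrative lookup)
version = client.get_latest_versions(MODEL_NAME)[0]
tag = MODEL_NAME      # reused as the BentoML model tag
batchable = True      # enable adaptive batching for the "predict" signature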

And below is service.py:

from typing import List

import bentoml
from bentoml.io import JSON, NumpyNdarray

from constant import BUILD_NAME, MODEL_NAME

# Fetch the imported MLflow model from the BentoML model store
sbert_model = bentoml.mlflow.get(MODEL_NAME)
_sbert_runnable = sbert_model.to_runnable()

class TestSentenceBert(_sbert_runnable):
    def __init__(self):
        super().__init__()

    @bentoml.Runnable.method(batchable=True, batch_dim=0)
    def predict(self, sentences: List[str]):
        output = super().predict(sentences)
        return output

sbert_runner = bentoml.Runner(TestSentenceBert)
svc = bentoml.Service(
    BUILD_NAME, runners=[sbert_runner], models=[sbert_model]
)

samples = [
    "안녕",  # "hello"
    "게임",  # "game"
]

@svc.api(
    input=JSON.from_sample(samples),
    output=NumpyNdarray(),
    route=BUILD_NAME,
)
async def predict(sentences):
    output = await sbert_runner.predict.async_run(sentences)
    return output

How can I use the GPU? The model does not end up on the GPU, and I want the equivalent of model.to("cuda:0").

To reproduce

No response

Expected behavior

No response

Environment

newest version

KimSoungRyoul commented 6 months ago

Hi, I think this seems to be an intentional feature of BentoML.

There is no way for BentoML to check whether an MLflow model uses the GPU.

You can see more detail at the link below:

https://github.com/bentoml/BentoML/blob/main/src/bentoml/_internal/frameworks/mlflow.py#L246

    # https://github.com/bentoml/BentoML/blob/main/src/bentoml/_internal/frameworks/mlflow.py#L246
    class MLflowPyfuncRunnable(bentoml.Runnable):
        # The only case that multi-threading may not be supported is when user define a
        # custom python_function MLflow model with pure python code, but there's no way
        # of telling that from the MLflow model metadata. It should be a very rare case,
        # because most custom python_function models are likely numpy code or model
        # inference with pre/post-processing code.

        SUPPORTED_RESOURCES = ("cpu", ) 
        SUPPORTS_CPU_MULTI_THREADING = True
        ...

Have you tried this?


class TestSentenceBert(_sbert_runnable):  # override the generated runnable
    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")  # <-- declare GPU support so the runner can be scheduled on a GPU
    SUPPORTS_CPU_MULTI_THREADING = True

    def __init__(self):
        super().__init__()

    @bentoml.Runnable.method(batchable=True, batch_dim=0)
    def predict(self, sentences: List[str]):
        output = super().predict(sentences)
        return output
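
Note that declaring GPU support on the runnable only lets BentoML schedule the runner on a GPU (the runner also needs a GPU allocated in the runtime configuration, e.g. a nvidia.com/gpu entry under runners.resources in bentoml_configuration.yaml); the loaded model still has to be moved to the device. A possible workaround, sketched under the assumption that the imported artifact keeps the native sentence-transformers model in an "mlflow_model" sub-folder of the stored BentoML model (that folder name and the CudaSentenceBert class are my assumptions, not from the original report), is to load the native model directly and place it on CUDA:

from typing import List

import bentoml
import mlflow.sentence_transformers
import torch

from constant import MODEL_NAME  # same constant as in service.py

sbert_model = bentoml.mlflow.get(MODEL_NAME)

class CudaSentenceBert(bentoml.Runnable):
    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
    SUPPORTS_CPU_MULTI_THREADING = True

    def __init__(self):
        # Load the native SentenceTransformer instead of the pyfunc wrapper
        # so it can be moved to the GPU explicitly.
        # "mlflow_model" is an assumed sub-folder name inside the stored model.
        uri = sbert_model.path_of("mlflow_model")
        self.model = mlflow.sentence_transformers.load_model(uri)
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    @bentoml.Runnable.method(batchable=True, batch_dim=0)
    def predict(self, sentences: List[str]):
        # encode() runs on the device the model was moved to
        return self.model.encode(sentences, device=self.device)

sbert_runner = bentoml.Runner(CudaSentenceBert, models=[sbert_model])

With this, the runner still participates in adaptive batching through the @bentoml.Runnable.method decorator, and the service definition (bentoml.Service(..., runners=[sbert_runner], models=[sbert_model])) stays the same as in service.py.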