bentoml / BentoML

The easiest way to serve AI apps and models - Build Model Inference APIs, Job queues, LLM apps, Multi-model pipelines, and more!
https://bentoml.com
Apache License 2.0

No better performance for concurrent requests using batching #3141

Open katerinafrid opened 2 years ago

katerinafrid commented 2 years ago

Discussed in https://github.com/bentoml/BentoML/discussions/3137

Originally posted by **katerinafrid** October 24, 2022

I'm trying to speed up the processing of requests to my PyTorch model, but I do not see any improvement compared to standard sequential processing. Am I doing something wrong? First I run utils.py, then I start the server by running server.py.

utils.py

```python
import bentoml
import torch
import transformers
from transformers.pipelines import SUPPORTED_TASKS


class NLUPipeline(transformers.Pipeline):
    def preprocess(self, inputs):
        return self.tokenizer(
            inputs['premise_list'],
            inputs['hypothesis_list'],
            truncation='only_first',
            max_length=128,
            padding=True,
            return_tensors='pt',
        ).to(self.device)

    def _sanitize_parameters(self, **kwargs):
        return {}, {}, {}

    def _forward(self, model_inputs):
        model_out = self.model(**model_inputs)
        return model_out.logits

    @classmethod
    def postprocess(cls, model_outputs):
        return model_outputs


class NLURunnable(bentoml.Runnable):
    SUPPORT_NVIDIA_GPU = True
    SUPPORTED_RESOURCES = ('cuda' if torch.cuda.is_available() else 'cpu',)
    SUPPORTS_CPU_MULTI_THREADING = True

    def __init__(self):
        # load the model instance
        self.nlu_model = bentoml.transformers.load_model(
            "nlu_pl:latest",
            device=0 if torch.cuda.is_available() else -1,
        )

    @bentoml.Runnable.method(batchable=True, batch_dim=0)
    def predict(self, input_data):
        return self.nlu_model(input_data)


def register_nlu_pipeline():
    TASK_NAME = "zero-shot-classification"
    TASK_DEFINITION = {
        "impl": NLUPipeline,
        "tf": (),
        "pt": (transformers.AutoModelForSequenceClassification,),
        "default": {},
        "type": "text",
    }
    SUPPORTED_TASKS[TASK_NAME] = TASK_DEFINITION


def create_nlu_pipeline(nlu_model_path: str = default_nlu_model_path):
    classifier = transformers.pipeline(
        task="zero-shot-classification",
        model=transformers.AutoModelForSequenceClassification.from_pretrained(nlu_model_path),
        tokenizer=transformers.AutoTokenizer.from_pretrained(nlu_model_path),
    )
    return classifier


nlu_pl = create_nlu_pipeline('path')

bentoml.transformers.save_model(
    'nlu_pl',
    pipeline=nlu_pl,
    signatures={
        "predict": {
            "batchable": True,
            "batch_dim": 0,
        },
    },
)
```

server.py

```python
from typing import Any, Dict, List

import bentoml
from bentoml.io import JSON
from pydantic import BaseModel

# NLURunnable and register_nlu_pipeline are defined in utils.py above
from utils import NLURunnable, register_nlu_pipeline

nlu_model = bentoml.transformers.get("nlu_pl:latest")
nlu_runner = bentoml.Runner(
    NLURunnable,
    models=[nlu_model],
    method_configs={"predict": {"max_batch_size": 16, "max_latency_ms": 600}},
)

register_nlu_pipeline()

svc = bentoml.Service("server", runners=[nlu_runner])


class NLURequest(BaseModel):
    premise_list: List[str]
    hypothesis_list: List[str]


@svc.api(input=JSON(pydantic_model=NLURequest), output=JSON())
async def nlu_request(json: NLURequest) -> Dict[str, Any]:
    req_body = {"premise_list": json.premise_list, "hypothesis_list": json.hypothesis_list}
    response = await nlu_runner.predict.async_run(req_body)
    return {"result": response.cpu().numpy()}
```
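Adaptive batching only pays off when many independent requests arrive close together on the server side, so a quick sanity check is a concurrent client. The sketch below is illustrative only and not from the thread; it assumes the service from server.py is running locally on BentoML's default port 3000 and exposing the `/nlu_request` endpoint defined above, and the URL, worker count, and payloads are placeholder test values.

```python
# Illustrative load-test sketch (assumptions: local service on port 3000, the
# /nlu_request endpoint from server.py above, placeholder payloads).
from concurrent.futures import ThreadPoolExecutor

import requests  # any HTTP client works; requests is assumed to be installed

URL = "http://127.0.0.1:3000/nlu_request"


def one_request(i: int) -> float:
    payload = {
        "premise_list": [f"premise {i}"],
        "hypothesis_list": [f"hypothesis {i}"],
    }
    resp = requests.post(URL, json=payload, timeout=30)
    resp.raise_for_status()
    return resp.elapsed.total_seconds()


if __name__ == "__main__":
    # Many small concurrent requests give the adaptive batcher something to group;
    # a single large request is handled as one runner call and sees no batching benefit.
    with ThreadPoolExecutor(max_workers=32) as pool:
        latencies = list(pool.map(one_request, range(100)))
    print(f"mean request latency: {sum(latencies) / len(latencies):.3f}s")
```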
pi2cto commented 2 years ago

I am experiencing an error with batching enabled:

2022-11-08T13:07:08+0000 [INFO] [api_server:1] 127.0.0.1:33304 (scheme=http,method=POST,path=/v1/get_intents,type=application/json,length=91) (status=200,type=application/json,length=20) 1450.984ms (trace=f8472c38b374f57a7213989491a40acc,span=c32005903caa5b5f,sampled=0)

Traceback (most recent call last):
  File "/workspace/personality_framework/personality_service/bento_service.py", line 238, in get_intent
    result=await runner1.is_positive.async_run([{"sentence":query}])
  File "/tmp/e2/lib/python3.8/site-packages/bentoml/_internal/runner/runner.py", line 53, in async_run
    return await self.runner._runner_handle.async_run_method(  # type: ignore
  File "/tmp/e2/lib/python3.8/site-packages/bentoml/_internal/runner/runner_handle/remote.py", line 207, in async_run_method
    raise ServiceUnavailable(body.decode()) from None

Without batching, the same code works well.

My batching configuration is `enabled: true`, `max_batch_size: 100`, `max_latency_ms: 1000`.

Without batching, load testing with 100 simultaneous requests completes without errors; with batching enabled, I get the error above.
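For reference, the same limits can also be attached to a single runner method where the runner is constructed, as in katerinafrid's server.py above. The sketch below is illustrative only: the `is_positive` method name and the 100/1000 limits come from this comment and the traceback, while the runnable class and its body are made-up placeholders.

```python
# Illustrative sketch: per-method batching limits passed at runner construction.
# Only the method name and the 100/1000 limits come from the thread; the rest
# is a made-up placeholder.
import bentoml


class SentimentRunnable(bentoml.Runnable):
    SUPPORTED_RESOURCES = ("cpu",)
    SUPPORTS_CPU_MULTI_THREADING = True

    @bentoml.Runnable.method(batchable=True, batch_dim=0)
    def is_positive(self, samples):
        # Batched call: `samples` is a list of inputs; return one result per input.
        return [True for _ in samples]


runner1 = bentoml.Runner(
    SentimentRunnable,
    name="sentiment_runner",
    method_configs={"is_positive": {"max_batch_size": 100, "max_latency_ms": 1000}},
)
```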

frostming commented 4 months ago

Can you try the latest BentoML version with the new service API?
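For anyone finding this later, a rough sketch of what the batched endpoint might look like on the newer `@bentoml.service` API is below; the model path, candidate labels, and batching limits are placeholders rather than a tested port of the code above.

```python
# Hypothetical sketch of the zero-shot endpoint on the newer service API
# (BentoML >= 1.2); names and values here are placeholders, not from the thread.
import bentoml
import transformers


@bentoml.service
class NLUService:
    def __init__(self) -> None:
        # Placeholder model path, mirroring the 'path' argument in utils.py above.
        self.pipeline = transformers.pipeline(
            task="zero-shot-classification",
            model="path",
        )

    # In the new API, adaptive batching is configured on the endpoint itself;
    # a batchable endpoint receives a list of inputs and returns a list of results.
    @bentoml.api(batchable=True, max_batch_size=16, max_latency_ms=600)
    def predict(self, premises: list[str]) -> list[dict]:
        candidate_labels = ["hypothesis"]  # illustrative labels only
        return self.pipeline(premises, candidate_labels=candidate_labels)
```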