vllm-project / vllm

A high-throughput and memory-efficient inference and serving engine for LLMs
https://docs.vllm.ai
Apache License 2.0

[Usage]: Phi-3 Vision maximum context length issue #6301

Closed: tusharraskar closed this issue 4 months ago

tusharraskar commented 4 months ago

### Your current environment

```python
import base64

from langchain_community.llms import VLLMOpenAI
from langchain_core.messages import HumanMessage

llm = VLLMOpenAI(
    openai_api_key="EMPTY",
    openai_api_base=api_base,  # points at the vLLM OpenAI-compatible server
    model_name="microsoft/Phi-3-vision-128k-instruct",
    model_kwargs={"stop": ["."]},
)

image_path = "invoice_data_images/Screenshot 2024-05-02 160946_page_1.png"
with open(image_path, "rb") as image_file:
    image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

prompt_1 = "Give me the invoice date from the given image."
messages = [
    HumanMessage(
        content=[
            {"type": "text", "text": prompt_1},
            {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
        ]
    )
]

response = llm.invoke(messages)
print(response)
```

### Error

```
{
  "name": "BadRequestError",
  "message": "Error code: 400 - {'object': 'error', 'message': \"This model's maximum context length is 3744 tokens. However, you requested 254457 tokens (254201 in the messages, 256 in the completion). Please reduce the length of the messages or completion.\", 'type': 'BadRequestError', 'param': None, 'code': 400}"
}
```

Traceback:

```
BadRequestError                           Traceback (most recent call last)
Cell In[96], line 26
     25 # Invoke the model with the message
---> 26 response = llm.invoke(messages)
     27 print(response)

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/langchain_core/language_models/llms.py:346, in BaseLLM.invoke(self, input, config, stop, **kwargs)
--> 346 self.generate_prompt(

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/langchain_core/language_models/llms.py:703, in BaseLLM.generate_prompt(self, prompts, stop, callbacks, **kwargs)
--> 703 return self.generate(prompt_strings, stop=stop, callbacks=callbacks, **kwargs)

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/langchain_core/language_models/llms.py:882, in BaseLLM.generate(self, prompts, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)
--> 882 output = self._generate_helper(

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/langchain_core/language_models/llms.py:740, in BaseLLM._generate_helper(self, prompts, stop, run_managers, new_arg_supported, **kwargs)
--> 740 raise e

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/langchain_core/language_models/llms.py:727, in BaseLLM._generate_helper(self, prompts, stop, run_managers, new_arg_supported, **kwargs)
--> 727 self._generate(

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/langchain_community/llms/openai.py:464, in BaseOpenAI._generate(self, prompts, stop, run_manager, **kwargs)
--> 464 response = completion_with_retry(

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/langchain_community/llms/openai.py:119, in completion_with_retry(llm, run_manager, **kwargs)
--> 119 return llm.client.create(**kwargs)

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/openai/_utils/_utils.py:277, in required_args.<locals>.inner.<locals>.wrapper(*args, **kwargs)
--> 277 return func(*args, **kwargs)

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/openai/resources/completions.py:528, in Completions.create(self, model, prompt, best_of, echo, frequency_penalty, logit_bias, logprobs, max_tokens, n, presence_penalty, seed, stop, stream, stream_options, suffix, temperature, top_p, user, extra_headers, extra_query, extra_body, timeout)
--> 528 return self._post(

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/openai/_base_client.py:1261, in SyncAPIClient.post(self, path, cast_to, body, options, files, stream, stream_cls)
--> 1261 return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/openai/_base_client.py:942, in SyncAPIClient.request(self, cast_to, options, remaining_retries, stream, stream_cls)
--> 942 return self._request(

File ~/SapidBlue/invoice_data_extraction/lightllm_xinf/venv/lib/python3.8/site-packages/openai/_base_client.py:1041, in SyncAPIClient._request(self, cast_to, options, remaining_retries, stream, stream_cls)
--> 1041 raise self._make_status_error_from_response(err.response) from None

BadRequestError: Error code: 400 - {'object': 'error', 'message': "This model's maximum context length is 8192 tokens. However, you requested 254457 tokens (254201 in the messages, 256 in the completion). Please reduce the length of the messages or completion.", 'type': 'BadRequestError', 'param': None, 'code': 400}
```



### How would you like to use vllm

I hosted vLLM on an EC2 instance and want to extract text data from images.

I hosted Phi-3 Vision using the CLI command:

```
python3 -m vllm.entrypoints.openai.api_server --port 8000 --model microsoft/Phi-3-vision-128k-instruct --trust-remote-code --dtype=half --max_model_len=8192
```

However, when I increase max_model_len, I get a CUDA out of memory error.

When I use microsoft/Phi-3-vision-128k-instruct from Hugging Face, I don't encounter any issues.
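
For reference, a sketch of server-side settings that are sometimes used to fit a longer context on a single GPU. The specific values, and whether a larger context actually fits on this instance, are assumptions that depend on the installed vLLM version and the GPU, so treat this as something to experiment with rather than a known fix:

```
# Sketch only (hypothetical values): give the KV cache a larger share of GPU
# memory, cap the number of concurrent sequences, and skip CUDA graph capture
# to lower peak memory usage.
python3 -m vllm.entrypoints.openai.api_server \
    --port 8000 \
    --model microsoft/Phi-3-vision-128k-instruct \
    --trust-remote-code \
    --dtype=half \
    --max-model-len 16384 \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 4 \
    --enforce-eager
```

If that still runs out of memory, spreading the model across multiple GPUs with --tensor-parallel-size (or using a larger instance) is the usual next step.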
DarkLight1337 commented 4 months ago

Please format your message properly as it's a bit difficult to read.

It looks like you're using the LangChain vLLM integration. Can you try using vLLM directly and see if the same problem occurs?
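
The 400 above also hints at what is going wrong: the request hit the text /completions endpoint with roughly 254k prompt tokens, which is consistent with the base64 data URL being flattened into the prompt string by the completions-style VLLMOpenAI wrapper rather than being sent as an image. Below is a minimal sketch of querying the server directly with the official openai client's chat endpoint; the host/port placeholder and vision support for this model in your installed vLLM version are assumptions to verify:

```python
# Sketch: query the vLLM OpenAI-compatible server directly, sending the image
# through the chat completions endpoint so it is handled as an image rather
# than as raw prompt text. "http://<ec2-host>:8000/v1" is a placeholder.
import base64

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://<ec2-host>:8000/v1")

with open("invoice_data_images/Screenshot 2024-05-02 160946_page_1.png", "rb") as f:
    image_base64 = base64.b64encode(f.read()).decode("utf-8")

response = client.chat.completions.create(
    model="microsoft/Phi-3-vision-128k-instruct",
    max_tokens=256,
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Give me the invoice date from the given image."},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_base64}"}},
            ],
        }
    ],
)
print(response.choices[0].message.content)
```

If this works, the context-length error is coming from how the LangChain completions wrapper serializes the message, not from vLLM itself.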