Closed: Edisonwei54 closed this issue 3 months ago.
Traceback (most recent call last): File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 150, in _load_lora lora = self._lora_model_cls.from_local_checkpoint( File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/models.py", line 246, in from_local_checkpoint return cls.from_lora_tensors( File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/models.py", line 150, in from_lora_tensors module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/utils.py", line 89, in parse_fine_tuned_lora_name assert parts[0] == "base_model" AssertionError
The above exception was the direct cause of the following exception:
Traceback (most recent call last): File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 38, in _raise_exception_on_finish task.result() File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 501, in run_engine_loop has_requests_in_progress = await asyncio.wait_for( File "/opt/conda/envs/vllm/lib/python3.10/asyncio/tasks.py", line 445, in wait_for return fut.result() File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 475, in engine_step request_outputs = await self.engine.step_async() File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 221, in step_async output = await self.model_executor.execute_model_async( File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/executor/distributed_gpu_executor.py", line 110, in execute_model_async all_outputs = await self._run_workers_async("execute_model", File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/executor/ray_gpu_executor.py", line 326, in _run_workers_async all_outputs = await asyncio.gather(coros) File "/opt/conda/envs/vllm/lib/python3.10/concurrent/futures/thread.py", line 58, in run result = self.fn(self.args, self.kwargs) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 146, in execute_method raise e File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 137, in execute_method return executor(*args, *kwargs) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(args, kwargs) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker.py", line 249, in execute_model output = self.model_runner.execute_model(seq_group_metadata_list, File "/opt/conda/envs/vllm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, **kwargs) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 790, in execute_model self.set_active_loras(lora_requests, lora_mapping) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 901, in set_active_loras self.lora_manager.set_active_loras(lora_requests, lora_mapping) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 113, in set_active_loras self._apply_loras(lora_requests) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 235, in _apply_loras self.add_lora(lora) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 243, in add_lora lora = self._load_lora(lora_request) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 162, in _load_lora raise RuntimeError( RuntimeError: Loading lora /home/greatwall/app/edison/output/qwen1half-14b-chat/v65-20240515-143141/checkpoint-1110 failed
The above exception was the direct cause of the following exception:
Traceback (most recent call last): File "uvloop/cbhandles.pyx", line 63, in uvloop.loop.Handle._run File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 45, in _raise_exception_on_finish raise AsyncEngineDeadError( vllm.engine.async_llm_engine.AsyncEngineDeadError: Task finished unexpectedly. This should never happen! Please open an issue on Github. See stack trace above for the actual cause. INFO 05-17 08:38:11 async_llm_engine.py:154] Aborted request cmpl-8b4fdb5f840a472985d41587f7208686. INFO: 192.168.26.100:56198 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error ERROR: Exception in ASGI application Traceback (most recent call last): File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 150, in _load_lora lora = self._lora_model_cls.from_local_checkpoint( File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/models.py", line 246, in from_local_checkpoint return cls.from_lora_tensors( File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/models.py", line 150, in from_lora_tensors module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/utils.py", line 89, in parse_fine_tuned_lora_name assert parts[0] == "base_model" AssertionError
The above exception was the direct cause of the following exception:
Traceback (most recent call last): File "/opt/conda/envs/vllm/lib/python3.10/site-packages/uvicorn/protocols/http/httptools_impl.py", line 411, in run_asgi result = await app( # type: ignore[func-returns-value] File "/opt/conda/envs/vllm/lib/python3.10/site-packages/uvicorn/middleware/proxy_headers.py", line 69, in call return await self.app(scope, receive, send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/fastapi/applications.py", line 1054, in call await super().call(scope, receive, send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/applications.py", line 123, in call await self.middleware_stack(scope, receive, send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/middleware/errors.py", line 186, in call raise exc File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/middleware/errors.py", line 164, in call await self.app(scope, receive, _send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/middleware/cors.py", line 85, in call await self.app(scope, receive, send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/middleware/exceptions.py", line 65, in call await wrap_app_handling_exceptions(self.app, conn)(scope, receive, send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app raise exc File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app await app(scope, receive, sender) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/routing.py", line 756, in call await self.middleware_stack(scope, receive, send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/routing.py", line 776, in app await route.handle(scope, receive, send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/routing.py", line 297, in handle await self.app(scope, receive, send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/routing.py", line 77, in app await wrap_app_handling_exceptions(app, request)(scope, receive, send) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/_exception_handler.py", line 64, in wrapped_app raise exc File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/_exception_handler.py", line 53, in wrapped_app await app(scope, receive, sender) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/starlette/routing.py", line 72, in app response = await func(request) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/fastapi/routing.py", line 278, in app raw_response = await run_endpoint_function( File "/opt/conda/envs/vllm/lib/python3.10/site-packages/fastapi/routing.py", line 191, in run_endpoint_function return await dependant.call(values) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 99, in create_chat_completion generator = await openai_serving_chat.create_chat_completion( File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/serving_chat.py", line 138, in create_chat_completion return await self.chat_completion_full_generator( File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/serving_chat.py", line 301, in chat_completion_full_generator async for res in result_generator: File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 666, in generate raise e File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", 
line 660, in generate async for request_output in stream: File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 77, in anext raise result File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 38, in _raise_exception_on_finish task.result() File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 501, in run_engine_loop has_requests_in_progress = await asyncio.wait_for( File "/opt/conda/envs/vllm/lib/python3.10/asyncio/tasks.py", line 445, in wait_for return fut.result() File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 475, in engine_step request_outputs = await self.engine.step_async() File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/engine/async_llm_engine.py", line 221, in step_async output = await self.model_executor.execute_model_async( File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/executor/distributed_gpu_executor.py", line 110, in execute_model_async all_outputs = await self._run_workers_async("execute_model", File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/executor/ray_gpu_executor.py", line 326, in _run_workers_async all_outputs = await asyncio.gather(coros) File "/opt/conda/envs/vllm/lib/python3.10/concurrent/futures/thread.py", line 58, in run result = self.fn(self.args, self.kwargs) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 146, in execute_method raise e File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 137, in execute_method return executor(*args, kwargs) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(*args, *kwargs) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/worker.py", line 249, in execute_model output = self.model_runner.execute_model(seq_group_metadata_list, File "/opt/conda/envs/vllm/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context return func(args, kwargs) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 790, in execute_model self.set_active_loras(lora_requests, lora_mapping) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 901, in set_active_loras self.lora_manager.set_active_loras(lora_requests, lora_mapping) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 113, in set_active_loras self._apply_loras(lora_requests) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 235, in _apply_loras self.add_lora(lora) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 243, in add_lora lora = self._load_lora(lora_request) File "/opt/conda/envs/vllm/lib/python3.10/site-packages/vllm/lora/worker_manager.py", line 162, in _load_lora raise RuntimeError( RuntimeError: Loading lora /home/greatwall/app/edison/output/qwen1half-14b-chat/v65-20240515-143141/checkpoint-1110 failed
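The root failure is the assertion `assert parts[0] == "base_model"` in `parse_fine_tuned_lora_name`, i.e. vLLM expects every tensor key in the adapter to start with the PEFT-style `base_model.` prefix. A minimal diagnostic sketch to see what this checkpoint actually contains (it assumes the adapter was saved as `adapter_model.safetensors` or `adapter_model.bin` inside the checkpoint directory; adjust the filename if your training framework uses a different one):

```python
# Diagnostic sketch, not part of vLLM: print the tensor keys stored in the LoRA
# checkpoint so we can see whether they start with "base_model", which is what
# vllm/lora/utils.py::parse_fine_tuned_lora_name asserts on.
import os

ckpt = "/home/greatwall/app/edison/output/qwen1half-14b-chat/v65-20240515-143141/checkpoint-1110"
st_path = os.path.join(ckpt, "adapter_model.safetensors")  # assumed filename
bin_path = os.path.join(ckpt, "adapter_model.bin")         # assumed fallback

if os.path.exists(st_path):
    from safetensors.torch import load_file
    tensors = load_file(st_path)
else:
    import torch
    tensors = torch.load(bin_path, map_location="cpu")

for name in sorted(tensors)[:20]:
    print(name)  # vLLM's loader expects keys like "base_model.model.<module>.lora_A.weight"
```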
@WoosukKwon @zhuohan123
I see that https://github.com/vllm-project/vllm/pull/3177 already adds LoRA support for Qwen2, so why does loading still fail here? Is the problem with the LoRA checkpoint itself?
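Judging from the traceback, the failure happens while parsing the adapter file, before any Qwen2-specific LoRA code from that PR runs: the loader assumes PEFT-style tensor keys (`base_model.model....lora_A/lora_B.weight`), so an adapter saved with a different prefix or with extra non-LoRA tensors hits exactly this assertion. If the key listing above shows a missing prefix, one possible workaround, sketched below and not an official fix, is to re-key the adapter before pointing `--lora-modules` at it (assumptions: the weights themselves are fine, the adapter is stored as `adapter_model.safetensors`, and only the `base_model.` prefix is missing; verify the resulting keys against what the diagnostic printed):

```python
# Workaround sketch, not an official vLLM fix: write a re-keyed copy of the adapter
# whose tensor names carry the "base_model." prefix that vLLM's parser expects.
import os
import shutil
from safetensors.torch import load_file, save_file

src = "/home/greatwall/app/edison/output/qwen1half-14b-chat/v65-20240515-143141/checkpoint-1110"
dst = src + "-rekeyed"  # hypothetical output directory
os.makedirs(dst, exist_ok=True)

tensors = load_file(os.path.join(src, "adapter_model.safetensors"))
rekeyed = {}
for name, tensor in tensors.items():
    if not name.startswith("base_model."):
        name = "base_model." + name  # assumption: only the prefix is missing
    rekeyed[name] = tensor

save_file(rekeyed, os.path.join(dst, "adapter_model.safetensors"))
shutil.copy(os.path.join(src, "adapter_config.json"), dst)  # vLLM also reads the adapter config
```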
Your current environment
🐛 Describe the bug
CUDA_VISIBLE_DEVICES=0,1 python -m vllm.entrypoints.openai.api_server \
  --model /home/greatwall/app/edison/models/Qwen1.5-14B-Chat \
  --trust-remote-code \
  --served-model-name qwen14B \
  --max-model-len 4096 \
  --gpu-memory-utilization 0.9 \
  --enable-lora \
  --lora-modules lora1=/home/greatwall/app/edison/output/qwen1half-14b-chat/v65-20240515-143141/checkpoint-1110 \
  --host 0.0.0.0 \
  --port 8088 \
  --tensor-parallel-size 2 \
  --enforce-eager
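For completeness, the 500 in the log above is returned for an ordinary chat-completion request that targets the adapter by its served name; an illustrative client call (host, port, and the `lora1` name come from the launch command, everything else is just an example):

```python
# Illustrative request, not from the original report: asking for the adapter by the
# name registered via --lora-modules is what makes vLLM try to load the checkpoint.
import requests

resp = requests.post(
    "http://0.0.0.0:8088/v1/chat/completions",
    json={
        "model": "lora1",  # adapter name from --lora-modules; use "qwen14B" for the base model
        "messages": [{"role": "user", "content": "hello"}],
    },
)
print(resp.status_code, resp.text)
```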