Open · Madavan2304 opened 6 months ago
Your current environment
Code:

!pip install vllm

from vllm import LLM, SamplingParams

# choosing the large language model
llm = LLM(model="AdaptLLM/finance-chat")

# setting the parameters
sampling_params = SamplingParams(temperature=0.7)
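For completeness, here is a quick version check run in the same notebook right after the install (a minimal diagnostic sketch; I have not recorded the exact versions pip resolved):

import torch
import vllm

# Report the versions of the two compiled packages involved in the failure below.
print("vllm :", vllm.__version__)
print("torch:", torch.__version__)
print("CUDA :", torch.version.cuda)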
Output:
INFO 04-17 09:32:58 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='AdaptLLM/finance-chat', tokenizer='AdaptLLM/finance-chat', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
ImportError                               Traceback (most recent call last)
Cell In[4], line 4
      1 from vllm import LLM, SamplingParams
      3 # choosing the large language model
----> 4 llm = LLM(model="AdaptLLM/finance-chat")
      6 # setting the parameters
      7 sampling_params = SamplingParams(temperature=0.7)

File /opt/conda/lib/python3.10/site-packages/vllm/entrypoints/llm.py:112, in LLM.__init__(self, model, tokenizer, tokenizer_mode, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, enforce_eager, max_context_len_to_capture, disable_custom_all_reduce, **kwargs)
     93 kwargs["disable_log_stats"] = True
     94 engine_args = EngineArgs(
     95     model=model,
     96     tokenizer=tokenizer,
    (...)
    110     **kwargs,
    111 )
--> 112 self.llm_engine = LLMEngine.from_engine_args(
    113     engine_args, usage_context=UsageContext.LLM_CLASS)
    114 self.request_counter = Counter()

File /opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py:196, in LLMEngine.from_engine_args(cls, engine_args, usage_context)
    193 executor_class = GPUExecutor
    195 # Create the LLM engine.
--> 196 engine = cls(
    197     *engine_configs,
    198     executor_class=executor_class,
    199     log_stats=not engine_args.disable_log_stats,
    200     usage_context=usage_context,
    201 )
    202 return engine

File /opt/conda/lib/python3.10/site-packages/vllm/engine/llm_engine.py:110, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, lora_config, vision_language_config, executor_class, log_stats, usage_context)
    107 self.detokenizer = Detokenizer(self.tokenizer)
    108 self.seq_counter = Counter()
--> 110 self.model_executor = executor_class(model_config, cache_config,
    111                                      parallel_config, scheduler_config,
    112                                      device_config, lora_config,
    113                                      vision_language_config)
    115 # If usage stat is enabled, collect relevant info.
    116 if is_usage_stats_enabled():

File /opt/conda/lib/python3.10/site-packages/vllm/executor/gpu_executor.py:37, in GPUExecutor.__init__(self, model_config, cache_config, parallel_config, scheduler_config, device_config, lora_config, vision_language_config)
     34 self.vision_language_config = vision_language_config
     36 # Instantiate the worker and load the model to GPU.
---> 37 self._init_worker()
     39 # Profile the memory usage and initialize the cache.
     40 self._init_cache()

File /opt/conda/lib/python3.10/site-packages/vllm/executor/gpu_executor.py:45, in GPUExecutor._init_worker(self)
     42 def _init_worker(self):
     43     # Lazy import the Worker to avoid importing torch.cuda/xformers
     44     # before CUDA_VISIBLE_DEVICES is set in the Worker
---> 45     from vllm.worker.worker import Worker
     47     assert self.parallel_config.world_size == 1, (
     48         "GPUExecutor only supports single GPU.")
     50     distributed_init_method = get_distributed_init_method(
     51         get_ip(), get_open_port())

File /opt/conda/lib/python3.10/site-packages/vllm/worker/worker.py:21
     19 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
     20 from vllm.worker.cache_engine import CacheEngine
---> 21 from vllm.worker.model_runner import ModelRunner
     24 class Worker:
     25     """A worker class that executes (a partition of) the model on a GPU.
     26
     27     Each worker is associated with a single GPU. The worker is responsible for
     28     maintaining the KV cache and executing the model on the GPU. In case of
     29     distributed inference, each worker is assigned a partition of the model.
     30     """

File /opt/conda/lib/python3.10/site-packages/vllm/worker/model_runner.py:17
     15 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
     16 from vllm.model_executor import SamplingMetadata
---> 17 from vllm.model_executor.model_loader import get_model
     18 from vllm.model_executor.parallel_utils import custom_all_reduce, pynccl_utils
     19 from vllm.model_executor.parallel_utils.communication_op import (
     20     broadcast_tensor_dict)

File /opt/conda/lib/python3.10/site-packages/vllm/model_executor/model_loader.py:10
      8 from vllm.config import DeviceConfig, ModelConfig
      9 from vllm.model_executor.models import ModelRegistry
---> 10 from vllm.model_executor.models.llava import LlavaForConditionalGeneration
     11 from vllm.model_executor.weight_utils import (get_quant_config,
     12                                               initialize_dummy_weights)
     14 _VISION_MODEL_CLASSES = [
     15     LlavaForConditionalGeneration,
     16 ]

File /opt/conda/lib/python3.10/site-packages/vllm/model_executor/models/llava.py:11
      9 from vllm.attention import AttentionMetadata
     10 from vllm.config import VisionLanguageConfig
---> 11 from vllm.model_executor.layers.activation import get_act_fn
     12 from vllm.model_executor.layers.linear import LinearMethodBase
     13 from vllm.model_executor.layers.logits_processor import LogitsProcessor

File /opt/conda/lib/python3.10/site-packages/vllm/model_executor/layers/activation.py:9
      6 import torch.nn as nn
      7 import torch.nn.functional as F
----> 9 from vllm._C import ops
     10 from vllm.model_executor.layers.quantization import QuantizationConfig
     11 from vllm.model_executor.parallel_utils.parallel_state import (
     12     get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)

ImportError: /opt/conda/lib/python3.10/site-packages/vllm/_C.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c106detail14torchCheckFailEPKcS2_jRKSs
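Demangled, the missing symbol is c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&), a symbol exported by PyTorch's c10 library, so the vLLM binary extension appears to have been built against a different PyTorch build than the one importable in this environment. A hedged recovery sketch, notebook-style like the install above (it assumes a clean reinstall lets pip resolve the torch wheel the vLLM wheel was compiled against, which is not a confirmed fix):

# Remove the mismatched pair, then reinstall vLLM and let pip pull the torch
# build its wheel declares as a dependency. Restart the kernel afterwards so
# the previously loaded extension module is not reused.
!pip uninstall -y vllm torch
!pip install vllm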
How would you like to use vllm
I want to run inference on AdaptLLM/finance-chat, and I don't know how to integrate it with vLLM.
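Once the import error is resolved, this is roughly the full script I expect to run (a minimal sketch of vLLM's standard offline-inference flow; the prompt and max_tokens value are placeholders of mine):

from vllm import LLM, SamplingParams

# choosing the large language model (downloads from the Hugging Face Hub)
llm = LLM(model="AdaptLLM/finance-chat")

# setting the parameters (max_tokens is a placeholder value)
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)

# generate() takes a list of prompts and returns one RequestOutput per prompt
outputs = llm.generate(["What drives changes in a bond's yield?"], sampling_params)
for output in outputs:
    print(output.outputs[0].text)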