Qwen2ForSequenceClassification. Can you try it?

It fails. Here is what I did:
pip install vllm==0.6.3.post1
git clone https://github.com/vllm-project/vllm.git
python python_only_dev.py
edit stella_en_1.5B_v5/config.json
architectures: Qwen2ForCausalLM ==> Qwen2ForSequenceClassification
python vllm_debug.py
from vllm import LLM, SamplingParams

llm = LLM(model="../models/stella_en_1.5B_v5/", enforce_eager=True)
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
outputs = llm.encode(prompts)
for output in outputs:
    print(output.outputs.embedding)
but ... Aborted (core dumped)
Please follow these instructions on how to install vLLM from source. You should not pip install from PyPI since that's an older version of vLLM.
@DarkLight1337 The installation problem is resolved. I found that although stella_en_1.5B_v5 is based on Qwen2-1.5B, I can use Qwen2 to encode embeddings, but stella_en_1.5B_v5 raises an error like:
TypeError Traceback (most recent call last)
Cell In[41], line 12
2 prompts = [
3 "Hello, my name is",
4 "The president of the United States is",
5 "The capital of France is",
6 "The future of AI is",
7 ]
10 # Generate embedding. The output is a list of EmbeddingRequestOutputs.
---> 12 outputs = model.encode(prompts)
13 # Print the outputs.
14 for output in outputs:
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/utils.py:1063, in deprecate_kwargs.<locals>.wrapper.<locals>.inner(*args, **kwargs)
1056 msg += f" {additional_message}"
1058 warnings.warn(
1059 DeprecationWarning(msg),
1060 stacklevel=3, # The inner function takes up one level
1061 )
-> 1063 return fn(*args, **kwargs)
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/entrypoints/llm.py:719, in LLM.encode(self, prompts, pooling_params, prompt_token_ids, use_tqdm, lora_request, prompt_adapter_request)
710 pooling_params = PoolingParams()
712 self._validate_and_add_requests(
713 prompts=parsed_prompts,
714 params=pooling_params,
715 lora_request=lora_request,
716 prompt_adapter_request=prompt_adapter_request,
717 )
--> 719 outputs = self._run_engine(use_tqdm=use_tqdm)
720 return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput)
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/entrypoints/llm.py:879, in LLM._run_engine(self, use_tqdm)
877 total_out_toks = 0
878 while self.llm_engine.has_unfinished_requests():
--> 879 step_outputs = self.llm_engine.step()
880 for output in step_outputs:
881 if output.finished:
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/engine/llm_engine.py:1389, in LLMEngine.step(self)
1385 if allow_async_output_proc:
1386 execute_model_req.async_callback = self.async_callbacks[
1387 virtual_engine]
-> 1389 outputs = self.model_executor.execute_model(
1390 execute_model_req=execute_model_req)
1392 # We need to do this here so that last step's sampled_token_ids can
1393 # be passed to the next iteration for PP.
1394 if self.scheduler_config.is_multi_step:
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/executor/gpu_executor.py:134, in GPUExecutor.execute_model(self, execute_model_req)
131 def execute_model(
132 self, execute_model_req: ExecuteModelRequest
133 ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
--> 134 output = self.driver_worker.execute_model(execute_model_req)
135 return output
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/worker/worker_base.py:303, in LocalOrDistributedWorkerBase.execute_model(self, execute_model_req)
299 """Executes at least one model step on the given sequences, unless no
300 sequences are provided."""
301 start_time = time.perf_counter()
--> 303 inputs = self.prepare_input(execute_model_req)
304 if inputs is None:
305 return None
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/worker/worker_base.py:291, in LocalOrDistributedWorkerBase.prepare_input(self, execute_model_req)
289 broadcast_tensor_dict({}, src=0)
290 return None
--> 291 return self._get_driver_input_and_broadcast(execute_model_req)
292 else:
293 return self._get_worker_input_from_broadcast()
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/worker/worker_base.py:253, in LocalOrDistributedWorkerBase._get_driver_input_and_broadcast(self, execute_model_req)
248 assert self.is_driver_worker
250 worker_input: WorkerInput = self.prepare_worker_input(
251 execute_model_req=execute_model_req)
252 model_input: ModelRunnerInputBase = (
--> 253 self.model_runner.prepare_model_input(
254 execute_model_req.seq_group_metadata_list,
255 execute_model_req.virtual_engine,
256 execute_model_req.finished_requests_ids))
258 kwargs = extract_previous_hidden_states(execute_model_req)
260 if self.do_metadata_broadcast:
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/worker/embedding_model_runner.py:176, in EmbeddingModelRunner.prepare_model_input(self, seq_group_metadata_list, virtual_engine, finished_requests_ids)
169 def prepare_model_input(
170 self,
171 seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
172 virtual_engine: int = 0,
173 finished_requests_ids: Optional[List[str]] = None
174 ) -> ModelInputForGPUWithPoolingMetadata:
175 assert seq_group_metadata_list is not None
--> 176 model_input = self._prepare_model_input_tensors(
177 seq_group_metadata_list, finished_requests_ids)
178 # Prepare PoolingMetadata.
179 assert model_input.seq_lens is not None
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/worker/model_runner.py:1196, in GPUModelRunnerBase._prepare_model_input_tensors(self, seq_group_metadata_list, finished_requests_ids)
1192 builder.add_seq_group(seq_group_metadata)
1194 builder.reset_cached_inter_data()
-> 1196 return builder.build()
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/worker/model_runner.py:867, in ModelInputForGPUBuilder.build(self)
864 seq_lens.extend(itertools.repeat(1, cuda_graph_pad_size))
866 # Attention metadata.
--> 867 attn_metadata = self.attn_metadata_builder.build(
868 seq_lens, query_lens, cuda_graph_pad_size, batch_size)
870 # LoRA data.
871 lora_requests = set()
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/attention/backends/utils.py:194, in CommonMetadataBuilder.build(self, seq_lens, query_lens, cuda_graph_pad_size, batch_size)
184 """Build attention metadata with on-device tensors.
185
186 Args:
(...)
191 batch_size: The maybe padded batch size.
192 """
193 for inter_data in self.input_builder.inter_data_list:
--> 194 self._add_seq_group(inter_data,
195 self.input_builder.chunked_prefill_enabled)
197 device = self.runner.device
198 use_captured_graph = cuda_graph_pad_size != -1
File /data/anaconda3/envs/rag-new/lib/python3.12/site-packages/vllm/attention/backends/utils.py:170, in CommonMetadataBuilder._add_seq_group(self, inter_data, chunked_prefill_enabled)
167 block_table = computed_block_nums
168 elif ((chunked_prefill_enabled or not is_prompt)
169 and block_tables is not None):
--> 170 block_table = block_tables[seq_id][-curr_sliding_window_block:]
171 self.block_tables.append(block_table)
173 # Compute slot mapping.
TypeError: 'NoneType' object is not subscriptable
Can you show your environment by running collect_env.py? And also show your code to serve/use the model (is it the same as before?)
I'm sure it's not an installation problem. With embedding models, especially models processed by sentence-transformers, it seems the first round does not stop and the engine just keeps running inference.
When I use the embedding model, the block tables are {seq_id: None}.
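A minimal illustration of the failure (a sketch; the seq_id value mirrors the {seq_id: None} block tables printed later in this thread, and a sliding-window slice length of 0 is assumed): indexing into a None block table raises exactly the TypeError shown in the traceback.

block_tables = {8: None}  # what an embedding run produces for each sequence
seq_id, curr_sliding_window_block = 8, 0
block_table = block_tables[seq_id][-curr_sliding_window_block:]
# TypeError: 'NoneType' object is not subscriptable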
def compute_slot_mapping(is_profile_run: bool, slot_mapping: List[int],
                         seq_id: int, seq_len: int, context_len: int,
                         start_idx: int, block_size: int,
                         block_tables: Dict[int, List[int]]):
    """
    Compute slot mapping.
    """
    if is_profile_run:
        # During memory profiling, the block tables are not
        # initialized yet. In this case, we just use a dummy
        # slot mapping.
        # In embeddings, the block tables are {seq_id: None}.
        slot_mapping.extend([PAD_SLOT_ID] * seq_len)
        return

    # Mask the [0, start_idx) tokens of the prompt with
    # PAD_SLOT_ID, where start_idx is max(0, seq_len -
    # sliding_window). For example, if the prompt len is 10,
    # sliding window is 8, and block size is 4, the first two
    # tokens are masked and the slot mapping will be
    # [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
    padding_mask_len = max(0, start_idx - context_len)
    slot_mapping.extend([PAD_SLOT_ID] * padding_mask_len)

    range_start = max(start_idx, context_len)
    range_end = seq_len
    numel = range_end - range_start
    block_table = block_tables[seq_id]

    # numpy implementation will be faster than python if we have
    # many elements, otherwise it will be slower.
    if numel < _COMPUTE_SLOT_MAPPING_NUMPY_NUMEL:
        _compute_slot_mapping_python(slot_mapping, block_table, range_start,
                                     range_end, block_size)
    else:
        _compute_slot_mapping_numpy(slot_mapping, block_table, range_start,
                                    range_end, block_size)
# TODO(sang): Combine chunked prefill and prefix caching by
# only allowing multiple of block_size chunk size.
# NOTE: This only works for oooooooxxx style attention.
block_table = []
if inter_data.prefix_cache_hit:
    block_table = block_tables[seq_id]
elif ((chunked_prefill_enabled or not is_prompt)
      and block_tables is not None):
    if curr_sliding_window_block == 0:
        block_table = block_tables[seq_id]
    else:
        # embedding block table => {seq_id: None}.
        block_table = block_tables[seq_id][
            -curr_sliding_window_block:]
self.block_tables.append(block_table)
Can you show your environment by running collect_env.py? And also show your code to serve/use the model (is it the same as before?)
Please note that embedding models are not supported for the CPU version of vLLM. That is why I asked.
pip install vllm==0.6.0
I've switched to the pip installation and found this problem:

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
path = "../model/Qwen2.5-0.5B-Instruct/"
# Create an LLM.
model = LLM(model=path, enforce_eager=True, dtype="float16")
self_tokenizer = model.get_tokenizer()
##############
self_tokenizer.encode( "Hello, my name is"),self_tokenizer.encode( "The president of the United States is")
([9707, 11, 847, 829, 374], [785, 4767, 315, 279, 3639, 4180, 374])
path = "../model/stella_en_1.5B_v5/"
# Create an LLM.
model = LLM(model=path, enforce_eager=True, dtype="float16")
self_tokenizer = model.get_tokenizer()
##############
self_tokenizer.encode( "Hello, my name is"),self_tokenizer.encode( "The president of the United States is")
([9707, 11, 847, 829, 374, 151643], [785, 4767, 315, 279, 3639, 4180, 374, 151643])
Do you get a similar behavior using the HuggingFace tokenizers directly? It may just be that they use different tokenizers.
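A quick way to check this (a sketch; the local paths are assumed to match the ones used above): compare the raw HuggingFace tokenizers directly, bypassing vLLM.

from transformers import AutoTokenizer

# Encode the same prompt with both tokenizers and compare the token ids.
for path in ["../model/Qwen2.5-0.5B-Instruct/", "../model/stella_en_1.5B_v5/"]:
    tok = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    print(path, tok.encode("Hello, my name is"))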
In vllm/attention/backends/utils.py, lines 165-171, I checked the block_tables value:
block_table = []
if inter_data.prefix_cache_hit:
    block_table = computed_block_nums
elif ((chunked_prefill_enabled or not is_prompt)
      and block_tables is not None):
    print(block_tables)
    block_table = block_tables[seq_id][-curr_sliding_window_block:]
self.block_tables.append(block_table)
The printed block_tables are:
{8: None}
{9: None}
{10: None}
{11: None}
Changing the code to:
block_table = []
if inter_data.prefix_cache_hit:
    block_table = computed_block_nums
elif ((chunked_prefill_enabled or not is_prompt)
      and block_tables is not None
      and block_tables[seq_id] is not None):
    block_table = block_tables[seq_id][-curr_sliding_window_block:]
self.block_tables.append(block_table)
With this change I successfully get outputs.embedding. Why? When I use the native Qwen2 model for embeddings, this does not happen.
Do you get a similar behavior using the HuggingFace tokenizers directly? It may just be that they use different tokenizers.
Different from Qwen2, it has an extra tokens file: added_tokens.json.
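A small sketch for narrowing this down (the local path is assumed to match the one used above): decode the extra trailing token id and inspect the tokenizer's special-token configuration.

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("../model/stella_en_1.5B_v5/", trust_remote_code=True)
print(tok.decode([151643]))    # which token the stella tokenizer appends
print(tok.special_tokens_map)  # how its special tokens are configured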
Oh, it looks like this repo uses the Qwen2ForCausalLM architecture... in that case you may need #10184 to use this model, sorry for missing that!
Edit: Hmm actually, I don't think we can properly load the linear layer right now...
@maxdebayser any thoughts on how to load sentence_transformers.models.Dense layers automatically?
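For reference, this is roughly how sentence-transformers itself restores such a layer (a sketch, assuming the 2_Dense_1024 directory from the repo layout shown further below; Dense.load reads the config.json and weight file stored in that folder):

from sentence_transformers import models

# Load the Dense module the same way SentenceTransformer does when it walks modules.json.
dense = models.Dense.load("stella_en_1.5B_v5/2_Dense_1024")
print(dense.get_config_dict())  # in_features / out_features / bias / activation_function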
import os

import torch
from sklearn.preprocessing import normalize
from transformers import AutoModel, AutoTokenizer

# The path of your model after cloning it
model_dir = "{Your MODEL_PATH}"
vector_dim = 1024
vector_linear_directory = f"2_Dense_{vector_dim}"
model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
vector_linear = torch.nn.Linear(in_features=model.config.hidden_size, out_features=vector_dim)
vector_linear_dict = {
    k.replace("linear.", ""): v for k, v in
    torch.load(os.path.join(model_dir, f"{vector_linear_directory}/pytorch_model.bin")).items()
}
vector_linear.load_state_dict(vector_linear_dict)
vector_linear.cuda()

# Embed the queries
queries = ["Hello, my name is"]  # any list of query strings
with torch.no_grad():
    input_data = tokenizer(queries, padding="longest", truncation=True, max_length=512, return_tensors="pt")
    input_data = {k: v.cuda() for k, v in input_data.items()}
    attention_mask = input_data["attention_mask"]
    last_hidden_state = model(**input_data)[0]
    last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
    query_vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    query_vectors = normalize(vector_linear(query_vectors).cpu().numpy())
How can I use vLLM to load and apply this linear layer's weights?
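One possible workaround, just as a sketch (not an existing vLLM feature; the pooling vLLM applies may differ from the mean pooling used above, and the local path, output dimension, and the 1536 hidden size are assumptions):

import os

import torch
from sklearn.preprocessing import normalize
from vllm import LLM

model_dir = "../models/stella_en_1.5B_v5/"
vector_dim = 1024

# Let vLLM produce the pooled hidden states...
llm = LLM(model=model_dir, enforce_eager=True)
outputs = llm.encode(["Hello, my name is"])
pooled = torch.tensor([o.outputs.embedding for o in outputs])

# ...then apply the sentence-transformers Dense projection on the host.
dense = torch.nn.Linear(in_features=1536, out_features=vector_dim)  # 1536 = Qwen2-1.5B hidden size
state = torch.load(os.path.join(model_dir, f"2_Dense_{vector_dim}/pytorch_model.bin"))
dense.load_state_dict({k.replace("linear.", ""): v for k, v in state.items()})

with torch.no_grad():
    vectors = normalize(dense(pooled.float()).numpy())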
@maxdebayser any thoughts on how to load sentence_transformers.models.Dense layers automatically?
$ tree stella_en_1.5B_v5
stella_en_1.5B_v5
├── 1_Pooling
│   └── config.json
├── 2_Dense
│   ├── config.json
│   ├── model.safetensors
│   └── pytorch_model.bin
├── 2_Dense_1024
│   ├── config.json
│   ├── model.safetensors
│   └── pytorch_model.bin
├── 2_Dense_2048
│   ├── config.json
│   ├── model.safetensors
│   └── pytorch_model.bin
├── 2_Dense_256
│   ├── config.json
│   ├── model.safetensors
│   └── pytorch_model.bin
├── 2_Dense_4096
│   ├── config.json
│   ├── model.safetensors
│   └── pytorch_model.bin
├── 2_Dense_6144
│   ├── config.json
│   ├── model.safetensors
│   └── pytorch_model.bin
├── 2_Dense_768
│   ├── config.json
│   ├── model.safetensors
│   └── pytorch_model.bin
├── 2_Dense_8192
│   ├── config.json
│   ├── model.safetensors
│   └── pytorch_model.bin
├── added_tokens.json
├── config.json
├── config_sentence_transformers.json
├── merges.txt
├── modeling_qwen.py
├── model.safetensors
vLLM loads weights in a similar way to HF transformers, so it only considers the root directory of the repo (in this case, the main model.safetensors file).
vLLM loads weights in a similar way to HF transformers, so it only considers the root directory of the repo (in this case, the main model.safetensors file).
Many models have additional linear layers, but these linear layers are added after the pooler
There are only two config differences between the two models (stella_en_1.5B_v5 and Qwen2.5-0.5B-Instruct): max_position_embeddings (131072 for stella_en_1.5B_v5 vs. 32768 for Qwen2.5-0.5B-Instruct) and enable_chunked_prefill (True for stella_en_1.5B_v5 vs. False for Qwen2.5-0.5B-Instruct).
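A quick sketch to double-check this (the local paths are assumed to match the ones used earlier): diff the two HF configs key by key.

from transformers import AutoConfig

a = AutoConfig.from_pretrained("../model/stella_en_1.5B_v5/", trust_remote_code=True).to_dict()
b = AutoConfig.from_pretrained("../model/Qwen2.5-0.5B-Instruct/").to_dict()

# Print every key whose value differs between the two configs.
for key in sorted(set(a) | set(b)):
    if a.get(key) != b.get(key):
        print(key, a.get(key), b.get(key))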
I think we could add another layer type, similar to Pooler, that is loaded after the model is loaded. The embedding model runner could then run this layer, if it exists, after the pooler.
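A rough sketch of what such a layer could look like (hypothetical names, not existing vLLM API): a module that wraps the sentence-transformers Dense weights and runs on the pooled output.

import torch
import torch.nn as nn


class DenseProjector(nn.Module):
    """Hypothetical post-pooler projection loaded from a 2_Dense_*/ directory."""

    def __init__(self, hidden_size: int, out_dim: int):
        super().__init__()
        self.linear = nn.Linear(hidden_size, out_dim)

    def load_st_weights(self, state_dict: dict) -> None:
        # sentence-transformers stores the Linear weights under a "linear." prefix.
        self.linear.load_state_dict(
            {k.replace("linear.", ""): v for k, v in state_dict.items()})

    def forward(self, pooled: torch.Tensor) -> torch.Tensor:
        # Run by the embedding model runner after pooling, if the module exists.
        return self.linear(pooled)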
Just thinking out loud: would it make sense to have some kind of wrapper model for sentence-transformers models, where we can load these models in a generic way and handle these different kinds of configurations, or should the embedding model runner handle them? Since the model runner is somewhat tied to the hardware backend, a wrapper might be a better place to encapsulate hardware-independent behavior that is common to sentence-transformers models.
cc: @flaviabeo
The model to consider.
https://huggingface.co/dunzhang/stella_en_1.5B_v5

This model is a Qwen2 base model, but a separate linear layer needs to be loaded at the end. During model initialization:

vector_linear = torch.nn.Linear(in_features=model.config.hidden_size, out_features=vector_dim)
vector_linear_dict = {
    k.replace("linear.", ""): v for k, v in
    torch.load(os.path.join(model_dir, f"{vector_linear_directory}/pytorch_model.bin")).items()
}
vector_linear.load_state_dict(vector_linear_dict)

At inference time:

last_hidden_state = model(**input_data)[0]
last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
query_vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
query_vectors = normalize(vector_linear(query_vectors))
The closest model vllm already supports.
No response
What's your difficulty of supporting the model you want?
No response
Before submitting a new issue...