I am trying to speed up inference using a quantized version of the llm2vec models. I have trained a Gemma-2B model on custom data. This is my inference code:
import time

import torch
from transformers import BitsAndBytesConfig

from llm2vec import LLM2Vec


class CustomModel(LLM2Vec):
    # Wrap each input in the Gemma chat template before tokenization.
    def prepare_for_tokenization(self, text):
        text = (
            "<start_of_turn>user " + text.strip() + "<end_of_turn> \n\n"
            + "<start_of_turn>model "
        )
        return text


def main():
    base_model = "path to base model"
    lora_model = "path to contrastive lora checkpoint"

    # 4-bit NF4 quantization with double quantization, computing in bfloat16
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Load the quantized base model with the contrastive LoRA adapter on top
    custom_model = CustomModel.from_pretrained(
        base_model,
        peft_model_name_or_path=lora_model,
        device_map="cuda" if torch.cuda.is_available() else "cpu",
        quantization_config=nf4_config,
    )

    # Encode a query using an instruction prefix
    instruction = (
        "Given a web search query, retrieve relevant passages that answer the query:"
    )
    queries = [
        [instruction, "How much protein should a female eat daily"]
    ]

    start = time.time()
    q_reps = custom_model.encode(queries)
    print(f"Time taken: {time.time() - start:.2f} seconds")


if __name__ == "__main__":
    main()
The error I am getting is:
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00, 2.61s/it]
Batches: 0%| | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/home/sandeep.pandey/from_157/llm2vec_modified/notebooks/try_quantized_gemma.py", line 59, in
main()
File "/home/sandeep.pandey/from_157/llm2vec_modified/notebooks/try_quantized_gemma.py", line 54, in main
q_reps = custom_model.encode(queries)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/sandeep.pandey/.local/lib/python3.12/site-packages/llm2vec/llm2vec.py", line 403, in encode
all_embeddings = [result.get() for result in results]
^^^^^^^^^^^^
File "/opt/miniconda/lib/python3.12/multiprocessing/pool.py", line 774, in get
raise self._value
File "/opt/miniconda/lib/python3.12/multiprocessing/pool.py", line 540, in _handle_tasks
put(task)
File "/opt/miniconda/lib/python3.12/multiprocessing/connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/miniconda/lib/python3.12/multiprocessing/reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
AttributeError: Can't pickle local object 'add_hook_to_module.<locals>.new_forward'
Batches: 0%| | 0/1 [00:00<?, ?it/s]
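From the traceback, encode() appears to hand the model to multiprocessing.Pool workers, and the forward hooks that accelerate attaches when using device_map/quantization are closures (add_hook_to_module.<locals>.new_forward) that cannot be pickled. One workaround I am considering is to expose only a single GPU to the process before torch is imported; this is a sketch under the assumption (suggested by the pool usage in the traceback) that llm2vec only spawns its multiprocessing pool when it sees more than one GPU:

import os

# Assumption: llm2vec's encode() only uses a multiprocessing pool (the
# part that fails to pickle the accelerate hooks) when it detects more
# than one GPU. Masking the devices before torch is imported makes
# torch.cuda.device_count() report 1, keeping encode() single-process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch  # import only after the device mask is set

Is this the right way to avoid the pickling step, or is there a supported way to run encode() single-process with a quantized model?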