McGill-NLP / llm2vec

Code for 'LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders'
https://mcgill-nlp.github.io/llm2vec/
MIT License

Getting an error when trying to do quantization using bitsandbytes #138

Open sandeep-krutrim opened 3 months ago

sandeep-krutrim commented 3 months ago

I am trying to speed up inference using a quantized version of the llm2vec models. I have trained a Gemma-2B model on custom data. This is my inference code:

import time

import torch
from transformers import BitsAndBytesConfig

from llm2vec import LLM2Vec

# Subclass LLM2Vec to wrap inputs in the Gemma chat template before tokenization
class CustomModel(LLM2Vec):
    def prepare_for_tokenization(self, text):
        text = (
            "<start_of_turn>user " + text.strip() + "<end_of_turn> \n\n"
            + "<start_of_turn>model "
        )
        return text

def main():
    base_model = "path to base model"
    lora_model = "path to contrastive lora checkpoint"

    # 4-bit NF4 quantization with double quantization and bf16 compute
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    # Load the LoRA checkpoint on top of the quantized base model
    custom_model = CustomModel.from_pretrained(
        base_model,
        peft_model_name_or_path=lora_model,
        device_map="cuda" if torch.cuda.is_available() else "cpu",
        quantization_config=nf4_config,
    )

    # Encode a query with an instruction prefix and time the call
    instruction = (
        "Given a web search query, retrieve relevant passages that answer the query:"
    )
    queries = [
        [instruction, "How much protein should a female eat daily"]
    ]
    start = time.time()
    q_reps = custom_model.encode(queries)
    print(f"Time taken: {time.time() - start:.2f} seconds")

if __name__ == "__main__":
    main()

The error I am getting is:

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.61s/it]
Batches:   0%|          | 0/1 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/home/sandeep.pandey/from_157/llm2vec_modified/notebooks/try_quantized_gemma.py", line 59, in <module>
    main()
  File "/home/sandeep.pandey/from_157/llm2vec_modified/notebooks/try_quantized_gemma.py", line 54, in main
    q_reps = custom_model.encode(queries)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/sandeep.pandey/.local/lib/python3.12/site-packages/llm2vec/llm2vec.py", line 403, in encode
    all_embeddings = [result.get() for result in results]
                      ^^^^^^^^^^^^
  File "/opt/miniconda/lib/python3.12/multiprocessing/pool.py", line 774, in get
    raise self._value
  File "/opt/miniconda/lib/python3.12/multiprocessing/pool.py", line 540, in _handle_tasks
    put(task)
  File "/opt/miniconda/lib/python3.12/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/miniconda/lib/python3.12/multiprocessing/reduction.py", line 51, in dumps
    cls(buf, protocol).dump(obj)
AttributeError: Can't pickle local object 'add_hook_to_module.<locals>.new_forward'
Batches:   0%|          | 0/1 [00:00<?, ?it/s]
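From the traceback, the failure happens before any forward pass: encode is handing batches to a multiprocessing pool, and shipping the model to the workers requires pickling it, which fails because accelerate's add_hook_to_module wraps forward in a local closure (new_forward) that pickle cannot serialize. A minimal sketch of a possible workaround, assuming the pool is only used when torch sees more than one CUDA device; the device index "0" is just an example:

import os

# Assumption: llm2vec's encode() only spawns a multiprocessing pool when
# torch.cuda.device_count() > 1. Pinning the process to one GPU keeps
# encoding in-process, so the hook-wrapped quantized model is never pickled.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # must run before importing torch

import torch
from llm2vec import LLM2Vec

assert torch.cuda.device_count() <= 1  # single-device path: no worker pool

If multi-GPU encoding is actually needed, pinning to one device is only a stopgap, and the pickling issue would presumably need a fix inside the library (for example, loading the quantized model in each worker rather than sending it through the pool).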