huggingface / transformers

🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.
https://huggingface.co/transformers
Apache License 2.0

4bit Blip2 compatibility #23839

Closed: betterftr closed this issue 9 months ago

betterftr commented 1 year ago

System Info

I am getting an error after loading BLIP-2 in 4-bit: I can't run inference and I can't train. Can anyone help?

Who can help?

No response

Reproduction

```python
import torch
from transformers import (
    Blip2ForConditionalGeneration,
    AutoProcessor,
    Blip2Processor,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import prepare_model_for_kbit_training

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-6.7b-coco")

# 8-bit load (this works)
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-6.7b-coco", device_map="auto", load_in_8bit=True
)

# 4-bit NF4 load (this is the configuration that later fails)
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-6.7b-coco")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-6.7b-coco", device_map="auto", quantization_config=nf4_config
)
```
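
For reference, the captioning call that then fails looks roughly like the sketch below. It is reconstructed from `Cell In[10]` in the traceback that follows; the image loading and the processor call are assumptions, since only the `pixel_values` and `generate` lines appear in the trace.

```python
from PIL import Image

image = Image.open("test.jpg")  # hypothetical test image
inputs = processor(images=image, return_tensors="pt").to(model.device)  # assumed preprocessing
pixel_values = inputs.pixel_values

generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_caption)
```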

Then, when I try to train with PEFT or just run a single image-captioning call with the loaded model, I get:

```
FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.

AttributeError                            Traceback (most recent call last)
Cell In[10], line 6
      3 pixel_values = inputs.pixel_values
      5 #generated_ids = model.generate(pixel_values=pixel_values, min_length=50, max_new_tokens=50, length_penalty=1.4, top_k=150, top_p=0.95, repetition_penalty=2.1, num_beams=5, temperature=0.75)
----> 6 generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
      7 generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
      8 print(generated_caption)

File H:\CONDA\envs\blip\lib\site-packages\torch\utils\_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File H:\CONDA\envs\blip\lib\site-packages\transformers\models\blip_2\modeling_blip_2.py:1854, in Blip2ForConditionalGeneration.generate(self, pixel_values, input_ids, attention_mask, **generate_kwargs)
   1851 inputs_embeds = self.get_input_embeddings()(input_ids)
   1852 inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
-> 1854 outputs = self.language_model.generate(
   1855     inputs_embeds=inputs_embeds,
   1856     attention_mask=attention_mask,
   1857     **generate_kwargs,
   1858 )
   1860 return outputs

File H:\CONDA\envs\blip\lib\site-packages\torch\utils\_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File H:\CONDA\envs\blip\lib\site-packages\transformers\generation\utils.py:1518, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
   1512     raise ValueError(
   1513         "num_return_sequences has to be 1 when doing greedy search, "
   1514         f"but is {generation_config.num_return_sequences}."
   1515     )
   1517 # 11. run greedy search
-> 1518 return self.greedy_search(
   1519     input_ids,
   1520     logits_processor=logits_processor,
   1521     stopping_criteria=stopping_criteria,
   1522     pad_token_id=generation_config.pad_token_id,
   1523     eos_token_id=generation_config.eos_token_id,
   1524     output_scores=generation_config.output_scores,
   1525     return_dict_in_generate=generation_config.return_dict_in_generate,
   1526     synced_gpus=synced_gpus,
   1527     streamer=streamer,
   1528     **model_kwargs,
   1529 )
   1531 elif is_contrastive_search_gen_mode:
   1532     if generation_config.num_return_sequences > 1:

File H:\CONDA\envs\blip\lib\site-packages\transformers\generation\utils.py:2335, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
   2332 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
   2334 # forward pass to get next token
-> 2335 outputs = self(
   2336     **model_inputs,
   2337     return_dict=True,
   2338     output_attentions=output_attentions,
   2339     output_hidden_states=output_hidden_states,
   2340 )
   2342 if synced_gpus and this_peer_finished:
   2343     continue  # don't waste resources running the code we don't need

File H:\CONDA\envs\blip\lib\site-packages\torch\nn\modules\module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File H:\CONDA\envs\blip\lib\site-packages\accelerate\hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163     output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File H:\CONDA\envs\blip\lib\site-packages\transformers\models\opt\modeling_opt.py:957, in OPTForCausalLM.forward(self, input_ids, attention_mask, head_mask, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
    944 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    945 outputs = self.model.decoder(
    946     input_ids=input_ids,
    947     attention_mask=attention_mask,
    (...)
    954     return_dict=return_dict,
    955 )
--> 957 logits = self.lm_head(outputs[0]).contiguous()
    959 loss = None
    960 if labels is not None:
    961     # move labels to correct device to enable model parallelism

File H:\CONDA\envs\blip\lib\site-packages\torch\nn\modules\module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File H:\CONDA\envs\blip\lib\site-packages\accelerate\hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163     output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File H:\CONDA\envs\blip\lib\site-packages\bitsandbytes\nn\modules.py:219, in Linear4bit.forward(self, x)
    216 x = x.to(self.compute_dtype)
    218 bias = None if self.bias is None else self.bias.to(self.compute_dtype)
--> 219 out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
    221 out = out.to(inp_dtype)
    223 return out

AttributeError: 'Parameter' object has no attribute 'quant_state'
```

Expected behavior

8-bit works fine.
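
As a point of comparison, a config-based 8-bit load (a minimal sketch, equivalent to the `load_in_8bit=True` argument used in the reproduction above) is the variant that loads and runs without the `quant_state` error:

```python
from transformers import Blip2ForConditionalGeneration, BitsAndBytesConfig

int8_config = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-6.7b-coco",
    device_map="auto",
    quantization_config=int8_config,
)
```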

sgugger commented 1 year ago

cc @younesbelkada

younesbelkada commented 1 year ago

Hi @betterftr, thanks for the issue. Indeed there seems to be a bug; it should be fixed by https://github.com/huggingface/transformers/pull/23895

ArthurZucker commented 10 months ago

cc @SunMarc if you have time to look into this?

SunMarc commented 10 months ago

Hey @betterftr, I'm able to run the following script with this environment. Let me know if it works on your side.

```python
import torch
import requests
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration, BitsAndBytesConfig

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-6.7b-coco")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-6.7b-coco", device_map="auto", quantization_config=nf4_config
)


def prepare_img():
    url = "https://huggingface.co/hf-internal-testing/blip-test-image/resolve/main/demo.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    return image


image = prepare_img()
inputs = processor(images=[image, image], return_tensors="pt").to(dtype=torch.float16)

predictions = model.generate(**inputs, num_beams=2)
print(processor.batch_decode(predictions, skip_special_tokens=True)[0].strip())
```

Output: `a woman sitting on the beach with her dog`
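
For the PEFT fine-tuning path mentioned in the original report, here is a minimal sketch built on top of the 4-bit model above. It is not a verified recipe from this thread: the LoRA hyperparameters and the `target_modules` names (the query/value projections of the OPT language model) are assumptions.

```python
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Make the quantized model trainable (input grads, norm casting, etc.)
model = prepare_model_for_kbit_training(model)

# Attach LoRA adapters; "q_proj"/"v_proj" target the OPT attention projections
# inside BLIP-2's language model (assumed module names).
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
```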

github-actions[bot] commented 9 months ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.

robinsonmhj commented 5 months ago

@SunMarc I am getting a similar error while using the Llama-2 7B model, and I am using the latest version of transformers.

Here is the code:

```python
import torch
from transformers import AutoTokenizer, set_seed, BitsAndBytesConfig, AutoModelForCausalLM

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_name = 'llm-models/Llama-2-7b-hf'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="cuda",   # this placement is what turns out to trigger the error below
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map='cuda')


def generate_text(prompt):
    # Tokenize the prompt
    inputs = tokenizer.encode(prompt, return_tensors='pt')

    print(f'inputs is {inputs} on {inputs.device}')

    inputs = inputs.to('cuda:0')

    print(f'inputs is {inputs} on {inputs.device}')

    # Generate a response
    outputs = model.generate(inputs)

    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return response


prompt = 'User1: Hey, I need a new laptop. Which one should I buy?'
response = generate_text(prompt)
print(response)
```

Package info: transformers==4.38.1, accelerate==0.21.0, bitsandbytes==0.42.0

I also tried 4.34; it doesn't work either. Besides that, I checked the PR above, and it doesn't look like it is in any release branch or the master branch.

Here is the error I get:

```
FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.

AssertionError                            Traceback (most recent call last)
Cell In[18], line 2
      1 prompt = 'User1: Hey, I need a new laptop. Which one should I buy?'
----> 2 response = generate_text(prompt)
      3 print(response)

Cell In[17], line 13, in generate_text(prompt)
     10 print(f'inputs is {inputs} on {inputs.device}')
     12 # Generate a response
---> 13 outputs = model.generate(inputs)
     15 # Decode the response
     16 response = tokenizer.decode(outputs[0], skip_special_tokens=True)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/generation/utils.py:1345, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
   1337 logger.warning(
   1338     "A decoder-only architecture is being used, but right-padding was detected! For correct "
   1339     "generation results, please set padding_side='left' when initializing the tokenizer."
   1340 )
   1342 if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
   1343     # if model is encoder decoder encoder_outputs are created
   1344     # and added to model_kwargs
-> 1345     model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
   1346         inputs_tensor, model_kwargs, model_input_name
   1347     )
   1349 # 5. Prepare input_ids which will be used for auto-regressive generation
   1350 if self.config.is_encoder_decoder:

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/generation/utils.py:644, in GenerationMixin._prepare_encoder_decoder_kwargs_for_generation(self, inputs_tensor, model_kwargs, model_input_name)
    642 encoder_kwargs["return_dict"] = True
    643 encoder_kwargs[model_input_name] = inputs_tensor
--> 644 model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs)
    646 return model_kwargs

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163     output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:1094, in T5Stack.forward(self, input_ids, attention_mask, encoder_hidden_states, encoder_attention_mask, inputs_embeds, head_mask, cross_attn_head_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
   1081     layer_outputs = checkpoint(
   1082         create_custom_forward(layer_module),
   1083         hidden_states,
   (...)
   1091         None,  # past_key_value is always None with gradient checkpointing
   1092     )
   1093 else:
-> 1094     layer_outputs = layer_module(
   1095         hidden_states,
   1096         attention_mask=extended_attention_mask,
   1097         position_bias=position_bias,
   1098         encoder_hidden_states=encoder_hidden_states,
   1099         encoder_attention_mask=encoder_extended_attention_mask,
   1100         encoder_decoder_position_bias=encoder_decoder_position_bias,
   1101         layer_head_mask=layer_head_mask,
   1102         cross_attn_layer_head_mask=cross_attn_layer_head_mask,
   1103         past_key_value=past_key_value,
   1104         use_cache=use_cache,
   1105         output_attentions=output_attentions,
   1106     )
   1108 # layer_outputs is a tuple with:
   1109 # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
   1110 if use_cache is False:

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163     output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:694, in T5Block.forward(self, hidden_states, attention_mask, position_bias, encoder_hidden_states, encoder_attention_mask, encoder_decoder_position_bias, layer_head_mask, cross_attn_layer_head_mask, past_key_value, use_cache, output_attentions, return_dict)
    691 else:
    692     self_attn_past_key_value, cross_attn_past_key_value = None, None
--> 694 self_attention_outputs = self.layer[0](
    695     hidden_states,
    696     attention_mask=attention_mask,
    697     position_bias=position_bias,
    698     layer_head_mask=layer_head_mask,
    699     past_key_value=self_attn_past_key_value,
    700     use_cache=use_cache,
    701     output_attentions=output_attentions,
    702 )
    703 hidden_states, present_key_value_state = self_attention_outputs[:2]
    704 attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163     output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:601, in T5LayerSelfAttention.forward(self, hidden_states, attention_mask, position_bias, layer_head_mask, past_key_value, use_cache, output_attentions)
    590 def forward(
    591     self,
    592     hidden_states,
    (...)
    598     output_attentions=False,
    599 ):
    600 normed_hidden_states = self.layer_norm(hidden_states)
--> 601 attention_output = self.SelfAttention(
    602     normed_hidden_states,
    603     mask=attention_mask,
    604     position_bias=position_bias,
    605     layer_head_mask=layer_head_mask,
    606     past_key_value=past_key_value,
    607     use_cache=use_cache,
    608     output_attentions=output_attentions,
    609 )
    610 hidden_states = hidden_states + self.dropout(attention_output[0])
    611 outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163     output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/transformers/models/t5/modeling_t5.py:520, in T5Attention.forward(self, hidden_states, mask, key_value_states, position_bias, past_key_value, layer_head_mask, query_length, use_cache, output_attentions)
    517     return hidden_states
    519 # get query states
--> 520 query_states = shape(self.q(hidden_states))  # (batch_size, n_heads, seq_length, dim_per_head)
    522 # get key/value states
    523 key_states = project(
    524     hidden_states, self.k, key_value_states, past_key_value[0] if past_key_value is not None else None
    525 )

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163     output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/bitsandbytes/nn/modules.py:256, in Linear4bit.forward(self, x)
    253 x = x.to(self.compute_dtype)
    255 bias = None if self.bias is None else self.bias.to(self.compute_dtype)
--> 256 out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
    258 out = out.to(inp_dtype)
    260 return out

File /opt/conda/envs/domino-ray/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:566, in matmul_4bit(A, B, quant_state, out, bias)
    565 def matmul_4bit(A: tensor, B: tensor, quant_state: F.QuantState, out: tensor = None, bias=None):
--> 566     assert quant_state is not None
    567     if A.numel() == A.shape[-1] and A.requires_grad == False:
    568         if A.shape[-1] % quant_state.blocksize != 0:

AssertionError:
```

robinsonmhj commented 5 months ago

Found the issue: changing device_map to 'auto' fixes it. Can anyone explain why? Instead of

```python
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="cuda",
    trust_remote_code=True,
)
```

it should be

```python
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
```

The transformers and accelerate versions are as below: transformers==4.31.0, accelerate==0.21.0.

SunMarc commented 5 months ago

Hi @robinsonmhj, the issue was in the accelerate library; you need to update it. The problem was that device_map="cuda" was not handled properly. If you pass device_map={"": 0} instead, it should work.
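
A minimal sketch of the two placements discussed in this thread, assuming a single-GPU setup and an up-to-date accelerate (`model_name` and `bnb_config` are the ones defined in the snippet above):

```python
# Pin the whole quantized model to GPU 0 ...
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# ... or let accelerate infer the placement across the available devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
```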