huggingface / transformers


Issue with reloading model, please help me with what I should change #26120

Closed Shruthipriya-BS closed 1 year ago

Shruthipriya-BS commented 1 year ago

My code:

```python
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    logging,
)
import pandas as pd
# Dataset is needed for Dataset.from_pandas() further down
from datasets import load_dataset, concatenate_datasets, Dataset
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer
import gc
import timeit
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np
from transformers import BartTokenizer, BartForConditionalGeneration
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torchaudio
from IPython.display import Audio
import re

dataset1_name = "knkarthick/dialogsum"
dataset1 = load_dataset(dataset1_name, split='train')
# assumption: the test split is loaded the same way (not shown in the original snippet)
test_dataset = load_dataset(dataset1_name, split='test')

train_df = pd.DataFrame(dataset1)
test_df = pd.DataFrame(test_dataset)

# instruction finetuning data preparation function
def prepare_dataset(df, split='train'):
    text_col = []
    # change instruction according to the task
    instruction = """Write a concise summary of the below input text. Return your response in bullet points which covers the key points of the text. """
    if split == 'train':
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            output = row["summary"]
            # keeping output column in training dataset
            text = ("### Instruction: \n" + instruction
                    + "\n### Input: \n" + input_q
                    + "\n### Response :\n" + output)
            text_col.append(text)
        df.loc[:, 'text'] = text_col
    else:
        for _, row in df.iterrows():
            input_q = row["dialogue"]
            # not keeping output column in test dataset
            text = ("### Instruction: \n" + instruction
                    + "\n### Input: \n" + input_q
                    + "\n### Response :\n")
            text_col.append(text)
        df.loc[:, 'text'] = text_col
    return df

train_df = prepare_dataset(train_df, 'train')
test_df = prepare_dataset(test_df, 'test')
dataset = Dataset.from_pandas(train_df)

fp16 = False

# Number of training epochs
num_train_epochs = 2

# Enable bf16 training
bf16 = True

model_name = "NousResearch/Llama-2-7b-chat-hf"
bnb_4bit_compute_dtype = "bfloat16"
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="bfloat16",
    bnb_4bit_use_double_quant=False,
)

if compute_dtype == torch.float16 and True:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
        print("=" * 80)

# loading the model with quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64  # rank

# Parameter efficient finetuning: LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    # we only create adapters for the q, v matrices of the attention module
    target_modules=["q_proj", "v_proj"],
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# arguments are self explanatory
import transformers

# Tensorboard logs
tb_log_dir = "./results/logs"

training_arguments = transformers.TrainingArguments(
    output_dir="llama2_qlora_finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=10,
    logging_steps=1,
    learning_rate=2e-4,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=250,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,        # passing peft config
    dataset_text_field="text",      # the column that holds the prompt text
    args=training_arguments,        # training arguments
    tokenizer=tokenizer,            # tokenizer
    packing=False,
    max_seq_length=512,
)
trainer.train()
trainer.model.save_pretrained('modeldir')
```
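Because a `peft_config` is passed to `SFTTrainer`, `trainer.model` is a PEFT-wrapped model, so `save_pretrained('modeldir')` should write only the LoRA adapter rather than a full checkpoint, which is why the reload step below wraps the base model with `PeftModel.from_pretrained` before merging. A quick, purely illustrative check (not part of the original code):

```python
# Illustrative check (assumption: run right after training, in the same working directory).
# With a PEFT-wrapped model, 'modeldir' is expected to contain adapter files such as
# adapter_config.json and adapter_model.bin / adapter_model.safetensors, not full weights.
import os
print(sorted(os.listdir('modeldir')))
```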

Steps to follow: Runtime -> Restart runtime, then run the below:

```python
import os
import gc
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)
from peft import LoraConfig, PeftModel, get_peft_model

gc.collect()

# Set the environment variable before importing PyTorch
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "0"

model_name = "NousResearch/Llama-2-7b-chat-hf"
device_map = {"": 0}
device_map = 'auto'
output_dir = 'modeldir'

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map=device_map,
    offload_folder='offload/',
    offload_state_dict=True,
)
model = PeftModel.from_pretrained(base_model, output_dir, offload_folder="offload/")
model = model.merge_and_unload()
gc.collect()
```
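Before pushing, a small sanity check may be worth adding (an assumption on my part, not in the original code): because the base model is loaded with `low_cpu_mem_usage=True`, a `device_map`, and an offload folder, some parameters can remain on the meta device after `merge_and_unload()`, and those parameters have no data behind them.

```python
# Sanity check (not part of the original snippet): any parameter still on the meta
# device at this point has no underlying data, so the checkpoint pushed below
# would be incomplete.
meta_params = [name for name, p in model.named_parameters() if p.device.type == "meta"]
print(f"Parameters still on the meta device: {len(meta_params)}")
```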

```python
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model.push_to_hub("modified_llama2", use_auth_token=True)
tokenizer.push_to_hub("modified_llama2", use_auth_token=True)
```

Restart the runtime to clear VRAM, then run the below to load the model in 4-bit for inference:

new_model="modified_llama2" huggingface_profile = "Shruthipriya" full_path = huggingface_profile + "/" + new_model

Activate 4-bit precision base model loading

use_4bit = True

Activate nested quantization for 4-bit base models

use_nested_quant = False

Compute dtype for 4-bit base models

bnb_4bit_compute_dtype = "bfloat16"

Quantization type (fp4 or nf4)

bnb_4bit_quant_type = "nf4"

def load_model(model_name):

Load tokenizer and model with QLoRA configuration

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
        print("=" * 80)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
#tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "right"

return model, tokenizer, peft_config

model, tokenizer = load_model(full_path)

```
NotImplementedError                       Traceback (most recent call last)
/home/shruthipriya/Documents/sp/sp-env/summary.ipynb Cell 21 line 6
     57 tokenizer.padding_side = "right"
     59 return model, tokenizer, peft_config
---> 61 model, tokenizer = load_model(full_path)

/home/shruthipriya/Documents/sp/sp-env/summary.ipynb Cell 21 line 3
     32 print("Your GPU supports bfloat16, you can accelerate training with the argument --bf16")
     33 print("=" * 80)
---> 35 model = AutoModelForCausalLM.from_pretrained(
     36     model_name,
     37     device_map='auto',
     38     quantization_config=bnb_config,
     39 )
     41 model.config.use_cache = False
     42 model.config.pretraining_tp = 1

File ~/Documents/sp/sp-env/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py:493, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    491 elif type(config) in cls._model_mapping.keys():
    492     model_class = _get_model_class(config, cls._model_mapping)
--> 493     return model_class.from_pretrained(
    494         pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    495     )
    496 raise ValueError(
    497     f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
    498     f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
    499 )

File ~/Documents/sp/sp-env/lib/python3.10/site-packages/transformers/modeling_utils.py:2903, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
   2893 if dtype_orig is not None:
   2894     torch.set_default_dtype(dtype_orig)
   2896 (
   2897     model,
   2898     missing_keys,
   2899     unexpected_keys,
   2900     mismatched_keys,
   2901     offload_index,
   2902     error_msgs,
-> 2903 ) = cls._load_pretrained_model(
   2904     model,
   2905     state_dict,
   2906     loaded_state_dict_keys,  # XXX: rename?
   2907     resolved_archive_file,
   2908     pretrained_model_name_or_path,
   2909     ignore_mismatched_sizes=ignore_mismatched_sizes,
   2910     sharded_metadata=sharded_metadata,
   2911     _fast_init=_fast_init,
   2912     low_cpu_mem_usage=low_cpu_mem_usage,
   2913     device_map=device_map,
   2914     offload_folder=offload_folder,
   2915     offload_state_dict=offload_state_dict,
   2916     dtype=torch_dtype,
   2917     is_quantized=(load_in_8bit or load_in_4bit),
   2918     keep_in_fp32_modules=keep_in_fp32_modules,
   2919 )
   2921 model.is_loaded_in_4bit = load_in_4bit
   2922 model.is_loaded_in_8bit = load_in_8bit

File ~/Documents/sp/sp-env/lib/python3.10/site-packages/transformers/modeling_utils.py:3260, in PreTrainedModel._load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, is_quantized, keep_in_fp32_modules)
   3250 mismatched_keys += _find_mismatched_keys(
   3251     state_dict,
   3252     model_state_dict,
   (...)
   3256     ignore_mismatched_sizes,
   3257 )
   3259 if low_cpu_mem_usage:
-> 3260     new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
   3261         model_to_load,
   3262         state_dict,
   3263         loaded_keys,
   3264         start_prefix,
   3265         expected_keys,
   3266         device_map=device_map,
   3267         offload_folder=offload_folder,
   3268         offload_index=offload_index,
   3269         state_dict_folder=state_dict_folder,
   3270         state_dict_index=state_dict_index,
   3271         dtype=dtype,
   3272         is_quantized=is_quantized,
   3273         is_safetensors=is_safetensors,
   3274         keep_in_fp32_modules=keep_in_fp32_modules,
   3275     )
   3276     error_msgs += new_error_msgs
   3277 else:

File ~/Documents/sp/sp-env/lib/python3.10/site-packages/transformers/modeling_utils.py:725, in _load_state_dict_into_meta_model(model, state_dict, loaded_state_dict_keys, start_prefix, expected_keys, device_map, offload_folder, offload_index, state_dict_folder, state_dict_index, dtype, is_quantized, is_safetensors, keep_in_fp32_modules)
    722 fp16_statistics = None
    724 if "SCB" not in param_name:
--> 725     set_module_quantized_tensor_to_device(
    726         model, param_name, param_device, value=param, fp16_statistics=fp16_statistics
    727     )
    729 return error_msgs, offload_index, state_dict_index

File ~/Documents/sp/sp-env/lib/python3.10/site-packages/transformers/utils/bitsandbytes.py:77, in set_module_quantized_tensor_to_device(module, tensor_name, device, value, fp16_statistics)
     75     new_value = old_value.to(device)
     76 elif isinstance(value, torch.Tensor):
---> 77     new_value = value.to("cpu")
     78     if value.dtype == torch.int8:
     79         is_8bit_serializable = version.parse(importlib.metadata.version("bitsandbytes")) > version.parse(
     80             "0.37.2"
     81         )

NotImplementedError: Cannot copy out of meta tensor; no data!
```
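The error itself says that some weights were still on the meta device, i.e. had no actual data, when transformers tried to move them during the quantized load (`set_module_quantized_tensor_to_device` calling `value.to("cpu")`). A minimal check, assuming the repo id from the code above and making no claim about the root cause, is to reload the pushed checkpoint without quantization and inspect the loading info for missing weights. Separately, once loading succeeds, `model, tokenizer = load_model(full_path)` would still fail, because `load_model` returns three values.

```python
# Diagnostic sketch only (assumptions: repo id taken from the code above; this checks
# whether the pushed checkpoint actually contains all weights, it is not a fix).
import torch
from transformers import AutoModelForCausalLM

probe, loading_info = AutoModelForCausalLM.from_pretrained(
    "Shruthipriya/modified_llama2",
    torch_dtype=torch.bfloat16,
    output_loading_info=True,
)
print("missing keys:", len(loading_info["missing_keys"]))
print("unexpected keys:", len(loading_info["unexpected_keys"]))
```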

SunMarc commented 1 year ago

Hi @Shruthipriya-BS, please submit a minimal reproducer with your config. It's hard to help you with the current description.

github-actions[bot] commented 1 year ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.