```
!pip install -q -U bitsandbytes
!pip install -q -U sentencepiece
!pip install -q sentencepiece
!pip install -U sentencepiece
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install -q -U einops
```
```python
def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):
    loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)
    is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq"
    # Freeze all base-model parameters
    for name, param in model.named_parameters():
        param.requires_grad = False

    if not is_gptq_quantized:
        # Cast fp16/bf16 parameters to fp32 for training stability
        for param in model.parameters():
            if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
                param.data = param.data.to(torch.float32)

    if (loaded_in_kbit or is_gptq_quantized) and use_gradient_checkpointing:
        # Make sure the inputs to the trainable layers require gradients
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        model.gradient_checkpointing_enable()

    return model
```
```python
import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    GenerationConfig,
)
from peft.tuners.lora import LoraLayer
from trl import SFTTrainer

model_name = "openlm-research/open_llama_3b_v2"  # Change to the 3 billion parameter model

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.add_special_tokens({"pad_token": "<pad>"})  # NOTE: the pad token string was cut off in the original post; "<pad>" is a placeholder

dataset = load_dataset("timdettmers/openassistant-guanaco")

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": 0},
)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

training_arguments = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=4,
    log_level="debug",
    optim="paged_adamw_32bit",
    save_steps=2,
    logging_steps=1,
    learning_rate=1e-4,
    eval_steps=5,
    fp16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

model.save_pretrained("./content/gptx")  # Save the adapter
```
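Side note: the script imports PeftModel and GenerationConfig but never uses them. If the plan is to reload the saved adapter for generation afterwards, something along these lines should work (a minimal sketch, assuming the adapter was saved to ./content/gptx as above and a GPU is available; the 4-bit quantization config is omitted for brevity):

```python
# Minimal sketch: reload the base model and attach the saved LoRA adapter.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

base = AutoModelForCausalLM.from_pretrained(
    "openlm-research/open_llama_3b_v2", device_map={"": 0}
)
tok = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b_v2", use_fast=True)
model = PeftModel.from_pretrained(base, "./content/gptx")  # attach the LoRA weights

prompt = "### Human: Explain LoRA in one sentence.### Assistant:"
inputs = tok(prompt, return_tensors="pt").to("cuda")  # the model was placed on GPU 0 above
output = model.generate(**inputs, generation_config=GenerationConfig(max_new_tokens=64))
print(tok.decode(output[0], skip_special_tokens=True))
```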
Not sure if this is just a temporary fix, but: the transformers library depends on sentencepiece, and the installation order can affect whether dependencies are correctly recognized. By installing sentencepiece before transformers, I made sure sentencepiece was available when transformers was installed.
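A quick way to confirm the install order actually took effect is to check that both libraries import cleanly and that transformers reports sentencepiece as available (a small check, not part of the fix itself):

```python
# Sanity check: both packages import, and transformers can see sentencepiece.
import sentencepiece
import transformers
from transformers.utils import is_sentencepiece_available

print("sentencepiece:", sentencepiece.__version__)
print("transformers:", transformers.__version__)
print("sentencepiece visible to transformers:", is_sentencepiece_available())
```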
I can't repro any errors here when trying all of those pip installs.
One thing you might try is putting those on a single line and letting pip manage the install order:
```
!pip install -q -U bitsandbytes sentencepiece sentencepiece sentencepiece transformers peft accelerate datasets trl einops
```
Thanks, your one line with 3x sentencepiece masterfully solved my issue. Now I'm fighting with:

```
ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
```
Here is my full script:

```python
import torch
from datasets import load_dataset, Features, Value, Dataset
from peft import LoraConfig, PeftModel
import bitsandbytes as bnb
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    GenerationConfig,
)
from peft.tuners.lora import LoraLayer
from trl import SFTTrainer


def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True):
    loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False)
    is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq"
    # Freeze all base-model parameters
    for name, param in model.named_parameters():
        param.requires_grad = False

    if not is_gptq_quantized:
        # Cast fp16/bf16 parameters to fp32 for training stability
        for param in model.parameters():
            if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16):
                param.data = param.data.to(torch.float32)

    if (loaded_in_kbit or is_gptq_quantized) and use_gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)

            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        model.gradient_checkpointing_enable()

    return model


model_name = "gpt2-large"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.add_special_tokens({"pad_token": "<pad>"})  # NOTE: the pad token string was cut off in the original post; "<pad>" is a placeholder


def tokenization(batch):
    encoding = tokenizer(
        [str(text) for text in batch["text"]],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    encoding["labels"] = encoding["input_ids"].clone()
    return encoding


def read_lua_code_dataset(file_path):
    with open(file_path, "r") as f:
        content = f.read()
    examples = content.split("[END]")
    examples = [example.replace("[START]", "").strip() for example in examples if example.strip()]
    return examples


examples = read_lua_code_dataset("./lua/lua_code_dataset.txt")
dataset = Dataset.from_dict({"text": examples})
dataset = dataset.map(tokenization, batched=True, remove_columns=["text"])
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

compute_dtype = getattr(torch, "float16")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"": 0},
)
print(model)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["c_attn", "c_proj"],
)

training_arguments = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    do_eval=True,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=32,
    log_level="debug",
    optim="paged_adamw_32bit",
    save_steps=2,
    logging_steps=1,
    learning_rate=2e-4,
    eval_steps=5,
    fp16=True,
    max_grad_norm=0.1,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
)

dataset = dataset.train_test_split(test_size=0.2)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    dataset_text_field="input_ids",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

trainer.train()

model.save_pretrained("./content/gpt-l-2")
```
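Since the map call above removes the "text" column, it may be worth checking what the dataset actually contains by the time it reaches SFTTrainer (a small diagnostic sketch, my own addition, assuming the dataset built above):

```python
# Diagnostic sketch: list the remaining columns and the Python type of each
# value in the first training example. SFTTrainer's dataset_text_field is
# expected to point at a column of strings.
print(dataset["train"].column_names)

first_example = dataset["train"][0]
for key, value in first_example.items():
    print(key, type(value))
```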
@sweterek420 Thanks for the reply. From the error text you shared, it appears some input is not of the correct type (e.g. something needs to be of type `str`, `List[str]`, or `List[List[str]]`, but it isn't).
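To illustrate the distinction: the tokenizer accepts raw strings or lists of strings, but not lists of token ids. A minimal sketch using the stock gpt2 tokenizer (not your exact setup):

```python
# Minimal sketch of the tokenizer's input contract.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")

print(tok("local x = 1"))                    # ok: a single str
print(tok(["local x = 1", "print(x)"]))      # ok: List[str] (a batch of strings)

try:
    tok([101, 102, 103])                     # a list of token ids is not text
except ValueError as err:
    print(err)                               # "text input must of type `str` ..."
```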
This appears to be an issue outside of Colab's control. We recommend asking on Stack Overflow or reaching out to the notebook author.