Accelerator with DeepSpeed stage 3 giving an error when saving the model
Closed · KOVVURISATYANARAYANAREDDY closed this issue 2 years ago
Please update your code to the following after PR #370 is merged:
- unwrapped_model.save_pretrained(save_dir, save_function=accelerator.save, state_dict=accelerator.get_state_dict(model))
+ unwrapped_model.save_pretrained(save_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save, state_dict=accelerator.get_state_dict(model))
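(For context: under DeepSpeed ZeRO stage 3 the model parameters are sharded across processes, so accelerator.get_state_dict(model) gathers the full state dict before saving, and is_main_process ensures only rank 0 writes to disk. A minimal sketch of the full saving pattern, with save_dir as a placeholder path:)

accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(
    save_dir,                                      # placeholder output directory
    is_main_process=accelerator.is_main_process,   # only rank 0 writes files
    save_function=accelerator.save,
    state_dict=accelerator.get_state_dict(model),  # gathers the ZeRO-3 shards
)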
After the change was merged, I'm getting a new error.
The code I used:
import torch
import torch.nn.functional as F
from datasets import load_dataset
from accelerate import Accelerator, DeepSpeedPlugin
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from data import CustomDataset
import pandas as pd
from tqdm import tqdm
from transformers import AdamW
import wandb
set_seed(42)
import os
#os.system('wandb login --relogin')
#wandb.login()
#wandb.init(project="OPT-accelerate", entity="satya4093", name='exp-1')
BATCH_SIZE = 4
lr = 0.0001
deepspeed_plugin = DeepSpeedPlugin(zero_stage=3, gradient_accumulation_steps=4, offload_optimizer_device='cpu')
accelerator = Accelerator(fp16=True, deepspeed_plugin=deepspeed_plugin, log_with="wandb")
#accelerator = Accelerator(log_with="wandb")
hps = {"learning_rate": lr}
accelerator.init_trackers("my_project", config=hps)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") #6.7b, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", use_fast=False)
#model = accelerator.prepare(model)
df = pd.read_excel("dataset.xlsx")
dataset = CustomDataset(df, tokenizer, max_len=512)
data = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=5)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr) #, correct_bias=True)
model, optimizer, data = accelerator.prepare(model, optimizer, data)
#optimizer, data = accelerator.prepare(optimizer, data)
device = accelerator.device
accelerator.print("Device: ", device)
gradient_accum_steps = 4
model.train()
for epoch in range(10):
    running_loss = 0
    for batch_idx, batch in enumerate(data):
        input_ids = batch["Input_ids"].to(device)
        attention_mask = batch["attentions"].to(device)
        labels = batch["Input_ids"].to(device)
        #print("Input_ids shape", input_ids.shape)
        optimizer.zero_grad()
        output = model(input_ids, attention_mask=attention_mask, labels=labels)
        #wandb.log({"Loss", output['loss'].item()})
        accelerator.print(f"Loss - {round(output['loss'].item(), 3)}, Batch_id - {batch_idx}, Epoch - {epoch}")
        accelerator.log({"train_loss": output['loss'].item()}, step=batch_idx)
        # scale the loss so gradients are averaged over the accumulation window
        accelerator.backward(output['loss'] / gradient_accum_steps)
        if (batch_idx + 1) % gradient_accum_steps == 0:
            optimizer.step()
            torch.cuda.empty_cache()
        running_loss += output['loss'].item()
    avg_loss = running_loss / ((batch_idx + 1) * BATCH_SIZE)
    accelerator.print("Avg_loss:", avg_loss)
    if avg_loss < 100:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        save_dir = f"MODEL/{epoch}/"
        print(f"Saving Model to {save_dir}")
        #unwrapped_model.save_pretrained(save_dir, save_function=accelerator.save, state_dict=accelerator.get_state_dict(model))
        unwrapped_model.save_pretrained(save_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save, state_dict=accelerator.get_state_dict(model))
        break
accelerator.end_training()
I got the following error saying 'NoneType' object is not iterable:
Saving Model to MODEL/0/
Traceback (most recent call last):
File "untitled.py", line 87, in <module>
unwrapped_model.save_pretrained(save_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save, state_dict=accelerator.get_state_dict(model))
File "/opt/conda/envs/OPT/lib/python3.8/site-packages/accelerate-0.9.0.dev0-py3.8.egg/accelerate/accelerator.py", line 889, in get_state_dict
for k in state_dict:
TypeError: 'NoneType' object is not iterable
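From the traceback, get_state_dict receives None for the consolidated state dict on non-main ranks: under ZeRO stage 3, DeepSpeed only materializes the gathered 16-bit weights on the main process. An illustrative sketch of the failing path (names simplified; not the actual accelerate source):

state_dict = consolidate_zero3_state_dict(model)  # hypothetical helper; returns None on non-main ranks
for k in state_dict:                              # TypeError: 'NoneType' object is not iterable
    if state_dict[k].dtype == torch.float16:
        state_dict[k] = state_dict[k].float()     # upcast fp16 weights before saving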
Installed package versions:
deepspeed==0.6.4
accelerate==0.9.0.dev0
torch==1.10.1+cu111
torchaudio==0.10.1+rocm4.1
torchvision==0.11.2+cu111
transformers==4.19.2
I installed accelerate by cloning the latest repo and running "python setup.py install".
@pacman100 Please let me know if I have made a mistake.
Adding the if self.is_main_process: guard inside the get_state_dict method in accelerator.py worked for me:
if self.is_main_process:
    for k in state_dict:
        if state_dict[k].dtype == torch.float16:
            state_dict[k] = state_dict[k].float()
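(With this guard, non-main ranks skip the upcast loop and get_state_dict presumably returns None there, which is harmless: with is_main_process=accelerator.is_main_process passed to save_pretrained, only rank 0 actually writes the checkpoint.)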
Hello @KOVVURISATYANARAYANAREDDY, thank you for reporting the new error and also for coming up with the resolution. I have checked your resolution by making the above change, using the saved model, and making sure the same performance is achieved.
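(For anyone verifying a saved checkpoint the same way, a minimal reload sketch; the MODEL/0/ path comes from the script above, and the prompt is arbitrary:)

from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("MODEL/0/")  # directory written by save_pretrained
# the tokenizer was not saved alongside the model, so load it from the hub
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", use_fast=False)
inputs = tokenizer("Hello, my name is", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))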
Thank you @pacman100