System Info
accelerate version: 0.18.0
OS: Linux n176-080-142 5.4.143.bsk.7-amd64 #5.4.143.bsk.7 SMP Debian 5.4.143.bsk.7 Mon Jul 4 02:44:16 UTC 2 x86_64 GNU/Linux
Information
[ ] The official example scripts
[X] My own modified scripts
Tasks
[ ] One of the scripts in the examples/ folder of Accelerate or an officially supported no_trainer script in the examples folder of the transformers repo (such as run_no_trainer_glue.py)
[X] My own task or dataset (give details below)
Reproduction
This is the code snippet:
import os
import shutil

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, get_cosine_schedule_with_warmup

# accelerator (accelerate.Accelerator), tokenizer, tensorboard (a SummaryWriter-like logger),
# and the values lr, epochs, save_model_steps, model_save_folder, model_card and
# finetune_dataset_filepath are defined earlier and omitted here.

batch_size = 2
max_sequence_length = 1024
llama_finetune_dataset = LlamaFinetuneDataset(finetune_dataset_filepath)
train_datasets = torch.utils.data.DataLoader(llama_finetune_dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             collate_fn=LlamaFinetuneDataset.collate_fn)
model = AutoModelForCausalLM.from_pretrained(model_card)
model.train()
no_decay = ['bias', 'norm.weight']
optimizer_grouped_parameters = [
    {
        'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': 1e-1,  # NOTE: follow llama paper
    },
    {
        'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0,
    },
]
# betas follow the original paper
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr, betas=(0.9, 0.95), weight_decay=1e-1)
scheduler = get_cosine_schedule_with_warmup(optimizer,
                                            # NOTE: the original paper uses 2000 warmup steps for pretraining,
                                            # but for continued training it is 0.
                                            num_warmup_steps=0,
                                            num_training_steps=int(len(train_datasets) / batch_size * epochs))
model, optimizer, scheduler, train_datasets = accelerator.prepare(model, optimizer, scheduler, train_datasets)
steps = 0
best_model_path = ''
for epoch in range(epochs):
    for inputs in tqdm(train_datasets, disable=not accelerator.is_main_process):
        outputs = model(**inputs)
        loss = outputs.loss
        if accelerator.is_main_process:
            tensorboard.add_scalar('Loss/ce_loss', loss.to(torch.float16), steps)
        accelerator.wait_for_everyone()
        if accelerator.is_main_process and not steps % 1:
            accelerator.print(f'device: {accelerator.device}, epoch: {epoch}, steps: {steps}, '
                              f'loss: {round(loss.to(torch.float16).detach().cpu().numpy().tolist(), 4)} ')
        if steps and not steps % save_model_steps:
            accelerator.print('saving model...')
            best_model_path = f'{model_save_folder}/{model_card.split("/")[-1]}_best_model_step{steps}_loss{round(loss.to(torch.float16).detach().cpu().numpy().tolist(), 4)}'
            if accelerator.is_main_process and not os.path.exists(model_save_folder):
                os.mkdir(model_save_folder)
            if accelerator.is_main_process and not os.path.exists(best_model_path):
                os.mkdir(best_model_path)
            # save model
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(best_model_path,
                                            save_function=accelerator.save,
                                            is_main_process=accelerator.is_main_process,
                                            max_shard_size='1GB',
                                            state_dict=accelerator.get_state_dict(unwrapped_model))
            if not accelerator.is_main_process:
                shutil.rmtree(best_model_path)
            # save config and tokenizer
            if accelerator.is_main_process:
                unwrapped_model.config.save_pretrained(best_model_path)
                tokenizer.save_pretrained(best_model_path)
            accelerator.print('successfully saved model...')
            accelerator.wait_for_everyone()
        optimizer.zero_grad()
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        steps += 1
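LlamaFinetuneDataset is my own class and is not shown above. As a rough, hypothetical sketch of the interface the snippet relies on (pre-tokenized samples collated into input_ids / attention_mask / labels dicts), the file format and padding details below are illustrative, not my exact implementation:

```python
import json

import torch
from torch.utils.data import Dataset


class LlamaFinetuneDataset(Dataset):
    """Hypothetical sketch: pre-tokenized prompt/response pairs for causal-LM finetuning."""

    def __init__(self, filepath):
        # one JSON object per line, e.g. {"input_ids": [...], "labels": [...]}
        with open(filepath) as f:
            self.samples = [json.loads(line) for line in f if line.strip()]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

    @staticmethod
    def collate_fn(batch):
        # pad every example in the batch to the longest sequence in that batch
        max_len = max(len(sample['input_ids']) for sample in batch)
        input_ids, attention_mask, labels = [], [], []
        for sample in batch:
            pad = max_len - len(sample['input_ids'])
            input_ids.append(sample['input_ids'] + [0] * pad)
            attention_mask.append([1] * len(sample['input_ids']) + [0] * pad)
            labels.append(sample['labels'] + [-100] * pad)
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }
```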
This is the accelerate config:

This is the deepspeed config:
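The actual contents of the two config files are not included above. Purely as a hypothetical illustration of an equivalent programmatic DeepSpeed setup in accelerate, none of the values below are taken from my real configs:

```python
from accelerate import Accelerator, DeepSpeedPlugin

# Hypothetical illustration only: the real run is driven by an accelerate config file
# plus a DeepSpeed JSON config, and none of the values below come from them.
deepspeed_plugin = DeepSpeedPlugin(
    zero_stage=3,                     # placeholder ZeRO stage
    gradient_accumulation_steps=1,    # placeholder
    offload_optimizer_device='cpu',   # placeholder offload target
    offload_param_device='cpu',       # placeholder offload target
)
accelerator = Accelerator(deepspeed_plugin=deepspeed_plugin)
```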
Expected behavior
Launching with batch size 2 on 8 GPUs should not raise an OOM error.
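For completeness, a run like this is started along the following lines; the config file name and training script name are placeholders, and the 8 processes come from the accelerate config rather than the command line:

```bash
accelerate launch --config_file accelerate_config.yaml finetune_llama.py
```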