Closed: HebaGamalElDin closed this issue 2 years ago.
Hi, just unwrap the model:
model.module.generate(inputs)
(I didn't verify, but this should work)
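For context, a minimal self-contained sketch of that pattern (the checkpoint name, dummy input, and launch setup are placeholders, not taken from this issue):

import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# Assumes the distributed process group is already initialized,
# e.g. the script was launched with torchrun or the SageMaker launcher.
device = torch.cuda.current_device()
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")  # placeholder checkpoint
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)
ddp_model = DDP(model, device_ids=[device])

# DDP only wraps the model; the underlying HF model (and its generate()) lives on .module.
pixel_values = torch.randn(2, 3, 384, 384).to(device)  # dummy batch of images
generated_ids = ddp_model.module.generate(pixel_values)
print(processor.batch_decode(generated_ids, skip_special_tokens=True))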
Yes, this actually works, thank you @nitaytech. I've just hit one more issue now:
when I decode the batch after generation I get no prediction strings (generated_text is always an empty string). Should I load the processor on each device as well? Note: this behavior only happens with DistributedDataParallel; everything was working fine on a single GPU.
def test(processor: TrOCRProcessor, model: VisionEncoderDecoderModel, dataloader: DataLoader):
    output: list[tuple[int, str]] = []
    model.eval()
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            inputs: torch.Tensor = batch["input"].cuda(non_blocking=True)
            generated_ids = model.module.generate(inputs)
            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
            ids = [t.item() for t in batch["idx"]]
            output.extend(zip(ids, generated_text))
    return output
Am I missing anything else?
I'd recommend using HuggingFace Accelerate for training TrOCR in a distributed set-up.
You can then use unwrap_model to turn the distributed module back into a regular nn.Module (on which you can call generate)
We do provide an example for that, see here: https://github.com/huggingface/transformers/blob/8edf1963103127247ae3ef96fc5ba6a96eb4a290/examples/pytorch/summarization/run_summarization_no_trainer.py#L675
This is taken from the example script for summarization, but it would be equivalent for TrOCR
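For illustration, here is roughly what that evaluation step could look like for TrOCR (a sketch adapted from the pattern in the linked summarization example; the checkpoint, dummy data, and max_length below are placeholders, not taken from this thread):

import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

accelerator = Accelerator()

# Placeholder checkpoint and dummy images, just to show the call pattern.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
eval_dataloader = DataLoader(TensorDataset(torch.randn(8, 3, 384, 384)), batch_size=2)

model, eval_dataloader = accelerator.prepare(model, eval_dataloader)

model.eval()
predictions = []
for (pixel_values,) in eval_dataloader:
    with torch.no_grad():
        # generate() lives on the underlying HF model, so unwrap the prepared module first.
        generated_ids = accelerator.unwrap_model(model).generate(pixel_values, max_length=64)
    # Pad to a common length and gather the ids from all processes before decoding.
    generated_ids = accelerator.pad_across_processes(
        generated_ids, dim=1, pad_index=processor.tokenizer.pad_token_id
    )
    generated_ids = accelerator.gather(generated_ids)
    predictions.extend(processor.batch_decode(generated_ids, skip_special_tokens=True))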
I have already switched to Hugging Face Accelerate (I'm working on SageMaker, so I installed the "accelerate[sagemaker]" version), however the same issue is still present.
Here's the training script, which already works properly on a single GPU. I couldn't figure out what the root issue is.
import os
import torch
print(f"TORCH_VERSION: {torch.__version__}")
print(f"CUDA AVAILABILITY: {torch.cuda.is_available()} GPUs: {torch.cuda.get_device_name()}")
import pandas as pd
import random
import math
import re
import numpy as np
import itertools
from PIL import Image
import PIL.ImageOps
import cv2
from smart_open import open as smart_open
import io
from torch.utils.data import DataLoader
from transformers import AdamW, TrOCRProcessor, VisionEncoderDecoderModel, get_scheduler
from Data_pipeline import Context, HCRDataset, OCRDataLoad
from Validation_Metrics import getWordLevelError, getCharacterLevelError
from accelerate import Accelerator
import accelerate

accelerator = Accelerator(kwargs_handlers=[accelerate.DistributedDataParallelKwargs(find_unused_parameters=True)])
accelerator.print(f"ACCELERATOR DEVICE: {accelerator.distributed_type} ---- NUM OF PROCESSES: {accelerator.num_processes}")

from datasets import load_metric
cer_metric = load_metric("cer")
wer_metric = load_metric("wer")


def load_model() -> VisionEncoderDecoderModel:
    model: VisionEncoderDecoderModel = VisionEncoderDecoderModel.from_pretrained('gagan3012/ArOCRv4')
    return model.to(accelerator.device)


def init_model_for_training(model: VisionEncoderDecoderModel, processor: TrOCRProcessor):
    model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
    model.config.pad_token_id = processor.tokenizer.pad_token_id
    model.config.vocab_size = model.config.decoder.vocab_size
    model.config.bos_token_id = processor.tokenizer.bos_token_id
    model.config.max_length = 162
    model.config.decoder.is_decoder = True
    model.config.decoder.add_cross_attention = True
    torch.cuda.manual_seed_all(42)
    model.config.num_beams = 4


def predict(processor: TrOCRProcessor, model: VisionEncoderDecoderModel, dataloader: DataLoader):
    output: list[tuple[int, str]] = []
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            inputs: torch.Tensor = batch["input"].to(accelerator.device)
            generated_ids = model.generate(inputs)
            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
            ids = [t.item() for t in batch["idx"]]
            output.extend(zip(ids, generated_text))
    return output


def validate(context: Context, print_wrong: bool = False):
    predictions = predict(context.processor, context.model, context.val_dataloader)
    assert len(predictions) > 0
    CER_avg = []
    WER_avg = []
    correct_count = 0
    wrong_count = 0
    for id, prediction in predictions:
        label = context.val_dataset.get_label(id)
        path = context.val_dataset.get_path(id)
        CER = getCharacterLevelError(label, prediction)
        WER = getWordLevelError(label, prediction)
        CER_avg.append(CER)
        WER_avg.append(WER)
        accelerator.print(f"validation-batch--------------{id}-----------Label--------{label}---------Prediction-----------{prediction} -----CER----- {CER}----")
    return round(sum(CER_avg) / len(CER_avg), 2), round(sum(WER_avg) / len(WER_avg), 2)


def load_processor() -> TrOCRProcessor:
    return TrOCRProcessor.from_pretrained('gagan3012/ArOCRv4')


def train(context, train_epochs, learning_rate):
    model = context.model
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = train_epochs * len(context.train_dataloader)
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    # Let Accelerate wrap the model and shard the dataloaders across processes.
    model, optimizer, context.train_dataloader, context.val_dataloader = accelerator.prepare(
        model, optimizer, context.train_dataloader, context.val_dataloader
    )
    overall_loss = 0.0
    overall_cer = 0.0
    overall_wer = 0.0
    for epoch in range(train_epochs):
        context.model.train()
        train_loss = 0.0
        min_cer = 1.0
        min_train_loss = 1.0
        for j, batch in enumerate(context.train_dataloader):
            inputs: torch.Tensor = batch["input"].to(accelerator.device)
            labels: torch.Tensor = batch["label"].to(accelerator.device)
            outputs = model(pixel_values=inputs, labels=labels)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            train_loss += loss
            if (loss < min_train_loss) or (min_train_loss == 1.0):
                min_train_loss = loss
        overall_loss += train_loss
        accelerator.print(f"Epoch {epoch}-----Loss---{train_loss/len(context.train_dataloader)}--------- min-loss: {min_train_loss}")
        # evaluate
        unwrapped_model = accelerator.unwrap_model(model)
        context.model = unwrapped_model
        cer, wer = validate(context)
        del loss, outputs, train_loss
        overall_cer += cer
        overall_wer += wer
    accelerator.print(f"\n---- overall loss: {overall_loss/train_epochs}\n\n")
    accelerator.print(f"\n---- overall cer: {overall_cer/train_epochs}\n\n")
    accelerator.print(f"\n---- overall wer: {overall_wer/train_epochs}\n\n")


def main():
    batch_size = 8
    train_epochs = 10
    learning_rate = 0.001
    checkpoints_path = "checkpoints"
    processor = load_processor()
    (x_train, y_train), (x_valid, y_valid), (x_test, y_test) = OCRDataLoad()
    train_dataset = HCRDataset(x_train, y_train, processor)
    train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=4)
    val_dataset = HCRDataset(x_valid, y_valid, processor)
    val_dataloader = DataLoader(val_dataset, batch_size, shuffle=False, num_workers=4)
    # SageMaker data parallel: previously the model was wrapped manually with DDP here;
    # with Accelerate this is handled by accelerator.prepare() inside train().
    model = load_model()
    init_model_for_training(model, processor)
    context = Context(model, processor, train_dataset, train_dataloader, val_dataset, val_dataloader)
    train(context, train_epochs, learning_rate)
    unwrapped_model = accelerator.unwrap_model(context.model)
    # SageMaker data parallel: save the model on the master node.
    unwrapped_model.save_pretrained(checkpoints_path)


if __name__ == '__main__':
    main()
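One small note on the saving step: in the script above, save_pretrained runs on every process. A minimal sketch of the usual Accelerate pattern for saving from the main process only (reusing the context and checkpoints_path names from the script; worth double-checking for the SageMaker case):

# Make sure every process has finished training before writing the checkpoint.
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(context.model)
if accelerator.is_main_process:
    # Only the main process writes the checkpoint directory.
    unwrapped_model.save_pretrained(checkpoints_path)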
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.
We do provide an example for that, see here:
This is taken from the example script for summarization, but it would be equivalent for TrOCR
accelerator.unwrap_model(model).generate worked for me; module was not working.
System Info
transformers version: 4.21.1
Python: 3.8
PyTorch: 1.12
Who can help?
@NielsRogge
Reproduction
Steps:
Expected behavior
I'm training the TrOCR model on my customized Arabic dataset on a SageMaker instance. I'm running a distributed data-parallel training job and have added the model to all GPUs using PyTorch as follows:
When I'm running the validation function, it raises this error when I'm trying the generate function:
What could be the problem, please?