nlp-with-transformers / notebooks

Jupyter notebooks for the Natural Language Processing with Transformers book
https://transformersbook.com/
Apache License 2.0

Chapter 6 failed - fine-tune PEGASUS #71

Open srashtchi opened 2 years ago

srashtchi commented 2 years ago

Information

The problem arises in chapter: 6 (Summarization)

Describe the bug

Steps to reproduce the behavior:

Run the notebook on a CUDA/GPU-enabled device (A100 card).

trainer = Trainer(model=model,
                  args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])

Trainer() fails with the following error:

Traceback (most recent call last):
  File "/home/shabnam/anaconda3/envs/rapids-22.08/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "", line 1, in
    trainer = Trainer(model=model, args=training_args,
  File "/home/shabnam/anaconda3/envs/rapids-22.08/lib/python3.9/site-packages/transformers/trainer.py", line 450, in __init__
    self._move_model_to_device(model, args.device)
  File "/home/shabnam/anaconda3/envs/rapids-22.08/lib/python3.9/site-packages/transformers/trainer.py", line 722, in _move_model_to_device
    model = model.to(device)
AttributeError: 'str' object has no attribute 'to'

Expected behavior: training ...
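
For context (an editorial note, not part of the original report): the traceback shows that Trainer._move_model_to_device calls .to(device) on whatever was passed as model, which fails if model is still a checkpoint string rather than a loaded model object. A minimal sketch of the same failure mode, assuming only PyTorch is installed:

import torch

model = "google/pegasus-large"  # a plain string, not a PreTrainedModel
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

try:
    model.to(device)  # same call Trainer.__init__ makes internally
except AttributeError as err:
    print(err)  # 'str' object has no attribute 'to'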

srashtchi commented 2 years ago

In case you need to see the whole code leading up to the point where I hit the error, here it is:

import torch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:2" if use_cuda else "cpu")
print("Running on: ",device)
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_metric
import pandas as pd
from tqdm import tqdm

dataset_samsum = load_dataset("samsum") #document
model ="google/pegasus-large"
tokenizer = AutoTokenizer.from_pretrained(model)
rouge_metric = load_metric("rouge", cache_dir=None)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

def chunks(list_of_elements, batch_size):
    """Yield successive batch-sized chunks from list_of_elements."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i: i + batch_size]

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="dialogue",
                               column_summary="summary"):
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):
        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    score = metric.compute()
    return score

def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["dialogue"], max_length=1024,
                                truncation=True)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["summary"], max_length=128,
                                     truncation=True)

    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features,batched=True)
columns = ["input_ids", "labels", "attention_mask"]
dataset_samsum_pt.set_format(type="torch", columns=columns)

from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='pegasus-samsum', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16)

trainer = Trainer(model=model,
                  args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])
srashtchi commented 2 years ago

Is this issue similar to #46, raised in April? If it is, why does it still occur after that fix? @lvwerra

tanmey007 commented 1 year ago

model ="google/pegasus-large" This is a string (model name) model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-large") This should resolve your issue @srashtchi