allenai / longformer

Longformer: The Long-Document Transformer
https://arxiv.org/abs/2004.05150
Apache License 2.0

CUDA error: device-side assert triggered in multi-class text classification #237

Open iteimouri opened 2 years ago

iteimouri commented 2 years ago

I am trying to run this approach on a dataset that has 5 classes rather than a binary one. My code is as follows:

import pandas as pd
import datasets
from datasets import Dataset
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

config = LongformerConfig()

# Taking the data from Yelp - note this differs from the original link, which used IMDB (binary)
train_data, test_data = datasets.load_dataset('yelp_review_full', split=['train', 'test'],
                                              cache_dir='FileStore/shared_uploads/data')

# reorder the columns to match the original link,
# i.e. [text, label] rather than [label, text]
# uncommenting the filtering lines below restricts the task to binary classification
df_train = pd.DataFrame(train_data)
df_train = df_train[df_train.columns[::-1]]
#df_train = df_train.loc[df_train['label'].isin([0, 1])]
train_data = Dataset.from_pandas(df_train)
df_test = pd.DataFrame(test_data)
df_test = df_test[df_test.columns[::-1]]
#df_test = df_test.loc[df_test['label'].isin([0, 1])]
test_data = Dataset.from_pandas(df_test)

# load model and tokenizer and define length of the text sequence
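# NOTE: num_labels is not passed below, so the classification head keeps the
# config default of 2 output classes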
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                           gradient_checkpointing=False,
                                                           attention_window = 512)
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', max_length = 1024)

model.config

# define a function that tokenizes the text and returns the inputs the model expects
def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding='max_length', truncation=True, max_length=1024)

train_data = train_data.map(tokenization, batched = True, batch_size = len(train_data))
test_data = test_data.map(tokenization, batched = True, batch_size = len(test_data))

train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')  # macro average, as there is no class imbalance
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# define the training arguments
training_args = TrainingArguments(
    output_dir = '/FileStore/shared_uploads/results',
    num_train_epochs = 2,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 8,    
    per_device_eval_batch_size= 8,
    evaluation_strategy = "no",
    save_strategy="no",
    disable_tqdm = False, 
    load_best_model_at_end=True,
    warmup_steps=200,
    weight_decay=0.01,
    logging_steps = 4,
    fp16 = True,
    logging_dir='/FileStore/shared_uploads/logs',
    dataloader_num_workers = 0,
    run_name = 'longformer-classification-yelp'
)

# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

os.environ["WANDB_DISABLED"] = "true"

# train the model
trainer.train()

When I run trainer.train() I get the "CUDA error: device-side assert triggered", which I assume happens because I am no longer doing binary classification. I suspect this is the cause because when I keep only two classes in the train and test sets (the commented lines in the code above), training runs without issue. Where does one declare that this model is multi-class rather than binary?
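For what it's worth, the assert itself is uninformative because CUDA reports device-side errors asynchronously. A minimal debugging sketch (assuming the Trainer's default collator, which renames the label column to labels) is to force synchronous kernel launches or to push a single batch through the model on CPU, where the failure becomes a readable exception:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before the first CUDA call to take effect

# run one collated batch on CPU: with a 2-way head and labels in {0..4},
# the loss raises "IndexError: Target 4 is out of bounds." instead of the opaque assert
batch = next(iter(trainer.get_train_dataloader()))
outputs = model.to('cpu')(input_ids=batch['input_ids'],
                          attention_mask=batch['attention_mask'],
                          labels=batch['labels'])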

I know that in LongformerForMultiLabelSequenceClassification.from_pretrained() one can set num_labels, but that is used for multi-label text classification rather than multi-class text classification.
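For reference, in stock Transformers the plain LongformerForSequenceClassification sizes its head from the same num_labels argument, which from_pretrained() forwards into the config. A minimal sketch of the 5-class setup under that assumption:

# size the classification head for 5 classes so labels 0-4 fit;
# num_labels is forwarded to LongformerConfig by from_pretrained()
model = LongformerForSequenceClassification.from_pretrained(
    'allenai/longformer-base-4096',
    num_labels=5,
    gradient_checkpointing=False,
    attention_window=512)

With integer class labels and num_labels > 1 the model then computes a cross-entropy loss (exactly one class per example), which is the multi-class case; multi-label heads instead apply a per-label sigmoid with a BCE loss.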