I am trying to run this approach on a dataset that has 5 classes rather than a binary one. My code is as follows:
import pandas as pd
import datasets
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
import torch.nn as nn
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import wandb
import os
import mlflow
from datasets import Dataset
config = LongformerConfig()
# Taking the data from Yelp - note this is different from the original link, which used IMDB (binary)
train_data, test_data = datasets.load_dataset('yelp_review_full', split=['train', 'test'],
                                              cache_dir='FileStore/shared_uploads/data')
# reshaping the datasets so the columns are in the same order as in the original link,
# i.e. [text, label] rather than [label, text]
# (uncommenting the .isin([0, 1]) lines below reduces the task to binary classification)
df_train = pd.DataFrame(train_data)
df_train = df_train[df_train.columns[::-1]]
#df_train = df_train.loc[df_train['label'].isin([0, 1])]
train_data = Dataset.from_pandas(df_train)
df_test = pd.DataFrame(test_data)
df_test = df_test[df_test.columns[::-1]]
#df_test = df_test.loc[df_test['label'].isin([0, 1])]
test_data = Dataset.from_pandas(df_test)
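# sanity check (added by me): the reshaped frames should contain exactly 5 integer labels, 0-4
print(sorted(df_train['label'].unique()))  # expect [0, 1, 2, 3, 4]
print(sorted(df_test['label'].unique()))   # expect [0, 1, 2, 3, 4]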
# load model and tokenizer and define length of the text sequence
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                            gradient_checkpointing=False,
                                                            attention_window=512)
tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096', max_length=1024)
model.config
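# note: by default model.config.num_labels is 2 here, so the classification head
# has two outputs; I suspect this is where my 5-class problem comes from, but I am
# not sure this is the right place to change it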
# define a function that will tokenize the text and return the relevant inputs for the model
def tokenization(batched_text):
    return tokenizer(batched_text['text'], padding='max_length', truncation=True, max_length=1024)
train_data = train_data.map(tokenization, batched=True, batch_size=len(train_data))
test_data = test_data.map(tokenization, batched=True, batch_size=len(test_data))
train_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
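# quick shape check (my addition): every example should be padded/truncated to 1024 tokens
print(train_data[0]['input_ids'].shape)  # expect torch.Size([1024])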
# define accuracy metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # macro average is fine here as the Yelp classes are balanced
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
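# standalone smoke test of compute_metrics with dummy 5-class predictions (my addition)
from transformers import EvalPrediction
dummy = EvalPrediction(predictions=np.eye(5), label_ids=np.arange(5))
print(compute_metrics(dummy))  # perfect dummy predictions, so all metrics should be 1.0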
# define the training arguments
training_args = TrainingArguments(
    output_dir='/FileStore/shared_uploads/results',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="no",
    save_strategy="no",
    disable_tqdm=False,
    load_best_model_at_end=True,
    warmup_steps=200,
    weight_decay=0.01,
    logging_steps=4,
    fp16=True,
    logging_dir='/FileStore/shared_uploads/logs',
    dataloader_num_workers=0,
    run_name='longformer-classification-yelp'
)
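# note: the effective training batch size is per_device_train_batch_size * gradient_accumulation_steps = 4 * 8 = 32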
# instantiate the trainer class and check for available devices
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device
os.environ["WANDB_DISABLED"] = "true"
# train the model
trainer.train()
When I run trainer.train() I get CUDA error: device-side assert triggered, which I assume happens because I am no longer doing binary classification. I suspect this is the cause because when I keep only two classes in the train and test sets (the commented-out lines in the code above) training runs without issue. So where does one declare that this model is multi-class rather than binary?
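Since device-side asserts are raised asynchronously, the traceback is not very informative. One thing I can do to narrow it down (my own debugging step, not part of the original approach) is to force synchronous CUDA execution:

import os
# must be set before the first CUDA call, e.g. at the very top of the notebook
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"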
I know that in LongformerForMultiLabelSequenceClassification.from_pretrained() one can set num_labels, but that class is used for multi-label text classification rather than multi-class text classification.
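For completeness, this is the kind of change I have in mind; whether num_labels is also the right switch for multi-class (as opposed to multi-label) classification is exactly what I am unsure about:

# hypothetical fix: does num_labels=5 give the classification head 5 outputs?
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096',
                                                            num_labels=5,
                                                            gradient_checkpointing=False,
                                                            attention_window=512)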