huggingface / peft

🤗 PEFT: State-of-the-art Parameter-Efficient Fine-Tuning.
https://huggingface.co/docs/peft
Apache License 2.0

Does PEFT support CodeBERT? #1865

Open MabelQi opened 3 weeks ago

MabelQi commented 3 weeks ago

System Info

linux

Who can help?

@pacman100 @younesbelkada @BenjaminBossan

When I used prefix tuning to fine-tune CodeBERT for sequence classification, it raised the errors shown in the attached screenshot.

I used RobertaForSequenceClassification to load CodeBERT. Did the error occur because PEFT does not support CodeBERT?

Information

Tasks

Reproduction

import argparse
import os
import torch
from sklearn.metrics import f1_score
import pandas as pd
from torch.optim import AdamW
from torch.utils.data import DataLoader

from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
)

parse = argparse.ArgumentParser('')
parse.add_argument('--model_name', type=str, default='codebert')
args = parse.parse_args()

from datasets import Dataset, DatasetDict, disable_caching
from transformers import RobertaForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from tqdm import tqdm

disable_caching()

set_seed(42)
cache_dir = '/home/liangpeng/project/Mabel/cache/datasets'

# Set hyperparameters
model_name_or_path = ''
batch_size = 1
gradient_accumulation_steps = 4

if args.model_name == 'codebert':
    model_name_or_path = '/home/liangpeng/project/Mabel/CLMs/graphcodebert-base'
    batch_size = 4
    model = RobertaForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, num_labels=2)
    hidden_size = model.config.hidden_size

peft_type = PeftType.P_TUNING
device = 'cuda'
num_epochs = 5
lr = 3e-2
max_input_length = 512
# max_output_length = 8

peft_config = PrefixTuningConfig(
    task_type='SEQ_CLS',
    num_virtual_tokens=1 # to be changed
)

if any(k in model_name_or_path for k in ('gpt', 'opt', 'bloom')):
    padding_side = 'left'
else:
    padding_side = 'right'

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side)

print(f'Eos token id: {tokenizer.eos_token_id}')
print(f'Padding token id: {tokenizer.pad_token_id}')
if getattr(tokenizer, 'pad_token_id') is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
print(f'Padding token id: {tokenizer.pad_token_id}')

train_df = pd.read_csv('train.csv')
eval_df = pd.read_csv('validation.csv')

trainset = Dataset.from_pandas(train_df)
evalset = Dataset.from_pandas(eval_df)

datasets = DatasetDict({
    'train': trainset,
    'validation': evalset
})

print(datasets['train'][0])

def tokenize_function(examples):
    outputs = tokenizer(examples['text'], truncation=True, max_length=max_input_length)
    return outputs

tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=['text', 'text_label'],
    num_proc=1,
    load_from_cache_file=False,
    desc='Running tokenizer on dataset',
)

tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')

def collate_fn(examples):
    return tokenizer.pad(examples, padding="longest", return_tensors="pt")

# Instantiate dataloaders
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    shuffle=True,
    collate_fn=collate_fn,
    batch_size=batch_size
)

eval_dataloader = DataLoader(
    tokenized_datasets['validation'],
    shuffle=False,
    collate_fn=collate_fn,
    batch_size=batch_size
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

optimizer = AdamW(model.parameters(), lr=lr)

# Instantiate sheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataloader) * num_epochs),
    num_training_steps=len(train_dataloader) * num_epochs
)

# Calculate the accuracy, precision, recall, and F1 (averaged over all classes) on the validation set
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def evaluate(predictions, references):
    predictions = predictions.cpu().numpy()
    references = references.cpu().numpy()

    acc = accuracy_score(references, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(references, predictions, average='micro', zero_division=1)
    return [acc, precision, recall, f1]

output_dir = '/home/liangpeng/project/Mabel/peft/output/dataset4' + args.model_name
total_steps = 0
best_f1 = 0
actual_step = 0
glb_step = 0
model.to(device)

for epoch in range(num_epochs):
    model.train()

    progress_bar = tqdm(total=len(train_dataloader), desc=f'Training epoch {epoch}', mininterval=1)

    for step, batch in enumerate(train_dataloader):
        total_steps += 1
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        if actual_step % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            glb_step += 1

        if step % 10 == 0:
            progress_bar.set_postfix({'loss': loss.item()})
            progress_bar.update(10)

    progress_bar.close()

    model.eval()
    all_predictions = []
    all_references = []

    progress_bar_2 = tqdm(total=len(eval_dataloader), desc=f'Evaluating epoch {epoch}', mininterval=1)
    for step, batch in enumerate(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions)
        all_references.append(batch['labels'])

        if step % 100 == 0:
            progress_bar_2.update(10)
    progress_bar_2.close()

    all_predictions = torch.cat(all_predictions)
    all_references = torch.cat(all_references)
    np_predictions = all_predictions.cpu().numpy()
    np_references = all_references.cpu().numpy()
    f1 = f1_score(np_references, np_predictions, average='micro')  # f1_score expects (y_true, y_pred)
    print(f'Epoch {epoch}: f1: {f1}')
    # Write the F1 score to a file
    f1_results_dir = os.path.join(output_dir, 'results', 'f1_results.txt')
    os.makedirs(os.path.dirname(f1_results_dir), exist_ok=True)
    with open(f1_results_dir, 'a') as f:
        f.write(f'Epoch {epoch}: f1: {f1}\n')

    if f1 > best_f1:
        best_f1 = f1
        model.save_pretrained(os.path.join(output_dir, 'best_model'))

    model.save_pretrained(os.path.join(output_dir, f'epoch_{epoch}'))

Expected behavior

Fine-tune the CodeBERT model.

BenjaminBossan commented 3 weeks ago

I took a look at your example, but since you're using custom models and datasets, I cannot reproduce it. Instead, I tried to simplify the example as much as I could, resulting in the code below. It uses the CodeBERT model from Microsoft and some dummy data:

import torch
from torch.optim import AdamW

from peft import (
    get_peft_model,
    PeftType,
    PrefixTuningConfig,
)
from transformers import RobertaForSequenceClassification

model_name_or_path = "microsoft/codebert-base" # BB
model = RobertaForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, num_labels=2) # BB
batch_size = 1
gradient_accumulation_steps = 4

num_epochs = 5
lr = 3e-2
max_input_length = 512

peft_config = PrefixTuningConfig(
    task_type='SEQ_CLS',
    num_virtual_tokens=1 # to be changed
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

optimizer = AdamW(model.parameters(), lr=lr)
model.train()
for epoch in range(num_epochs):
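    # Dummy batch: one-token inputs, an all-ones attention mask, and float
    # labels of shape (10, 2), which transformers treats as multi-label targets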
    outputs = model(torch.arange(10).view(-1, 1), torch.ones(10).view(-1, 1), labels=torch.zeros(20).view(10, 2))
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    print(epoch, loss.item())

print("finished successfully")

When I run it locally, it passes. Therefore, it's unclear to me what goes wrong with your script. If possible, I would advise running this code on the CPU with a debugger attached and checking what exactly is going wrong, as it looks like you're indexing outside of the tensor dimensions. If you do that, you should also be able to provide the full stack trace of the error, which would also help with the investigation.
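
A minimal sketch of that CPU debugging setup could look like the following; it assumes the public microsoft/codebert-base checkpoint and a single dummy batch in place of your dataset:

import traceback

import torch
from peft import PrefixTuningConfig, get_peft_model
from transformers import AutoTokenizer, RobertaForSequenceClassification

# Public CodeBERT checkpoint, kept on the CPU so errors surface with a clean stack trace
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

peft_config = PrefixTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=1)
model = get_peft_model(model, peft_config)

# Single dummy batch and one forward pass; print the full traceback on failure
batch = tokenizer(["def add(a, b): return a + b"], return_tensors="pt")
batch["labels"] = torch.tensor([0])
try:
    outputs = model(**batch)
    print("forward pass ok, loss =", outputs.loss.item())
except Exception:
    traceback.print_exc()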

MabelQi commented 3 weeks ago

@BenjaminBossan Thank you for your quick reply! ☺️ I've fixed this issue, but now a new issue appears: I am trying to use PEFT methods to fine-tune CodeBERT for multi-label classification, which means I set num_labels=5. Does the PEFT library support fine-tuning models for multi-label classification tasks? I doubt it because all the examples set num_labels=2.

BenjaminBossan commented 3 weeks ago

Yes, it should totally work for 5 labels. Did you try it and encounter an error?
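
For reference, a minimal sketch with 5 labels, assuming the public microsoft/codebert-base checkpoint and dummy integer class labels, might look like this:

import torch
from peft import PrefixTuningConfig, get_peft_model
from transformers import RobertaForSequenceClassification

# Same setup as the earlier example, but with 5 output classes
model = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", return_dict=True, num_labels=5
)
peft_config = PrefixTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Dummy forward pass: integer class labels in [0, 5) give a cross-entropy loss over 5 classes
outputs = model(
    torch.arange(10).view(-1, 1),      # one-token dummy inputs
    torch.ones(10).view(-1, 1),        # attention mask
    labels=torch.randint(0, 5, (10,)),
)
print(outputs.logits.shape)  # torch.Size([10, 5])
print(outputs.loss.item())

Note that integer labels are treated by transformers as a 5-class, single-label-per-sample problem. If you really mean multi-label (several labels active at once), you would instead pass problem_type="multi_label_classification" to from_pretrained and use float multi-hot label vectors.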

MabelQi commented 3 weeks ago

@BenjaminBossan Yes, I tried it, and it did not report any errors. However, compared to extracting part of my dataset and turning it into a binary classification task, the results are extremely poor when I set num_labels=5 (multi-label classification).

BenjaminBossan commented 3 weeks ago

Well, predicting 5 classes is harder than 2, so it's hard to tell if there is something wrong or if the output is expected. Could you please show me the repr of your model (i.e. what is shown when calling print(model)) so that I can double-check if it looks correct?
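
For reference, the repr in question is simply what gets printed after wrapping the model; a sketch assuming the public CodeBERT checkpoint:

from peft import PrefixTuningConfig, get_peft_model
from transformers import RobertaForSequenceClassification

model = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", num_labels=5
)
model = get_peft_model(
    model, PrefixTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=1)
)
# Prints the PeftModelForSequenceClassification module tree, including the
# prefix encoder and the 5-way classification head
print(model)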