Closed halimx2 closed 1 year ago
train.py에서 밑처럼 바꾸고
# Swap the stock AutoModel classification head for the custom entity-aware classifier.
# NOTE(review): this constructor call takes only `config=`; unlike the commented-out line it
# does not go through from_pretrained(), so presumably no checkpoint weights are loaded here.
# model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
model = BertForSequenceClassificationWithEntity(config=model_config)
load_data.py에서
# Build, for every sentence, a fixed-length 0/1 mask that flags the word pieces which also
# occur in the tokenized entity string, and attach the masks to the tokenizer output.
#
# NOTE(review): positions index the raw `tokenizer.tokenize(sentence)` output. If
# `input_ids` were produced with special tokens or with an entity prefix, this mask is
# shifted relative to `input_ids` - confirm against how `tokenized_sentences` is built.
entity_ids = []
for idx, sentence in enumerate(dataset['sentence']):
    tokens = tokenizer.tokenize(sentence)
    # Same in-place update as before: the raw entity string is replaced by its word pieces.
    entities[idx] = tokenizer.tokenize(entities[idx])
    # Set lookup keeps the per-token membership test O(1).
    entity_tokens = set(entities[idx])
    # 241 mask slots per sentence; word pieces past the last slot are dropped instead of
    # raising IndexError on long sentences.
    entity_id = [0] * 241
    for pos, token in enumerate(tokens[:241]):
        entity_id[pos] = int(token in entity_tokens)
    entity_ids.append(torch.tensor(entity_id))
tokenized_sentences['entity_ids'] = entity_ids
로 entity를 올려줬는데, 점수가 정말 어이없을 정도로 낮게 나왔네요. 확인해보니 pretrained된 모델을 안 써서 그런 것 같은데, 혹시 klue/bert-base를 약간 변형해서 쓰는 방법을 아시는 분 있을까요?
밑에 train.py와 model.py 첨부합니다
import pickle as pickle
import os
import pandas as pd
import torch
import numpy as np
import time
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW
# import wandb
from load_data import *
from metrics import *
from modeling_bert import *
from CustomScheduler import CosineAnnealingWarmUpRestarts
def train():
    """Fine-tune the entity-aware BERT classifier and save it under ``./best_model``.

    Steps: load the tokenizer for ``MODEL_NAME``, tokenize the training CSV with
    ``tokenized_dataset_with_entity``, wrap it in ``RE_Dataset``, load
    ``BertForSequenceClassificationWithEntity`` from the pretrained checkpoint with
    30 labels, train with the Hugging Face ``Trainer`` using an explicit AdamW
    optimizer plus ``CosineAnnealingWarmUpRestarts`` scheduler, and finally write the
    model with ``save_pretrained``.

    Returns:
        None. Checkpoints go to ``./results``, logs to ``./logs`` and the final model
        to a timestamped directory below ``./best_model``.
    """
    MODEL_NAME = "klue/bert-base"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load the dataset and tokenize it (including the per-token entity ids).
    tokenized_train, train_label = tokenized_dataset_with_entity("../dataset/train/train.csv", tokenizer)

    # Wrap the tokenized data in a PyTorch dataset.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)

    # Model hyperparameters.
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 30

    # Load the pretrained checkpoint instead of instantiating from the config alone:
    # a config-only constructor leaves every weight randomly initialised, which is what
    # made the score collapse. Parameters that do not exist in the checkpoint
    # (presumably the entity embedding and the classifier head) are still freshly
    # initialised by from_pretrained().
    model = BertForSequenceClassificationWithEntity.from_pretrained(MODEL_NAME, config=model_config)
    print(model.config)
    model.to(device)

    # Optimizer and scheduler. lr starts at 0 because the scheduler drives the actual
    # learning rate (eta_max is its peak value).
    optimizers = AdamW(model.parameters(), lr=0)
    scheduler = CosineAnnealingWarmUpRestarts(optimizers, T_0=1000, T_mult=2, eta_max=3e-5, T_up=500, gamma=0.5)

    # NOTE(review): an explicit (optimizer, scheduler) pair is handed to the Trainer
    # below, so learning_rate / warmup_steps / weight_decay here presumably only take
    # effect if that pair is removed - confirm.
    training_args = TrainingArguments(
        output_dir='./results',           # output directory
        save_total_limit=5,               # maximum number of checkpoints kept
        save_steps=500,                   # checkpoint interval in steps
        num_train_epochs=5,               # total number of training epochs
        learning_rate=5e-5,               # learning rate
        per_device_train_batch_size=16,   # batch size per device during training
        per_device_eval_batch_size=16,    # batch size per device during evaluation
        warmup_steps=500,                 # number of warmup steps for the LR scheduler
        weight_decay=0.01,                # strength of weight decay
        logging_dir='./logs',             # directory for storing logs
        logging_steps=100,                # logging interval in steps
        evaluation_strategy='steps',      # `no` | `steps` (every eval_steps) | `epoch`
        eval_steps=500,                   # evaluation interval in steps
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,                      # the instantiated Transformers model to be trained
        args=training_args,               # training arguments, defined above
        train_dataset=RE_train_dataset,   # training dataset
        # TODO: evaluation currently runs on the training set; switch to a held-out
        # dev split (e.g. ../dataset/train/dev.csv) once it is tokenized.
        eval_dataset=RE_train_dataset,
        compute_metrics=compute_metrics,  # metric function
        optimizers=(optimizers, scheduler),
    )

    trainer.train()

    # Save the model under <model_save_path>/<model_name>.
    # NOTE: MODEL_NAME contains "/", so the joined path is a nested directory.
    model_save_path = './best_model'
    model_name = 'model_{}_{}'.format(MODEL_NAME, int(time.time()))
    model_path = os.path.join(model_save_path, model_name)

    # Create the parent directory if needed, without an exists()/makedirs() race.
    os.makedirs(model_save_path, exist_ok=True)

    model.save_pretrained(model_path)
def main_train():
    """Run the training routine; thin alias around ``train`` that takes no arguments."""
    train()
# Script entry point: start training only when this file is executed directly.
if __name__ == "__main__":
    train()
import torch
from torch import nn
from transformers.models.bert.modeling_bert import (
BertModel,
BertEncoder,
BertEmbeddings,
BertPooler,
BertPreTrainedModel
)
from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions, SequenceClassifierOutput
from transformers.file_utils import ModelOutput
from collections import OrderedDict, UserDict
import math
import os
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
import torch.utils.checkpoint
from packaging import version
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
class BertEmbeddingWithEntity(BertEmbeddings):
    """BERT input embeddings plus an additional two-row entity embedding table.

    The output is the sum of word, token-type, (absolute) position and, when
    ``entity_ids`` is given, entity embeddings, followed by LayerNorm and dropout.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        # Two rows, indexed by the values passed in `entity_ids`.
        self.entity_embeddings = nn.Embedding(2, config.hidden_size)

    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        position_ids=None,
        entity_ids=None,
        inputs_embeds=None,
        past_key_values_length=0,
    ):
        """Return the combined, normalised and dropped-out input embeddings.

        Either ``input_ids`` or ``inputs_embeds`` must be provided. ``position_ids``
        and ``token_type_ids`` fall back to the buffers registered on the module
        (or to zeros for token types when no buffer exists). ``entity_ids`` is
        optional; when omitted no entity embedding is added.
        """
        if input_ids is None:
            shape = inputs_embeds.size()[:-1]
        else:
            shape = input_ids.size()
        n_tokens = shape[1]

        if position_ids is None:
            first = past_key_values_length
            position_ids = self.position_ids[:, first : first + n_tokens]

        # When token_type_ids is not passed, reuse the all-zero buffer registered in the
        # constructor if it exists; this helps tracing the model without token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                token_type_ids = self.token_type_ids[:, :n_tokens].expand(shape[0], n_tokens)
            else:
                token_type_ids = torch.zeros(shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        summed = inputs_embeds + self.token_type_embeddings(token_type_ids)
        if self.position_embedding_type == "absolute":
            summed = summed + self.position_embeddings(position_ids)

        # Entity contribution is added before normalisation, like the other terms.
        if entity_ids is not None:
            summed = summed + self.entity_embeddings(entity_ids)

        return self.dropout(self.LayerNorm(summed))
class BertModelWithEntity(BertPreTrainedModel):
    """BERT encoder stack whose embedding layer is ``BertEmbeddingWithEntity``.

    Apart from forwarding ``entity_ids`` to the embedding layer, the forward pass
    builds the attention masks, runs ``BertEncoder`` and optionally ``BertPooler``.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        # Entity-aware embeddings replace the stock BertEmbeddings.
        self.embeddings = BertEmbeddingWithEntity(config)
        self.encoder = BertEncoder(config)
        if add_pooling_layer:
            self.pooler = BertPooler(config)
        else:
            self.pooler = None

        self.init_weights()

    def get_input_embeddings(self):
        """Return the word-embedding module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word-embedding module with ``value``."""
        self.embeddings.word_embeddings = value

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        entity_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Encode the inputs and return the sequence output and pooled output.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given; otherwise a
        ``ValueError`` is raised. Returns a tuple when ``return_dict`` is falsy and a
        ``BaseModelOutputWithPoolingAndCrossAttentions`` otherwise.
        """
        cfg = self.config

        # Fall back to the config for every output switch that was left unset.
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # Caching is only honoured in decoder mode.
        if not cfg.is_decoder:
            use_cache = False
        elif use_cache is None:
            use_cache = cfg.use_cache

        have_ids = input_ids is not None
        have_embeds = inputs_embeds is not None
        if have_ids and have_embeds:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if not have_ids and not have_embeds:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if have_ids:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]
        batch_size, seq_length = input_shape
        device = input_ids.device if have_ids else inputs_embeds.device

        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]

        # Default: attend to every (past + current) position.
        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                token_type_ids = self.embeddings.token_type_ids[:, :seq_length].expand(batch_size, seq_length)
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # Cross-attention mask is only built in decoder mode with encoder states present.
        encoder_extended_attention_mask = None
        if cfg.is_decoder and encoder_hidden_states is not None:
            enc_batch, enc_len, _ = encoder_hidden_states.size()
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones((enc_batch, enc_len), device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)

        head_mask = self.get_head_mask(head_mask, cfg.num_hidden_layers)

        embedded = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            entity_ids=entity_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoded = self.encoder(
            embedded,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoded[0]
        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output)

        if not return_dict:
            return (sequence_output, pooled_output) + encoded[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoded.past_key_values,
            hidden_states=encoded.hidden_states,
            attentions=encoded.attentions,
            cross_attentions=encoded.cross_attentions,
        )
class BertForSequenceClassificationWithEntity(BertPreTrainedModel):
    """Sequence classifier on top of ``BertModelWithEntity``.

    The pooled encoder output goes through dropout and a linear layer that produces
    ``config.num_labels`` logits; a loss is computed when ``labels`` is supplied.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        # Entity-aware encoder in place of the stock BertModel.
        self.bert = BertModelWithEntity(config)

        drop_prob = config.classifier_dropout
        if drop_prob is None:
            drop_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(drop_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def _classification_loss(self, logits, labels):
        """Pick the loss matching ``config.problem_type`` (inferring it once if unset)."""
        cfg = self.config
        if cfg.problem_type is None:
            if self.num_labels == 1:
                cfg.problem_type = "regression"
            elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                cfg.problem_type = "single_label_classification"
            else:
                cfg.problem_type = "multi_label_classification"

        kind = cfg.problem_type
        if kind == "regression":
            criterion = MSELoss()
            if self.num_labels == 1:
                return criterion(logits.squeeze(), labels.squeeze())
            return criterion(logits, labels)
        if kind == "single_label_classification":
            return CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
        if kind == "multi_label_classification":
            return BCEWithLogitsLoss()(logits, labels)
        # Unrecognised problem_type: no loss is produced.
        return None

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        entity_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            entity_ids=entity_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 is the pooled output of the encoder.
        logits = self.classifier(self.dropout(outputs[1]))

        loss = None
        if labels is not None:
            loss = self._classification_loss(logits, labels)

        if not return_dict:
            tail = (logits,) + outputs[2:]
            if loss is None:
                return tail
            return (loss,) + tail

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
model = BertForSequenceClassificationWithEntity(config=model_config)
에서
model = BertForSequenceClassificationWithEntity.from_pretrained("klue/bert-base", config=model_config)
이렇게 from_pretrained()를 붙이니까 점수가 나오기는 했는데 점수가 잘 나오지는 않았네요.
코드 봐보고 다시 리뷰 남기겠습니다~
pretrained된 모델을 쓰려면 from_pretrained()를 호출해서 모델을 생성하는 게 맞습니다만, 위의 코드는 그 깃허브를 그대로 참고해서 작성하신 거죠?
마지막까지 확인한다고 pr을 안 넣고 왔는데 tokenizer 부분에 해당 input 추가하고 수정할 요소를 찾다가 없어서 그대로 작성했습니다.
지금 밖이라 오후에 집에 들어가면 바로 PR 올려놓겠습니다.
현재 상황과 문제점
지금은 entity의 위치를 알려주지 않는다. 그래서 성능을 올리기 위해서는 entity가 정확히 어디에 있는지 가르쳐주는 것이 중요해서 entity 유무에 따른 embedding layer를 추가해주려고 한다.
개선 제안 사항
train.csv에 보면 entity위치의 index가 나와있다. 그래서 이것을 토대로 작성해보려고 한다.
성능 개선 기대점
entity를 좀 더 잘 파악해서 성능이 올랐으면 좋겠다.