@geo47
for char embedding, we need to construct char_ids from the dataset. did you modify dataset.py for it?
https://github.com/dsindex/ntagger/blob/master/dataset.py
Hi,
Thanks for your reply. I didn't modify dataset.py; I just followed the same char embedding code as in GloveLSTMCRF and ElmoLSTMCRF.
Here is the code I used. One thing you will notice is that I also applied multi-head attention on top of the Bi-LSTM. Could you also verify that piece of code? It seems to be working fine. :-)
class BertLSTMCRF(BaseModel):
def __init__(self, config, bert_config, bert_model, bert_tokenizer, label_path, pos_path, use_crf=False, use_pos=False, use_char_cnn=False, disable_lstm=False, use_mha=False, feature_based=False):
super().__init__(config=config)
self.config = config
self.device = config['opt'].device
self.seq_size = config['n_ctx']
pos_emb_dim = config['pos_emb_dim']
lstm_hidden_dim = config['lstm_hidden_dim']
lstm_num_layers = config['lstm_num_layers']
lstm_dropout = config['lstm_dropout']
self.use_crf = use_crf
self.use_pos = use_pos
self.use_char_cnn = use_char_cnn # use cnn embeddings
self.disable_lstm = disable_lstm
self.use_mha = use_mha # use Multi-head attention
# bert embedding layer
self.bert_config = bert_config
self.bert_model = bert_model
self.bert_tokenizer = bert_tokenizer
self.bert_feature_based = feature_based
self.bert_hidden_size = bert_config.hidden_size
self.bert_num_layers = bert_config.num_hidden_layers
# DSA layer for bert_feature_based
dsa_num_attentions = config['dsa_num_attentions']
dsa_input_dim = self.bert_hidden_size
dsa_dim = config['dsa_dim']
dsa_r = config['dsa_r']
self.dsa = DSA(config, dsa_num_attentions, dsa_input_dim, dsa_dim, dsa_r=dsa_r)
self.layernorm_dsa = nn.LayerNorm(self.dsa.last_dim)
bert_emb_dim = self.bert_hidden_size
if self.bert_feature_based:
'''
# 1) last layer, 2) mean pooling
bert_emb_dim = self.bert_hidden_size
'''
# 3) DSA pooling
bert_emb_dim = self.dsa.last_dim
# pos embedding layer
self.poss = super().load_dict(pos_path)
self.pos_vocab_size = len(self.poss)
padding_idx = config['pad_pos_id']
self.embed_pos = super().create_embedding_layer(self.pos_vocab_size, pos_emb_dim, weights_matrix=None, non_trainable=False, padding_idx=padding_idx)
# char embedding layer
if self.use_char_cnn:
self.charcnn = CharCNN(config)
# BiLSTM layer
if self.use_pos and self.use_char_cnn:
emb_dim = bert_emb_dim + pos_emb_dim + self.charcnn.last_dim
elif self.use_pos:
emb_dim = bert_emb_dim + pos_emb_dim
elif self.use_char_cnn:
emb_dim = bert_emb_dim + self.charcnn.last_dim
else:
emb_dim = bert_emb_dim
if not self.disable_lstm:
self.lstm = nn.LSTM(input_size=emb_dim,
hidden_size=lstm_hidden_dim,
num_layers=lstm_num_layers,
dropout=lstm_dropout,
bidirectional=True,
batch_first=True)
self.dropout = nn.Dropout(config['dropout'])
# Multihead attention:
if self.use_mha:
self.mha = nn.MultiheadAttention(2 * lstm_hidden_dim, num_heads=8)
# projection layer
self.labels = super().load_dict(label_path)
self.label_size = len(self.labels)
if not self.disable_lstm:
# bi concat fully connection neural network
self.linear = nn.Linear(lstm_hidden_dim*2, self.label_size)
else:
self.linear = nn.Linear(emb_dim, self.label_size)
# CRF layer
if self.use_crf:
self.crf = CRF(num_tags=self.label_size, batch_first=True)
def _compute_bert_embedding(self, x, head_mask=None):
params = {
'input_ids': x[0],
'attention_mask': x[1],
'output_hidden_states': True,
'output_attentions': True,
'return_dict': True
}
if self.bert_model.config.model_type not in ['bart', 'distilbert']:
params['token_type_ids'] = None if self.bert_model.config.model_type in ['roberta'] else x[2] # RoBERTa doesn't use segment_ids
if head_mask is not None:
params['head_mask'] = head_mask
if self.bert_feature_based:
# feature-based
with torch.no_grad():
bert_outputs = self.bert_model(**params)
if self.bert_model.config.model_type in ['bart']:
all_hidden_states = bert_outputs.decoder_hidden_states
else:
all_hidden_states = bert_outputs.hidden_states
'''
# 1) last layer
embedded = bert_outputs.last_hidden_state
# embedded : [batch_size, seq_size, bert_hidden_size]
'''
'''
# 2) mean pooling
stack = torch.stack(all_hidden_states, dim=-1)
embedded = torch.mean(stack, dim=-1)
# ([batch_size, seq_size, bert_hidden_size], ..., [batch_size, seq_size, bert_hidden_size])
# -> stack(-1) -> [batch_size, seq_size, bert_hidden_size, *], ex) * == 25 for bert large
# -> max/mean(-1) -> [batch_size, seq_size, bert_hidden_size]
'''
# 3) DSA pooling
stack = torch.stack(all_hidden_states, dim=-2)
# stack : [batch_size, seq_size, *, bert_hidden_size]
stack = stack.view(-1, self.bert_num_layers + 1, self.bert_hidden_size)
# stack : [*, bert_num_layers, bert_hidden_size]
dsa_mask = torch.ones(stack.shape[0], stack.shape[1]).to(self.device)
# dsa_mask : [*, bert_num_layers]
dsa_out = self.dsa(stack, dsa_mask)
# dsa_out : [*, self.dsa.last_dim]
dsa_out = self.layernorm_dsa(dsa_out)
embedded = dsa_out.view(-1, self.seq_size, self.dsa.last_dim)
# embedded : [batch_size, seq_size, self.dsa.last_dim]
else:
# fine-tuning
# x[0], x[1], x[2] : [batch_size, seq_size]
bert_outputs = self.bert_model(**params)
embedded = bert_outputs.last_hidden_state
# embedded : [batch_size, seq_size, bert_hidden_size]
return embedded, bert_outputs
def forward(self, x, head_mask=None, freeze_bert=False):
# x[0,1,2] : [batch_size, seq_size]
mask = x[1].to(torch.uint8).to(self.device)
# mask == attention_mask : [batch_size, seq_size]
lengths = torch.sum(mask.to(torch.long), dim=1)
# lengths : [batch_size]
# 1. Embedding
if freeze_bert:
# freeze_bert is the runtime option which has the same effect as the static option `feature_based`.
with torch.no_grad():
bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
else:
bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
# bert_embed_out : [batch_size, seq_size, *]
pos_ids = x[3]
pos_embed_out = self.embed_pos(pos_ids)
# pos_embed_out : [batch_size, seq_size, pos_emb_dim]
if self.use_pos and self.use_char_cnn:
char_ids = x[2]
# char_ids : [batch_size, seq_size, char_n_ctx]
charcnn_out = self.charcnn(char_ids)
# charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
embed_out = torch.cat([bert_embed_out, pos_embed_out, charcnn_out], dim=-1)
# embed_out : [batch_size, seq_size, emb_dim]
elif self.use_pos:
embed_out = torch.cat([bert_embed_out, pos_embed_out], dim=-1)
elif self.use_char_cnn:
char_ids = x[2]
# char_ids : [batch_size, seq_size, char_n_ctx]
charcnn_out = self.charcnn(char_ids)
# charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
embed_out = torch.cat([bert_embed_out, charcnn_out], dim=-1)
# embed_out : [batch_size, seq_size, emb_dim]
else:
embed_out = bert_embed_out
# embed_out : [batch_size, seq_size, emb_dim]
embed_out = self.dropout(embed_out)
############################################################################
# # 1. Embedding
# token_embed_out = self.embed_token(token_ids)
# # token_embed_out : [batch_size, seq_size, token_emb_dim]
# pos_embed_out = self.embed_pos(pos_ids)
# # pos_embed_out : [batch_size, seq_size, pos_emb_dim]
# if self.use_char_cnn:
# char_ids = x[2]
# # char_ids : [batch_size, seq_size, char_n_ctx]
# charcnn_out = self.charcnn(char_ids)
# # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
# embed_out = torch.cat([token_embed_out, pos_embed_out, charcnn_out], dim=-1)
# # embed_out : [batch_size, seq_size, emb_dim]
# else:
# embed_out = torch.cat([token_embed_out, pos_embed_out], dim=-1)
# # embed_out : [batch_size, seq_size, emb_dim]
# embed_out = self.dropout(embed_out)
############################################################################
# 2. LSTM
if not self.disable_lstm:
# FIXME : pytorch 1.7.0 bug https://github.com/pytorch/pytorch/issues/43227 , lengths.cpu()
packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence(embed_out, lengths.cpu(), batch_first=True, enforce_sorted=False)
lstm_out, (final_hidden_state, final_cell_state) = self.lstm(packed_embed_out)
lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True, total_length=self.seq_size)
# lstm_out : [batch_size, seq_size, lstm_hidden_dim*2]
lstm_out = self.dropout(lstm_out)
else:
lstm_out = embed_out
# lstm_out : [batch_size, seq_size, emb_dim]
# 3. Multi-head attention
if self.use_mha:
attn_output, attn_output_weights = self.mha(lstm_out, lstm_out, lstm_out)
logits = self.linear(attn_output)
else:
logits = self.linear(lstm_out)
# 4. Output
# logits = self.linear(attn_output)
# logits : [batch_size, seq_size, label_size]
if not self.use_crf: return logits
prediction = self.crf.decode(logits)
prediction = torch.as_tensor(prediction, dtype=torch.long)
# prediction : [batch_size, seq_size]
return logits, prediction
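For reference, a minimal standalone sketch of how the CRF layer above is typically used, assuming it is pytorch-crf's torchcrf.CRF (which matches the num_tags/batch_first signature); all sizes below are made up:

```python
import torch
from torchcrf import CRF  # assumption: the CRF above is pytorch-crf's torchcrf.CRF

num_tags, batch_size, seq_size = 5, 2, 4
crf = CRF(num_tags=num_tags, batch_first=True)

emissions = torch.randn(batch_size, seq_size, num_tags)        # logits from the linear layer
tags = torch.randint(0, num_tags, (batch_size, seq_size))      # gold label ids
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.uint8)

loss = -crf(emissions, tags, mask=mask, reduction='mean')      # negative log-likelihood for training
best_paths = crf.decode(emissions, mask=mask)                  # list of best tag-id sequences per example
print(loss.item(), best_paths)
```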
@geo47
i have just added '--use_mha' option.
https://github.com/dsindex/ntagger/blob/master/model.py
if self.use_mha:
# reference : https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
# https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3
query = lstm_out.permute(1, 0, 2)
# query : [seq_size, batch_size, self.lstm_dim]
key = query
value = query
key_padding_mask = mask.ne(1) # attention_mask => mask = [[1, 1, ..., 0, ...]] => [[False, False, ..., True, ...]]
attn_output, attn_output_weights = self.mha(query, key, value, key_padding_mask=key_padding_mask)
# attn_output : [seq_size, batch_size, self.mha_dim]
mha_out = attn_output.permute(1, 0, 2)
# mha_out : [batch_size, seq_size, self.mha_dim]
# residual, layernorm, dropout
mha_out = self.layernorm_mha(mha_out + lstm_out)
mha_out = self.dropout(mha_out)
else:
mha_out = lstm_out
# mha_out : [batch_size, seq_size, self.mha_dim]
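As a quick standalone sanity check of that masked attention call (toy sizes, not ntagger code):

```python
import torch
import torch.nn as nn

batch_size, seq_size, lstm_dim, num_heads = 2, 6, 8, 2
mha = nn.MultiheadAttention(lstm_dim, num_heads=num_heads)

lstm_out = torch.randn(batch_size, seq_size, lstm_dim)
mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 0, 0, 0]], dtype=torch.uint8)   # attention_mask style

query = lstm_out.permute(1, 0, 2)        # [seq_size, batch_size, dim] (default layout, no batch_first)
key_padding_mask = mask.ne(1)            # True marks padded positions to be ignored
attn_output, attn_weights = mha(query, query, query, key_padding_mask=key_padding_mask)
print(attn_output.shape)                 # torch.Size([6, 2, 8])
```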
however... since the BERT and LSTM layers already utilize contextual information like multi-head attention, i think the effect of the mha layer might be small.
what about using multiple layers of TransformerEncoder instead of the LSTM layer? i mean not for BertLSTMCRF but for GloveLSTMCRF, i.e., the LSTM can be replaced by multiple layers of TransformerEncoder.
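For illustration, a rough sketch (not ntagger code; all dimensions are hypothetical) of what such a TransformerEncoder replacement for the BiLSTM could look like:

```python
import torch
import torch.nn as nn

# hypothetical sizes; the real emb_dim comes from the config
emb_dim, nhead, num_layers = 128, 4, 2
encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=nhead, dim_feedforward=4 * emb_dim)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

embed_out = torch.randn(2, 10, emb_dim)                      # [batch_size, seq_size, emb_dim]
mask = torch.tensor([[1] * 7 + [0] * 3, [1] * 5 + [0] * 5])  # attention_mask
src = embed_out.permute(1, 0, 2)                             # [seq_size, batch_size, emb_dim]
src_key_padding_mask = mask.ne(1)                            # True marks padded positions
enc_out = encoder(src, src_key_padding_mask=src_key_padding_mask).permute(1, 0, 2)
# enc_out : [batch_size, seq_size, emb_dim], fed to the linear/CRF layers in place of lstm_out
```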
@dsindex
You are right, the MHA results do not seem effective here, as BERT already utilizes multi-head attention to learn contextual information. However, I am trying to use a feature-based pre-trained BERT model to get contextual embeddings by freezing all layers except the last one, and to train a BiLSTM-MHA-CRF on top, as described in this paper.
Also, the results shown above look different from the ones I got. For BERT-base (cased) with BiLSTM, I got F1 = 0.939 with only 3 epochs of training.
Lastly, as suggested, I will now try to implement a new model architecture based on BertTransformerEncoder (note: I use BERT only to get embeddings, not to fine-tune), and let's see if it improves the results.
-- Like in etagger, I want to use CharCNN for the BertLSTMCRF model in ntagger.
Thanks
Hi @dsindex
Here are some of my observations regarding the model.
| Model | F1 (%) | Features | Notes |
| ----- | ------ | -------- | ----- |
| BERT-BiLSTM-MHA-CRF | 91.27 | word, pos | bert_feature-based, DSA |
| BERT-BiLSTM-CRF | 91.32 | word, pos | bert_feature-based, (No DSA) |
| BERT-BiLSTM-CRF | 91.34 | word, pos | bert_feature-based, DSA |
| BERT-BiLSTM-MHA-CRF | 91.62 | word, pos | bert_feature-based, (No DSA) |
From the observations above, it seems that Dynamic Self-Attention (DSA) over the BERT-based embeddings negatively affects the BERT-BiLSTM-MHA-CRF model. Without DSA, the model performs significantly better. The BERT-BiLSTM-CRF model, however, does not change significantly with or without DSA.
The attached code is the BertLSTMCRF class I used, with DSA disabled, for testing the model.
class BertLSTMCRF(BaseModel):
def __init__(self, config, bert_config, bert_model, bert_tokenizer, label_path, pos_path, use_crf=False, use_pos=False, use_mha=False, disable_lstm=False, feature_based=False):
super().__init__(config=config)
self.config = config
self.device = config['opt'].device
self.seq_size = config['n_ctx']
pos_emb_dim = config['pos_emb_dim']
lstm_hidden_dim = config['lstm_hidden_dim']
lstm_num_layers = config['lstm_num_layers']
lstm_dropout = config['lstm_dropout']
self.use_crf = use_crf
self.use_pos = use_pos
self.use_mha = use_mha
mha_num_attentions = config['mha_num_attentions']
self.disable_lstm = disable_lstm
# bert embedding layer
self.bert_config = bert_config
self.bert_model = bert_model
self.bert_tokenizer = bert_tokenizer
self.bert_feature_based = feature_based
self.bert_hidden_size = bert_config.hidden_size
self.bert_num_layers = bert_config.num_hidden_layers
'''
# DSA layer for bert_feature_based
dsa_num_attentions = config['dsa_num_attentions']
dsa_input_dim = self.bert_hidden_size
dsa_dim = config['dsa_dim']
dsa_r = config['dsa_r']
self.dsa = DSA(config, dsa_num_attentions, dsa_input_dim, dsa_dim, dsa_r=dsa_r)
self.layernorm_dsa = nn.LayerNorm(self.dsa.last_dim)
'''
bert_emb_dim = self.bert_hidden_size
if self.bert_feature_based:
# 1) last layer, 2) mean pooling
bert_emb_dim = self.bert_hidden_size
'''
# 3) DSA pooling
bert_emb_dim = self.dsa.last_dim
'''
# pos embedding layer
self.poss = super().load_dict(pos_path)
self.pos_vocab_size = len(self.poss)
padding_idx = config['pad_pos_id']
self.embed_pos = super().create_embedding_layer(self.pos_vocab_size, pos_emb_dim, weights_matrix=None, non_trainable=False, padding_idx=padding_idx)
if self.use_pos:
emb_dim = bert_emb_dim + pos_emb_dim
else:
emb_dim = bert_emb_dim
# BiLSTM layer
self.lstm_dim = emb_dim
if not self.disable_lstm:
self.lstm = nn.LSTM(input_size=emb_dim,
hidden_size=lstm_hidden_dim,
num_layers=lstm_num_layers,
dropout=lstm_dropout,
bidirectional=True,
batch_first=True)
self.lstm_dim = lstm_hidden_dim*2
self.dropout = nn.Dropout(config['dropout'])
# Multi-Head Attention layer
self.mha_dim = self.lstm_dim
if self.use_mha:
self.mha = nn.MultiheadAttention(self.lstm_dim, num_heads=mha_num_attentions)
self.layernorm_mha = nn.LayerNorm(self.mha_dim)
# projection layer
self.labels = super().load_dict(label_path)
self.label_size = len(self.labels)
self.linear = nn.Linear(self.mha_dim, self.label_size)
# CRF layer
if self.use_crf:
self.crf = CRF(num_tags=self.label_size, batch_first=True)
def _compute_bert_embedding(self, x, head_mask=None):
params = {
'input_ids': x[0],
'attention_mask': x[1],
'output_hidden_states': True,
'output_attentions': True,
'return_dict': True
}
if self.bert_model.config.model_type not in ['bart', 'distilbert']:
params['token_type_ids'] = None if self.bert_model.config.model_type in ['roberta'] else x[2] # RoBERTa doesn't use segment_ids
if head_mask is not None:
params['head_mask'] = head_mask
if self.bert_feature_based:
# feature-based
with torch.no_grad():
bert_outputs = self.bert_model(**params)
if self.bert_model.config.model_type in ['bart']:
all_hidden_states = bert_outputs.decoder_hidden_states
else:
all_hidden_states = bert_outputs.hidden_states
# 1) last layer
embedded = bert_outputs.last_hidden_state
# embedded : [batch_size, seq_size, bert_hidden_size]
'''
# 2) mean pooling
stack = torch.stack(all_hidden_states, dim=-1)
embedded = torch.mean(stack, dim=-1)
# ([batch_size, seq_size, bert_hidden_size], ..., [batch_size, seq_size, bert_hidden_size])
# -> stack(-1) -> [batch_size, seq_size, bert_hidden_size, *], ex) * == 25 for bert large
# -> max/mean(-1) -> [batch_size, seq_size, bert_hidden_size]
'''
'''
# 3) DSA pooling
stack = torch.stack(all_hidden_states, dim=-2)
# stack : [batch_size, seq_size, *, bert_hidden_size]
stack = stack.view(-1, self.bert_num_layers + 1, self.bert_hidden_size)
# stack : [*, bert_num_layers, bert_hidden_size]
dsa_mask = torch.ones(stack.shape[0], stack.shape[1]).to(self.device)
# dsa_mask : [*, bert_num_layers]
dsa_out = self.dsa(stack, dsa_mask)
# dsa_out : [*, self.dsa.last_dim]
dsa_out = self.layernorm_dsa(dsa_out)
embedded = dsa_out.view(-1, self.seq_size, self.dsa.last_dim) # hidden state vectors
# embedded : [batch_size, seq_size, self.dsa.last_dim]
'''
else:
# fine-tuning
# x[0], x[1], x[2] : [batch_size, seq_size]
bert_outputs = self.bert_model(**params)
embedded = bert_outputs.last_hidden_state
# embedded : [batch_size, seq_size, bert_hidden_size]
return embedded, bert_outputs
def forward(self, x, head_mask=None, freeze_bert=False):
# x[0,1,2] : [batch_size, seq_size]
mask = x[1].to(torch.uint8).to(self.device)
# mask == attention_mask : [batch_size, seq_size]
lengths = torch.sum(mask.to(torch.long), dim=1)
# lengths : [batch_size]
# 1. Embedding
if freeze_bert:
# freeze_bert is the runtime option which has the same effect as the static option `feature_based`.
with torch.no_grad():
bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
else:
bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
# bert_embed_out : [batch_size, seq_size, *]
pos_ids = x[3]
pos_embed_out = self.embed_pos(pos_ids)
# pos_embed_out : [batch_size, seq_size, pos_emb_dim]
if self.use_pos:
embed_out = torch.cat([bert_embed_out, pos_embed_out], dim=-1)
else:
embed_out = bert_embed_out
# embed_out : [batch_size, seq_size, emb_dim]
embed_out = self.dropout(embed_out)
# 2. LSTM
if not self.disable_lstm:
# FIXME : pytorch 1.7.0 bug https://github.com/pytorch/pytorch/issues/43227 , lengths.cpu()
packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence(embed_out, lengths.cpu(), batch_first=True, enforce_sorted=False)
lstm_out, (h_n, c_n) = self.lstm(packed_embed_out)
lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True, total_length=self.seq_size)
# lstm_out : [batch_size, seq_size, self.lstm_dim == lstm_hidden_dim*2]
lstm_out = self.dropout(lstm_out)
else:
lstm_out = embed_out
# lstm_out : [batch_size, seq_size, self.lstm_dim == emb_dim]
# 3. MHA
if self.use_mha:
# reference : https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
# https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3
query = lstm_out.permute(1, 0, 2)
# query : [seq_size, batch_size, self.lstm_dim]
key = query
value = query
key_padding_mask = mask.ne(1) # attention_mask => mask = [[1, 1, ..., 0, ...]] => [[False, False, ..., True, ...]]
attn_output, attn_output_weights = self.mha(query, key, value, key_padding_mask=key_padding_mask)
# attn_output : [seq_size, batch_size, self.mha_dim]
mha_out = attn_output.permute(1, 0, 2)
# mha_out : [batch_size, seq_size, self.mha_dim]
# residual, layernorm, dropout
mha_out = self.layernorm_mha(mha_out + lstm_out)
mha_out = self.dropout(mha_out)
else:
mha_out = lstm_out
# mha_out : [batch_size, seq_size, self.mha_dim]
# 4. Output
logits = self.linear(mha_out)
# logits : [batch_size, seq_size, label_size]
if not self.use_crf: return logits
prediction = self.crf.decode(logits)
prediction = torch.as_tensor(prediction, dtype=torch.long)
# prediction : [batch_size, seq_size]
return logits, prediction
Please let me know your thoughts. Thanks.
@geo47
the feature-based approach with DSA seems not that effective, based on your results and previous experiments.
i did experiments based on the paper below, which indicates the effectiveness of a weighted sum of the last four hidden layers (since DSA can be regarded as a weighted sum).
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding https://arxiv.org/pdf/1810.04805.pdf
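For illustration only (not ntagger code), a minimal sketch of how such a weighted sum over the last four hidden layers could be pooled, given the hidden_states tuple returned by a Hugging Face model called with output_hidden_states=True:

```python
import torch
import torch.nn as nn

class LastFourWeightedSum(nn.Module):
    """Learnable weighted sum over the last four BERT hidden layers."""
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.zeros(4))  # softmax of zeros -> uniform weights at init

    def forward(self, all_hidden_states):
        # all_hidden_states: tuple of [batch, seq, hidden] tensors (embedding layer + every encoder layer)
        stack = torch.stack(all_hidden_states[-4:], dim=0)        # [4, batch, seq, hidden]
        w = torch.softmax(self.weights, dim=0).view(4, 1, 1, 1)   # normalized layer weights
        return (w * stack).sum(dim=0)                             # [batch, seq, hidden]
```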
@dsindex
It was really informative.
So I should not use DSA if I want better results, since I am using a feature-based BERT model. In that case, MHA without DSA seems more effective.
since we got enough results (IMO), closing this issue :)
@dsindex
Hi..! Sorry, the issue was about concatenating char embeddings with BERT embeddings; I was busy with another project. Now I am back and looking into it. It would be great if we could add this feature, or if you have a quick solution, please let me know. Thanks :-)
@geo47
i just added the --use_char_cnn option for BERT embeddings.
https://github.com/dsindex/ntagger/commit/7e0207c53ce28d3254208de4bd635f0a3aa18721
character ids for each subword token were built like this: https://github.com/dsindex/ntagger/blob/master/util_bert.py
# char extension
c_ids = batch_to_ids([word_tokens])[0].detach().cpu().numpy().tolist()
"""
convention in BERT:
for single sequence:
word : the dog is hairy
word_idx : 0 1 2 3 | params
----------------------------------------------------------------------- | -------------- |
tokens: [CLS] the dog is ha ##iry . [SEP] <pad> <pad> <pad> ... | |
token_idx: 0 1 2 3 4 5 6 7 8 9 10 ... | |
input_ids: x x x x x x x x 0 0 0 ... | input_ids |
segment_ids: 0 0 0 0 0 0 0 0 0 0 0 ... | token_type_ids |
input_mask: 1 1 1 1 1 1 1 1 0 0 0 ... | attention_mask |
label_ids: 0 1 1 1 1 0 1 0 0 0 0 ... | |
----------------------------------------------------------------------- | |
pos_ids: 0 10 2 ...
char_ids: [0,..., 0] [259, ..., 261] ...
-----------------------------------------------------------------------
idx 0 1 2 3
word2token_idx: 1 2 3 4 0 0 0 ...
word2token_idx[idx] = token_idx
-----------------------------------------------------------------------
"""
however, i am not sure this is a reasonable approach;
thank you :)
experiment results
how to
using sub token label, --bert_use_sub_label + --bert_use_pos --use_char_cnn
$ python preprocess.py --config=configs/config-bert.json --data_dir=data/conll2003 --bert_model_name_or_path=./embeddings/bert-base-cased --bert_use_sub_label
$ python train.py --config=configs/config-bert.json --data_dir=data/conll2003 --save_path=pytorch-model-bert.pt --bert_model_name_or_path=./embeddings/bert-base-cased --bert_output_dir=bert-checkpoint --batch_size=32 --lr=1e-5 --epoch=10 --bert_freezing_epoch=3 --bert_lr_during_freezing=1e-3 --use_crf --bert_use_pos --use_char_cnn
$ python evaluate.py --config=configs/config-bert.json --data_dir=data/conll2003 --model_path=pytorch-model-bert.pt --bert_output_dir=bert-checkpoint --use_crf --bert_use_pos --use_char_cnn
$ cd data/conll2003; perl ../../etc/conlleval.pl < test.txt.pred ; cd ../..
INFO:__main__:[F1] : 0.9113209212035649, 3684
INFO:__main__:[Elapsed Time] : 3684 examples, 148480.65185546875ms, 40.282283112823464ms on average
accuracy: 98.27%; precision: 91.23%; recall: 91.52%; FB1: 91.37
** using sub token label, --bert_use_sub_label + --bert_use_pos --use_char_cnn --epoch=30
INFO:__main__:[F1] : 0.9142604856512141, 3684
INFO:__main__:[Elapsed Time] : 3684 examples, 146215.95406532288ms, 39.665775003881194ms on average
accuracy: 98.32%; precision: 91.57%; recall: 91.75%; FB1: 91.66
@dsindex
Thanks again,
This approach seems simple and works. However, adding contextual character embeddings from BERT might be more effective in terms of learning character context.
I am trying to apply CharacterBERT embeddings together with word embeddings in our current model. Would it be possible to concatenate CharacterBERT embeddings..?
@geo47
interesting~!
ntagger (BERT) uses subword-level embeddings, but CharacterBERT seems to be word-level.
CharacterBERT is a variant of BERT that produces word-level contextual representations by attending to the characters of each input token.
so, you may need to merge subword embeddings or to slice them.
i mean: i am going to change the --bert_use_crf_slice option to --bert_use_subword_pooling.
so, i released backup code (https://github.com/dsindex/ntagger/releases/tag/v1.0) before the modification.
@geo47
i think 'Char-BERT' embeddings can be concatenated via the --bert_use_subword_pooling option.
https://github.com/dsindex/ntagger/blob/master/model.py
if self.use_subword_pooling:
word2token_idx = x[5]
mask_word2token_idx = x[6].to(torch.uint8).unsqueeze(-1).to(self.device)
# mask_word2token_idx = torch.sign(torch.abs(word2token_idx)).to(torch.uint8).unsqueeze(-1).to(self.device)
# first subword pooling
# solution from https://stackoverflow.com/questions/55628014/indexing-a-3d-tensor-using-a-2d-tensor
src = embed_out
offset = torch.arange(0, src.size(0) * src.size(1), src.size(1)).to(self.device)
index = word2token_idx + offset.unsqueeze(1)
embed_out = src.reshape(-1, src.shape[-1])[index]
embed_out *= mask_word2token_idx
# update mask, lengths for word-level
mask = x[6].to(torch.uint8).to(self.device)
lengths = torch.sum(mask.to(torch.long), dim=1)
if self.use_word_embedding:
token_ids = x[7]
token_embed_out = self.embed_token(token_ids)
# token_embed_out : [batch_size, seq_size, token_emb_dim]
embed_out = torch.cat([embed_out, token_embed_out], dim=-1)
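As a standalone illustration of the first-subword pooling gather above (toy sizes, not ntagger code):

```python
import torch

# toy sizes: batch 2, 5 subword positions, hidden 4, up to 3 words per example
B, T, H, W = 2, 5, 4, 3
src = torch.arange(B * T * H, dtype=torch.float).view(B, T, H)     # subword-level embeddings
word2token_idx = torch.tensor([[1, 2, 3], [1, 3, 0]])              # first-subword token index per word (0 = pad)
mask_word2token_idx = torch.tensor([[1, 1, 1], [1, 1, 0]]).unsqueeze(-1).float()

offset = torch.arange(0, B * T, T)             # row offset of each example in the flattened [B*T, H] view
index = word2token_idx + offset.unsqueeze(1)   # absolute indices, shape [B, W]
word_embed = src.reshape(-1, H)[index]         # gather first-subword vectors -> [B, W, H]
word_embed = word_embed * mask_word2token_idx  # zero out padded word slots
print(word_embed.shape)                        # torch.Size([2, 3, 4])
```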
https://github.com/dsindex/ntagger/blob/master/util_bert.py
convention in BERT:
for single sequence:
word : the dog is hairy .
word_idx : 0 1 2 3 4 | params
----------------------------------------------------------------------- | -------------- |
tokens: [CLS] the dog is ha ##iry . [SEP] <pad> <pad> <pad> ... | |
token_idx: 0 1 2 3 4 5 6 7 8 9 10 ... | |
input_ids: x x x x x x x x 0 0 0 ... | input_ids |
segment_ids: 0 0 0 0 0 0 0 0 0 0 0 ... | token_type_ids |
input_mask: 1 1 1 1 1 1 1 1 0 0 0 ... | attention_mask |
label_ids: 0 1 1 1 1 0 1 0 0 0 0 ... | |
----------------------------------------------------------------------- | |
pos_ids: 0 10 2 ...
char_ids: [0,..., 0] [259, ..., 261] ...
-----------------------------------------------------------------------
-----------------------------------------------------------------------
with --bert_use_subword_pooling:
word2token_idx: 0 1 2 3 4 6 0 0 0 ...
word_mask: 1 1 1 1 1 1 0 0 0 ...
'label_ids, pos_ids, char_ids' are generated as word-level.
-----------------------------------------------------------------------
with --bert_use_subword_pooling --bert_use_word_embedding:
word_ids: 0 2 2928 16 23223 4 0 0 ...
-----------------------------------------------------------------------
with this option, we are able to convert subword-level embeddings (from BERT) to word-level embeddings, and the following layers (e.g., LSTM / MHA / CRF) should be applied at word-level.
additionally, if we use --use_char_cnn together with --bert_use_subword_pooling, char_ids are computed at word-level, not subword-level.
this is a different implementation from https://github.com/dsindex/ntagger/issues/2#issuecomment-793796631
if opt.bert_use_subword_pooling:
c_ids = batch_to_ids([word])[0].detach().cpu().numpy().tolist()
char_ids.extend(c_ids)
else:
c_ids = batch_to_ids([word_tokens])[0].detach().cpu().numpy().tolist()
char_ids.extend(c_ids)
Hello,
With the given parameters, I applied CharCNN and concatenated it with the BERT embeddings alongside the POS embeddings. However, the CharCNN concatenation gives a dimension error.
Could you help me figure out this problem, or am I doing something wrong here?
Thanks.