dsindex / ntagger

reference pytorch code for named entity tagging

Concatenate CharCNN to BertLSTMCRF #2

Closed · geo47 closed this issue 3 years ago

geo47 commented 3 years ago

Hello,

With the given parameters, I applied CharCNN and concatenated it with the BERT embeddings alongside POS embeddings. However, the CharCNN concatenation gives a dimension error.

File "ntagger/model.py", line 220, in forward
mask = char_ids.view(-1, self.char_n_ctx).ne(self.char_padding_idx) # broadcasting
RuntimeError: shape '[-1, 50]' is invalid for input of size 2880

Could you look into this problem, or am I doing something wrong here?

Thanks.

dsindex commented 3 years ago

@geo47

for char embeddings, we need to construct char_ids from the dataset. did you modify dataset.py for that? https://github.com/dsindex/ntagger/blob/master/dataset.py
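
since x[2] in the BERT path is token_type_ids with shape [batch_size, seq_size], the reported size 2880 is most likely batch_size * seq_size, i.e. the model never sees real char ids. the model expects char_ids shaped [batch_size, seq_size, char_n_ctx] with char_n_ctx == 50. a rough sketch of the per-example padding dataset.py would have to do (illustrative only, not the actual repo code; the padding id 0 is an assumption):

    # char_ids for one example : [seq_size, char_n_ctx], stacked over the batch
    # to [batch_size, seq_size, char_n_ctx]; model.py then calls
    # char_ids.view(-1, char_n_ctx) -> [batch_size * seq_size, char_n_ctx]
    def pad_char_ids(c_ids, seq_size, char_n_ctx=50, pad_id=0):
        # c_ids : list of per-token char-id rows, each of length char_n_ctx
        c_ids = c_ids[:seq_size]                                     # truncate long sequences
        c_ids += [[pad_id] * char_n_ctx] * (seq_size - len(c_ids))   # pad short sequences
        return c_ids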

geo47 commented 3 years ago

Hi,

Thanks for your reply. I didn't modify dataset.py; I just followed the char embedding code, the same as in GloveLSTMCRF and ElmoLSTMCRF.

Here is the code that I used. One thing you will notice in the code is that I also applied multi-head attention on top of the Bi-LSTM. Could you also verify that piece of code? It seems to be working fine. :-)

class BertLSTMCRF(BaseModel):
    def __init__(self, config, bert_config, bert_model, bert_tokenizer, label_path, pos_path, use_crf=False, use_pos=False, use_char_cnn=False, disable_lstm=False, use_mha=False, feature_based=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        lstm_hidden_dim = config['lstm_hidden_dim']
        lstm_num_layers = config['lstm_num_layers']
        lstm_dropout = config['lstm_dropout']
        self.use_crf = use_crf
        self.use_pos = use_pos
        self.use_char_cnn = use_char_cnn # use cnn embeddings
        self.disable_lstm = disable_lstm
        self.use_mha = use_mha # use Multi-head attention

        # bert embedding layer
        self.bert_config = bert_config
        self.bert_model = bert_model
        self.bert_tokenizer = bert_tokenizer
        self.bert_feature_based = feature_based
        self.bert_hidden_size = bert_config.hidden_size
        self.bert_num_layers = bert_config.num_hidden_layers

        # DSA layer for bert_feature_based
        dsa_num_attentions = config['dsa_num_attentions']
        dsa_input_dim = self.bert_hidden_size
        dsa_dim = config['dsa_dim']
        dsa_r = config['dsa_r']
        self.dsa = DSA(config, dsa_num_attentions, dsa_input_dim, dsa_dim, dsa_r=dsa_r)
        self.layernorm_dsa = nn.LayerNorm(self.dsa.last_dim)

        bert_emb_dim = self.bert_hidden_size
        if self.bert_feature_based:
            '''
            # 1) last layer, 2) mean pooling
            bert_emb_dim = self.bert_hidden_size
            '''
            # 3) DSA pooling
            bert_emb_dim = self.dsa.last_dim

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(self.pos_vocab_size, pos_emb_dim, weights_matrix=None, non_trainable=False, padding_idx=padding_idx)

        # char embedding layer
        if self.use_char_cnn:
            self.charcnn = CharCNN(config)

        # BiLSTM layer
        if self.use_pos and self.use_char_cnn:
            emb_dim = bert_emb_dim + pos_emb_dim + self.charcnn.last_dim
        elif self.use_pos:
            emb_dim = bert_emb_dim + pos_emb_dim
        elif self.use_char_cnn:
            emb_dim = bert_emb_dim + self.charcnn.last_dim
        else:
            emb_dim = bert_emb_dim

        if not self.disable_lstm:
            self.lstm = nn.LSTM(input_size=emb_dim,
                                hidden_size=lstm_hidden_dim,
                                num_layers=lstm_num_layers,
                                dropout=lstm_dropout,
                                bidirectional=True,
                                batch_first=True)

        self.dropout = nn.Dropout(config['dropout'])

        # Multihead attention:
        if self.use_mha:
            self.mha = nn.MultiheadAttention(2 * lstm_hidden_dim, num_heads=8)

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        if not self.disable_lstm:
            # bi concat fully connection neural network
            self.linear = nn.Linear(lstm_hidden_dim*2, self.label_size)
        else:
            self.linear = nn.Linear(emb_dim, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)

    def _compute_bert_embedding(self, x, head_mask=None):
        params = {
            'input_ids': x[0],
            'attention_mask': x[1],
            'output_hidden_states': True,
            'output_attentions': True,
            'return_dict': True
        }
        if self.bert_model.config.model_type not in ['bart', 'distilbert']:
            params['token_type_ids'] = None if self.bert_model.config.model_type in ['roberta'] else x[2] # RoBERTa doesn't use segment_ids
        if head_mask is not None:
            params['head_mask'] = head_mask
        if self.bert_feature_based:
            # feature-based
            with torch.no_grad():
                bert_outputs = self.bert_model(**params)
                if self.bert_model.config.model_type in ['bart']:
                    all_hidden_states = bert_outputs.decoder_hidden_states
                else:
                    all_hidden_states = bert_outputs.hidden_states
                '''
                # 1) last layer
                embedded = bert_outputs.last_hidden_state
                # embedded : [batch_size, seq_size, bert_hidden_size]
                '''
                '''
                # 2) mean pooling
                stack = torch.stack(all_hidden_states, dim=-1)
                embedded = torch.mean(stack, dim=-1)
                # ([batch_size, seq_size, bert_hidden_size], ..., [batch_size, seq_size, bert_hidden_size])
                # -> stack(-1) -> [batch_size, seq_size, bert_hidden_size, *], ex) * == 25 for bert large
                # -> max/mean(-1) ->  [batch_size, seq_size, bert_hidden_size]
                '''
                # 3) DSA pooling
                stack = torch.stack(all_hidden_states, dim=-2)
                # stack : [batch_size, seq_size, *, bert_hidden_size]
                stack = stack.view(-1, self.bert_num_layers + 1, self.bert_hidden_size)
                # stack : [*, bert_num_layers, bert_hidden_size]
                dsa_mask = torch.ones(stack.shape[0], stack.shape[1]).to(self.device)
                # dsa_mask : [*, bert_num_layers]
                dsa_out = self.dsa(stack, dsa_mask)
                # dsa_out : [*, self.dsa.last_dim]
                dsa_out = self.layernorm_dsa(dsa_out)
                embedded = dsa_out.view(-1, self.seq_size, self.dsa.last_dim)
                # embedded : [batch_size, seq_size, self.dsa.last_dim]
        else:
            # fine-tuning
            # x[0], x[1], x[2] : [batch_size, seq_size]
            bert_outputs = self.bert_model(**params)
            embedded = bert_outputs.last_hidden_state
            # embedded : [batch_size, seq_size, bert_hidden_size]
        return embedded, bert_outputs

    def forward(self, x, head_mask=None, freeze_bert=False):
        # x[0,1,2] : [batch_size, seq_size]

        mask = x[1].to(torch.uint8).to(self.device)
        # mask == attention_mask : [batch_size, seq_size]
        lengths = torch.sum(mask.to(torch.long), dim=1)
        # lengths : [batch_size]

        # 1. Embedding
        if freeze_bert:
            # freeze_bert is the runtime option which has the same effect as the static option `feature_based`.
            with torch.no_grad():
                bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
        else:
            bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
        # bert_embed_out : [batch_size, seq_size, *]
        pos_ids = x[3]
        pos_embed_out = self.embed_pos(pos_ids)
        # pos_embed_out : [batch_size, seq_size, pos_emb_dim]
        if self.use_pos and self.use_char_cnn:
            char_ids = x[2]
            # char_ids : [batch_size, seq_size, char_n_ctx]
            charcnn_out = self.charcnn(char_ids)
            # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
            embed_out = torch.cat([bert_embed_out, pos_embed_out, charcnn_out], dim=-1)
            # embed_out : [batch_size, seq_size, emb_dim]
        elif self.use_pos:
            embed_out = torch.cat([bert_embed_out, pos_embed_out], dim=-1)
        elif self.use_char_cnn:
            char_ids = x[2]
            # char_ids : [batch_size, seq_size, char_n_ctx]
            charcnn_out = self.charcnn(char_ids)
            # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
            embed_out = torch.cat([bert_embed_out, charcnn_out], dim=-1)
            # embed_out : [batch_size, seq_size, emb_dim]
        else:
            embed_out = bert_embed_out
        # embed_out : [batch_size, seq_size, emb_dim]
        embed_out = self.dropout(embed_out)

        ############################################################################
        # # 1. Embedding
        # token_embed_out = self.embed_token(token_ids)
        # # token_embed_out : [batch_size, seq_size, token_emb_dim]
        # pos_embed_out = self.embed_pos(pos_ids)
        # # pos_embed_out   : [batch_size, seq_size, pos_emb_dim]
        # if self.use_char_cnn:
        #     char_ids = x[2]
        #     # char_ids : [batch_size, seq_size, char_n_ctx]
        #     charcnn_out = self.charcnn(char_ids)
        #     # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
        #     embed_out = torch.cat([token_embed_out, pos_embed_out, charcnn_out], dim=-1)
        #     # embed_out : [batch_size, seq_size, emb_dim]
        # else:
        #     embed_out = torch.cat([token_embed_out, pos_embed_out], dim=-1)
        #     # embed_out : [batch_size, seq_size, emb_dim]
        # embed_out = self.dropout(embed_out)
        ############################################################################

        # 2. LSTM
        if not self.disable_lstm:
            # FIXME : pytorch 1.7.0 bug https://github.com/pytorch/pytorch/issues/43227 , lengths.cpu()
            packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence(embed_out, lengths.cpu(), batch_first=True, enforce_sorted=False)
            lstm_out, (final_hidden_state, final_cell_state) = self.lstm(packed_embed_out)
            lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True, total_length=self.seq_size)
            # lstm_out : [batch_size, seq_size, lstm_hidden_dim*2]

            lstm_out = self.dropout(lstm_out)
        else:
            lstm_out = embed_out
            # lstm_out : [batch_size, seq_size, emb_dim]

        # 3. Multi-head attention
        if self.use_mha:
            attn_output, attn_output_weights = self.mha(lstm_out, lstm_out, lstm_out)
            logits = self.linear(attn_output)
        else:
            logits = self.linear(lstm_out)

        # 4. Output
        # logits = self.linear(attn_output)
        # logits : [batch_size, seq_size, label_size]
        if not self.use_crf: return logits
        prediction = self.crf.decode(logits)
        prediction = torch.as_tensor(prediction, dtype=torch.long)
        # prediction : [batch_size, seq_size]
        return logits, prediction
dsindex commented 3 years ago

@geo47

i have just added the '--use_mha' option.

https://github.com/dsindex/ntagger/blob/master/model.py

        if self.use_mha:
            # reference : https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
            #             https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3
            query = lstm_out.permute(1, 0, 2)
            # query : [seq_size, batch_size, self.lstm_dim]
            key = query
            value = query
            key_padding_mask = mask.ne(1) # attention_mask => mask = [[1, 1, ..., 0, ...]] => [[False, False, ..., True, ...]]
            attn_output, attn_output_weights = self.mha(query, key, value, key_padding_mask=key_padding_mask)
            # attn_output : [seq_size, batch_size, self.mha_dim]
            mha_out = attn_output.permute(1, 0, 2)
            # mha_out : [batch_size, seq_size, self.mha_dim]
            # residual, layernorm, dropout
            mha_out = self.layernorm_mha(mha_out + lstm_out)
            mha_out = self.dropout(mha_out)
        else:
            mha_out = lstm_out
            # mha_out : [batch_size, seq_size, self.mha_dim]

however...

since the BERT and LSTM layers already capture contextual information, much like multi-head attention does, i think the effect of the mha layer might be small.

[screenshots: experiment results, 2021-02-18 and 2021-02-19]

what about using multiple layers of TransformerEncoder instead of the LSTM layer?

i mean not for BertLSTMCRF but for GloveLSTMCRF, i.e., the LSTM can be replaced by multiple TransformerEncoder layers.
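
for illustration, a minimal sketch of that replacement (the layer sizes here are made up, not from this repo; d_model must be divisible by nhead):

    # illustrative sketch: a small TransformerEncoder in place of the BiLSTM
    import torch.nn as nn

    # in __init__ (emb_dim is the concatenated embedding size)
    encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=8,
                                               dim_feedforward=4 * emb_dim, dropout=0.1)
    encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

    # in forward(): nn.TransformerEncoder expects [seq_size, batch_size, emb_dim]
    # and a src_key_padding_mask of shape [batch_size, seq_size] with True at padding positions
    enc_out = encoder(embed_out.permute(1, 0, 2), src_key_padding_mask=mask.ne(1))
    enc_out = enc_out.permute(1, 0, 2)   # back to [batch_size, seq_size, emb_dim]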

geo47 commented 3 years ago

@dsindex

You are right, the MHA results do not seem effective here, as BERT already uses multi-head attention to learn contextual information. However, I am trying to use a feature-based, pre-trained BERT model to get contextual embeddings by freezing all layers except the last one, and then train a BiLSTM-MHA-CRF on top, as described in this paper.
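
A minimal sketch of that freezing scheme, assuming a Hugging Face BertModel where bert-base has encoder layers 0-11 (not necessarily the exact code used here):

    # freeze every BERT parameter except those in the last encoder layer,
    # so BERT acts as a (mostly fixed) feature extractor under the BiLSTM-MHA-CRF head
    last_layer = bert_model.config.num_hidden_layers - 1           # 11 for bert-base
    for name, param in bert_model.named_parameters():
        param.requires_grad = name.startswith('encoder.layer.%d.' % last_layer)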

Also, the results shown above look different from the ones I got. For BERT-base (cased) with BiLSTM, I got F1 = 0.939 with only 3 epochs of training.

Lastly, as suggested, I will now try to implement a new model architecture based on a Bert-TransformerEncoder (note: I use BERT only to get embeddings, not to fine-tune it), and we'll see if it improves the results.

-- Like in etagger, I want to use CharCNN for the BertLSTMCRF model in ntagger.

Thanks

geo47 commented 3 years ago

Hi @dsindex

Here are some of my observations regarding the model.

Model               ||  F1   || Features  || BERT embedding
BERT-BiLSTM-MHA-CRF || 91.27 || word, pos || bert_feature-based, DSA
BERT-BiLSTM-CRF     || 91.32 || word, pos || bert_feature-based, no DSA
BERT-BiLSTM-CRF     || 91.34 || word, pos || bert_feature-based, DSA
BERT-BiLSTM-MHA-CRF || 91.62 || word, pos || bert_feature-based, no DSA

From the observations above, it seems that Dynamic Self-Attention (DSA) over the BERT-based embeddings hurts the BERT-BiLSTM-MHA-CRF model; without DSA, that model performs significantly better. The BERT-BiLSTM-CRF model, however, shows no significant change with or without DSA.

The attached code reflects the BertLSTMCRF class I used, with DSA disabled for testing the model.

class BertLSTMCRF(BaseModel):
    def __init__(self, config, bert_config, bert_model, bert_tokenizer, label_path, pos_path, use_crf=False, use_pos=False, use_mha=False, disable_lstm=False, feature_based=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        lstm_hidden_dim = config['lstm_hidden_dim']
        lstm_num_layers = config['lstm_num_layers']
        lstm_dropout = config['lstm_dropout']
        self.use_crf = use_crf
        self.use_pos = use_pos
        self.use_mha = use_mha
        mha_num_attentions = config['mha_num_attentions']
        self.disable_lstm = disable_lstm

        # bert embedding layer
        self.bert_config = bert_config
        self.bert_model = bert_model
        self.bert_tokenizer = bert_tokenizer
        self.bert_feature_based = feature_based
        self.bert_hidden_size = bert_config.hidden_size
        self.bert_num_layers = bert_config.num_hidden_layers

        '''
        # DSA layer for bert_feature_based
        dsa_num_attentions = config['dsa_num_attentions']
        dsa_input_dim = self.bert_hidden_size
        dsa_dim = config['dsa_dim']
        dsa_r = config['dsa_r']
        self.dsa = DSA(config, dsa_num_attentions, dsa_input_dim, dsa_dim, dsa_r=dsa_r)
        self.layernorm_dsa = nn.LayerNorm(self.dsa.last_dim)
        '''

        bert_emb_dim = self.bert_hidden_size
        if self.bert_feature_based:

            # 1) last layer, 2) mean pooling
            bert_emb_dim = self.bert_hidden_size
            '''
            # 3) DSA pooling
            bert_emb_dim = self.dsa.last_dim
            '''

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(self.pos_vocab_size, pos_emb_dim, weights_matrix=None, non_trainable=False, padding_idx=padding_idx)
        if self.use_pos:
            emb_dim = bert_emb_dim + pos_emb_dim
        else:
            emb_dim = bert_emb_dim

        # BiLSTM layer
        self.lstm_dim = emb_dim
        if not self.disable_lstm:
            self.lstm = nn.LSTM(input_size=emb_dim,
                                hidden_size=lstm_hidden_dim,
                                num_layers=lstm_num_layers,
                                dropout=lstm_dropout,
                                bidirectional=True,
                                batch_first=True)
            self.lstm_dim = lstm_hidden_dim*2

        self.dropout = nn.Dropout(config['dropout'])

        # Multi-Head Attention layer
        self.mha_dim = self.lstm_dim 
        if self.use_mha:
            self.mha = nn.MultiheadAttention(self.lstm_dim, num_heads=mha_num_attentions)
            self.layernorm_mha = nn.LayerNorm(self.mha_dim)

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        self.linear = nn.Linear(self.mha_dim, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)

    def _compute_bert_embedding(self, x, head_mask=None):
        params = {
            'input_ids': x[0],
            'attention_mask': x[1],
            'output_hidden_states': True,
            'output_attentions': True,
            'return_dict': True
        }
        if self.bert_model.config.model_type not in ['bart', 'distilbert']:
            params['token_type_ids'] = None if self.bert_model.config.model_type in ['roberta'] else x[2] # RoBERTa doesn't use segment_ids
        if head_mask is not None:
            params['head_mask'] = head_mask
        if self.bert_feature_based:
            # feature-based
            with torch.no_grad():
                bert_outputs = self.bert_model(**params)
                if self.bert_model.config.model_type in ['bart']:
                    all_hidden_states = bert_outputs.decoder_hidden_states
                else:
                    all_hidden_states = bert_outputs.hidden_states

                # 1) last layer
                embedded = bert_outputs.last_hidden_state
                # embedded : [batch_size, seq_size, bert_hidden_size]

                '''
                # 2) mean pooling
                stack = torch.stack(all_hidden_states, dim=-1)
                embedded = torch.mean(stack, dim=-1)
                # ([batch_size, seq_size, bert_hidden_size], ..., [batch_size, seq_size, bert_hidden_size])
                # -> stack(-1) -> [batch_size, seq_size, bert_hidden_size, *], ex) * == 25 for bert large
                # -> max/mean(-1) ->  [batch_size, seq_size, bert_hidden_size]
                '''
                '''
                # 3) DSA pooling
                stack = torch.stack(all_hidden_states, dim=-2)
                # stack : [batch_size, seq_size, *, bert_hidden_size]
                stack = stack.view(-1, self.bert_num_layers + 1, self.bert_hidden_size)
                # stack : [*, bert_num_layers, bert_hidden_size]
                dsa_mask = torch.ones(stack.shape[0], stack.shape[1]).to(self.device)
                # dsa_mask : [*, bert_num_layers]
                dsa_out = self.dsa(stack, dsa_mask)
                # dsa_out : [*, self.dsa.last_dim]
                dsa_out = self.layernorm_dsa(dsa_out)
                embedded = dsa_out.view(-1, self.seq_size, self.dsa.last_dim) # hidden state vectors
                # embedded : [batch_size, seq_size, self.dsa.last_dim]
                '''
        else:
            # fine-tuning
            # x[0], x[1], x[2] : [batch_size, seq_size]
            bert_outputs = self.bert_model(**params)
            embedded = bert_outputs.last_hidden_state
            # embedded : [batch_size, seq_size, bert_hidden_size]
        return embedded, bert_outputs

    def forward(self, x, head_mask=None, freeze_bert=False):
        # x[0,1,2] : [batch_size, seq_size]

        mask = x[1].to(torch.uint8).to(self.device)
        # mask == attention_mask : [batch_size, seq_size]
        lengths = torch.sum(mask.to(torch.long), dim=1)
        # lengths : [batch_size]

        # 1. Embedding
        if freeze_bert:
            # freeze_bert is the runtime option which has the same effect as the static option `feature_based`.
            with torch.no_grad():
                bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
        else:
            bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
        # bert_embed_out : [batch_size, seq_size, *]
        pos_ids = x[3]
        pos_embed_out = self.embed_pos(pos_ids)
        # pos_embed_out : [batch_size, seq_size, pos_emb_dim]
        if self.use_pos:
            embed_out = torch.cat([bert_embed_out, pos_embed_out], dim=-1)
        else:
            embed_out = bert_embed_out
        # embed_out : [batch_size, seq_size, emb_dim]
        embed_out = self.dropout(embed_out)

        # 2. LSTM
        if not self.disable_lstm:
            # FIXME : pytorch 1.7.0 bug https://github.com/pytorch/pytorch/issues/43227 , lengths.cpu()
            packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence(embed_out, lengths.cpu(), batch_first=True, enforce_sorted=False)
            lstm_out, (h_n, c_n) = self.lstm(packed_embed_out)
            lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True, total_length=self.seq_size)
            # lstm_out : [batch_size, seq_size, self.lstm_dim == lstm_hidden_dim*2]
            lstm_out = self.dropout(lstm_out)
        else:
            lstm_out = embed_out
            # lstm_out : [batch_size, seq_size, self.lstm_dim == emb_dim]

        # 3. MHA
        if self.use_mha:
            # reference : https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
            #             https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3
            query = lstm_out.permute(1, 0, 2)
            # query : [seq_size, batch_size, self.lstm_dim]
            key = query
            value = query
            key_padding_mask = mask.ne(1) # attention_mask => mask = [[1, 1, ..., 0, ...]] => [[False, False, ..., True, ...]]
            attn_output, attn_output_weights = self.mha(query, key, value, key_padding_mask=key_padding_mask)
            # attn_output : [seq_size, batch_size, self.mha_dim]
            mha_out = attn_output.permute(1, 0, 2)
            # mha_out : [batch_size, seq_size, self.mha_dim]
            # residual, layernorm, dropout
            mha_out = self.layernorm_mha(mha_out + lstm_out)
            mha_out = self.dropout(mha_out)
        else:
            mha_out = lstm_out
            # mha_out : [batch_size, seq_size, self.mha_dim]

        # 4. Output
        logits = self.linear(mha_out)
        # logits : [batch_size, seq_size, label_size]
        if not self.use_crf: return logits
        prediction = self.crf.decode(logits)
        prediction = torch.as_tensor(prediction, dtype=torch.long)
        # prediction : [batch_size, seq_size]
        return logits, prediction

Please let me know your thoughts. Thanks.

dsindex commented 3 years ago

@geo47

the feature-based approach with DSA seems not that effective, based on your results and previous experiments.

[screenshot: 2021-02-24, previous experiment results]

i did experiments based on the paper below, which indicates the effectiveness of a weighted sum of the last four hidden layers. (since DSA can be regarded as a weighted sum.)

BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding https://arxiv.org/pdf/1810.04805.pdf
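
for reference, a rough sketch of such a weighted sum over the last four hidden states (illustrative only; in a real model the weights would be registered in __init__):

    # all_hidden_states : tuple of [batch_size, seq_size, hidden_size] tensors
    # (embedding output + every encoder layer), from output_hidden_states=True
    import torch
    import torch.nn as nn

    layer_weights = nn.Parameter(torch.ones(4))                # one learnable scalar per layer

    last_four = torch.stack(all_hidden_states[-4:], dim=0)     # [4, batch, seq, hidden]
    norm_w = torch.softmax(layer_weights, dim=0).view(4, 1, 1, 1)
    embedded = (norm_w * last_four).sum(dim=0)                 # [batch, seq, hidden]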

[screenshot: 2021-02-24, feature-based results table from the BERT paper]
geo47 commented 3 years ago

@dsindex

It was really informative.

So I should not use DSA if I want improved results, since I am using a feature-based BERT model. In that case, MHA without DSA seems more effective.

dsindex commented 3 years ago

since we got enough results (IMO), closing this issue :)

geo47 commented 3 years ago

@dsindex

Hi..! Sorry, but this issue was about concatenating char embeddings with BERT embeddings; I was busy with another project. Now I am back and looking into it. It would be great if we could add this feature, or if you have a quick solution, please let me know. Thanks :-)

dsindex commented 3 years ago

@geo47

i just added the --use_char_cnn option for BERT embeddings. https://github.com/dsindex/ntagger/commit/7e0207c53ce28d3254208de4bd635f0a3aa18721

character ids for each subword token are built like this: https://github.com/dsindex/ntagger/blob/master/util_bert.py

# char extension
c_ids = batch_to_ids([word_tokens])[0].detach().cpu().numpy().tolist()
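
(batch_to_ids here is presumably the ELMo helper that maps each token to a fixed-width row of character ids; a small usage sketch, with the allennlp import as an assumption:)

    # assumed import; each token becomes a row of 50 character ids (matching char_n_ctx == 50)
    from allennlp.modules.elmo import batch_to_ids

    word_tokens = ['the', 'dog', 'is', 'ha', '##iry']
    c_ids = batch_to_ids([word_tokens])                # tensor of shape [1, 5, 50]
    c_ids = c_ids[0].detach().cpu().numpy().tolist()   # list of 5 rows of char ids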

 """
    convention in BERT:
    for single sequence:
      word      : the dog is hairy
      word_idx  : 0   1   2  3                                                | params
      ----------------------------------------------------------------------- | -------------- |
      tokens:        [CLS] the dog is ha ##iry . [SEP] <pad> <pad> <pad> ...  |                |
      token_idx:       0   1   2   3  4  5     6   7     8     9     10  ...  |                |
      input_ids:       x   x   x   x  x  x     x   x     0     0     0   ...  | input_ids      |
      segment_ids:     0   0   0   0  0  0     0   0     0     0     0   ...  | token_type_ids |
      input_mask:      1   1   1   1  1  1     1   1     0     0     0   ...  | attention_mask |
      label_ids:       0   1   1   1  1  0     1   0     0     0     0   ...  |                |
      ----------------------------------------------------------------------- |                |
      pos_ids:         0   10  2   ...
      char_ids:        [0,..., 0] [259, ..., 261] ...
      -----------------------------------------------------------------------
      idx              0   1   2   3
      word2token_idx:  1   2   3   4  0  0  0 ...  
      word2token_idx[idx] = token_idx
      -----------------------------------------------------------------------
    """

however, i am not sure this is a reasonable approach;

thank you :)

edit

** using sub token labels, --bert_use_sub_label + --bert_use_pos --use_char_cnn --epoch=30

    INFO:main:[F1] : 0.9142604856512141, 3684
    INFO:main:[Elapsed Time] : 3684 examples, 146215.95406532288ms, 39.665775003881194ms on average
    accuracy: 98.32%; precision: 91.57%; recall: 91.75%; FB1: 91.66

geo47 commented 3 years ago

@dsindex

Thanks again,

This approach seems simple and workable; however, adding contextual character embeddings from BERT would be more effective for learning character context.

I am trying to apply CharacterBERT embeddings together with word embeddings in our current model. Would it be compatible to concatenate the Char-BERT embeddings..?

dsindex commented 3 years ago

@geo47

interesting~!

ntagger (BERT) uses subword-level embeddings. but CharacterBERT seems to be word-level.

CharacterBERT is a variant of BERT that produces word-level contextual representations by attending to the characters of each input token.

so, you may need to merge the subword embeddings into word-level ones, or to slice out the first subword per word.

i mean something like the subword pooling described in the following comments.

dsindex commented 3 years ago

i am going to change the --bert_use_crf_slice option to --bert_use_subword_pooling, so i released backup code (https://github.com/dsindex/ntagger/releases/tag/v1.0) before the modification.

dsindex commented 3 years ago

@geo47

i think 'Char-BERT' embeddings can be concatenated via the --bert_use_subword_pooling option.

https://github.com/dsindex/ntagger/blob/master/model.py

        if self.use_subword_pooling:
            word2token_idx = x[5]
            mask_word2token_idx = x[6].to(torch.uint8).unsqueeze(-1).to(self.device)
            # mask_word2token_idx = torch.sign(torch.abs(word2token_idx)).to(torch.uint8).unsqueeze(-1).to(self.device)
            # first subword pooling
            # solution from https://stackoverflow.com/questions/55628014/indexing-a-3d-tensor-using-a-2d-tensor
            src = embed_out
            offset = torch.arange(0, src.size(0) * src.size(1), src.size(1)).to(self.device)
            index = word2token_idx + offset.unsqueeze(1)
            embed_out = src.reshape(-1, src.shape[-1])[index]
            embed_out *= mask_word2token_idx
            # update mask, lengths for word-level
            mask = x[6].to(torch.uint8).to(self.device)
            lengths = torch.sum(mask.to(torch.long), dim=1)
            if self.use_word_embedding:
                token_ids = x[7]
                token_embed_out = self.embed_token(token_ids)
                # token_embed_out : [batch_size, seq_size, token_emb_dim]
                embed_out = torch.cat([embed_out, token_embed_out], dim=-1)
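
as a quick sanity check of the indexing trick above, a toy run with made-up shapes:

    import torch

    embed_out = torch.arange(2 * 5).float().view(2, 5, 1)      # [batch_size=2, seq_size=5, dim=1]
    word2token_idx = torch.tensor([[1, 2, 4, 0, 0],
                                   [1, 3, 0, 0, 0]])            # 0 marks padded word slots
    offset = torch.arange(0, embed_out.size(0) * embed_out.size(1), embed_out.size(1))
    index = word2token_idx + offset.unsqueeze(1)                # flat indices into [batch*seq]
    pooled = embed_out.reshape(-1, embed_out.shape[-1])[index]  # [2, 5, 1] first-subword vectors
    # padded positions still pick token 0 of each example; they are zeroed afterwards
    # by multiplying with mask_word2token_idx, as in the snippet above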

https://github.com/dsindex/ntagger/blob/master/util_bert.py

convention in BERT:
    for single sequence:
      word      : the dog is hairy .
      word_idx  : 0   1   2  3     4                                          | params
      ----------------------------------------------------------------------- | -------------- |
      tokens:        [CLS] the dog is ha ##iry . [SEP] <pad> <pad> <pad> ...  |                |
      token_idx:       0   1   2   3  4  5     6   7     8     9     10  ...  |                |
      input_ids:       x   x   x   x  x  x     x   x     0     0     0   ...  | input_ids      |
      segment_ids:     0   0   0   0  0  0     0   0     0     0     0   ...  | token_type_ids |
      input_mask:      1   1   1   1  1  1     1   1     0     0     0   ...  | attention_mask |
      label_ids:       0   1   1   1  1  0     1   0     0     0     0   ...  |                |
      ----------------------------------------------------------------------- |                |
      pos_ids:         0   10  2   ...
      char_ids:        [0,..., 0] [259, ..., 261] ...
      -----------------------------------------------------------------------
      -----------------------------------------------------------------------
      with --bert_use_subword_pooling:
      word2token_idx:  0   1   2     3   4      6  0  0  0 ...
      word_mask:       1   1   1     1   1      1  0  0  0 ...
      'label_ids, pos_ids, char_ids' are generated as word-level. 
      -----------------------------------------------------------------------
      with --bert_use_subword_pooling --bert_use_word_embedding:
      word_ids:        0   2   2928  16  23223  4  0  0 ...
      -----------------------------------------------------------------------

with the option, we are able to convert subword-level embeddings (from bert) to word-level embeddings, and the following layers (e.g., lstm / mha / crf) should be applied at word-level.

additionally, if we use --use_char_cnn with --bert_use_subword_pooling, char_ids are computed at word-level, not subword-level. this is a different implementation from https://github.com/dsindex/ntagger/issues/2#issuecomment-793796631

        if opt.bert_use_subword_pooling:
            c_ids = batch_to_ids([word])[0].detach().cpu().numpy().tolist()
            char_ids.extend(c_ids)
        else:
            c_ids = batch_to_ids([word_tokens])[0].detach().cpu().numpy().tolist()
            char_ids.extend(c_ids)