dsindex / ntagger

reference pytorch code for named entity tagging

Concatenate CharCNN to BertLSTMCRF #2

Closed · geo47 closed this issue 3 years ago

geo47 commented 3 years ago

Hello,

With the given parameters, I applied CharCNN and concatenated it with the BERT embeddings alongside POS embeddings. However, the CharCNN concatenation gives a dimension error.

File "ntagger/model.py", line 220, in forward
mask = char_ids.view(-1, self.char_n_ctx).ne(self.char_padding_idx) # broadcasting
RuntimeError: shape '[-1, 50]' is invalid for input of size 2880

Could you look into this problem, or am I doing something wrong here?

Thanks.

dsindex commented 3 years ago

@geo47

for char embeddings, we need to construct char_ids from the dataset. did you modify dataset.py for that? https://github.com/dsindex/ntagger/blob/master/dataset.py
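
since x[2] in the BERT path is token_type_ids with shape [batch_size, seq_size], the reported size 2880 is most likely batch_size * seq_size, i.e. the model never sees real char ids. the model expects char_ids shaped [batch_size, seq_size, char_n_ctx] with char_n_ctx == 50. a rough sketch of the per-example padding dataset.py would have to do (illustrative only, not the actual repo code; the padding id 0 is an assumption):

    # char_ids for one example : [seq_size, char_n_ctx], stacked over the batch
    # to [batch_size, seq_size, char_n_ctx]; model.py then calls
    # char_ids.view(-1, char_n_ctx) -> [batch_size * seq_size, char_n_ctx]
    def pad_char_ids(c_ids, seq_size, char_n_ctx=50, pad_id=0):
        # c_ids : list of per-token char-id rows, each of length char_n_ctx
        c_ids = c_ids[:seq_size]                                     # truncate long sequences
        c_ids += [[pad_id] * char_n_ctx] * (seq_size - len(c_ids))   # pad short sequences
        return c_ids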

geo47 commented 3 years ago

Hi,

Thanks for your reply. I didn't modify dataset.py; I just followed the char embedding code, the same as in GloveLSTMCRF and ElmoLSTMCRF.

Here is the code that I used. One thing you will notice in the code is that I also applied multi-head attention on top of the Bi-LSTM. Could you also verify that piece of code? It seems to be working fine. :-)

class BertLSTMCRF(BaseModel):
    def __init__(self, config, bert_config, bert_model, bert_tokenizer, label_path, pos_path, use_crf=False, use_pos=False, use_char_cnn=False, disable_lstm=False, use_mha=False, feature_based=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        lstm_hidden_dim = config['lstm_hidden_dim']
        lstm_num_layers = config['lstm_num_layers']
        lstm_dropout = config['lstm_dropout']
        self.use_crf = use_crf
        self.use_pos = use_pos
        self.use_char_cnn = use_char_cnn # use cnn embeddings
        self.disable_lstm = disable_lstm
        self.use_mha = use_mha # use Multi-head attention

        # bert embedding layer
        self.bert_config = bert_config
        self.bert_model = bert_model
        self.bert_tokenizer = bert_tokenizer
        self.bert_feature_based = feature_based
        self.bert_hidden_size = bert_config.hidden_size
        self.bert_num_layers = bert_config.num_hidden_layers

        # DSA layer for bert_feature_based
        dsa_num_attentions = config['dsa_num_attentions']
        dsa_input_dim = self.bert_hidden_size
        dsa_dim = config['dsa_dim']
        dsa_r = config['dsa_r']
        self.dsa = DSA(config, dsa_num_attentions, dsa_input_dim, dsa_dim, dsa_r=dsa_r)
        self.layernorm_dsa = nn.LayerNorm(self.dsa.last_dim)

        bert_emb_dim = self.bert_hidden_size
        if self.bert_feature_based:
            '''
            # 1) last layer, 2) mean pooling
            bert_emb_dim = self.bert_hidden_size
            '''
            # 3) DSA pooling
            bert_emb_dim = self.dsa.last_dim

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(self.pos_vocab_size, pos_emb_dim, weights_matrix=None, non_trainable=False, padding_idx=padding_idx)

        # char embedding layer
        if self.use_char_cnn:
            self.charcnn = CharCNN(config)

        # BiLSTM layer
        if self.use_pos and self.use_char_cnn:
            emb_dim = bert_emb_dim + pos_emb_dim + self.charcnn.last_dim
        elif self.use_pos:
            emb_dim = bert_emb_dim + pos_emb_dim
        elif self.use_char_cnn:
            emb_dim = bert_emb_dim + self.charcnn.last_dim
        else:
            emb_dim = bert_emb_dim

        if not self.disable_lstm:
            self.lstm = nn.LSTM(input_size=emb_dim,
                                hidden_size=lstm_hidden_dim,
                                num_layers=lstm_num_layers,
                                dropout=lstm_dropout,
                                bidirectional=True,
                                batch_first=True)

        self.dropout = nn.Dropout(config['dropout'])

        # Multihead attention:
        if self.use_mha:
            self.mha = nn.MultiheadAttention(2 * lstm_hidden_dim, num_heads=8)

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        if not self.disable_lstm:
            # bi concat fully connection neural network
            self.linear = nn.Linear(lstm_hidden_dim*2, self.label_size)
        else:
            self.linear = nn.Linear(emb_dim, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)

    def _compute_bert_embedding(self, x, head_mask=None):
        params = {
            'input_ids': x[0],
            'attention_mask': x[1],
            'output_hidden_states': True,
            'output_attentions': True,
            'return_dict': True
        }
        if self.bert_model.config.model_type not in ['bart', 'distilbert']:
            params['token_type_ids'] = None if self.bert_model.config.model_type in ['roberta'] else x[2] # RoBERTa doesn't use segment_ids
        if head_mask is not None:
            params['head_mask'] = head_mask
        if self.bert_feature_based:
            # feature-based
            with torch.no_grad():
                bert_outputs = self.bert_model(**params)
                if self.bert_model.config.model_type in ['bart']:
                    all_hidden_states = bert_outputs.decoder_hidden_states
                else:
                    all_hidden_states = bert_outputs.hidden_states
                '''
                # 1) last layer
                embedded = bert_outputs.last_hidden_state
                # embedded : [batch_size, seq_size, bert_hidden_size]
                '''
                '''
                # 2) mean pooling
                stack = torch.stack(all_hidden_states, dim=-1)
                embedded = torch.mean(stack, dim=-1)
                # ([batch_size, seq_size, bert_hidden_size], ..., [batch_size, seq_size, bert_hidden_size])
                # -> stack(-1) -> [batch_size, seq_size, bert_hidden_size, *], ex) * == 25 for bert large
                # -> max/mean(-1) ->  [batch_size, seq_size, bert_hidden_size]
                '''
                # 3) DSA pooling
                stack = torch.stack(all_hidden_states, dim=-2)
                # stack : [batch_size, seq_size, *, bert_hidden_size]
                stack = stack.view(-1, self.bert_num_layers + 1, self.bert_hidden_size)
                # stack : [*, bert_num_layers, bert_hidden_size]
                dsa_mask = torch.ones(stack.shape[0], stack.shape[1]).to(self.device)
                # dsa_mask : [*, bert_num_layers]
                dsa_out = self.dsa(stack, dsa_mask)
                # dsa_out : [*, self.dsa.last_dim]
                dsa_out = self.layernorm_dsa(dsa_out)
                embedded = dsa_out.view(-1, self.seq_size, self.dsa.last_dim)
                # embedded : [batch_size, seq_size, self.dsa.last_dim]
        else:
            # fine-tuning
            # x[0], x[1], x[2] : [batch_size, seq_size]
            bert_outputs = self.bert_model(**params)
            embedded = bert_outputs.last_hidden_state
            # embedded : [batch_size, seq_size, bert_hidden_size]
        return embedded, bert_outputs

    def forward(self, x, head_mask=None, freeze_bert=False):
        # x[0,1,2] : [batch_size, seq_size]

        mask = x[1].to(torch.uint8).to(self.device)
        # mask == attention_mask : [batch_size, seq_size]
        lengths = torch.sum(mask.to(torch.long), dim=1)
        # lengths : [batch_size]

        # 1. Embedding
        if freeze_bert:
            # freeze_bert is the runtime option which has the same effect as the static option `feature_based`.
            with torch.no_grad():
                bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
        else:
            bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
        # bert_embed_out : [batch_size, seq_size, *]
        pos_ids = x[3]
        pos_embed_out = self.embed_pos(pos_ids)
        # pos_embed_out : [batch_size, seq_size, pos_emb_dim]
        if self.use_pos and self.use_char_cnn:
            char_ids = x[2]
            # char_ids : [batch_size, seq_size, char_n_ctx]
            charcnn_out = self.charcnn(char_ids)
            # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
            embed_out = torch.cat([bert_embed_out, pos_embed_out, charcnn_out], dim=-1)
            # embed_out : [batch_size, seq_size, emb_dim]
        elif self.use_pos:
            embed_out = torch.cat([bert_embed_out, pos_embed_out], dim=-1)
        elif self.use_char_cnn:
            char_ids = x[2]
            # char_ids : [batch_size, seq_size, char_n_ctx]
            charcnn_out = self.charcnn(char_ids)
            # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
            embed_out = torch.cat([bert_embed_out, charcnn_out], dim=-1)
            # embed_out : [batch_size, seq_size, emb_dim]
        else:
            embed_out = bert_embed_out
        # embed_out : [batch_size, seq_size, emb_dim]
        embed_out = self.dropout(embed_out)

        ############################################################################
        # # 1. Embedding
        # token_embed_out = self.embed_token(token_ids)
        # # token_embed_out : [batch_size, seq_size, token_emb_dim]
        # pos_embed_out = self.embed_pos(pos_ids)
        # # pos_embed_out   : [batch_size, seq_size, pos_emb_dim]
        # if self.use_char_cnn:
        #     char_ids = x[2]
        #     # char_ids : [batch_size, seq_size, char_n_ctx]
        #     charcnn_out = self.charcnn(char_ids)
        #     # charcnn_out : [batch_size, seq_size, self.charcnn.last_dim]
        #     embed_out = torch.cat([token_embed_out, pos_embed_out, charcnn_out], dim=-1)
        #     # embed_out : [batch_size, seq_size, emb_dim]
        # else:
        #     embed_out = torch.cat([token_embed_out, pos_embed_out], dim=-1)
        #     # embed_out : [batch_size, seq_size, emb_dim]
        # embed_out = self.dropout(embed_out)
        ############################################################################

        # 2. LSTM
        if not self.disable_lstm:
            # FIXME : pytorch 1.7.0 bug https://github.com/pytorch/pytorch/issues/43227 , lengths.cpu()
            packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence(embed_out, lengths.cpu(), batch_first=True, enforce_sorted=False)
            lstm_out, (final_hidden_state, final_cell_state) = self.lstm(packed_embed_out)
            lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True, total_length=self.seq_size)
            # lstm_out : [batch_size, seq_size, lstm_hidden_dim*2]

            lstm_out = self.dropout(lstm_out)
        else:
            lstm_out = embed_out
            # lstm_out : [batch_size, seq_size, emb_dim]

        # 3. Multi-head attention
        if self.use_mha:
            attn_output, attn_output_weights = self.mha(lstm_out, lstm_out, lstm_out)
            logits = self.linear(attn_output)
        else:
            logits = self.linear(lstm_out)

        # 4. Output
        # logits = self.linear(attn_output)
        # logits : [batch_size, seq_size, label_size]
        if not self.use_crf: return logits
        prediction = self.crf.decode(logits)
        prediction = torch.as_tensor(prediction, dtype=torch.long)
        # prediction : [batch_size, seq_size]
        return logits, prediction
dsindex commented 3 years ago

@geo47

i have just added the '--use_mha' option.

https://github.com/dsindex/ntagger/blob/master/model.py

        if self.use_mha:
            # reference : https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
            #             https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3
            query = lstm_out.permute(1, 0, 2)
            # query : [seq_size, batch_size, self.lstm_dim]
            key = query
            value = query
            key_padding_mask = mask.ne(1) # attention_mask => mask = [[1, 1, ..., 0, ...]] => [[False, False, ..., True, ...]]
            attn_output, attn_output_weights = self.mha(query, key, value, key_padding_mask=key_padding_mask)
            # attn_output : [seq_size, batch_size, self.mha_dim]
            mha_out = attn_output.permute(1, 0, 2)
            # mha_out : [batch_size, seq_size, self.mha_dim]
            # residual, layernorm, dropout
            mha_out = self.layernorm_mha(mha_out + lstm_out)
            mha_out = self.dropout(mha_out)
        else:
            mha_out = lstm_out
            # mha_out : [batch_size, seq_size, self.mha_dim]

however...

since the BERT and LSTM layers already capture contextual information, much like multi-head attention does, i think the effect of the mha layer might be small.

[screenshots: experiment results, 2021-02-18 and 2021-02-19]

what about using multiple layers of TransformerEncoder instead of the LSTM layer?

i mean not for BertLSTMCRF but for GloveLSTMCRF, i.e., the LSTM can be replaced by multiple TransformerEncoder layers.
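
for illustration, a minimal sketch of that replacement (the layer sizes here are made up, not from this repo; d_model must be divisible by nhead):

    # illustrative sketch: a small TransformerEncoder in place of the BiLSTM
    import torch.nn as nn

    # in __init__ (emb_dim is the concatenated embedding size)
    encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=8,
                                               dim_feedforward=4 * emb_dim, dropout=0.1)
    encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)

    # in forward(): nn.TransformerEncoder expects [seq_size, batch_size, emb_dim]
    # and a src_key_padding_mask of shape [batch_size, seq_size] with True at padding positions
    enc_out = encoder(embed_out.permute(1, 0, 2), src_key_padding_mask=mask.ne(1))
    enc_out = enc_out.permute(1, 0, 2)   # back to [batch_size, seq_size, emb_dim]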

geo47 commented 3 years ago

@dsindex

You are right, the MHA results do not seem effective here, as BERT already uses multi-head attention to learn contextual information. However, I am trying to use a feature-based, pre-trained BERT model to get contextual embeddings by freezing all layers except the last one, and then train a BiLSTM-MHA-CRF on top, as described in this paper.
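
A minimal sketch of that freezing scheme, assuming a Hugging Face BertModel where bert-base has encoder layers 0-11 (not necessarily the exact code used here):

    # freeze every BERT parameter except those in the last encoder layer,
    # so BERT acts as a (mostly fixed) feature extractor under the BiLSTM-MHA-CRF head
    last_layer = bert_model.config.num_hidden_layers - 1           # 11 for bert-base
    for name, param in bert_model.named_parameters():
        param.requires_grad = name.startswith('encoder.layer.%d.' % last_layer)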

Also, the results shown above look different from the ones I got. For BERT-base (cased) with BiLSTM, I got F1 = 0.939 with only 3 epochs of training.

Lastly, as suggested, I will now try to implement a new model architecture based on a Bert-TransformerEncoder (note: I use BERT only to get embeddings, not to fine-tune it), and we'll see if it improves the results.

-- Like in etagger, I want to use CharCNN for the BertLSTMCRF model in ntagger.

Thanks

geo47 commented 3 years ago

Hi @dsindex

Here are some of my observations regarding the model.

Model               ||  F1   || Features  || BERT embedding
BERT-BiLSTM-MHA-CRF || 91.27 || word, pos || bert_feature-based, DSA
BERT-BiLSTM-CRF     || 91.32 || word, pos || bert_feature-based, no DSA
BERT-BiLSTM-CRF     || 91.34 || word, pos || bert_feature-based, DSA
BERT-BiLSTM-MHA-CRF || 91.62 || word, pos || bert_feature-based, no DSA

From the observations above, it seems that Dynamic Self-Attention (DSA) over the BERT-based embeddings hurts the BERT-BiLSTM-MHA-CRF model; without DSA, that model performs significantly better. The BERT-BiLSTM-CRF model, however, shows no significant change with or without DSA.

The attached code reflects the BertLSTMCRF class I used, with DSA disabled for testing the model.

class BertLSTMCRF(BaseModel):
    def __init__(self, config, bert_config, bert_model, bert_tokenizer, label_path, pos_path, use_crf=False, use_pos=False, use_mha=False, disable_lstm=False, feature_based=False):
        super().__init__(config=config)

        self.config = config
        self.device = config['opt'].device
        self.seq_size = config['n_ctx']
        pos_emb_dim = config['pos_emb_dim']
        lstm_hidden_dim = config['lstm_hidden_dim']
        lstm_num_layers = config['lstm_num_layers']
        lstm_dropout = config['lstm_dropout']
        self.use_crf = use_crf
        self.use_pos = use_pos
        self.use_mha = use_mha
        mha_num_attentions = config['mha_num_attentions']
        self.disable_lstm = disable_lstm

        # bert embedding layer
        self.bert_config = bert_config
        self.bert_model = bert_model
        self.bert_tokenizer = bert_tokenizer
        self.bert_feature_based = feature_based
        self.bert_hidden_size = bert_config.hidden_size
        self.bert_num_layers = bert_config.num_hidden_layers

        '''
        # DSA layer for bert_feature_based
        dsa_num_attentions = config['dsa_num_attentions']
        dsa_input_dim = self.bert_hidden_size
        dsa_dim = config['dsa_dim']
        dsa_r = config['dsa_r']
        self.dsa = DSA(config, dsa_num_attentions, dsa_input_dim, dsa_dim, dsa_r=dsa_r)
        self.layernorm_dsa = nn.LayerNorm(self.dsa.last_dim)
        '''

        bert_emb_dim = self.bert_hidden_size
        if self.bert_feature_based:

            # 1) last layer, 2) mean pooling
            bert_emb_dim = self.bert_hidden_size
            '''
            # 3) DSA pooling
            bert_emb_dim = self.dsa.last_dim
            '''

        # pos embedding layer
        self.poss = super().load_dict(pos_path)
        self.pos_vocab_size = len(self.poss)
        padding_idx = config['pad_pos_id']
        self.embed_pos = super().create_embedding_layer(self.pos_vocab_size, pos_emb_dim, weights_matrix=None, non_trainable=False, padding_idx=padding_idx)
        if self.use_pos:
            emb_dim = bert_emb_dim + pos_emb_dim
        else:
            emb_dim = bert_emb_dim

        # BiLSTM layer
        self.lstm_dim = emb_dim
        if not self.disable_lstm:
            self.lstm = nn.LSTM(input_size=emb_dim,
                                hidden_size=lstm_hidden_dim,
                                num_layers=lstm_num_layers,
                                dropout=lstm_dropout,
                                bidirectional=True,
                                batch_first=True)
            self.lstm_dim = lstm_hidden_dim*2

        self.dropout = nn.Dropout(config['dropout'])

        # Multi-Head Attention layer
        self.mha_dim = self.lstm_dim 
        if self.use_mha:
            self.mha = nn.MultiheadAttention(self.lstm_dim, num_heads=mha_num_attentions)
            self.layernorm_mha = nn.LayerNorm(self.mha_dim)

        # projection layer
        self.labels = super().load_dict(label_path)
        self.label_size = len(self.labels)
        self.linear = nn.Linear(self.mha_dim, self.label_size)

        # CRF layer
        if self.use_crf:
            self.crf = CRF(num_tags=self.label_size, batch_first=True)

    def _compute_bert_embedding(self, x, head_mask=None):
        params = {
            'input_ids': x[0],
            'attention_mask': x[1],
            'output_hidden_states': True,
            'output_attentions': True,
            'return_dict': True
        }
        if self.bert_model.config.model_type not in ['bart', 'distilbert']:
            params['token_type_ids'] = None if self.bert_model.config.model_type in ['roberta'] else x[2] # RoBERTa doesn't use segment_ids
        if head_mask is not None:
            params['head_mask'] = head_mask
        if self.bert_feature_based:
            # feature-based
            with torch.no_grad():
                bert_outputs = self.bert_model(**params)
                if self.bert_model.config.model_type in ['bart']:
                    all_hidden_states = bert_outputs.decoder_hidden_states
                else:
                    all_hidden_states = bert_outputs.hidden_states

                # 1) last layer
                embedded = bert_outputs.last_hidden_state
                # embedded : [batch_size, seq_size, bert_hidden_size]

                '''
                # 2) mean pooling
                stack = torch.stack(all_hidden_states, dim=-1)
                embedded = torch.mean(stack, dim=-1)
                # ([batch_size, seq_size, bert_hidden_size], ..., [batch_size, seq_size, bert_hidden_size])
                # -> stack(-1) -> [batch_size, seq_size, bert_hidden_size, *], ex) * == 25 for bert large
                # -> max/mean(-1) ->  [batch_size, seq_size, bert_hidden_size]
                '''
                '''
                # 3) DSA pooling
                stack = torch.stack(all_hidden_states, dim=-2)
                # stack : [batch_size, seq_size, *, bert_hidden_size]
                stack = stack.view(-1, self.bert_num_layers + 1, self.bert_hidden_size)
                # stack : [*, bert_num_layers, bert_hidden_size]
                dsa_mask = torch.ones(stack.shape[0], stack.shape[1]).to(self.device)
                # dsa_mask : [*, bert_num_layers]
                dsa_out = self.dsa(stack, dsa_mask)
                # dsa_out : [*, self.dsa.last_dim]
                dsa_out = self.layernorm_dsa(dsa_out)
                embedded = dsa_out.view(-1, self.seq_size, self.dsa.last_dim) # hidden state vectors
                # embedded : [batch_size, seq_size, self.dsa.last_dim]
                '''
        else:
            # fine-tuning
            # x[0], x[1], x[2] : [batch_size, seq_size]
            bert_outputs = self.bert_model(**params)
            embedded = bert_outputs.last_hidden_state
            # embedded : [batch_size, seq_size, bert_hidden_size]
        return embedded, bert_outputs

    def forward(self, x, head_mask=None, freeze_bert=False):
        # x[0,1,2] : [batch_size, seq_size]

        mask = x[1].to(torch.uint8).to(self.device)
        # mask == attention_mask : [batch_size, seq_size]
        lengths = torch.sum(mask.to(torch.long), dim=1)
        # lengths : [batch_size]

        # 1. Embedding
        if freeze_bert:
            # freeze_bert is the runtime option which has the same effect as the static option `feature_based`.
            with torch.no_grad():
                bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
        else:
            bert_embed_out, bert_outputs = self._compute_bert_embedding(x, head_mask=head_mask)
        # bert_embed_out : [batch_size, seq_size, *]
        pos_ids = x[3]
        pos_embed_out = self.embed_pos(pos_ids)
        # pos_embed_out : [batch_size, seq_size, pos_emb_dim]
        if self.use_pos:
            embed_out = torch.cat([bert_embed_out, pos_embed_out], dim=-1)
        else:
            embed_out = bert_embed_out
        # embed_out : [batch_size, seq_size, emb_dim]
        embed_out = self.dropout(embed_out)

        # 2. LSTM
        if not self.disable_lstm:
            # FIXME : pytorch 1.7.0 bug https://github.com/pytorch/pytorch/issues/43227 , lengths.cpu()
            packed_embed_out = torch.nn.utils.rnn.pack_padded_sequence(embed_out, lengths.cpu(), batch_first=True, enforce_sorted=False)
            lstm_out, (h_n, c_n) = self.lstm(packed_embed_out)
            lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True, total_length=self.seq_size)
            # lstm_out : [batch_size, seq_size, self.lstm_dim == lstm_hidden_dim*2]
            lstm_out = self.dropout(lstm_out)
        else:
            lstm_out = embed_out
            # lstm_out : [batch_size, seq_size, self.lstm_dim == emb_dim]

        # 3. MHA
        if self.use_mha:
            # reference : https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
            #             https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3
            query = lstm_out.permute(1, 0, 2)
            # query : [seq_size, batch_size, self.lstm_dim]
            key = query
            value = query
            key_padding_mask = mask.ne(1) # attention_mask => mask = [[1, 1, ..., 0, ...]] => [[False, False, ..., True, ...]]
            attn_output, attn_output_weights = self.mha(query, key, value, key_padding_mask=key_padding_mask)
            # attn_output : [seq_size, batch_size, self.mha_dim]
            mha_out = attn_output.permute(1, 0, 2)
            # mha_out : [batch_size, seq_size, self.mha_dim]
            # residual, layernorm, dropout
            mha_out = self.layernorm_mha(mha_out + lstm_out)
            mha_out = self.dropout(mha_out)
        else:
            mha_out = lstm_out
            # mha_out : [batch_size, seq_size, self.mha_dim]

        # 4. Output
        logits = self.linear(mha_out)
        # logits : [batch_size, seq_size, label_size]
        if not self.use_crf: return logits
        prediction = self.crf.decode(logits)
        prediction = torch.as_tensor(prediction, dtype=torch.long)
        # prediction : [batch_size, seq_size]
        return logits, prediction

Please let me know your thoughts. Thanks.

dsindex commented 3 years ago

@geo47

the feature-based approach with DSA seems not that effective, based on your results and previous experiments.

[screenshot: 2021-02-24, previous experiment results]

i did experiments based on the paper below, which indicates the effectiveness of a weighted sum of the last four hidden layers. (since DSA can be regarded as a weighted sum.)

BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding https://arxiv.org/pdf/1810.04805.pdf
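
for reference, a rough sketch of such a weighted sum over the last four hidden states (illustrative only; in a real model the weights would be registered in __init__):

    # all_hidden_states : tuple of [batch_size, seq_size, hidden_size] tensors
    # (embedding output + every encoder layer), from output_hidden_states=True
    import torch
    import torch.nn as nn

    layer_weights = nn.Parameter(torch.ones(4))                # one learnable scalar per layer

    last_four = torch.stack(all_hidden_states[-4:], dim=0)     # [4, batch, seq, hidden]
    norm_w = torch.softmax(layer_weights, dim=0).view(4, 1, 1, 1)
    embedded = (norm_w * last_four).sum(dim=0)                 # [batch, seq, hidden]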

[screenshot: 2021-02-24, feature-based results table from the BERT paper]
geo47 commented 3 years ago

@dsindex

It was really informative.

So I should not use DSA if I want improved results, since I am using a feature-based BERT model. In that case, MHA without DSA seems more effective.

dsindex commented 3 years ago

since we got enough results (IMO), closing this issue :)

geo47 commented 3 years ago

@dsindex

Hi..! Sorry, but this issue was about concatenating char embeddings with BERT embeddings; I was busy with another project. Now I am back and looking into it. It would be great if we could add this feature, or if you have a quick solution, please let me know. Thanks :-)

dsindex commented 3 years ago

@geo47

i just added the --use_char_cnn option for BERT embeddings. https://github.com/dsindex/ntagger/commit/7e0207c53ce28d3254208de4bd635f0a3aa18721

character ids for each subword token are built like this: https://github.com/dsindex/ntagger/blob/master/util_bert.py

# char extension
c_ids = batch_to_ids([word_tokens])[0].detach().cpu().numpy().tolist()
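
(batch_to_ids here is presumably the ELMo helper that maps each token to a fixed-width row of character ids; a small usage sketch, with the allennlp import as an assumption:)

    # assumed import; each token becomes a row of 50 character ids (matching char_n_ctx == 50)
    from allennlp.modules.elmo import batch_to_ids

    word_tokens = ['the', 'dog', 'is', 'ha', '##iry']
    c_ids = batch_to_ids([word_tokens])                # tensor of shape [1, 5, 50]
    c_ids = c_ids[0].detach().cpu().numpy().tolist()   # list of 5 rows of char ids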

 """
    convention in BERT:
    for single sequence:
      word      : the dog is hairy
      word_idx  : 0   1   2  3                                                | params
      ----------------------------------------------------------------------- | -------------- |
      tokens:        [CLS] the dog is ha ##iry . [SEP] <pad> <pad> <pad> ...  |                |
      token_idx:       0   1   2   3  4  5     6   7     8     9     10  ...  |                |
      input_ids:       x   x   x   x  x  x     x   x     0     0     0   ...  | input_ids      |
      segment_ids:     0   0   0   0  0  0     0   0     0     0     0   ...  | token_type_ids |
      input_mask:      1   1   1   1  1  1     1   1     0     0     0   ...  | attention_mask |
      label_ids:       0   1   1   1  1  0     1   0     0     0     0   ...  |                |
      ----------------------------------------------------------------------- |                |
      pos_ids:         0   10  2   ...
      char_ids:        [0,..., 0] [259, ..., 261] ...
      -----------------------------------------------------------------------
      idx              0   1   2   3
      word2token_idx:  1   2   3   4  0  0  0 ...  
      word2token_idx[idx] = token_idx
      -----------------------------------------------------------------------
    """

however, i am not sure this is a reasonable approach;

thank you :)

edit

** using sub token labels, --bert_use_sub_label + --bert_use_pos --use_char_cnn --epoch=30

    INFO:main:[F1] : 0.9142604856512141, 3684
    INFO:main:[Elapsed Time] : 3684 examples, 146215.95406532288ms, 39.665775003881194ms on average
    accuracy: 98.32%; precision: 91.57%; recall: 91.75%; FB1: 91.66

geo47 commented 3 years ago

@dsindex

Thanks again,

This approach seems simple and workable; however, adding contextual character embeddings from BERT would be more effective for learning character context.

I am trying to apply CharacterBERT embeddings together with word embeddings in our current model. Would it be compatible to concatenate the Char-BERT embeddings..?

dsindex commented 3 years ago

@geo47

interesting~!

ntagger (BERT) uses subword-level embeddings. but CharacterBERT seems to be word-level.

CharacterBERT is a variant of BERT that produces word-level contextual representations by attending to the characters of each input token.

so, you may need to merge the subword embeddings into word-level ones, or to slice out the first subword per word.

i mean something like the subword pooling described in the following comments.

dsindex commented 3 years ago

i am going to change the --bert_use_crf_slice option to --bert_use_subword_pooling, so i released backup code (https://github.com/dsindex/ntagger/releases/tag/v1.0) before the modification.

dsindex commented 3 years ago

@geo47

i think 'Char-BERT' embeddings can be concatenated via the --bert_use_subword_pooling option.

https://github.com/dsindex/ntagger/blob/master/model.py

        if self.use_subword_pooling:
            word2token_idx = x[5]
            mask_word2token_idx = x[6].to(torch.uint8).unsqueeze(-1).to(self.device)
            # mask_word2token_idx = torch.sign(torch.abs(word2token_idx)).to(torch.uint8).unsqueeze(-1).to(self.device)
            # first subword pooling
            # solution from https://stackoverflow.com/questions/55628014/indexing-a-3d-tensor-using-a-2d-tensor
            src = embed_out
            offset = torch.arange(0, src.size(0) * src.size(1), src.size(1)).to(self.device)
            index = word2token_idx + offset.unsqueeze(1)
            embed_out = src.reshape(-1, src.shape[-1])[index]
            embed_out *= mask_word2token_idx
            # update mask, lengths for word-level
            mask = x[6].to(torch.uint8).to(self.device)
            lengths = torch.sum(mask.to(torch.long), dim=1)
            if self.use_word_embedding:
                token_ids = x[7]
                token_embed_out = self.embed_token(token_ids)
                # token_embed_out : [batch_size, seq_size, token_emb_dim]
                embed_out = torch.cat([embed_out, token_embed_out], dim=-1)
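
as a quick sanity check of the indexing trick above, a toy run with made-up shapes:

    import torch

    embed_out = torch.arange(2 * 5).float().view(2, 5, 1)      # [batch_size=2, seq_size=5, dim=1]
    word2token_idx = torch.tensor([[1, 2, 4, 0, 0],
                                   [1, 3, 0, 0, 0]])            # 0 marks padded word slots
    offset = torch.arange(0, embed_out.size(0) * embed_out.size(1), embed_out.size(1))
    index = word2token_idx + offset.unsqueeze(1)                # flat indices into [batch*seq]
    pooled = embed_out.reshape(-1, embed_out.shape[-1])[index]  # [2, 5, 1] first-subword vectors
    # padded positions still pick token 0 of each example; they are zeroed afterwards
    # by multiplying with mask_word2token_idx, as in the snippet above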

https://github.com/dsindex/ntagger/blob/master/util_bert.py

convention in BERT:
    for single sequence:
      word      : the dog is hairy .
      word_idx  : 0   1   2  3     4                                          | params
      ----------------------------------------------------------------------- | -------------- |
      tokens:        [CLS] the dog is ha ##iry . [SEP] <pad> <pad> <pad> ...  |                |
      token_idx:       0   1   2   3  4  5     6   7     8     9     10  ...  |                |
      input_ids:       x   x   x   x  x  x     x   x     0     0     0   ...  | input_ids      |
      segment_ids:     0   0   0   0  0  0     0   0     0     0     0   ...  | token_type_ids |
      input_mask:      1   1   1   1  1  1     1   1     0     0     0   ...  | attention_mask |
      label_ids:       0   1   1   1  1  0     1   0     0     0     0   ...  |                |
      ----------------------------------------------------------------------- |                |
      pos_ids:         0   10  2   ...
      char_ids:        [0,..., 0] [259, ..., 261] ...
      -----------------------------------------------------------------------
      -----------------------------------------------------------------------
      with --bert_use_subword_pooling:
      word2token_idx:  0   1   2     3   4      6  0  0  0 ...
      word_mask:       1   1   1     1   1      1  0  0  0 ...
      'label_ids, pos_ids, char_ids' are generated as word-level. 
      -----------------------------------------------------------------------
      with --bert_use_subword_pooling --bert_use_word_embedding:
      word_ids:        0   2   2928  16  23223  4  0  0 ...
      -----------------------------------------------------------------------

with the option, we are able to convert subword-level embeddings (from bert) to word-level embeddings, and the following layers (e.g., lstm / mha / crf) should be applied at word-level.

additionally, if we use --use_char_cnn with --bert_use_subword_pooling, char_ids are computed at word-level, not subword-level. this is a different implementation from https://github.com/dsindex/ntagger/issues/2#issuecomment-793796631

        if opt.bert_use_subword_pooling:
            c_ids = batch_to_ids([word])[0].detach().cpu().numpy().tolist()
            char_ids.extend(c_ids)
        else:
            c_ids = batch_to_ids([word_tokens])[0].detach().cpu().numpy().tolist()
            char_ids.extend(c_ids)