KeyError: 337 when training a hugging face model using pytorch #14759

Closed pratikchhapolika closed 2 years ago

pratikchhapolika commented 2 years ago

I am training a simple binary classification model using Hugging face models using pytorch.

Bert PyTorch HuggingFace.

Here is the code:

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from transformers import AutoTokenizer

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup,BertConfig

I am reading a text-data and classifying it as toxic or non-toxic. I have downloaded and saved model in path.

BERT_MODEL_NAME = '/home/pch/conv-bert-base'
MODEL_PATHS = {'conv-bert-base': '/home/pch/conv-bert-base/'}
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

TRANSFORMERS = {"conv-bert-base": (BertModel, BertTokenizer, "conv-bert-base")}

df=pd.read_excel('gold_data.xlsx', engine='openpyxl')

class SEDataset(Dataset):
    Sexually Explicit dataset for the hate speech.
    def __init__(self, df,tokenizer: BertTokenizer, max_token_len: int = 512):

            df {pandas dataframe} -- Dataframe where the data is. 

        self.df = df 
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

            self.y = df['toxic'].values
        except KeyError: # test data
            self.y = np.zeros(len(df))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):

        data_row = self.df[idx]

        text_data = data_row['text']

        encoding = tokenizer.encode_plus(

        self.word_ids = encoding["input_ids"]

        return self.word_ids[idx], torch.tensor(self.y[idx]), self.attention_mask[idx]

class Transformer(nn.Module):

    def __init__(self, model, num_classes=1):

            model {string} -- Transformer to build the model on. Expects "conv-bert-base".
            num_classes {int} -- Number of classes (default: {1})
        super().__init__() = model

        model_class, tokenizer_class, pretrained_weights = TRANSFORMERS[model]

        bert_config = BertConfig.from_json_file(MODEL_PATHS[model] + 'config.json')
        bert_config.output_hidden_states = True

        self.transformer = BertModel(bert_config)

        self.nb_features = self.transformer.pooler.dense.out_features

        self.pooler = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features), 

        self.logit = nn.Linear(self.nb_features, num_classes)

    def forward(self, tokens):
        Usual torch forward function

            tokens {torch tensor} -- Sentence tokens

            torch tensor -- Class logits
        _, _, hidden_states = self.transformer(
            tokens, attention_mask=(tokens > 0).long()

        hidden_states = hidden_states[-1][:, 0] # Use the representation of the first token of the last layer

        ft = self.pooler(hidden_states)

        return self.logit(ft)

def fit(model, train_dataset, val_dataset, epochs=1, batch_size=32, warmup_prop=0, lr=5e-5):

    device = torch.device('cuda')

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=lr)

    num_warmup_steps = int(warmup_prop * epochs * len(train_loader))
    num_training_steps = epochs * len(train_loader)

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    loss_fct = nn.BCEWithLogitsLoss(reduction='mean').to(device)

    for epoch in range(epochs):
        start_time = time.time()

        avg_loss = 0

        for step, (x, y_batch) in tqdm(enumerate(train_loader), total=len(train_loader)): 
            y_pred = model(

            loss = loss_fct(y_pred.view(-1).float(), y_batch.float().to(device))
            avg_loss += loss.item() / len(train_loader)

            xm.optimizer_step(optimizer, barrier=True)

        preds = []
        truths = []
        avg_val_loss = 0.

        with torch.no_grad():
            for x, y_batch in val_loader:                
                y_pred = model(
                loss = loss_fct(y_pred.detach().view(-1).float(), y_batch.float().to(device))
                avg_val_loss += loss.item() / len(val_loader)

                probs = torch.sigmoid(y_pred).detach().cpu().numpy()
                preds += list(probs.flatten())
                truths += list(y_batch.numpy().flatten())
            score = roc_auc_score(truths, preds)

        dt = time.time() - start_time
        lr = scheduler.get_last_lr()[0]
        print(f'Epoch {epoch + 1}/{epochs} \t lr={lr:.1e} \t t={dt:.0f}s \t loss={avg_loss:.4f} \t val_loss={avg_val_loss:.4f} \t val_auc={score:.4f}')

model = Transformer("conv-bert-base")
epochs = 1 # 1 epoch seems to be enough
batch_size = 32
warmup_prop = 0.1
lr = 2e-5  # Important parameter to tweak

train_dataset = SEDataset(df3,tokenizer)
val_dataset = SEDataset(val_data,tokenizer)

fit(model, train_dataset, val_dataset, epochs=epochs, batch_size=batch_size, warmup_prop=warmup_prop, lr=lr)

I have attached all the codes above.


**0%| | 0/29 [00:00<?, ?it/s]

KeyError: 337**

pratikchhapolika commented 2 years ago

@patrickvonplaten any help on this, please !!

LysandreJik commented 2 years ago

I think this is an issue with the ConvBERT tokenizer conversion cc @abhishekkrthakur

abhishekkrthakur commented 2 years ago

@pratikchhapolika where does this error occur? would you mind posting the full stacktrace?

pratikchhapolika commented 2 years ago

@abhishekkrthakur This is the only error I get. The KeyError: *** keeps changing after I re-run the model.

Uploaded the notebook. Please change it to .ipynb


pratikchhapolika commented 2 years ago

@abhishekkrthakur any help?

pratikchhapolika commented 2 years ago

@pratikchhapolika where does this error occur? would you mind posting the full stacktrace?

Any help please!

abhishekkrthakur commented 2 years ago

Thanks for the ping. I kinda lost it during christmas time. Unfortunately, im not able to see your pdf file. Could you please upload an ipynb version?

pratikchhapolika commented 2 years ago

Thanks for the ping. I kinda lost it during christmas time. Unfortunately, im not able to see your pdf file. Could you please upload an ipynb version?

Just rename .pdf to .ipynb

abhishekkrthakur commented 2 years ago

I'm not sure what the error is but its not related to the model. Here is my code for imdb (since i don't have your dataset) that works just fine:

import pandas as pd
import tez
import torch
import torch.nn as nn
import transformers
from sklearn import metrics, model_selection
from transformers import AdamW, get_linear_schedule_with_warmup

class BERTDataset:
    def __init__(self, review, target): = review = target
        self.tokenizer = transformers.AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
        self.max_len = 64

    def __len__(self):
        return len(

    def __getitem__(self, item):
        review = str([item])
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor([item], dtype=torch.float),

class BERTBaseUncased(tez.Model):
    def __init__(self, num_train_steps):
        config = transformers.AutoConfig.from_pretrained("YituTech/conv-bert-base")
                "output_hidden_states": True,
        self.tokenizer = transformers.AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
        self.bert = transformers.AutoModel.from_pretrained("YituTech/conv-bert-base", config=config)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)
        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"

    def fetch_optimizer(self):
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias"]
        optimizer_parameters = [
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
        opt = AdamW(optimizer_parameters, lr=3e-5)
        return opt

    def fetch_scheduler(self):
        sch = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=self.num_train_steps
        return sch

    def loss(self, outputs, targets):
        if targets is None:
            return None
        return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

    def monitor_metrics(self, outputs, targets):
        if targets is None:
            return {}
        outputs = torch.sigmoid(outputs).cpu().detach().numpy() >= 0.5
        targets = targets.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, outputs)
        return {"accuracy": accuracy}

    def forward(self, ids, mask, token_type_ids, targets=None):
        o_2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        pooled_output = torch.mean(o_2.last_hidden_state, dim=1)
        b_o = self.bert_drop(pooled_output)
        output = self.out(b_o)
        loss = self.loss(output, targets)
        acc = self.monitor_metrics(output, targets)
        return output, loss, acc

if __name__ == "__main__":
    dfx = pd.read_csv("/home/abhishek/workspace/autoxgb/datasets/imdb.csv").fillna("none")
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = BERTDataset(, target=df_train.sentiment.values)

    valid_dataset = BERTDataset(, target=df_valid.sentiment.values)

    n_train_steps = int(len(df_train) / 32 * 10)
    model = BERTBaseUncased(num_train_steps=n_train_steps)

    tb_logger = tez.callbacks.TensorBoardLogger(log_dir=".logs/")
    es = tez.callbacks.EarlyStopping(monitor="valid_loss", model_path="model.bin")
        callbacks=[tb_logger, es],
zhongsheng-chen commented 2 years ago

I got the same error when I used transformer to perform NER on Chinese text. my code is : from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese") model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

nlp = pipeline("ner", model=model, tokenizer=tokenizer) example = "我的名字叫大头,男,生于1900年12月12日"

ner_results = nlp(example) print(ner_results)

Then I got: KeyError Traceback (most recent call last) in 7 example = "我的名字叫大头,男,生于1900年12月12日" 8 ----> 9 ner_results = nlp(example) 10 print(ner_results)

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/ in call(self, inputs, kwargs) 187 kwargs["offset_mapping"] = offset_mapping 188 --> 189 return super().call(inputs, kwargs) 190 191 def preprocess(self, sentence, offset_mapping=None):

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/ in call(self, inputs, num_workers, batch_size, *args, **kwargs) 1025 return self.iterate(inputs, preprocess_params, forward_params, postprocess_params) 1026 else: -> 1027 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params) 1028 1029 def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/ in run_single(self, inputs, preprocess_params, forward_params, postprocess_params) 1033 model_inputs = self.preprocess(inputs, preprocess_params) 1034 model_outputs = self.forward(model_inputs, forward_params) -> 1035 outputs = self.postprocess(model_outputs, **postprocess_params) 1036 return outputs 1037

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/ in postprocess(self, model_outputs, aggregation_strategy, ignore_labels) 240 sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy 241 ) --> 242 grouped_entities = self.aggregate(pre_entities, aggregation_strategy) 243 # Filter anything that is in self.ignore_labels 244 entities = [

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/ in aggregate(self, pre_entities, aggregation_strategy) 319 score = pre_entity["scores"][entity_idx] 320 entity = { --> 321 "entity": self.model.config.id2label[entity_idx], 322 "score": score, 323 "index": pre_entity["index"],

KeyError: 7357

nf78 commented 2 years ago

I've actually found a solution for this and posted it on a stackoverflow answer