huggingface / transformers

🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.
https://huggingface.co/transformers
Apache License 2.0

KeyError: 337 when training a hugging face model using pytorch #14759

Closed: pratikchhapolika closed this issue 2 years ago

pratikchhapolika commented 2 years ago

I am training a simple binary classification model using a Hugging Face model (BERT) in PyTorch.

Here is the code:

import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from transformers import (AdamW, BertConfig, BertModel,
                          BertTokenizerFast as BertTokenizer,
                          get_linear_schedule_with_warmup)

I am reading text data and classifying it as toxic or non-toxic. I have downloaded the model and saved it to a local path.

BERT_MODEL_NAME = '/home/pch/conv-bert-base'
MODEL_PATHS = {'conv-bert-base': '/home/pch/conv-bert-base/'}
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

TRANSFORMERS = {"conv-bert-base": (BertModel, BertTokenizer, "conv-bert-base")}

df=pd.read_excel('gold_data.xlsx', engine='openpyxl')
df2=df[['text','labels','validation']]
df3=df2[df2.labels.isin([0,1])]
val_data=df2[df2.validation.isin([1])]

class SEDataset(Dataset):
    """
    Sexually explicit content dataset for hate speech classification.
    """
    def __init__(self, df, tokenizer: BertTokenizer, max_token_len: int = 512):
        """
        Constructor

        Arguments:
            df {pandas dataframe} -- Dataframe holding the data.
        """

        super().__init__()
        self.df = df 
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

        try:
            self.y = df['toxic'].values
        except KeyError: # test data
            self.y = np.zeros(len(df))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):

        data_row = self.df[idx]

        text_data = data_row['text']

        encoding = tokenizer.encode_plus(
                  text_data,
                  add_special_tokens=True,
                  max_length=512,
                  return_token_type_ids=False,
                  padding="max_length",
                  return_attention_mask=True,
                  return_tensors='pt',)

        self.word_ids = encoding["input_ids"]
        self.attention_mask=encoding["attention_mask"]

        return self.word_ids[idx], torch.tensor(self.y[idx]), self.attention_mask[idx]

class Transformer(nn.Module):

    def __init__(self, model, num_classes=1):
        """
        Constructor

        Arguments:
            model {string} -- Transformer to build the model on. Expects "conv-bert-base".
            num_classes {int} -- Number of classes (default: {1})
        """
        super().__init__()
        self.name = model

        model_class, tokenizer_class, pretrained_weights = TRANSFORMERS[model]

        bert_config = BertConfig.from_json_file(MODEL_PATHS[model] + 'config.json')
        bert_config.output_hidden_states = True

        self.transformer = BertModel(bert_config)

        self.nb_features = self.transformer.pooler.dense.out_features

        self.pooler = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features), 
            nn.Tanh(),
        )

        self.logit = nn.Linear(self.nb_features, num_classes)

    def forward(self, tokens):
        """
        Usual torch forward function

        Arguments:
            tokens {torch tensor} -- Sentence tokens

        Returns:
            torch tensor -- Class logits
        """
        _, _, hidden_states = self.transformer(
            tokens, attention_mask=(tokens > 0).long()
        )

        hidden_states = hidden_states[-1][:, 0] # Use the representation of the first token of the last layer

        ft = self.pooler(hidden_states)

        return self.logit(ft)

def fit(model, train_dataset, val_dataset, epochs=1, batch_size=32, warmup_prop=0, lr=5e-5):

    device = torch.device('cuda')
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=lr)

    num_warmup_steps = int(warmup_prop * epochs * len(train_loader))
    num_training_steps = epochs * len(train_loader)

    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    loss_fct = nn.BCEWithLogitsLoss(reduction='mean').to(device)

    for epoch in range(epochs):
        model.train()
        start_time = time.time()

        optimizer.zero_grad()
        avg_loss = 0

        for step, (x, y_batch) in tqdm(enumerate(train_loader), total=len(train_loader)): 
            y_pred = model(x.to(device))

            loss = loss_fct(y_pred.view(-1).float(), y_batch.float().to(device))
            loss.backward()
            avg_loss += loss.item() / len(train_loader)

            # xm is torch_xla.core.xla_model (TPU-specific); on a CUDA device this would normally be optimizer.step()
            xm.optimizer_step(optimizer, barrier=True)
            scheduler.step()
            model.zero_grad()
            optimizer.zero_grad()

        model.eval()
        preds = []
        truths = []
        avg_val_loss = 0.

        with torch.no_grad():
            for x, y_batch in val_loader:                
                y_pred = model(x.to(device))
                loss = loss_fct(y_pred.detach().view(-1).float(), y_batch.float().to(device))
                avg_val_loss += loss.item() / len(val_loader)

                probs = torch.sigmoid(y_pred).detach().cpu().numpy()
                preds += list(probs.flatten())
                truths += list(y_batch.numpy().flatten())
            score = roc_auc_score(truths, preds)

        dt = time.time() - start_time
        lr = scheduler.get_last_lr()[0]
        print(f'Epoch {epoch + 1}/{epochs} \t lr={lr:.1e} \t t={dt:.0f}s \t loss={avg_loss:.4f} \t val_loss={avg_val_loss:.4f} \t val_auc={score:.4f}')

model = Transformer("conv-bert-base")
epochs = 1 # 1 epoch seems to be enough
batch_size = 32
warmup_prop = 0.1
lr = 2e-5  # Important parameter to tweak

train_dataset = SEDataset(df3,tokenizer)
val_dataset = SEDataset(val_data,tokenizer)

fit(model, train_dataset, val_dataset, epochs=epochs, batch_size=batch_size, warmup_prop=warmup_prop, lr=lr)

I have attached all of the code above.

Error:

0%|          | 0/29 [00:00<?, ?it/s]

KeyError: 337
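
The changing key suggests the failing lookup is data_row = self.df[idx] inside SEDataset.__getitem__: indexing a DataFrame with a bare integer is a column-label lookup, so a row position such as 337 raises KeyError: 337, and the number differs between runs because the DataLoader shuffles the indices. A minimal sketch of the difference, assuming a frame shaped like the one built above (column names taken from the snippet; this is an illustration, not the confirmed cause):

import pandas as pd

df = pd.DataFrame({"text": ["a", "b", "c"], "labels": [0, 1, 0]})
df = df[df.labels.isin([0, 1])].reset_index(drop=True)  # reset the index after filtering

row = df.iloc[1]   # positional row access, which is what a Dataset __getitem__ needs
# df[1]            # raises KeyError: 1 -- the integer is treated as a column label

Separately, the frame built above has a labels column rather than toxic, so the except KeyError branch in the constructor appears to replace every target with zero.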

pratikchhapolika commented 2 years ago

@patrickvonplaten, any help on this, please!

LysandreJik commented 2 years ago

I think this is an issue with the ConvBERT tokenizer conversion cc @abhishekkrthakur

abhishekkrthakur commented 2 years ago

@pratikchhapolika Where does this error occur? Would you mind posting the full stack trace?

pratikchhapolika commented 2 years ago

@abhishekkrthakur This is the only error I get. The number in KeyError: *** keeps changing every time I re-run the model.

I have uploaded the notebook as a PDF; please rename it to .ipynb.

20211213_se_model.pdf

pratikchhapolika commented 2 years ago

@abhishekkrthakur any help?

github-actions[bot] commented 2 years ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.

pratikchhapolika commented 2 years ago

@pratikchhapolika Where does this error occur? Would you mind posting the full stack trace?

Any help, please!

abhishekkrthakur commented 2 years ago

Thanks for the ping. I kind of lost track of this over Christmas. Unfortunately, I'm not able to see your PDF file. Could you please upload an .ipynb version?

pratikchhapolika commented 2 years ago

Thanks for the ping. I kind of lost track of this over Christmas. Unfortunately, I'm not able to see your PDF file. Could you please upload an .ipynb version?

Just rename the .pdf extension to .ipynb.

abhishekkrthakur commented 2 years ago

I'm not sure what the error is, but it's not related to the model. Here is my code for IMDB (since I don't have your dataset), which works just fine:

import pandas as pd
import tez
import torch
import torch.nn as nn
import transformers
from sklearn import metrics, model_selection
from transformers import AdamW, get_linear_schedule_with_warmup

class BERTDataset:
    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = transformers.AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
        self.max_len = 64

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        review = str(self.review[item])
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }

class BERTBaseUncased(tez.Model):
    def __init__(self, num_train_steps):
        super().__init__()
        config = transformers.AutoConfig.from_pretrained("YituTech/conv-bert-base")
        config.update(
            {
                "output_hidden_states": True,
            }
        )
        self.tokenizer = transformers.AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
        self.bert = transformers.AutoModel.from_pretrained("YituTech/conv-bert-base", config=config)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)
        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"

    def fetch_optimizer(self):
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        opt = AdamW(optimizer_parameters, lr=3e-5)
        return opt

    def fetch_scheduler(self):
        sch = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=self.num_train_steps
        )
        return sch

    def loss(self, outputs, targets):
        if targets is None:
            return None
        return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

    def monitor_metrics(self, outputs, targets):
        if targets is None:
            return {}
        outputs = torch.sigmoid(outputs).cpu().detach().numpy() >= 0.5
        targets = targets.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, outputs)
        return {"accuracy": accuracy}

    def forward(self, ids, mask, token_type_ids, targets=None):
        o_2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        pooled_output = torch.mean(o_2.last_hidden_state, dim=1)
        print(pooled_output.shape)
        b_o = self.bert_drop(pooled_output)
        output = self.out(b_o)
        loss = self.loss(output, targets)
        acc = self.monitor_metrics(output, targets)
        return output, loss, acc

if __name__ == "__main__":
    dfx = pd.read_csv("/home/abhishek/workspace/autoxgb/datasets/imdb.csv").fillna("none")
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = BERTDataset(review=df_train.review.values, target=df_train.sentiment.values)

    valid_dataset = BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values)

    n_train_steps = int(len(df_train) / 32 * 10)
    model = BERTBaseUncased(num_train_steps=n_train_steps)

    tb_logger = tez.callbacks.TensorBoardLogger(log_dir=".logs/")
    es = tez.callbacks.EarlyStopping(monitor="valid_loss", model_path="model.bin")
    model.fit(
        train_dataset,
        valid_dataset=valid_dataset,
        train_bs=32,
        device="cuda",
        epochs=50,
        callbacks=[tb_logger, es],
        fp16=True,
    )
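
One difference worth noting against the SEDataset in the issue: BERTDataset indexes plain numpy arrays (self.review[item], self.target[item]) taken from DataFrames that were reset_index(drop=True) after the split, so a positional lookup can never hit a missing label, and encode_plus is called with truncation=True so no sequence exceeds max_length.
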
github-actions[bot] commented 2 years ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.

Please note that issues that do not follow the contributing guidelines are likely to be ignored.

zhongsheng-chen commented 2 years ago

I got the same error when I used transformers to perform NER on Chinese text. My code is:

from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "我的名字叫大头,男,生于1900年12月12日"

ner_results = nlp(example)
print(ner_results)

Then I got:

KeyError                                  Traceback (most recent call last)
<ipython-input> in <module>
      7 example = "我的名字叫大头,男,生于1900年12月12日"
      8
----> 9 ner_results = nlp(example)
     10 print(ner_results)

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/token_classification.py in __call__(self, inputs, **kwargs)
    187             kwargs["offset_mapping"] = offset_mapping
    188
--> 189         return super().__call__(inputs, **kwargs)
    190
    191     def preprocess(self, sentence, offset_mapping=None):

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/base.py in __call__(self, inputs, num_workers, batch_size, *args, **kwargs)
   1025             return self.iterate(inputs, preprocess_params, forward_params, postprocess_params)
   1026         else:
-> 1027             return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
   1028
   1029     def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/base.py in run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
   1033         model_inputs = self.preprocess(inputs, **preprocess_params)
   1034         model_outputs = self.forward(model_inputs, **forward_params)
-> 1035         outputs = self.postprocess(model_outputs, **postprocess_params)
   1036         return outputs
   1037

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/token_classification.py in postprocess(self, model_outputs, aggregation_strategy, ignore_labels)
    240             sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
    241         )
--> 242         grouped_entities = self.aggregate(pre_entities, aggregation_strategy)
    243         # Filter anything that is in self.ignore_labels
    244         entities = [

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/token_classification.py in aggregate(self, pre_entities, aggregation_strategy)
    319             score = pre_entity["scores"][entity_idx]
    320             entity = {
--> 321                 "entity": self.model.config.id2label[entity_idx],
    322                 "score": score,
    323                 "index": pre_entity["index"],

KeyError: 7357
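
This KeyError has a different cause from the one in the original issue: the line that fails is self.model.config.id2label[entity_idx], and bert-base-chinese is a masked-language-model checkpoint, so its config carries no NER label map; the index the pipeline predicts (7357 here, vocabulary-sized rather than label-sized) therefore has no entry in id2label. A minimal sketch of how to check this, assuming a recent transformers release; the NER checkpoint name below is a placeholder, not a real model id:

from transformers import AutoConfig, pipeline

config = AutoConfig.from_pretrained("bert-base-chinese")
print(config.id2label)  # likely just the generic default labels, no NER tag map

# A checkpoint actually fine-tuned for token classification is needed instead;
# "your-org/chinese-ner-model" is a placeholder to be replaced with a real one.
ner = pipeline("ner", model="your-org/chinese-ner-model")
print(ner("我的名字叫大头,男,生于1900年12月12日"))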

nf78 commented 2 years ago

I've actually found a solution for this and posted it in a Stack Overflow answer.