Closed: pratikchhapolika closed this issue 2 years ago.
@patrickvonplaten any help on this, please!
I think this is an issue with the ConvBERT tokenizer conversion. cc @abhishekkrthakur
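As a hedged way to test that hypothesis: comparing the slow and fast ConvBERT tokenizers should expose a broken conversion as diverging ids or vocab sizes. A minimal sketch:

```python
# Minimal sketch to sanity-check the ConvBERT tokenizer conversion: if the
# fast (converted) tokenizer disagrees with the slow one, conversion is suspect.
from transformers import AutoTokenizer

slow = AutoTokenizer.from_pretrained("YituTech/conv-bert-base", use_fast=False)
fast = AutoTokenizer.from_pretrained("YituTech/conv-bert-base", use_fast=True)

text = "a quick sanity-check sentence"
print(slow(text)["input_ids"])
print(fast(text)["input_ids"])  # diverging ids would point at the conversion
print(len(slow), len(fast))     # vocab sizes should match as well
```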
@pratikchhapolika where does this error occur? Would you mind posting the full stack trace?
@abhishekkrthakur This is the only error I get; the number in `KeyError: ***` keeps changing each time I re-run the model.
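For reference, a minimal sketch that would surface the full traceback in a notebook instead of the bare `KeyError`; the `model.fit` call here is an assumption, matching the tez training call shown later in this thread:

```python
# Hypothetical wrapper: wrap the (assumed) tez training call so the notebook
# prints the complete traceback rather than only "KeyError: ***".
import traceback

try:
    model.fit(train_dataset, valid_dataset=valid_dataset, train_bs=32, device="cuda")
except Exception:
    traceback.print_exc()
```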
Uploaded the notebook (as a PDF). Please rename it to .ipynb.
@abhishekkrthakur any help?
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.
> @pratikchhapolika where does this error occur? Would you mind posting the full stack trace?

Any help, please!
Thanks for the ping. I kind of lost track of this during Christmas time. Unfortunately, I'm not able to see your PDF file. Could you please upload an .ipynb version?
Just rename .pdf to .ipynb.
I'm not sure what the error is, but it's not related to the model. Here is my code for IMDB (since I don't have your dataset), which works just fine:
```python
import pandas as pd
import tez
import torch
import torch.nn as nn
import transformers
from sklearn import metrics, model_selection
from transformers import AdamW, get_linear_schedule_with_warmup


class BERTDataset:
    def __init__(self, review, target):
        self.review = review
        self.target = target
        self.tokenizer = transformers.AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
        self.max_len = 64

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        review = str(self.review[item])
        review = " ".join(review.split())

        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }


class BERTBaseUncased(tez.Model):
    def __init__(self, num_train_steps):
        super().__init__()
        config = transformers.AutoConfig.from_pretrained("YituTech/conv-bert-base")
        config.update({"output_hidden_states": True})
        self.tokenizer = transformers.AutoTokenizer.from_pretrained("YituTech/conv-bert-base")
        self.bert = transformers.AutoModel.from_pretrained("YituTech/conv-bert-base", config=config)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)
        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"

    def fetch_optimizer(self):
        # Apply weight decay to everything except bias and LayerNorm parameters.
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        opt = AdamW(optimizer_parameters, lr=3e-5)
        return opt

    def fetch_scheduler(self):
        sch = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=self.num_train_steps
        )
        return sch

    def loss(self, outputs, targets):
        if targets is None:
            return None
        return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

    def monitor_metrics(self, outputs, targets):
        if targets is None:
            return {}
        outputs = torch.sigmoid(outputs).cpu().detach().numpy() >= 0.5
        targets = targets.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, outputs)
        return {"accuracy": accuracy}

    def forward(self, ids, mask, token_type_ids, targets=None):
        o_2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Mean-pool the last hidden state instead of relying on a pooler output.
        pooled_output = torch.mean(o_2.last_hidden_state, dim=1)
        print(pooled_output.shape)  # debug: inspect the pooled shape
        b_o = self.bert_drop(pooled_output)
        output = self.out(b_o)
        loss = self.loss(output, targets)
        acc = self.monitor_metrics(output, targets)
        return output, loss, acc


if __name__ == "__main__":
    dfx = pd.read_csv("/home/abhishek/workspace/autoxgb/datasets/imdb.csv").fillna("none")
    dfx.sentiment = dfx.sentiment.apply(lambda x: 1 if x == "positive" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values
    )
    # Reset the indices so positional lookups in the Dataset stay valid.
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = BERTDataset(review=df_train.review.values, target=df_train.sentiment.values)
    valid_dataset = BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values)

    n_train_steps = int(len(df_train) / 32 * 10)
    model = BERTBaseUncased(num_train_steps=n_train_steps)

    tb_logger = tez.callbacks.TensorBoardLogger(log_dir=".logs/")
    es = tez.callbacks.EarlyStopping(monitor="valid_loss", model_path="model.bin")

    model.fit(
        train_dataset,
        valid_dataset=valid_dataset,
        train_bs=32,
        device="cuda",
        epochs=50,
        callbacks=[tb_logger, es],
        fp16=True,
    )
```
I got the same error when I used transformers to perform NER on Chinese text. My code is:

```python
# Note: the `pipeline` import is needed for this snippet to run.
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("bert-base-chinese")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "我的名字叫大头,男,生于1900年12月12日"  # "My name is Datou, male, born December 12, 1900"

ner_results = nlp(example)
print(ner_results)
```
Then I got:

```
KeyError                                  Traceback (most recent call last)
<ipython-input> in <module>
      7 example = "我的名字叫大头,男,生于1900年12月12日"
      8
----> 9 ner_results = nlp(example)
     10 print(ner_results)

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/token_classification.py in __call__(self, inputs, **kwargs)
    187             kwargs["offset_mapping"] = offset_mapping
    188
--> 189         return super().__call__(inputs, **kwargs)
    190
    191     def preprocess(self, sentence, offset_mapping=None):

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/base.py in __call__(self, inputs, num_workers, batch_size, *args, **kwargs)
   1025             return self.iterate(inputs, preprocess_params, forward_params, postprocess_params)
   1026         else:
-> 1027             return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
   1028
   1029     def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/base.py in run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
   1033         model_inputs = self.preprocess(inputs, **preprocess_params)
   1034         model_outputs = self.forward(model_inputs, **forward_params)
-> 1035         outputs = self.postprocess(model_outputs, **postprocess_params)
   1036         return outputs
   1037

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/token_classification.py in postprocess(self, model_outputs, aggregation_strategy, ignore_labels)
    240             sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
    241         )
--> 242         grouped_entities = self.aggregate(pre_entities, aggregation_strategy)
    243         # Filter anything that is in self.ignore_labels
    244         entities = [

~/opt/anaconda3/lib/python3.8/site-packages/transformers/pipelines/token_classification.py in aggregate(self, pre_entities, aggregation_strategy)
    319             score = pre_entity["scores"][entity_idx]
    320             entity = {
--> 321                 "entity": self.model.config.id2label[entity_idx],
    322                 "score": score,
    323                 "index": pre_entity["index"],

KeyError: 7357
```
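A likely explanation, inferred from the last frame rather than confirmed in the thread: `AutoModelForMaskedLM` has no token-classification head, so the indices the NER pipeline predicts are not in `config.id2label`, hence `KeyError: 7357`. A minimal sketch of the usual fix, loading a token-classification checkpoint instead (the checkpoint name below is assumed purely for illustration):

```python
# Sketch of the usual fix: load a model with a token-classification head so
# every predicted index exists in config.id2label. The checkpoint name is an
# assumption for illustration, not taken from this thread.
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

model_name = "ckiplab/bert-base-chinese-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
print(model.config.id2label)  # the label map the pipeline will index into

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
print(nlp("我的名字叫大头,男,生于1900年12月12日"))
```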
I've actually found a solution for this and posted it in a Stack Overflow answer.
I am training a simple binary classification model using Hugging Face models with PyTorch (BERT, PyTorch, Hugging Face). I am reading text data and classifying it as toxic or non-toxic. I have downloaded and saved the model to a local path. I have attached all the code above.
Error:

```
0%|          | 0/29 [00:00<?, ?it/s]
KeyError: 337
```
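A hedged guess at this one, not confirmed by the thread: a `KeyError` whose number changes between runs often means the dataset indexes a pandas object that kept its original index after `train_test_split`, so a positional lookup like `self.target[337]` misses. Note that the IMDB script above calls `reset_index(drop=True)` for exactly this reason. A minimal sketch:

```python
# Minimal sketch of the suspected fix; "train.csv" is a hypothetical stand-in
# for the toxic-comment data described above.
import pandas as pd
from sklearn import model_selection

df = pd.read_csv("train.csv")
df_train, df_valid = model_selection.train_test_split(df, test_size=0.1)

# Without these resets, df_train.target[337] can raise KeyError: 337, because
# row 337 may have landed in df_valid and taken its index label with it.
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
```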