IDEA-CCNL / Fengshenbang-LM

Fengshenbang-LM (封神榜大模型) is an open-source large-model ecosystem led by the Cognitive Computing and Natural Language Research Center at IDEA, intended to serve as infrastructure for Chinese AIGC and cognitive intelligence.
Apache License 2.0

Error when training a LongformerForSequenceClassification model #413

Open · haonit opened this issue 1 year ago

haonit commented 1 year ago

Training a model raises an error. The code is roughly as follows:

```python
"""Erlangshen Longformer"""

from datasets import Dataset
import pandas as pd
from fengshen.models.longformer import LongformerForSequenceClassification
from transformers import BertTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer
import torch

input_field = "content"
model_name = "IDEA-CCNL/Erlangshen-Longformer-110M"
tokenizer = BertTokenizer.from_pretrained(model_name)
max_input_size = 1536
accuracy = evaluate.load("accuracy")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
id2label = {0: "正面", 1: "中性", 2: "负面"}
label2id = {"正面": 0, "中性": 1, "负面": 2}
model = LongformerForSequenceClassification.from_pretrained(
    model_name, num_labels=len(id2label), id2label=id2label, label2id=label2id
)

def preprocess_function(examples):
    return tokenizer(examples[input_field], truncation=True, max_length=max_input_size)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

train = pd.read_csv("data/Train_DataSet.csv")
train_label = pd.read_csv("data/Train_DataSet_Label.csv")
train = train.merge(train_label, on='id', how='left')
train['label'] = train['label'].fillna(-1)
train = train[train['label'] != -1]
train['label'] = train['label'].astype(int)
train["content"] = train["content"].fillna("无")
train["title"] = train["title"].fillna("无")

dataset = Dataset.from_pandas(train[[input_field, 'label']])
dataset = dataset.train_test_split()

tokenized_ds = dataset.map(preprocess_function, batched=True)

inputs, labels = tokenized_ds['test'][input_field], tokenized_ds['test']['label']

torch.autograd.set_detect_anomaly(True)
training_args = TrainingArguments(
    output_dir="longformer_chinese_sentiment",
    learning_rate=5e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=4,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    optim='adamw_torch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
```

The error is as follows:

```
[2023-08-25 13:35:24,607] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Map: 100%|██████████| 5505/5505 [00:19<00:00, 286.92 examples/s]
Map: 100%|██████████| 1835/1835 [00:06<00:00, 281.76 examples/s]
/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/transformers/modeling_utils.py:900: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.
  warnings.warn(
/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/autograd/__init__.py:200: UserWarning: Error detected in BmmBackward0. Traceback of forward call that caused the error:
  File "ErLS_LF.py", line 90, in <module>
    trainer.train()
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/transformers/trainer.py", line 1555, in train
    return inner_training_loop(
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/transformers/trainer.py", line 1837, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/transformers/trainer.py", line 2682, in training_step
    loss = self.compute_loss(model, inputs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/transformers/trainer.py", line 2707, in compute_loss
    outputs = model(**inputs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/fengshen-0.0.1-py3.8.egg/fengshen/models/longformer/modeling_longformer.py", line 2064, in forward
    outputs = self.longformer(
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/fengshen-0.0.1-py3.8.egg/fengshen/models/longformer/modeling_longformer.py", line 1880, in forward
    encoder_outputs = self.encoder(
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/fengshen-0.0.1-py3.8.egg/fengshen/models/longformer/modeling_longformer.py", line 1472, in forward
    layer_outputs = layer_module(
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/fengshen-0.0.1-py3.8.egg/fengshen/models/longformer/modeling_longformer.py", line 1396, in forward
    self_attn_outputs = self.attention(
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/fengshen-0.0.1-py3.8.egg/fengshen/models/longformer/modeling_longformer.py", line 1331, in forward
    self_outputs = self.self(
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/fengshen-0.0.1-py3.8.egg/fengshen/models/longformer/modeling_longformer.py", line 779, in forward
    attn_output = self._compute_attn_output_with_global_indices(
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/fengshen-0.0.1-py3.8.egg/fengshen/models/longformer/modeling_longformer.py", line 1113, in _compute_attn_output_with_global_indices
    attn_output_only_global = torch.matmul(
 (Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:114.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
  File "ErLS_LF.py", line 90, in <module>
    trainer.train()
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/transformers/trainer.py", line 1555, in train
    return inner_training_loop(
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/transformers/trainer.py", line 1837, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/transformers/trainer.py", line 2693, in training_step
    self.accelerator.backward(loss)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/accelerate/accelerator.py", line 1923, in backward
    loss.backward(**kwargs)
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/_tensor.py", line 487, in backward
    torch.autograd.backward(
  File "/root/miniconda3/envs/deeplearning/lib/python3.8/site-packages/torch/autograd/__init__.py", line 200, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [12, 1536, 1]], which is output 0 of AsStridedBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
```

Could someone please help me figure out where the problem is?

haiahaiah commented 4 months ago

I ran into the same problem. Have you solved it? @ganzhiruyi @hejunqing could you advise on how to fix this? Thanks a lot!