Traceback (most recent call last):
  File "pretrain_gpt2.py", line 794, in <module>
    main()
  File "pretrain_gpt2.py", line 722, in main
    args.eod_token = get_train_val_test_data(args)
  File "pretrain_gpt2.py", line 664, in get_train_val_test_data
    (train_data, val_data, test_data), tokenizer = data_config.apply(
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/configure_data.py", line 34, in apply
    return make_loaders(args)
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/configure_data.py", line 182, in make_loaders
    train, tokenizer = data_utils.make_dataset(**data_set_args)
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/__init__.py", line 136, in make_dataset
    ds = [get_dataset(p, tokenizer=tokenizer, pre_tokenize=pre_tokenize, local_rank=local_rank) for p in path]
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/__init__.py", line 136, in <listcomp>
    ds = [get_dataset(p, tokenizer=tokenizer, pre_tokenize=pre_tokenize, local_rank=local_rank) for p in path]
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/__init__.py", line 83, in get_dataset
    sample_tokens = text[rand_id]['tokens'][:1024]
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/corpora.py", line 91, in __getitem__
    prompt = self.prompts[index]
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/lazy_loader.py", line 180, in __getitem__
    end = self.ends[index]
IndexError: list index out of range
请问正确的数据格式是什么样子呢？
{"prompt": "建设多式联运示范体系", "text":"第一章研究概述"} {"prompt": "建设多式联运示范", "text":"第一章研究概述"}
Traceback (most recent call last):
  File "pretrain_gpt2.py", line 794, in <module>
    main()
  File "pretrain_gpt2.py", line 722, in main
    args.eod_token = get_train_val_test_data(args)
  File "pretrain_gpt2.py", line 664, in get_train_val_test_data
    (train_data, val_data, test_data), tokenizer = data_config.apply(
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/configure_data.py", line 34, in apply
    return make_loaders(args)
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/configure_data.py", line 182, in make_loaders
    train, tokenizer = data_utils.make_dataset(**data_set_args)
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/__init__.py", line 136, in make_dataset
    ds = [get_dataset(p, tokenizer=tokenizer, pre_tokenize=pre_tokenize, local_rank=local_rank) for p in path]
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/__init__.py", line 136, in <listcomp>
    ds = [get_dataset(p, tokenizer=tokenizer, pre_tokenize=pre_tokenize, local_rank=local_rank) for p in path]
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/__init__.py", line 83, in get_dataset
    sample_tokens = text[rand_id]['tokens'][:1024]
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/corpora.py", line 91, in __getitem__
    prompt = self.prompts[index]
  File "/home/ubuntu/newdisk/zjj/Chinese-Transformer-XL-master/data_utils/lazy_loader.py", line 180, in __getitem__
    end = self.ends[index]
IndexError: list index out of range
请问正确的数据格式是什么样子呢？