Traceback (most recent call last):
File "train_unicoil.py", line 102, in <module>
main()
File "train_unicoil.py", line 95, in main
trainer.train() # TODO: resume training
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/transformers/trainer.py", line 1500, in train
return inner_training_loop(
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/transformers/trainer.py", line 1716, in _inner_training_loop
for step, inputs in enumerate(epoch_iterator):
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 681, in __next__
data = self._next_data()
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 721, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/src/tevatron/data.py", line 53, in __getitem__
encoded_query = self.create_one_example(qry, is_query=True)
File "/src/tevatron/src/tevatron/data.py", line 33, in create_one_example
item = self.tok.prepare_for_model(
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 3121, in prepare_for_model
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/transformers/models/bert/tokenization_bert.py", line 289, in build_inputs_with_special_tokens
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
TypeError: can only concatenate list (not "str") to list
If I run the unicoil example with the parameter:
--dataset_name Tevatron/msmarco-passage \
there are no problems, but I would like to be able to use this parameter instead:
--train_dir /train.jsonl.gz \
Hi, instead of loading Tevatron/msmarco-passage directly from Hugging Face, I downloaded the dataset and ran the following:
def run():
    """Launch uniCOIL training on a local ``/train.jsonl.gz`` file.

    Assembles the ``train_unicoil.py`` command line and runs it through the
    shell via ``os.system``, with ``CUDA_VISIBLE_DEVICES=0`` set for the
    child process. The exit status of the command is not inspected.
    """
    # One argument group per entry, joined with single spaces. This avoids
    # backslash line continuations inside the string literal, which turn into
    # a literal "\ " (and break the shell command) if the lines are merged.
    command = " ".join([
        "CUDA_VISIBLE_DEVICES=0",
        "python3 train_unicoil.py",
        "--output_dir unicoil_distilbert",
        "--model_name_or_path distilbert-base-uncased",
        "--save_steps 20000",
        "--train_dir /train.jsonl.gz",
        "--per_device_train_batch_size 8",
        "--train_n_passages 8",
        "--learning_rate 5e-6",
        "--q_max_len 16",
        "--p_max_len 128",
        "--num_train_epochs 3",
        "--add_pooler",
        "--projection_in_dim 768",
        "--projection_out_dim 1",
        "--logging_steps 500",
        "--overwrite_output_dir",
    ])
    os.system(command)


# Only start training when executed as a script, not when imported.
if __name__ == '__main__':
    run()
But I get the following error:
Traceback (most recent call last):
File "train_unicoil.py", line 102, in <module>
main()
File "train_unicoil.py", line 95, in main
trainer.train() # TODO: resume training
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/transformers/trainer.py", line 1500, in train
return inner_training_loop(
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/transformers/trainer.py", line 1716, in _inner_training_loop
for step, inputs in enumerate(epoch_iterator):
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 681, in __next__
data = self._next_data()
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 721, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 49, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/src/tevatron/data.py", line 53, in __getitem__
encoded_query = self.create_one_example(qry, is_query=True)
File "/src/tevatron/src/tevatron/data.py", line 33, in create_one_example
item = self.tok.prepare_for_model(
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/transformers/tokenization_utils_base.py", line 3121, in prepare_for_model
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
File "/src/tevatron/examples/unicoil/test_teva/lib/python3.8/site-packages/transformers/models/bert/tokenization_bert.py", line 289, in build_inputs_with_special_tokens
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
TypeError: can only concatenate list (not "str") to list
If I run the unicoil example with the parameter --dataset_name Tevatron/msmarco-passage, there are no problems, but I would like to be able to use this parameter instead:
--train_dir /train.jsonl.gz \
What can I do to fix the issue?