0%| | 0/18140 [00:00<?, ?it/s]Didn't find an RNG file, if you are resuming a training that was launched in a distributed fashion, reproducibility is not guaranteed.
Traceback (most recent call last):
  File "main.py", line 305, in <module>
    main()
  File "main.py", line 202, in main
    trainer.train(
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/trainer.py", line 1543, in train
    return inner_training_loop(
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/trainer.py", line 1791, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/trainer.py", line 2539, in training_step
    loss = self.compute_loss(model, inputs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/trainer.py", line 2571, in compute_loss
    outputs = model(**inputs)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 171, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 181, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 89, in parallel_apply
    output.reraise()
  File "/root/miniconda3/lib/python3.8/site-packages/torch/_utils.py", line 644, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 64, in _worker
    output = module(*input, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/workspace/bin/multimodal_transformers/model/tabular_transformers.py", line 114, in forward
    outputs = self.bert(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 1012, in forward
    embedding_output = self.embeddings(
  File "/root/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/miniconda3/lib/python3.8/site-packages/transformers/models/bert/modeling_bert.py", line 236, in forward
    embeddings += position_embeddings
RuntimeError: The size of tensor a (1487) must match the size of tensor b (512) at non-singleton dimension 1
Hi @LinXin04, could you offer some context on this error? Is this from the provided example code, or from your own? If the latter, could you share the snippet?
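For what it's worth, the size mismatch (1487 vs. 512) usually means a tokenized sequence is longer than BERT's position-embedding table (`max_position_embeddings = 512`), so the position embeddings cannot be added to the token embeddings. If you are tokenizing your own data, truncating at tokenization time is the usual fix. A minimal sketch, assuming a standard Hugging Face tokenizer (the checkpoint name and text below are placeholders, not taken from your setup):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

encoded = tokenizer(
    ["some very long document ..."],  # replace with your text column
    padding=True,
    truncation=True,   # drop tokens beyond max_length
    max_length=512,    # must not exceed the model's position embeddings
    return_tensors="pt",
)

print(encoded["input_ids"].shape)  # sequence dimension will be <= 512

If you genuinely need inputs longer than 512 tokens, raising `max_length` alone won't help; you would need a checkpoint with a larger position-embedding table (e.g. a Longformer-style model).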
Number of trainable parameters = 110197407