Traceback (most recent call last):
  File "/Data/baban/anaconda3/envs/it2/bin/fairseq-train", line 8, in <module>
    sys.exit(cli_main())
  File "/Data/baban/scripts/it2/fairseq/fairseq_cli/train.py", line 574, in cli_main
    distributed_utils.call_main(cfg, main)
  File "/Data/baban/scripts/it2/fairseq/fairseq/distributed/utils.py", line 404, in call_main
    main(cfg, **kwargs)
  File "/Data/baban/scripts/it2/fairseq/fairseq_cli/train.py", line 205, in main
    valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
  File "/Data/baban/anaconda3/envs/it2/lib/python3.8/contextlib.py", line 75, in inner
    return func(*args, **kwds)
  File "/Data/baban/scripts/it2/fairseq/fairseq_cli/train.py", line 331, in train
    log_output = trainer.train_step(samples)
  File "/Data/baban/anaconda3/envs/it2/lib/python3.8/contextlib.py", line 75, in inner
    return func(*args, **kwds)
  File "/Data/baban/scripts/it2/fairseq/fairseq/trainer.py", line 868, in train_step
    raise e
  File "/Data/baban/scripts/it2/fairseq/fairseq/trainer.py", line 843, in train_step
    loss, sample_size_i, logging_output = self.task.train_step(
  File "/Data/baban/scripts/it2/fairseq/fairseq/tasks/fairseq_task.py", line 532, in train_step
    loss, sample_size, logging_output = criterion(model, sample)
  File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/Data/baban/scripts/it2/fairseq/fairseq/criterions/label_smoothed_cross_entropy.py", line 80, in forward
    net_output = model(**sample["net_input"])
  File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/Data/baban/scripts/it2/fairseq/fairseq/models/transformer/transformer_base.py", line 164, in forward
    encoder_out = self.encoder(
  File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/Data/baban/scripts/it2/fairseq/fairseq/models/transformer/transformer_encoder.py", line 166, in forward
    return self.forward_scriptable(
  File "/Data/baban/scripts/it2/fairseq/fairseq/models/transformer/transformer_encoder.py", line 216, in forward_scriptable
    x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)
  File "/Data/baban/scripts/it2/fairseq/fairseq/models/transformer/transformer_encoder.py", line 125, in forward_embedding
    token_embedding = self.embed_tokens(src_tokens)
  File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/modules/sparse.py", line 162, in forward
    return F.embedding(
  File "/Data/baban/anaconda3/envs/it2/lib/python3.8/site-packages/torch/nn/functional.py", line 2210, in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
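If it helps to narrow things down, the crash site in the traceback is the usual index/weight device mismatch at the embedding lookup: the token indices are on one device while the embedding table is on another. A minimal standalone sketch (assuming a CUDA device is available; the vocabulary size and shapes are arbitrary) reproduces the same RuntimeError:

import torch
import torch.nn.functional as F

# Embedding weight on the GPU, token indices left on the CPU.
weight = torch.randn(100, 16, device="cuda:0")
tokens = torch.randint(0, 100, (2, 8))  # stays on "cpu"

try:
    F.embedding(tokens, weight)  # cpu indices vs. cuda:0 weight
except RuntimeError as e:
    # "Expected all tensors to be on the same device, but found at
    #  least two devices, cpu and cuda:0!"
    print(e)

# Moving the indices onto the weight's device makes the lookup succeed.
out = F.embedding(tokens.to(weight.device), weight)
print(out.shape)  # torch.Size([2, 8, 16])

So it looks like src_tokens is still on the CPU while the encoder's embed_tokens weight has been placed on cuda:0 (or on a different pipeline stage).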
I am trying to train a model with pipeline model parallelism, inspired by https://github.com/facebookresearch/fairseq/tree/main/examples/m2m_100#generation-for-the-12b-model. However, I am receiving the RuntimeError shown in the traceback above. I am using two GPUs for training. Any help regarding this will be really appreciated.
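In case it matters, the only workaround I have tried so far is to move the batch onto the device that holds the encoder's embedding weight before the forward pass. The helper below is a sketch of my own, not fairseq API (fairseq's utils.move_to_cuda looks related), and I am not sure it is the right fix when the model is split across pipeline stages:

import torch

def move_sample_to_device(sample, device):
    """Recursively move every tensor in a nested dict/list/tuple to `device`.

    Hypothetical helper for illustration only.
    """
    if torch.is_tensor(sample):
        return sample.to(device)
    if isinstance(sample, dict):
        return {k: move_sample_to_device(v, device) for k, v in sample.items()}
    if isinstance(sample, (list, tuple)):
        return type(sample)(move_sample_to_device(v, device) for v in sample)
    return sample

# Hypothetical usage, aligning the batch with the embedding table's device
# (embed_tokens is the module that raises in the traceback above):
# device = model.encoder.embed_tokens.weight.device
# sample["net_input"] = move_sample_to_device(sample["net_input"], device)

This silences the error for the first pipeline stage, but I do not know whether it is correct for the later stages, which is why I am asking here.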