## 🐛 Bug

While training a translation model, evaluation fails under distributed data parallel mode.

### To Reproduce

Steps to reproduce the behavior (always include the command you ran):

1. Run cmd
2. Here is the traceback during the evaluation:

```
Traceback (most recent call last):
  File "/home/usr/miniconda3/envs/text_gen/bin/fairseq-train", line 33, in <module>
    sys.exit(load_entry_point('fairseq', 'console_scripts', 'fairseq-train')())
  File "/home/usr/project_2022/text_gen/fairseq/fairseq_cli/train.py", line 528, in cli_main
    distributed_utils.call_main(cfg, main)
  File "/home/usr/project_2022/text_gen/fairseq/fairseq/distributed/utils.py", line 351, in call_main
    join=True,
  File "/home/usr/miniconda3/envs/text_gen/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/usr/miniconda3/envs/text_gen/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
    while not context.join():
  File "/home/usr/miniconda3/envs/text_gen/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 150, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/usr/miniconda3/envs/text_gen/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/home/usr/project_2022/text_gen/fairseq/fairseq/distributed/utils.py", line 328, in distributed_main
    main(cfg, **kwargs)
  File "/home/usr/project_2022/text_gen/fairseq/fairseq_cli/train.py", line 188, in main
    valid_losses, should_stop = train(cfg, trainer, task, epoch_itr)
  File "/home/usr/miniconda3/envs/text_gen/lib/python3.6/contextlib.py", line 52, in inner
    return func(*args, **kwds)
  File "/home/usr/project_2022/text_gen/fairseq/fairseq_cli/train.py", line 318, in train
    cfg, trainer, task, epoch_itr, valid_subsets, end_of_epoch
  File "/home/usr/project_2022/text_gen/fairseq/fairseq_cli/train.py", line 408, in validate_and_save
    valid_losses = validate(cfg, trainer, task, epoch_itr, valid_subsets)
  File "/home/usr/project_2022/text_gen/fairseq/fairseq_cli/train.py", line 481, in validate
    trainer.valid_step(sample)
  File "/home/usr/miniconda3/envs/text_gen/lib/python3.6/contextlib.py", line 52, in inner
    return func(*args, **kwds)
  File "/home/usr/project_2022/text_gen/fairseq/fairseq/trainer.py", line 1122, in valid_step
    logging_output = self._reduce_and_log_stats(logging_outputs, sample_size)
  File "/home/usr/project_2022/text_gen/fairseq/fairseq/trainer.py", line 1492, in _reduce_and_log_stats
    logging_output = agg.get_smoothed_values()
  File "/home/usr/project_2022/text_gen/fairseq/fairseq/logging/meters.py", line 302, in get_smoothed_values
    for key in self.keys()
  File "/home/usr/project_2022/text_gen/fairseq/fairseq/logging/meters.py", line 303, in <listcomp>
    if not key.startswith("_")
  File "/home/usr/project_2022/text_gen/fairseq/fairseq/logging/meters.py", line 293, in get_smoothed_value
    return meter.fn(self)
  File "/home/usr/project_2022/text_gen/fairseq/fairseq/tasks/translation.py", line 443, in compute_bleu
    **smooth,
  File "/home/usr/miniconda3/envs/text_gen/lib/python3.6/site-packages/sacrebleu/metrics/bleu.py", line 281, in compute_bleu
    return BLEUScore(score, correct, total, precisions, bp, sys_len, ref_len)
  File "/home/usr/miniconda3/envs/text_gen/lib/python3.6/site-packages/sacrebleu/metrics/bleu.py", line 102, in __init__
    self._verbose += f"ratio = {self.ratio:.3f} hyp_len = {self.sys_len:d} "
  File "/home/usr/miniconda3/envs/text_gen/lib/python3.6/site-packages/torch/_tensor.py", line 571, in __format__
    return self.item().__format__(format_spec)
ValueError: Unknown format code 'd' for object of type 'float'
```
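Judging from the last three frames, this is a type problem: under distributed validation, the `sys_len` that reaches sacrebleu is a 0-dim, float-valued `torch.Tensor` rather than a Python `int`. `BLEUScore.__init__` formats it with `{:d}`, `Tensor.__format__` delegates to `.item()`, which returns a Python `float`, and `float` does not support the `d` format code. Single-GPU runs do not hit this, presumably because the logging outputs are never all-reduced into tensors there.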
#### Code sample
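The formatting failure can be reproduced without fairseq. A minimal sketch, assuming `sys_len` reaches sacrebleu as a 0-dim float tensor (which is what the `torch/_tensor.py` frame above indicates):

```python
import torch

# A 0-dim float tensor standing in for the all-reduced sys_len that
# fairseq's logging meters hand to sacrebleu under DDP (assumption:
# the reduction leaves a float-valued tensor, per the traceback).
sys_len = torch.tensor(5.0)

# sacrebleu's BLEUScore.__init__ does exactly this kind of formatting:
#   f"... hyp_len = {self.sys_len:d} ..."
# Tensor.__format__ calls .item(), which yields a Python float, and
# float does not support the 'd' format code.
print(f"hyp_len = {sys_len:d}")
# ValueError: Unknown format code 'd' for object of type 'float'
```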
### Expected behavior

Evaluation is expected to run without errors, even under distributed data parallel mode.
### Environment

- fairseq Version (e.g., 1.0 or main): 1.0.0a0+40eb731
- PyTorch Version (e.g., 1.0): 1.10.1+cu113
- OS (e.g., Linux): Ubuntu 20
- How you installed fairseq (pip, source): source
- Build command you used (if compiling from source): pip install -e .
### Additional context
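One workaround that avoids the crash, offered as a sketch rather than the official fix, is to unwrap the all-reduced counters into plain Python ints before they reach sacrebleu, inside the `compute_bleu` closure in fairseq/tasks/translation.py (the frame at line 443 above). The meter keys and the sacrebleu compatibility shim below follow fairseq's current code as far as I can tell; the `int(...)` casts are the actual change:

```python
def compute_bleu(meters):
    import inspect

    try:
        from sacrebleu.metrics import BLEU

        comp_bleu = BLEU.compute_bleu
    except ImportError:
        # compatibility shim for older sacrebleu releases
        import sacrebleu

        comp_bleu = sacrebleu.compute_bleu

    fn_sig = inspect.getfullargspec(comp_bleu)[0]
    if "smooth_method" in fn_sig:
        smooth = {"smooth_method": "exp"}
    else:
        smooth = {"smooth": "exp"}
    bleu = comp_bleu(
        correct=meters["_bleu_counts"].sum,
        total=meters["_bleu_totals"].sum,
        # int() unwraps the 0-dim float tensors left behind by the
        # distributed reduction, so BLEUScore's "{:d}" formatting succeeds
        sys_len=int(meters["_bleu_sys_len"].sum),
        ref_len=int(meters["_bleu_ref_len"].sum),
        **smooth,
    )
    return round(bleu.score, 2)
```

An equivalent upstream fix would be for the logging aggregation to hand scalar meters back as host-side numbers, but the casts above are the smallest local change I could find.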