Hello, when I use DP or DDP distributed mode, I hit this error. Below is the detailed error log.
Traceback (most recent call last):
File "train.py", line 353, in
train(opt)
File "/ssd/exec/xuejt/home/anaconda3/envs/py36/lib/python3.6/site-packages/gin/config.py", line 1069, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/ssd/exec/xuejt/home/anaconda3/envs/py36/lib/python3.6/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/ssd/exec/xuejt/home/anaconda3/envs/py36/lib/python3.6/site-packages/gin/config.py", line 1046, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "train.py", line 227, in train
scaled_loss.backward()
File "/ssd/exec/xuejt/home/anaconda3/envs/py36/lib/python3.6/contextlib.py", line 88, in exit
next(self.gen)
File "/ssd/exec/xuejt/home/anaconda3/envs/py36/lib/python3.6/site-packages/apex/amp/handle.py", line 123, in scale_loss
optimizer._post_amp_backward(loss_scaler)
File "/ssd/exec/xuejt/home/anaconda3/envs/py36/lib/python3.6/site-packages/apex/amp/_process_optimizer.py", line 249, in post_backward_no_master_weights
post_backward_models_are_masters(scaler, params, stashed_grads)
File "/ssd/exec/xuejt/home/anaconda3/envs/py36/lib/python3.6/site-packages/apex/amp/_process_optimizer.py", line 135, in post_backward_models_are_masters
scale_override=(grads_have_scale, stashed_have_scale, out_scale))
File "/ssd/exec/xuejt/home/anaconda3/envs/py36/lib/python3.6/site-packages/apex/amp/scaler.py", line 183, in unscale_with_stashed
out_scale/grads_have_scale,
ZeroDivisionError: float division by zero
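For context, the failing call at train.py line 227 sits inside apex's amp.scale_loss context, and the division that raises is out_scale / grads_have_scale in scaler.py, so grads_have_scale is 0 at that point, i.e. apex found no gradients to unscale for the optimizer's parameters in that backward pass. Below is a minimal sketch of the training-step pattern that exercises this code path; the model, optimizer, criterion, and batch here are placeholders I made up, not the actual objects in train.py.

```python
import torch
from apex import amp

device = torch.device("cuda")
model = torch.nn.Linear(16, 4).to(device)                 # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)  # placeholder optimizer
criterion = torch.nn.CrossEntropyLoss()

# amp.initialize must run before wrapping the model in (D)DP.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
# DDP variant (assumes torch.distributed is already initialized):
# model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

for step in range(10):
    inputs = torch.randn(8, 16, device=device)            # placeholder batch
    targets = torch.randint(0, 4, (8,), device=device)

    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)

    # train.py line 227: the ZeroDivisionError above is raised when this
    # context manager exits and apex unscales the gradients, computing
    # out_scale / grads_have_scale with grads_have_scale == 0.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()

    optimizer.step()
```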