Open HaoRenkk123 opened 1 year ago
When I use BF16, I also get the same error.
same error, too
same error, too
Going further, I found that somehow the tensors which are sent to the next pipeline stage during the forward pass become all zero; then the grads which are sent back through the backward pass become inf or a huge number (like e+22 or e+30). After that, all the tensors (pipeline stage 1 output and grad) become NaN, which results in all_groups_norm = -1. So, does anybody know about this?
I printed the tensors as they were communicated between stages. The logs are below.
!!! rank: 7
tensor_send_prev: None
tensor_send_next: None
!!! rank: 6
tensor_send_prev: None
tensor_send_next: None
!!! rank: 5
tensor_send_prev: None
tensor_send_next: None
!!! rank: 4
tensor_send_prev: None
tensor_send_next: None
!!! rank: 1
tensor_send_prev: None
!!! rank: 2
tensor_send_prev: None
!!! rank: 3
tensor_send_prev: None
!!! rank: 0
tensor_send_prev: None
tensor_send_next: tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],
[[-66.0000, 18.1250, 18.6250, ..., -34.7500, -35.0000, -77.5000]],
[[-67.5000, 18.7500, 18.8750, ..., -35.2500, -36.2500, -79.5000]],
...,
[[-96.0000, 26.8750, 25.3750, ..., -44.7500, -58.2500, -95.0000]],
[[-96.0000, 26.5000, 24.7500, ..., -44.2500, -58.2500, -94.5000]],
[[-95.0000, 26.6250, 24.7500, ..., -44.2500, -58.2500, -95.0000]]],
device='cuda:0', dtype=torch.bfloat16,
grad_fn=<DifferentiableGraphBackward>)
tensor_send_next: tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],
[[-66.0000, 18.1250, 18.6250, ..., -34.7500, -35.0000, -77.5000]],
[[-67.5000, 18.7500, 18.8750, ..., -35.2500, -36.2500, -79.5000]],
...,
[[-96.0000, 26.8750, 25.3750, ..., -44.7500, -58.2500, -95.0000]],
[[-96.0000, 26.5000, 24.7500, ..., -44.2500, -58.2500, -94.5000]],
[[-95.0000, 26.6250, 24.7500, ..., -44.2500, -58.2500, -95.0000]]],
device='cuda:2', dtype=torch.bfloat16,
grad_fn=<DifferentiableGraphBackward>)5<----*1 tensor_send_next: tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],
[[-66.0000, 18.1250, 18.6250, ..., -34.7500, -35.0000, -77.5000]],
[[-67.5000, 18.7500, 18.8750, ..., -35.2500, -36.2500, -79.5000]],
...,
[[-96.0000, 26.8750, 25.3750, ..., -44.7500, -58.2500, -95.0000]],
[[-96.0000, 26.5000, 24.7500, ..., -44.2500, -58.2500, -94.5000]],
[[-95.0000, 26.6250, 24.7500, ..., -44.2500, -58.2500, -95.0000]]],
device='cuda:1', dtype=torch.bfloat16,
grad_fn=<DifferentiableGraphBackward>)
tensor_recv_prev: None
tensor_recv_next: None
tensor_recv_prev: None
tensor_recv_next: None
!!! rank: 1
tensor_send_prev: None
tensor_send_next: None!!! rank: 2
tensor_send_prev: None
tensor_send_next: None
tensor_send_next: tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],
[[-66.0000, 18.1250, 18.6250, ..., -34.7500, -35.0000, -77.5000]],
[[-67.5000, 18.7500, 18.8750, ..., -35.2500, -36.2500, -79.5000]],
...,
[[-96.0000, 26.8750, 25.3750, ..., -44.7500, -58.2500, -95.0000]],
[[-96.0000, 26.5000, 24.7500, ..., -44.2500, -58.2500, -94.5000]],
[[-95.0000, 26.6250, 24.7500, ..., -44.2500, -58.2500, -95.0000]]],
device='cuda:3', dtype=torch.bfloat16,
grad_fn=<DifferentiableGraphBackward>)
tensor_recv_prev: None
tensor_recv_next: None
!!! rank: 0
tensor_send_prev: None
tensor_send_next: None
tensor_recv_prev: None
tensor_recv_next: None
!!! rank: 3
tensor_send_prev: None
tensor_send_next: None
tensor_recv_prev: tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],
[[-66.0000, 18.1250, 18.6250, ..., -34.7500, -35.0000, -77.5000]],
[[-67.5000, 18.7500, 18.8750, ..., -35.2500, -36.2500, -79.5000]],
...,
[[-96.0000, 26.8750, 25.3750, ..., -44.7500, -58.2500, -95.0000]],
[[-96.0000, 26.5000, 24.7500, ..., -44.2500, -58.2500, -94.5000]],
[[-95.0000, 26.6250, 24.7500, ..., -44.2500, -58.2500, -95.0000]]],
device='cuda:5', dtype=torch.bfloat16, requires_grad=True)
tensor_recv_next: None
tensor_recv_prev: tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],
[[-66.0000, 18.1250, 18.6250, ..., -34.7500, -35.0000, -77.5000]],
[[-67.5000, 18.7500, 18.8750, ..., -35.2500, -36.2500, -79.5000]],
...,
[[-96.0000, 26.8750, 25.3750, ..., -44.7500, -58.2500, -95.0000]],
[[-96.0000, 26.5000, 24.7500, ..., -44.2500, -58.2500, -94.5000]],
[[-95.0000, 26.6250, 24.7500, ..., -44.2500, -58.2500, -95.0000]]],
device='cuda:6', dtype=torch.bfloat16, requires_grad=True)
tensor_recv_next: None
tensor_recv_prev: tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],
[[-66.0000, 18.1250, 18.6250, ..., -34.7500, -35.0000, -77.5000]],
[[-67.5000, 18.7500, 18.8750, ..., -35.2500, -36.2500, -79.5000]],
...,
[[-96.0000, 26.8750, 25.3750, ..., -44.7500, -58.2500, -95.0000]],
[[-96.0000, 26.5000, 24.7500, ..., -44.2500, -58.2500, -94.5000]],
[[-95.0000, 26.6250, 24.7500, ..., -44.2500, -58.2500, -95.0000]]],
device='cuda:4', dtype=torch.bfloat16, requires_grad=True)
tensor_recv_next: None
tensor_recv_prev: tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],
[[-66.0000, 18.1250, 18.6250, ..., -34.7500, -35.0000, -77.5000]],
[[-67.5000, 18.7500, 18.8750, ..., -35.2500, -36.2500, -79.5000]],
...,
[[-96.0000, 26.8750, 25.3750, ..., -44.7500, -58.2500, -95.0000]],
[[-96.0000, 26.5000, 24.7500, ..., -44.2500, -58.2500, -94.5000]],
[[-95.0000, 26.6250, 24.7500, ..., -44.2500, -58.2500, -95.0000]]],
device='cuda:7', dtype=torch.bfloat16, requires_grad=True)
*7<----11 tensor_recv_next: None
### loss: 14.612119674682617
### loss: 14.612119674682617
### loss: 14.612119674682617
### loss: 14.612119674682617
### loss_reduced: {'lm loss': tensor(14.6121, device='cuda:7')}
### loss_reduced: {'lm loss': tensor(14.6121, device='cuda:4')}
### loss_reduced: {'lm loss': tensor(14.6121, device='cuda:6')}
### loss_reduced: {'lm loss': tensor(14.6121, device='cuda:5')}
!!! rank: 4
!!! rank: 6
!!! rank: 5
!!! rank: 7
tensor_send_prev: tensor([[[ inf, inf, inf, ..., 4.1954e+36,
-inf, -inf]],
[[ 2.3865e-08, 2.1770e-08, -1.1700e-08, ..., 5.7742e-08,
-2.2235e-08, -3.8883e-08]],
[[ 4.7265e-08, -4.2608e-08, -1.4727e-08, ..., 1.8440e-07,
-5.7276e-08, -1.4342e-07]],
...,
[[-7.3633e-09, -8.0559e-08, -2.3632e-08, ..., -8.8010e-08,
1.9500e-09, 1.8161e-08]],
[[ 3.3528e-08, -6.9384e-08, -4.0047e-08, ..., -6.4028e-09,
4.8429e-08, -1.6880e-08]],
[[-1.6415e-08, -1.4319e-08, -3.0734e-08, ..., 5.8906e-08,
-4.0978e-08, -1.0477e-08]]], device='cuda:6', dtype=torch.bfloat16)
tensor_send_next: None
tensor_send_prev: tensor([[[ inf, inf, inf, ..., 4.1954e+36,
-inf, -inf]],
[[ 2.3865e-08, 2.1770e-08, -1.1700e-08, ..., 5.7742e-08,
-2.2235e-08, -3.8883e-08]],
[[ 4.7265e-08, -4.2608e-08, -1.4727e-08, ..., 1.8440e-07,
-5.7276e-08, -1.4342e-07]],
...,
[[-7.3633e-09, -8.0559e-08, -2.3632e-08, ..., -8.8010e-08,
1.9500e-09, 1.8161e-08]],
[[ 3.3528e-08, -6.9384e-08, -4.0047e-08, ..., -6.4028e-09,
4.8429e-08, -1.6880e-08]],
[[-1.6415e-08, -1.4319e-08, -3.0734e-08, ..., 5.8906e-08,
-4.0978e-08, -1.0477e-08]]], device='cuda:7', dtype=torch.bfloat16)
tensor_send_next: None
tensor_send_prev: tensor([[[ inf, inf, inf, ..., 4.1954e+36,
-inf, -inf]],
[[ 2.3865e-08, 2.1770e-08, -1.1700e-08, ..., 5.7742e-08,
-2.2235e-08, -3.8883e-08]],
[[ 4.7265e-08, -4.2608e-08, -1.4727e-08, ..., 1.8440e-07,
-5.7276e-08, -1.4342e-07]],
...,
[[-7.3633e-09, -8.0559e-08, -2.3632e-08, ..., -8.8010e-08,
1.9500e-09, 1.8161e-08]],
[[ 3.3528e-08, -6.9384e-08, -4.0047e-08, ..., -6.4028e-09,
4.8429e-08, -1.6880e-08]],
[[-1.6415e-08, -1.4319e-08, -3.0734e-08, ..., 5.8906e-08,
-4.0978e-08, -1.0477e-08]]], device='cuda:4', dtype=torch.bfloat16)
tensor_send_next: None
tensor_recv_prev: None
tensor_recv_next: None
tensor_recv_prev: None
tensor_recv_prev: None
tensor_recv_next: None
tensor_recv_prev: None
tensor_send_prev: tensor([[[ inf, inf, inf, ..., 4.1954e+36,
-inf, -inf]],
[[ 2.3865e-08, 2.1770e-08, -1.1700e-08, ..., 5.7742e-08,
-2.2235e-08, -3.8883e-08]],
[[ 4.7265e-08, -4.2608e-08, -1.4727e-08, ..., 1.8440e-07,
-5.7276e-08, -1.4342e-07]],
...,
[[-7.3633e-09, -8.0559e-08, -2.3632e-08, ..., -8.8010e-08,
1.9500e-09, 1.8161e-08]],
[[ 3.3528e-08, -6.9384e-08, -4.0047e-08, ..., -6.4028e-09,
4.8429e-08, -1.6880e-08]],
[[-1.6415e-08, -1.4319e-08, -3.0734e-08, ..., 5.8906e-08,
-4.0978e-08, -1.0477e-08]]], device='cuda:5', dtype=torch.bfloat16)
tensor_send_next: None
tensor_recv_prev: None
tensor_recv_next: None
tensor_recv_next: None
tensor_recv_prev: None
tensor_recv_prev: None
tensor_recv_next: tensor([[[ inf, inf, inf, ..., 4.1954e+36,
-inf, -inf]],
[[ 2.3865e-08, 2.1770e-08, -1.1700e-08, ..., 5.7742e-08,
-2.2235e-08, -3.8883e-08]],
[[ 4.7265e-08, -4.2608e-08, -1.4727e-08, ..., 1.8440e-07,
-5.7276e-08, -1.4342e-07]],
...,
[[-7.3633e-09, -8.0559e-08, -2.3632e-08, ..., -8.8010e-08,
1.9500e-09, 1.8161e-08]],
[[ 3.3528e-08, -6.9384e-08, -4.0047e-08, ..., -6.4028e-09,
4.8429e-08, -1.6880e-08]],
[[-1.6415e-08, -1.4319e-08, -3.0734e-08, ..., 5.8906e-08,
-4.0978e-08, -1.0477e-08]]], device='cuda:2', dtype=torch.bfloat16,
requires_grad=True)
tensor_recv_next: tensor([[[ inf, inf, inf, ..., 4.1954e+36,
-inf, -inf]],
[[ 2.3865e-08, 2.1770e-08, -1.1700e-08, ..., 5.7742e-08,
-2.2235e-08, -3.8883e-08]],
[[ 4.7265e-08, -4.2608e-08, -1.4727e-08, ..., 1.8440e-07,
-5.7276e-08, -1.4342e-07]],
...,
[[-7.3633e-09, -8.0559e-08, -2.3632e-08, ..., -8.8010e-08,
1.9500e-09, 1.8161e-08]],
[[ 3.3528e-08, -6.9384e-08, -4.0047e-08, ..., -6.4028e-09,
4.8429e-08, -1.6880e-08]],
[[-1.6415e-08, -1.4319e-08, -3.0734e-08, ..., 5.8906e-08,
-4.0978e-08, -1.0477e-08]]], device='cuda:3', dtype=torch.bfloat16,
requires_grad=True)
tensor_recv_next: tensor([[[ inf, inf, inf, ..., 4.1954e+36,
-inf, -inf]],
[[ 2.3865e-08, 2.1770e-08, -1.1700e-08, ..., 5.7742e-08,
-2.2235e-08, -3.8883e-08]],
[[ 4.7265e-08, -4.2608e-08, -1.4727e-08, ..., 1.8440e-07,
-5.7276e-08, -1.4342e-07]],
...,
[[-7.3633e-09, -8.0559e-08, -2.3632e-08, ..., -8.8010e-08,
1.9500e-09, 1.8161e-08]],
[[ 3.3528e-08, -6.9384e-08, -4.0047e-08, ..., -6.4028e-09,
4.8429e-08, -1.6880e-08]],
[[-1.6415e-08, -1.4319e-08, -3.0734e-08, ..., 5.8906e-08,
-4.0978e-08, -1.0477e-08]]], device='cuda:1', dtype=torch.bfloat16,
requires_grad=True)
tensor_recv_next: tensor([[[ inf, inf, inf, ..., 4.1954e+36,
-inf, -inf]],
[[ 2.3865e-08, 2.1770e-08, -1.1700e-08, ..., 5.7742e-08,
-2.2235e-08, -3.8883e-08]],
[[ 4.7265e-08, -4.2608e-08, -1.4727e-08, ..., 1.8440e-07,
-5.7276e-08, -1.4342e-07]],
...,
[[-7.3633e-09, -8.0559e-08, -2.3632e-08, ..., -8.8010e-08,
1.9500e-09, 1.8161e-08]],
[[ 3.3528e-08, -6.9384e-08, -4.0047e-08, ..., -6.4028e-09,
4.8429e-08, -1.6880e-08]],
[[-1.6415e-08, -1.4319e-08, -3.0734e-08, ..., 5.8906e-08,
-4.0978e-08, -1.0477e-08]]], device='cuda:0', dtype=torch.bfloat16,
requires_grad=True)
iteration 10/ 100 | consumed samples: 10 | consumed tokens: 40960 | elapsed time per iteration (ms): 2208.9 | learning rate: 3.000E-04 | global batch size: 1 | lm loss: 1.461212E+01 | loss scale: 1.0 | grad norm: nan | actual seqlen: 4096 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 0.453 | TFLOPs: 16.04 |
!!! rank: 7
tensor_send_prev: None
tensor_send_next: None
!!! rank: 5
tensor_send_prev: None
tensor_send_next: None
!!! rank: 4
tensor_send_prev: None
tensor_send_next: None
!!! rank: 6
tensor_send_prev: None
tensor_send_next: None
!!! rank: 1
!!! rank: *1---->-3 tensor_send_prev: None
0
tensor_send_prev: None
!!! rank: !!! rank: 3
tensor_send_prev: None
tensor_send_prev: None
tensor_send_next: tensor([[[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan]],
...,
[[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan]],
[[nan, nan, nan, ..., nan, nan, nan]]], device='cud
Same error. I encounter this error several times within 500 steps when training Llama 7B.
@KenwayZZZ Hi, any solution to this bug?