microsoft / Megatron-DeepSpeed

Ongoing research training transformer language models at scale, including: BERT & GPT-2

When I train Llama-2 13B, AssertionError #203

Open HaoRenkk123 opened 1 year ago

HaoRenkk123 commented 1 year ago

[screenshot of the AssertionError traceback]

liguodongiot commented 1 year ago

When I use BF16, I also get the same error.

KenwayZZZ commented 1 year ago

same error, too


Going further, I found that the tensors sent to the next pipeline stage during the forward pass somehow become all zero. The grads sent back through the backward pass then become inf or huge (e.g. e+22 or e+30). After that, all the tensors (the stage-1 pipeline outputs and the grads) become nan, which results in all_groups_norm = -1. Does anybody know anything about this?
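A pattern like this can be flagged without dumping full tensors. The sketch below is hypothetical debug code, not part of Megatron-DeepSpeed; the function name and the classification strings are made up for illustration:

```python
import torch

def tensor_health(t):
    """Classify a pipeline send/recv buffer as 'none', 'nan', 'inf',
    'all-zero', or 'ok'. Hypothetical debug helper, not Megatron code."""
    if t is None:
        return "none"
    if torch.isnan(t).any():
        return "nan"
    if torch.isinf(t).any():
        return "inf"
    if not t.any():  # every element is exactly zero
        return "all-zero"
    return "ok"
```

Calling something like this on `tensor_send_next` / `tensor_recv_prev` right before the point-to-point ops would surface the all-zero forward activations and the inf gradients as one-word labels instead of full tensor dumps.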

I printed the tensors as they are communicated. The logs are below.

!!! rank:  7
 tensor_send_prev: None
 tensor_send_next: None
!!! rank:  6
 tensor_send_prev: None
 tensor_send_next: None
!!! rank:  5
 tensor_send_prev: None
 tensor_send_next: None
!!! rank:  4
 tensor_send_prev: None
 tensor_send_next: None
!!! rank:  1
 tensor_send_prev: None
!!! rank:  2
 tensor_send_prev: None
!!! rank:  3
 tensor_send_prev: None
!!! rank:  0
 tensor_send_prev: None
 tensor_send_next: tensor([[[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-66.0000,  18.1250,  18.6250,  ..., -34.7500, -35.0000, -77.5000]],

        [[-67.5000,  18.7500,  18.8750,  ..., -35.2500, -36.2500, -79.5000]],

        ...,

        [[-96.0000,  26.8750,  25.3750,  ..., -44.7500, -58.2500, -95.0000]],

        [[-96.0000,  26.5000,  24.7500,  ..., -44.2500, -58.2500, -94.5000]],

        [[-95.0000,  26.6250,  24.7500,  ..., -44.2500, -58.2500, -95.0000]]],
       device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<DifferentiableGraphBackward>)
 tensor_send_next: tensor([[[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-66.0000,  18.1250,  18.6250,  ..., -34.7500, -35.0000, -77.5000]],

        [[-67.5000,  18.7500,  18.8750,  ..., -35.2500, -36.2500, -79.5000]],

        ...,

        [[-96.0000,  26.8750,  25.3750,  ..., -44.7500, -58.2500, -95.0000]],

        [[-96.0000,  26.5000,  24.7500,  ..., -44.2500, -58.2500, -94.5000]],

        [[-95.0000,  26.6250,  24.7500,  ..., -44.2500, -58.2500, -95.0000]]],
       device='cuda:2', dtype=torch.bfloat16,
       grad_fn=<DifferentiableGraphBackward>)
 tensor_send_next: tensor([[[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-66.0000,  18.1250,  18.6250,  ..., -34.7500, -35.0000, -77.5000]],

        [[-67.5000,  18.7500,  18.8750,  ..., -35.2500, -36.2500, -79.5000]],

        ...,

        [[-96.0000,  26.8750,  25.3750,  ..., -44.7500, -58.2500, -95.0000]],

        [[-96.0000,  26.5000,  24.7500,  ..., -44.2500, -58.2500, -94.5000]],

        [[-95.0000,  26.6250,  24.7500,  ..., -44.2500, -58.2500, -95.0000]]],
       device='cuda:1', dtype=torch.bfloat16,
       grad_fn=<DifferentiableGraphBackward>)

 tensor_recv_prev: None
 tensor_recv_next: None
 tensor_recv_prev: None
 tensor_recv_next: None
!!! rank:  1
 tensor_send_prev: None
 tensor_send_next: None
!!! rank:  2
 tensor_send_prev: None
 tensor_send_next: None
 tensor_send_next: tensor([[[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-66.0000,  18.1250,  18.6250,  ..., -34.7500, -35.0000, -77.5000]],

        [[-67.5000,  18.7500,  18.8750,  ..., -35.2500, -36.2500, -79.5000]],

        ...,

        [[-96.0000,  26.8750,  25.3750,  ..., -44.7500, -58.2500, -95.0000]],

        [[-96.0000,  26.5000,  24.7500,  ..., -44.2500, -58.2500, -94.5000]],

        [[-95.0000,  26.6250,  24.7500,  ..., -44.2500, -58.2500, -95.0000]]],
       device='cuda:3', dtype=torch.bfloat16,
       grad_fn=<DifferentiableGraphBackward>)
 tensor_recv_prev: None
 tensor_recv_next: None
!!! rank:  0
 tensor_send_prev: None
 tensor_send_next: None
 tensor_recv_prev: None
 tensor_recv_next: None
!!! rank:  3
 tensor_send_prev: None
 tensor_send_next: None
 tensor_recv_prev: tensor([[[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-66.0000,  18.1250,  18.6250,  ..., -34.7500, -35.0000, -77.5000]],

        [[-67.5000,  18.7500,  18.8750,  ..., -35.2500, -36.2500, -79.5000]],

        ...,

        [[-96.0000,  26.8750,  25.3750,  ..., -44.7500, -58.2500, -95.0000]],

        [[-96.0000,  26.5000,  24.7500,  ..., -44.2500, -58.2500, -94.5000]],

        [[-95.0000,  26.6250,  24.7500,  ..., -44.2500, -58.2500, -95.0000]]],
       device='cuda:5', dtype=torch.bfloat16, requires_grad=True)
 tensor_recv_next: None
 tensor_recv_prev: tensor([[[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-66.0000,  18.1250,  18.6250,  ..., -34.7500, -35.0000, -77.5000]],

        [[-67.5000,  18.7500,  18.8750,  ..., -35.2500, -36.2500, -79.5000]],

        ...,

        [[-96.0000,  26.8750,  25.3750,  ..., -44.7500, -58.2500, -95.0000]],

        [[-96.0000,  26.5000,  24.7500,  ..., -44.2500, -58.2500, -94.5000]],

        [[-95.0000,  26.6250,  24.7500,  ..., -44.2500, -58.2500, -95.0000]]],
       device='cuda:6', dtype=torch.bfloat16, requires_grad=True)
 tensor_recv_next: None
 tensor_recv_prev: tensor([[[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-66.0000,  18.1250,  18.6250,  ..., -34.7500, -35.0000, -77.5000]],

        [[-67.5000,  18.7500,  18.8750,  ..., -35.2500, -36.2500, -79.5000]],

        ...,

        [[-96.0000,  26.8750,  25.3750,  ..., -44.7500, -58.2500, -95.0000]],

        [[-96.0000,  26.5000,  24.7500,  ..., -44.2500, -58.2500, -94.5000]],

        [[-95.0000,  26.6250,  24.7500,  ..., -44.2500, -58.2500, -95.0000]]],
       device='cuda:4', dtype=torch.bfloat16, requires_grad=True)
 tensor_recv_next: None
 tensor_recv_prev: tensor([[[  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]],

        [[-66.0000,  18.1250,  18.6250,  ..., -34.7500, -35.0000, -77.5000]],

        [[-67.5000,  18.7500,  18.8750,  ..., -35.2500, -36.2500, -79.5000]],

        ...,

        [[-96.0000,  26.8750,  25.3750,  ..., -44.7500, -58.2500, -95.0000]],

        [[-96.0000,  26.5000,  24.7500,  ..., -44.2500, -58.2500, -94.5000]],

        [[-95.0000,  26.6250,  24.7500,  ..., -44.2500, -58.2500, -95.0000]]],
       device='cuda:7', dtype=torch.bfloat16, requires_grad=True)
 tensor_recv_next: None
### loss: 14.612119674682617
### loss: 14.612119674682617
### loss: 14.612119674682617
### loss: 14.612119674682617
### loss_reduced: {'lm loss': tensor(14.6121, device='cuda:7')}
### loss_reduced: {'lm loss': tensor(14.6121, device='cuda:4')}
### loss_reduced: {'lm loss': tensor(14.6121, device='cuda:6')}
### loss_reduced: {'lm loss': tensor(14.6121, device='cuda:5')}
!!! rank:  4
!!! rank:  6
!!! rank:  5
!!! rank:  7
 tensor_send_prev: tensor([[[        inf,         inf,         inf,  ...,  4.1954e+36,
                 -inf,        -inf]],

        [[ 2.3865e-08,  2.1770e-08, -1.1700e-08,  ...,  5.7742e-08,
          -2.2235e-08, -3.8883e-08]],

        [[ 4.7265e-08, -4.2608e-08, -1.4727e-08,  ...,  1.8440e-07,
          -5.7276e-08, -1.4342e-07]],

        ...,

        [[-7.3633e-09, -8.0559e-08, -2.3632e-08,  ..., -8.8010e-08,
           1.9500e-09,  1.8161e-08]],

        [[ 3.3528e-08, -6.9384e-08, -4.0047e-08,  ..., -6.4028e-09,
           4.8429e-08, -1.6880e-08]],

        [[-1.6415e-08, -1.4319e-08, -3.0734e-08,  ...,  5.8906e-08,
          -4.0978e-08, -1.0477e-08]]], device='cuda:6', dtype=torch.bfloat16)
 tensor_send_next: None
 tensor_send_prev: tensor([[[        inf,         inf,         inf,  ...,  4.1954e+36,
                 -inf,        -inf]],

        [[ 2.3865e-08,  2.1770e-08, -1.1700e-08,  ...,  5.7742e-08,
          -2.2235e-08, -3.8883e-08]],

        [[ 4.7265e-08, -4.2608e-08, -1.4727e-08,  ...,  1.8440e-07,
          -5.7276e-08, -1.4342e-07]],

        ...,

        [[-7.3633e-09, -8.0559e-08, -2.3632e-08,  ..., -8.8010e-08,
           1.9500e-09,  1.8161e-08]],

        [[ 3.3528e-08, -6.9384e-08, -4.0047e-08,  ..., -6.4028e-09,
           4.8429e-08, -1.6880e-08]],

        [[-1.6415e-08, -1.4319e-08, -3.0734e-08,  ...,  5.8906e-08,
          -4.0978e-08, -1.0477e-08]]], device='cuda:7', dtype=torch.bfloat16)
 tensor_send_next: None
 tensor_send_prev: tensor([[[        inf,         inf,         inf,  ...,  4.1954e+36,
                 -inf,        -inf]],

        [[ 2.3865e-08,  2.1770e-08, -1.1700e-08,  ...,  5.7742e-08,
          -2.2235e-08, -3.8883e-08]],

        [[ 4.7265e-08, -4.2608e-08, -1.4727e-08,  ...,  1.8440e-07,
          -5.7276e-08, -1.4342e-07]],

        ...,

        [[-7.3633e-09, -8.0559e-08, -2.3632e-08,  ..., -8.8010e-08,
           1.9500e-09,  1.8161e-08]],

        [[ 3.3528e-08, -6.9384e-08, -4.0047e-08,  ..., -6.4028e-09,
           4.8429e-08, -1.6880e-08]],

        [[-1.6415e-08, -1.4319e-08, -3.0734e-08,  ...,  5.8906e-08,
          -4.0978e-08, -1.0477e-08]]], device='cuda:4', dtype=torch.bfloat16)
 tensor_send_next: None
 tensor_recv_prev: None
 tensor_recv_next: None
 tensor_recv_prev: None
 tensor_recv_prev: None
 tensor_recv_next: None
 tensor_recv_prev: None
 tensor_send_prev: tensor([[[        inf,         inf,         inf,  ...,  4.1954e+36,
                 -inf,        -inf]],

        [[ 2.3865e-08,  2.1770e-08, -1.1700e-08,  ...,  5.7742e-08,
          -2.2235e-08, -3.8883e-08]],

        [[ 4.7265e-08, -4.2608e-08, -1.4727e-08,  ...,  1.8440e-07,
          -5.7276e-08, -1.4342e-07]],

        ...,

        [[-7.3633e-09, -8.0559e-08, -2.3632e-08,  ..., -8.8010e-08,
           1.9500e-09,  1.8161e-08]],

        [[ 3.3528e-08, -6.9384e-08, -4.0047e-08,  ..., -6.4028e-09,
           4.8429e-08, -1.6880e-08]],

        [[-1.6415e-08, -1.4319e-08, -3.0734e-08,  ...,  5.8906e-08,
          -4.0978e-08, -1.0477e-08]]], device='cuda:5', dtype=torch.bfloat16)
 tensor_send_next: None
 tensor_recv_prev: None
 tensor_recv_next: None
 tensor_recv_next: None
 tensor_recv_prev: None
 tensor_recv_prev: None
 tensor_recv_next: tensor([[[        inf,         inf,         inf,  ...,  4.1954e+36,
                 -inf,        -inf]],

        [[ 2.3865e-08,  2.1770e-08, -1.1700e-08,  ...,  5.7742e-08,
          -2.2235e-08, -3.8883e-08]],

        [[ 4.7265e-08, -4.2608e-08, -1.4727e-08,  ...,  1.8440e-07,
          -5.7276e-08, -1.4342e-07]],

        ...,

        [[-7.3633e-09, -8.0559e-08, -2.3632e-08,  ..., -8.8010e-08,
           1.9500e-09,  1.8161e-08]],

        [[ 3.3528e-08, -6.9384e-08, -4.0047e-08,  ..., -6.4028e-09,
           4.8429e-08, -1.6880e-08]],

        [[-1.6415e-08, -1.4319e-08, -3.0734e-08,  ...,  5.8906e-08,
          -4.0978e-08, -1.0477e-08]]], device='cuda:2', dtype=torch.bfloat16,
       requires_grad=True)
 tensor_recv_next: tensor([[[        inf,         inf,         inf,  ...,  4.1954e+36,
                 -inf,        -inf]],

        [[ 2.3865e-08,  2.1770e-08, -1.1700e-08,  ...,  5.7742e-08,
          -2.2235e-08, -3.8883e-08]],

        [[ 4.7265e-08, -4.2608e-08, -1.4727e-08,  ...,  1.8440e-07,
          -5.7276e-08, -1.4342e-07]],

        ...,

        [[-7.3633e-09, -8.0559e-08, -2.3632e-08,  ..., -8.8010e-08,
           1.9500e-09,  1.8161e-08]],

        [[ 3.3528e-08, -6.9384e-08, -4.0047e-08,  ..., -6.4028e-09,
           4.8429e-08, -1.6880e-08]],

        [[-1.6415e-08, -1.4319e-08, -3.0734e-08,  ...,  5.8906e-08,
          -4.0978e-08, -1.0477e-08]]], device='cuda:3', dtype=torch.bfloat16,
       requires_grad=True)
 tensor_recv_next: tensor([[[        inf,         inf,         inf,  ...,  4.1954e+36,
                 -inf,        -inf]],

        [[ 2.3865e-08,  2.1770e-08, -1.1700e-08,  ...,  5.7742e-08,
          -2.2235e-08, -3.8883e-08]],

        [[ 4.7265e-08, -4.2608e-08, -1.4727e-08,  ...,  1.8440e-07,
          -5.7276e-08, -1.4342e-07]],

        ...,

        [[-7.3633e-09, -8.0559e-08, -2.3632e-08,  ..., -8.8010e-08,
           1.9500e-09,  1.8161e-08]],

        [[ 3.3528e-08, -6.9384e-08, -4.0047e-08,  ..., -6.4028e-09,
           4.8429e-08, -1.6880e-08]],

        [[-1.6415e-08, -1.4319e-08, -3.0734e-08,  ...,  5.8906e-08,
          -4.0978e-08, -1.0477e-08]]], device='cuda:1', dtype=torch.bfloat16,
       requires_grad=True)
 tensor_recv_next: tensor([[[        inf,         inf,         inf,  ...,  4.1954e+36,
                 -inf,        -inf]],

        [[ 2.3865e-08,  2.1770e-08, -1.1700e-08,  ...,  5.7742e-08,
          -2.2235e-08, -3.8883e-08]],

        [[ 4.7265e-08, -4.2608e-08, -1.4727e-08,  ...,  1.8440e-07,
          -5.7276e-08, -1.4342e-07]],

        ...,

        [[-7.3633e-09, -8.0559e-08, -2.3632e-08,  ..., -8.8010e-08,
           1.9500e-09,  1.8161e-08]],

        [[ 3.3528e-08, -6.9384e-08, -4.0047e-08,  ..., -6.4028e-09,
           4.8429e-08, -1.6880e-08]],

        [[-1.6415e-08, -1.4319e-08, -3.0734e-08,  ...,  5.8906e-08,
          -4.0978e-08, -1.0477e-08]]], device='cuda:0', dtype=torch.bfloat16,
       requires_grad=True)
 iteration       10/     100 | consumed samples:           10 | consumed tokens:        40960 | elapsed time per iteration (ms): 2208.9 | learning rate: 3.000E-04 | global batch size:     1 | lm loss: 1.461212E+01 | loss scale: 1.0 | grad norm: nan | actual seqlen:  4096 | number of skipped iterations:   0 | number of nan iterations:   0 | samples per second: 0.453 | TFLOPs: 16.04 |
!!! rank:  7
 tensor_send_prev: None
 tensor_send_next: None
!!! rank:  5
 tensor_send_prev: None
 tensor_send_next: None
!!! rank:  4
 tensor_send_prev: None
 tensor_send_next: None
!!! rank:  6
 tensor_send_prev: None
 tensor_send_next: None
!!! rank:  1
 tensor_send_prev: None
!!! rank:  0
 tensor_send_prev: None
!!! rank:  2
 tensor_send_prev: None
!!! rank:  3
 tensor_send_prev: None
 tensor_send_next: tensor([[[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        ...,

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]],

        [[nan, nan, nan,  ..., nan, nan, nan]]], device='cud
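The end state above (finite loss but `grad norm: nan` in the iteration log) is the kind of condition behind the all_groups_norm = -1 sentinel. A minimal standalone check, assuming nothing about Megatron's actual grad-norm code path:

```python
import torch

def grads_are_finite(model):
    """Return False if any parameter gradient contains inf or nan.
    A sketch of the kind of check behind a grad-norm sentinel;
    the real all_groups_norm logic in Megatron differs."""
    for p in model.parameters():
        if p.grad is not None and not torch.isfinite(p.grad).all():
            return False
    return True
```

Running a check like this per pipeline stage right after the backward pass would localize which stage first produces the inf/nan grads.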
heroes999 commented 1 year ago

Same error. I encounter it several times within 500 steps when training Llama 7B.

kaiwang13 commented 7 months ago

@KenwayZZZ Hi, any solution to this bug?