apache / mxnet

Lightweight, Portable, Flexible Distributed/Mobile Deep Learning with Dynamic, Mutation-aware Dataflow Dep Scheduler; for Python, R, Julia, Scala, Go, Javascript and more
https://mxnet.apache.org
Apache License 2.0

autograd.backward yields different grad between loss and copied function of hybrid_forward in loss #20299

Closed jeou closed 3 years ago

jeou commented 3 years ago

Description

from mxnet import nd, autograd  # imports used below; the project-specific helpers are defined elsewhere

resume = _validate_checkpoint('fcn', 'fcn_resnet50_Cityscapes_11_06_23_42_45_best.params')
net1 = get_model_by_name('fcn', ctx=ctx, model_kwargs=model_kwargs)
net1.load_parameters(resume, ctx=ctx, ignore_extra=True)
net2 = get_model_by_name('fcn', ctx=ctx, model_kwargs=model_kwargs)
net2.load_parameters(resume, ctx=ctx, ignore_extra=True)

train_iter, num_train = _data_iter('Cityscapes', batch_size=1, shuffle=False,
                                   last_batch='keep', root=get_dataset_info('Cityscapes')[0], split='train',
                                   mode='val', base_size=2048, crop_size=224)

from gluoncv.loss import SoftmaxCrossEntropyLoss
loss2 = SoftmaxCrossEntropyLoss()

_ignore_label = -1
_sparse_label = True
_size_average = False
_batch_axis = 0

def hybrid_forward(F, pred, label):
    """Compute loss"""
    softmaxout = F.SoftmaxOutput(
        pred, label.astype(pred.dtype), ignore_label=_ignore_label,
        multi_output=_sparse_label,
        use_ignore=True, normalization='valid' if _size_average else 'null')
    loss = -F.pick(F.log(softmaxout), label, axis=1, keepdims=True)
    loss = F.where(label.expand_dims(axis=1) == _ignore_label,
                   F.zeros_like(loss), loss)
    return F.mean(loss, axis=_batch_axis, exclude=True)

for i, (data, target) in enumerate(train_iter):
    with autograd.record(True):
        # for comparison, remember to set dropout layer to None
        loss_11 = hybrid_forward(nd, *net1(data), target)
        loss_12 = loss2(*net2(data), target)

    autograd.backward([loss_11, loss_12])
    params1 = net1.collect_params()
    params2 = net2.collect_params()
    grad1 = params1['fcnresnet0_fcnhead0_conv0_weight'].grad()
    grad2 = params2['fcnresnet1_fcnhead0_conv0_weight'].grad()
    diff_sum = nd.sum(grad2 - grad1)  # compare the two gradients
    ratio = grad2 / grad1             # element-wise ratio (mostly ~1.09, see below)
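For reference, autograd.backward on a list of heads runs the backward pass for all of them in one call; since net1 and net2 are separate models, each parameter only receives the gradient of its own loss. A minimal sketch of that behaviour with toy arrays (the names here are just for illustration):

from mxnet import nd, autograd

x1 = nd.array([1.0, 2.0, 3.0])
x2 = nd.array([1.0, 2.0, 3.0])
x1.attach_grad()
x2.attach_grad()

with autograd.record():
    y1 = (x1 ** 2).sum()  # depends only on x1
    y2 = (3 * x2).sum()   # depends only on x2

# one call backpropagates both heads; disjoint inputs get independent gradients
autograd.backward([y1, y2])
print(x1.grad)  # [2. 4. 6.]
print(x2.grad)  # [3. 3. 3.]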

Using two identical models, I copied the hybrid_forward function from gluoncv.loss (the SoftmaxCrossEntropyLoss in GluonCV); note that I already removed the dropout layer from the models for the comparison. The copied function yields loss_11, which is equal to loss_12 (the SoftmaxCrossEntropyLoss from GluonCV). But after autograd.backward, grad1 (from the copied function) is not equal to grad2 (from gluoncv.loss.SoftmaxCrossEntropyLoss). Why does the forward pass give the same loss value while the gradients from the GluonCV loss and the copied function differ? A debug screenshot is attached below. The SoftmaxCrossEntropyLoss source from GluonCV is below as well, followed by a simplified, self-contained sketch of the comparison.

class SoftmaxCrossEntropyLoss(Loss):
    r"""SoftmaxCrossEntropyLoss with ignore labels

    Parameters
    ----------
    axis : int, default -1
        The axis to sum over when computing softmax and entropy.
    sparse_label : bool, default True
        Whether label is an integer array instead of probability distribution.
    from_logits : bool, default False
        Whether input is a log probability (usually from log_softmax) instead
        of unnormalized numbers.
    weight : float or None
        Global scalar weight for loss.
    batch_axis : int, default 0
        The axis that represents mini-batch.
    ignore_label : int, default -1
        The label to ignore.
    size_average : bool, default False
        Whether to re-scale loss with regard to ignored labels.
    """
    def __init__(self, sparse_label=True, batch_axis=0, ignore_label=-1,
                 size_average=True, **kwargs):
        super(SoftmaxCrossEntropyLoss, self).__init__(None, batch_axis, **kwargs)
        self._sparse_label = sparse_label
        self._ignore_label = ignore_label
        self._size_average = size_average

    def hybrid_forward(self, F, pred, label):
        """Compute loss"""
        softmaxout = F.SoftmaxOutput(
            pred, label.astype(pred.dtype), ignore_label=self._ignore_label,
            multi_output=self._sparse_label,
            use_ignore=True, normalization='valid' if self._size_average else 'null')
        if self._sparse_label:
            loss = -F.pick(F.log(softmaxout), label, axis=1, keepdims=True)
        else:
            label = _reshape_like(F, label, pred)
            loss = -F.sum(F.log(softmaxout) * label, axis=-1, keepdims=True)
        loss = F.where(label.expand_dims(axis=1) == self._ignore_label,
                       F.zeros_like(loss), loss)
        return F.mean(loss, axis=self._batch_axis, exclude=True)
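To make the comparison easier to try without my FCN models and the Cityscapes data, here is a simplified sketch of the same check on random inputs; the shapes and the 19-class count are placeholders, and the gradient is attached to the prediction tensor directly instead of to network parameters:

from mxnet import nd, autograd
from gluoncv.loss import SoftmaxCrossEntropyLoss

_ignore_label = -1
_sparse_label = True
_size_average = False
_batch_axis = 0

def hybrid_forward(F, pred, label):
    """Same copied loss as in the script above (sparse-label branch only)."""
    softmaxout = F.SoftmaxOutput(
        pred, label.astype(pred.dtype), ignore_label=_ignore_label,
        multi_output=_sparse_label,
        use_ignore=True, normalization='valid' if _size_average else 'null')
    loss = -F.pick(F.log(softmaxout), label, axis=1, keepdims=True)
    loss = F.where(label.expand_dims(axis=1) == _ignore_label,
                   F.zeros_like(loss), loss)
    return F.mean(loss, axis=_batch_axis, exclude=True)

loss2 = SoftmaxCrossEntropyLoss()

# random "segmentation" logits and integer labels: batch=1, 19 classes, 8x8 map
pred = nd.random.normal(shape=(1, 19, 8, 8))
label = nd.random.randint(0, 19, shape=(1, 8, 8)).astype('float32')

# two copies of the same prediction, one per loss, so their gradients can be compared
pred1, pred2 = pred.copy(), pred.copy()
pred1.attach_grad()
pred2.attach_grad()

with autograd.record():
    loss_11 = hybrid_forward(nd, pred1, label)
    loss_12 = loss2(pred2, label)

autograd.backward([loss_11, loss_12])
print(loss_11, loss_12)                  # forward values
print(nd.sum(pred2.grad - pred1.grad))   # same comparison as in the script above
print((pred2.grad / pred1.grad)[0, 0, :3, :3])  # element-wise ratio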


[debug screenshot] The values in grad2/grad1 are usually close to 1.09...
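For reference, a single-number summary of the mismatch could be something like this (max_rel_diff is just a throwaway helper, not part of the script above):

from mxnet import nd

def max_rel_diff(a, b, eps=1e-12):
    """Largest element-wise relative difference |a - b| / |b|."""
    return nd.max(nd.abs(a - b) / (nd.abs(b) + eps)).asscalar()

# if grad2/grad1 is uniformly ~1.09, max_rel_diff(grad2, grad1) is ~0.09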

I'm stuck on this; please help if you know anything that might cause it.

github-actions[bot] commented 3 years ago

Welcome to Apache MXNet (incubating)! We are on a mission to democratize AI, and we are glad that you are contributing to it by opening this issue. Please make sure to include all the relevant context, and one of the @apache/mxnet-committers will be here shortly. If you are interested in contributing to our project, let us know! Also, be sure to check out our guide on contributing to MXNet and our development guides wiki.