For the normal cost function and forward prop:
y_pred = softmax(np.dot(Wya, a1) + by)
J = -np.sum(y * np.log(y_pred))

Backward prop:
The derivative of the J cost function with respect to y_pred is
dy_pred = -y / y_pred
The derivative of y_pred with respect to by (equivalently with respect to s, where s is np.dot(Wya, a1) + by) is the softmax Jacobian:
dy_pred/ds = np.diag(y_pred) - np.outer(y_pred, y_pred)
The derivative of the J cost function with respect to by (chain rule) is then
dby = np.dot(np.diag(y_pred) - np.outer(y_pred, y_pred), -y / y_pred)
simplifying (y is one-hot, so its entries sum to 1),
ending up for dby to be:
dby = y_pred - y
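Writing p for y_pred, the simplification step works out as follows (assuming y is a one-hot label vector, so its entries sum to 1):

$$
\frac{\partial J}{\partial b_y}
= \big(\operatorname{diag}(p) - p\,p^{\top}\big)\left(-\frac{y}{p}\right)
= -y + p \sum_i y_i
= p - y
$$

The diagonal term cancels the division by p and gives -y, while the outer-product term collapses to p times the sum of y, which is 1 for a one-hot label.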
reference: mattpetersen-githu....pdf
I made an example with the following code:
```python
import numpy as np

# single-example check: s here is already the softmax output (probabilities)
# first example:
# s = np.array([0.3, 0.6, 0.1])
# y = np.array([1, 0, 0])
# second example:
s = np.array([0.6, 0.3, 0.1])
y = np.array([0, 0, 1])

# gradient of the cross-entropy loss with respect to the softmax output s
grad_H_s = -y / s

# Jacobian of the softmax function: diag(s) - outer(s, s)
diag_s = np.diag(s)
outer_product = np.outer(s, s)
jacobian_s = diag_s - outer_product

# gradient of the cost with respect to the pre-softmax input
grad_H_x = np.dot(grad_H_s, jacobian_s)
# print(grad_H_x)  # -> [ 0.6,  0.3, -0.9], i.e. s - y
# (the first example would give [-0.7, 0.6, 0.1])

def softmax(z):
    exp_z = np.exp(z - z.max(axis=0, keepdims=True))  # subtract the column max for numerical stability
    return exp_z / exp_z.sum(axis=0)

def softmax_jacobian(z, y):
    # batched version: z has shape (n, m) with one column per example
    Z = z
    n, m = Z.shape
    A = softmax(Z)                                         # (n, m)
    # per-example softmax Jacobians: diag(a) - outer(a, a)
    tensor1 = np.einsum('ij,ik->ijk', A.T, A.T)            # (m, n, n) outer products
    tensor2 = np.einsum('ij,jk->ijk', A.T, np.eye(n, n))   # (m, n, n) diagonals
    dSoftmax = (tensor2 - tensor1).T                       # (n, n, m)
    # gradient of the cross-entropy loss with respect to the softmax output
    grad_H_s = -y / A                                      # (n, m)
    # apply each example's Jacobian to its loss gradient
    dZ = np.einsum('ijk,ki->jk', dSoftmax, grad_H_s.T)     # (n, m)
    return dZ

def softmax_backward(cache, dA):
    """
    Implement the backward propagation for a single softmax unit.
    Arguments:
    cache -- 'Z', stored for computing the backward propagation efficiently, shape (n, m)
    dA -- post-activation gradient, shape (n, m)
    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    Z = cache
    n, m = Z.shape
    A = softmax(Z).T                                       # (m, n)
    # per-example softmax Jacobians: diag(a) - outer(a, a)
    tensor1 = np.einsum('ij,ik->ijk', A, A)                # (m, n, n)
    tensor2 = np.einsum('ij,jk->ijk', A, np.eye(n, n))     # (m, n, n)
    dSoftmax = tensor2 - tensor1
    # apply each example's Jacobian to its post-activation gradient
    dZ = np.einsum('ijk,ik->ij', dSoftmax, dA.T).T         # (n, m)
    assert (dZ.shape == Z.shape)
    return dZ

# four one-hot labels, one per column: shape (n, m) = (3, 4)
y = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]]).T

# matching logits, one column per example: these are log([0.3, 0.6, 0.1]) etc.,
# so softmax(z) recovers those probabilities
z = np.array([[-1.203972804, -0.510825624, -2.302585093],
              [-0.510825624, -1.203972804, -2.302585093],
              [-0.510825624, -1.203972804, -2.302585093],
              [-0.510825624, -1.203972804, -2.302585093]]).T

# both implementations should print softmax(z) - y (up to floating point)
print(softmax_jacobian(z, y))
print(softmax_backward(z, -y / softmax(z)))
```
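As a quick sanity check, a finite-difference estimate of dJ/dz can be compared against softmax(z) - y. This is only a sketch: `numerical_grad` is an illustrative helper of my own, reusing `z`, `y`, and `softmax` from the code above.

```python
def numerical_grad(z, y, eps=1e-6):
    """Finite-difference estimate of dJ/dz for J = -sum(y * log(softmax(z)))."""
    grad = np.zeros_like(z, dtype=float)
    for idx in np.ndindex(z.shape):
        z_plus, z_minus = z.copy(), z.copy()
        z_plus[idx] += eps
        z_minus[idx] -= eps
        J_plus = -np.sum(y * np.log(softmax(z_plus)))
        J_minus = -np.sum(y * np.log(softmax(z_minus)))
        grad[idx] = (J_plus - J_minus) / (2 * eps)  # central difference
    return grad

# the analytic gradient softmax(z) - y should match the numerical one
print(np.allclose(numerical_grad(z, y), softmax(z) - y, atol=1e-5))  # expect True
```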
Of course, with
y_pred = softmax(np.dot(Wya, a1) + by)
dWya and da1 follow from the chain rule, applied to the pre-activation s = np.dot(Wya, a1) + by:
dWya = np.dot(softmax_backward(s, -y / y_pred), a1.T)
da1 = np.dot(Wya.T, softmax_backward(s, -y / y_pred))
and since softmax_backward(s, -y / y_pred) simplifies to y_pred - y, the usual backward implementation for softmax with cross-entropy is just y_pred - y.
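Putting the simplified result together, here is a minimal sketch of the full backward step. The shapes Wya (n_y, n_a), a1 (n_a, m), by (n_y, 1) and the random data are my assumptions for illustration, following the column-per-example convention used above.

```python
rng = np.random.default_rng(0)
n_y, n_a, m = 3, 5, 4
Wya = rng.normal(size=(n_y, n_a))
a1 = rng.normal(size=(n_a, m))
by = rng.normal(size=(n_y, 1))
y = np.eye(n_y)[:, rng.integers(0, n_y, size=m)]  # one-hot labels, shape (n_y, m)

# forward
s = np.dot(Wya, a1) + by
y_pred = softmax(s)

# backward, using the simplification dZ = y_pred - y
dZ = y_pred - y
dWya = np.dot(dZ, a1.T)
da1 = np.dot(Wya.T, dZ)
dby = np.sum(dZ, axis=1, keepdims=True)  # summed over the batch; for a single example this is just y_pred - y
```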