For the normal cost function and forward prop:
y_pred = softmax(np.dot(Wya, a1) + by)
J = -np.sum(y * np.log(y_pred))

Backward prop:
The derivative of the J cost function with respect to y_pred is
dy_pred = -y / y_pred
The derivative of y_pred with respect to by (equivalently with respect to s, where s is np.dot(Wya, a1) + by) is the softmax Jacobian:
dy_pred/ds = np.diag(y_pred) - np.outer(y_pred, y_pred)
The derivative of the J cost function with respect to by (chain rule) is then
dby = np.dot(np.diag(y_pred) - np.outer(y_pred, y_pred), -y / y_pred)
simplifying (y is one-hot, so its entries sum to 1),
ending up for dby to be:
dby = y_pred - y
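Writing p for y_pred, the simplification step works out as follows (assuming y is a one-hot label vector, so its entries sum to 1):

$$
\frac{\partial J}{\partial b_y}
= \big(\operatorname{diag}(p) - p\,p^{\top}\big)\left(-\frac{y}{p}\right)
= -y + p \sum_i y_i
= p - y
$$

The diagonal term cancels the division by p and gives -y, while the outer-product term collapses to p times the sum of y, which is 1 for a one-hot label.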
reference: mattpetersen-githu....pdf
I made an example with the following code:
```python
import numpy as np

# single-example check: s here is already the softmax output (probabilities)
# first example:
# s = np.array([0.3, 0.6, 0.1])
# y = np.array([1, 0, 0])
# second example:
s = np.array([0.6, 0.3, 0.1])
y = np.array([0, 0, 1])

# gradient of the cross-entropy loss with respect to the softmax output s
grad_H_s = -y / s

# Jacobian of the softmax function: diag(s) - outer(s, s)
diag_s = np.diag(s)
outer_product = np.outer(s, s)
jacobian_s = diag_s - outer_product

# gradient of the cost with respect to the pre-softmax input
grad_H_x = np.dot(grad_H_s, jacobian_s)
# print(grad_H_x)  # -> [ 0.6,  0.3, -0.9], i.e. s - y
# (the first example would give [-0.7, 0.6, 0.1])

def softmax(z):
    exp_z = np.exp(z - z.max(axis=0, keepdims=True))  # subtract the column max for numerical stability
    return exp_z / exp_z.sum(axis=0)

def softmax_jacobian(z, y):
    # batched version: z has shape (n, m) with one column per example
    Z = z
    n, m = Z.shape
    A = softmax(Z)                                         # (n, m)
    # per-example softmax Jacobians: diag(a) - outer(a, a)
    tensor1 = np.einsum('ij,ik->ijk', A.T, A.T)            # (m, n, n) outer products
    tensor2 = np.einsum('ij,jk->ijk', A.T, np.eye(n, n))   # (m, n, n) diagonals
    dSoftmax = (tensor2 - tensor1).T                       # (n, n, m)
    # gradient of the cross-entropy loss with respect to the softmax output
    grad_H_s = -y / A                                      # (n, m)
    # apply each example's Jacobian to its loss gradient
    dZ = np.einsum('ijk,ki->jk', dSoftmax, grad_H_s.T)     # (n, m)
    return dZ

def softmax_backward(cache, dA):
    """
    Implement the backward propagation for a single softmax unit.
    Arguments:
    cache -- 'Z', stored for computing the backward propagation efficiently, shape (n, m)
    dA -- post-activation gradient, shape (n, m)
    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    Z = cache
    n, m = Z.shape
    A = softmax(Z).T                                       # (m, n)
    # per-example softmax Jacobians: diag(a) - outer(a, a)
    tensor1 = np.einsum('ij,ik->ijk', A, A)                # (m, n, n)
    tensor2 = np.einsum('ij,jk->ijk', A, np.eye(n, n))     # (m, n, n)
    dSoftmax = tensor2 - tensor1
    # apply each example's Jacobian to its post-activation gradient
    dZ = np.einsum('ijk,ik->ij', dSoftmax, dA.T).T         # (n, m)
    assert (dZ.shape == Z.shape)
    return dZ

# four one-hot labels, one per column: shape (n, m) = (3, 4)
y = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 0]]).T

# matching logits, one column per example: these are log([0.3, 0.6, 0.1]) etc.,
# so softmax(z) recovers those probabilities
z = np.array([[-1.203972804, -0.510825624, -2.302585093],
              [-0.510825624, -1.203972804, -2.302585093],
              [-0.510825624, -1.203972804, -2.302585093],
              [-0.510825624, -1.203972804, -2.302585093]]).T

# both implementations should print softmax(z) - y (up to floating point)
print(softmax_jacobian(z, y))
print(softmax_backward(z, -y / softmax(z)))
```
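As a quick sanity check, a finite-difference estimate of dJ/dz can be compared against softmax(z) - y. This is only a sketch: `numerical_grad` is an illustrative helper of my own, reusing `z`, `y`, and `softmax` from the code above.

```python
def numerical_grad(z, y, eps=1e-6):
    """Finite-difference estimate of dJ/dz for J = -sum(y * log(softmax(z)))."""
    grad = np.zeros_like(z, dtype=float)
    for idx in np.ndindex(z.shape):
        z_plus, z_minus = z.copy(), z.copy()
        z_plus[idx] += eps
        z_minus[idx] -= eps
        J_plus = -np.sum(y * np.log(softmax(z_plus)))
        J_minus = -np.sum(y * np.log(softmax(z_minus)))
        grad[idx] = (J_plus - J_minus) / (2 * eps)  # central difference
    return grad

# the analytic gradient softmax(z) - y should match the numerical one
print(np.allclose(numerical_grad(z, y), softmax(z) - y, atol=1e-5))  # expect True
```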
Of course, with
y_pred = softmax(np.dot(Wya, a1) + by)
dWya and da1 follow from the chain rule, applied to the pre-activation s = np.dot(Wya, a1) + by:
dWya = np.dot(softmax_backward(s, -y / y_pred), a1.T)
da1 = np.dot(Wya.T, softmax_backward(s, -y / y_pred))
and since softmax_backward(s, -y / y_pred) simplifies to y_pred - y, the usual backward implementation for softmax with cross-entropy is just y_pred - y.
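Putting the simplified result together, here is a minimal sketch of the full backward step. The shapes Wya (n_y, n_a), a1 (n_a, m), by (n_y, 1) and the random data are my assumptions for illustration, following the column-per-example convention used above.

```python
rng = np.random.default_rng(0)
n_y, n_a, m = 3, 5, 4
Wya = rng.normal(size=(n_y, n_a))
a1 = rng.normal(size=(n_a, m))
by = rng.normal(size=(n_y, 1))
y = np.eye(n_y)[:, rng.integers(0, n_y, size=m)]  # one-hot labels, shape (n_y, m)

# forward
s = np.dot(Wya, a1) + by
y_pred = softmax(s)

# backward, using the simplification dZ = y_pred - y
dZ = y_pred - y
dWya = np.dot(dZ, a1.T)
da1 = np.dot(Wya.T, dZ)
dby = np.sum(dZ, axis=1, keepdims=True)  # summed over the batch; for a single example this is just y_pred - y
```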