Hello, I wrote a small test where I calculate the Sinkhorn distance between a target distribution (univariate normal with mean 0 and variance 1) and a bunch of sample distributions (univariate normals with different means and variances), and the results I get are quite surprising.
If my sampled distribution is a normal with mean 0 and variance < 1, the Sinkhorn distance to the target distribution is lower than when the sampled distribution has variance 1. This seems to indicate that the loss is biased, as I would expect Sinkhorn to reach its minimum when the two distributions have the same mean and variance. I played a bit with the parameters of SamplesLoss, but the result didn't change significantly.
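For concreteness, the variations I tried were along these lines (the exact values here are illustrative, not an exhaustive record of what I ran):

import torch
from geomloss import SamplesLoss

target = torch.randn(1, 10000)        # same (1, N) shape as in the script below
sample = torch.randn(1, 10000) * 0.5  # e.g. sigma = 0.5

# Illustrative parameter variations of SamplesLoss:
variants = {
    "baseline p=1, blur=0.05": SamplesLoss("sinkhorn", p=1, blur=0.05),
    "smaller blur":            SamplesLoss("sinkhorn", p=1, blur=0.01),
    "p=2 cost":                SamplesLoss("sinkhorn", p=2, blur=0.05),
    "finer scaling":           SamplesLoss("sinkhorn", p=1, blur=0.05, scaling=0.9),
    "no debiasing":            SamplesLoss("sinkhorn", p=1, blur=0.05, debias=False),
}
for name, loss in variants.items():
    print(name, float(loss(sample, target)))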
I attached a script below that does a mean and variance sweep and saves a bunch of plots with the corresponding Sinkhorn distance values. I'd be grateful if you could shed some light on this.
import matplotlib.pyplot as plt
import numpy as np
import torch
from geomloss import SamplesLoss
np.random.seed(31)
torch.manual_seed(31)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
def normal(imgshape, mu, sigma):
    """
    Sample a random tensor of shape `imgshape` from a normal with mean `mu` and std `sigma`.
    """
    return torch.randn(imgshape) * sigma + mu
def main():
    N = 10000
    mus = np.arange(-5, 5, 0.5)
    sigmas = np.arange(0.1, 2, 0.1)
    target = normal((1, N), mu=0, sigma=1)
    musweep = np.zeros(len(mus))
    sigmasweep = np.zeros(len(sigmas))
    sinkhorn_loss = SamplesLoss(loss="sinkhorn", p=1, blur=.05)

    # Cycle through all mus (sigma fixed at 1)
    for i, mu in enumerate(mus):
        sample = normal((1, N), mu=mu, sigma=1)
        sinkhorn_D = sinkhorn_loss(sample, target)
        musweep[i] = sinkhorn_D
        print("mu:", mu, "sinkhorn:", sinkhorn_D)

        # Log histograms of sample vs target
        fig = plt.figure(figsize=(5, 4), dpi=500)
        fig.tight_layout(pad=0)
        ax = fig.add_subplot(111)
        plt.subplots_adjust(wspace=0, hspace=0)
        ax.margins(0)
        plt.hist(sample.flatten().cpu().numpy(), bins=np.arange(-4, 4 + 0.05, 0.05), alpha=0.5)
        plt.hist(target.flatten().cpu().numpy(), bins=np.arange(-4, 4 + 0.05, 0.05), alpha=0.5)
        plt.title('sample vs target')
        plt.legend(["sample", "target"])
        plt.xlim(-4, 4)
        fig.text(.5, 0, f"sinkhorn={float(sinkhorn_D):.4}", ha="center")
        plt.savefig(f"mu_{mu}.png")
        plt.close()
    # Cycle through all sigmas (mu fixed at 0)
    for i, sigma in enumerate(sigmas):
        sample = normal((1, N), mu=0, sigma=sigma)
        sinkhorn_D = sinkhorn_loss(sample, target)
        sigmasweep[i] = sinkhorn_D
        print("sigma:", sigma, "sinkhorn:", sinkhorn_D)

        # Log histograms of sample vs target
        fig = plt.figure(figsize=(5, 4), dpi=500)
        fig.tight_layout(pad=0)
        ax = fig.add_subplot(111)
        plt.subplots_adjust(wspace=0, hspace=0)
        ax.margins(0)
        plt.hist(sample.flatten().cpu().numpy(), bins=np.arange(-4, 4 + 0.05, 0.05), alpha=0.5)
        plt.hist(target.flatten().cpu().numpy(), bins=np.arange(-4, 4 + 0.05, 0.05), alpha=0.5)
        plt.title('sample vs target')
        plt.legend(["sample", "target"])
        plt.xlim(-4, 4)
        fig.text(.5, 0, f"sinkhorn={float(sinkhorn_D):.4}", ha="center")
        plt.savefig(f"sigma_{sigma:.2}.png")
        plt.close()
        im = plt.imread(f"sigma_{sigma:.2}.png")
    # Plot sinkhorn vs mu
    fig = plt.figure(figsize=(5, 4), dpi=500)
    fig.tight_layout(pad=0)
    ax = fig.add_subplot(111)
    plt.subplots_adjust(wspace=0, hspace=0)
    ax.margins(0)
    plt.plot(mus, musweep)
    plt.title('sinkhorn vs mu')
    plt.xlim(mus.min(), mus.max())
    plt.savefig("sinkhorn_vs_mu.png")
    plt.close()
    # Plot sinkhorn vs sigma
    fig = plt.figure(figsize=(5, 4), dpi=500)
    fig.tight_layout(pad=0)
    ax = fig.add_subplot(111)
    plt.subplots_adjust(wspace=0, hspace=0)
    ax.margins(0)
    plt.plot(sigmas, sigmasweep)
    plt.title('sinkhorn vs sigma')
    plt.xlim(sigmas.min(), sigmas.max())
    plt.savefig("sinkhorn_vs_sigma.png")
    plt.close()
if __name__ == "__main__":
    main()
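As a sanity check on what I expected, here is a minimal, self-contained sketch of the kind of baseline I have in mind (not part of the script above): for equally sized, equally weighted 1D samples, the exact Wasserstein-1 distance is just the mean absolute difference of the sorted samples, so it should be minimized, up to sampling noise, when the sample matches the target's mean and variance.

import torch

def exact_w1_1d(x, y):
    # In 1D, the optimal coupling between two equally sized empirical samples is the
    # monotone rearrangement, so W1 is the mean absolute difference of the sorted values.
    return (torch.sort(x.flatten()).values - torch.sort(y.flatten()).values).abs().mean()

N = 10000
target = torch.randn(N)
for sigma in [0.5, 0.75, 1.0, 1.25, 1.5]:
    sample = torch.randn(N) * sigma
    print(f"sigma={sigma:.2f}  exact W1={float(exact_w1_1d(sample, target)):.4f}")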