geri-brs / machine-learning

My code and tutorials
1 stars 0 forks source link

Plot the centroid, data samples per cluster label (different color) #2

Open mithi opened 5 years ago

mithi commented 5 years ago

Use the following to generate the point set

import numpy as np
from numpy.random import multivariate_normal, shuffle

def generate_synthetic_2dpoints(mean, cov, npoints):
    """Sample points from a multivariate normal distribution.

    Args:
        mean: Mean vector of the distribution.
        cov: Covariance matrix of the distribution.
        npoints: Number of samples to draw (passed as numpy's ``size``).

    Returns:
        The drawn samples, transposed so that each column is one sample
        and each row is one coordinate.
    """
    # numpy returns one sample per row; flip so coordinates run along rows.
    samples = multivariate_normal(mean, cov, size=npoints)
    return samples.transpose()

# generate random covariance
def rcov(r=9):
    """Return a 2x2 covariance matrix derived from the seed ``r``.

    A 2x2 matrix with entries drawn uniformly from [-0.35, 0.35) is
    multiplied by its own transpose, which makes the result symmetric.

    Note:
        This reseeds NumPy's *global* random state with ``r`` on every
        call, so the same ``r`` always yields the same matrix, and any
        later draw from ``np.random`` continues from that reseeded state.

    Args:
        r: Seed passed to ``np.random.seed``.

    Returns:
        A (2, 2) numpy array.
    """
    np.random.seed(r)
    base = np.random.uniform(low=-0.35, high=0.35, size=(2, 2))
    return np.dot(base, base.T)

# generate synthetic data 
def gen_set1():
    """Build four synthetic 2-D point sets with 2, 3, 4 and 5 clusters.

    Each cluster is drawn with ``generate_synthetic_2dpoints`` from a normal
    distribution with a fixed mean and either a hand-written diagonal
    covariance or the matrix returned by ``rcov()``.

    Every ``rcov()`` call here uses its default seed, so all of those
    clusters share the same covariance matrix. Because ``rcov`` reseeds
    NumPy's global random state, sets that call it draw their points from a
    freshly seeded generator.

    Returns:
        A list ``[pa, pb, pc, pd]`` of four arrays. Each array has one row
        per point and two columns, with the points of the first cluster
        first, then the second cluster, and so on:

        * ``pa``: 2 clusters, 75 + 90 = 165 points
        * ``pb``: 3 clusters, 40 + 60 + 70 = 170 points
        * ``pc``: 4 clusters, 50 + 75 + 60 + 95 = 280 points
        * ``pd``: 5 clusters, 90 + 55 + 70 + 40 + 60 = 315 points
    """
    cov1 = [[0.02, 0], [0, 0.15]]
    cov2 = [[0.02, 0], [0, 0.25]]

    def build(means, covs, npoints):
        # Draw the clusters in order (the draw order determines the data),
        # then stack them into a single (total_points, 2) array.
        clusters = [
            generate_synthetic_2dpoints(mean, cov, n)
            for mean, cov, n in zip(means, covs, npoints)
        ]
        return np.hstack(clusters).T

    # 2 clusters
    pa = build(
        means=[[-1.00, +1.10],
               [+0.25, -0.75]],
        covs=[cov1, rcov()],
        npoints=[75, 90],
    )

    # 3 clusters
    pb = build(
        means=[[-0.50, -0.00],
               [+1.65, -1.50],
               [+1.50, +0.50]],
        covs=[cov1, rcov(), rcov()],
        npoints=[40, 60, 70],
    )

    # 4 clusters
    pc = build(
        means=[[+1.70, -1.15],
               [+2.15, +1.10],
               [-1.20, +1.50],
               [-1.20, -0.2]],
        covs=[rcov() for _ in range(4)],
        npoints=[50, 75, 60, 95],
    )

    # 5 clusters
    pd = build(
        means=[[-2.1, -1.5],
               [-2.5, +1.5],
               [+1.8, -1.6],
               [+2.3, +1.7],
               [+0.1, +0.1]],
        covs=[cov1, rcov(), rcov(), cov2, rcov()],
        npoints=[90, 55, 70, 40, 60],
    )

    # The return must live inside the function body; at column 0 it is a
    # SyntaxError ("'return' outside function").
    return [pa, pb, pc, pd]

Call it like this

# Build the four 2-D point sets (2, 3, 4 and 5 clusters, in that order).
pointset = gen_set1()

# One variable per data set, unpacked in the order gen_set1 returns them.
samples1, samples2, samples3, sample4 = pointset

Plot the centroid, data samples per cluster label (different color)

Screen Shot 2019-05-10 at 3 47 21 PM
geri-brs commented 5 years ago

I think I succeeded in plotting the clusters in different colors. I uploaded the code here, named "plot_diff_color.ipynb". Tomorrow I will continue with determining the best number of clusters.