Closed ghost closed 5 years ago
Hi - I think that it should be possible, as long as the data is appropriately shaped. Would you be able to provide a toy example?
Hi @idroz , thank you for your prompt response! I have uploaded the file to https://drive.google.com/open?id=1qo2R8Rz4dWvX9nMLQc50AVc1tb-PCZPg
It is a pickle file, with a dictionary inside {'X':[...], 'Y':[...]}
X has dimensions (N, T, f) with N as the number of samples, T: number of time steps, f: as the feature dimension at a time stamp.
Here is the file I am using right now to read the data and to apply ivis on a concatenated version.
"""
experiments with ivis
"""
import sys
import numpy as np
import matplotlib
##set headless mode
#matplotlib.use('Agg')
from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.manifold import SpectralEmbedding
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
import pickle
import matplotlib.pyplot as plt
import time
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib
import logging
import argparse
from sklearn.preprocessing import MinMaxScaler
from collections.abc import Iterable
import os

# Log record layout: thread name, timestamp with milliseconds, level, source location, message.
recfmt = '(%(threadName)s) %(asctime)s.%(msecs)03d %(levelname)s %(filename)s:%(lineno)d %(message)s'
timefmt = '%y%m%d_%H:%M:%S'
# basicConfig opens the log file immediately, so the target directory has to exist first.
# The plotting helpers in this script also save their figures into ./log.
os.makedirs("log", exist_ok=True)
# The file name is fixed (no timestamp), and filemode="w" overwrites it on every run.
logging.basicConfig(filename="log/visualize_embedding.log",
                    filemode="w",
                    level=logging.INFO,
                    format=recfmt, datefmt=timefmt)
# Command line: a single positional argument naming the pickled X/Y file.
parser = argparse.ArgumentParser()
parser.add_argument("xy_file", help="The X and Y data to be used. X:[[...], [...], ...], Y:[...]")
args = parser.parse_args()
def binarizeY(y):
    """
    Threshold continuous target values into boolean class labels.

    :param y: sequence (list or numpy array) of numeric target values
    :return: boolean numpy array, False where the value is below 0.1 and True otherwise
    """
    values = np.asarray(y)
    # Negating the "< 0.1" mask (instead of testing ">= 0.1") keeps NaN mapped to True,
    # exactly like an element-wise "if yi<0.1 ... else True" would.
    return ~(values < 0.1)
def embedClassical(x,ygt, do_scaling=True, ndim=2):
    """
    Embed the samples with PCA, optionally rescaling the features first.

    :param x: 2d array of samples (rows) by features (columns)
    :param ygt: target values; converted to a numpy array and handed to fit_transform
    :param do_scaling: when True the features are passed through MinMaxScaler first
    :param ndim: number of PCA components to keep
    :return: the PCA-transformed samples
    """
    features = MinMaxScaler().fit_transform(x) if do_scaling else x
    targets = np.array(ygt)
    print("x.shape: " + str(features.shape))
    print("y.shape: " + str(targets.shape))
    print("starting embedding process")
    # TSNE, LocallyLinearEmbedding and SpectralEmbedding were tried here as alternatives.
    reducer = PCA(n_components=ndim)
    return reducer.fit_transform(features, targets)
def embedDeep(x,ygt, ndim=2, do_scaling=False, epochs=400):
    """
    Embed the samples with a supervised ivis model.

    :param x: 2d array of samples (rows) by features (columns)
    :param ygt: supervision targets, passed to Ivis.fit together with the features
    :param ndim: number of embedding dimensions
    :param do_scaling: when True the features are passed through MinMaxScaler first
    :param epochs: value handed to the Ivis ``epochs`` argument
    :return: the embeddings produced by the fitted model for the (optionally scaled) input
    """
    # Only Ivis is needed; the former import of tensorflow.python.keras.utils was unused
    # and tied the function to a private TensorFlow module.
    from ivis import Ivis

    xi = MinMaxScaler().fit_transform(x) if do_scaling else x
    # see https://keras.io/metrics/ for more supervision metrics
    # (supervision_weight / supervision_metric can be passed here as well)
    model = Ivis(embedding_dims=ndim, k=5, epochs=epochs)
    model.fit(xi, ygt)
    embeddings = model.transform(xi)
    # NOTE(review): the output of score_samples is reported as an "error" below -
    # confirm against the ivis documentation that this is what it measures.
    y_pred = model.score_samples(xi)
    print("pred err mean: "+str(np.mean(y_pred)))
    return embeddings
def plotEmbedding(X_embedded, ygt, title="", bin_colors=True, colormap_name="jet"):
    """
    Scatter-plot the first two embedding dimensions, coloured by the targets.

    The figure is saved to ./log/vis_embedded_<title>.png and then shown.

    :param X_embedded: 2d array; column 0 is drawn on the x axis, column 1 on the y axis
    :param ygt: target values used for colouring
    :param title: suffix inserted into the output file name
    :param bin_colors: when True the targets are thresholded with binarizeY before colouring
    :param colormap_name: name of the matplotlib colormap to use
    """
    y_colors = binarizeY(ygt) if bin_colors else ygt
    print("x_e.shape:"+str(X_embedded.shape))
    print("y_c.shape: "+str(y_colors.shape))
    first_dim = X_embedded[:,0]
    second_dim = X_embedded[:,1]
    palette = plt.get_cmap(colormap_name)
    points = plt.scatter(first_dim, second_dim, c=y_colors, cmap=palette)
    plt.grid()
    plt.colorbar(points)
    out_path = "./log/vis_embedded_"+title+".png"
    plt.savefig(out_path, bbox_inches='tight')
    plt.show()
def classificationTest(x,y, do_scaling=False):
    """
    Embed the data with supervised ivis, fit an SVM on the embedding and report metrics.

    All metrics are computed on the same samples the classifier was fitted on.

    :param x: 2d array of samples (rows) by features (columns)
    :param y: continuous target values; thresholded into two classes with binarizeY
    :param do_scaling: forwarded to embedDeep
    """
    from sklearn import svm
    from sklearn.metrics import confusion_matrix, average_precision_score, roc_auc_score, classification_report

    # Binarise BEFORE fitting ivis so that the supervised embedding is trained on the same
    # class labels the classifier is evaluated on. Integer 0/1 labels are used for the fit.
    y_bin = binarizeY(y).astype(int)
    xe = embedDeep(x, y_bin, do_scaling=do_scaling, epochs=500)

    # LogisticRegression(solver="lbfgs") or tree.DecisionTreeClassifier(min_samples_split=10)
    # can be swapped in here as alternative classifiers.
    clf = svm.SVC(probability=True).fit(xe, y_bin)
    pred_labels = clf.predict(xe)
    proba = clf.predict_proba(xe)
    positive_scores = proba[:, 1]

    print(classification_report(y_bin, pred_labels))
    print('Confusion Matrix')
    print(confusion_matrix(y_bin, pred_labels))
    print('Average Precision: '+str(average_precision_score(y_bin, positive_scores)))
    # ROC AUC is a ranking metric: feed it the positive-class scores, not the hard labels.
    print('ROC AUC: '+str(roc_auc_score(y_bin, positive_scores)))

    # The plots keep the original continuous targets for colouring.
    plotSamples(xe, y, title="classification")
    generateRegressionField(xe, y, clf, n=350, title="classification")
def flatten1(lst):
    """
    Flatten a list by exactly one nesting level.

    Iterable elements are unpacked into the result; everything else is appended as is.
    Deeper nesting is left untouched.

    :param lst: iterable whose elements are scalars and/or iterables
    :return: a new flat list
    """
    out = []
    for item in lst:
        # str/bytes are iterable too, but must stay whole rather than be split into characters.
        if isinstance(item, Iterable) and not isinstance(item, (str, bytes)):
            out.extend(item)
        else:
            out.append(item)
    return out
def treeRegressionTest(x,y, do_scaling=False):
    """
    Embed the data with ivis and fit a decision-tree regressor on the 2d embedding.

    Prints three scores: fitted on the first half and scored on the second half, fitted and
    scored on all samples, and scored on a randomly perturbed copy of the embedding.
    Finally the samples and the regression field are plotted.

    :param x: 2d array of samples (rows) by features (columns)
    :param y: target values, indexable like the embedding rows
    :param do_scaling: forwarded to embedDeep
    """
    from sklearn import tree
    from sklearn import linear_model
    from sklearn import svm

    xe = embedDeep(x, y, ndim=2, do_scaling=do_scaling, epochs=305)
    # embedClassical(x, y, ndim=2, ...) is the non-deep alternative for the embedding;
    # linear_model.Ridge(alpha=.5) and svm.SVR() are alternative regressors.
    clf = tree.DecisionTreeRegressor(min_samples_split=10)

    # Hold-out check: first half for fitting, second half for scoring.
    half = len(xe) // 2
    xe_fit, y_fit = xe[:half], y[:half]
    xe_held, y_held = xe[half:], y[half:]
    clf = clf.fit(xe_fit, y_fit)
    print("Score with split: "+str(clf.score(xe_held, y_held)))

    # Refit on everything and score on the same data.
    clf = clf.fit(xe, y)
    print("Score w/o split: "+str(clf.score(xe, y)))

    # Perturb the embedding by up to 5% of the first dimension's value range.
    low = np.min(xe[:,0])
    high = np.max(xe[:,0])
    xnoisy = xe+np.random.rand(*xe.shape)*0.05*(high-low)
    print("Score w/o split on xnoisy: "+str(clf.score(xnoisy, y)))

    plotSamples(xe, y, title="regression")
    generateRegressionField(xe, y, clf, n=350, title="regression")
def plotSamples(x,y, title=""):
    """
    Scatter-plot the first two columns of x, coloured by y.

    The figure is saved to ./log/vis_xy_embedding_scatter<title>.png and then shown.

    :param x: 2d array; column 0 is drawn on the x axis, column 1 on the y axis
    :param y: values used for colouring the points
    :param title: suffix appended to the output file name
    """
    first_dim, second_dim = x[:,0], x[:,1]
    plt.scatter(first_dim, second_dim, c=y)
    plt.colorbar()
    out_path = "./log/vis_xy_embedding_scatter"+title+".png"
    plt.savefig(out_path)
    plt.show()
def generateRegressionField(x,y, clf, n, title=""):
    """
    generates an image of the regression results of the clf

    The fitted model is evaluated on an n x n grid spanning the value range of the two
    embedding dimensions. Below the image, y is plotted against the model's predictions
    on a randomly perturbed copy of x. The figure is saved to
    ./log/vis_xy_embedding_space<title>.png and then shown.

    :param x: 2d array with (at least) two columns, in the feature order clf was fitted on
    :param y: target values, plotted as the reference curve
    :param clf: fitted estimator; predict_proba is used when available, otherwise predict
    :param n: grid resolution per axis
    :param title: suffix appended to the output file name
    """
    mnx = np.min(x[:,0])
    mxx = np.max(x[:,0])
    # Perturb the samples by up to 5% of the first dimension's value range.
    xnoisy = x+np.random.rand(*x.shape)*0.05*(mxx-mnx)
    mny = np.min(x[:,1])
    mxy = np.max(x[:,1])

    # Image rows sweep the second feature, image columns sweep the first one.
    # Each grid point is handed to the model as (feature 0, feature 1) - the order it was
    # fitted on - and the whole grid is predicted in one call instead of n*n single calls.
    col_values, row_values = np.meshgrid(np.linspace(mnx, mxx, n), np.linspace(mny, mxy, n))
    grid_points = np.column_stack([col_values.ravel(), row_values.ravel()])
    if hasattr(clf, "predict_proba"):
        # Classifier: show the probability of the first class.
        field = clf.predict_proba(grid_points)[:, 0]
    else:
        # Regressors have no predict_proba; show the predicted value instead.
        field = clf.predict(grid_points)
    data = np.asarray(field).reshape(n, n)

    fig, (ax, ax2) = plt.subplots(2, sharex=False, sharey=False, figsize=(12, 5))
    # Flip vertically so that larger values of the second feature end up at the top.
    ax.imshow(np.flip(data, axis=0), alpha=1.0)
    ax.set_title("regression reproduced")
    ax2.plot(y, color="black")
    ypred = clf.predict(xnoisy)
    ax2.plot(ypred, color="green")
    ax2.set_title("y")
    ax2.grid()
    plt.savefig("./log/vis_xy_embedding_space"+title+".png")
    plt.show()
def seq2vec(x):
    """
    Turn every sample's sequence F=(f[0], ... f[k]...f[T]) into one vector by concatenation.

    :param x: a 3d list (N,T,f) with N: number of samples, T: number of time steps, f: feature dim
    :return: 2d list with one flattened vector per sample
    """
    # Each sample is flattened by one level: its per-time-step feature lists are joined.
    return [flatten1(sample) for sample in x]
print("Loading file: "+str(args.xy_file))
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
# The context manager makes sure the file handle is closed again after loading.
with open(args.xy_file, "rb") as xy_handle:
    data = pickle.load(xy_handle)
ygt = np.array(data['Y'])
x = seq2vec(data['X'])
# np.array() on the raw vectors does not work if the sublists have varying lengths,
# so enforce the same feature length by zero-padding. There is still something dirty..
length = max(map(len, x))
if length != min(map(len, x)):
    print("WARNING! The sublists have varying lengths!")
x = np.array([xi+[0.]*(length-len(xi)) for xi in x])
#pure embedding
#----------------
#x_embedded=embedClassical(x, ygt, do_scaling=False)
#x_embedded=embedDeep(x,ygt, do_scaling=True)
#plotEmbedding(x_embedded, ygt)
#embedding + some classification
#--------------
classificationTest(x, ygt, do_scaling=True)
#treeRegressionTest(x,ygt, do_scaling=True)
Sorry for the long file paste. The project is not on GitHub...
And thanks a lot for your support.
Thanks for the code - it's pretty cool that you're using ivis as a classifier/regressor!
There are only a few tweaks that need to be done to get ivis to train on your data.
1. Classification vs. Regression
Supervised ivis doesn't impose a format on the response variable Y. Instead, the default option is to run a classifier. If you specify `supervision_metric='mae'` (or some other regression loss supported by Keras), ivis will run in regression mode. The `embedDeep` method can be modified as:
def embedDeep(x, ygt, ndim=2, do_scaling=False, method='classification'):
    """
    Embed the samples with a supervised ivis model, in classification or regression mode.

    :param x: 2d array of samples (rows) by features (columns)
    :param ygt: supervision targets, passed to Ivis.fit together with the features
    :param ndim: number of embedding dimensions
    :param do_scaling: when True the features are passed through MinMaxScaler first
    :param method: 'classification' keeps the ivis default supervision; any other value
        switches to supervision_metric='mae' with supervision_weight=0.9
    :return: the embeddings produced by the fitted model for the (optionally scaled) input
    """
    # Only Ivis is needed; the former import of tensorflow.python.keras.utils was unused
    # and tied the function to a private TensorFlow module.
    from ivis import Ivis

    xi = MinMaxScaler().fit_transform(x) if do_scaling else x
    #see https://keras.io/metrics/ for more supervision metrics
    ivis_kwargs = dict(embedding_dims=ndim, k=5, n_epochs_without_progress=5)
    if method != 'classification':
        ivis_kwargs.update(supervision_metric='mae', supervision_weight=0.9)
    model = Ivis(**ivis_kwargs)
    model.fit(xi, ygt)
    embeddings = model.transform(xi)
    y_pred = model.score_samples(xi)
    print("pred err mean: "+str(np.mean(y_pred)))
    return embeddings
2. Label binarisation: ivis works with numpy arrays, and we can binarise Y as:
def binarizeY(y):
    """
    Map target values below 0.1 to class 0 and all other values to class 1.

    :param y: sequence of numeric target values
    :return: numpy array of 0/1 integer labels
    """
    below_threshold = np.array(y) < 0.1
    return np.where(below_threshold, 0, 1)
Run your binariser before invoking the `ivis.fit` method in `classificationTest`:
y_bin=binarizeY(y)
xe=embedDeep(x, y_bin, do_scaling= do_scaling)
3. Hyperparameter tuning
I would recommend changing the default ivis parameters to `k=15`, `model='maaten'`, and `n_epochs_without_progress=5`.
This will converge a lot faster and won't overfit the data. We generally recommend leaving the `epochs` parameter set to its default value and tuning the early stopping parameter via `n_epochs_without_progress`.
`supervision_weight` is also a very interesting parameter, as it controls the degree to which the classifier focuses on supervision vs. unsupervised dimensionality reduction. Setting this value higher, e.g. 0.90 or 0.95, will produce desirable classifiers.
If you're interested in classification rather than visualisation, increasing `embedding_dims` could also be useful. Increasing dimensionality to 50 produces a reasonable AUC:
              precision    recall  f1-score   support

           0       0.85      0.92      0.89       222
           1       0.92      0.85      0.88       233

    accuracy                           0.88       455
   macro avg       0.89      0.88      0.88       455
weighted avg       0.89      0.88      0.88       455

Confusion Matrix
[[205  17]
 [ 36 197]]
Average Precision: 0.966985040236684
ROC AUC: 0.8844584928275915
The jury is still out a little bit on how to best scale the data - MinMaxScaler, StandardScaler, or no scaling seem to work well depending on the dataset!
Thank you very much for your comments! They are very valuable to me.
Regarding the sequence: would you concatenate the sequence data into a vector, as is done in my script?
Best
Ignat Drozdov notifications@github.com schrieb am Mo., 23. Sep. 2019, 06:23:
Thanks for the code - it's pretty cool that you're using ivis as a classifier/regressor!
There are only a few tweaks that need to be done to get ivis to train on your data.
1. Classification vs. Regression Supervised ivis doesn't impose format on the response variable Y. Instead, the default option is to run a classifier. If you specify supervision_metric='mae' (or some other regression loss supported by keras), ivis will run in regression mode. The embedDeep method can be modified as:
def embedDeep(x, ygt, ndim=2, do_scaling=False, method='classification'):
    """
    Embed the samples with a supervised ivis model, in classification or regression mode.

    :param x: 2d array of samples (rows) by features (columns)
    :param ygt: supervision targets, passed to Ivis.fit together with the features
    :param ndim: number of embedding dimensions
    :param do_scaling: when True the features are passed through MinMaxScaler first
    :param method: 'classification' keeps the ivis default supervision; any other value
        switches to supervision_metric='mae' with supervision_weight=0.9
    :return: the embeddings produced by the fitted model for the (optionally scaled) input
    """
    # Only Ivis is needed; the import of tensorflow.python.keras.utils was unused.
    from ivis import Ivis

    xi = x
    # Debug output, disabled: it would dump the complete input array.
    #print("x: "+str(x))
    if do_scaling:
        xi = MinMaxScaler().fit_transform(x)
    #see https://keras.io/metrics/ for more supervision metrics
    if method == 'classification':
        model = Ivis(embedding_dims=ndim, k=5, n_epochs_without_progress=5)
    else:
        model = Ivis(embedding_dims=ndim, k=5, n_epochs_without_progress=5,
                     supervision_metric='mae', supervision_weight=0.9)
    model.fit(xi, ygt)
    embeddings = model.transform(xi)
    y_pred = model.score_samples(xi)
    print("pred err mean: "+str(np.mean(y_pred)))
    return embeddings
2. Label binarisation ivis works with numpy arrays, and we can binarise Y as:
def binarizeY(y):
    """
    Map target values below 0.1 to class 0 and all other values to class 1.

    :param y: sequence of numeric target values
    :return: numpy array of 0/1 integer labels
    """
    below_threshold = np.array(y) < 0.1
    return np.where(below_threshold, 0, 1)
Run your binariser before invoking ivis.fit method in classificationTest:
y_bin=binarizeY(y) xe=embedDeep(x, y_bin, do_scaling= do_scaling)
3. Hyperparameter tuning I would recommend changing default ivis parameters to: k=15, model='maaten', and n_epochs_without_progress=5
This will converge a lot faster and won't overfit the data. We generally recommend leaving the epochs parameter set to the default value and tuning the early stopping parameter via n_epochs_without_progress
supervision_weight is also a very interesting parameter as it controls the degree to which classifier focuses on supervision vs. unsupervised dimensionality reduction. Setting this value higher, e.g. 0.90 or 0.95 will produce desirable classifiers.
If you're interested in classification rather than visualisation, increasing embedding_dims could also be useful.
The jury is still out a little bit on how to best scale the data - MinMaxScaler, StandardScaler, or no scaling seem to work well depending on the dataset!
— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub https://github.com/beringresearch/ivis/issues/46?email_source=notifications&email_token=AAAWFSB3N5DSOMXCSMWJ7HLQLBAEBA5CNFSM4IZCDTL2YY3PNVWWK3TUL52HS4DFVREXG43VMVBW63LNMVXHJKTDN5WW2ZLOORPWSZGOD7JYI7Q#issuecomment-533955710, or mute the thread https://github.com/notifications/unsubscribe-auth/AAAWFSCQQITARWP4SEUIUKLQLBAEBANCNFSM4IZCDTLQ .
I think what you have is fine. You could also look into Keras' `pad_sequences` method, which I think does pretty much what you've written.
Ivis uses KNN retrieval as the first step in the algorithm, which relies on a 2-D array and doesn't support nested multidimensional lists.
Theoretically, you could write your own base neural network using Keras layers and pass that into the `model` hyperparameter. Something like:
Input -> Reshape -> 1DConv -> MaxPooling -> Dense
You could then pass the 2D array into ivis and let your base network do the rest. It may be overkill, as the `maaten` architecture (3 Dense layers 500-500-2000) will work well in most cases.
@mojovski Hope this helped you a bit. Will close the issue, but feel free to re-open if anything else is outstanding.
Thanks a lot! Will be able to continue the work on this in 2 weeks and, if appropriate, post some results.
I would like to apply ivis on a high dim time series/sequence data. Is there a way to achieve this with the current version?