A flexible version of CorEx, developed for bio-data challenges, that handles missing data, continuous/discrete variables, multi-CPU execution, and overlapping structure, and includes visualizations.
I have been running some unit tests of my own.
I rather like this one. Although the nodes are formed in a different order on each run, the groupings are relatively consistent.
def n2a(n):
    """Return the CSV column label for column index ``n``.

    Indices below 50 are labelled ``L<n>``; every other index is labelled
    ``R<99 - n>``. For 0..99 this yields L0..L49 followed by R49..R0.
    """
    if n < 50:
        return "L%d" % n
    return "R%d" % (99 - n)
def raindrop():
    """Generate the "raindrop" test surface, save it as CSV and plot it.

    Evaluates ``sin(r2) / r2`` with ``r2 = x**2 + y**2`` over the grid
    spanned by ``np.arange(-5, 5, 0.1)`` on both axes, writes the values to
    ``data/raindrop.csv`` (comma separated, with one ``n2a`` label per
    column as the header) and shows a filled contour plot of the surface.

    The ``data`` directory is not created here.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    x = np.arange(-5, 5, 0.1)
    y = np.arange(-5, 5, 0.1)
    # Sparse grids broadcast to the full 2-D surface in the expression below.
    xx, yy = np.meshgrid(x, y, sparse=True)
    # Squared distance from the origin.
    r2 = xx**2 + yy**2
    z = np.sin(r2) / r2
    plt.contourf(x, y, z)
    header = ",".join([n2a(i) for i in range(100)])
    np.savetxt(r'data/raindrop.csv', z, delimiter=',', header=header)
    plt.show()
I have been running some unit tests of my own. I rather like this one. Although the nodes are formed in a different order on each run, the groupings are relatively consistent.![raindrop](https://user-images.githubusercontent.com/4753653/106405598-a31d4180-63eb-11eb-95ea-a930d3c939af.png)
# Write data/raindrop.csv (and show the contour plot) ...
raindrop()
# ... then run ten vis_corex.py trials on it, writing under test_output/.
generateCohort(
    numberOfTrials=10,
    targetDirectory="test_output",
    cmd="python vis_corex.py data/raindrop.csv --layers=30,3,1 --dim_hidden=3 --max_iter=100 --edges=400 --no_row_names --missing=-1e6 -c -v -o %s --ram=32",
)
Generate test case
def n2a(n):
    """Return the CSV column label for column index ``n``.

    Indices below 50 are labelled ``L<n>``; every other index is labelled
    ``R<99 - n>``. For 0..99 this yields L0..L49 followed by R49..R0.
    """
    if n < 50:
        return "L%d" % n
    return "R%d" % (99 - n)
def raindrop():
    """Generate the "raindrop" test surface, save it as CSV and plot it.

    Evaluates ``sin(r2) / r2`` with ``r2 = x**2 + y**2`` over the grid
    spanned by ``np.arange(-5, 5, 0.1)`` on both axes, writes the values to
    ``data/raindrop.csv`` (comma separated, with one ``n2a`` label per
    column as the header) and shows a filled contour plot of the surface.

    The ``data`` directory is not created here.
    """
    import matplotlib.pyplot as plt
    import numpy as np

    x = np.arange(-5, 5, 0.1)
    y = np.arange(-5, 5, 0.1)
    # Sparse grids broadcast to the full 2-D surface in the expression below.
    xx, yy = np.meshgrid(x, y, sparse=True)
    # Squared distance from the origin.
    r2 = xx**2 + yy**2
    z = np.sin(r2) / r2
    plt.contourf(x, y, z)
    header = ",".join([n2a(i) for i in range(100)])
    np.savetxt(r'data/raindrop.csv', z, delimiter=',', header=header)
    plt.show()
===========================================================================================================
import subprocess
import time


# "nasdaq_output" appeared as a commented-out alternative targetDirectory.
def generateCohort(numberOfTrials=10,
                   targetDirectory="finance_output",
                   cmd="python vis_corex.py data/mchange,stocks,mdates.txt --layers=30,3,1 --dim_hidden=3 --max_iter=100 --edges=400 --no_row_names --missing=-1e6 -c -v -o %s --ram=32"):
    """Run a shell command ``numberOfTrials`` times and log each run.

    Trial ``i`` runs ``cmd`` with its ``%s`` placeholder replaced by
    ``<targetDirectory>/<timestamp>/trial<i>``, then writes the command
    template followed by the captured standard output to ``log<i>`` inside
    that directory.

    Args:
        numberOfTrials: Number of times the command is run.
        targetDirectory: Top-level directory under which trial paths are built.
        cmd: Shell command template containing one ``%s`` for the trial path.

    Returns:
        The directory name returned by ``generateTimeStampedDirectoryName()``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.

    NOTE(review): the trial directory is not created here; presumably the
    command creates it through its ``-o`` option -- confirm.
    """
    timestamp = generateTimeStampedDirectoryName()
    start_time = time.time()
    for i in range(numberOfTrials):
        trialPath = "%s/%s/trial%d" % (targetDirectory, timestamp, i)
        print("Processing " + trialPath)
        # universal_newlines=True yields text rather than bytes, so the log
        # holds the real output instead of a b'...' repr.
        result = subprocess.check_output(
            cmd % trialPath, shell=True, universal_newlines=True
        )
        with open(trialPath + "/log%d" % i, 'w') as f:
            # Save command line + generated text
            f.write(cmd + "\n" + result)
    elapsedTime = time.time() - start_time
    print(("Finished in --- %s seconds ---" % elapsedTime))
    return timestamp