Open gagolews opened 6 years ago
I tried to apply the following obvious patches, but with no success. Giving up
diff --git a/__init__.py b/__init__.py
index bed0237..e3ba8e2 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,3 +1,3 @@
-from itm import ITM
+from .itm import ITM
__all__ = ['ITM']
diff --git a/itm.py b/itm.py
index 5a06ea6..9cf87c9 100644
--- a/itm.py
+++ b/itm.py
@@ -1,16 +1,17 @@
import warnings
-from scipy import sparse
+from scipy.sparse.csgraph import connected_components
+
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.neighbors import NearestNeighbors
-from block_diag import block_diag
-from tree_entropy import tree_information_sparse
-from infer_dimensionality import estimate_dimension
+from .block_diag import block_diag
+from .tree_entropy import tree_information_sparse
+from .infer_dimensionality import estimate_dimension
-from mst import euclidean_mst
+from .mst import euclidean_mst
DTYPE = np.float
ITYPE = np.int
@@ -119,11 +120,11 @@ class ITM(BaseEstimator, ClusterMixin):
removed_edges.append((old_inds[list(edge[:2])], edge[2]))
n_split_components, split_components_indicator = \
- sparse.cs_graph_components(split + split.T)
+ connected_components(split + split.T, False)
assert(n_split_components == 2)
assert(len(np.unique(split_components_indicator)) == 2)
- for i in xrange(n_split_components):
+ for i in range(n_split_components):
inds = np.where(split_components_indicator == i)[0]
clusters.append((split[inds[np.newaxis, :], inds],
old_inds[inds]))
@@ -188,7 +189,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False):
while to_visit:
x = to_visit.pop()
visited[x] = True
- for i in xrange(indptr[x], indptr[x + 1]):
+ for i in range(indptr[x], indptr[x + 1]):
n = neighbors[i]
if visited[n]:
#this is where we were coming from
@@ -214,7 +215,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False):
while to_visit:
x = to_visit.pop()
visited[x] = True
- for i in xrange(indptr[x], indptr[x + 1]):
+ for i in range(indptr[x], indptr[x + 1]):
n = neighbors[i]
if n != parent[x]:
parent[n] = x
@@ -222,7 +223,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False):
best_cut = None
best_objective = -np.inf
- for x in xrange(n_samples):
+ for x in range(n_samples):
if parent[x] == -1:
# was the root, doesn't have parent
continue
diff --git a/mst.py b/mst.py
index f8ce62b..ced989e 100644
--- a/mst.py
+++ b/mst.py
@@ -1,7 +1,7 @@
import numpy as np
-from scipy import sparse
+from scipy.sparse.csgraph import connected_components
-from sklearn.utils.sparsetools import minimum_spanning_tree
+from scipy.sparse.csgraph import minimum_spanning_tree
def euclidean_mst(X, neighbors_estimator, verbose=2):
@@ -15,12 +15,12 @@ def euclidean_mst(X, neighbors_estimator, verbose=2):
distances = neighbors_estimator.kneighbors_graph(
X, n_neighbors=n_neighbors, mode='distance')
n_components, component_indicators =\
- sparse.cs_graph_components(distances + distances.T)
+ connected_components(distances + distances.T, False)
if len(np.unique(component_indicators)) > 1:
continue
distances.sort_indices()
forest = minimum_spanning_tree(distances)
- _, inds = sparse.cs_graph_components(forest + forest.T)
+ _, inds = connected_components(forest + forest.T, False)
assert(len(np.unique(inds)) == 1)
break
return forest
diff --git a/tree_entropy.py b/tree_entropy.py
index 1015326..08b08a4 100644
--- a/tree_entropy.py
+++ b/tree_entropy.py
@@ -1,9 +1,9 @@
import numpy as np
import warnings
-from scipy import sparse
+from scipy.sparse.csgraph import connected_components
-from sklearn.utils.sparsetools import minimum_spanning_tree
+from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.metrics import euclidean_distances
@@ -61,13 +61,13 @@ def tree_information_sparse(forest, n_features):
"""
entropy = 0
sym_forest = forest + forest.T
- n_components, components = sparse.cs_graph_components(sym_forest)
+ n_components, components = connected_components(sym_forest, False)
if np.any(components < 0):
# there is a lonely node
entropy -= 1e10
#n_samples = len(components)
- for i in xrange(n_components):
+ for i in range(n_components):
inds = np.where(components == i)[0]
subforest = forest[inds[:, np.newaxis], inds]
L = subforest.sum()
My test script errors with the following message:
$ cat itm_test.py
import numpy as np
import information_theoretic_mst as itm
import sklearn.metrics, sklearn.cluster
nn = 5
np.random.seed(123)
X = np.concatenate((np.random.rand(nn,2)+0.5, np.random.rand(nn,2)-0.5), axis=0)
y = np.r_[[nn]*1, [nn]*2]
cl = itm.ITM(2)
cl.fit(X)
print(sklearn.metrics.adjusted_rand_score(y, cl._labels))
$ python itm_test.py
/usr/lib/python3.6/site-packages/scipy/sparse/compressed.py:742: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
SparseEfficiencyWarning)
Traceback (most recent call last):
File "itm_test.py", line 11, in <module>
cl.fit(X)
File "information_theoretic_mst/itm.py", line 131, in fit
mi = tree_information_sparse(clusters[-1][0], intrinsic_dimensionality)
File "information_theoretic_mst/tree_entropy.py", line 63, in tree_information_sparse
sym_forest = forest + forest.T
File "/usr/lib/python3.6/site-packages/scipy/sparse/base.py", line 374, in __add__
raise ValueError("inconsistent shapes")
ValueError: inconsistent shapes
I'll fix it today or tomorrow.
Sent from phone. Please excuse spelling and brevity.
On Sat, Apr 21, 2018, 15:17 Marek Gagolewski notifications@github.com wrote:
I tried to apply the following obvious patches, but with no success. Giving up
diff --git a/init.py b/init.py index bed0237..e3ba8e2 100644 --- a/init.py +++ b/init.py @@ -1,3 +1,3 @@ -from itm import ITM +from .itm import ITM
all = ['ITM'] diff --git a/itm.py b/itm.py index 5a06ea6..9cf87c9 100644 --- a/itm.py +++ b/itm.py @@ -1,16 +1,17 @@ import warnings
-from scipy import sparse +from scipy.sparse.csgraph import connected_components + import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin from sklearn.neighbors import NearestNeighbors
-from block_diag import block_diag -from tree_entropy import tree_information_sparse -from infer_dimensionality import estimate_dimension +from .block_diag import block_diag +from .tree_entropy import tree_information_sparse +from .infer_dimensionality import estimate_dimension
-from mst import euclidean_mst +from .mst import euclidean_mst
DTYPE = np.float ITYPE = np.int @@ -119,11 +120,11 @@ class ITM(BaseEstimator, ClusterMixin): removed_edges.append((old_inds[list(edge[:2])], edge[2]))
n_split_components, split_components_indicator = \
sparse.cs_graph_components(split + split.T)
connected_components(split + split.T, False) assert(n_split_components == 2) assert(len(np.unique(split_components_indicator)) == 2)
for i in xrange(n_split_components):
for i in range(n_split_components): inds = np.where(split_components_indicator == i)[0] clusters.append((split[inds[np.newaxis, :], inds], old_inds[inds])) @@ -188,7 +189,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False): while to_visit: x = to_visit.pop() visited[x] = True
for i in xrange(indptr[x], indptr[x + 1]):
for i in range(indptr[x], indptr[x + 1]): n = neighbors[i] if visited[n]:
this is where we were coming from
@@ -214,7 +215,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False): while to_visit: x = to_visit.pop() visited[x] = True
for i in xrange(indptr[x], indptr[x + 1]):
for i in range(indptr[x], indptr[x + 1]): n = neighbors[i] if n != parent[x]: parent[n] = x @@ -222,7 +223,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False):
best_cut = None best_objective = -np.inf
for x in xrange(n_samples):
for x in range(n_samples): if parent[x] == -1:
was the root, doesn't have parent
continue
diff --git a/mst.py b/mst.py index f8ce62b..ced989e 100644 --- a/mst.py +++ b/mst.py @@ -1,7 +1,7 @@ import numpy as np -from scipy import sparse +from scipy.sparse.csgraph import connected_components
-from sklearn.utils.sparsetools import minimum_spanning_tree +from scipy.sparse.csgraph import minimum_spanning_tree
def euclidean_mst(X, neighbors_estimator, verbose=2): @@ -15,12 +15,12 @@ def euclidean_mst(X, neighbors_estimator, verbose=2): distances = neighbors_estimator.kneighbors_graph( X, n_neighbors=n_neighbors, mode='distance') n_components, component_indicators =\
- sparse.cs_graph_components(distances + distances.T)
- connected_components(distances + distances.T, False) if len(np.unique(component_indicators)) > 1: continue distances.sort_indices() forest = minimum_spanning_tree(distances)
- _, inds = sparse.cs_graph_components(forest + forest.T)
- _, inds = connected_components(forest + forest.T, False) assert(len(np.unique(inds)) == 1) break return forest diff --git a/tree_entropy.py b/tree_entropy.py index 1015326..08b08a4 100644 --- a/tree_entropy.py +++ b/tree_entropy.py @@ -1,9 +1,9 @@ import numpy as np import warnings
-from scipy import sparse +from scipy.sparse.csgraph import connected_components
-from sklearn.utils.sparsetools import minimum_spanning_tree +from scipy.sparse.csgraph import minimum_spanning_tree from sklearn.metrics import euclidean_distances
@@ -61,13 +61,13 @@ def tree_information_sparse(forest, n_features): """ entropy = 0 sym_forest = forest + forest.T
n_components, components = sparse.cs_graph_components(sym_forest)
n_components, components = connected_components(sym_forest, False) if np.any(components < 0):
there is a lonely node
entropy -= 1e10
n_samples = len(components)
for i in xrange(n_components):
for i in range(n_components): inds = np.where(components == i)[0] subforest = forest[inds[:, np.newaxis], inds] L = subforest.sum()
My test script errors with the following message:
$ cat itm_test.py import numpy as np import information_theoretic_mst as itm import sklearn.metrics, sklearn.cluster
nn = 5 np.random.seed(123) X = np.concatenate((np.random.rand(nn,2)+0.5, np.random.rand(nn,2)-0.5), axis=0) y = np.r_[[nn]1, [nn]2]
cl = itm.ITM(2) cl.fit(X) print(sklearn.metrics.adjusted_rand_score(y, cl._labels))
$ python itm_test.py /usr/lib/python3.6/site-packages/scipy/sparse/compressed.py:742: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient. SparseEfficiencyWarning) Traceback (most recent call last): File "itm_test.py", line 11, in
cl.fit(X) File "information_theoretic_mst/itm.py", line 131, in fit mi = tree_information_sparse(clusters[-1][0], intrinsic_dimensionality) File "information_theoretic_mst/tree_entropy.py", line 63, in tree_information_sparse sym_forest = forest + forest.T File "/usr/lib/python3.6/site-packages/scipy/sparse/base.py", line 374, in add raise ValueError("inconsistent shapes") ValueError: inconsistent shapes — You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/amueller/information-theoretic-mst/issues/1#issuecomment-383321585, or mute the thread https://github.com/notifications/unsubscribe-auth/AAbcFi_XzHbQWEzkHhukA1dLGNUfQc6wks5tq4XTgaJpZM4TegGG .
Try now please. Sorry for the inconvenience. I also changed infer_dimensionality
to default to True
(from False
). I think that's better but it's not tested extensively.
My algorithm tunes automatically how much compactness it wants ;)
Cool, thanks! Will check it out soon
Minimal reproducible example that gives an error:
import information_theoretic_mst as itm
import numpy as np
# generate data:
np.random.seed(123)
X = np.random.randn(2048,2)
X[:1024,:] -= 2
X = np.round(X, 2)
# call ITM:
itm.ITM(n_clusters=2, verbose=666).fit_predict(X)
Error log:
Fitting neighbors data structure.
Datastructure used: kd_tree
Bulding minimum spanning tree.
Trying to build mst with 4 neighbors.
Trying to build mst with 8 neighbors.
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
<ipython-input-46-a19591eca992> in <module>
7 X = np.round(X, 2)
8 # call ITM:
----> 9 itm.ITM(n_clusters=2, verbose=666).fit_predict(X)
[...]
information_theoretic_mst/mst.py in euclidean_mst(X, neighbors_estimator, verbose)
20 forest = minimum_spanning_tree(distances)
21 _, inds = connected_components(forest + forest.T)
---> 22 assert(len(np.unique(inds)) == 1)
23 break
24 return forest
AssertionError:
Cool, thanks! Will check it out soon
LOL, "check it out soon" == 2+ years :)
I've definitely gone longer ;)
Andreas, the package uses some deprecated scipy api which was removed in v 0.11 (concerning the
sparse
module). Are you planning to file an update to your method?