amueller / information-theoretic-mst

Information Theoretic Clustering using Minimum Spanning Trees
19 stars 5 forks source link

doesn't work with python 3 and scipy > 0.10 #1

Open gagolews opened 6 years ago

gagolews commented 6 years ago

Andreas, the package uses some deprecated scipy api which was removed in v 0.11 (concerning the sparse module). Are you planning to file an update to your method?

gagolews commented 6 years ago

I tried to apply the following obvious patches, but with no success. Giving up

diff --git a/__init__.py b/__init__.py
index bed0237..e3ba8e2 100644
--- a/__init__.py
+++ b/__init__.py
@@ -1,3 +1,3 @@
-from itm import ITM
+from .itm import ITM

 __all__ = ['ITM']
diff --git a/itm.py b/itm.py
index 5a06ea6..9cf87c9 100644
--- a/itm.py
+++ b/itm.py
@@ -1,16 +1,17 @@
 import warnings

-from scipy import sparse
+from scipy.sparse.csgraph import connected_components
+
 import numpy as np

 from sklearn.base import BaseEstimator, ClusterMixin
 from sklearn.neighbors import NearestNeighbors

-from block_diag import block_diag
-from tree_entropy import tree_information_sparse
-from infer_dimensionality import estimate_dimension
+from .block_diag import block_diag
+from .tree_entropy import tree_information_sparse
+from .infer_dimensionality import estimate_dimension

-from mst import euclidean_mst
+from .mst import euclidean_mst

 DTYPE = np.float
 ITYPE = np.int
@@ -119,11 +120,11 @@ class ITM(BaseEstimator, ClusterMixin):
             removed_edges.append((old_inds[list(edge[:2])], edge[2]))

             n_split_components, split_components_indicator = \
-                sparse.cs_graph_components(split + split.T)
+                connected_components(split + split.T, False)
             assert(n_split_components == 2)
             assert(len(np.unique(split_components_indicator)) == 2)

-            for i in xrange(n_split_components):
+            for i in range(n_split_components):
                 inds = np.where(split_components_indicator == i)[0]
                 clusters.append((split[inds[np.newaxis, :], inds],
                                  old_inds[inds]))
@@ -188,7 +189,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False):
     while to_visit:
         x = to_visit.pop()
         visited[x] = True
-        for i in xrange(indptr[x], indptr[x + 1]):
+        for i in range(indptr[x], indptr[x + 1]):
             n = neighbors[i]
             if visited[n]:
                 #this is where we were coming from
@@ -214,7 +215,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False):
     while to_visit:
         x = to_visit.pop()
         visited[x] = True
-        for i in xrange(indptr[x], indptr[x + 1]):
+        for i in range(indptr[x], indptr[x + 1]):
             n = neighbors[i]
             if n != parent[x]:
                 parent[n] = x
@@ -222,7 +223,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False):

     best_cut = None
     best_objective = -np.inf
-    for x in xrange(n_samples):
+    for x in range(n_samples):
         if parent[x] == -1:
             # was the root, doesn't have parent
             continue
diff --git a/mst.py b/mst.py
index f8ce62b..ced989e 100644
--- a/mst.py
+++ b/mst.py
@@ -1,7 +1,7 @@
 import numpy as np
-from scipy import sparse
+from scipy.sparse.csgraph import connected_components

-from sklearn.utils.sparsetools import minimum_spanning_tree
+from scipy.sparse.csgraph import minimum_spanning_tree

 def euclidean_mst(X, neighbors_estimator, verbose=2):
@@ -15,12 +15,12 @@ def euclidean_mst(X, neighbors_estimator, verbose=2):
         distances = neighbors_estimator.kneighbors_graph(
             X, n_neighbors=n_neighbors, mode='distance')
         n_components, component_indicators =\
-            sparse.cs_graph_components(distances + distances.T)
+            connected_components(distances + distances.T, False)
         if len(np.unique(component_indicators)) > 1:
             continue
         distances.sort_indices()
         forest = minimum_spanning_tree(distances)
-        _, inds = sparse.cs_graph_components(forest + forest.T)
+        _, inds = connected_components(forest + forest.T, False)
         assert(len(np.unique(inds)) == 1)
         break
     return forest
diff --git a/tree_entropy.py b/tree_entropy.py
index 1015326..08b08a4 100644
--- a/tree_entropy.py
+++ b/tree_entropy.py
@@ -1,9 +1,9 @@
 import numpy as np
 import warnings

-from scipy import sparse
+from scipy.sparse.csgraph import connected_components

-from sklearn.utils.sparsetools import minimum_spanning_tree
+from scipy.sparse.csgraph import minimum_spanning_tree
 from sklearn.metrics import euclidean_distances

@@ -61,13 +61,13 @@ def tree_information_sparse(forest, n_features):
     """
     entropy = 0
     sym_forest = forest + forest.T
-    n_components, components = sparse.cs_graph_components(sym_forest)
+    n_components, components = connected_components(sym_forest, False)
     if np.any(components < 0):
         # there is a lonely node
         entropy -= 1e10
     #n_samples = len(components)

-    for i in xrange(n_components):
+    for i in range(n_components):
         inds = np.where(components == i)[0]
         subforest = forest[inds[:, np.newaxis], inds]
         L = subforest.sum()

My test script errors with the following message:

$ cat itm_test.py 
import numpy as np
import information_theoretic_mst as itm
import sklearn.metrics, sklearn.cluster

nn = 5
np.random.seed(123)
X = np.concatenate((np.random.rand(nn,2)+0.5, np.random.rand(nn,2)-0.5), axis=0)
y = np.r_[[nn]*1, [nn]*2]

cl = itm.ITM(2)
cl.fit(X)
print(sklearn.metrics.adjusted_rand_score(y, cl._labels))

$ python itm_test.py 
/usr/lib/python3.6/site-packages/scipy/sparse/compressed.py:742: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
  SparseEfficiencyWarning)
Traceback (most recent call last):
  File "itm_test.py", line 11, in <module>
    cl.fit(X)
  File "information_theoretic_mst/itm.py", line 131, in fit
    mi = tree_information_sparse(clusters[-1][0], intrinsic_dimensionality)
  File "information_theoretic_mst/tree_entropy.py", line 63, in tree_information_sparse
    sym_forest = forest + forest.T
  File "/usr/lib/python3.6/site-packages/scipy/sparse/base.py", line 374, in __add__
    raise ValueError("inconsistent shapes")
ValueError: inconsistent shapes
amueller commented 6 years ago

I'll fix it today or tomorrow.

Sent from phone. Please excuse spelling and brevity.

On Sat, Apr 21, 2018, 15:17 Marek Gagolewski notifications@github.com wrote:

I tried to apply the following obvious patches, but with no success. Giving up

diff --git a/init.py b/init.py index bed0237..e3ba8e2 100644 --- a/init.py +++ b/init.py @@ -1,3 +1,3 @@ -from itm import ITM +from .itm import ITM

all = ['ITM'] diff --git a/itm.py b/itm.py index 5a06ea6..9cf87c9 100644 --- a/itm.py +++ b/itm.py @@ -1,16 +1,17 @@ import warnings

-from scipy import sparse +from scipy.sparse.csgraph import connected_components + import numpy as np

from sklearn.base import BaseEstimator, ClusterMixin from sklearn.neighbors import NearestNeighbors

-from block_diag import block_diag -from tree_entropy import tree_information_sparse -from infer_dimensionality import estimate_dimension +from .block_diag import block_diag +from .tree_entropy import tree_information_sparse +from .infer_dimensionality import estimate_dimension

-from mst import euclidean_mst +from .mst import euclidean_mst

DTYPE = np.float ITYPE = np.int @@ -119,11 +120,11 @@ class ITM(BaseEstimator, ClusterMixin): removed_edges.append((old_inds[list(edge[:2])], edge[2]))

         n_split_components, split_components_indicator = \
  • sparse.cs_graph_components(split + split.T)

  • connected_components(split + split.T, False) assert(n_split_components == 2) assert(len(np.unique(split_components_indicator)) == 2)

  • for i in xrange(n_split_components):

  • for i in range(n_split_components): inds = np.where(split_components_indicator == i)[0] clusters.append((split[inds[np.newaxis, :], inds], old_inds[inds])) @@ -188,7 +189,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False): while to_visit: x = to_visit.pop() visited[x] = True

  • for i in xrange(indptr[x], indptr[x + 1]):

  • for i in range(indptr[x], indptr[x + 1]): n = neighbors[i] if visited[n]:

    this is where we were coming from

    @@ -214,7 +215,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False): while to_visit: x = to_visit.pop() visited[x] = True

  • for i in xrange(indptr[x], indptr[x + 1]):

  • for i in range(indptr[x], indptr[x + 1]): n = neighbors[i] if n != parent[x]: parent[n] = x @@ -222,7 +223,7 @@ def itm_binary(graph, intrinsic_dimensionality, return_edge=False):

    best_cut = None best_objective = -np.inf

  • for x in xrange(n_samples):

  • for x in range(n_samples): if parent[x] == -1:

    was the root, doesn't have parent

         continue

    diff --git a/mst.py b/mst.py index f8ce62b..ced989e 100644 --- a/mst.py +++ b/mst.py @@ -1,7 +1,7 @@ import numpy as np -from scipy import sparse +from scipy.sparse.csgraph import connected_components

-from sklearn.utils.sparsetools import minimum_spanning_tree +from scipy.sparse.csgraph import minimum_spanning_tree

def euclidean_mst(X, neighbors_estimator, verbose=2): @@ -15,12 +15,12 @@ def euclidean_mst(X, neighbors_estimator, verbose=2): distances = neighbors_estimator.kneighbors_graph( X, n_neighbors=n_neighbors, mode='distance') n_components, component_indicators =\

  • sparse.cs_graph_components(distances + distances.T)
  • connected_components(distances + distances.T, False) if len(np.unique(component_indicators)) > 1: continue distances.sort_indices() forest = minimum_spanning_tree(distances)
  • _, inds = sparse.cs_graph_components(forest + forest.T)
  • _, inds = connected_components(forest + forest.T, False) assert(len(np.unique(inds)) == 1) break return forest diff --git a/tree_entropy.py b/tree_entropy.py index 1015326..08b08a4 100644 --- a/tree_entropy.py +++ b/tree_entropy.py @@ -1,9 +1,9 @@ import numpy as np import warnings

-from scipy import sparse +from scipy.sparse.csgraph import connected_components

-from sklearn.utils.sparsetools import minimum_spanning_tree +from scipy.sparse.csgraph import minimum_spanning_tree from sklearn.metrics import euclidean_distances

@@ -61,13 +61,13 @@ def tree_information_sparse(forest, n_features): """ entropy = 0 sym_forest = forest + forest.T

  • n_components, components = sparse.cs_graph_components(sym_forest)

  • n_components, components = connected_components(sym_forest, False) if np.any(components < 0):

    there is a lonely node

     entropy -= 1e10

    n_samples = len(components)

  • for i in xrange(n_components):

  • for i in range(n_components): inds = np.where(components == i)[0] subforest = forest[inds[:, np.newaxis], inds] L = subforest.sum()

My test script errors with the following message:

$ cat itm_test.py import numpy as np import information_theoretic_mst as itm import sklearn.metrics, sklearn.cluster

nn = 5 np.random.seed(123) X = np.concatenate((np.random.rand(nn,2)+0.5, np.random.rand(nn,2)-0.5), axis=0) y = np.r_[[nn]1, [nn]2]

cl = itm.ITM(2) cl.fit(X) print(sklearn.metrics.adjusted_rand_score(y, cl._labels))

$ python itm_test.py /usr/lib/python3.6/site-packages/scipy/sparse/compressed.py:742: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient. SparseEfficiencyWarning) Traceback (most recent call last): File "itm_test.py", line 11, in cl.fit(X) File "information_theoretic_mst/itm.py", line 131, in fit mi = tree_information_sparse(clusters[-1][0], intrinsic_dimensionality) File "information_theoretic_mst/tree_entropy.py", line 63, in tree_information_sparse sym_forest = forest + forest.T File "/usr/lib/python3.6/site-packages/scipy/sparse/base.py", line 374, in add raise ValueError("inconsistent shapes") ValueError: inconsistent shapes

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/amueller/information-theoretic-mst/issues/1#issuecomment-383321585, or mute the thread https://github.com/notifications/unsubscribe-auth/AAbcFi_XzHbQWEzkHhukA1dLGNUfQc6wks5tq4XTgaJpZM4TegGG .

amueller commented 6 years ago

Try now please. Sorry for the inconvenience. I also changed infer_dimensionality to default to True (from False). I think that's better but it's not tested extensively. My algorithm tunes automatically how much compactness it wants ;)

gagolews commented 6 years ago

Cool, thanks! Will check it out soon

gagolews commented 4 years ago

Minimal reproducible example that gives an error:

import information_theoretic_mst as itm
import numpy as np
# generate data:
np.random.seed(123)
X = np.random.randn(2048,2)
X[:1024,:] -= 2
X = np.round(X, 2)
# call ITM:
itm.ITM(n_clusters=2, verbose=666).fit_predict(X)

Error log:

Fitting neighbors data structure.
Datastructure used: kd_tree
Bulding minimum spanning tree.
Trying to build mst with 4 neighbors.
Trying to build mst with 8 neighbors.
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-46-a19591eca992> in <module>
      7 X = np.round(X, 2)
      8 # call ITM:
----> 9 itm.ITM(n_clusters=2, verbose=666).fit_predict(X)
[...]
information_theoretic_mst/mst.py in euclidean_mst(X, neighbors_estimator, verbose)
     20         forest = minimum_spanning_tree(distances)
     21         _, inds = connected_components(forest + forest.T)
---> 22         assert(len(np.unique(inds)) == 1)
     23         break
     24     return forest

AssertionError: 
gagolews commented 4 years ago

Cool, thanks! Will check it out soon

LOL, "check it out soon" == 2+ years :)

amueller commented 4 years ago

I've definitely gone longer ;)