kunaldahiya / pyxclib

Tools for multi-label classification problems.
MIT License
126 stars 36 forks source link

Replaced deprecated numpy types np.* with '*' as dtype #37

Closed shikharmn closed 8 months ago

shikharmn commented 9 months ago

PR for #34.

The `np.*_t` types don't have to be modified, since they are registered type identifiers for ndarrays in Cython. Only the deprecated dtype references `np.int` and `np.float` have been changed to 'int' and 'float' respectively. The following is a simple test script I used to ensure correctness.

import numpy as np
import scipy.sparse as sp
from xclib.utils.sparse import topk, rank

def obtain_topk(preds_, k, threshold=0):
    """
    Definition:
        Inefficient reference function to obtain the top-k entries of every
        row, padding with zeros if a row stores fewer than k elements.

    Implementation:
        Obtains top-k indices and values for every row and dumps them into
        row and column ndarrays to create a COO matrix, which is converted
        to CSR before returning.

    Arguments:
        preds_: 2-D sparse matrix that supports row indexing and whose row
            slices expose `.data` and `.indices` (e.g. scipy CSR).
        k: number of entries to keep per row.
        threshold: selected values strictly below this are set to zero and
            therefore dropped from the result.

    Returns:
        CSR matrix with the same shape as `preds_`, holding at most k
        non-zero entries per row (values stored as float32).
    """
    num_rows = preds_.shape[0]
    coo_rows = np.repeat(np.arange(num_rows), k).astype(np.int64)
    coo_cols = np.zeros(coo_rows.shape, dtype=np.int64)
    coo_data = np.zeros(coo_rows.shape, dtype=np.float32)

    # Plain `range`: the original used tqdm's `trange`, which is never imported.
    for idx in range(num_rows):
        row = preds_[idx]  # slice the row once instead of once per attribute
        data = row.data
        indices = row.indices

        # Pad short (or empty) rows up to length k. The padding points at
        # column 0 with value 0, so it adds nothing when duplicates are summed
        # during the COO -> CSR conversion and is removed by eliminate_zeros().
        deficit = k - len(data)
        if deficit > 0:
            data = np.concatenate([data, np.zeros(deficit, dtype=np.float32)])
            indices = np.concatenate([indices, np.zeros(deficit, dtype=np.int64)])

        # Positions of the k largest values, in descending order. Named `order`
        # so it does not shadow the library `topk` imported by the script.
        order = np.argsort(data)[::-1][:k]
        start = idx * k
        coo_cols[start: start + k] = indices[order]
        coo_data[start: start + k] = data[order]

    coo_data[coo_data < threshold] = 0

    topk_preds = sp.coo_matrix(
        (coo_data, (coo_rows, coo_cols)),
        shape=(preds_.shape[0], preds_.shape[1])
    ).tocsr()

    # Drop the explicit zeros introduced by padding and thresholding.
    topk_preds.eliminate_zeros()

    return topk_preds

# Placeholder: assign the location of a SciPy .npz sparse matrix before running;
# as written this line has no right-hand side and will not parse.
path = # Path to .npz sparse matrix

# Load the sparse matrix stored at `path`.
x = sp.load_npz(path)

# Reference result from obtain_topk above: top 5 per row, zeroing values below -20.
x_mine = obtain_topk(x, 5, threshold=-20)
# Library result via the Cython path. NOTE(review): presumably `topk` returns
# per-row column indices of the top-5 entries -- confirm against xclib.
x_lib_nb = topk(rank(-x), k=5, pad_ind=0, pad_val=0, use_cython=True)

# For the first row only, print the value of x at each pair of selected columns
# followed by the two column indices, so both selections can be compared by eye.
# NOTE(review): the pairing is positional; verify that both sides list their
# columns in the same order.
for i_mine, i_lib in zip(x_mine[0].indices, x_lib_nb[0]):
    print(x[0, i_mine], x[0, i_lib], i_mine, i_lib)

This returns identical values.