k3yavi / vpolo

To remember what have been lost !
GNU General Public License v3.0
4 stars 4 forks source link

PyPI version is currently out of date (v0.2.4) and does not work #2

Closed ColeWunderlich closed 3 years ago

ColeWunderlich commented 3 years ago

The PyPI version of vpolo appears to be out of date, sitting at v0.2.4.

After installing vpolo using pip3 install vpolo, I get the following error when I try to use the matrix parser.

>>> from vpolo.alevin import parser
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "<path>/lib/python3.8/site-packages/vpolo/alevin/parser.py", line 9, in <module>
    from sce import sce
  File "<path>/lib/python3.8/site-packages/sce/__init__.py", line 1, in <module>
    from sce import sce
ImportError: cannot import name 'sce' from partially initialized module 'sce' (most likely due to a circular import) (<path>/lib/python3.8/site-packages/sce/__init__.py)

My python knowledge is pretty rudimentary, but after poking around the package my guess is this is because the rust module is not installed/does not exist in this version.

ColeWunderlich commented 3 years ago

It might also be worth noting that after installing rust on my machine, then calling pip3 git+https://github.com/k3yavi/vpolo.git everything worked well.

k3yavi commented 3 years ago

Awesome ! thanks for raising the issue.

jolespin commented 10 months ago

Work around. Just load it manually without sce.

I created an executable called convert_data_mat.py

from __future__ import print_function
from collections import defaultdict
from struct import Struct
import numpy as np
import pandas as pd
import gzip
import sys
import os
#import sce
from scipy.io import mmread
from scipy.sparse import csr_matrix

# https://github.com/k3yavi/vpolo/blob/406ade967bf418eae2e673717e6eb6be0f4163f9/vpolo/alevin/parser.py#L12
def read_quants_bin(base_location, clipped=False, mtype="data", rmode=None):
    '''
    Read the quants Sparse Binary output of Alevin and generates a dataframe
    Parameters
    ----------
    base_location: string
        Path to the folder containing the output of the alevin run
    clipped: bool (default False)
        Clip off all zero rows and columns
    mtype: "[data(default), tier, var, mean]"
        Alevin's matrix type to load into memory
    '''
    if not os.path.isdir(base_location):
        print("{} is not a directory".format( base_location ), file=sys.stderr)
        sys.exit(1)

    base_location = os.path.join(base_location, "alevin")

    print(base_location, file=sys.stderr)
    if not os.path.exists(base_location):
        print("{} directory doesn't exist".format( base_location ), file=sys.stderr)
        sys.exit(1)

    data_type = "f"
    if mtype == "data":
        quant_file = os.path.join(base_location, "quants_mat.gz")
    elif mtype == "tier":
        data_type = "B"
        quant_file = os.path.join(base_location, "quants_tier_mat.gz")
    elif mtype == "mean":
        quant_file = os.path.join(base_location, "quants_mean_mat.gz")
    elif mtype == "var":
        quant_file = os.path.join(base_location, "quants_var_mat.gz")
    else:
        print("wrong mtype:".format( mtype ), file=sys.stderr)
        sys.exit(1)

    if not os.path.exists(quant_file):
        print("quant file {} doesn't exist".format( quant_file ))
        sys.exit(1)

    if mtype in ["mean", "var"]:
        cb_file = os.path.join(base_location, "quants_boot_rows.txt")
    else:
        cb_file = os.path.join(base_location, "quants_mat_rows.txt")

    if not os.path.exists(cb_file):
        print("quant file's index: {} doesn't exist".format( cb_file ), file=sys.stderr)
        sys.exit(1)

    gene_file = os.path.join(base_location, "quants_mat_cols.txt")

    if not os.path.exists(gene_file):
        print("quant file's header: {} doesn't exist".format(gene_file), file=sys.stderr)
        sys.exit(1)

    cb_names = pd.read_csv(cb_file, header=None)[0].values
    gene_names = pd.read_csv(gene_file, header=None)[0].values
    num_genes = len(gene_names)
    num_cbs = len(cb_names)
    num_entries = int(np.ceil(num_genes/8))

    if rmode == "rust":
        print("Using rust mode with {} rows and {} columns".format(num_cbs, num_genes), file=sys.stderr)
        mat = sce.read_quants(quant_file, num_cbs, num_genes)
        umi_matrix = csr_matrix((mat[2], mat[1], mat[0]), shape=(num_cbs, num_genes)).todense()
    else:
        with gzip.open( quant_file ) as f:
            line_count = 0
            tot_umi_count = 0
            umi_matrix = []

            header_struct = Struct( "B" * num_entries)
            while True:
                line_count += 1
                if line_count%100 == 0:
                    print ("\r Done reading " + str(line_count) + " cells", end= "", file=sys.stderr)
                    sys.stdout.flush()
                try:
                    num_exp_genes = 0
                    exp_counts = header_struct.unpack_from( f.read(header_struct.size) )
                    for exp_count in exp_counts:
                        num_exp_genes += bin(exp_count).count("1")

                    data_struct = Struct( data_type * num_exp_genes)
                    sparse_cell_counts_vec = list(data_struct.unpack_from( f.read(data_struct.size) ))[::-1]
                    cell_umi_counts = sum(sparse_cell_counts_vec)

                except:
                    print ("\nRead total " + str(line_count-1) + " cells", file=sys.stderr)
                    print ("Found total " + str(tot_umi_count) + " reads", file=sys.stderr)
                    break

                if cell_umi_counts > 0.0:
                    tot_umi_count += cell_umi_counts

                    cell_counts_vec = []
                    for exp_count in exp_counts:
                        for bit in format(exp_count, '08b'):
                            if len(cell_counts_vec) >= num_genes:
                                break

                            if bit == '0':
                                cell_counts_vec.append(0.0)
                            else:
                                abund = sparse_cell_counts_vec.pop()
                                cell_counts_vec.append(abund)

                    if len(sparse_cell_counts_vec) > 0:
                        print("Failure in consumption of data", file=sys.stderr)
                        print("left with {} entry(ies)".format(len(sparse_cell_counts_vec)))
                    umi_matrix.append( cell_counts_vec )
                else:
                    print("Found a CB with no read count, something is wrong", file=sys.stderr)
                    sys.exit(1)

    alv = pd.DataFrame(umi_matrix)
    alv.columns = gene_names
    alv.index = cb_names
    if clipped:
        alv = alv.loc[:, (alv != 0).any(axis=0)]

    return alv

def main():
    df = read_quants_bin(sys.argv[1])
    df.to_csv(sys.stdout, sep="\t")

if __name__ == "__main__":
    main()

I ran it like this:

python convert_quant_mat.py alevin_output/1a_021722_Saliva10XSC_1a_S1/ > test.tsv
alevin_output/1a_021722_Saliva10XSC_1a_S1/alevin
 Done reading 500 cells
Read total 562 cells
Found total 203935.0000764782 reads

@k3yavi since this issue seems to be quite fatal, it might be useful having import sce loaded within the function only if rmode='rust'. I was going to give up on this package if the above didn't work. I tried pip install, conda install, regular python install and nothing worked. Glad I was able to still convert my files with this code.