Closed ColeWunderlich closed 3 years ago
It might also be worth noting that after installing rust on my machine, then calling pip3 git+https://github.com/k3yavi/vpolo.git
everything worked well.
Awesome ! thanks for raising the issue.
Work around. Just load it manually without sce.
I created an executable called convert_data_mat.py
from __future__ import print_function
from collections import defaultdict
from struct import Struct
import numpy as np
import pandas as pd
import gzip
import sys
import os
#import sce
from scipy.io import mmread
from scipy.sparse import csr_matrix
# https://github.com/k3yavi/vpolo/blob/406ade967bf418eae2e673717e6eb6be0f4163f9/vpolo/alevin/parser.py#L12
def read_quants_bin(base_location, clipped=False, mtype="data", rmode=None):
'''
Read the quants Sparse Binary output of Alevin and generates a dataframe
Parameters
----------
base_location: string
Path to the folder containing the output of the alevin run
clipped: bool (default False)
Clip off all zero rows and columns
mtype: "[data(default), tier, var, mean]"
Alevin's matrix type to load into memory
'''
if not os.path.isdir(base_location):
print("{} is not a directory".format( base_location ), file=sys.stderr)
sys.exit(1)
base_location = os.path.join(base_location, "alevin")
print(base_location, file=sys.stderr)
if not os.path.exists(base_location):
print("{} directory doesn't exist".format( base_location ), file=sys.stderr)
sys.exit(1)
data_type = "f"
if mtype == "data":
quant_file = os.path.join(base_location, "quants_mat.gz")
elif mtype == "tier":
data_type = "B"
quant_file = os.path.join(base_location, "quants_tier_mat.gz")
elif mtype == "mean":
quant_file = os.path.join(base_location, "quants_mean_mat.gz")
elif mtype == "var":
quant_file = os.path.join(base_location, "quants_var_mat.gz")
else:
print("wrong mtype:".format( mtype ), file=sys.stderr)
sys.exit(1)
if not os.path.exists(quant_file):
print("quant file {} doesn't exist".format( quant_file ))
sys.exit(1)
if mtype in ["mean", "var"]:
cb_file = os.path.join(base_location, "quants_boot_rows.txt")
else:
cb_file = os.path.join(base_location, "quants_mat_rows.txt")
if not os.path.exists(cb_file):
print("quant file's index: {} doesn't exist".format( cb_file ), file=sys.stderr)
sys.exit(1)
gene_file = os.path.join(base_location, "quants_mat_cols.txt")
if not os.path.exists(gene_file):
print("quant file's header: {} doesn't exist".format(gene_file), file=sys.stderr)
sys.exit(1)
cb_names = pd.read_csv(cb_file, header=None)[0].values
gene_names = pd.read_csv(gene_file, header=None)[0].values
num_genes = len(gene_names)
num_cbs = len(cb_names)
num_entries = int(np.ceil(num_genes/8))
if rmode == "rust":
print("Using rust mode with {} rows and {} columns".format(num_cbs, num_genes), file=sys.stderr)
mat = sce.read_quants(quant_file, num_cbs, num_genes)
umi_matrix = csr_matrix((mat[2], mat[1], mat[0]), shape=(num_cbs, num_genes)).todense()
else:
with gzip.open( quant_file ) as f:
line_count = 0
tot_umi_count = 0
umi_matrix = []
header_struct = Struct( "B" * num_entries)
while True:
line_count += 1
if line_count%100 == 0:
print ("\r Done reading " + str(line_count) + " cells", end= "", file=sys.stderr)
sys.stdout.flush()
try:
num_exp_genes = 0
exp_counts = header_struct.unpack_from( f.read(header_struct.size) )
for exp_count in exp_counts:
num_exp_genes += bin(exp_count).count("1")
data_struct = Struct( data_type * num_exp_genes)
sparse_cell_counts_vec = list(data_struct.unpack_from( f.read(data_struct.size) ))[::-1]
cell_umi_counts = sum(sparse_cell_counts_vec)
except:
print ("\nRead total " + str(line_count-1) + " cells", file=sys.stderr)
print ("Found total " + str(tot_umi_count) + " reads", file=sys.stderr)
break
if cell_umi_counts > 0.0:
tot_umi_count += cell_umi_counts
cell_counts_vec = []
for exp_count in exp_counts:
for bit in format(exp_count, '08b'):
if len(cell_counts_vec) >= num_genes:
break
if bit == '0':
cell_counts_vec.append(0.0)
else:
abund = sparse_cell_counts_vec.pop()
cell_counts_vec.append(abund)
if len(sparse_cell_counts_vec) > 0:
print("Failure in consumption of data", file=sys.stderr)
print("left with {} entry(ies)".format(len(sparse_cell_counts_vec)))
umi_matrix.append( cell_counts_vec )
else:
print("Found a CB with no read count, something is wrong", file=sys.stderr)
sys.exit(1)
alv = pd.DataFrame(umi_matrix)
alv.columns = gene_names
alv.index = cb_names
if clipped:
alv = alv.loc[:, (alv != 0).any(axis=0)]
return alv
def main():
df = read_quants_bin(sys.argv[1])
df.to_csv(sys.stdout, sep="\t")
if __name__ == "__main__":
main()
I ran it like this:
python convert_quant_mat.py alevin_output/1a_021722_Saliva10XSC_1a_S1/ > test.tsv
alevin_output/1a_021722_Saliva10XSC_1a_S1/alevin
Done reading 500 cells
Read total 562 cells
Found total 203935.0000764782 reads
@k3yavi since this issue seems to be quite fatal, it might be useful having import sce
loaded within the function only if rmode='rust'
. I was going to give up on this package if the above didn't work. I tried pip install, conda install, regular python install and nothing worked. Glad I was able to still convert my files with this code.
The PyPI version of vpolo appears to be out of date, sitting at
v0.2.4
.After installing vpolo using
pip3 install vpolo
, I get the following error when I try to use the matrix parser.My python knowledge is pretty rudimentary, but after poking around the package my guess is this is because the rust module is not installed/does not exist in this version.