EmptyDataError on Test Data

cap76 commented 7 months ago

Hi, I'm trying to get the newer versions of CPDB up and running (virtual environment using Python 3.9.6), however I am having some issues with the demo data. At the moment I'm following the example T1_Method1.ipynb in the Notebooks using data from the demo_data folder. The first part of the code runs fine (I needed to make a minor change to the line: list(adata.obs.index).sort() == list(metadata['barcode_sample']).sort(), as the metadata doesn't have a column 'barcode_sample'; it is instead named 'Cell'). Other than that, the other pre-run checks are all fine.

When I reach the section of code that runs CPDB, however, it starts throwing an EmptyDataError (see below). Do you have any suggestions about where things are falling down?

EmptyDataError Traceback (most recent call last) Cell In[12], line 3 1 from cellphonedb.src.core.methods import cpdb_analysis_method ----> 3 cpdb_results = cpdb_analysis_method.call( 4 cpdb_file_path = cpdb_file_path, # mandatory: CellphoneDB database zip file. 5 meta_file_path = meta_file_path, # mandatory: tsv file defining barcodes to cell label. 6 counts_file_path = counts_file_path, # mandatory: normalized count matrix - a path to the counts file, or an in-memory AnnData object 7 counts_data = 'hgnc_symbol', # defines the gene annotation in counts matrix. 8 microenvs_file_path = microenvs_file_path, # optional (default: None): defines cells per microenvironment. 9 score_interactions = True, # optional: whether to score interactions or not. 10 output_path = out_path, # Path to save results microenvs_file_path = None, 11 separator = '|', # Sets the string to employ to separate cells in the results dataframes "cellA|CellB". 12 threads = 5, # number of threads to use in the analysis. 13 threshold = 0.1, # defines the min % of cells expressing a gene for this to be employed in the analysis. 14 result_precision = 3, # Sets the rounding for the mean values in significan_means. 15 debug = False, # Saves all intermediate tables emplyed during the analysis in pkl format. 16 output_suffix = None # Replaces the timestamp in the output files by a user defined string in the (default: None) 17 )

File ~/Desktop/Code/cpdb/lib/python3.9/site-packages/cellphonedb/src/core/methods/cpdb_analysis_method.py:85, in call(cpdb_file_path, meta_file_path, counts_file_path, counts_data, output_path, microenvs_file_path, separator, threshold, result_precision, debug, output_suffix, score_interactions, threads) 80 raise MissingRequiredArgumentsException(description="All of the following arguments need to be provided: {}".format( 81 "cpdb_file_path, meta_file_path, counts_file_path, counts_data, output_path")) 83 # Load into memory CellphoneDB data 84 interactions, genes, complex_compositions, complexes, gene_synonym2gene_name, receptor2tfs = \ ---> 85 db_utils.get_interactions_genes_complex(cpdb_filepath) 87 # Load user files into memory 88 counts, meta, microenvs, degs, = file_utils.get_user_files( 89 counts_fp=counts_file_path, meta_fp=meta_file_path, microenvs_fp=microenvs_file_path, 90 gene_synonym2gene_name=gene_synonym2gene_name, counts_data=counts_data)

File ~/Desktop/Code/cpdb/lib/python3.9/site-packages/cellphonedb/utils/db_utils.py:94, in get_interactions_genes_complex(cpdb_file_path) 75 """ 76 Returns a tuple of four DataFrames containing data from /cellphonedb.zip. 77 (...) 91 - receptor2tfs: dict 92 """ 93 # Extract csv files from db_files_path/cellphonedb.zip into dbTableDFs ---> 94 dbTableDFs = extract_dataframes_from_db(cpdb_file_path) 95 # Convert dbTableDFs into interactions, genes, complex_composition, complex_expanded data frames 96 # and gene_synonym2gene_name dict 97 gene_synonym2gene_name = {}

File ~/Desktop/Code/cpdb/lib/python3.9/site-packages/cellphonedb/utils/db_utils.py:168, in extract_dataframes_from_db(cpdb_file_path) 166 file_handle = tuple[1] 167 dbg("Retrieving from zip file: " + file_name) --> 168 dfs[file_name.replace('.csv', '')] = pd.read_csv(file_handle) 169 duration = time.time() - start 170 dbg("Loaded DB into memory in " + str(round(duration, 2)) + "s")

File ~/Desktop/Code/cpdb/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1024, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend) 1011 kwds_defaults = _refine_defaults_read( 1012 dialect, 1013 delimiter, (...) 1020 dtype_backend=dtype_backend, 1021 ) 1022 kwds.update(kwds_defaults) -> 1024 return _read(filepath_or_buffer, kwds)

File ~/Desktop/Code/cpdb/lib/python3.9/site-packages/pandas/io/parsers/readers.py:618, in _read(filepath_or_buffer, kwds) 615 _validate_names(kwds.get("names", None)) 617 # Create the parser. --> 618 parser = TextFileReader(filepath_or_buffer, **kwds) 620 if chunksize or iterator: 621 return parser

File ~/Desktop/Code/cpdb/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1618, in TextFileReader.__init__(self, f, engine, **kwds) 1615 self.options["has_index_names"] = kwds["has_index_names"] 1617 self.handles: IOHandles | None = None -> 1618 self._engine = self._make_engine(f, self.engine)

File ~/Desktop/Code/cpdb/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1896, in TextFileReader._make_engine(self, f, engine) 1893 raise ValueError(msg) 1895 try: -> 1896 return mapping[engine](f, **self.options) 1897 except Exception: 1898 if self.handles is not None:

File ~/Desktop/Code/cpdb/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py:93, in CParserWrapper.__init__(self, src, **kwds) 90 if kwds["dtype_backend"] == "pyarrow": 91 # Fail here loudly instead of in cython after reading 92 import_optional_dependency("pyarrow") ---> 93 self._reader = parsers.TextReader(src, **kwds) 95 self.unnamed_cols = self._reader.unnamed_cols 97 # error: Cannot determine type of 'names'

File parsers.pyx:581, in pandas._libs.parsers.TextReader.__cinit__()

EmptyDataError: No columns to parse from file

datasome commented 7 months ago

Hi Christopher,

From the errors above it looks as if there's a problem with the cellphonedb.zip you're using. Did you perhaps generate the cellphonedb.zip file yourself from your input files, or are you using the latest one (https://github.com/ventolab/cellphonedb-data/blob/master/cellphonedb.zip)?

Best,

Robert.

cap76 commented 7 months ago

Brilliant. I think I was using the latest one, but had Safari set to auto-unzip; when I re-zipped it, CellphoneDB didn't seem to like the new .zip file. It now runs a bit further:

Reading user files... The following user files were loaded successfully: /Users/christopherpenfold/Desktop/Code/CellPhoneDB/DemoData/test.h5ad /Users/christopherpenfold/Desktop/Code/CellPhoneDB/DemoData/test_meta.txt /Users/christopherpenfold/Desktop/Code/CellPhoneDB/DemoData/test_microenviroments.txt

Before throwing an error:

AllCountsFilteredException Traceback (most recent call last) Cell In[44], line 3 1 from cellphonedb.src.core.methods import cpdb_analysis_method ----> 3 cpdb_results = cpdb_analysis_method.call( 4 cpdb_file_path = cpdb_file_path, # mandatory: CellphoneDB database zip file. 5 meta_file_path = meta_file_path, # mandatory: tsv file defining barcodes to cell label. 6 counts_file_path = counts_file_path, # mandatory: normalized count matrix - a path to the counts file, or an in-memory AnnData object 7 counts_data = 'hgnc_symbol', # defines the gene annotation in counts matrix. 8 microenvs_file_path = microenvs_file_path, # optional (default: None): defines cells per microenvironment. 9 score_interactions = True, # optional: whether to score interactions or not. 10 output_path = out_path, # Path to save results microenvs_file_path = None, 11 separator = '|', # Sets the string to employ to separate cells in the results dataframes "cellA|CellB". 12 threads = 5, # number of threads to use in the analysis. 13 threshold = 0.1, # defines the min % of cells expressing a gene for this to be employed in the analysis. 14 result_precision = 3, # Sets the rounding for the mean values in significan_means. 15 debug = True, # Saves all intermediate tables emplyed during the analysis in pkl format. 16 output_suffix = None # Replaces the timestamp in the output files by a user defined string in the (default: None) 17 )

File ~/Desktop/Code/cpdb/lib/python3.9/site-packages/cellphonedb/src/core/methods/cpdb_analysis_method.py:99, in call(cpdb_file_path, meta_file_path, counts_file_path, counts_data, output_path, microenvs_file_path, separator, threshold, result_precision, debug, output_suffix, score_interactions, threads) 96 counts, counts_relations = cpdb_statistical_analysis_helper.add_multidata_and_means_to_counts( 97 counts, genes, counts_data) 98 if counts.empty: ---> 99 raise AllCountsFilteredException(hint='Are you using human data?') 101 interactions_filtered, counts_filtered, complex_composition_filtered = \ 102 cpdb_statistical_analysis_helper.prefilters(interactions_reduced, 103 counts, 104 complexes, 105 complex_compositions) 106 if interactions_filtered.empty:

AllCountsFilteredException: All counts filtered

cap76 commented 7 months ago

Okay, my last post was a known issue. Seems to be running now.

ventolab / CellphoneDB

EmptyDataError on Test Data #173