ventolab / CellphoneDB

CellPhoneDB can be used to search for a particular ligand/receptor, or interrogate your own HUMAN single-cell transcriptomics data.
https://www.cellphonedb.org/
MIT License
325 stars 51 forks source link

Invalid Counts data, again #205

Closed alohaia closed 1 week ago

alohaia commented 1 week ago

Basic information

❯ pip show cellphonedb
Name: cellphonedb
Version: 5.0.1
Summary: Inferring cell-cell communication.
Home-page: https://github.com/ventolab/cellphonedb
Author: The CellPhoneDB development team
Author-email: contact@cellphonedb.org
License: MIT
Location: /home/aloha/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages
Requires: anndata, geosketch, ktplotspy, numpy, numpy-groupies, pandas, pytest, requests, scanpy, scikit-learn, tqdm
Required-by:

Reproduction

meta_file_path = 'data/metadata.txt'
counts_file_path = 'data/counts.txt'

out_path = 'results/method1'

metadata = pd.read_csv(meta_file_path, sep = '\t')
metadata.head(10)

image

count_data = pd.read_csv(counts_file_path, sep = '\t')
count_data.head(10) # normalized with log1p, not all zeros

image

list(count_data.columns[1:]).sort() == list(metadata['cell_type']).sort()
True

Then I ran the analysis using method 1.

from cellphonedb.src.core.methods import cpdb_analysis_method

cpdb_results = cpdb_analysis_method.call(
    cpdb_file_path = cpdb_file_path,           # mandatory: CellphoneDB database zip file.
    meta_file_path = meta_file_path,           # mandatory: tsv file defining barcodes to cell label.
    counts_file_path = counts_file_path,       # mandatory: normalized count matrix - a path to the counts file, or an in-memory AnnData object
    counts_data = 'hgnc_symbol',               # defines the gene annotation in counts matrix.
    # microenvs_file_path = microenvs_file_path, # optional (default: None): defines cells per microenvironment.
    score_interactions = True,                 # optional: whether to score interactions or not. 
    output_path = out_path,                    # Path to save results    microenvs_file_path = None,
    separator = '|',                           # Sets the string to employ to separate cells in the results dataframes "cellA|CellB".
    threads = 5,                               # number of threads to use in the analysis.
    threshold = 0.01,                           # defines the min % of cells expressing a gene for this to be employed in the analysis.
    result_precision = 3,                      # Sets the rounding for the mean values in significan_means.
    debug = True,                             # Saves all intermediate tables emplyed during the analysis in pkl format.
    output_suffix = None                       # Replaces the timestamp in the output files by a user defined string in the  (default: None)
)

Output:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/cellphonedb/src/core/preprocessors/counts_preprocessors.py:27, in counts_preprocessor(counts, meta)
     26     if np.any(counts.dtypes.values != np.dtype('float32')):
---> 27         counts = counts.astype(np.float32)  # type: pd.DataFrame
     28 except Exception:

File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/pandas/core/generic.py:6643, in NDFrame.astype(self, dtype, copy, errors)
   6641 else:
   6642     # else, only a single dtype is given
-> 6643     new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   6644     res = self._constructor_from_mgr(new_data, axes=new_data.axes)

File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/pandas/core/internals/managers.py:430, in BaseBlockManager.astype(self, dtype, copy, errors)
    428     copy = False
--> 430 return self.apply(
    431     "astype",
    432     dtype=dtype,
    433     copy=copy,
    434     errors=errors,
    435     using_cow=using_copy_on_write(),
    436 )

File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/pandas/core/internals/managers.py:363, in BaseBlockManager.apply(self, f, align_keys, **kwargs)
    362 else:
--> 363     applied = getattr(b, f)(**kwargs)
    364 result_blocks = extend_blocks(applied, result_blocks)

File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/pandas/core/internals/blocks.py:758, in Block.astype(self, dtype, copy, errors, using_cow, squeeze)
    756     values = values[0, :]  # type: ignore[call-overload]
--> 758 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
    760 new_values = maybe_coerce_values(new_values)

File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/pandas/core/dtypes/astype.py:237, in astype_array_safe(values, dtype, copy, errors)
    236 try:
--> 237     new_values = astype_array(values, dtype, copy=copy)
    238 except (ValueError, TypeError):
    239     # e.g. _astype_nansafe can fail on object-dtype of strings
    240     #  trying to convert to float

File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/pandas/core/dtypes/astype.py:182, in astype_array(values, dtype, copy)
    181 else:
--> 182     values = _astype_nansafe(values, dtype, copy=copy)
    184 # in pandas we don't store numpy str dtypes, so convert to object

File ~[/](http://localhost:8888/lab/tree/cellphone_test.ipynb)Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/pandas/core/dtypes/astype.py:133, in _astype_nansafe(arr, dtype, copy, skipna)
    131 if copy or arr.dtype == object or dtype == object:
    132     # Explicit copy, or required since NumPy can't view from / to object.
--> 133     return arr.astype(dtype, copy=True)
    135 return arr.astype(dtype, copy=copy)

ValueError: could not convert string to float: 'Xkr4'

During handling of the above exception, another exception occurred:

ParseCountsException                      Traceback (most recent call last)
Cell In[42], line 3
      1 from cellphonedb.src.core.methods import cpdb_analysis_method
----> 3 cpdb_results = cpdb_analysis_method.call(
      4     cpdb_file_path = cpdb_file_path,           # mandatory: CellphoneDB database zip file.
      5     meta_file_path = meta_file_path,           # mandatory: tsv file defining barcodes to cell label.
      6     counts_file_path = counts_file_path,       # mandatory: normalized count matrix - a path to the counts file, or an in-memory AnnData object
      7     counts_data = 'hgnc_symbol',               # defines the gene annotation in counts matrix.
      8     # microenvs_file_path = microenvs_file_path, # optional (default: None): defines cells per microenvironment.
      9     score_interactions = True,                 # optional: whether to score interactions or not. 
     10     output_path = out_path,                    # Path to save results    microenvs_file_path = None,
     11     separator = '|',                           # Sets the string to employ to separate cells in the results dataframes "cellA|CellB".
     12     threads = 5,                               # number of threads to use in the analysis.
     13     threshold = 0.01,                           # defines the min % of cells expressing a gene for this to be employed in the analysis.
     14     result_precision = 3,                      # Sets the rounding for the mean values in significan_means.
     15     debug = True,                             # Saves all intermediate tables emplyed during the analysis in pkl format.
     16     output_suffix = None                       # Replaces the timestamp in the output files by a user defined string in the  (default: None)
     17 )

File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/cellphonedb/src/core/methods/cpdb_analysis_method.py:88, in call(cpdb_file_path, meta_file_path, counts_file_path, counts_data, output_path, microenvs_file_path, separator, threshold, result_precision, debug, output_suffix, score_interactions, threads)
     84 interactions, genes, complex_compositions, complexes, gene_synonym2gene_name, receptor2tfs = \
     85     db_utils.get_interactions_genes_complex(cpdb_file_path)
     87 # Load user files into memory
---> 88 counts, meta, microenvs, degs, _ = file_utils.get_user_files(
     89     counts=counts_file_path, meta_fp=meta_file_path, microenvs_fp=microenvs_file_path,
     90     gene_synonym2gene_name=gene_synonym2gene_name, counts_data=counts_data)
     92 # get reduced interactions (drop duplicates)
     93 interactions_reduced = interactions[['multidata_1_id', 'multidata_2_id']].drop_duplicates()

File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/cellphonedb/utils/file_utils.py:446, in get_user_files(counts, meta_fp, microenvs_fp, degs_fp, active_tfs_fp, gene_synonym2gene_name, counts_data)
    444 loaded_user_files.append(meta_fp)
    445 # Ensure that counts values are of type float32, and that all cells in meta exist in counts
--> 446 counts = counts_preprocessors.counts_preprocessor(counts, meta)
    447 if microenvs_fp:
    448     microenvs = _load_microenvs(microenvs_fp, meta)

File ~/Desktop/TLS/cellchat/SX12/cpdb/lib/python3.12/site-packages/cellphonedb/src/core/preprocessors/counts_preprocessors.py:29, in counts_preprocessor(counts, meta)
     27         counts = counts.astype(np.float32)  # type: pd.DataFrame
     28 except Exception:
---> 29     raise ParseCountsException
     31 meta.index = meta.index.astype(str)
     33 if np.any(~meta.index.isin(counts.columns)):

ParseCountsException: Invalid Counts data
alohaia commented 1 week ago

I finally found out that CellPhoneDB is for HUMAN data only...

ktroule commented 1 week ago

Hi.

As the error indicates (ValueError: could not convert string to float: 'Xkr4'), your first column contains strings; these should be stored in the index of the data frame. As a proxy, you can convert your mouse genes to human ones by capitalizing them (e.g. Xkr4 → XKR4).

Regards