zktuong / dandelion

dandelion - A single cell BCR/TCR V(D)J-seq analysis package for 10X Chromium 5' data
https://sc-dandelion.readthedocs.io/
GNU Affero General Public License v3.0
108 stars 25 forks source link

ddl.tl.define_clones(vdjx, key_added="changeo_clone_id", model="hh_s5f") #273

Closed hai178912522 closed 11 months ago

hai178912522 commented 1 year ago

Description of the bug

import os
os.environ['R_HOME'] = '/data_disk/ST01/Software/conda_env/scirpy/lib/R'
os.environ['PATH'] = '/data_disk/ST01/Software/conda_env/scirpy/lib/R/bin:' + os.environ['PATH']
import warnings

warnings.filterwarnings(
    "ignore",
    ".*IProgress not found*",
)
warnings.simplefilter(action="ignore", category=FutureWarning)

from palmotif import compute_motif, svg_logo
import scanpy as sc
import dandelion as ddl
import scirpy as ir
import pandas as pd
import numpy as np
import seaborn as sb

import os

import matplotlib as mpl
import matplotlib.pyplot as plt

warnings.simplefilter(action="ignore", category=pd.errors.DtypeWarning)
sc.logging.print_versions()
-----
anndata     0.9.1
scanpy      1.9.3
-----
Bio                         1.81
Levenshtein                 0.21.0
PIL                         9.5.0
adjustText                  0.8
airr                        1.4.1
argcomplete                 NA
asttokens                   NA
backcall                    0.2.0
cffi                        1.15.1
changeo                     1.3.0
colorama                    0.4.6
comm                        0.1.3
cycler                      0.10.0
cython_runtime              NA
dandelion                   0.3.1
dateutil                    2.8.2
debugpy                     1.6.7
decorator                   5.1.1
distance                    NA
executing                   1.2.0
fontTools                   4.39.4
h5py                        3.8.0
igraph                      0.10.4
importlib_resources         NA
ipykernel                   6.23.1
jedi                        0.18.2
jinja2                      3.1.2
joblib                      1.2.0
kiwisolver                  1.4.4
leidenalg                   0.9.1
llvmlite                    0.40.0
markupsafe                  2.1.3
matplotlib                  3.7.1
mizani                      0.9.0
mpl_toolkits                NA
natsort                     8.3.1
networkx                    3.1
numba                       0.57.0
numpy                       1.24.3
packaging                   23.1
palettable                  3.3.3
palmotif                    NA
pandas                      1.4.4
parasail                    1.3.3
parso                       0.8.3
patsy                       0.5.3
pexpect                     4.8.0
pickleshare                 0.7.5
pkg_resources               NA
platformdirs                3.5.1
plotnine                    0.10.1
polyleven                   NA
presto                      0.7.1
prompt_toolkit              3.0.38
psutil                      5.9.5
ptyprocess                  0.7.0
pure_eval                   0.2.2
pydev_ipython               NA
pydevconsole                NA
pydevd                      2.9.5
pydevd_file_utils           NA
pydevd_plugins              NA
pydevd_tracing              NA
pygments                    2.15.1
pyparsing                   3.0.9
pytz                        2023.3
rapidfuzz                   2.15.1
rpy2                        3.5.11
scipy                       1.10.1
scirpy                      0.12.2
seaborn                     0.12.2
session_info                1.0.0
setuptools                  67.7.2
setuptools_scm              NA
six                         1.16.0
sklearn                     1.2.2
stack_data                  0.6.2
statsmodels                 0.14.0
svgwrite                    1.4.3
texttable                   1.6.7
threadpoolctl               3.1.0
tornado                     6.3.2
tqdm                        4.65.0
tracerlib                   NA
traitlets                   5.9.0
typing_extensions           NA
tzlocal                     NA
wcwidth                     0.2.6
yaml                        6.0
yamlordereddictloader       NA
zipp                        NA
zmq                         25.1.0
zoneinfo                    NA
-----
IPython             8.14.0
jupyter_client      8.2.0
jupyter_core        5.3.0
-----
Python 3.9.16 | packaged by conda-forge | (main, Feb  1 2023, 21:39:03) [GCC 11.3.0]
Linux-5.15.0-71-generic-x86_64-with-glibc2.35
-----
Session information updated at 2023-06-05 16:04
path_gex = "/data_disk/ST01/User/zhanghh/Scripts/jupyterlab/sc-best-practices/data/BCR_01_preprocessed.h5ad"
adata_bcr = sc.read(path_gex)
adata = adata_bcr[adata_bcr.obs["patient_id"].isin(["COVID-064", "COVID-014"])].copy()
vdjx = ddl.from_scirpy(adata)
vdjx
WARNING: Non-standard locus name ignored: Multi 
Dandelion class object with n_obs = 10049 and n_contigs = 23320
    data: 'sequence_id', 'sequence', 'rev_comp', 'productive', 'v_call', 'd_call', 'j_call', 'sequence_alignment', 'germline_alignment', 'junction', 'junction_aa', 'v_cigar', 'd_cigar', 'j_cigar', 'c_call', 'consensus_count', 'duplicate_count', 'locus', 'cell_id', 'multi_chain', 'patient_id', 'is_cell', 'receptor_subtype', 'chain_pairing', 'receptor_type', 'high_confidence', 'rearrangement_status'
    metadata: 'locus_VDJ', 'locus_VJ', 'productive_VDJ', 'productive_VJ', 'v_call_VDJ', 'd_call_VDJ', 'j_call_VDJ', 'v_call_VJ', 'j_call_VJ', 'c_call_VDJ', 'c_call_VJ', 'junction_VDJ', 'junction_VJ', 'junction_aa_VDJ', 'junction_aa_VJ', 'v_call_B_VDJ', 'd_call_B_VDJ', 'j_call_B_VDJ', 'v_call_B_VJ', 'j_call_B_VJ', 'c_call_B_VDJ', 'c_call_B_VJ', 'productive_B_VDJ', 'productive_B_VJ', 'duplicate_count_B_VDJ', 'duplicate_count_B_VJ', 'v_call_VDJ_main', 'v_call_VJ_main', 'd_call_VDJ_main', 'j_call_VDJ_main', 'j_call_VJ_main', 'c_call_VDJ_main', 'c_call_VJ_main', 'v_call_B_VDJ_main', 'd_call_B_VDJ_main', 'j_call_B_VDJ_main', 'v_call_B_VJ_main', 'j_call_B_VJ_main', 'isotype', 'isotype_status', 'locus_status', 'chain_status', 'rearrangement_status_VDJ', 'rearrangement_status_VJ'
vdjx.data["v_call"].replace("", np.nan, inplace=True)
vdjx.data.dropna(subset=["v_call"], inplace=True)

vdjx.data["j_call"].replace("", np.nan, inplace=True)
vdjx.data.dropna(subset=["j_call"], inplace=True)

vdjx.data["junction_aa"].replace("", np.nan, inplace=True)
vdjx.data.dropna(subset=["junction_aa"], inplace=True)

vdjx.data["junction_length"] = [len(a) for a in vdjx.data["junction_aa"]]
ddl.pp.calculate_threshold(vdjx, model="hh_s5f", plot=False)
/data_disk/ST01/Software/conda_env/scirpy/lib/python3.9/site-packages/rpy2/robjects/pandas2ri.py:65: UserWarning: Error while trying to convert the column "productive". Fall back to string conversion. The error is: Series can only be of one type, or None (and here we have <class 'str'> and <class 'bool'>). If happening with a pandas DataFrame the method infer_objects() will normalize data types before conversion.
R[write to console]: Error in (function (db, sequenceColumn = "junction", vCallColumn = "v_call",  : 
  The locus column contains invalid loci annotations.
vdjx.threshold
0.13828874822670625
ddl.tl.define_clones(vdjx, key_added="changeo_clone_id", model="hh_s5f")
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[10], line 1
----> 1 ddl.tl.define_clones(vdjx, key_added="changeo_clone_id", model="hh_s5f")

File /data_disk/ST01/Software/conda_env/scirpy/lib/python3.9/site-packages/dandelion/tools/_tools.py:1273, in define_clones(vdj_data, dist, action, model, norm, doublets, fileformat, ncpu, dirs, outFilePrefix, key_added, verbose)
   1270     return (heavy_df, light_df)
   1272 logg.info("Running command: %s\n" % (" ".join(cmd)))
-> 1273 run(cmd)
   1275 h_df, l_df = _lightCluster(
   1276     h_file2, l_file, outfile, doublets=doublets, fileformat=fileformat
   1277 )
   1279 h_df = load_data(h_df)

File /data_disk/ST01/Software/conda_env/scirpy/lib/python3.9/subprocess.py:505, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
    502     kwargs['stdout'] = PIPE
    503     kwargs['stderr'] = PIPE
--> 505 with Popen(*popenargs, **kwargs) as process:
    506     try:
    507         stdout, stderr = process.communicate(input, timeout=timeout)

File /data_disk/ST01/Software/conda_env/scirpy/lib/python3.9/subprocess.py:951, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask)
    947         if self.text_mode:
    948             self.stderr = io.TextIOWrapper(self.stderr,
    949                     encoding=encoding, errors=errors)
--> 951     self._execute_child(args, executable, preexec_fn, close_fds,
    952                         pass_fds, cwd, env,
    953                         startupinfo, creationflags, shell,
    954                         p2cread, p2cwrite,
    955                         c2pread, c2pwrite,
    956                         errread, errwrite,
    957                         restore_signals,
    958                         gid, gids, uid, umask,
    959                         start_new_session)
    960 except:
    961     # Cleanup if the child failed starting.
    962     for f in filter(None, (self.stdin, self.stdout, self.stderr)):

File /data_disk/ST01/Software/conda_env/scirpy/lib/python3.9/subprocess.py:1821, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session)
   1819     if errno_num != 0:
   1820         err_msg = os.strerror(errno_num)
-> 1821     raise child_exception_type(errno_num, err_msg, err_filename)
   1822 raise child_exception_type(err_msg)

FileNotFoundError: [Errno 2] No such file or directory: 'DefineClones.py'

### Minimal reproducible example

```python
ddl.tl.define_clones(vdjx, key_added="changeo_clone_id", model="hh_s5f")
```

The error message produced by the code above

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[10], line 1
----> 1 ddl.tl.define_clones(vdjx, key_added="changeo_clone_id", model="hh_s5f")

File /data_disk/ST01/Software/conda_env/scirpy/lib/python3.9/site-packages/dandelion/tools/_tools.py:1273, in define_clones(vdj_data, dist, action, model, norm, doublets, fileformat, ncpu, dirs, outFilePrefix, key_added, verbose)
   1270     return (heavy_df, light_df)
   1272 logg.info("Running command: %s\n" % (" ".join(cmd)))
-> 1273 run(cmd)
   1275 h_df, l_df = _lightCluster(
   1276     h_file2, l_file, outfile, doublets=doublets, fileformat=fileformat
   1277 )
   1279 h_df = load_data(h_df)

File /data_disk/ST01/Software/conda_env/scirpy/lib/python3.9/subprocess.py:505, in run(input, capture_output, timeout, check, *popenargs, **kwargs)
    502     kwargs['stdout'] = PIPE
    503     kwargs['stderr'] = PIPE
--> 505 with Popen(*popenargs, **kwargs) as process:
    506     try:
    507         stdout, stderr = process.communicate(input, timeout=timeout)

File /data_disk/ST01/Software/conda_env/scirpy/lib/python3.9/subprocess.py:951, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask)
    947         if self.text_mode:
    948             self.stderr = io.TextIOWrapper(self.stderr,
    949                     encoding=encoding, errors=errors)
--> 951     self._execute_child(args, executable, preexec_fn, close_fds,
    952                         pass_fds, cwd, env,
    953                         startupinfo, creationflags, shell,
    954                         p2cread, p2cwrite,
    955                         c2pread, c2pwrite,
    956                         errread, errwrite,
    957                         restore_signals,
    958                         gid, gids, uid, umask,
    959                         start_new_session)
    960 except:
    961     # Cleanup if the child failed starting.
    962     for f in filter(None, (self.stdin, self.stdout, self.stderr)):

File /data_disk/ST01/Software/conda_env/scirpy/lib/python3.9/subprocess.py:1821, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session)
   1819     if errno_num != 0:
   1820         err_msg = os.strerror(errno_num)
-> 1821     raise child_exception_type(errno_num, err_msg, err_filename)
   1822 raise child_exception_type(err_msg)

FileNotFoundError: [Errno 2] No such file or directory: 'DefineClones.py'

OS information

ubuntu

Version information

dandelion==0.3.1 pandas==1.4.4 numpy==1.24.3 matplotlib==3.7.1 networkx==3.1 scipy==1.10.1

scanpy==1.9.3 anndata==0.9.1 umap==0.5.3 numpy==1.24.3 scipy==1.10.1 pandas==1.4.4 scikit-learn==1.2.2 statsmodels==0.14.0 python-igraph==0.10.4 pynndescent==0.5.10

Additional context

No response

zktuong commented 1 year ago

Hi, the error message is telling you that `DefineClones.py` cannot be found - this is an executable script provided by changeo. As I'm not sure how you installed changeo, I can only suggest that you uninstall and reinstall changeo, perhaps from PyPI.

Ngort commented 1 year ago

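This worked for me (though it may be exactly what you already did). Run `which DefineClones.py` in a shell, and then append the directory it reports to the interpreter's `PATH` through `os.environ` before calling `ddl.tl.define_clones`, e.g. `os.environ["PATH"] += os.pathsep + "/path/to/env/bin"`.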