vib-singlecell-nf / vsn-pipelines

A repository of pipelines for single-cell data in Nextflow DSL2
GNU General Public License v3.0
74 stars 30 forks source link

Scrublet: ValueError: columns overlap but no suffix specified: Index(['nGene', 'nUMI'], dtype='object') [BUG] #353

Open cbravo93 opened 2 years ago

cbravo93 commented 2 years ago

Describe the bug

Error executing process > 'single_sample_scrublet:SCRUBLET__DOUBLET_REMOVAL:SC__SCRUBLET__DOUBLET_DETECTION_REPORT (1)'                                                                                                     

Caused by:                                                                                                                                                                                                                  
  Process `single_sample_scrublet:SCRUBLET__DOUBLET_REMOVAL:SC__SCRUBLET__DOUBLET_DETECTION_REPORT (1)` terminated with an error exit status (1)                                                                            

Command executed:                                                                                                                                                                                                           

  papermill sc_doublet_detection_report.ipynb             --report-mode             snare_mouse_cortex_rna.SC_Scrublet_doublet_detection_report.ipynb                       -p SCRUBLET_OBJECT_FILE snare_mouse_
cortex_rna.SC__SCRUBLET__DOUBLET_DETECTION.ScrubletObject.pklz             -p H5AD_WITH_SCRUBLET_INFO snare_mouse_cortex_rna.SCRUBLET.SC__ANNOTATE_BY_CELL_METADATA.h5ad             -p H5AD_WITH_DIM_RED snare_mouse_cortex
_rna.SC__SCANPY__PARAM_EXPLORE_MARKER_GENES.leiden_0.6.h5ad                     -p WORKFLOW_MANIFEST '{"nextflowVersion":"!20.04.1","defaultBranch":"master","version":"0.24.0","homePage":"https://github.com/vib-singlecel
l-nf/vsn-pipelines","gitmodules":null,"description":"A repository of pipelines for single-cell data in Nextflow DSL2","name":"vib-singlecell-nf/vsn-pipelines","mainScript":"main.nf","author":null}'                   -p W
ORKFLOW_PARAMETERS '{"global":{"project_name":"snare_mouse_cortex","outdir":"out","species":"mouse","genome":{"assembly":"mm10"},"seed":240},"misc":{"test":{"enabled":false},"manifestAsJSON":"{\"nextflowVersion\":\"!20.0
4.1\",\"defaultBranch\":\"master\",\"version\":\"0.24.0\",\"homePage\":\"https://github.com/vib-singlecell-nf/vsn-pipelines\",\"gitmodules\":null,\"description\":\"A repository of pipelines for single-cell data in Nextfl
ow DSL2\",\"name\":\"vib-singlecell-nf/vsn-pipelines\",\"mainScript\":\"main.nf\",\"author\":null}"},"utils":{"container":"vibsinglecellnf/utils:0.3.0","publish":{"compressionLevel":6,"annotateWithBatchVariableName":fals
e}},"sc":{"file_converter":{"off":"h5ad","tagCellWithSampleId":true,"useFilteredMatrix":true,"makeVarIndexUnique":false},"scanpy":{"container":"vibsinglecellnf/scanpy:0.5.2","report":{"annotations_to_plot":[]},"feature_s
election":{"report_ipynb":"/src/scanpy/bin/reports/sc_select_variable_genes_report.ipynb","method":"mean_disp_plot","minMean":0.0125,"maxMean":3,"minDisp":0.5,"off":"h5ad"},"feature_scaling":{"method":"zscore_scale","max
SD":10,"off":"h5ad"},"neighborhood_graph":{"off":"h5ad"},"dim_reduction":{"report_ipynb":"/src/scanpy/bin/reports/sc_dim_reduction_report.ipynb","pca":{"method":"pca","off":"h5ad"},"umap":{"method":"umap","off":"h5ad"},"
tsne":{"method":"tsne","off":"h5ad"}},"clustering":{"preflight_checks":true,"report_ipynb":"/src/scanpy/bin/reports/sc_clustering_report.ipynb","method":"leiden","resolutions":[0.3,0.6,0.9,1.2],"off":"h5ad"},"marker_gene
s":{"method":"wilcoxon","ngenes":0,"groupby":"leiden","off":"h5ad"},"filter":{"report_ipynb":"/src/scanpy/bin/reports/sc_filter_qc_report.ipynb","cellFilterStrategy":"fixedthresholds","cellFilterMinNCounts":800,"cellFilt
erMaxPercentMito":0.05,"geneFilterMinNCells":3,"off":"h5ad","outdir":"out"},"data_transformation":{"method":"log1p","off":"h5ad"},"normalization":{"method":"cpx","countsPerCellAfter":10000,"off":"h5ad"}},"scope":{"genome":"mm10","tree":{"level_1":"snare_mouse_cortex","level_2":"rna-vsn","level_3":""}},"scrublet":{"container":"vibsinglecellnf/scrublet:0.1.4","doublet_detection":{"report_ipynb":"/src/scrublet/bin/reports/sc_doublet_detection_report.ipynb","useVariableFeatures":"False","technology":"10x","off":"h5ad"},"cell_annotate":{"off":"h5ad","method":"obo","indexColumnName":"index"},"cell_filter":{"off":"h5ad","method":"internal","filters":[{"id":"NO_DOUBLETS","sampleColumnName":"sample_id","filterColumnName":"scrublet__predicted_doublets","valuesToKeepFromFilterColumn":["False"]}]}}},"data":{"loom":{"file_paths":"/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/SNARE_mouse/data/rna/snare_mouse_cortex_rna.loom","suffix":".loom"}},"pcacv":{"container":"vibsinglecellnf/pcacv:0.2.0","find_optimal_npcs":{"accessor":"@assays$RNA@scale.data"}}}'

Command exit status:
  1

Command output:
  (empty)

Command error:
    File "/opt/venv/lib/python3.7/site-packages/papermill/execute.py", line 222, in raise_for_execution_errors 
      raise error
  papermill.exceptions.PapermillExecutionError: 
  ---------------------------------------------------------------------------
  Exception encountered at "In [8]":
  ---------------------------------------------------------------------------
  ValueError                                Traceback (most recent call last)
  <ipython-input-8-7dd5d14d7bd7> in <module>
        3 )
        4 adata_dr.obs = adata_dr.obs.join(
  ----> 5     other=adata_sl_obs
        6 )

  /opt/venv/lib/python3.7/site-packages/pandas/core/frame.py in join(self, other, on, how, lsuffix, rsuffix, sort)
     7244         # For SparseDataFrame's benefit
     7245         return self._join_compat(
  -> 7246             other, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix, sort=sort
     7247         )
     7248 

  /opt/venv/lib/python3.7/site-packages/pandas/core/frame.py in _join_compat(self, other, on, how, lsuffix, rsuffix, sort)
     7267                 right_index=True,
     7268                 suffixes=(lsuffix, rsuffix),
  -> 7269                 sort=sort,
     7270             )
     7271         else:

  /opt/venv/lib/python3.7/site-packages/pandas/core/reshape/merge.py in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy, indicator, validate)
       81         validate=validate,
       82     )
  ---> 83     return op.get_result()
       84 
       85 

  /opt/venv/lib/python3.7/site-packages/pandas/core/reshape/merge.py in get_result(self)
      646 
      647         llabels, rlabels = _items_overlap_with_suffix(
  --> 648             ldata.items, lsuf, rdata.items, rsuf
      649         )
      650 

  /opt/venv/lib/python3.7/site-packages/pandas/core/reshape/merge.py in _items_overlap_with_suffix(left, lsuffix, right, rsuffix)
     2009         raise ValueError(
     2010             "columns overlap but no suffix specified: "
  -> 2011             "{rename}".format(rename=to_rename)
     2012         )
     2013 

  ValueError: columns overlap but no suffix specified: Index(['nGene', 'nUMI'], dtype='object')

Work dir:
  /lustre1/project/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/SNARE_mouse/output/rna/vsn/scrublet/work/1c/e18edb995c50de753d1a874642c070

Tip: you can try to figure out what's wrong by changing to the process work dir and showing the script file named `.command.sh`

To Reproduce: This is the first time I have seen this error. One thing that may be relevant: it is also the first time I have run Scrublet with a loom file as input.

Additional context Add any other context about the problem here.

cbravo93 commented 2 years ago

UPDATE: Removing the nGene and nUMI column attributes (and the global "metrics" metadata entry) from the loom file works around the error.

# Workaround for the Scrublet report failure
# "columns overlap but no suffix specified: Index(['nGene', 'nUMI'])":
# strip the nGene / nUMI column attributes from the input loom file so they
# no longer collide in the pandas join performed by the report notebook.
library(SCopeLoomR)

loom_path <- "/staging/leuven/stg_00002/lcb/cbravo/Multiomics_pipeline/analysis/SNARE_mouse/data/rna/snare_mouse_cortex_rna.loom"

# Open read/write ("r+") because the file is modified in place.
loom <- open_loom(loom_path, mode = "r+")

# Inspect the current cell annotations before changing anything.
get_cell_annotation(loom)

# Drop the two column attributes named in the error message.
remove_col_attr(loom, "nGene")
remove_col_attr(loom, "nUMI")

# Remove the "metrics" entry from the global metadata and write the
# metadata back as JSON.
# NOTE(review): presumably "metrics" registers nGene/nUMI and would otherwise
# point at the attributes removed above -- confirm against the loom contents.
gmd <- get_global_meta_data(loom = loom)
gmd[["metrics"]] <- NULL
update_global_meta_data(loom = loom, meta.data.json = rjson::toJSON(x = gmd))

# Close the handle opened in "r+" mode so it is not left open after the
# in-place edits.
close_loom(loom = loom)