Open dicklim opened 6 months ago
@dicklim Can you make a smaller and reproducible example? Also could you add more details of your environment? For example:
import scanpy as sc
adata = sc.datasets.pbmc3k()
sc.pp.scrublet(adata, n_neighbors=10)
works fine with the details you provided.
Hi, sorry that I haven't reply timely. However, after several mounths, I can't solve the problem, and the error still occured for several times. Here I want to process the data from Kim et al. 2020. GSE131907
The matrix file can be downloaded in GSE131907_Lung_Cancer_raw_UMI_matrix.txt
I also exported a file before scrublet, then you can reproduct the Error. However the file is so large that I can only uploaded it to the BaiduNetDisk. If you can download the h5ad file, you can just run the scrublet part.
I greatly appreciate your efforts in developing Scanpy. I look forward to hearing from you soon!
The enviroument is :
scanpy==1.10.3 anndata==0.10.9 umap==0.5.3 numpy==1.26.4 scipy==1.13.1 pandas==2.2.2 scikit-learn==1.0.2 statsmodels==0.14.2 igraph==0.11.6 pynndescent==0.5.10
The process code is shown below:
import os
import glob
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import anndata as ad
import matplotlib
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
import re
# pd.set_option('mode.use_inf_as_na', True)
# import SCCAF
import infercnvpy as cnv
import cytotrace2_py
sc.settings.cachedir = 'D:/BaiduSyncdisk/博士课题/scRNA/Kim/cache' # 缓存的单细胞数据
sc.settings.figdir = 'D:/BaiduSyncdisk/博士课题/scRNA/Kim/figure' # 图片输出
sc.settings.logpath = 'D:/BaiduSyncdisk/博士课题/scRNA/Kim/log' # 日志
sc.settings.max_memory = 50 # 最大内存
sc.settings.n_jobs = 12 # 最大线程
sc.settings.verbosity = 1 # 设置日志等级: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
# plt.rcParams['pdf.fonttype'] = 'truetype'
config = {
"font.family":'Arial', # 设置字体类型
"axes.unicode_minus": False, #解决负号无法显示的问题
'pdf.fonttype': 'truetype'
}
matplotlib.rcParams.update(config)
sc.settings.set_figure_params(
dpi=80, dpi_save=600,fontsize=12,
figsize=(4, 4),
transparent =True,
vector_friendly=True,
format='pdf')
# Raw counts
alldata = sc.read_text('GSE131907_Lung_Cancer_raw_UMI_matrix.txt.gz').T
# alldata.write(LUAD_Kim.h5ad")
alldata.obs['sample'] = alldata.obs.index
alldata.obs['sample'] = alldata.obs['sample'].apply(lambda x: '_'.join(x.split('_')[1:]))
# 只要转移灶加原发灶
alldata = alldata[(alldata.obs['sample'].str.startswith('LUNG_T')) | # 肺肿瘤
(alldata.obs['sample'].str.startswith('NS')) | # 远处脑转移
(alldata.obs['sample'].str.startswith('EBUS')) | # EBUS 测的转移性淋巴结和肺肿瘤
(alldata.obs['sample'].str.startswith('BRONCHO')),:] # BRONCHO 测的转移性淋巴结和肺肿瘤
# 提取一下样本种类
category_dict = {
'EBUS_':'EBUS',
'LUNG_T':'Tumor',
'BRONCHO_':'Borncho',
'NS_':'BrainMetastasis'
}
alldata.obs['sample_type'] = alldata.obs['sample'].apply(lambda x:category_dict[x[:-2]])
## QC
alldata.var[alldata.var_names.str.match(r'^MT-')]
alldata.var[alldata.var_names.str.match(r'^RP[SL0-9]')]
alldata.var[alldata.var_names.str.contains(("^HB[^(P)^(E)]"))]
alldata.var['mt'] = alldata.var_names.str.match(r'^MT-') # 线粒体基因
alldata.var['ribo'] = alldata.var_names.str.match(r'^RP[SL0-9]') # 核糖体基因
alldata.var['hb'] = alldata.var_names.str.contains(("^HB[^(P)^(E)]")) # 血红蛋白基因
sc.pp.calculate_qc_metrics(alldata, qc_vars=['ribo', 'hb','mt'], inplace=True, percent_top=[20], log1p=True)
mito_filter = 15
n_counts_filter = 4300
fig, axs = plt.subplots(nrows = 2, ncols = 2, figsize = (8, 8))
sc.pl.scatter(alldata, x='total_counts', y='pct_counts_mt',ax = axs[0, 0], show=False)
sc.pl.scatter(alldata, x='total_counts', y='pct_counts_ribo',ax = axs[0, 1], show = False)
sc.pl.scatter(alldata, x='total_counts', y='pct_counts_hb',ax = axs[1, 0], show = False)
sc.pl.scatter(alldata, x='total_counts', y='n_genes_by_counts',ax = axs[1, 1], show = False)
#draw horizontal red lines indicating thresholds.
axs[0, 0].hlines(y = mito_filter, xmin = 0, xmax = max(alldata.obs['total_counts']), color = 'red', ls = 'dashed')
axs[0, 1].hlines(y = mito_filter, xmin = 0, xmax = max(alldata.obs['total_counts']), color = 'red', ls = 'dashed')
axs[1, 0].hlines(y = mito_filter, xmin = 0, xmax = max(alldata.obs['total_counts']), color = 'red', ls = 'dashed')
axs[1, 1].hlines(y = n_counts_filter, xmin = 0, xmax = max(alldata.obs['total_counts']), color = 'red', ls = 'dashed')
fig.tight_layout()
plt.show()
# Original QC plot
n0 = alldata.shape[0]
print(f'Original cell number: {n0}')
# Start scrublet You can load the h5ad file here.
# alldata=sc.read_h5ad("D:/BaiduSyncdisk/博士课题/scRNA/36368318/Kim/LUAD_Kim_ForScrublet_DeBug.h5ad")
print('Begin of post doublets removal and QC plot')
sc.pp.scrublet(alldata)
alldata = alldata[alldata.obs['predicted_doublet']==False, :].copy()
n1 = alldata.shape[0]
print(f'Cells retained after scrublet: {n1}, {n0-n1} removed.')
print(f'End of post doublets removal and QC plots.')
The error log is :
Begin of post doublets removal and QC plot
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[9], line 2
1 print('Begin of post doublets removal and QC plot')
----> 2 sc.pp.scrublet(alldata)
3 alldata = alldata[alldata.obs['predicted_doublet']==False, :].copy()
4 n1 = alldata.shape[0]
File ~\.conda\envs\DL\lib\site-packages\legacy_api_wrap\__init__.py:80, in legacy_api.<locals>.wrapper.<locals>.fn_compatible(*args_all, **kw)
77 @wraps(fn)
78 def fn_compatible(*args_all: P.args, **kw: P.kwargs) -> R:
79 if len(args_all) <= n_positional:
---> 80 return fn(*args_all, **kw)
82 args_pos: P.args
83 args_pos, args_rest = args_all[:n_positional], args_all[n_positional:]
File ~\.conda\envs\DL\lib\site-packages\scanpy\preprocessing\_scrublet\__init__.py:285, in scrublet(adata, adata_sim, batch_key, sim_doublet_ratio, expected_doublet_rate, stdev_doublet_rate, synthetic_doublet_umi_subsampling, knn_dist_metric, normalize_variance, log_transform, mean_center, n_prin_comps, use_approx_neighbors, get_doublet_neighbor_parents, n_neighbors, threshold, verbose, copy, random_state)
282 adata.uns["scrublet"]["batched_by"] = batch_key
284 else:
--> 285 scrubbed = _run_scrublet(adata_obs, adata_sim)
287 # Copy outcomes to input object from our processed version
289 adata.obs["doublet_score"] = scrubbed["obs"]["doublet_score"]
File ~\.conda\envs\DL\lib\site-packages\scanpy\preprocessing\_scrublet\__init__.py:211, in scrublet.<locals>._run_scrublet(ad_obs, ad_sim)
206 ad_obs = ad_obs[:, ad_obs.var["highly_variable"]].copy()
208 # Simulate the doublets based on the raw expressions from the normalised
209 # and filtered object.
--> 211 ad_sim = scrublet_simulate_doublets(
212 ad_obs,
213 layer="raw",
214 sim_doublet_ratio=sim_doublet_ratio,
215 synthetic_doublet_umi_subsampling=synthetic_doublet_umi_subsampling,
216 random_seed=random_state,
217 )
218 del ad_obs.layers["raw"]
219 if log_transform:
File ~\.conda\envs\DL\lib\site-packages\legacy_api_wrap\__init__.py:80, in legacy_api.<locals>.wrapper.<locals>.fn_compatible(*args_all, **kw)
77 @wraps(fn)
78 def fn_compatible(*args_all: P.args, **kw: P.kwargs) -> R:
79 if len(args_all) <= n_positional:
---> 80 return fn(*args_all, **kw)
82 args_pos: P.args
83 args_pos, args_rest = args_all[:n_positional], args_all[n_positional:]
File ~\.conda\envs\DL\lib\site-packages\scanpy\preprocessing\_scrublet\__init__.py:552, in scrublet_simulate_doublets(adata, layer, sim_doublet_ratio, synthetic_doublet_umi_subsampling, random_seed)
549 X = _get_obs_rep(adata, layer=layer)
550 scrub = Scrublet(X, random_state=random_seed)
--> 552 scrub.simulate_doublets(
553 sim_doublet_ratio=sim_doublet_ratio,
554 synthetic_doublet_umi_subsampling=synthetic_doublet_umi_subsampling,
555 )
557 adata_sim = AnnData(scrub._counts_sim)
558 adata_sim.obs["n_counts"] = scrub._total_counts_sim
File ~\.conda\envs\DL\lib\site-packages\scanpy\preprocessing\_scrublet\core.py:233, in Scrublet.simulate_doublets(self, sim_doublet_ratio, synthetic_doublet_umi_subsampling)
230 n_obs = self._counts_obs.shape[0]
231 n_sim = int(n_obs * sim_doublet_ratio)
--> 233 pair_ix = sample_comb((n_obs, n_obs), n_sim, random_state=self._random_state)
235 E1 = cast(sparse.csc_matrix, self._counts_obs[pair_ix[:, 0], :])
236 E2 = cast(sparse.csc_matrix, self._counts_obs[pair_ix[:, 1], :])
File ~\.conda\envs\DL\lib\site-packages\scanpy\preprocessing\_utils.py:157, in sample_comb(dims, nsamp, random_state, method)
147 def sample_comb(
148 dims: tuple[int, ...],
149 nsamp: int,
(...)
154 ] = "auto",
155 ) -> NDArray[np.int64]:
156 """Randomly sample indices from a grid, without repeating the same tuple."""
--> 157 idx = sample_without_replacement(
158 np.prod(dims), nsamp, random_state=random_state, method=method
159 )
160 return np.vstack(np.unravel_index(idx, dims)).T
File sklearn\utils\_random.pyx:220, in sklearn.utils._random.sample_without_replacement()
File sklearn\utils\_random.pyx:274, in sklearn.utils._random.sample_without_replacement()
File sklearn\utils\_random.pyx:28, in sklearn.utils._random._sample_without_replacement_check_input()
ValueError: n_population should be greater than 0, got -1272468767.
Please make sure these conditions are met
What happened?
Hi, I am running the Scrublet function to remove doublets. the annadata was generated by concat several samples from two articles:
[1] Wu F, Fan J, He Y, et al. Single-cell profiling of tumor heterogeneity and the microenvironment in advanced non-small cell lung cancer[J]. Nature Communications, 2021, 12(1): 2540. [2] Wang Y, Chen D, Liu Y, et al. Multidirectional characterization of cellular composition and spatial architecture in human multiple primary lung cancers[J]. Cell Death & Disease, 2023, 14(7): 462.
However, there was an error I cann't handle.
Minimal code sample
Error output
Versions