ebi-gene-expression-group / scanpy-scripts

Scripts for using scanpy
Apache License 2.0
29 stars 13 forks source link

Add filtering by quantiles #122

Open pcm32 opened 1 year ago

pcm32 commented 1 year ago

As suggested by Ming:

def scanpy_qc(adata, batch, up_range, down_range, **kwarg):
    """Keep only the cells whose QC metrics lie inside a quantile window.

    Genes are flagged as mitochondrial (names starting with ``MT-``) or
    ribosomal (names starting with ``RPS``/``RPL``), QC metrics are computed
    with ``sc.pp.calculate_qc_metrics``, and a cell is retained only if *every*
    one of ``pct_counts_mt``, ``pct_counts_ribo``, ``total_counts`` and
    ``n_genes_by_counts`` falls between the ``down_range`` and ``up_range``
    quantiles (both bounds inclusive, so ``down_range=0, up_range=1`` keeps
    every cell).

    When ``adata.obs[batch]`` holds more than one distinct value the quantiles
    are computed separately within each batch; otherwise they are computed
    over all cells.

    Parameters
    ----------
    adata
        Annotated data matrix. Modified in place: ``adata.var['mt']`` and
        ``adata.var['ribo']`` are written, and the QC metrics are added by
        ``calculate_qc_metrics(..., inplace=True)``.
    batch
        Name of the ``adata.obs`` column identifying the batch of each cell.
    up_range
        Upper quantile (as accepted by ``Series.quantile``) of the window.
    down_range
        Lower quantile (as accepted by ``Series.quantile``) of the window.
    **kwarg
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    The subset of ``adata`` restricted to the retained cells, in their
    original order.
    """
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    adata.var['ribo'] = adata.var_names.str.startswith(('RPS', 'RPL'))
    sc.pp.calculate_qc_metrics(
        adata, qc_vars=['mt', 'ribo'], log1p=False, percent_top=None, inplace=True
    )

    qc_keys = ['pct_counts_mt', 'pct_counts_ribo', 'total_counts', 'n_genes_by_counts']
    obs = adata.obs
    per_batch = len(obs[batch].unique()) > 1

    # Accumulate one positional boolean mask across all metrics. Working with
    # a mask (rather than lists of cell names pushed through a set
    # intersection) preserves cell order and does not depend on the name of
    # the obs index.
    keep = None
    for qc in qc_keys:
        values = obs[qc]
        if per_batch:
            # transform() broadcasts each batch's quantile back onto its own
            # cells, so the comparison below stays row-aligned.
            by_batch = values.groupby(obs[batch], observed=True)
            lower = by_batch.transform(lambda v: v.quantile(down_range))
            upper = by_batch.transform(lambda v: v.quantile(up_range))
        else:
            lower = values.quantile(down_range)
            upper = values.quantile(up_range)
        within = ((values >= lower) & (values <= upper)).to_numpy()
        keep = within if keep is None else keep & within

    return adata[keep, :]

The first parts will happen elsewhere, and we could split the AnnData outside of this function by whichever condition is considered adequate.