ChoBioLab / corescpy

A wrapper to simplify the execution of Single Cell analysis with the sc verse
Other
3 stars 0 forks source link

Augur fails with CRISPRi_ess Replogle scPerturb-seq #7

Closed easlinger closed 1 year ago

easlinger commented 1 year ago

Background

Problem emerged with commit e476eb2a68b813ff145b387939d130c5adb9696b.

Reproducible Code

To save time,

from config import DIR
file_path = f"{DIR}/replogle_2022_k562_esss.h5ad"

Then replace pt.data.replogle_2022_k562_essential() with file_path in the code below. (This only works if you have the file.)

# Packages
import crispr as cr 
from crispr.crispr_class import Crispr
import pertpy as pt
import muon
import os
import pandas as pd
import numpy as np

# Initialize Object
ann = Crispr(pt.data.replogle_2022_k562_essential(), **kwargs_init)

# Subset Large Data to Save Time/Memory
ann.adata.obs[ann._columns["col_target_genes"]] = ann.adata.obs[
    ann._columns["col_target_genes"]].astype(str).replace("", ann._keys[
        "key_control"]).replace(np.nan,  ann._keys["key_control"])
ann.adata = ann.adata[ann.adata.obs["guide_ids"].isin(
    ["NT", "CDKN1A", "CDKN1A,CDKN1B", "CEBPA", "CEBPB", 
     "CEBPA,CEBPB", "DUSP9,KLF1", "SAMD1,UBASH3B", "TGFBR2", 
     "FEV,ISL2", "PRTG,TGFBR2", "JUN", "CLDN6,KLF1", 
     "CEBPE,SPI1", "PTPN13", "CEBPE,PTPN12", "CDKN1B,CDKN1C", 
     "FOXF1,FOXL2", "AHR,FEV", "CDKN1A,CDKN1B",])]  # subset for speed

# Add Control Keys Where Needed
ann.adata.obs[ann._columns["col_perturbation"]] = ann.adata.obs[
    ann._columns["col_perturbation"]].replace("", np.nan).replace(
        np.nan, ann._keys["key_control"])

# Fix Gene Columns
if ann._columns["col_gene_symbols"] in ann.adata.var.index.names:
    ann.adata.var = ann.adata.var.reset_index()

# Binary Perturbation Column      
conds = list(ann.adata.obs[ann._columns["col_perturbation"]].unique())
lab_tx = "Perturbed" if ann._keys[
    "key_treatment"] is None else ann._keys["key_treatment"]
ann.adata.obs[ann._columns["col_perturbation"] + "_old"] = ann.adata.obs[
    ann._columns["col_perturbation"]].copy()
ann.adata.obs[ann._columns[
    "col_perturbation"] + "_binary"] = ann.adata.obs[
        ann._columns["col_perturbation"]].apply(
            lambda x: lab_tx if x != ann._keys["key_control"] else x)
ann._keys["key_treatment"] = lab_tx

# Preprocess
process_kws = dict(kws_hvg=dict(min_mean=0.0125, max_mean=3, min_disp=0.5),
                   target_sum=1e4, max_genes_by_counts=2500, max_pct_mt=5, 
                   min_genes=200, min_cells=3, scale=10, regress_out=None)
_ = ann.preprocess(**process_kws, kws_umap=None)  # preprocessing 

# Cluster
kws_pca = dict(n_comps=None, use_highly_variable=True)
ann.cluster(paga=False, method_cluster="leiden", kws_pca=kws_pca, 
            kws_neighbors=None, kws_umap=None, kws_cluster=None)

# Run Augur
augur_data, augur_results, figs_augur = ann.run_augur(
    col_perturbation=ann._columns["col_perturbation"] + "_binary", 
    key_treatment=ann._keys["key_treatment"], 
    classifier="random_forest_classifier", n_threads=True,
    augur_mode="default", select_variance_features=True, n_folds=2,
    subsample_size=20, kws_augur_predict=dict(span=0.5))

Behavior

Runs briefly (makes classifier, loads data) and fails on Augur predict method.

---------------------------------------------------------------------------

Un-used Keyword Arguments: {'col_sample_id': 'gemgroup', 'col_batch': 'gemgroup', 'col_guide_rna': 'guide_ids', 'col_target_genes': 'guide_ids', 'layer': 'X_pert'}
AnnData object with n_obs × n_vars = 18844 × 19420
    obs: 'guide_identity', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'guide_AHR', 'guide_ARID1A', 'guide_ARRDC3', 'guide_ATL1', 'guide_BAK1', 'guide_BCL2L11', 'guide_BCORL1', 'guide_BPGM', 'guide_C19orf26', 'guide_C3orf72', 'guide_CBFA2T3', 'guide_CBL', 'guide_CDKN1A', 'guide_CDKN1B', 'guide_CDKN1C', 'guide_CEBPA', 'guide_CEBPB', 'guide_CEBPE', 'guide_CELF2', 'guide_CITED1', 'guide_CKS1B', 'guide_CLDN6', 'guide_CNN1', 'guide_CNNM4', 'guide_COL1A1', 'guide_COL2A1', 'guide_CSRNP1', 'guide_DLX2', 'guide_DUSP9', 'guide_EGR1', 'guide_ELMSAN1', 'guide_ETS2', 'guide_FEV', 'guide_FOSB', 'guide_FOXA1', 'guide_FOXA3', 'guide_FOXF1', 'guide_FOXL2', 'guide_FOXO4', 'guide_GLB1L2', 'guide_HES7', 'guide_HK2', 'guide_HNF4A', 'guide_HOXA13', 'guide_HOXB9', 'guide_HOXC13', 'guide_IER5L', 'guide_IGDCC3', 'guide_IKZF3', 'guide_IRF1', 'guide_ISL2', 'guide_JUN', 'guide_KIAA1804', 'guide_KIF18B', 'guide_KIF2C', 'guide_KLF1', 'guide_KMT2A', 'guide_LHX1', 'guide_LYL1', 'guide_MAML2', 'guide_MAP2K3', 'guide_MAP2K6', 'guide_MAP4K3', 'guide_MAP4K5', 'guide_MAP7D1', 'guide_MAPK1', 'guide_MEIS1', 'guide_MIDN', 'guide_NCL', 'guide_NIT1', 'guide_OSR2', 'guide_PLK4', 'guide_POU3F2', 'guide_PRDM1', 'guide_PRTG', 'guide_PTPN1', 'guide_PTPN12', 'guide_PTPN13', 'guide_PTPN9', 'guide_RHOXF2', 'guide_RREB1', 'guide_RUNX1T1', 'guide_S1PR2', 'guide_SAMD1', 'guide_SET', 'guide_SGK1', 'guide_SLC38A2', 'guide_SLC4A1', 'guide_SLC6A9', 'guide_SNAI1', 'guide_SPI1', 'guide_STIL', 'guide_TBX2', 'guide_TBX3', 'guide_TGFBR2', 'guide_TMSB4X', 'guide_TP73', 'guide_TSC22D1', 'guide_UBASH3A', 'guide_UBASH3B', 'guide_ZBTB1', 'guide_ZBTB10', 'guide_ZBTB25', 'guide_ZC3HAV1', 'guide_ZNF318', 'guide_ids', 'guide_ids_old', 'guide_ids_binary', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'leiden', 'label', 'cell_type'
    var: 'gene_symbols', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
Filtering samples with NT and Perturbed labels.


ValueError                                Traceback (most recent call last)
Cell In[6], line 1
----> 1 augur_data, augur_results, figs_augur = ann.run_augur(
      2     col_perturbation=ann._columns["col_perturbation"] + "_binary", 
      3     key_treatment=ann._keys["key_treatment"], 
      4     classifier="random_forest_classifier", n_threads=True,
      5     augur_mode="default", select_variance_features=True, n_folds=2,
      6     subsample_size=20, kws_augur_predict=dict(span=0.5))

File [~/projects/crispr/crispr/crispr_class.py:361](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/projects/crispr/crispr/crispr_class.py:361), in Crispr.run_augur(self, assay, col_perturbation, key_treatment, augur_mode, classifier, kws_augur_predict, n_folds, subsample_size, n_threads, select_variance_features, seed, plot, run_label, test)
    356     kws_augur_predict = {}
    357 # if run_label != "main":
    358 #     kws_augur_predict.update(
    359 #         {"key_added": f"augurpy_results_{run_label}"}
    360 #         )  # run label incorporated in key in adata
--> 361 data, results, figs_aug = cr.ax.perform_augur(
    362     self.adata.copy() if test is True else self.adata, 
    363     assay=assay, classifier=classifier, 
    364     augur_mode=augur_mode, subsample_size=subsample_size,
    365     select_variance_features=select_variance_features, 
    366     n_folds=n_folds,
    367     **{**self._columns, "col_perturbation": col_perturbation},  
    368     kws_augur_predict=kws_augur_predict,
    369     key_control=self._keys["key_control"], key_treatment=key_treatment,
    370     layer=self._layer_perturbation,
    371     seed=seed, n_threads=n_threads, plot=plot)
    372 if test is False:
    373     if run_label not in self.results:

File [~/projects/crispr/crispr/analysis/perturbations.py:282](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/projects/crispr/crispr/analysis/perturbations.py:282), in perform_augur(adata, assay, layer_perturbation, classifier, augur_mode, subsample_size, n_folds, select_variance_features, col_cell_type, col_perturbation, col_gene_symbols, key_control, key_treatment, seed, plot, n_threads, kws_augur_predict, **kwargs)
    280 # Augur Model & Data
    281 ag_rfc = pt.tl.Augur(classifier)
--> 282 loaded_data = ag_rfc.load(
    283     adata_pert, condition_label=key_control, 
    284     treatment_label=key_treatment,
    285     cell_type_col=col_cell_new,
    286     label_col=col_pert_new
    287     )  # add dummy variables, rename cell type & label columns
    289 # Run Augur Predict
    290 data, results = ag_rfc.predict(
    291     loaded_data, subsample_size=subsample_size, augur_mode=augur_mode, 
    292     select_variance_features=select_variance_features,
    293     n_threads=n_threads, random_state=seed,
    294     **kws_augur_predict)  # AUGUR model prediction

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pertpy/tools/_augur.py:138](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pertpy/tools/_augur.py:138), in Augur.load(self, input, meta, label_col, cell_type_col, condition_label, treatment_label)
    136 if condition_label is not None and treatment_label is not None:
    137     print(f"Filtering samples with {condition_label} and {treatment_label} labels.")
--> 138     adata = AnnData.concatenate(
    139         adata[adata.obs["label"] == condition_label], adata[adata.obs["label"] == treatment_label]
    140     )
    141 label_encoder = LabelEncoder()
    142 adata.obs["y_"] = label_encoder.fit_transform(adata.obs["label"])

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/anndata/_core/anndata.py:1772](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/anndata/_core/anndata.py:1772), in AnnData.concatenate(self, join, batch_key, batch_categories, uns_merge, index_unique, fill_value, *adatas)
   1769     adatas = adatas[0]  # backwards compatibility
   1770 all_adatas = (self,) + tuple(adatas)
-> 1772 out = concat(
   1773     all_adatas,
   1774     axis=0,
   1775     join=join,
   1776     label=batch_key,
   1777     keys=batch_categories,
   1778     uns_merge=uns_merge,
   1779     fill_value=fill_value,
   1780     index_unique=index_unique,
   1781     pairwise=False,
   1782 )
   1784 # Backwards compat (some of this could be more efficient)
   1785 # obs used to always be an outer join
   1786 out.obs = concat(
   1787     [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
   1788     axis=0,
   (...)
   1792     index_unique=index_unique,
   1793 ).obs

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/anndata/_core/merge.py:1036](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/anndata/_core/merge.py:1036), in concat(adatas, axis, join, merge, uns_merge, label, keys, index_unique, fill_value, pairwise)
   1033     concat_annot[label] = label_col
   1035 # Annotation for other axis
-> 1036 alt_annot = merge_dataframes(
   1037     [getattr(a, alt_dim) for a in adatas], alt_indices, merge
   1038 )
   1040 X = concat_Xs(adatas, reindexers, axis=axis, fill_value=fill_value)
   1042 if join == "inner":

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/anndata/_core/merge.py:729](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/anndata/_core/merge.py:729), in merge_dataframes(dfs, new_index, merge_strategy)
    726 def merge_dataframes(
    727     dfs: Iterable[pd.DataFrame], new_index, merge_strategy=merge_unique
    728 ) -> pd.DataFrame:
--> 729     dfs = [df.reindex(index=new_index) for df in dfs]
    730     # New dataframe with all shared data
    731     new_df = pd.DataFrame(merge_strategy(dfs), index=new_index)

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/anndata/_core/merge.py:729](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/anndata/_core/merge.py:729), in (.0)
    726 def merge_dataframes(
    727     dfs: Iterable[pd.DataFrame], new_index, merge_strategy=merge_unique
    728 ) -> pd.DataFrame:
--> 729     dfs = [df.reindex(index=new_index) for df in dfs]
    730     # New dataframe with all shared data
    731     new_df = pd.DataFrame(merge_strategy(dfs), index=new_index)

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/frame.py:5055](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/frame.py:5055), in DataFrame.reindex(self, labels, index, columns, axis, method, copy, level, fill_value, limit, tolerance)
   5036 @doc(
   5037     NDFrame.reindex,
   5038     klass=_shared_doc_kwargs["klass"],
   (...)
   5053     tolerance=None,
   5054 ) -> DataFrame:
-> 5055     return super().reindex(
   5056         labels=labels,
   5057         index=index,
   5058         columns=columns,
   5059         axis=axis,
   5060         method=method,
   5061         copy=copy,
   5062         level=level,
   5063         fill_value=fill_value,
   5064         limit=limit,
   5065         tolerance=tolerance,
   5066     )

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/generic.py:5360](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/generic.py:5360), in NDFrame.reindex(self, labels, index, columns, axis, method, copy, level, fill_value, limit, tolerance)
   5357     return self._reindex_multi(axes, copy, fill_value)
   5359 # perform the reindex on the axes
-> 5360 return self._reindex_axes(
   5361     axes, level, limit, tolerance, method, fill_value, copy
   5362 ).__finalize__(self, method="reindex")

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/frame.py:4896](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/frame.py:4896), in DataFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   4894 index = axes["index"]
   4895 if index is not None:
-> 4896     frame = frame._reindex_index(
   4897         index, method, copy, level, fill_value, limit, tolerance
   4898     )
   4900 return frame

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/frame.py:4912](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/frame.py:4912), in DataFrame._reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   4902 def _reindex_index(
   4903     self,
   4904     new_index,
   (...)
   4910     tolerance=None,
   4911 ):
-> 4912     new_index, indexer = self.index.reindex(
   4913         new_index, method=method, level=level, limit=limit, tolerance=tolerance
   4914     )
   4915     return self._reindex_with_indexers(
   4916         {0: [new_index, indexer]},
   4917         copy=copy,
   4918         fill_value=fill_value,
   4919         allow_dups=False,
   4920     )

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/indexes/category.py:368](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/indexes/category.py:368), in CategoricalIndex.reindex(self, target, method, level, limit, tolerance)
    364 if limit is not None:
    365     raise NotImplementedError(
    366         "argument limit is not implemented for CategoricalIndex.reindex"
    367     )
--> 368 return super().reindex(target)

File [~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/indexes/base.py:4274](https://file+.vscode-resource.vscode-cdn.net/home/asline01/projects/crispr/examples/~/anaconda3/envs/py-bio/lib/python3.10/site-packages/pandas/core/indexes/base.py:4274), in Index.reindex(self, target, method, level, limit, tolerance)
   4271     raise ValueError("cannot handle a non-unique multi-index!")
   4272 elif not self.is_unique:
   4273     # GH#42568
-> 4274     raise ValueError("cannot reindex on an axis with duplicate labels")
   4275 else:
   4276     indexer, _ = self.get_indexer_non_unique(target)

ValueError: cannot reindex on an axis with duplicate labels

Expected Behavior

Runs. (See attached Jupyter notebook)

easlinger commented 1 year ago

Solution

Removed the following code from crispr.analysis.perturbations (perform_augur() function):

if adata_pert.var_names[0] != adata_pert.var.reset_index()[
    col_gene_symbols][0]:  # so gene names plots use if index differs
    adata_pert.var_names = pd.Index(adata_pert.var.reset_index()[
        col_gene_symbols])

It was originally intended to address the issue where the gene symbols aren't the index of .adata.var (usually, it's the Ensembl IDs instead), which

Code

# Packages
import crispr as cr 
from crispr.crispr_class import Crispr
import pertpy as pt
import muon
import os
import pandas as pd
import numpy as np
from config import DIR

# Initialize Object
kwargs_init = dict(
    assay=None, assay_protein=None, 
    col_gene_symbols="gene_symbols",
    layer_perturbation="X_pert", 
    col_cell_type="leiden", 
    col_sample_id="gemgroup", 
    col_batch="gemgroup", 
    col_perturbation="guide_ids", 
    col_guide_rna="guide_ids", 
    col_target_genes="guide_ids", 
    label_perturbation_type="KD", 
    key_control="NT", key_treatment=None)
# file_path = f"{DIR}/replogle_2022_k562_esss.h5ad"
file_path = f"{DIR}/replogle_2022_k562_esss_processed.h5ad"

# Initialize Object
ann = Crispr(file_path, **kwargs_init)

# Subset Large Data to Save Time/Memory
ann.adata.obs[ann._columns["col_target_genes"]] = ann.adata.obs[
    ann._columns["col_target_genes"]].astype(str).replace("", ann._keys[
        "key_control"]).replace(np.nan,  ann._keys["key_control"])
ann.adata = ann.adata[ann.adata.obs["guide_ids"].isin(
    ["NT", "CDKN1A", "CDKN1A,CDKN1B", "CEBPA", "CEBPB", 
     "CEBPA,CEBPB", "DUSP9,KLF1", "SAMD1,UBASH3B", "TGFBR2", 
     "FEV,ISL2", "PRTG,TGFBR2", "JUN", "CLDN6,KLF1", 
     "CEBPE,SPI1", "PTPN13", "CEBPE,PTPN12", "CDKN1B,CDKN1C", 
     "FOXF1,FOXL2", "AHR,FEV", "CDKN1A,CDKN1B",])]  # subset for speed

# Add Control Keys Where Needed
ann.adata.obs[ann._columns["col_perturbation"]] = ann.adata.obs[
    ann._columns["col_perturbation"]].replace("", np.nan).replace(
        np.nan, ann._keys["key_control"])

# Fix Gene Columns
if ann._columns["col_gene_symbols"] in ann.adata.var.index.names:
    ann.adata.var = ann.adata.var.reset_index()

# Binary Perturbation Column      
conds = list(ann.adata.obs[ann._columns["col_perturbation"]].unique())
lab_tx = "Perturbed" if ann._keys[
    "key_treatment"] is None else ann._keys["key_treatment"]
ann.adata.obs[ann._columns["col_perturbation"] + "_old"] = ann.adata.obs[
    ann._columns["col_perturbation"]].copy()
ann.adata.obs[ann._columns[
    "col_perturbation"] + "_binary"] = ann.adata.obs[
        ann._columns["col_perturbation"]].apply(
            lambda x: lab_tx if x != ann._keys["key_control"] else x)
ann._keys["key_treatment"] = lab_tx
ann.adata

# Run Augur
kws_augur_predict = dict(span=1)
augur_data, augur_results, figs_augur = ann.run_augur(
    col_perturbation=ann._columns["col_perturbation"] + "_binary", 
    key_treatment=ann._keys["key_treatment"], 
    classifier="random_forest_classifier", n_threads=True,
    augur_mode="default", select_variance_features=True, n_folds=2,
    subsample_size=20, kws_augur_predict=kws_augur_predict)

Output

image