Closed easlinger closed 1 year ago
Removed the following code from `crispr.analysis.perturbations` (the `perform_augur()` function):
# Snippet removed from perform_augur(): if the first entry of `var_names`
# differs from the first entry of the `col_gene_symbols` column, replace
# `var_names` with the values of that column.
if adata_pert.var_names[0] != adata_pert.var.reset_index()[
        col_gene_symbols][0]:  # so gene names plots use if index differs
    adata_pert.var_names = pd.Index(adata_pert.var.reset_index()[
        col_gene_symbols])
It was originally intended to address the case where the gene symbols aren't the index of `.adata.var` (usually, the index holds the Ensembl IDs instead), which [...] `gene_symbols` argument to override the use of the `.var` index for labeling and [...]
# Packages
import crispr as cr
from crispr.crispr_class import Crispr
import pertpy as pt
import muon
import os
import pandas as pd
import numpy as np
from config import DIR
# Keyword arguments forwarded to the Crispr constructor: assay names, the
# .obs/.var column names, the perturbation layer, and the control/treatment
# labels used by this example.
kwargs_init = {
    "assay": None,
    "assay_protein": None,
    "col_gene_symbols": "gene_symbols",
    "layer_perturbation": "X_pert",
    "col_cell_type": "leiden",
    "col_sample_id": "gemgroup",
    "col_batch": "gemgroup",
    "col_perturbation": "guide_ids",
    "col_guide_rna": "guide_ids",
    "col_target_genes": "guide_ids",
    "label_perturbation_type": "KD",
    "key_control": "NT",
    "key_treatment": None,
}
# Alternative input file (name without the "_processed" suffix), kept
# commented out for reference.
# file_path = f"{DIR}/replogle_2022_k562_esss.h5ad"
# Input .h5ad file, located under DIR (imported from the local config module).
file_path = f"{DIR}/replogle_2022_k562_esss_processed.h5ad"
# Initialize Object
# The file path is passed positionally; kwargs_init supplies the keyword
# arguments.
ann = Crispr(file_path, **kwargs_init)
# Subset Large Data to Save Time/Memory
col_target = ann._columns["col_target_genes"]
key_control = ann._keys["key_control"]
# Relabel unlabeled cells (missing value or empty string) as control BEFORE
# casting to str. Casting first turns missing values into the literal string
# "nan", which a later `replace(np.nan, ...)` can no longer match, so those
# cells were never relabeled and were silently dropped by the subset below.
targets = ann.adata.obs[col_target].astype(object)
is_unlabeled = targets.isna() | (targets == "")
ann.adata.obs[col_target] = targets.mask(
    is_unlabeled, key_control).astype(str)
# Guide labels to keep (control plus a handful of perturbations).
guides_keep = [
    "NT", "CDKN1A", "CDKN1A,CDKN1B", "CEBPA", "CEBPB",
    "CEBPA,CEBPB", "DUSP9,KLF1", "SAMD1,UBASH3B", "TGFBR2",
    "FEV,ISL2", "PRTG,TGFBR2", "JUN", "CLDN6,KLF1",
    "CEBPE,SPI1", "PTPN13", "CEBPE,PTPN12", "CDKN1B,CDKN1C",
    "FOXF1,FOXL2", "AHR,FEV",
]
# Materialize the subset with .copy(): the code that follows writes to .obs,
# and writing to an AnnData view would otherwise trigger an implicit copy
# (with a warning) at that point.
ann.adata = ann.adata[
    ann.adata.obs["guide_ids"].isin(guides_keep)].copy()  # subset for speed
# Add Control Keys Where Needed
# Empty strings are first mapped to NaN, then every NaN in the perturbation
# column is replaced by the control label.
pert_col = ann._columns["col_perturbation"]
pert_labels = ann.adata.obs[pert_col].replace("", np.nan)
ann.adata.obs[pert_col] = pert_labels.replace(
    np.nan, ann._keys["key_control"])
# Fix Gene Columns
# If the gene-symbol column is currently (part of) the .var index, move it
# back into the regular columns.
gene_col = ann._columns["col_gene_symbols"]
var_index_names = ann.adata.var.index.names
if gene_col in var_index_names:
    ann.adata.var = ann.adata.var.reset_index()
# Binary Perturbation Column
pert_col = ann._columns["col_perturbation"]
conds = list(ann.adata.obs[pert_col].unique())
# Treatment label: fall back to "Perturbed" when no treatment key is set.
if ann._keys["key_treatment"] is None:
    lab_tx = "Perturbed"
else:
    lab_tx = ann._keys["key_treatment"]
# Keep a copy of the original labels under "<column>_old".
ann.adata.obs[pert_col + "_old"] = ann.adata.obs[pert_col].copy()
# "<column>_binary": control cells keep their label, every other cell gets
# the treatment label.
control_label = ann._keys["key_control"]
ann.adata.obs[pert_col + "_binary"] = ann.adata.obs[pert_col].apply(
    lambda label: label if label == control_label else lab_tx)
ann._keys["key_treatment"] = lab_tx
# Bare expression: shows the object when run as the last line of a
# notebook cell; no effect in a plain script.
ann.adata
# Run Augur
# Runs on the binary (control vs. treatment label) perturbation column.
kws_augur_predict = {"span": 1}
binary_col = ann._columns["col_perturbation"] + "_binary"
# NOTE(review): `n_threads=True` is passed through unchanged; confirm that
# run_augur accepts a boolean here rather than an integer thread count.
augur_output = ann.run_augur(
    col_perturbation=binary_col,
    key_treatment=ann._keys["key_treatment"],
    classifier="random_forest_classifier",
    n_threads=True,
    augur_mode="default",
    select_variance_features=True,
    n_folds=2,
    subsample_size=20,
    kws_augur_predict=kws_augur_predict,
)
augur_data, augur_results, figs_augur = augur_output
Background
Problem emerged with commit e476eb2a68b813ff145b387939d130c5adb9696b.
Reproducible Code
To save time,
Then replace `pt.data.replogle_2022_k562_essential()` with `file_path` in the code below. (This only works if you have the file.)

Behavior
Runs briefly (makes classifier, loads data) and fails on Augur predict method.
Expected Behavior
Runs. (See attached Jupyter notebook)