Open Chris-lang478 opened 1 year ago
@Chris-lang478, below is the singularity definition for that container.
However, the main piece you're after is this (very outdated) helper tool scanpy_recipes which @yulianatan found helpful for making QC plots and thresholding. I honestly wouldn't recommend it because it's reliant on old versions of scanpy that didn't have current functionality. Realistically, you could likely get away with not using it and replacing it with something like:
import scanpy as sc
import matplotlib.pyplot as plt
from pathlib import Path
def query_ribosomal_genes(biomart_annos):
return biomart_annos.loc[
biomart_annos.go_id.isin(["GO:0005840"]) |\
biomart_annos.external_gene_name.str.lower().str.match("^m?rp[ls]\d+"),
"external_gene_name"
].dropna().unique()
def query_mitochondrial_genes(biomart_annos):
return biomart_annos.loc[
biomart_annos.chromosome_name.str.match("^(mt|MT)") & True,
"external_gene_name"
].dropna().unique()
def query_hemoglobin_genes(biomart_annos):
return biomart_annos.loc[
biomart_annos.go_id.isin(["GO:0005833"]),
"external_gene_name"
].dropna().unique()
def annotate_var(adata, species):
biomart_annos = sc.queries.biomart_annotations(
species,
["external_gene_name", "chromosome_name", "go_id", "start_position", "end_position", "ensembl_gene_id"],
use_cache=True
)
idx = adata.var.species.isin([species])
adata.var.loc[idx, "hemoglobin"] = adata.var_names.isin(query_hemoglobin_genes(biomart_annos))[idx]
adata.var.loc[idx, "mitochondrial"] = adata.var_names.isin(query_mitochondrial_genes(biomart_annos))[idx]
adata.var.loc[idx, "ribosomal"] = adata.var_names.isin(query_ribosomal_genes(biomart_annos))[idx]
adata.var.loc[idx, "exclude_from_highly_variable"] = \
adata.var.hemoglobin[idx] | \
adata.var.mitochondrial[idx] |\
adata.var.ribosomal[idx]
def qc_metrics_violin(adata):
titles = ["UMIs", "Genes","% mtRNA", "% rRNA", "Hemoglobin"]
keys = [
"total_counts", "n_genes_by_counts", "pct_counts_mitochondrial",
"pct_counts_ribosomal", "total_counts_hemoglobin"
]
L = len(keys)
fig, axs = plt.subplots(1, L, figsize=(L*2, 3), dpi=200, gridspec_kw=dict(wspace=1))
for ax, key, title in zip(axs.flat, keys, titles):
sc.pl.violin(adata, key, layer="log_raw", color="0.85", use_raw=False, ax=ax, show=False, size=0.5)
ax.set_xlabel(""); ax.set_ylabel(adata.obs.processed_id.head(1).values[0])
ax.set_xticks([])
ax.set_title(title)
sns.despine(fig, ax)
fig.subplots_adjust(wspace=1)
return fig
raws = []
paths = list(sorted(Path("/path/to/h5s").glob("*filtered_feature_bc_matrix.h5")))
for path in paths:
raw = sc.read_10x_h5(path)
raw.var_names_make_unique()
raw.obs["sample_id"] = path.stem.split("_")[0]
species = "hsapiens"
raw.var.species = species
for key in ["hemoglobin","mitochondrial","ribosomal","exclude_from_highly_variable"]:
raw.var[key] = False
annotate_var(raw, species)
sc.pp.calculate_qc_metrics(
raw,
qc_vars=("mitochondrial", "hemoglobin", "ribosomal"),
percent_top=None,
log1p=True,
inplace=True
)
qc_metrics_violin(raw)
raws.append(raw)
# then based on those plots, threshold:
qcs = []
out_path = Path("/place/to/store/h5ads/")
for raw in raws:
qc = raw.copy()
sc.pp.filter_cells(qc, min_genes=500)
includes = qc.obs.pct_counts_mitochondrial < 25
includes &= qc.obs.total_counts_hemoglobin < 25
qc = qc[includes, :].copy()
output_id = qc.obs.sample_id.head(1).values[0]
qc.write_h5ad(out_path / f"{output_id}_qc.h5ad")
qcs.append(qc)
# note that genes were not filtered here.
# I always find it easier to AnnData.concat() first then filter genes so I don't have to mess with join="outer"
Hope this helps. Alternatively, here's the singularity definition in case you want to try to build the container.
BootStrap: docker
From: r-base:3.6.2
%help
Run JupyterLab with Python and R
Best way to run is
singularity run -B /projects:/projects -B /fastscratch:/fastscratch <img.sif>
Key Python packages:
- scanpy
- scientific python stack
singularity run <img.sif> launches a jupyter lab server.
%environment
export "PATH=/opt/conda/bin:$PATH"
export "LD_LIBRARY_PATH=/usr/local/lib"
%post
apt-get update --fix-missing
apt-get install -y \
wget \
bzip2 \
locales \
build-essential \
ca-certificates \
tar \
gfortran \
libopenblas-base \
libopenblas-dev \
libcurl4-openssl-dev \
libxml2-dev \
libssl-dev \
libssh-dev \
libgit2-dev \
xorg \
libfreetype6-dev \
git
mkdir -p /opt && cd /opt
# we want a python installation before biocmanager installs
wget --quiet -O ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
/bin/bash ~/miniconda.sh -q -b -p /opt/conda && rm ~/miniconda.sh
/opt/conda/bin/conda clean -tipsy && \
ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
echo "conda activate base" >> ~/.bashrc
/opt/conda/bin/conda update -q -n base -c defaults conda
# note that the we !append! python to the path so that conda fortran/gcc
# libraries do not mess up R installs since the R installation is outside of
# conda. I tried having it inside of conda and got weird gcc linking errors
export PATH="$PATH:/opt/conda/bin"
# install jupyterlab
/opt/conda/bin/conda install --quiet -y \
-c bioconda -c conda-forge \
jupyterlab \
numpy pandas scikit-learn scikit-image \
matplotlib seaborn \
leidenalg scanpy==1.4.6 scrublet \
psutil lxml bioservices \
tzlocal \
xlrd \
cmocean hmmlearn
/opt/conda/bin/conda install --quiet -y \
-c conda-forge -c bioconda gseapy
mkdir /opt/notebooks
# finally install packages requiring pip install
/opt/conda/bin/python -m pip install --quiet rpy2 anndata2ri bbknn fa2 cellphonedb goatools
/opt/conda/bin/python -m pip install -e git+https://github.com/TheJacksonLaboratory/scanpy_recipes.git#egg=scanpy_recipes
# locale fix
echo "LC_ALL=en_US.UTF-8" >> /etc/environment
echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen
echo "LANG=en_US.UTF-8" > /etc/locale.conf
locale-gen en_US.UTF-8
# try conda cleaning
/opt/conda/bin/conda clean -a -y
# clean up
apt-get clean && \
rm -rf /var/lib/apt/lists/*
%runscript
echo "Starting notebook..."
PORT=$(shuf -i8899-11999 -n1)
echo "Open browser to $(hostname -A|tr -d ' '):${PORT}"
exec /opt/conda/bin/jupyter lab --notebook-dir=/ --ip='*' --port=$PORT --no-browser --allow-root
@wflynny Thank you for providing me with this script!
@wflynny Thanks again for your offer. And there is a new problem. In the article, Scrublet was used for doublet identification..But the article dosn't offer the parameters(expected_doublet_rate ), and no related scripts in endometriosis-scrnaseq analysis codes. Could you offer me the related parameters or the scripts? Thanks very much!
Thanks for your work! However, in 02_EC19001-SC2100428-Tissue-global.ipynb,it shows "done with scanpy-recipes with jupyterlab_1.3.1.sif". I can't find it in the page. Could you offer it ? Thanks a lot!