Closed ixxmu closed 3 days ago
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
from pybedtools import BedTool
import pickle as pkl
%matplotlib inline
import itertools
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
import sys
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.6-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq')
import encode
import memento
data_path = '/data_volume/memento/demux/'
# fig_path = '/data/home/Github/scrna-parameter-estimation/figures/fig6/'
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
import matplotlib.pylab as pylab
params = {'legend.fontsize': 'medium',
'axes.labelsize': 'medium',
'axes.titlesize':'medium',
'figure.titlesize':'medium',
'xtick.labelsize':'small',
'ytick.labelsize':'small'}
pylab.rcParams.update(params)
cts = ['CD4 T cells', 'CD14+ Monocytes', 'FCGR3A+ Monocytes', 'NK cells','CD8 T cells', 'B cells', 'Dendritic cells', 'Megakaryocytes']
label_converter = dict(zip(cts, ['Th', 'cM', 'ncM', 'NK', 'Tc', 'B', 'DC', 'Mega']))
def simplify_name(name):
return name.split('.')[0]
adata = sc.read(data_path + 'interferon_filtered.h5ad')
adata = adata[(adata.obs.multiplets == 'singlet') & (adata.obs.cell != 'nan'), :].copy()
adata.X = adata.X.astype(np.int64)
# temp_adata = adata.copy()
# temp_adata = temp_adata[temp_adata.obs.cell == ct].copy()
norm_adata = adata.copy()
sc.pp.normalize_total(norm_adata, target_sum=1e4)
# sc.pp.log1p(norm_adata)
adata.obs['ct'] = adata.obs['cell'].apply(lambda x: label_converter[x])
adata.obs['cell_type'] = (adata.obs['cell'].astype(str) + ' - ' + adata.obs['stim'].astype(str)).astype('category')
sc.pl.tsne(adata, color='cell_type')
with open(data_path + 'all_highcount_tfs.pkl', 'rb') as f:
tfs = pkl.load(f)
tf_df = pd.read_csv('../ifn_hbec/version3/baseline/human_tf.txt', sep='\t')
tf_list = tf_df['Symbol'].tolist()
tf_list += ['CIITA', 'NLRC5']
####Select control cells
cts = ['Th', 'cM']
ctrl_adata = adata[
adata.obs['ct'].isin(cts) & \
adata.obs['stim'].isin(['ctrl'])].copy().copy()
####Setup memento
ctrl_adata.obs['q'] = 0.07
ctrl_adata.X = ctrl_adata.X.astype(float)
memento.setup_memento(ctrl_adata, q_column='q', trim_percent=0.1, filter_mean_thresh=0.07)
simplified_cts = ['T', 'B', 'M']
for ct in ['Th']:
print(ct)
labeled_ctrl_adata = ctrl_adata.copy().copy()
labeled_ctrl_adata.obs['is_ct'] = labeled_ctrl_adata.obs['ct'].str.contains(ct).astype(int)
memento.create_groups(labeled_ctrl_adata, label_columns=['is_ct', 'ind', 'ct'])
memento.compute_1d_moments(labeled_ctrl_adata, min_perc_group=.7)
available_tfs = set(tf_list) & set(labeled_ctrl_adata.var.index)
available_genes = set(labeled_ctrl_adata.var.index)
print(len(available_tfs),labeled_ctrl_adata.shape)
memento.compute_2d_moments(labeled_ctrl_adata, list(itertools.product(available_tfs, available_genes)))
# memento.compute_2d_moments(labeled_ctrl_adata, list(itertools.combinations(available_genes, 2)))
candidates = memento.get_2d_moments(labeled_ctrl_adata, groupby='is_ct').query('is_ct_1 > 0.15 | is_ct_0 > 0.15')
print(candidates.shape)
memento.compute_2d_moments(labeled_ctrl_adata, list(zip(candidates['gene_1'], candidates['gene_2'])))
memento.ht_2d_moments(
labeled_ctrl_adata,
formula_like='1 + is_ct + ind',
treatment_col='is_ct',
num_boot=10000,
verbose=1,
num_cpus=94,
resampling='permutation',
approx=False)
labeled_ctrl_adata.write(data_path + 'coex_markers/Th_vs_cM.h5ad'.format(ct))
####heatmap
ht_result_all = []
for ct in ['Th_vs_cM']:
result = sc.read(data_path + 'coex_markers/{}.h5ad'.format(ct))
ht_result = memento.get_2d_ht_result(result)
ht_result['corr_fdr'] = memento.util._fdrcorrect(ht_result['corr_pval'])
ht_result['ct'] = ct
ht_result_all.append(ht_result)
ht_result_all = pd.concat(ht_result_all)
ht_result_all['sign'] = ht_result_all['corr_coef'] > 0
sig_df = ht_result_all.query('corr_fdr < 0.1')
pairs = sig_df[['gene_1', 'gene_2']].drop_duplicates()
labeled_ctrl_adata = ctrl_adata.copy().copy()
memento.create_groups(labeled_ctrl_adata, label_columns=['ct', 'ind'])
memento.compute_1d_moments(labeled_ctrl_adata, min_perc_group=.7)
memento.compute_2d_moments(labeled_ctrl_adata, list(zip(ht_result_all['gene_1'], ht_result_all['gene_2'])))
corr_df = memento.get_2d_moments(labeled_ctrl_adata, groupby='ct')
heatmap = corr_df.query('gene_1 == "TCF25"')#[['ct_cM', 'ct_Th']]
heatmap.index = heatmap['gene_2']
heatmap = heatmap[['ct_cM', 'ct_Th']]
sns.clustermap(heatmap, cmap='viridis')
subsets = {}
for tf in ht_result_all.gene_1.drop_duplicates():
subset = ht_result_all.query('gene_1 == "{}"'.format(tf)).copy()
subset['corr_fdr'] = memento.util._fdrcorrect(subset['corr_pval'])
# Pick some highly correlated genes that don't change
subset_1 = subset.query('corr_pval > 0.9').head(25)
subset_1['type'] = 'both'
subset_2 = subset.query('corr_fdr < 0.1 & corr_coef > 0').sort_values('corr_pval').head(25)
subset_2['type'] = 'T'
subset_3 = subset.query('corr_fdr < 0.1 & corr_coef < 0').sort_values('corr_pval').head(25)
subset_3['type'] = 'M'
subsets[tf] = pd.concat([subset_2, subset_1, subset_3])
heatmap = subsets['SUB1'].merge(corr_df)
heatmap.index = heatmap['gene_2']
heatmap = heatmap[['ct_cM', 'ct_Th']]
tf = 'CEBPB'
heatmap = subsets[tf].merge(corr_df)
heatmap.index = heatmap['gene_2']
heatmap = heatmap[['ct_cM', 'ct_Th']]
plt.figure(figsize=(2,6))
import matplotlib.pylab as pylab
params = {'legend.fontsize': 'medium',
'axes.labelsize': 'medium',
'axes.titlesize':'medium',
'figure.titlesize':'medium',
'xtick.labelsize':'small',
'ytick.labelsize':'xx-small'}
pylab.rcParams.update(params)
sns.heatmap(heatmap, cmap='viridis', yticklabels=False)
plt.ylabel('Genes (correlation)'); plt.title(tf)
plt.savefig('{}_heatmap.png'.format(tf), dpi=800, bbox_inches='tight')
ct = 'Th_tf'
result = sc.read(data_path + 'coex_markers/{}.h5ad'.format(ct))
ht_result = memento.get_2d_ht_result(result)
ht_result['corr_fdr'] = memento.util._fdrcorrect(ht_result['corr_pval'])
moments = memento.get_2d_moments(result, groupby='is_ct')
moments_1d = memento.get_1d_moments(result, groupby='ct')
merged = moments.merge(ht_result, on=['gene_1', 'gene_2'])
merged.query('corr_fdr < 0.1 & is_ct_0 < 0.2').sort_values('is_ct_1')
merged['sign'] = merged['corr_coef']>0
merged.query('corr_fdr < 0.1').groupby('gene_1').agg({'sign':['count', 'mean']})
moments_1d[0].query('gene == "IRF1"')
moments.query('gene_2 == "ATF4"')
merged['sign'] = merged['corr_coef'] > 0
hit_tfs = merged.query('corr_fdr < 0.1').gene_1.value_counts()
hit_tfs = hit_tfs[hit_tfs > 25].index.tolist()
print(merged.query('corr_fdr < 0.1 & gene_1 in @hit_tfs').groupby('gene_1').sign.mean().sort_values())
gene_1
YBX1 0.000000
HMGB1 0.009174
ZFP36L2 0.027027
LYAR 0.040000
BCLAF1 0.507463
SUB1 0.520833
ZNF24 0.590909
CNBP 0.634921
LRRFIP1 0.666667
GPBP1 0.682927
ELF1 0.758621
ZNF394 0.769231
SFPQ 0.894737
NFE2L2 0.909091
CREM 0.938776
HBP1 0.950820
DNAJC2 0.966292
KLF6 0.973684
ATF4 0.975155
JUNB 0.987013
SON 0.991071
JUN 1.000000
IRF1 1.000000
Name: sign, dtype: float64
tf = 'GPBP1'
heatmap = merged.query('gene_1 == "{}"'.format(tf))
sns.clustermap(heatmap[['is_ct_0', 'is_ct_1']], cmap='viridis')
merged
heatmap = pd.DataFrame()
ct = 'Th'
result = sc.read(data_path + 'coex_markers/{}.h5ad'.format(ct))
ht_result = memento.get_2d_ht_result(result)
ht_result['corr_fdr'] = memento.util._fdrcorrect(ht_result['corr_pval'])
moments = memento.get_2d_moments(result, groupby='is_ct')
merged = moments.merge(ht_result, on=['gene_1', 'gene_2'])
merged.query('corr_fdr < 0.1 & (is_ct_0 < 0.1 | is_ct_1 < 0.1)')
def plot_microscopy(X, c1, c2, s=5, q_up=.95, q_doawn=0.1, min_val=0.1, alpha=0.1, xlim=None, ylim=None, remove_axis=True):
N = X.shape[0]
c1 = np.clip(c1, a_min=np.quantile(c1, q_down), a_max=np.quantile(c1, q_up))
c2 = np.clip(c2, a_min=np.quantile(c2, q_down), a_max=np.quantile(c2, q_up))
c1 = (c1 - c1.min())/(c1.max()-c1.min())
c2 = (c2 - c2.min())/(c2.max()-c2.min())
c1 = np.clip(c1, a_min=min_val, a_max=1)
c2 = np.clip(c2, a_min=min_val, a_max=1)
plt.subplot(1, 3, 1); plt.scatter(X[:, 0], X[:, 1], c=np.vstack([c1, np.zeros(N), np.zeros(N)]).T, s=s, alpha=alpha)
plt.gca().set_facecolor((0, 0, 0))
if remove_axis:
plt.xticks([]); plt.yticks([])
if xlim is not None and ylim is not None:
plt.xlim(xlim); plt.ylim(ylim);
plt.subplot(1, 3, 2); plt.scatter(X[:, 0], X[:, 1], c=np.vstack([np.zeros(N), c2, np.zeros(N)]).T, s=s, alpha=alpha)
plt.gca().set_facecolor((0, 0, 0))
if remove_axis:
plt.xticks([]); plt.yticks([])
if xlim is not None and ylim is not None:
plt.xlim(xlim); plt.ylim(ylim);
plt.subplot(1, 3, 3); plt.scatter(X[:, 0], X[:, 1], c=np.vstack([c1, c2, np.zeros(N)]).T, s=s, alpha=alpha)
plt.gca().set_facecolor((0, 0, 0))
if xlim is not None and ylim is not None:
plt.xlim(xlim); plt.ylim(ylim);
if remove_axis:
plt.xticks([]); plt.yticks([])
def get_ct_ind_corr(adata, gene_1, gene_2):
adata_temp = adata.copy()
scmemo.create_groups(adata_temp, label_columns=['cell', 'stim','ind'], inplace=True)
scmemo.compute_1d_moments(
adata_temp, inplace=True, filter_genes=False,
residual_var=True, use_n_umi=False, filter_mean_thresh=0.125,
min_perc_group=0.99)
scmemo.compute_2d_moments(adata_temp, [gene_1], [gene_2])
df_list = []
for group in adata_temp.uns['scmemo']['groups']:
_, ct, stim, ind = group.split('^')
if ct not in cts:
continue
df_list.append((label_converter[ct], stim,ind,adata_temp.uns['scmemo']['2d_moments'][group]['corr'][0][0]))
df = pd.DataFrame(df_list, columns=['ct', 'stim','ind', 'corr']).sort_values('ct')
df['corr'] = df['corr'].apply(lambda x: np.nan if abs(x) > 1 else x)
return df
imp.reload(hypothesis_test)
imp.reload(bootstrap)
imp.reload(scmemo)
imp.reload(estimator)
adata_dict = {}
for ct in cts:
print('Processing', ct)
adata_ct = adata[adata.obs.stim == 'ctrl'].copy()
# adata_ct.obs['cell'] = np.random.choice(adata_ct.obs['cell'], adata_ct.shape[0], replace=False)
adata_ct.obs['ct'] = adata_ct.obs['cell'].apply( lambda x: int(x == ct))# adata_ct.obs['stim'] = np.random.choice(adata_ct.obs['stim'], adata_ct.shape[0])
scmemo.create_groups(adata_ct, label_columns=['ct', 'cell' ,'ind'], inplace=True)
scmemo.compute_1d_moments(
adata_ct, inplace=True, filter_genes=True,
residual_var=True, use_n_umi=False, filter_mean_thresh=0.25,
min_perc_group=0.99)
print('Size of data', adata_ct.shape)
available_tfs = list(set(tfs) & set(adata_ct.var.index.tolist()))
target_genes = adata_ct.var.index.tolist()
target_genes = [gene for gene in target_genes if gene[:2] != 'RP' and gene[:3] != 'HLA']
# target_genes = np.random.choice(target_genes, 50)
print('TF list length', len(available_tfs))
print('target gene length', len(target_genes))
scmemo.compute_2d_moments(adata_ct, target_genes, target_genes)
scmemo.ht_2d_moments(adata_ct, formula_like='1 + ct', cov_column='ct', num_boot=5000, num_cpus=6)
adata_ct.write(data_path + 'result_2d/ct_specific_{}_05292020.h5ad'.format(label_converter[ct]))
adata_dict[ct] = adata_ct.copy()
adata_dict = {}
for ct in cts:
adata_dict[ct] = sc.read(data_path + 'result_2d/ct_specific_{}_05292020.h5ad'.format(label_converter[ct]))
def get_2d_ht_result(adata):
result_df = pd.DataFrame(
itertools.product(
adata.uns['scmemo']['2d_moments']['gene_1'],
adata.uns['scmemo']['2d_moments']['gene_2']),
columns=['gene_1', 'gene_2'])
result_df['corr_coef'] = adata.uns['scmemo']['2d_ht']['corr_coef'].ravel()
result_df['corr_pval'] = adata.uns['scmemo']['2d_ht']['corr_asl'].ravel()
result_df['corr_fdr'] = util._fdrcorrect(result_df['corr_pval'].values)
return result_df
for ct in cts:
print(ct)
print(get_2d_ht_result(adata_dict[ct]).query('corr_fdr < 0.15').shape)
df = scmemo.get_2d_ht_result(adata_dict['CD4 T cells'])
df.query('corr_fdr < 0.2').sort_values('corr_coef')#.head(20)
# EEF1A1 EEF1D
plt.figure(figsize=(2, 2))
gene_1, gene_2 = 'FTH1', 'TMSB4X'
plot_df = get_ct_ind_corr(adata_ct, gene_1, gene_2).query('ct in ["B","Th","cM","ncM"]')
sns.boxplot(x='ct', y='corr',
data=plot_df,
palette='Set2')
sns.stripplot(x='ct', y='corr',
data=plot_df,
palette='Set2', linewidth=2)
plt.ylabel('FTH1 and TMSB4X\ncorrelation')
plt.xlabel('cell type')
# plt.title('Correlation\nbetween\nFTH1 and TMSB4X')
plt.savefig(fig_path + 'fth1_tmsb4x.pdf', bbox_inches='tight')
norm_adata = adata.copy()
sc.pp.normalize_total(norm_adata, target_sum=1e4)
# sc.pp.log1p(norm_adata)
plt.figure(figsize=(15, 5))
plot_ct = 'CD14+ Monocytes'
dat = norm_adata[(norm_adata.obs.stim == 'ctrl')]
plot_microscopy(
X=dat.obsm['X_tsne'],
c1=dat[:,gene_1].X.todense().A1,
c2=dat[:,gene_2].X.todense().A1,
s=1,
q_down=0.0,
q_up=0.9,
alpha=0.5,
remove_axis=True)
plt.figure(figsize=(5, 1))
plot_ct = 'CD14+ Monocytes'
dat = norm_adata[(norm_adata.obs.stim == 'ctrl') & (norm_adata.obs.cell == plot_ct)]
plot_microscopy(
X=dat.obsm['X_tsne'],
c1=dat[:,gene_1].X.todense().A1,
c2=dat[:,gene_2].X.todense().A1,
s=1,
q_down=0.5,
q_up=0.9,
alpha=0.5,
remove_axis=True,
xlim=(-40, -5),ylim=(2, 37))
plt.savefig(fig_path + 'fth1_tmsb4x_cd14.pdf', bbox_inches='tight')
plt.figure(figsize=(5, 1))
plot_ct = 'CD4 T cells'
dat = norm_adata[(norm_adata.obs.stim == 'ctrl') & (norm_adata.obs.cell == plot_ct)]
plot_microscopy(
X=dat.obsm['X_tsne'],
c1=dat[:,gene_1].X.todense().A1,
c2=dat[:,gene_2].X.todense().A1,
s=1,
q_down=0.5,
q_up=0.9,
alpha=0.5,
remove_axis=True,
xlim=(-25, 25),ylim=(-40, 0))
plt.savefig(fig_path + 'fth1_tmsb4x_cd4.pdf', bbox_inches='tight')
norm_adata = adata.copy()
sc.pp.normalize_total(norm_adata, target_sum=1e4)
sc.pp.log1p(norm_adata)
sc.pl.tsne(norm_adata, color=['PPBP', 'PF4'])
sc.pl.tsne(norm_adata, color=['FTH1', 'TMSB4X', 'cell'])
https://mp.weixin.qq.com/s/DrSeSNR02J4MAH6pi0g9Rg