Closed npokorzynski closed 3 years ago
Hi @npokorzynski Sure let me prepare that script for you over the next few days. It will take as input the genewalk_scatterplots.csv file which contains the data for those plots. If it turns out useful, I will consider including it into a future GeneWalk release. Robert
@ri23 great, thanks Robert!
Hi @npokorzynski
Here is the Python script to plot the regulator and moonlighter scatter plots. I have added the script on the tutorial website as well in the visualization section 5: https://churchman.med.harvard.edu/genewalk/#tutorial
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['pdf.fonttype'] = 42 #embed fonts into pdf figure
import seaborn as sns
# ### Load GeneWalk scatterplots file
project_folder = '/home/genewalk/qki/'
filename = 'genewalk_scatterplots.csv'
scatter_data = pd.read_csv(os.path.join(project_folder,'figures',filename))
# ### Regulator and Moonlighting plot functions
# - If necessary customize plot sizes, plot filenames etc in functions below
# - The plots are automatically saved as png and pdf in `project_folder/figures` with default filenames starting with custom_regulators and custom_moonlighters
# - Amended from source file:
# `https://github.com/churchmanlab/genewalk/blob/master/genewalk/plot.py`
def scatterplot_regulators(named_genes,sd=scatter_data,name_space='hgnc_symbol'):
"""Scatter plot with fraction of (globally) relevant GO annotations
as a function of gene connectivity (to other genes) for all input
genes.
Arguments
scat_data : genewalk_scatterplots.csv loaded as pandas dataframe
named_genes: list with genes of type name_space. These will be labeled
in the plot.
name_space : which column name from genewalk_scatterplots.csv used as
identifier for genes. Default value: 'hgnc_symbol'. Other
possible values: {'mgi_id','hgnc_id'}
Default visualization thresholds:
T_frac: minimal fraction of relevant GO annotations, set to 0.5
T_gcon: minimal gene connectivity (to other
genes), set to 75th quartile of distribution
"""
xlab = 'Connections with other genes (per gene)'
ylab = 'Fraction of relevant GO terms (per gene)'
xvar = 'gene_con'
yvar = 'frac_rel_go'
plot_filename = 'custom_regulators_x_' + xvar + '_y_' + yvar #adjust as needed
plot_title = 'GeneWalk regulator genes' #Adjust as needed
sizex = 4 #unit: inch, adjust as needed
sizey = 4 #unit: inch, adjust as needed
font_sz = 10 # adjust as needed
xmin = 0.45
xmax = max(sd[xvar])*1.2
T_gcon = sd[xvar].quantile(q=0.75)
T_frac = 0.5
# seaborn static plot
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(sizex, sizey)) # inches
g = sns.scatterplot(x=xvar, y=yvar, hue=yvar,
linewidth=0, alpha=0.5,
# sizes=(40, 400),
data=sd,
ax=ax, legend=False)
plt.axvline(x=T_gcon, color='grey', linestyle='--')
plt.axhline(y=T_frac, color='grey', linestyle='--')
plt.xlabel(xlab, size=font_sz)
plt.ylabel(ylab, size=font_sz)
plt.xlim([xmin, xmax])
plt.xticks(size=font_sz)
plt.yticks(size=font_sz)
dreg = sd[sd[name_space].isin(named_genes)]
for r in dreg.index:
gname = dreg[name_space][r]
x_txt = dreg[xvar][r]
y_txt = dreg[yvar][r]
g.text(x_txt, y_txt, gname, size=font_sz*0.8,
horizontalalignment='center',
color='black', weight='semibold', fontstyle='italic')
g.set(xscale="log")
plt.title(plot_title, size=font_sz)
plt.savefig(os.path.join(project_folder, 'figures', plot_filename + '.pdf'),
bbox_inches="tight", transparent=True)
plt.savefig(os.path.join(project_folder, 'figures', plot_filename + '.png'),
bbox_inches="tight", transparent=True)
def scatterplot_moonlighters(named_genes,sd=scatter_data,name_space='hgnc_symbol'):
"""Scatter plot with fraction of (globally) relevant GO annotations
as a function of number of GO annotations for all input genes.
Arguments
sd : genewalk_scatterplots.csv loaded as pandas dataframe: scatter_data
named_genes: list with genes of type name_space. These will be labeled
in the plot.
name_space : which column name from genewalk_scatterplots.csv used as
identifier for genes. Default value: 'hgnc_symbol'. Other
possible values: {'mgi_id','hgnc_id'}
Visualization thresholds:
T_frac: maximal fraction of relevant GO annotations, set to 0.5
T_gocon: minimal number of GO annotations per gene,
set to max of 30 and 75th quartile of distribution.
"""
xlab = 'Number of GO annotations (per gene)'
ylab = 'Fraction of relevant GO terms (per gene)'
xvar = 'go_con'
yvar = 'frac_rel_go'
plot_filename = 'custom_moonlighters_x_' + xvar + '_y_' + yvar #adjust as needed
plot_title = 'GeneWalk moonlighting genes' #Adjust as needed
sizex = 4 #unit: inch, adjust as needed
sizey = 4 #unit: inch, adjust as needed
font_sz = 10 # adjust as needed
xmin = 0.45
xmax = max(sd[xvar])*1.2
T_gocon = max(30, sd[xvar].quantile(q=0.75))
T_frac = 0.5
#seaborn static plot
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(sizex, sizey)) # inches
g = sns.scatterplot(x=xvar, y=yvar, hue=yvar,
linewidth=0, alpha=0.5,
# sizes=(40, 400),
data=sd,
ax=ax, legend=False)
plt.axvline(x=T_gocon, color='grey', linestyle='--')
plt.axhline(y=T_frac, color='grey', linestyle='--')
plt.xlabel(xlab, size=font_sz)
plt.ylabel(ylab, size=font_sz)
plt.xlim([xmin, xmax])
plt.xticks(size=font_sz)
plt.yticks(size=font_sz)
dmoon = sd[sd[name_space].isin(named_genes)]
for m in dmoon.index:
gname = dmoon[name_space][m]
x_txt = dmoon[xvar][m]
y_txt = dmoon[yvar][m]
g.text(x_txt, y_txt, gname, size=font_sz*0.8,
horizontalalignment='center',
color='black', weight='semibold', fontstyle='italic')
g.set(xscale="log")
plt.title(plot_title, size=font_sz)
plt.savefig(os.path.join(project_folder, 'figures', plot_filename + '.pdf'),
bbox_inches="tight", transparent=True)
plt.savefig(os.path.join(project_folder, 'figures', plot_filename + '.png'),
bbox_inches="tight", transparent=True)
# ### Plot Regulator scatter plots with custom chosen genes
# specify genes in labeled_regulators.
labeled_regulators = ['JUN','TCF3','STAT3']
scatterplot_regulators(labeled_regulators,
scatter_data,
'hgnc_symbol')
# ### Plot Moonlighting scatter plots with custom chosen genes
#
# specify genes in labeled_moonlighters
labeled_moonlighters = ['WNT5A','TLR4']
scatterplot_moonlighters(labeled_moonlighters,
scatter_data,
'hgnc_symbol')
Awesome, thanks a lot!
Nick
Hi,
I was curious if you have a python or R script for reproducing the scatterplots such that individual genes of interest can be labeled and the size of the plot can be adjusted for easier visualization for publication?
Thanks, Nick