churchmanlab / genewalk

GeneWalk identifies relevant gene functions for a biological context using network representation learning
https://churchman.med.harvard.edu/genewalk
BSD 2-Clause "Simplified" License
127 stars 14 forks source link

Code for regulator/moonlighter plots #49

Closed npokorzynski closed 3 years ago

npokorzynski commented 3 years ago

Hi,

I was curious if you have a python or R script for reproducing the scatterplots such that individual genes of interest can be labeled and the size of the plot can be adjusted for easier visualization for publication?

Thanks, Nick

ri23 commented 3 years ago

Hi @npokorzynski Sure let me prepare that script for you over the next few days. It will take as input the genewalk_scatterplots.csv file which contains the data for those plots. If it turns out useful, I will consider including it into a future GeneWalk release. Robert

npokorzynski commented 3 years ago

@ri23 great, thanks Robert!

ri23 commented 3 years ago

Hi @npokorzynski

Here is the Python script to plot the regulator and moonlighter scatter plots. I have added the script on the tutorial website as well in the visualization section 5: https://churchman.med.harvard.edu/genewalk/#tutorial

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['pdf.fonttype'] = 42 #embed fonts into pdf figure
import seaborn as sns

# ### Load GeneWalk scatterplots file

project_folder = '/home/genewalk/qki/'   

filename = 'genewalk_scatterplots.csv'
scatter_data = pd.read_csv(os.path.join(project_folder,'figures',filename)) 

# ### Regulator and Moonlighting plot functions
# - If necessary customize plot sizes, plot filenames etc in functions below
# - The plots are automatically saved as png and pdf in `project_folder/figures` with default filenames starting with custom_regulators and custom_moonlighters
# - Amended from source file:
# `https://github.com/churchmanlab/genewalk/blob/master/genewalk/plot.py`

def scatterplot_regulators(named_genes,sd=scatter_data,name_space='hgnc_symbol'):
    """Scatter plot with fraction of (globally) relevant GO annotations
    as a function of gene connectivity (to other genes) for all input
    genes.

    Arguments
    scat_data : genewalk_scatterplots.csv loaded as pandas dataframe
    named_genes: list with genes of type name_space. These will be labeled
                 in the plot.
    name_space : which column name from genewalk_scatterplots.csv used as
                 identifier for genes. Default value: 'hgnc_symbol'. Other 
                 possible values: {'mgi_id','hgnc_id'}

    Default visualization thresholds:
    T_frac: minimal fraction of relevant GO annotations, set to 0.5
    T_gcon: minimal gene connectivity (to other
    genes), set to 75th quartile of distribution
    """

    xlab = 'Connections with other genes (per gene)'
    ylab = 'Fraction of relevant GO terms (per gene)'
    xvar = 'gene_con'
    yvar = 'frac_rel_go'

    plot_filename = 'custom_regulators_x_' + xvar + '_y_' + yvar #adjust as needed
    plot_title = 'GeneWalk regulator genes' #Adjust as needed
    sizex = 4 #unit: inch, adjust as needed
    sizey = 4 #unit: inch, adjust as needed
    font_sz = 10 # adjust as needed

    xmin = 0.45
    xmax = max(sd[xvar])*1.2
    T_gcon = sd[xvar].quantile(q=0.75)
    T_frac = 0.5

    # seaborn static plot
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(sizex, sizey))  # inches
    g = sns.scatterplot(x=xvar, y=yvar, hue=yvar,
                        linewidth=0, alpha=0.5,
#                         sizes=(40, 400),
                        data=sd,
                        ax=ax, legend=False)
    plt.axvline(x=T_gcon, color='grey', linestyle='--')
    plt.axhline(y=T_frac, color='grey', linestyle='--')
    plt.xlabel(xlab, size=font_sz)
    plt.ylabel(ylab, size=font_sz)
    plt.xlim([xmin, xmax])
    plt.xticks(size=font_sz)
    plt.yticks(size=font_sz)

    dreg = sd[sd[name_space].isin(named_genes)]
    for r in dreg.index:
        gname = dreg[name_space][r]
        x_txt = dreg[xvar][r]
        y_txt = dreg[yvar][r]
        g.text(x_txt, y_txt, gname, size=font_sz*0.8, 
               horizontalalignment='center',
               color='black', weight='semibold', fontstyle='italic')
    g.set(xscale="log")
    plt.title(plot_title, size=font_sz)
    plt.savefig(os.path.join(project_folder, 'figures', plot_filename + '.pdf'),
                bbox_inches="tight", transparent=True)
    plt.savefig(os.path.join(project_folder, 'figures', plot_filename + '.png'),
                bbox_inches="tight", transparent=True)

def scatterplot_moonlighters(named_genes,sd=scatter_data,name_space='hgnc_symbol'):
    """Scatter plot with fraction of (globally) relevant GO annotations
    as a function of number of GO annotations for all input genes.

    Arguments
    sd : genewalk_scatterplots.csv loaded as pandas dataframe: scatter_data
    named_genes: list with genes of type name_space. These will be labeled
                 in the plot.
    name_space : which column name from genewalk_scatterplots.csv used as
                 identifier for genes. Default value: 'hgnc_symbol'. Other 
                 possible values: {'mgi_id','hgnc_id'}

    Visualization thresholds:
    T_frac: maximal fraction of relevant GO annotations, set to 0.5
    T_gocon: minimal number of GO annotations per gene,
    set to max of 30 and 75th quartile of distribution.
    """
    xlab = 'Number of GO annotations (per gene)'
    ylab = 'Fraction of relevant GO terms (per gene)'
    xvar = 'go_con'
    yvar = 'frac_rel_go'

    plot_filename = 'custom_moonlighters_x_' + xvar + '_y_' + yvar #adjust as needed
    plot_title = 'GeneWalk moonlighting genes' #Adjust as needed
    sizex = 4 #unit: inch, adjust as needed
    sizey = 4 #unit: inch, adjust as needed
    font_sz = 10 # adjust as needed

    xmin = 0.45
    xmax = max(sd[xvar])*1.2
    T_gocon = max(30, sd[xvar].quantile(q=0.75))
    T_frac = 0.5

    #seaborn static plot
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(sizex, sizey))  # inches
    g = sns.scatterplot(x=xvar, y=yvar, hue=yvar,
                        linewidth=0, alpha=0.5,
#                         sizes=(40, 400),
                        data=sd,
                        ax=ax, legend=False)
    plt.axvline(x=T_gocon, color='grey', linestyle='--')
    plt.axhline(y=T_frac, color='grey', linestyle='--')
    plt.xlabel(xlab, size=font_sz)
    plt.ylabel(ylab, size=font_sz)
    plt.xlim([xmin, xmax])
    plt.xticks(size=font_sz)
    plt.yticks(size=font_sz)

    dmoon = sd[sd[name_space].isin(named_genes)]
    for m in dmoon.index:
        gname = dmoon[name_space][m]
        x_txt = dmoon[xvar][m]
        y_txt = dmoon[yvar][m]
        g.text(x_txt, y_txt, gname, size=font_sz*0.8, 
               horizontalalignment='center',
               color='black', weight='semibold', fontstyle='italic')
    g.set(xscale="log")
    plt.title(plot_title, size=font_sz)

    plt.savefig(os.path.join(project_folder, 'figures', plot_filename + '.pdf'),
                bbox_inches="tight", transparent=True)
    plt.savefig(os.path.join(project_folder, 'figures', plot_filename + '.png'),
                bbox_inches="tight", transparent=True)

# ### Plot Regulator scatter plots with custom chosen genes
# specify genes in labeled_regulators.

labeled_regulators = ['JUN','TCF3','STAT3']

scatterplot_regulators(labeled_regulators,
                       scatter_data,
                       'hgnc_symbol')

# ### Plot Moonlighting scatter plots with custom chosen genes
# 
# specify genes in labeled_moonlighters

labeled_moonlighters = ['WNT5A','TLR4']

scatterplot_moonlighters(labeled_moonlighters,
                         scatter_data,
                         'hgnc_symbol')
npokorzynski commented 3 years ago

Awesome, thanks a lot!

Nick