different results by using run_pathfindR() vs step-by-step workflow

AliSaadatV commented 12 months ago

Hello

Thanks for the great tool.

I noticed that when I use run_pathfindR(), I get an output with 9 enriched pathways. But when I follow the step-by-step workflow, the output is empty (Although I use similar parameters). May I kindly ask what could be the reason?

Here is the code I used for the run_pathfindR():

PPI = "STRING"
PATHWAY = "KEGG"

run_pathfindR(df_input,
                                      p_val_threshold = 0.2, 
                                      min_gset_size = 5,
                                      max_gset_size = 500, 
                                      adj_method = "fdr",
                                      gene_sets =  PATHWAY,
                                      pin_name_path = PPI,
                                      list_active_snw_genes = TRUE)

Here is the code I used for the step-by-step workflow:

PPI = "STRING"
PATHWAY = "KEGG"

df_input_processed <- input_processing(
  input = df_input, # the input: in this case, differential expression results
  p_val_threshold = 0.2, # p value threshold to filter significant genes
  pin_name_path = PPI, # the name of the PIN to use for active subnetwork search
  convert2alias = TRUE # boolean indicating whether or not to convert missing symbols to alias symbols in the PIN
)

# using "KEGG" as our gene sets for enrichment
pathway_list <- fetch_gene_set(
  gene_sets = PATHWAY,
  min_gset_size = 5,
  max_gset_size = 500
)
pathway_gsets <- pathway_list[[1]]
pathway_descriptions <- pathway_list[[2]]

# run pathfindR
n_iter <- 10 ## number of iterations
combined_res <- NULL ## to store the result of each iteration

for (i in 1:n_iter) {
  ###### Active Subnetwork Search
  snws_file <- paste0("active_snws_", i) # Name of output file
  active_snws <- active_snw_search(
    input_for_search = df_input_processed,
    pin_name_path = PPI,
    snws_file = snws_file,
    score_quan_thr = 0.8, # you may tweak these arguments for optimal filtering of subnetworks
    sig_gene_thr = 0.02, # you may tweak these arguments for optimal filtering of subnetworks
    search_method = "GR", # we suggest using GR
    seedForRandom = i # setting seed to ensure reproducibility per iteration
  )

  ###### Enrichment Analyses
  current_res <- enrichment_analyses(
    snws = active_snws,
    sig_genes_vec = df_input_processed$GENE,
    pin_name_path = PPI,
    genes_by_term = pathway_gsets,
    term_descriptions = pathway_descriptions,
    adj_method = "fdr",
    enrichment_threshold = 0.05,
    list_active_snw_genes = TRUE
  ) # listing the non-input active snw genes in output

  ###### Combine results via `rbind`
  combined_res <- rbind(combined_res, current_res)
}

###### Summarize Combined Enrichment Results
summarized_df <- summarize_enrichment_results(combined_res,
  list_active_snw_genes = TRUE
)

###### Annotate Affected Genes Involved in Each Enriched Term
final_res <- annotate_term_genes(
  result_df = summarized_df,
  input_processed = df_input_processed,
  genes_by_term = pathway_gsets)

I am attaching the df_input and output of run_pathfindR. the output of step-by-step is empty.

Thank you in advance Ali files.zip

egeulgen commented 12 months ago

I suspected it's because the seeds (seedForRandom) are different but that does not seem to be the case. I'll try to reproduce/look into it and let you know soon

egeulgen commented 12 months ago

I couldn't reproduce the issue, getting the exact number of enriched pathways with the script below:

# setwd("misc/issues/issue183/")

library(pathfindR)

input_df <- readRDS("files/df_input.rds")
PPI <- "STRING"
PATHWAY <- "KEGG"

# wrapper -----------------------------------------------------------------
out_df1 <- run_pathfindR(
  input_df,
  p_val_threshold = 0.2,
  min_gset_size = 5,
  max_gset_size = 500,
  adj_method = "fdr",
  gene_sets = PATHWAY,
  pin_name_path = PPI,
  list_active_snw_genes = TRUE
)

# manual ------------------------------------------------------------------

df_input_processed <- input_processing(
  input = input_df,
  p_val_threshold = 0.2,
  pin_name_path = PPI,
  convert2alias = TRUE
)

pathway_list <- fetch_gene_set(
  gene_sets = PATHWAY,
  min_gset_size = 5,
  max_gset_size = 500
)
pathway_gsets <- pathway_list[[1]]
pathway_descriptions <- pathway_list[[2]]

# run pathfindR
n_iter <- 10
combined_res <- NULL

for (i in 1:n_iter) {
  ###### Active Subnetwork Search
  snws_file <- paste0("active_snws_", i)
  active_snws <- active_snw_search(
    input_for_search = df_input_processed,
    pin_name_path = PPI,
    snws_file = snws_file,
    score_quan_thr = 0.8,
    sig_gene_thr = 0.02,
    search_method = "GR",
    seedForRandom = i
  )

  ###### Enrichment Analyses
  current_res <- enrichment_analyses(
    snws = active_snws,
    sig_genes_vec = df_input_processed$GENE,
    pin_name_path = PPI,
    genes_by_term = pathway_gsets,
    term_descriptions = pathway_descriptions,
    adj_method = "fdr",
    enrichment_threshold = 0.05,
    list_active_snw_genes = TRUE
  )

  ###### Combine results via `rbind`
  combined_res <- rbind(combined_res, current_res)
}

###### Summarize Combined Enrichment Results
summarized_df <- summarize_enrichment_results(
  combined_res,
  list_active_snw_genes = TRUE
)

###### Annotate Affected Genes Involved in Each Enriched Term
out_df2 <- annotate_term_genes(
  result_df = summarized_df,
  input_processed = df_input_processed,
  genes_by_term = pathway_gsets)

# compare -----------------------------------------------------------------
nrow(out_df1) == nrow(out_df2)
# [1] TRUE
identical(out_df1, out_df2)
# [1] TRUE

egeulgen commented 12 months ago

feel free to re-open the issue if this persists

egeulgen / pathfindR

different results by using run_pathfindR() vs step-by-step workflow #183