YuLab-SMU / clusterProfiler

:bar_chart: A universal enrichment tool for interpreting omics data
https://yulab-smu.top/biomedical-knowledge-mining-book/
1.02k stars 256 forks source link

All p-values are equal to 1 #690

Open aminakur opened 6 months ago

aminakur commented 6 months ago

I was running ClusterProfiler without any problems for several datasets, except for this time. I am using rice genome and follow this protocol My issue is that all p-values are equal to 1. This is my code:

col_names <- c("chromosome", "start", "end")

NPB <- read.table(file = "/Users/soniamukherjee/Desktop/TADS_project/NPB_TADs.bed", col.names = col_names, header = FALSE)
NPB
genes <- read.table("/Users/soniamukherjee/Desktop/TADS_project/IRGSP_genes.bed", header = FALSE, col.names = c("chromosome", "start", "end", "gene_id"))
gene_names <- genes$gene_id

library(GenomicRanges)

genes_gr <- GRanges(
  seqnames = Rle(genes$chromosome),
  ranges = IRanges(start = genes$start, end = genes$end),
  gene_id = genes$gene_id  
)

all_gr <- GRanges(
  seqnames = Rle(NPB$chromosome),
  ranges = IRanges(start = NPB$start, end = NPB$end)
)

intersecting_genesA <- genes_gr[genes_gr %over% all_gr]

gene_namesA <- mcols(intersecting_genesA)$gene_id
gene_namesA
noT <- setdiff(gene_names, gene_namesA)

library (GO.db)
library (dplyr)
## make GO ID-GO name mapping tables
# extract GO term information from a Bioconductor package, GO. db
go_table <- as.data.frame(GOTERM)
# only take go_id, Term, and Oncology columns and remove duplicated rows
godb <- unique(go_table[, c (1,3,4)])
# separate the GO terms into 3 ontology types
godb_BP <- godb[godb$Ontology =="BP", 1:2]
godb_MF <- godb[godb$Ontology == "MF", 1:2]
godb_CC <- godb[godb$Ontology =="CC", 1:2]
# save results
write.table(data.frame(godb_BP), file="/Users/soniamukherjee/Desktop/godb_BP.txt", sep = "\t", quote = FALSE, row.names = FALSE)
write.table(godb_MF, "/Users/soniamukherjee/Desktop/godb_MF.txt", sep = "\t", quote = FALSE,
            row.names = FALSE)
write.table(godb_CC, "/Users/soniamukherjee/Desktop/godb_CC.txt", sep = "\t", quote = FALSE,
            row.names = FALSE)

goid2gene_all <- read.table("/Users/soniamukherjee/Desktop/TADS_project/cache/combinedU.tsv", header=T,
                            sep="\t", quote="'", stringsAsFactors = F)
colnames(goid2gene_all) <- c("go_id", "gene_id")

# seperate genes based on their GO annotation types
goid2gene_BP <- goid2gene_all %>% dplyr::filter(go_id %in% godb_BP$go_id)
goid2gene_MF <- goid2gene_all %>% dplyr:: filter(go_id %in% godb_MF$go_id)
goid2gene_CC <- goid2gene_all %>% dplyr:: filter(go_id%in% godb_CC$go_id)
# save results
write.table(goid2gene_BP, "/Users/soniamukherjee/Desktop/goid2gene_BP.txt", sep="\t", row.names=FALSE, quote=FALSE)
write.table(goid2gene_MF, "/Users/soniamukherjee/Desktop/goid2gene_MF.txt", sep="\t",row.names=FALSE, quote=FALSE)
write.table(goid2gene_CC, "/Users/soniamukherjee/Desktop/goid2gene_CC.txt", sep="\t",row.names=FALSE, quote=FALSE)

library (clusterProfiler)
library(ggplot2)

term2gene <- read.table("/Users/soniamukherjee/Desktop/goid2gene_BP.txt", sep="\t", header=T, quote="", stringsAsFactors = F)
term2name <- read.table("/Users/soniamukherjee/Desktop/godb_BP.txt", sep="\t", header=T, quote="", stringsAsFactors= F)

go <- enricher(gene = gene_namesA, # a vector of gene id
               universe = noT, # background genes
               TERM2GENE = term2gene, # user input annotation of TERM TO GENE mapping
               TERM2NAME = term2name, # user input of TERM TO NAME mapping
               pvalueCutoff = 0.05, # p-value cutoff (default)
               pAdjustMethod = "BH", # multiple testing correction method to calculate adjusted p-value (default)
               qvalueCutoff = 0.2, # q-value cutoff (default). q value: local FDR corrected p-value.
               minGSSize = 10, # minimal size of genes annotated for testing (default)
               maxGSSize = 500, # maximal size of genes annotated for  testing (default)
)

go_df <- as.data.frame (go)
write.table(go_df, "cache/go_df.txt", sep="\t", row.names=FALSE,quote=FALSE)
p1 <- dotplot(go, showCategory=10,title = "Top 3 most statistically significant enriched GO terms (CC) of genes located within all TAD ",
              font.size = 10)
p1
ggsave (p1,
        filename = "/Users/soniamukherjee/Desktop/TADS_project/2kbCC_dotplot.png",
        height = 12, width = 30, units = "cm" )

The resulting p-values are all 1. What is the issue?

shihsama commented 6 months ago

It's not your fault. The p-values are showed as "all one" because they are too large and very close to one (the decimal digits can be hundreds). So don't worry a lot. After all, they are not significant.