Closed GuangchuangYu closed 1 year ago
# an early -unpolished -alpha version of the script
`library(tidyverse)
library(httr)
library(jsonlite)
library(rlist)
library(magrittr)
library(igraph)
library(clipr)
genelist <- DEG_names
out_form <- "json" # output format includes:tsv, tsv-no-header, json, xml, psi-mi, psi-mi-tab input_genes <- paste(genelist, collapse = "%0d") url_genes <- paste("https://string-db.org/api", out_form, paste("network?identifiers=",input_genes,"&species=10090",sep = ""), sep = "/") response <-GET(url_genes) result <- fromJSON(content(response, as="text"))
node <- unique(c(result$preferredName_A, # make sure all the genes are included result$preferredName_B)) network <- graph_from_data_frame(d = result[,c(3,4,6)], vertices = node, directed=F) deg <- degree(network, mode="all") plot(network, edge.arrow.size=.2, edge.curved=0, vertex.size = deg, vertex.color="orange", vertex.frame.color="firebrick", vertex.shape = "circle", vertex.label=node, vertex.label.color="black", vertex.label.cex=.7) `
library(pacman)
pacman::p_load(tidyverse, httr, jsonlite, rlist, magrittr, igraph, clipr)
DEGs_inter <- function (genelist, species) {
fromJSON(content(paste("https://string-db.org/api",
"json",
paste("network?identifiers=",
genelist %>% paste(collapse = "%0d"),
paste("&species=",species,sep = ""),
sep = ""),
sep = "/") %>% GET(),as="text"))
}
result <- DEGs_inter(genelist = DEG_names, species = 10090) # Mum,10090; Homo,9606; Rat,10116; etc.
write.csv(result, "~/Desktop/test_PPI.csv") # output the result if needed
network <- graph_from_data_frame(d = result[,c(3,4,6)],
vertices = unique(c(result$preferredName_A,
result$preferredName_B)),
directed=F)
plot(network,
edge.arrow.size=.2, edge.curved=0,
vertex.size = degree(network, mode="all"),
vertex.color="orange", vertex.frame.color="firebrick",
vertex.shape = "circle",
vertex.label=node, vertex.label.color="black",
vertex.label.cex=.7)
An reproducible Error has been found when combining the use of "org.Mm.eg.db" and the customized function "DEGs_inter".
DEGs_inter <- function (genelist, species) { fromJSON(content(paste("https://string-db.org/api", "json", paste("network?identifiers=", genelist %>% paste(collapse = "%0d"), paste("&species=",species,sep = ""), sep = ""), sep = "/") %>% GET(),as="text")) }
Once the "org.Mm.eg.db" is loaded, the content() function will always report an error, regardless of whether the functions in "org.Mm.eg.db" are called or not. Can't figure out the reason for now.😢
Problem Nailed by avoiding the use of content(). Now the script is primed to work with the result of enrichKEGG() or enrichGO().
library(pacman)
pacman::p_load(tidyverse, httr, jsonlite, rlist, magrittr, igraph, clipr, clusterProfiler, org.Mm.eg.db)
DEGs_inter <- function (genelist, species) {
res <- paste("https://string-db.org/api/json",
paste("network?identifiers=",genelist,
paste("&species=",species,sep = ""),
sep = ""),
sep = "/") %>% GET()
res$content %>% rawToChar() %>% fromJSON()
}
### load your own data
DEG_test <- na.omit(read.csv("~/wonderland/test.csv", header = T, sep = ","))
DEG_test_id <- bitr(DEG_test$geneID, fromType = "ENSEMBL", toType = c("ENTREZID","SYMBOL"), OrgDb =
"org.Mm.eg.db")
### enrichment analysis
kegg_test <- enrichKEGG(gene = DEG_test_id$ENTREZID,
organism = "mmu",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05) %>%
setReadable(OrgDb = org.Mm.eg.db, keyType = "ENTREZID")
kegg_sig <- kegg_test@result[kegg_test@result$p.adjust < 0.05,]
pathway_gene_id <- kegg_sig[kegg_sig$Description == "Apoptosis",]$geneID %>% #or your own interested pathway
strsplit("/") %>% unlist() %>% paste(collapse = "%0d")
result <- DEGs_inter(genelist = pathway_gene_id, species = 10090)
#write.csv(result, "~/Desktop/test_PPI.csv") # output the result if needed.
node <- unique(c(result$preferredName_A, result$preferredName_B))
network <- graph_from_data_frame(d = result[,c(3,4,6)],
vertices = node,
directed=F)
plot(network,
edge.arrow.size=.2, edge.curved=0,
vertex.size = network %>% degree(mode = "all"),
vertex.color="orange", vertex.frame.color="firebrick",
vertex.shape = "circle",
vertex.label=node, vertex.label.color="black",
vertex.label.cex=.7)
Currently, all the nodes in the plot are belonging to the selected pathway. Is it possible to incorporate genes that directly interact with the input genes?
As STRING use taxonomy ID, we need a function to get the ID for users.
Here is a prototype:
getTaxID <- function(species) {
if (inherits(species, "OrgDb")) {
m <- metadata(species)
res <- m$value[m$name == "TAXID"]
return(res)
}
## query from a lookup table that stores kegg.code to taxa.id
}
getTaxID(org.Hs.eg.db)
# [1] "9606"
Cool!! Need few more hours to think it through. Very helpful. 😬😬
A lookup table is something like this https://github.com/datapplab/SBGNhub/blob/9f94cde57c721e5392dfa1e574ca445d4d06d23b/data/species.specifid.pathway_pathwayCommons/trees/kegg.code.to.taxa.tsv.
However, the above table is outdated. We need to prepare an up-to-date version.
Luckily, I just found that the Ensembl Project has provided an Api for extracting taxonomy ID (https://rest.ensembl.org/documentation/info/taxonomy_id). It might also be a good way to get to it.
library(httr)
library(jsonlite)
library(magrittr)
getTax_id <- function(species) {
res_url <- paste("https://rest.ensembl.org/taxonomy/id",
(strsplit(species," ") %>% unlist() %>% paste(collapse = "%20") %>%
paste("?application/json", sep = "")),
sep = "/") %>% GET()
res <- res_url$content %>% rawToChar() %>% fromJSON()
res$id
}
getTax_id("Mus Musculus")
# [1] "10090"
getTax_id("Homo Sapiens")
# [1] "9606"
getTax_id("Acetitomaculum ruminis DSM 5522")
# [1] "1120918"
A new function has been built to better cooperate with the result passed from enrichKEGG() or enrichGO (). Now the users can easily get the input gene list by using _catid().
cat_id <- function(enrich_res, show_Category) {
input_res <- as.data.frame(enrich_res@result)
input_res[input_res$Description %in% show_Category,]$geneID %>%
strsplit("/") %>%
unlist() %>%
paste(collapse = "%0d")
}
cat_id(kegg_result, show_Category = c("Apoptosis", "Cell cycle"))
Currently, all the nodes in the plot are belonging to the selected pathway. Is it possible to incorporate genes that directly interact with the input genes?
Now the partners that possibly interacted with input genes could easily be obtained by using _getPartnernet().
getPartner_net <- function(genelist, species, limit) {
res <- paste("https://string-db.org/api/json",
paste("interaction_partners?identifiers=",genelist,
paste("&species=",species,sep = ""),
paste("&limit=",limit,sep = ""), # The number of wanted partner genes.
sep = ""),
sep = "/") %>% GET()
res$content %>% rawToChar() %>% fromJSON()
}
Let's finish this part first.
getTaxID <- function(species) {
if (inherits(species, "OrgDb")) {
m <- metadata(species)
res <- m$value[m$name == "TAXID"]
return(res)
}
## load cached taxonomy ID information
## kegg.code taxa.id
## e.g. hsa 9606
##
## use this function (will call `getTaxaInfo`) to prepare the cached data
## species list from https://rest.kegg.jp/list/organism
## extract species from the third column
## use second column as kegg.code
i <- which(kegg_taxa$kegg.code == species)
if (length(i) != 0)
return(kegg_taxa$taxa.id[i])
res <- getTaxaInfo(species)
return(res$id)
}
getTaxInfo <- function(species) {
url <- paste0(
"https://rest.ensembl.org/taxonomy/id/",
sub(" ", "%20", species),
"?application/json"
)
fromJSON(url)
}
Em... after trying the code I have to admit getting things done as locally as we could is a more elegant way. Let me try to keep improving it.
library(httr)
# Data fetch --------------------------------------------------------------
# Set stringDB base URL
address <- "https://string-db.org"
# Validate the address
httr::stop_for_status(httr::GET(address))
# Version info
current_version <- read.table(url(paste(address, "/api/tsv/version", sep = "")), header = TRUE)
available_version <- read.table(url(paste(address, "/api/tsv/available_api_versions", sep = "")), header = TRUE)
#' networkParamsParser
#'
#' parameters parser for [Getting the STRING network interactions](https://string-db.org/cgi/help.pl?sessionId=btsvnCeNrBk7).
#'
#' @param identifiers required parameter for multiple items, e.g. `c("PTCH1", "TP53", "BRCA1", "BRCA2")`
#' @param species NCBI taxon identifiers (e.g. Human is 9606, see: [STRING organisms](https://string-db.org/cgi/input.pl?input_page_active_form=organisms).
#' @param required_score threshold of significance to include a interaction, a number between 0 and 1000 (default depends on the network)
#' @param network_type network type: functional (default), physical
#' @param add_nodes adds a number of proteins with to the network based on their confidence score (default:1)
#' @param show_query_node_labels when available use submitted names in the preferredName column when (0 or 1) (default:0)
#' @param caller_identity your identifier for us.
#'
#' @return a list contain parameters for query
networkParamsParser <- function(
identifiers,
species,
required_score = NULL,
network_type = "functional",
add_nodes = 1,
show_query_node_labels = 0,
caller_identity = NULL
) {
# Format the identifiers
identifiers <- paste(identifiers, collapse = "\n")
# Check parameters
if (missing(species)) {
stop("Please provide an NCBI taxon identifier for the species.")
}
# Create parameters list
params <- list(
identifiers = identifiers,
species = species,
required_score = required_score,
network_type = network_type,
add_nodes = add_nodes,
show_query_node_labels = show_query_node_labels,
caller_identity = caller_identity
)
# Remove NULL elements from the list
filtered_params <- Filter(Negate(is.null), params)
return(filtered_params)
}
networkParams <- networkParamsParser(
identifiers = c("PTCH1", "TP53", "BRCA1", "BRCA2"),
species = 9606,
network_type = "functional",
add_nodes = 1,
show_query_node_labels = 0
)
# Define a cache directory
cache_dir <- tempdir()
# Define a function to read the file with caching
read_tsv_with_cache <- memoise::memoise(function(file_url,header = FALSE) {
# Generate a unique cache filename based on the file URL
cache_filename <- fs::path_join(c(cache_dir, paste0(digest::digest(file_url), ".rds")))
# Check if the cached file exists
if (file.exists(cache_filename)) {
# If cached file exists, load and return the cached data
cached_data <- readRDS(cache_filename)
return(cached_data)
} else {
# If cached file does not exist, download and cache the data
data <- read.delim(url(file_url), sep = "\t", header = header)
saveRDS(data, cache_filename)
return(data)
}
})
# read data from stringDB api
response <- httr::GET(paste(address, "/api/tsv/network", sep = ""), query = networkParams)
res <- read_tsv_with_cache(response$url, header = TRUE)
# read data from species code list
# Define the URLs for the files
kegg_species_url <- 'https://rest.kegg.jp/list/organism'
stringdb_species_url <- "https://stringdb-static.org/download/species.v11.5.txt"
# Read the files with caching
kegg_species <- read_tsv_with_cache(kegg_species_url)
stringdb_species <- read_tsv_with_cache(stringdb_species_url,header = TRUE)
COOL. As a beginner, needs more than one night to read and digest your code. Thanx.
Thanks, @Potato-tudou and @xiayh17 . We now have the first version of getTaxID
and getPPI
functions in clusterProfiler
, see https://github.com/YuLab-SMU/clusterProfiler/commit/d27ae964a26c73c6baab95fab7c79d7709f1dd29.
Thanks for the opportunity and very happy to be part of it.😬😬😬
From Potato's iPhone. 1Message ID: @.***>
详见公众号推送:当clusterProfiler遇见stringdb...