phenoscape / rphenoscape

R package to make phenotypic traits from the Phenoscape Knowledgebase available from within R.
https://rphenoscape.phenoscape.org/
Other
5 stars 5 forks source link

Retrieving a semantic similarity matrix #37

Open uyedaj opened 5 years ago

uyedaj commented 5 years ago

---
title: "Pipeline for retrieving semantic similarity matrix"
output: html_notebook
---

This is our pipeline for retrieving a semantic similarity matrix, which we would like to see streamlined and made robust in rphenoscape. Our first step is to obtain a treedata object for anatomical entities in catfish.

library(rphenoscape)
library(treeplyr)
library(RCurl)
library(rjson)
library(readr)
library(urltools)
library(pracma)
library(httr)

# Request the Ontotrace matrix (as NeXML) for Siluriformes across all
# anatomical entities, then convert it into a taxon-by-trait table.
nex <- pk_get_ontotrace_xml(
  taxon = "Siluriformes",
  entity = "anatomical entity"
)
m <- pk_get_ontotrace(nex)

# Replace spaces in taxon names with underscores
# (presumably so they match the tip labels of the tree read below).
m$taxa <- gsub(" ", "_", m$taxa, fixed = TRUE)
#write.csv(m, file="~/repos/ontologyPCM/data/Ontotrace_Siluriformes_AnatomicalEntity.csv")

# Read the phylogeny from Dryad and match it against the trait table.
tree <- read.tree(
  file = "https://datadryad.org/bitstream/handle/10255/dryad.199127/actinopt_12k_treePL.tre?sequence=1"
)
td <- make.treedata(tree, m)
print(td)

We remove the OTU columns and look at the remaining traits.

# Trait names are the column names of the matched data, minus the first two
# columns, which hold OTU data rather than traits.
traits <- colnames(td$dat)[-c(1L, 2L)]
print(traits)

Get IRI ids for each trait.

# Look up the Phenoscape KB record for every trait label.
traitDetails <- lapply(traits, function(x) pk_anatomical_detail(x, verbose = TRUE))
traitDetails[1:5]

# Collect the '@id' column (the term IRI) of every record into one flat,
# unnamed vector. lapply() + unlist() is used instead of sapply() + do.call(c, .)
# because sapply() returns an atomic vector when every record yields exactly one
# id, and do.call() then fails ("second argument must be a list").
traitIDs <- unlist(
  lapply(traitDetails, function(x) x[, "@id"]),
  use.names = FALSE
)

# Later steps label the similarity matrix with `traits`, so the ids must line
# up one-to-one with the trait names.
if (length(traitIDs) != length(traits)) {
  warning(
    "Found ", length(traitIDs), " IRIs for ", length(traits),
    " traits; some traits have no match or several matches.",
    call. = FALSE
  )
}

# URL-encode each IRI; the result is a character vector named by the raw IRI.
irisPhenotypes <- vapply(traitIDs, url_encode, character(1))

This is the ugliest part: making sure the URL encoding works. I think it is also the hardest part (for me) to make robust.

# File that receives the URL-encoded form body for the similarity request.
filename <- "../output/siluriformesFormData.txt"

# Percent-encode the characters that are still literal after url_encode()
# (presumably because url_encode() leaves them untouched -- TODO confirm).
# The result stays a list with one encoded IRI per element.
irisPhenotypes <- lapply(irisPhenotypes, function(x) {
  x <- gsub("/", "%2F", x, fixed = TRUE)
  x <- gsub(":", "%3A", x, fixed = TRUE)
  gsub("=", "%3D%0A", x, fixed = TRUE)
})

# Build the body `iris=[\n  "<iri>","<iri>",...]\n` in percent-encoded form:
# %5B/%5D are the brackets, %22 the quotes, %2C the separating comma.
# Collapsing with paste() writes each IRI exactly once for any number of IRIs;
# indexing with 1:(n - 1) wrote a single IRI twice and failed on an empty list.
quotedIris <- paste0("%22", unlist(irisPhenotypes, use.names = FALSE), "%22")
cat(
  "iris=%5B%0A%20%20",
  paste(quotedIris, collapse = "%2C"),
  "%5D%0A",
  file = filename,
  sep = ""
)

Submit the API request.

# POST the form-data file to the Phenoscape KB Jaccard similarity endpoint via
# the system `curl` binary. The path comes from `filename`, so the file that was
# written and the file that is sent cannot drift apart, and shQuote() quotes
# both arguments in the style the current platform's shell expects.
api.semanticSimilarity_query <- paste(
  "curl -X POST -d",
  shQuote(paste0("@", filename)),
  shQuote("http://kb.phenoscape.org/api/similarity/jaccard")
)
semanticSimilarityAPIResults <- system(api.semanticSimilarity_query, intern = TRUE)

# With intern = TRUE a non-zero exit code is only reported as a warning plus a
# "status" attribute; fail here rather than while parsing an empty response.
curlStatus <- attr(semanticSimilarityAPIResults, "status")
if (!is.null(curlStatus) && curlStatus != 0) {
  stop("curl exited with status ", curlStatus, call. = FALSE)
}

Process the results from the API request.


# Parse the JSON response. system(intern = TRUE) yields one string per output
# line, so the lines are joined before parsing.
results <- fromJSON(paste(semanticSimilarityAPIResults, collapse = ""))

# One score per returned pair; a missing score becomes NA. vapply() keeps the
# result a numeric vector even when no pairs are returned.
scores <- vapply(
  results$results,
  function(x) if (is.null(x$score)) NA_real_ else as.numeric(x$score),
  numeric(1)
)

# Matrix with one row per returned pair, holding the unescaped terms of the
# pair (NULL when the response contains no pairs).
result_terms <- do.call(
  rbind,
  lapply(results$results, function(x) do.call(cbind, lapply(x$terms, curlUnescape)))
)

# Square matrix over all requested IRIs: NA where no score was returned,
# 1 on the diagonal.
nTraits <- length(irisPhenotypes)
semanticSimilarityMatrix <- matrix(NA_real_, nrow = nTraits, ncol = nTraits)
diag(semanticSimilarityMatrix) <- 1
rownames(semanticSimilarityMatrix) <- colnames(semanticSimilarityMatrix) <- curlUnescape(irisPhenotypes)

# Fill both triangles from the pairwise scores. seq_len(NROW(.)) makes the loop
# a no-op when no pairs were returned.
for (i in seq_len(NROW(result_terms))) {
  termA <- result_terms[i, 1]
  termB <- result_terms[i, 2]
  semanticSimilarityMatrix[termB, termA] <- scores[i]
  semanticSimilarityMatrix[termA, termB] <- scores[i]
}

# Relabel with the human-readable trait names before saving.
rownames(semanticSimilarityMatrix) <- colnames(semanticSimilarityMatrix) <- traits

write.csv(semanticSimilarityMatrix, file = "../output/siluriformesSemanticSimMatrix.csv")

Check whether the semantic similarity matrix makes sense by listing, for each trait, its highest- and lowest-scoring match.

# For every column (trait) of a similarity matrix, find the other trait with
# the most extreme score.
#   sim  - square numeric matrix with row and column names
#   pick - which.max or which.min; both skip NA and return the first index on
#          ties, so a single missing score no longer turns the row into NA
# Returns a character matrix with one row per trait and three columns:
# the trait, its selected partner, and the score rounded to 4 decimals.
# A trait whose other scores are all NA gets NA for partner and score.
extremeSimilarity <- function(sim, pick) {
  perTrait <- lapply(seq_len(ncol(sim)), function(i) {
    others <- sim[-i, i]              # scores against every trait but itself
    j <- pick(others)
    if (length(j) == 0L) {
      j <- NA_integer_                # nothing but NA to choose from
    }
    cbind(
      colnames(sim)[i],
      rownames(sim)[-i][j],
      round(unname(others[j]), 4)
    )
  })
  do.call(rbind, perTrait)
}

# Best match for each trait.
maxSS <- extremeSimilarity(semanticSimilarityMatrix, which.max)
as.data.frame(maxSS)

# Worst match for each trait.
minSS <- extremeSimilarity(semanticSimilarityMatrix, which.min)
as.data.frame(minSS)

Create a neighbor-joining tree from the semantic similarity (SS) matrix to see trait clusters.

# Turn similarity into distance (identical traits are at distance 0).
similarityDistance <- 1 - semanticSimilarityMatrix

# nj() refuses distance matrices with missing values, and pairs without a
# returned score are NA in the similarity matrix. Fall back to njs(), the
# neighbor-joining variant for incomplete distances, in that case.
if (anyNA(similarityDistance)) {
  message("Distance matrix has missing values; using njs() instead of nj().")
  njt <- njs(similarityDistance)
} else {
  njt <- nj(similarityDistance)
}

# Draw the unrooted tree on a large page so the small tip labels stay legible.
pdf("../output/njTreeSiluriformesSemanticMatrix.pdf", width = 30, height = 30)
plot(njt, type = "unrooted", cex = 0.35)
dev.off()
hlapp commented 1 year ago

@uyedaj and @diegosasso I think this can be considered complete, right?