ms609 / TreeDist

Calculate distances between phylogenetic trees in R
https://ms609.github.io/TreeDist/
29 stars 6 forks source link

Plotting large (400 taxa) trees with VisualizeMatching #124

Closed ammaraziz closed 2 months ago

ammaraziz commented 2 months ago

Hi @ms609

Thanks for the fantastic R package, it's been great to work with.

One issue I am facing is that I'd like to compare 2 trees with identical taxa. When using VisualizeMatching to compare these two trees the result is not great:

image

Could you provide advice on how to plot large trees?

Thanks,

Ammar

ms609 commented 2 months ago

These are large trees! What is your desired outcome in visualizing the matching? Presumably inspecting all 400 matched clades is going to yield limited information, or at the very least be difficult to interpret. Is there a subset of clades you are particularly interested in matching, for example?

ammaraziz commented 2 months ago

I'd like to see which internal branches are incongruent between a known good tree and another tree. I am following this example here: https://ms609.github.io/TreeDist/articles/using-distances.html#testing-similarity-to-a-known-tree

For big trees there are a few things that would be great to control:

I hacked together a solution that gets about 80% of the way there by modifying VisualizeMatching. However, I got stuck around the last point to declutter the tree.

Here's the modified function:

function(Func, tree1, tree2, setPar = TRUE, precision = 3L, 
                 Plot = plot.phylo, matchZeros = TRUE, plainEdges = FALSE, 
                 edge.width = 1, edge.color = "black", cex = cex, 
                 show.edgelabels = TRUE, ...) 
{
    splits1 <- as.Splits(tree1)
    edge1 <- tree1$edge
    child1 <- edge1[, 2]
    nTip <- attr(splits1, "nTip")
    splitEdges1 <- vapply(as.integer(rownames(splits1)), function(node) which(child1 == 
                                                                                  node), integer(1))
    splits2 <- as.Splits(tree2, tipLabels = tree1)
    edge2 <- tree2$edge
    child2 <- edge2[, 2]
    splitEdges2 <- vapply(as.integer(rownames(splits2)), function(node) which(child2 == 
                                                                                  node), integer(1))
    matching <- Func(tree1, tree2, reportMatching = TRUE)
    pairings <- attr(matching, "matching")
    scores <- attr(matching, "pairScores")
    pairScores <- signif(mapply(function(i, j) scores[i, j], 
                                seq_along(pairings), pairings), precision)
    adjNo <- c(0.5, -0.2)
    adjVal <- c(0.5, 1.1)
    faint <- "#aaaaaa"
    if (setPar) {
        origPar <- par(mfrow = c(1, 2), mar = rep(0.5, 4))
        on.exit(par(origPar))
    }
    LabelUnpaired <- function(splitEdges, unpaired) {
        if (any(unpaired)) {
            edgelabels(text = expression("-"), edge = splitEdges[unpaired], 
                       frame = "n", col = faint, adj = adjNo)
            edgelabels(text = "0", edge = splitEdges[unpaired], 
                       frame = "n", col = faint, cex = cex, adj = adjVal)
        }
    }
    if (plainEdges) {
        Plot(tree1, edge.width = edge.width, edge.color = edge.color, 
             ...)
    }
    else {
        Normalize <- function(scores, na.rm = FALSE) {
            if (length(scores) == 0) 
                return(scores)
            if (na.rm) {
                scores <- scores[!is.na(scores)]
            }
            else {
                scores[is.na(scores)] <- 0
            }
            if (any(scores < 0)) {
                stop("Negative scores not supported")
            }
            if (max(scores) == 0) 
                return(scores)
            if (min(scores) == max(scores)) 
                return(rep(1L, length(scores)))
            scores/max(scores)
        }
        OtherRootEdge <- function(splitNodes, edge) {
            parent <- edge[, 1]
            child <- edge[, 2]
            rootEdges <- which(parent == min(parent))
            rootChildren <- child[rootEdges]
            treeIsRooted <- length(rootChildren) < 3
            if (treeIsRooted) {
                splitEdges <- vapply(splitNodes, match, table = child, 
                                     0)
                got <- rootChildren %in% splitNodes
                if (any(got)) {
                    if (sum(got) != 1) {
                        warning("Unexpected polytomy")
                    }
                    c(score = as.integer(which(splitNodes %in% 
                                                   rootChildren[got])), edge = rootEdges[!got])
                }
                else {
                    c(score = NA_integer_, edge = NA_integer_)
                }
            }
            else {
                c(score = NA_integer_, edge = NA_integer_)
            }
        }
        edgeColPalette <- sequential_hcl(n = 256L, palette = "Viridis")
        EdgyPlot <- function(tree, splits, edge, splitEdges, 
                             normalizedScores, ...) {
            splitNodes <- as.integer(names(splits))
            ore <- OtherRootEdge(splitNodes, edge)
            if (length(normalizedScores) && !is.na(ore[1])) {
                ns <- c(normalizedScores, normalizedScores[ore["score"]])
                se <- c(splitEdges, ore[2])
            }
            else {
                ns <- normalizedScores
                se <- splitEdges
            }
            edge.width <- rep(1, nrow(edge))
            edge.width[se] <- 1 + (10 * ns)
            edge.color <- rep("black", nrow(edge))
            edge.color[se] <- edgeColPalette[1 + ceiling(255 * 
                                                             ns)]
            Plot(tree, edge.width = edge.width, edge.color = edge.color, 
                 ...)
        }
        EdgyPlot(tree1, splits1, edge1, splitEdges1, Normalize(pairScores), 
                 ...)
    }
    paired1 <- if (matchZeros) {
        !is.na(pairings)
    }
    else {
        !is.na(pairings) & pairScores > 0
    }
    palette <- qualitative_hcl(sum(paired1), c = 42, l = 88)
    pairedPairScores <- pairScores[paired1]
    pairLabels <- seq_len(sum(paired1))

    if (any(pairLabels)) {
        if (show.edgelabels){
            edgelabels(text = pairLabels, edge = splitEdges1[paired1], 
                       frame = "n", bg = palette, adj = adjNo, cex = cex)
            edgelabels(text = pairedPairScores, edge = splitEdges1[paired1], 
                       frame = "n", adj = adjVal, cex = cex, col = ifelse(pairedPairScores, 
                                                                          "black", faint))
        }
    }

    LabelUnpaired(splitEdges1, !paired1)
    paired2 <- seq_along(splitEdges2) %in% pairings[paired1]
    pairNames2 <- pairings[paired1]
    if (plainEdges) {
        Plot(tree2, edge.width = edge.width, edge.color = edge.color, cex=cex,
             ...)
    }
    else {
        EdgyPlot(tree2, splits2[[pairNames2]], edge2, splitEdges2[pairNames2], 
                 Normalize(pairedPairScores, na.rm = TRUE), ...)
    }
    if (any(pairLabels)) {
        if (show.edgelabels){
            edgelabels(text = pairLabels, edge = splitEdges2[pairNames2], 
                       frame = 'n', bg = palette, adj = adjNo, cex = cex)
            edgelabels(text = pairedPairScores, edge = splitEdges2[pairNames2], 
                       frame = "n", adj = adjVal, cex = cex, col = ifelse(pairedPairScores, 
                                                                          "black", faint))
        }
    }
    LabelUnpaired(splitEdges2, !paired2)
    invisible()
}

I can submit a pull request with the above changes (with some modification to be more customisable) if you think it's acceptable.

ammaraziz commented 2 months ago

Alternatively, is there a way to find which internal branches differ (have low score) between two trees and which taxa are affected?

ms609 commented 2 months ago

125 follows up on your suggestions by:

Thus you can review the contributions of individual matched splits with

matching <- VisualizeMatching(SharedPhylogeneticInfo, tree1, tree2)
attr(matching, "matchedScores")

I hope this helps you to make the best use of the function! Thanks for your suggestions.

ammaraziz commented 2 months ago

Amazing thank you. Once it's merged I'll give it a go.

Feel free to close this issue.