joshwlambert / DAISIEprep

Extracts phylogenetic island community data from phylogenetic trees
https://joshwlambert.github.io/DAISIEprep
GNU General Public License v3.0
6 stars 3 forks source link

`extract_island_species` also extracts a mainland species? #2

Closed TheoPannetier closed 2 years ago

TheoPannetier commented 2 years ago

Running code from the Tutorial vignette:

library(DAISIEprep)
library(ape)
library(phylobase)
#> 
#> Attaching package: 'phylobase'
#> The following object is masked from 'package:ape':
#> 
#>     edges
library(ggtree)
#> ggtree v3.5.1.900 For help: https://yulab-smu.top/treedata-book/
#> 
#> If you use the ggtree package suite in published research, please cite
#> the appropriate paper(s):
#> 
#> Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
#> ggtree: an R package for visualization and annotation of phylogenetic
#> trees with their covariates and other associated data. Methods in
#> Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
#> 
#> G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
#> Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
#> 
#> S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
#> Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
#> visualization of richly annotated phylogenetic data. Molecular Biology
#> and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
#> 
#> Attaching package: 'ggtree'
#> The following object is masked from 'package:phylobase':
#> 
#>     MRCA
#> The following object is masked from 'package:ape':
#> 
#>     rotate
library(ggimage)
#> Loading required package: ggplot2
library(castor)
#> Loading required package: Rcpp
set.seed(
  1,
  kind = "Mersenne-Twister",
  normal.kind = "Inversion",
  sample.kind = "Rejection"
)
phylo <- ape::rcoal(10)
phylo$tip.label <- c("bird_a", "bird_b", "bird_c", "bird_d", "bird_e", "bird_f",
                     "bird_g", "bird_h", "bird_i", "bird_j")
phylo <- phylobase::phylo4(phylo)
endemicity_status <- sample(
  x = c("not_present", "endemic", "nonendemic"),
  size = length(phylobase::tipLabels(phylo)),
  replace = TRUE,
  prob = c(0.6, 0.2, 0.2)
)
phylod <- phylobase::phylo4d(phylo, as.data.frame(endemicity_status))
island_tbl <- island_tbl()
tip_states <- c()
endemicity_status <- phylobase::tipData(phylod)$endemicity_status
for (i in seq_along(endemicity_status)) {
  if (grepl(pattern = "^not_present$", x = endemicity_status[i])) {
    tip_states[i] <- 1
  } else if (grepl(pattern = "^nonendemic$", x = endemicity_status[i])) {
    tip_states[i] <- 2
  } else if (grepl(pattern = "^endemic$", x = endemicity_status[i])) {
    tip_states[i] <- 3
  }
}
phylo <- as(phylo, "phylo")
asr <- castor::asr_max_parsimony(
  tree = phylo,
  tip_states = tip_states,
  transition_costs = "sequential"
)
if (ncol(asr$ancestral_likelihoods) == 2) {
  colnames(asr$ancestral_likelihoods) <- c("not_present", "nonendemic")
} else if (ncol(asr$ancestral_likelihoods) == 3) {
  colnames(asr$ancestral_likelihoods) <-
    c("not_present", "nonendemic", "endemic")
}
node_states <- max.col(asr$ancestral_likelihoods, ties.method = "last")
node_states <- gsub(
  pattern = "1", replacement = "not_present", x = node_states
)
node_states <- gsub(
  pattern = "2", replacement = "nonendemic", x = node_states
)
node_states <- gsub(
  pattern = "3", replacement = "endemic", x = node_states
)
node_data <- data.frame(
  island_status = node_states,
  endemic_prob = asr$ancestral_likelihoods[, "endemic"],
  nonendemic_prob = asr$ancestral_likelihoods[, "nonendemic"],
  not_present_prob = asr$ancestral_likelihoods[, "not_present"],
  row.names = phylobase::nodeId(phylod, "internal")
)
phylod <- phylobase::phylo4d(
  phylo,
  tip.data = as.data.frame(endemicity_status),
  node.data = node_data
)
phylod <- add_asr_node_states(phylod = phylod, asr_method = "parsimony")
island_tbl <- extract_island_species(
  phylod = phylod,
  extraction_method = "asr",
  include_not_present = FALSE
)
island_tbl
#> Class:  Island_tbl 
#>   clade_name  status missing_species branching_times min_age      species
#> 1     bird_g endemic               0    0.764855....      NA bird_g, ....
island_tbl@island_tbl$species
#> [[1]]
#> [1] "bird_g" "bird_h" "bird_i"
island_tbl@island_tbl$branching_times
#> [[1]]
#> [1] 0.7648553 0.3800300
plot_phylod(phylod = phylod)

Shouldn't bird_h be left out by extract_island_species() when include_not_present = FALSE ?

Created on 2022-06-29 by the reprex package (v2.0.1)

joshwlambert commented 2 years ago

@TheoPannetier thanks for spotting this I will take a look at the weekend and get back to you.

joshwlambert commented 2 years ago

@TheoPannetier this should now be fixed, it was indeed a bug and had even slipped through the tests I'd written. Good spot 😄

TheoPannetier commented 2 years ago

This is what I get after the fix – problem fixed :)

library(DAISIEprep)
library(ape)
library(phylobase)
#> 
#> Attaching package: 'phylobase'
#> The following object is masked from 'package:ape':
#> 
#>     edges
library(ggtree)
#> ggtree v3.5.1.900 For help: https://yulab-smu.top/treedata-book/
#> 
#> If you use the ggtree package suite in published research, please cite
#> the appropriate paper(s):
#> 
#> Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
#> ggtree: an R package for visualization and annotation of phylogenetic
#> trees with their covariates and other associated data. Methods in
#> Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
#> 
#> Guangchuang Yu, Tommy Tsan-Yuk Lam, Huachen Zhu, Yi Guan. Two methods
#> for mapping and visualizing associated data on phylogeny using ggtree.
#> Molecular Biology and Evolution. 2018, 35(12):3041-3043.
#> doi:10.1093/molbev/msy194
#> 
#> LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
#> Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
#> for phylogenetic tree input and output with richly annotated and
#> associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
#> doi: 10.1093/molbev/msz240
#> 
#> Attaching package: 'ggtree'
#> The following object is masked from 'package:phylobase':
#> 
#>     MRCA
#> The following object is masked from 'package:ape':
#> 
#>     rotate
library(ggimage)
#> Loading required package: ggplot2
library(castor)
#> Loading required package: Rcpp
set.seed(
  1,
  kind = "Mersenne-Twister",
  normal.kind = "Inversion",
  sample.kind = "Rejection"
)
phylo <- ape::rcoal(10)
phylo$tip.label <- c("bird_a", "bird_b", "bird_c", "bird_d", "bird_e", "bird_f",
                     "bird_g", "bird_h", "bird_i", "bird_j")
phylo <- phylobase::phylo4(phylo)
endemicity_status <- sample(
  x = c("not_present", "endemic", "nonendemic"),
  size = length(phylobase::tipLabels(phylo)),
  replace = TRUE,
  prob = c(0.6, 0.2, 0.2)
)
phylod <- phylobase::phylo4d(phylo, as.data.frame(endemicity_status))
island_tbl <- island_tbl()
tip_states <- c()
endemicity_status <- phylobase::tipData(phylod)$endemicity_status
for (i in seq_along(endemicity_status)) {
  if (grepl(pattern = "^not_present$", x = endemicity_status[i])) {
    tip_states[i] <- 1
  } else if (grepl(pattern = "^nonendemic$", x = endemicity_status[i])) {
    tip_states[i] <- 2
  } else if (grepl(pattern = "^endemic$", x = endemicity_status[i])) {
    tip_states[i] <- 3
  }
}
phylo <- as(phylo, "phylo")
asr <- castor::asr_max_parsimony(
  tree = phylo,
  tip_states = tip_states,
  transition_costs = "sequential"
)
if (ncol(asr$ancestral_likelihoods) == 2) {
  colnames(asr$ancestral_likelihoods) <- c("not_present", "nonendemic")
} else if (ncol(asr$ancestral_likelihoods) == 3) {
  colnames(asr$ancestral_likelihoods) <-
    c("not_present", "nonendemic", "endemic")
}
node_states <- max.col(asr$ancestral_likelihoods, ties.method = "last")
node_states <- gsub(
  pattern = "1", replacement = "not_present", x = node_states
)
node_states <- gsub(
  pattern = "2", replacement = "nonendemic", x = node_states
)
node_states <- gsub(
  pattern = "3", replacement = "endemic", x = node_states
)
node_data <- data.frame(
  island_status = node_states,
  endemic_prob = asr$ancestral_likelihoods[, "endemic"],
  nonendemic_prob = asr$ancestral_likelihoods[, "nonendemic"],
  not_present_prob = asr$ancestral_likelihoods[, "not_present"],
  row.names = phylobase::nodeId(phylod, "internal")
)
phylod <- phylobase::phylo4d(
  phylo,
  tip.data = as.data.frame(endemicity_status),
  node.data = node_data
)
phylod <- add_asr_node_states(phylod = phylod, asr_method = "parsimony")
island_tbl <- extract_island_species(
  phylod = phylod,
  extraction_method = "asr",
  include_not_present = FALSE
)
island_tbl
#> Class:  Island_tbl 
#>   clade_name  status missing_species  col_time col_max_age branching_times
#> 1     bird_g endemic               0 0.7648553       FALSE         0.38003
#>   min_age      species clade_type
#> 1      NA bird_g, ....          1
island_tbl@island_tbl$species
#> [[1]]
#> [1] "bird_g" "bird_i"
island_tbl@island_tbl$branching_times
#> [[1]]
#> [1] 0.38003
plot_phylod(phylod = phylod)

Created on 2022-07-05 by the reprex package (v2.0.1)