wzmli / phyloglmm

GNU General Public License v3.0
4 stars 2 forks source link

SE extremely high for emmeans estimated from phylo_glmmTMB #12

Open nicolaycunha opened 2 years ago

nicolaycunha commented 2 years ago

Hi there,

The reason for opening this issue is regarding the emmeans estimation from a phylo_glmmTMB model. My model is basically:

Y ~ X1 + X2 + X3 + X4 + (1|species) + (1|rand1) + (1|rand3:rand2)

The model points that Y differs between the different levels of X1 (P-value = 0.00047), even accounting for the confounding effect of the remaining covariables X2, X3 and X4. Nevertheless, when estimating the emmeans for such a model, I get a really high SEs, which is quite unexpected given the very low P-value. If I do the same model but without adjusting for the phylogeny ( a regular glmmTMB), I get more reasonable SE estimates - about 4x lower. My impression is that it is related to the "phylogenetic" component of the phylo_glmmTMB that emmeans might get lost.

I am attaching the data, the used tree and a MRE as a zip file. I am also pasting the MRE below.

Any clarification on this issue is much appreciated.

Many thanks! Nicolay

#########################################################################

Reading and preparing the data

{
  data <- read.table("MRE/data2.txt")
  data$species_PM2 <- factor(data$species_PM2)
  data$X1 <- factor(data$X1, levels = c("1-10", "11-100", "101-1000", "> 1000"))

  tree_MRE <- ape::read.tree("MRE/tree_MRE.txt")
  library(phyloglmm)
  tol <- sqrt(.Machine$double.eps)
  if (any(tree_MRE$edge.length<(-1*tol))) stop("negative edge lengths")
  tree_MRE$edge.length <- pmax(0, tree_MRE$edge.length)

  ## enforced match between
  system.time(phyloZ <- phylo.to.Z(tree_MRE))
  phyloZ <- phyloZ[levels(factor(data$species_PM2)), ]
}

Model with phylogeny

MRE_m1_PHY <- phylo_glmmTMB(Y ~
                              X1 +
                              X2 +
                              X3 +
                              X4 +
                              (1|species_PM2) + (1|rand1) + (1|rand3:rand2),
                            , data = data
                            , phyloZ = phyloZ
                            , phylonm = "species_PM2"
                            , REML = FALSE
                            , family = "gaussian"
)

Model without phylogeny

library(glmmTMB)
MRE_m1_NoPHY <- glmmTMB(Y ~
                          X1 +
                          X2 +
                          X3 +
                          X4 +
                          (1|species_PM2) + (1|rand1) + (1|rand3:rand2),
                        , data = data
                        # , phyloZ = phyloZ
                        # , phylonm = "species_PM2"
                        , REML = FALSE
                        , family = "gaussian"
)

Emmeans

{
  MRE_m1_NoPHY_emmeans <- emmeans::lsmeans(MRE_m1_NoPHY, pairwise ~ X1 + X2 + X3 + X4)
  MRE_m1_PHY_emmeans <- emmeans::lsmeans(MRE_m1_PHY , pairwise ~ X1 + X2 + X3 + X4)

}
cbind(as.data.frame(MRE_m1_NoPHY_emmeans$lsmeans)[, 5:6], as.data.frame(MRE_m1_PHY_emmeans$lsmeans)[, 5:6])

Anova Type-II

car::Anova(MRE_m1_PHY)
car::Anova(MRE_m1_NoPHY)

MRE.zip

nicolaycunha commented 1 year ago

Hi again,

I am still struggling with the least square means and back transformation of estimates derived from phylo_glmmTMB. I have tried different datasets and models, but in general, even having a highly significant effect, when estimating the SE from such models, I found overlapping between levels of the such significant factor. I am not sure if there is something different to be done for a phylogenetic model when back transforming and calculating lsmeans and SE. Thus, I am not sure if it is a problem with the phylogenetic mixed model or lsmeans transformation. Please let me know where to seek for some help.

I am pasting below a MRE with the dataset I am currently working on. Notice that it can be readily tested by copying and paste into a R environment.

Please notice the overlapping of the SE in the generated plot. I have also found the estimates generated really low compared to the proportion of each category in the dataset.

Any help is much appreciated. Nicolay

library(ape)
library(phyloglmm)  

# Importing the data
data_MRE <- read.table("https://raw.githubusercontent.com/nicolaycunha/MRE/main/data_MRE2.txt", header = TRUE)
tree_MRE <- read.tree("https://raw.githubusercontent.com/nicolaycunha/MRE/main/tree_MRE2.txt")

head(data_MRE)

# Estimating the Phylo_Z matrix
{
  tol <- sqrt(.Machine$double.eps)
  if (any(tree_MRE$edge.length<(-1*tol))) stop("negative edge lengths")
  tree_MRE$edge.length <- pmax(0, tree_MRE$edge.length)

  system.time(phyloZ_MRE <- phylo.to.Z(tree_MRE))

  rr1 <- rownames(phyloZ_MRE)
  rr2 <- data_MRE$species
  identical(rr1, rr2)

}

# converting to factor
data_MRE$species <- factor(data_MRE$species)
data_MRE$LF <- factor(data_MRE$LF)
data_MRE$X <- factor(data_MRE$X)

# Ordering phyto Z
phyloZ_MRE = phyloZ_MRE[levels(data_MRE$species), ]

# setting numeric
data_MRE$y <- ifelse(data_MRE$y == "yes", 1, 0)

model_MRE <- phylo_glmmTMB(y ~ X + LF +
                              (1|species)
                            , data = data_MRE
                            , phyloZ = phyloZ_MRE
                            , phylonm = "species"
                            , REML = FALSE
                            , family = "binomial")

library(car)
Anova(model_MRE)

library(emmeans)
library(ggeffects)

# calculating the lsmeans and asymmetrical SE
{
  df <- as.data.frame(emmeans::lsmeans(model_MRE, pairwise ~ X))[1:2, ]
  summary((lsmeans(model_MRE, pairwise ~ X)), type = "response")

  df$inv_logit_mean <- arm:::invlogit(as.data.frame(emmeans::lsmeans(model_MRE, pairwise ~ X))[1:2, 3])
  df$inv_logit_SE_POSITIVE <- arm:::invlogit(as.data.frame(emmeans::lsmeans(model_MRE, pairwise ~ X))[1:2, 3] + as.data.frame(emmeans::lsmeans(model_MRE, pairwise ~ X))[1:2, 4])
  df$inv_logit_SE_NEGATIVE <- arm:::invlogit(as.data.frame(emmeans::lsmeans(model_MRE, pairwise ~ X))[1:2, 3] - as.data.frame(emmeans::lsmeans(model_MRE, pairwise ~ X))[1:2, 4])
}

# plotting the results
library(ggplot2)
ggplot(df, aes(x = X, y = inv_logit_mean))+
  geom_errorbar(aes(ymin =  inv_logit_SE_NEGATIVE, 
                    ymax =  inv_logit_SE_POSITIVE),
                width = 0.10, size = 0.6,  colour = "black", alpha = 0.9, position = position_dodge(width = 0.5)) +
  geom_point(aes(), size = 3, shape = 21, fill = "black", position = position_dodge(width = 0.5)) +
  theme_classic() + 
  ylab("Y") + xlab("X")