Closed MaxKerney closed 5 years ago
Hi, thanks for using sensiPhy and reporting your issue.
Indeed, the error is related to the way phylo.d function calls the argument binvar. because miss.phylo.d calls phylo.d directly from the package environment (caper::phylo.d) there is no way to solve this without changing the function itself. I will definitely consider adding the functionality you have suggested in the next version of sensiPhy. Thanks.
One temporary way to run miss.phylo.d
inside a loop is to change both phylo.d
and miss.phylo.d
functions to call the binvar
argument as a character. You can find the altered functions here: phylo.d2 and miss.phylo.d2.
If you load both functions on the environment you should be able to run your example, as below:
library(sensiPhy)
library(caper)
# Load data and functions: phylo.d2 & miss.phylo.d2
data(alien)
data <- alien.data
phy = alien.phy[[1]]
# Try with a loop
vars <- names(data[, 2:4])
lst <- list()
for (var in vars) {
lst[[var]] <- miss.phylo.d2(data, phy, binvar = var)
}
lst$adultMass
lst$gestaLen
lst$homeRange
Please let me know if this have solved your problem.
cheers
Thanks a lot for getting back to me and providing a fix so quickly. Unfortunately, my loop is still not working on my data set, but I think this is now due to some of the variables being categorical and/or some not having any missing values. I understand why variables without any missing values would cause the function to fail, but I would have thought categorical variables should work ok? For categorical variables I get the error: Error in caper::phylo.d(compdat, ...) : 'categorical_variable' is not a variable in data.
Thanks.
Hi @MK212, thanks for you feedback, we get back to you soon. Any ideas @caterinap ?
Hi,
thanks for reporting this! The problem is not linked to the fact that the variable is categorical but to this line of code in caper::phylo.d
:
binvar <- deparse(substitute(binvar))
My "quick and dirty" solution for the moment would be to create two new functions to run the loop:
phylo.d2<- function (data, phy, names.col, binvar, permut = 1000, rnd.bias = NULL)
{
if (!missing(data)) {
if (!inherits(data, "comparative.data")) {
if (missing(names.col))
stop("names column is missing")
names.col <- deparse(substitute(names.col))
data <- caicStyleArgs(data = data, phy = phy, names.col = names.col)
}
}
binvar <- binvar
bininds <- match(binvar, names(data$data))
if (is.na(bininds))
(stop("'", binvar, "' is not a variable in data."))
ds <- data$data[, bininds]
if (any(is.na(ds)))
stop("'", binvar, "' contains missing values.")
if (is.character(ds))
ds <- as.factor(ds)
if (length(unique(ds)) > 2)
stop("'", binvar, "' contains more than two states.")
if (length(unique(ds)) < 2)
stop("'", binvar, "' only contains a single state.")
propStates <- unclass(table(ds))
propState1 <- propStates[1]/sum(propStates)
names(dimnames(propStates)) <- binvar
if (is.factor(ds))
ds <- as.numeric(ds)
if (!is.numeric(permut))
(stop("'", permut, "' is not numeric."))
if (!is.null(rnd.bias)) {
rnd.bias <- deparse(substitute(rnd.bias))
rnd.ind <- match(rnd.bias, names(data$data))
if (is.na(rnd.ind))
(stop("'", rnd.bias, "' is not a variable in data."))
rnd.bias <- data$data[, rnd.bias]
}
el <- data$phy$edge.length
elTip <- data$phy$edge[, 2] <= length(data$phy$tip.label)
if (any(el[elTip] == 0))
stop("Phylogeny contains pairs of tips on zero branch lengths, cannot currently simulate")
if (any(el[!elTip] == 0))
stop("Phylogeny contains zero length internal branches. Use di2multi.")
ds.ran <- replicate(permut, sample(ds, prob = rnd.bias))
if (is.null(data$vcv)) {
vcv <- VCV.array(data$phy)
}
else {
vcv <- data$vcv
}
ds.phy <- rmvnorm(permut, sigma = unclass(vcv))
ds.phy <- as.data.frame(t(ds.phy))
ds.phy.thresh <- apply(ds.phy, 2, quantile, propState1)
ds.phy <- sweep(ds.phy, 2, ds.phy.thresh, "<")
ds.phy <- as.numeric(ds.phy)
dim(ds.phy) <- dim(ds.ran)
ds.ran <- cbind(Obs = ds, ds.ran)
ds.phy <- cbind(Obs = ds, ds.phy)
dimnames(ds.ran) <- dimnames(ds.phy) <- list(data$phy$tip.label,
c("Obs", paste("V", 1:permut, sep = "")))
phy <- reorder(data$phy, "pruningwise")
ds.ran.cc <- contrCalc(vals = ds.ran, phy = phy, ref.var = "V1",
picMethod = "phylo.d", crunch.brlen = 0)
ds.phy.cc <- contrCalc(vals = ds.phy, phy = phy, ref.var = "V1",
picMethod = "phylo.d", crunch.brlen = 0)
ransocc <- colSums(ds.ran.cc$contrMat)
physocc <- colSums(ds.phy.cc$contrMat)
if (round(ransocc[1], digits = 6) != round(physocc[1], digits = 6))
stop("Problem with character change calculation in phylo.d")
obssocc <- ransocc[1]
ransocc <- ransocc[-1]
physocc <- physocc[-1]
soccratio <- (obssocc - mean(physocc))/(mean(ransocc) -
mean(physocc))
soccpval1 <- sum(ransocc < obssocc)/permut
soccpval0 <- sum(physocc > obssocc)/permut
dvals <- list(DEstimate = soccratio, Pval1 = soccpval1,
Pval0 = soccpval0, Parameters = list(Observed = obssocc,
MeanRandom = mean(ransocc), MeanBrownian = mean(physocc)),
StatesTable = propStates, Permutations = list(random = ransocc,
brownian = physocc), NodalVals = list(observed = ds.ran.cc$nodVal[,
1, drop = FALSE], random = ds.ran.cc$nodVal[, -1,
drop = FALSE], brownian = ds.phy.cc$nodVal[, -1,
drop = FALSE]), binvar = binvar, data = data, nPermut = permut,
rnd.bias = rnd.bias)
class(dvals) <- "phylo.d"
return(dvals)
}
and
miss.phylo.d2<-function(data, phy,...){
sp.nam <- NULL
names.col <- NULL
#error check
if (class(data) != "data.frame") stop("data must be class 'data.frame'")
if (class(phy) != "phylo") stop("phy must be class 'phylo'")
#calculate % of NAs per trait
tot.sp <- nrow(data)
nNA <- colSums(is.na(data))
percNA <- round(nNA/tot.sp*100,digits=2)
print("Percentage of missing data in traits:")
print(percNA)
#remove factor columns (categorical traits)
factCols <- sapply(data,is.factor)
data <- data[,!factCols]
#recode traits with missing data into binary
data[!is.na(data)] <- 0
data[is.na(data)] <- 1
#match with phylogeny
if(is.null("names.col")){
compdat<-caper::comparative.data(phy,data,names.col=names.col)}
if(!is.null("names.col")){
data$sp.nam<-row.names(data)
compdat<-caper::comparative.data(phy,data,names.col=sp.nam)}
#calculate d statistic using caper::phylo.d
d.stat<-phylo.d2(compdat,...)
}
And then use miss.phylo.d2 in the loop:
vars <- names(data[, 2:4])
lst <- list()
for (var in vars) { lst[[var]] <- miss.phylo.d2(data, phy, binvar = var) }
Not very elegant, but it should work until we find something better. Hope it helps!
Hi,
Thanks for taking a look at this. I think this is the same as the fix that @paternogbc made though? The function still isn't working for me with categorical variables, but I can see more clearly from your script that this is because categorical variables seem to be dropped in the miss.phylo.d function. Is it just the case that checking for phylogenetic signal in missingness in categorical variables isn't possible? I'd have thought that categorical variables could be recoded into a binary "missing" / "not missing" format for the phylo.d function in the same way as continuous variables?
Hi,
the solution above should work with categorical variables too, e.g.:
data <- alien.data
phy = alien.phy[[1]]
data$catvar<-rep(c("a","b"),47)
data[is.na(data$adultMass),"catvar"]<-NA
vars <- names(data[, 2:8])
lst <- list()
for (var in vars) { lst[[var]] <- miss.phylo.d2(data, phy, binvar = var) }
lst$catvar
I might be wrong but it seems that in the error you reported above (Error in caper::phylo.d(compdat, ...) : 'categorical_variable' is not a variable in data.
) 'categorical_variable' is a name attributed to the columns with categorical variables. The problem was linked to "binvar" not recognizing the column names ("is not a variable in data"). phylo.d2 should now avoid this issue. Please let me know if the solution does not work with your data.
Hi,
Your and @paternogbc's modified functions fix the issue of "binvar" not recognising column names, but I think the issue now is being caused by the fact that categorical variables (by which I mean factor variables - sorry if I wasn't clear in my language!) are being dropped by miss.phylo.d, so that when the data reaches phylo.d the factor variables are no longer there, so phylo.d can't find that variable in the data and returns an error. Your example works ok because catvar
is a character variable, but when converted to a factor it's dropped by miss.phylo.d2 and the loop fails. To be clear, the loop with the modified functions works fine on my data set for numeric variables - so the problem is no longer with column names not being recognised - it just fails with the factor variables in my data set. I think it's these lines in miss.phylo.d that are the culprit:
#remove factor columns (categorical traits)
factCols <- sapply(data,is.factor)
data <- data[,!factCols]
Is there a reason why factor variables need to be dropped?
Hi, ok, sorry for not getting that before! I have to admit that I couldn't remember why we were removing the categorical traits. While we test and try to recall the reason, you can use this modified function (that converts the factors into character):
miss.phylo.d2<-function(data, phy,...){
sp.nam <- NULL
names.col <- NULL
#error check
if (class(data) != "data.frame") stop("data must be class 'data.frame'")
if (class(phy) != "phylo") stop("phy must be class 'phylo'")
#calculate % of NAs per trait
tot.sp <- nrow(data)
nNA <- colSums(is.na(data))
percNA <- round(nNA/tot.sp*100,digits=2)
print("Percentage of missing data in traits:")
print(percNA)
#convert factor columns into character
factCols <- sapply(data,is.factor)
data[,factCols] <- sapply(data[,factCols],as.character)
#recode traits with missing data into binary
data[!is.na(data)] <- 0
data[is.na(data)] <- 1
#match with phylogeny
if(is.null("names.col")){
compdat<-caper::comparative.data(phy,data,names.col=names.col)}
if(!is.null("names.col")){
data$sp.nam<-row.names(data)
compdat<-caper::comparative.data(phy,data,names.col=sp.nam)}
#calculate d statistic using caper::phylo.d
d.stat<-phylo.d2(compdat,...)
}
It should work fine with factors:
data <- alien.data
phy = alien.phy[[1]]
data$catvar<-rep(c("a","b"),47)
data$catvar<-as.factor(data$catvar)
data[is.na(data$adultMass),"catvar"]<-NA
summary(data)
vars <- names(data[, 2:8])
lst <- list()
for (var in vars) { lst[[var]] <- miss.phylo.d2(data, phy, binvar = var) }
lst$catvar
Sorry for this and thanks for your patience!
No problem, thanks for taking the time to look into this! It would be good to find out just in case there is a good reason why I shouldn't be trying to do this for categorical variables ;)
Hi, after spending some more time on it, I really don't see any reason why it shouldn't be used for categorical variables. In any case all variables are transformed into 0/1 corresponding to nonNA/NA information in the variable (of any type). I think the removal of factors is simply a mistake, that we will now fix. In the meantime miss.phylo.d2 above should be able to do the job. Sorry and thanks again very much for bringing this out!
Great, thank you!
Hi. I'm having the same problem with phylo.d not being possible to loop over. I was wondering if anyone knows the best way to get a hold of the package maintainers of caper?
Huh. Sorry about this! Because it is expecting an object name in binvar - and that is extracted to a string in the deparse line - there isn't an easy way to pass a column name in as a string. The function should handle this better.
In the short term, there is a fix but it is a little ugly. You basically have to construct a call within the loop that has the right variable name as an object name and not as a string. Having to program on the language is usually a sign that the developer has done something ugly - I can't disagree.
fun <- function(df, var){
# Simple function showing the same behaviour
vname <- deparse(substitute(var))
vmean <- mean(df[,vname])
print(sprintf('%s: %0.5f', vname, vmean))
}
mydf <- data.frame(a=rnorm(10), b=rnorm(10), c=rnorm(10))
fun(mydf, b)
for (v in names(mydf)){
# Within the loop, turn the string into an object name, substitute that into the call and then evaluate it.
eval(substitute(fun(mydf, v_to_insert), list(v_to_insert=as.name(v))))
}
Thank you very much @davidorme! Much appreciated!
Just to make it supert clear, if I had something like this:
library(ape)
library(caper)
library(tidyverse)
tree <- rtree(10)
df <- tree$tip.label %>%
as.data.frame() %>%
rename(tip.label = ".") %>%
mutate(var1 = c("a", "b", "b", "b", "b", "b", "b", "b", "b", "b"),
var2= c(c("a", "b", "b", "b", "b", "b", "b", "b", "b", "a")))
vars <- colnames(df[,2:3])
result_df <- tibble(Feature = NULL,
Destimate = NULL,
Pval1 = NULL,
Pval0 = NULL)
for(var in vars){
output <- phylo.d(data = df, phy = tree, names.col = tip.label, binvar = var)
df <- tibble(Feature = {{feature}},
Destimate = output$DEstimate,
Pval1 = output$Pval1,
Pval0 = output$Pval0)
result_df <- rbind(result_df , df)
}
How should I implement this fix? I've tried a couple of approaches so far, and I haven't figured it out yet. I'll keep trying but if anyone has some advice I'd appreciate it :)
Thanks for the reprex, @HedvigS. I'd do this - I've moved the comparative dataset creation out of the loop rather than having phylo.d
create it each time and I've defined the results table size up front rather than using rbind
. That can be a huge performance hit (unlikely to be here, but worth noting).
library(ape)
library(caper)
library(tidyverse)
tree <- rtree(10)
df <- tree$tip.label %>%
as.data.frame() %>%
rename(tip.label = ".") %>%
mutate(var1 = c("a", "b", "b", "b", "b", "b", "b", "b", "b", "b"),
var2= c(c("a", "b", "b", "b", "b", "b", "b", "b", "b", "a")))
# Build the comparative dataset once
ds <- comparative.data(tree, df, names.col=tip.label)
vars <- colnames(ds$data)
# Create the rows all at once, to avoid rbind (real performance hit in
# larger examples) although it does mean having to loop over indices
# not names below.
nvar <- length(vars)
result_df <- tibble(Feature = character(nvar),
Destimate = numeric(nvar),
Pval1 = numeric(nvar),
Pval0 = numeric(nvar))
for (idx in seq_along(vars)) {
var <- vars[idx]
output <- eval(substitute(phylo.d(data = ds, binvar = this_var),
list(this_var=as.name(var))))
result_df[idx, 1] <- var
result_df[idx, 2] <- output$DEstimate
result_df[idx, 3] <- output$Pval1
result_df[idx, 3] <- output$Pval0
}
@davidorme Aha! Thank you very much!!
(P.S.
result_df[idx, 3] <- output$Pval0
should be
result_df[idx, 4] <- output$Pval0
)
function
miss.phylo.d
Problem
Hi,
I've been trying to use miss.phylo.d in a loop to apply it to every variable in a dataset and put the results for each variable in a list, but the "binvar" argument doesn't seem to accept a loop element as a variable name; it produces the error:
Error in caper::phylo.d(compdat, ...) : 'var' is not a variable in data.
.I've tried various ways of unquoting 'var', such as using
as.name(var)
, or!! var
, but with no luck.Looking at the code for caper::phylo.d() I'm thinking that the issue might be being caused by line 11:
binvar <- deparse(substitute(binvar))
?Maybe a future feature for miss.phylo.d would be to have it be applied to all variables in a dataset by default, and only a specific one if "binvar" is specified?
Reproducible example