kogalur / randomForestSRC

DOCUMENTATION:
https://www.randomforestsrc.org/
GNU General Public License v3.0
115 stars 18 forks source link

Problem parsing formulas #306

Closed bbbruce closed 2 years ago

bbbruce commented 2 years ago

From help of package:

> library(randomForestSRC)
> data(pbc, package = "randomForestSRC")
> pbc.obj <- rfsrc(Surv(days, status) ~ ., pbc)
Error in parseFormula(formula, data, ytry) : 
  Survival formula incorrectly specified.

I think problem arises in parseFormula line 14 as.character(f)[2] should be as.character(f[2])

ishwaran commented 2 years ago

We do not see any error with this example on our side. This is a pretty basic example that we test all the time.

First, can you confirm which version of randomForestSRC you are using?

bbbruce commented 2 years ago

3.1.1 on Windows 10 compiled from source (although this is an R file so doubt that should be related to issue)

> library(randomForestSRC)

 randomForestSRC 3.1.1 

 Type rfsrc.news() to see new features, changes, and bug fixes. 
> data(pbc, package = "randomForestSRC")
> pbc.obj <- rfsrc(Surv(days, status) ~ ., pbc)
Error in parseFormula(formula, data, ytry) : 
  Survival formula incorrectly specified.
> randomForestSRC:::parseFormula
function (f, data, ytry = NULL, coerce.factor = NULL) 
{
    if (!inherits(f, "formula")) {
        stop("'formula' is not a formula object.")
    }
    if (is.null(data)) {
        stop("'data' is missing.")
    }
    if (!is.data.frame(data)) {
        stop("'data' must be a data frame.")
    }
    fmly <- all.names(f, max.names = 1e+07)[2]
    all.names <- all.vars(f, max.names = 1e+07)
    yvar.names <- all.vars(formula(paste(as.character(f)[2], 
        "~ .")), max.names = 1e+07)
    yvar.names <- yvar.names[-length(yvar.names)]
    subj.names <- NULL
    coerce.factor.org <- coerce.factor
    coerce.factor <- vector("list", 2)
    names(coerce.factor) <- c("xvar.names", "yvar.names")
    if (!is.null(coerce.factor.org)) {
        coerce.factor$yvar.names <- intersect(yvar.names, coerce.factor.org)
        if (length(coerce.factor$yvar.names) == 0) {
            coerce.factor$yvar.names <- NULL
        }
        coerce.factor$xvar.names <- intersect(setdiff(colnames(data), 
            yvar.names), coerce.factor.org)
    }
    if (fmly == "Surv") {
        if ((sum(is.element(yvar.names, names(data))) != 2) && 
            (sum(is.element(yvar.names, names(data))) != 4)) {
            stop("Survival formula incorrectly specified.")
        }
        else {
            if (sum(is.element(yvar.names, names(data))) == 4) {
                subj.names <- yvar.names[1]
                yvar.names <- yvar.names[-1]
            }
        }
        family <- "surv"
        ytry <- 0
    }
    else if ((fmly == "Multivar" || fmly == "cbind") && 
        length(yvar.names) > 1) {
        if (sum(is.element(yvar.names, names(data))) < length(yvar.names)) {
            stop("Multivariate formula incorrectly specified: y's listed in formula are not in data.")
        }
        Y <- data[, yvar.names, drop = FALSE]
        logical.names <- unlist(lapply(Y, is.logical))
        if (sum(logical.names) > 0) {
            Y[, logical.names] <- 1 * Y[, logical.names, drop = FALSE]
        }
        if ((sum(unlist(lapply(Y, is.factor))) + length(coerce.factor$yvar.names)) == 
            length(yvar.names)) {
            family <- "class+"
        }
        else if ((sum(unlist(lapply(Y, is.factor))) + length(coerce.factor$yvar.names)) == 
            0) {
            family <- "regr+"
        }
        else if (((sum(unlist(lapply(Y, is.factor))) + length(coerce.factor$yvar.names)) > 
            0) && ((sum(unlist(lapply(Y, is.factor))) + length(coerce.factor$yvar.names)) < 
            length(yvar.names))) {
            family <- "mix+"
        }
        else {
            stop("y-outcomes must be either real or factors in multivariate forests.")
        }
        if (!is.null(ytry)) {
            if ((ytry < 1) || (ytry > length(yvar.names))) {
                stop("invalid value for ytry:  ", ytry)
            }
        }
        else {
            ytry <- length(yvar.names)
        }
    }
    else if (fmly == "Unsupervised") {
        if (length(yvar.names) != 0) {
            stop("Unsupervised forests require no y-responses")
        }
        family <- "unsupv"
        yvar.names <- NULL
        temp <- gsub(fmly, "", as.character(f)[2])
        temp <- gsub("\\(|\\)", "", temp)
        ytry <- as.integer(temp)
        if (is.na(ytry)) {
            ytry <- 1
        }
        else {
            if (ytry <= 0) {
                stop("Unsupervised forests require positive ytry value")
            }
        }
    }
    else {
        if (sum(is.element(yvar.names, names(data))) != 1) {
            stop("formula is incorrectly specified.")
        }
        Y <- data[, yvar.names]
        if (is.logical(Y)) {
            Y <- as.numeric(Y)
        }
        if (!(is.factor(Y) | is.numeric(Y))) {
            stop("the y-outcome must be either real or a factor.")
        }
        if (is.factor(Y) || length(coerce.factor$yvar.names) == 
            1) {
            family <- "class"
        }
        else {
            family <- "regr"
        }
        ytry <- 1
    }
    return(list(all.names = all.names, family = family, subj.names = subj.names, 
        yvar.names = yvar.names, ytry = ytry, coerce.factor = coerce.factor))
}
<bytecode: 0x000001cf15a0eee8>
<environment: namespace:randomForestSRC>
bbbruce commented 2 years ago

Problem solved... was simultaneously using a package that imports the formula.tools package which contains an as.character.formula that behaves differently than base R's as.character.default (as no as.character.formula exists in base R) and does not report masking another function when loaded because there was no such function before. Sorry about thinking the issue was on your side, but hopefully if others experience this behavior this will help them figure it out!

Explicitly calling base::as.character.default would be a simple change on your side that would prevent another package from interfering in this way if you were willing to make that change.