modelop / hadrian

Implementations of the Portable Format for Analytics (PFA)
Apache License 2.0
130 stars 49 forks source link

randomForest with all numerical values fails because of buildOneTree's implementation #21

Open bchazalet opened 7 years ago

bchazalet commented 7 years ago

I had been trying, without success, to convert a randomForest model from R to PFA, until I found that buildOneTree seems to be hardcoded to emit string-typed leaf values. The leaves of my trees are all numerical.

If I duplicate the function and make the copy return list(double = node[[6]]) for leaf nodes instead, it all seems to work.

My final R code looks like this:

## Recursively convert one node of a tree table (and everything below it) into a
## nested list: branches become list(TreeNode = ...), leaves become list(double = ...).
##
## Arguments:
##   tree           table with one row per node. Columns are read by position:
##                  1 = row of the "pass" child (a value <= 0 marks a leaf),
##                  2 = row of the "fail" child, 3 = split field name,
##                  4 = split value, 6 = leaf score.
##   whichNode      row of `tree` to convert.
##   valueNeedsTag  if TRUE, wrap each branch value in a one-element list keyed by
##                  its type name (or by "array" for level subsets).
##   dataLevels     list keyed by field name (dots replaced by underscores) giving
##                  the levels of categorical fields; other fields are simply absent.
##   fieldTypes     optional list keyed by field name giving the type of each field;
##                  an entry may be a list with a "name" element, in which case that
##                  name is used. Fields without an entry fall back to avro.type().
##
## Returns:
##   For a branch: list(TreeNode = list(field, operator, value, pass, fail)), where
##   pass/fail are the converted children. For a leaf: list(double = <column 6>).
buildMyTree <- function(tree, whichNode, valueNeedsTag, dataLevels, fieldTypes = NULL) {
    node <- tree[whichNode,]

    if (node[[1]] > 0) {  # branch node
        f <- gsub("\\.", "_", node[[3]])
        dl <- dataLevels[[f]]
        splitValue <- node[[4]]

        ## type name used to tag the value when valueNeedsTag is TRUE
        if (is.null(fieldTypes)  ||  is.null(fieldTypes[[f]]))
            typeName <- avro.type(splitValue)
        else {
            typeName <- fieldTypes[[f]]
            if (is.list(typeName)  &&  "name" %in% names(typeName))
                typeName <- typeName$name
        }

        if (!is.null(dl)  &&  length(dl) == 2  &&  (splitValue == 1  ||  splitValue == 2  ||  splitValue == 0.5)) {
            ## Two-level field: equality test against a single value.
            ## A split at 0.5 must be recognised *before* indexing into dl:
            ## dl[[0.5]] truncates the index to 0 and raises an error, so the
            ## FALSE case could never be reached through the level lookup alone.
            ## The level comparison is kept for splits 1 and 2 so that inputs
            ## that worked before still give the same result.
            if (splitValue == 0.5  ||  dl[[splitValue]] == 0.5)
                val <- FALSE
            else
                val <- dl[[splitValue]]
            if (valueNeedsTag) {
                out <- list()
                out[[typeName]] <- val
                val <- out
            }
            op <- "=="
        }
        else if (!is.null(dl)  &&  length(dl) > 0) {
            ## Multi-level field: the split value is read as a bit mask over the
            ## levels; level i is selected when bit (i - 1) of the mask is set.
            bit <- 2^(seq_len(length(dl)) - 1)
            val <- list()
            for (i in seq_along(bit)) {
                if ((splitValue %% (2 * bit[i])) >= bit[i])
                    val[length(val) + 1] <- dl[[i]]
            }
            if (valueNeedsTag) {
                out <- list(array = val)
                val <- out
            }
            op <- "in"
        }
        else {
            ## No known levels: plain numeric threshold.
            val <- splitValue
            if (valueNeedsTag) {
                out <- list()
                out[[typeName]] <- val
                val <- out
            }
            op <- "<="
        }

        list(TreeNode =
             list(field = f,
                  operator = op,
                  value = val,
                  pass = buildMyTree(tree, node[[1]], valueNeedsTag, dataLevels, fieldTypes),
                  fail = buildMyTree(tree, node[[2]], valueNeedsTag, dataLevels, fieldTypes)))
    }
    else
        ## leaf node: tagged as "double" -- per the author's note, this is the
        ## only line changed relative to buildOneTree
        list(double = node[[6]])
}

# Convert every tree of the fitted forest into a TreeNode structure and collect
# the results in aurelius_forest, in tree order.
aurelius_forest <- list()
# seq_len() rather than 1:ntree, so that an ntree of 0 runs the loop zero times
# instead of iterating over c(1, 0).
for (i in seq_len(forestObject$ntree)) {
  treeTable <- pfa.randomForest.extractTree(forestObject, i, labelVar = TRUE)
  # Row 1 is taken as the root. Values are left untagged and no categorical
  # levels are supplied, so every split is emitted as a numeric "<=" test.
  # NOTE(review): if a tree's root is itself a leaf, buildMyTree returns
  # list(double = ...), $TreeNode is NULL, and assigning NULL appends nothing --
  # that tree is silently left out. Confirm this is acceptable.
  aurelius_forest[[length(aurelius_forest) + 1]] <-
    buildMyTree(treeTable, 1, valueNeedsTag = FALSE, dataLevels = list(), fieldTypes = NULL)$TreeNode
}
bchazalet commented 7 years ago

This is what I get if I run it with buildOneTree untouched:

com.opendatagroup.hadrian.errors.PFAInitializationException: JSON datum {"string":1} does not match any type in union [{"type":"record","fields":[{"name":"field","type":{"type":"enum","symbols":["field1", "field2"],"name":"Enum_3"}},{"name":"operator","type":"string"},{"name":"value","type":"double"},{"name":"pass","type":["TreeNode","double"]},{"name":"fail","type":["TreeNode","double"]}],"name":"TreeNode"},"double"]