openml / openml-r

R package to interface with OpenML
http://openml.github.io/openml-r/
Other
95 stars 37 forks source link

Error: cannot handle categorical predictors #445

Closed chim3y closed 5 years ago

chim3y commented 5 years ago

Hello Sir/Madam, For the past 2 days, I have running following program to compute training time for 3 learners: random forest, logistic regression and gradient boost. However, for data.id=4135, it generates following error and goes into infinite loop . Please, can you point out why I'm getting the error? and how I can solve it? Thank you in advance for your time.

Error: Warning in train(learner, task, subset = train.i, weights = weights[train.i]) : Could not train learner classif.randomForest: Error in randomForest.default(m, y, ...) : Can not handle categorical predictors with more than 53 categories.

Program:

Task used

tasks = listOMLTasks(limit = NULL)

classifTasks.infos = subset(tasks, task.type == "Supervised Classification" & # classification

number.of.classes == 2 & # binary classification

number.of.instances.with.missing.values == 0) # no missing values

save(classifTasks.infos, file = "Data/OpenML/classifTasks.infos.RData" )

for(index in c(1:256)){
  # Index
  j = index.not.done[index]

  # begin
  df.infos$began[j] = TRUE

  # Try
  tryCatch({

    # Loading the dataset
    omldataset = getOMLDataSet(data.id = clas$data.id[j], verbosity = 0)
    if (identical(omldataset$target.features, character(0))) {
      omldataset$target.features="Class"
      omldataset$desc$default.target.attribute="Class"
    }
    df.infos$loaded[j] = "TRUE" 

    # check the target
    df.infos$target_type = class(omldataset$data[, omldataset$target.features])

    # Transform to mlr task
    configureMlr(on.learner.error = "warn", show.learner.output = TRUE, show.info = FALSE)
    mlrtask = convertOMLDataSetToMlr(omldataset, verbosity = 0)
    df.infos$converted[j] = TRUE

    # Get the dimension
    df.infos$dimension[j] = getTaskDimension(mlrtask)

    **if (computeTime) {

      # Compute the time for lr, rf, gb
      learners = list(makeLearner("classif.randomForest"),
                      makeLearner("classif.logreg"),
                      makeLearner("classif.bst"))
      rdesc = makeResampleDesc("Holdout", split = 0.8, stratify = TRUE)
      configureMlr(on.learner.error = "warn", show.learner.output = TRUE, show.info = TRUE)

      sink(df.infos.file)
      sink(df.infos.file, type = "message")
      print(paste("Iteration",j,"dataset",clas$data.id[j]), quote = FALSE)
      set.seed(seed)
      bmr = benchmark(learners, mlrtask, rdesc, list(acc,timetrain), 
                      keep.pred = TRUE, models = FALSE, show.info = FALSE)
      sink() 
      sink(type="message")

      perfs=NA
      perfs = getBMRPerformances(bmr, as.df = TRUE)
      time.train = sum(perfs$timetrain)

      df.infos$rf_time[j]=perfs$timetrain[which(perfs$learner.id=="classif.randomForest")]
      df.infos$lr_time[j]=perfs$timetrain[which(perfs$learner.id=="classif.logreg")]
      df.infos$gb_time[j]=perfs$timetrain[which(perfs$learner.id=="classif.bst")]

      df.infos$rf_NA[j] = is.na(perfs$acc[which(perfs$learner.id=="classif.randomForest")])
      df.infos$lr_NA[j] = is.na(perfs$acc[which(perfs$learner.id=="classif.logreg")])
      df.infos$gb_NA[j] = is.na(perfs$acc[which(perfs$learner.id=="classif.bst")])

      print(paste("compute Time", df.infos$data.id[j], " rf_time", df.infos$rf_time[j], " lr_time", df.infos$lr_time[j], " gb_time", df.infos$gb_time[j]))
    }

  }, error = function(e) return(paste0("The variable '", j, "'", 
                                       " caused the error: '", e, "'")))
  i=i+1;
  setTxtProgressBar(pb, index)
  df.infos$done[j] = TRUE
  save(df.infos, file = "Data/OpenML/df.infos.RData")
}

}**

giuseppec commented 5 years ago

This is a combinatorics "issue" with the randomForest package as it is highly inefficient to compute all possible split-combinations with categorical features with lots of levels. So, you should try either the ranger package (maybe this works as ranger is a bit faster) or you need to change the data, e.g., convert the categorical features to dummy features with mlrtask = createDummyFeatures(mlrtask).