ModelOriented / randomForestExplainer

A set of tools to understand what is happening inside a Random Forest
https://ModelOriented.github.io/randomForestExplainer/
230 stars 37 forks source link

Cannot extract important variables with `accuracy_decrease` #3

Closed gundalav closed 5 years ago

gundalav commented 6 years ago

I have the following importance_frame:

# Importance measures for 20 variables (8 columns), built column by column.
# `local()` keeps the helper names out of the calling environment so that only
# `importance_frame` is created.
importance_frame <- local({
  variable_names <- c(
    "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
    "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"
  )

  frame <- data.frame(
    # Explicit levels keep the factor codes at 1:20 regardless of locale.
    variable = factor(variable_names, levels = variable_names),
    mean_min_depth = c(
      1.9761861386314, 2.5220853029533, 2.15539883255869, 1.61935396654558,
      1.45123463631321, 1.53296953170083, 1.77518115811586, 1.52151167552988,
      1.89182019096144, 2.14429040818413, 1.26326405034901, 1.93502763567771,
      1.26898183744519, 2.02060547195198, 1.54217481302459, 1.67384650439192,
      1.5485857685783, 2.09727178410599, 2.75747046937195, 2.35864404092358
    ),
    times_a_root = c(
      23.4, 5.5, 13.3, 27.9, 39.3, 31.3, 29.7, 34.2, 24.2, 13,
      43, 22.7, 45.3, 16.8, 31.5, 30.1, 33.5, 19.3, 1.75, 14.6
    ),
    no_of_nodes = c(
      68.1, 32.6, 62.2, 103.2, 103.3, 104.7, 75.6, 105.7, 72.4, 64.6,
      118.4, 73.6, 116.6, 74.5, 104.6, 95.6, 103.2, 60.3, 8.875, 36.1
    ),
    no_of_trees = c(
      65.1, 32.3, 59.8, 96.1, 94.7, 99.9, 74.8, 100.6, 69.4, 62.8,
      111.2, 71.2, 108.3, 72.4, 98.8, 90, 97.6, 58.4, 8.875, 35.9
    ),
    p_value = c(
      0.669119230058558, 0.999999783867775, 0.824720803698331,
      0.10305110839386, 0.160596787513604, 0.141119826647113,
      0.52735342045046, 0.162403671879659, 0.713272963278132,
      0.817225145266696, 0.0104446472288876, 0.546649197487473,
      0.0330726857615005, 0.672936592800508, 0.0310135225001855,
      0.182169849737794, 0.274905137508873, 0.873388429679101,
      1, 0.999021554764331
    ),
    gini_decrease = c(
      0.233831386391386, 0.0886505361305361, 0.185330422910423,
      0.358267377067377, 0.401108053058053, 0.397634655344655,
      0.308835228105228, 0.389097318237318, 0.250707615717616,
      0.191033563103563, 0.476535763125763, 0.249038827838828,
      0.47133199023199, 0.243902473082473, 0.372547632367632,
      0.33646759018759, 0.382999447219447, 0.203790450660451,
      0.0253906843156843, 0.133164814074814
    ),
    accuracy_decrease = c(
      -0.00445119047619048, -0.00289380952380952, -0.00482809523809524,
      -0.00530904761904762, 0.0051652380952381, 0.00616785714285714,
      0.00289238095238095, -0.00079095238095238, -0.00239095238095238,
      -0.00648809523809524, 0.00383690476190476, -0.00413857142857143,
      0.00331214285714286, -0.00290619047619048, -0.00131714285714286,
      -0.0046781746031746, 0.00534214285714286, -0.00532571428571429,
      0, -0.000374047619047619
    )
  )

  # Re-attach the `na.action` attribute the object carries: indices 80 and
  # 180, named "80" and "180", with class "omit".
  attr(frame, "na.action") <- structure(
    c("80" = 80L, "180" = 180L),
    class = "omit"
  )

  frame
})

importance_frame
#>    variable mean_min_depth times_a_root no_of_nodes no_of_trees    p_value
#> 1         A       1.976186        23.40      68.100      65.100 0.66911923
#> 2         C       2.522085         5.50      32.600      32.300 0.99999978
#> 3         D       2.155399        13.30      62.200      59.800 0.82472080
#> 4         E       1.619354        27.90     103.200      96.100 0.10305111
#> 5         F       1.451235        39.30     103.300      94.700 0.16059679
#> 6         G       1.532970        31.30     104.700      99.900 0.14111983
#> 7         H       1.775181        29.70      75.600      74.800 0.52735342
#> 8         I       1.521512        34.20     105.700     100.600 0.16240367
#> 9         K       1.891820        24.20      72.400      69.400 0.71327296
#> 10        L       2.144290        13.00      64.600      62.800 0.81722515
#> 11        M       1.263264        43.00     118.400     111.200 0.01044465
#> 12        N       1.935028        22.70      73.600      71.200 0.54664920
#> 13        P       1.268982        45.30     116.600     108.300 0.03307269
#> 14        Q       2.020605        16.80      74.500      72.400 0.67293659
#> 15        R       1.542175        31.50     104.600      98.800 0.03101352
#> 16        S       1.673847        30.10      95.600      90.000 0.18216985
#> 17        T       1.548586        33.50     103.200      97.600 0.27490514
#> 18        V       2.097272        19.30      60.300      58.400 0.87338843
#> 19        W       2.757470         1.75       8.875       8.875 1.00000000
#> 20        Y       2.358644        14.60      36.100      35.900 0.99902155
#>    gini_decrease accuracy_decrease
#> 1     0.23383139     -0.0044511905
#> 2     0.08865054     -0.0028938095
#> 3     0.18533042     -0.0048280952
#> 4     0.35826738     -0.0053090476
#> 5     0.40110805      0.0051652381
#> 6     0.39763466      0.0061678571
#> 7     0.30883523      0.0028923810
#> 8     0.38909732     -0.0007909524
#> 9     0.25070762     -0.0023909524
#> 10    0.19103356     -0.0064880952
#> 11    0.47653576      0.0038369048
#> 12    0.24903883     -0.0041385714
#> 13    0.47133199      0.0033121429
#> 14    0.24390247     -0.0029061905
#> 15    0.37254763     -0.0013171429
#> 16    0.33646759     -0.0046781746
#> 17    0.38299945      0.0053421429
#> 18    0.20379045     -0.0053257143
#> 19    0.02539068      0.0000000000
#> 20    0.13316481     -0.0003740476

And I tried to get the important variables with the following code:

# Reproduction: request the top 10 variables of `importance_frame`, ranked on
# the measures named below.
library(randomForestExplainer)

x_measure <- "gini_decrease"
y_measure <- "accuracy_decrease"

# NOTE(review): `size_measure` is never assigned in this snippet; presumably it
# was set earlier in the reporter's session -- confirm which column it names.
important_variables(
  importance_frame,
  measures = c(x_measure, y_measure, size_measure),
  k = 10
)

The error I get is this:

Error in `[.data.frame`(rankings, , measures) : 
  undefined columns selected

How can I fix the issue?

Also, what is the meaning of a negative `accuracy_decrease`?