ModelOriented / EIX

Structure mining for xgboost model
https://modeloriented.github.io/EIX/
25 stars 5 forks source link

Issue interactions lightgbm uneven columns when rbindlist(treeList) #6

Open felxcon opened 2 years ago

felxcon commented 2 years ago

Hi, I tried to run your function EIX::interactions on a data set of 1(qf2020_div)+8 numeric columns with about 80,000 rows together with a tuned lightgbm model. However, an error is reported as follows: "Error in rbindlist(treeList) : Item 97 has 13 columns, inconsistent with item 1 which has 19 columns. To fill missing columns use fill=TRUE." This does not happen when running the default lightgbm model parameter settings.

When I try with a small subset (100 rows), I encounter the same "uneven columns" error when the parameters from num_leaves to min_gain_to_split are turned off. Otherwise, "Error: comparison (1) is possible only for atomic and list types" happens, or that error happens: "" Any idea why this occurs?

I used this code:

mmmf_df_100 = structure(list(qf2020_div = c(-0.683344740108416, -0.62200251820213, -0.660933392581695, -0.931454042941375, -0.678846234812683, -0.678709195184706, -0.62200251820213, -0.619032040654088, -0.741462927558781, -0.882350949746443, -0.747540455479868, -0.743496834435778, 0, -0.63301735032532, -0.850596218655163, -0.860808275916884, -0.62200251820213, -0.669529409627363, -0.675469611757471, 0, 0, 0, -0.388044330254854, -0.850634478054759, 0, 0, -0.617546396858118, -0.8891822325675, -0.703075765512668, -0.886130787928763, -0.681806828303268, 0, -0.88604646624308, -0.926167021298114, -0.692090760819216, -0.660933392581695, -0.83931706735653, -0.881578476738358, -0.684460497124147, -0.705416304923849, -0.685713271747449, -0.686152296703342, -0.88723658127604, -0.846382748304772, -0.62200251820213, -0.720211468617393, -0.684998539883293, -0.675830994910749, -0.61719971562315, -0.908777071672487, 0, 0, 0, -0.813671235655738, 0, -0.886130787928763, 0, -0.388179591352467, -0.889236363195927, -0.883763006684634, 0, -0.681806828303268, -0.692090760819216, -0.670785617377905, -0.675573715067695, 0, -0.746739480916366, -0.684460497124147, -0.738360299567337, 0, -0.692090760819216, 0, -0.640423140555064, -0.695504563944157, 0, -0.613657933810985, -0.74923545834839, -0.660933392581695, -0.821653413397282, -0.738971403646119, -0.61719971562315, -0.678846234812683, -0.819372375152443, -0.720211468617393, -0.886130787928763, -0.629409957539496, -0.680296374263876, 0, -0.844873743332596, -0.619032040654088, 0, -0.670286891070436, -0.678278455996463, -0.739735765831987, -0.602360477184269, 0, -0.692034388476076, -0.675469611757471, -0.886130787928763, -0.684998539883293), watershed = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3), compactnes = cmmm_fsize = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), curve_numbe = c(65, 70, 70, 87, 65, 65, 70, 70, 65, 84, 65, 65, 56, 70, 77, 77, 70, 65, 65, 50, 50, 56, 56, 77, 56, 50, 70, 77, 65, 79, 70, 50, 77, 87, 65, 70, 77, 84, 70, 65, 70, 70, 79, 77, 70, 65, 70, 70, 70, 79, 56, 50, 50, 83, 56, 79, 50, 56, 84, 79, 56, 70, 65, 65, 65, 56, 65, 70, 65, 50, 65, 50, 70, 65, 50, 70, 65, 70, 83, 65, 70, 65, 83, 65, 79, 70, 70, 50, 77, 70, 56, 65, 65, 65, 70, 50, 65, 65, 79, 70), hsg = c(0, 2, 2, 0, 0, 0, 2, 2, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 2), aspect = c(86.1312789916992, 16.4371280670166, 5.29735994338989, 201.93327331543, 322.773468017578, 89.2040252685547, 219.443618774414, 314.053527832031, 180.033554077148, 312.130004882812, 193.037216186523, 9.60710716247559, 50.0658378601074, 279.368316650391, 63.079231262207, 296.165985107422, 249.407730102539, 264.652557373047, 263.472015380859, 113.461738586426, 352.356231689453, 73.4116973876953, 325.854156494141, 153.332122802734, 111.455612182617, 213.973266601562, 235.915802001953, 44.4550132751465, 266.452331542969, 130.592666625977, 278.087646484375, 12.3049230575562, 194.30876159668, 269.795562744141, 273.022857666016, 181.224151611328, 27.4992580413818, 8.57164478302002, 198.986557006836, 33.6422309875488, 38.4557991027832, 178.001922607422, 200.945281982422, 359.575592041016, 348.970916748047, 145.922546386719, 303.911651611328, 272.455993652344, 337.481353759766, 83.7395782470703, 3.79256057739258, 
16.1068820953369, 342.483032226562, 250.845794677734, 202.625839233398, 115.868446350098, 125.002998352051, 54.4174537658691, 136.00732421875, 238.046249389648, 203.473831176758, 288.734497070312, 106.890609741211, 128.663162231445, 12.6799297332764, 71.0660247802734, 281.640441894531, 154.839492797852, 312.834503173828, 275.901824951172, 39.8677558898926, 90.3907852172852, 194.46012878418, 302.63037109375, 19.0449523925781, 12.9935855865479, 132.882751464844, 97.9574356079102, 336.990753173828, 59.7389221191406, 157.052444458008, 329.076629638672, 41.2350616455078, 49.0847129821777, 39.1097717285156, 44.3229064941406, 351.448303222656, 231.318969726562, 291.198272705078, 225.222579956055, 224.549331665039, 244.679962158203, 263.660064697266, 191.419311523438, 205.053527832031, 77.8348999023438, 320.839141845703, 270.162658691406, 147.001251220703, 101.014167785645), number_rain = c(4.11478662490845, 4.11478662490845, 4.69904613494873, 4.22641563415527, 4.11478662490845, 4.11478662490845, 4.11478662490845, 4.11478662490845, 5.06366062164307, 4.97407245635986, 4.97407245635986, 5.06366062164307, 4.22641563415527, 4.22641563415527, 4.11478662490845, 4.22641563415527, 4.11478662490845, 4.11478662490845, 4.11478662490845, 4.11478662490845, 4.11478662490845, 4.11478662490845, 4.11478662490845, 4.11478662490845, 4.4235634803772, 4.22641563415527, 4.11478662490845, 5.12175559997559, 4.22641563415527, 4.22641563415527, 5.06366062164307, 5.12175559997559, 5.06366062164307, 4.22641563415527, 4.22641563415527, 4.69904613494873, 4.11478662490845, 5.06366062164307, 5.06366062164307, 4.22641563415527, 5.06366062164307, 4.97407245635986, 4.22641563415527, 4.11478662490845, 4.11478662490845, 4.69904613494873, 5.06366062164307, 5.06366062164307, 4.11478662490845, 4.97407245635986, 4.22641563415527, 4.11478662490845, 5.12175559997559, 4.11478662490845, 4.22641563415527, 4.22641563415527, 4.22641563415527, 4.11478662490845, 4.11478662490845, 4.22641563415527, 4.11478662490845, 
5.06366062164307, 4.22641563415527, 4.11478662490845, 4.11478662490845, 5.12175559997559, 5.06366062164307, 5.06366062164307, 5.06366062164307, 4.11478662490845, 4.22641563415527, 4.11478662490845, 4.22641563415527, 4.22641563415527, 4.22641563415527, 4.11478662490845, 4.97407245635986, 4.69904613494873, 4.11478662490845, 5.06366062164307, 4.11478662490845, 4.11478662490845, 4.11478662490845, 4.69904613494873, 4.22641563415527, 4.22641563415527, 5.12175559997559, 5.06366062164307, 4.11478662490845, 4.11478662490845, 4.22641563415527, 4.11478662490845, 4.11478662490845, 5.06366062164307, 4.11478662490845, 4.97407245635986, 4.22641563415527, 4.11478662490845, 4.22641563415527, 5.06366062164307), precipitat = c(105.631990780906, 105.631990780906, 113.525026987469, 95.7414173890674, 108.81347918132, 111.585622257657, 105.631990780906, 107.371452626728, 108.427198425172, 104.19571208197, 106.32300672077, 108.237226047213, 108.663529055459, 105.526280009557, 105.631990780906, 103.14488716731, 105.631990780906, 117.181404984187, 111.968032095167, 105.631990780906, 105.631990780906, 117.875289985112, 114.660130235884, 105.89557216281, 117.985597080655, 105.526280009557, 110.971759538802, 113.277256125496, 103.226915200551, 95.7414173890674, 107.448963218265, 113.277256125496, 115.204860694825, 102.406323243701, 108.663529055459, 113.525026987469, 116.909620224483, 106.32300672077, 104.19571208197, 100.017380517627, 105.95153774534, 105.95153774534, 96.1293804077875, 111.585622257657, 105.631990780906, 117.757825435154, 105.14771898966, 111.93966234298, 108.81347918132, 106.32300672077, 103.226915200551, 114.660130235884, 111.252964557163, 116.909620224483, 105.89557216281, 95.7414173890674, 96.1293804077875, 111.968032095167, 118.493426474314, 99.5515276136853, 117.875289985112, 107.448963218265, 108.663529055459, 117.875289985112, 112.081975460053, 107.530949713692, 105.14771898966, 104.19571208197, 113.237426341526, 118.493426474314, 108.663529055459, 105.631990780906, 
103.226915200551, 105.526280009557, 102.406323243701, 112.081975460053, 105.95153774534, 113.525026987469, 111.585622257657, 111.93966234298, 108.81347918132, 108.81347918132, 112.081975460053, 117.757825435154, 95.7414173890674, 108.663529055459, 111.608108346424, 113.237426341526, 111.900469795106, 107.371452626728, 108.146229834784, 116.909620224483, 108.206884308467, 112.154623977722, 119.449564721849, 106.32300672077, 105.89557216281, 111.968032095167, 95.7414173890674, 105.14771898966)), row.names = c(NA, -100L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x000001df3a7b1ef0>)

preprocess dataframe

mmmf_df_rules= lgb.convert_with_rules( data=mmmf_df)

extracted prepared dataframe

mmmf_df_prep = mmmf_df_rules$data

remove dependent variables

mmmf_df_prep_indie_vars <- as.matrix(mmmf_df_prep[, 2:8, with = FALSE])

create correct dataset for lightgbm model "training"

mmmf_lgb_ds<- lgb.Dataset(data = mmmf_df_prep_indie_vars , label = mmmf_df_prep $qf2020_div )

or define specific data.matrix

mmmf_df_prep_indie_vars_2 <- Matrix::sparse.model.matrix( qf2020_div ~ .,data = mmmf_df_prep, with = FALSE) mmmf_lgb_ds_2 <- lightgbm::lgb.Dataset(mmmf_df_prep_indie_vars_2)

define parameter space from tuned lightgbm model

params <- list(objective = "regression" , num_leaves = 100, num_iterations = 1863, learning_rate = 0.2556561, max_depth = 12, min_data_in_leaf = 34, min_gain_to_split = 0.001104944, num_threads = 1, boosting = "goss", tree_learner = "data", extra_trees = T, monotone_constraints_method = "advanced", feature_pre_filter = F, pre_partition = T, two_round = F, force_row_wise = T, force_col_wise = F, device_type = "cpu", verbosity = -1 )

train model on data and settings

lgb_model_intax <- lightgbm::lgb.train(params, mmmf_lgb_ds)

check if trees were created

treedt = lightgbm::lgb.model.dt.tree(lgb_model_intax)

extract the interactions for plotting

inter <- EIX::interactions(lgb_model_intax, mmmf_lgb_ds, option = "interactions");plot(inter)

Of course I could also create and provide a reprex.

Happy for any advice :)

Felix

felxcon commented 2 years ago

I tried with https://github.com/AppliedDataSciencePartners/xgboostExplainer/issues/23

but there are no name clashes, and the dataset is a matrix and an lgb.Dataset

Echo-Bong commented 2 years ago

I also ran into this problem. I checked the feature names and used the xgb.DMatrix data, but nothing works.

denysed commented 1 year ago

I've run into the same issue. Using the argument fill = TRUE for every instance of rbindlist across all functions called in interactions and lollipop appears to be a solution to this problem (or at least worked for me, though I haven't thoroughly tested whether the output has any issues).