mljs / random-forest

Random forest for classification and regression.
https://mljs.github.io/random-forest/
MIT License
61 stars 21 forks source link

Feature Importance #15

Closed xdadda closed 2 years ago

xdadda commented 5 years ago

Hi, I was looking to get some sort of feature importance out of the RF, at least the basic Mean Decrease in Impurity. I quickly browsed through the source code (including ml-cart), but implementing such a function (as present in scikit-learn) is clearly beyond my current skills. Any chance it can be implemented in the future? Thanks.

xdadda commented 5 years ago

Update. I started looking into this, implementing the calculation I found in scikit-learn source code https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L1052

I modified the cart source code to add the samples value to the decision tree estimator. However I stumbled upon a problem. It seems all estimators are identical to each other in the RF ensemble. This is the simple source code I've used to browse through the decision trees:

import IrisDataset from "ml-dataset-iris";
import { RandomForestClassifier as RFClassifier } from "ml-random-forest";

// Iris samples as returned by the dataset package; passed to `train`/`predict` below.
const trainingSet = IrisDataset.getNumbers();
// Fetch the class list once instead of once per sample inside the map callback.
const distinctClasses = IrisDataset.getDistinctClasses();
// NOTE: despite the name, `predictions` holds the target labels (each class
// encoded as its position in the distinct-class list), not model output.
const predictions = IrisDataset.getClasses().map(
  (elem) => distinctClasses.indexOf(elem)
);

/**
 * Computes the fraction of predictions that match their target label.
 *
 * Labels are compared with strict equality, so `1` and `'1'` count as a
 * mismatch.
 *
 * @param {Array} predictions - Predicted labels.
 * @param {Array} target - Expected labels, aligned by index with `predictions`.
 * @returns {number} Share of matching positions in [0, 1]; 0 for empty input.
 */
function getAccuracy(
  predictions,
  target
) {
  const nSamples = predictions.length;
  // Nothing to score: avoid returning NaN from 0 / 0.
  if (nSamples === 0) {
    return 0;
  }
  const nCorrect = predictions.reduce(
    (count, val, idx) => (val === target[idx] ? count + 1 : count),
    0
  );
  return nCorrect / nSamples;
}

// Demo script: trains a random forest on the Iris data, reports the accuracy
// on the training set and dumps every tree of the ensemble to the console.
(() => {
  console.log("Random Forest Model");
  const options = {
      // NOTE(review): with a fixed seed the author observed identical trees
      // across the whole ensemble — that is the behavior this script inspects.
      seed: 5,
      maxFeatures: 0.8,
      replacement: true,
      nEstimators: 25
  };

  const classifier = new RFClassifier(options);
  classifier.train(trainingSet, predictions);
  const result = classifier.predict(trainingSet);

  console.log("Accuracy: "+getAccuracy(result, predictions)); // Accuracy: 0.74

  // Serialize and re-parse so the trees can be walked as plain objects.
  const trees = JSON.parse(JSON.stringify(classifier.estimators));

  // Substitute '-' only for values that are really absent (undefined, null,
  // NaN). Valid falsy numbers such as a split value or gain of 0 are printed.
  const orDash = (value) =>
    (value === undefined || value === null || Number.isNaN(value) ? '-' : value);

  //PRINT DECISION TREES
  /**
   * Recursively prints one line per node: label, split feature, split value,
   * gain (rounded to 3 decimals) and sample count, indented by depth.
   *
   * @param {object} node - Tree node; falsy values are ignored.
   * @param {number} [depth=0] - Nesting level, rendered as leading tabs.
   * @param {string} [label='root'] - Position of the node relative to its parent.
   */
  function print_node(node, depth=0, label='root') {
    if (!node) return;
    const gain = typeof node.gain === 'number'
      ? Math.round(node.gain*1000)/1000
      : undefined;
    console.log(
      '\t'.repeat(depth),
      '['+label+']',
      ("splitColumn" in node) ? node.splitColumn : '-',
      orDash(node.splitValue),
      orDash(gain),
      orDash(node.samples),
      '->'
    );
    // print_node ignores missing children, so no extra guards are needed here.
    print_node(node.left, depth+1, 'left');
    print_node(node.right, depth+1, 'right');
  }

  for (let i = 0; i<trees.length; i++) {
    console.log('--------------------');
    console.log('TREE',i);
    console.log('--------------------');
    console.log('label, feature, split, gini, samples');
    print_node(trees[i].root);
    console.log('--------------------');
    console.log('--------------------');
  }
})();

I'm sure I'm missing something. Can anyone point me in the right direction? thanks

zemlyansky commented 5 years ago

It would be awesome to have feature importance implemented!

xdadda commented 5 years ago

Almost forgot: this is my first attempt at calculating feature importance. BTW, the seed has to be set to undefined, otherwise all trees in the ensemble are identical. Is this a bug in the RF or an intended feature? Let me know what you think.

import IrisDataset from "ml-dataset-iris";
import { RandomForestClassifier as RFClassifier } from "ml-random-forest";
import ConfusionMatrix from 'ml-confusion-matrix';

// Iris samples as returned by the dataset package; passed to `train`/`predict` below.
const trainingSet = IrisDataset.getNumbers();
// Fetch the class list once instead of once per sample inside the map callback.
const distinctClasses = IrisDataset.getDistinctClasses();
// NOTE: despite the name, `predictions` holds the target labels (each class
// encoded as its position in the distinct-class list), not model output.
const predictions = IrisDataset.getClasses().map(
  (elem) => distinctClasses.indexOf(elem)
);
// Display names used when printing the importance of each feature.
const features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

/**
 * Logs the accuracy of `predictions` against `labels` as a percentage with
 * one decimal place, derived from a confusion matrix.
 *
 * @param {Array} labels - Expected labels.
 * @param {Array} predictions - Predicted labels.
 */
function print_accuracy(labels, predictions) {
  const matrix = ConfusionMatrix.fromLabels(labels, predictions);
  // Round in per-mille first, then shift one digit to get a single decimal.
  const perMille = Math.round(matrix.getAccuracy() * 1000);
  const percent = perMille / 10;
  console.log(`Accuracy:\t\t${percent}%`);
  //console.log(matrix.getConfusionTable(1))
}

/**
 * Estimates the importance of each feature for a trained random forest.
 *
 * For every split node with a positive gain, `gain * samples` minus the same
 * product of its two children is credited to the node's split feature. The
 * scores are normalized per tree, mapped from tree-local columns back to the
 * original feature positions via `rf.indexes`, summed over all trees and
 * normalized again.
 *
 * Tree nodes must expose `gain` and `samples`; a split node without `samples`
 * produces NaN scores.
 * NOTE(review): the weighting mirrors scikit-learn's impurity-based
 * importance but uses the node `gain` in place of impurity — confirm this is
 * the intended measure.
 *
 * @param {object} rf - Trained forest with `estimators` (each serializing to
 *   an object with a `root` node) and `indexes` (per tree, the original
 *   feature index of every tree-local column).
 * @param {number} num_features - Total number of features in the data set.
 * @returns {number[]} One score per feature. The scores are divided by their
 *   sum, or are all 0 when no tree contains a split with positive gain.
 */
function _rf_feature_importance(rf, num_features) {
  //EVALUATE FEATURE IMPORTANCE FOR EACH TREE IN THE ENSEMBLE
  // Serialize and re-parse so the trees are walked as plain objects.
  const trees = JSON.parse(JSON.stringify(rf.estimators));
  const indexes = JSON.parse(JSON.stringify(rf.indexes));

  const zeros = () => Array.from({ length: num_features }, () => 0);

  // gain * samples of a child; missing children or fields contribute 0.
  const childWeight = (child) =>
    (child ? (child.gain || 0) * (child.samples || 0) : 0);

  // Divide by the total, but leave an all-zero vector untouched: 0 / 0 would
  // turn it into NaN and poison every sum it is later added to.
  const normalize = (values) => {
    const total = values.reduce((sum, v) => sum + v, 0);
    return total === 0 ? values : values.map((v) => v / total);
  };

  // Depth-first walk adding each split's contribution to `scores`.
  const collect = (scores, node) => {
    if (!node || !('splitColumn' in node) || !(node.gain > 0)) return;
    scores[node.splitColumn] +=
      node.gain * node.samples - childWeight(node.left) - childWeight(node.right);
    collect(scores, node.left);
    collect(scores, node.right);
  };

  const avg_importance = zeros();
  trees.forEach((tree, t) => {
    const scores = zeros();
    collect(scores, tree.root);
    const treeImportance = normalize(scores);

    // A tree may be trained on a subset of the features, so only its own
    // columns are mapped back to original feature positions.
    const columns = indexes[t];
    const used = Math.min(columns.length, num_features);
    for (let local = 0; local < used; local++) {
      avg_importance[columns[local]] += treeImportance[local];
    }
  });

  return normalize(avg_importance);
}

// Demo script: trains a random forest on the Iris data, prints the accuracy
// on the training set and then the importance of every feature in percent.
(() => {
  const banner = "*******************";
  console.log(banner);
  console.log("Random Forest Model");
  console.log(banner);

  const rf = new RFClassifier({
    // Left undefined on purpose: per the author, a fixed seed made all trees
    // in the ensemble identical.
    seed: undefined, //important!! otherwise all trees in the ensemble will be identical
    maxFeatures: 0.8,
    replacement: true,
    nEstimators: 45,
    useSampleBagging: true
  });
  rf.train(trainingSet, predictions);
  const result = rf.predict(trainingSet);

  print_accuracy(predictions, result);

  const avg_importance = _rf_feature_importance(rf, features.length);
  console.log('Feature importance:');
  // One tab-separated line per feature, value shown with one decimal place.
  const report = features
    .map((name, f) => `\t${name}\t${Math.round(avg_importance[f] * 1000) / 10}%\n`)
    .join('');
  console.log(report);
})();
xdadda commented 5 years ago

I forgot to add these edits to ml-cart/cart.js (sorry, I don't know how to show them here as diffs).

Add SAMPLES variable to the code in the bestSplit function

  bestSplit(XTranspose, y) {
    // Depending in the node tree class, we set the variables to check information gain (to classify)
    // or error (for regression)

    let bestGain = this.kind === 'classifier' ? -Infinity : Infinity;
    let check = this.kind === 'classifier' ? (a, b) => a > b : (a, b) => a < b;

    let maxColumn;
    let maxValue;
    let samples;

    for (let i = 0; i < XTranspose.rows; ++i) {
      let currentFeature = XTranspose.getRow(i);
      let splitValues = this.featureSplit(currentFeature, y);
      for (let j = 0; j < splitValues.length; ++j) {
        let currentSplitVal = splitValues[j];
        let splitted = this.split(currentFeature, y, currentSplitVal);

        let gain = gainFunctions[this.gainFunction](y, splitted);
        if (check(gain, bestGain)) {
          maxColumn = i;
          maxValue = currentSplitVal;
          bestGain = gain;
          samples = currentFeature.length
        }
      }
    }

    return {
      maxGain: bestGain,
      maxColumn: maxColumn,
      maxValue: maxValue,
      samples: samples
    };
  }

and in the train function

  train(X, y, currentDepth, parentGain) {
    if (X.rows <= this.minNumSamples) {
      this.calculatePrediction(y);
      return;
    }
    if (parentGain === undefined) parentGain = 0.0;

    let XTranspose = X.transpose();
    let split = this.bestSplit(XTranspose, y);

    this.splitValue = split.maxValue;
    this.splitColumn = split.maxColumn;
    this.gain = split.maxGain;
    this.samples = split.samples
    [...]
aiday-mar commented 2 years ago

closed by 5ca902d074257b82be5947fc56478ab6d369f7b0