Closed: xdadda closed this issue 2 years ago
Update: I started looking into this, implementing the calculation I found in the scikit-learn source code: https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/_tree.pyx#L1052
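In short, that routine sums, per feature, the weighted impurity decrease of every internal node that splits on that feature, then normalizes the totals. A rough JavaScript sketch of the idea (the node shape with feature/impurity/samples fields is only an assumption for illustration, not the ml-cart format):

// Rough sketch of scikit-learn's MDI idea, assuming a hypothetical node shape
// { feature, impurity, samples, left, right } (NOT the ml-cart format).
function computeFeatureImportances(root, nFeatures) {
  const importances = new Array(nFeatures).fill(0);
  (function walk(node) {
    if (!node || !node.left || !node.right) return; // leaves contribute nothing
    // weighted impurity decrease of this split, credited to the split feature
    importances[node.feature] +=
      node.samples * node.impurity -
      node.left.samples * node.left.impurity -
      node.right.samples * node.right.impurity;
    walk(node.left);
    walk(node.right);
  })(root);
  const total = importances.reduce((a, b) => a + b, 0);
  return importances.map((v) => (total > 0 ? v / total : 0)); // normalize to sum to 1
}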
I modified the cart source code to add the samples value to the decision tree estimator. However, I stumbled upon a problem: all the estimators in the RF ensemble seem to be identical to each other. This is the simple source code I've used to browse through the decision trees:
import IrisDataset from "ml-dataset-iris";
import { RandomForestClassifier as RFClassifier } from "ml-random-forest";

const trainingSet = IrisDataset.getNumbers();
const predictions = IrisDataset.getClasses().map(
  (elem) => IrisDataset.getDistinctClasses().indexOf(elem)
);

function getAccuracy(predictions, target) {
  const nSamples = predictions.length;
  let nCorrect = 0;
  predictions.forEach((val, idx) => {
    if (val == target[idx]) {
      nCorrect++;
    }
  });
  return nCorrect / nSamples;
}

(() => {
  console.log("Random Forest Model");
  const options = {
    seed: 5,
    maxFeatures: 0.8,
    replacement: true,
    nEstimators: 25
  };
  const classifier = new RFClassifier(options);
  classifier.train(trainingSet, predictions);
  const result = classifier.predict(trainingSet);
  console.log("Accuracy: " + getAccuracy(result, predictions)); // Accuracy: 0.74

  const trees = JSON.parse(JSON.stringify(classifier.estimators));

  // PRINT DECISION TREES
  function print_node(node, depth = 0, label = 'root') {
    if (!node) return;
    console.log(
      '\t'.repeat(depth), '[' + label + ']',
      ("splitColumn" in node) ? node.splitColumn : '-',
      node.splitValue || '-',
      Math.round(node.gain * 1000) / 1000 || '-',
      node.samples || '-',
      '->'
    );
    if (node.left) print_node(node.left, depth + 1, 'left');
    if (node.right) print_node(node.right, depth + 1, 'right');
  }

  for (let i = 0; i < trees.length; i++) {
    console.log('--------------------');
    console.log('TREE', i);
    console.log('--------------------');
    console.log('label, feature, split, gini, samples');
    print_node(trees[i].root);
    console.log('--------------------');
    console.log('--------------------');
  }
})();
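Incidentally, the quickest check I could think of for the "identical estimators" observation is to compare the serialized roots. This is just a sketch, meant to run inside the IIFE above where trees is in scope:

// Quick sanity check: are all serialized tree roots byte-for-byte identical?
const serialized = trees.map((t) => JSON.stringify(t.root));
console.log('all estimators identical:', serialized.every((s) => s === serialized[0]));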
I'm sure I'm missing something. Can anyone point me in the right direction? Thanks!
It would be awesome to have feature importance implemented!
Almost forgot: this is my first try at calculating feature importance. BTW, set seed to undefined, otherwise all the trees in the ensemble come out identical! Is this a bug in the RF or intended behaviour? Let me know what you think.
import IrisDataset from "ml-dataset-iris";
import { RandomForestClassifier as RFClassifier } from "ml-random-forest";
import ConfusionMatrix from 'ml-confusion-matrix';

const trainingSet = IrisDataset.getNumbers();
const predictions = IrisDataset.getClasses().map(
  (elem) => IrisDataset.getDistinctClasses().indexOf(elem)
);
const features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'];

function print_accuracy(labels, predictions) {
  const CM2 = ConfusionMatrix.fromLabels(labels, predictions);
  const accuracy = Math.round(CM2.getAccuracy() * 1000);
  console.log("Accuracy:\t\t" + accuracy / 10 + '%');
  //console.log(CM2.getConfusionTable(1))
}

function _rf_feature_importance(rf, num_features) {
  // EVALUATE FEATURE IMPORTANCE FOR EACH TREE IN THE ENSEMBLE
  const trees = JSON.parse(JSON.stringify(rf.estimators));
  const indexes = JSON.parse(JSON.stringify(rf.indexes));
  let importance = [];

  function compute_feature_importances(i, node) {
    if (!node || !("splitColumn" in node) || !(node.gain > 0)) return;
    let f = node.gain * node.samples;
    if ("left" in node) f -= (node.left.gain || 0) * (node.left.samples || 0);
    if ("right" in node) f -= (node.right.gain || 0) * (node.right.samples || 0);
    importance[i][node.splitColumn] += f;
    if (node.left) compute_feature_importances(i, node.left);
    if (node.right) compute_feature_importances(i, node.right);
  }

  function normalize_importance(i) {
    const s = importance[i].reduce((cum, v) => cum + v, 0);
    importance[i] = importance[i].map((v) => v / s);
  }

  for (let i = 0; i < trees.length; i++) {
    importance.push(new Array(num_features).fill(0.0));
    compute_feature_importances(i, trees[i].root);
    normalize_importance(i);
  }

  // CALCULATE MEAN
  let avg_importance = new Array(num_features).fill(0.0);
  for (let i = 0; i < importance.length; i++) {
    for (let x = 0; x < num_features; x++) {
      avg_importance[indexes[i][x]] += importance[i][x];
    }
  }
  const s = avg_importance.reduce((cum, v) => cum + v, 0);
  return avg_importance.map((v) => v / s);
}

(() => {
  console.log("*******************");
  console.log("Random Forest Model");
  console.log("*******************");
  const rfOptions = {
    seed: undefined, // important!! otherwise all trees in the ensemble will be identical
    maxFeatures: 0.8,
    replacement: true,
    nEstimators: 45,
    useSampleBagging: true
  };
  const rf = new RFClassifier(rfOptions);
  rf.train(trainingSet, predictions);
  const result = rf.predict(trainingSet);
  print_accuracy(predictions, result);

  const avg_importance = _rf_feature_importance(rf, features.length);
  console.log('Feature importance:');
  let str = '';
  for (let f = 0; f < features.length; f++) {
    str = str.concat('\t' + features[f] + '\t' + Math.round(avg_importance[f] * 1000) / 10 + '%\n');
  }
  console.log(str);
})();
Forgot to add these edits to ml-cart/cart.js (sorry, I don't know how to show them here as diffs).
Add a samples variable to the bestSplit function:
bestSplit(XTranspose, y) {
  // Depending on the node tree class, we set the variables to check information gain (to classify)
  // or error (for regression)
  let bestGain = this.kind === 'classifier' ? -Infinity : Infinity;
  let check = this.kind === 'classifier' ? (a, b) => a > b : (a, b) => a < b;

  let maxColumn;
  let maxValue;
  let samples;

  for (let i = 0; i < XTranspose.rows; ++i) {
    let currentFeature = XTranspose.getRow(i);
    let splitValues = this.featureSplit(currentFeature, y);
    for (let j = 0; j < splitValues.length; ++j) {
      let currentSplitVal = splitValues[j];
      let splitted = this.split(currentFeature, y, currentSplitVal);
      let gain = gainFunctions[this.gainFunction](y, splitted);
      if (check(gain, bestGain)) {
        maxColumn = i;
        maxValue = currentSplitVal;
        bestGain = gain;
        samples = currentFeature.length;
      }
    }
  }

  return {
    maxGain: bestGain,
    maxColumn: maxColumn,
    maxValue: maxValue,
    samples: samples
  };
}
and in the train function:
train(X, y, currentDepth, parentGain) {
  if (X.rows <= this.minNumSamples) {
    this.calculatePrediction(y);
    return;
  }
  if (parentGain === undefined) parentGain = 0.0;

  let XTranspose = X.transpose();
  let split = this.bestSplit(XTranspose, y);
  this.splitValue = split.maxValue;
  this.splitColumn = split.maxColumn;
  this.gain = split.maxGain;
  this.samples = split.samples;
  [...]
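An equivalent (and maybe less invasive) variant would be to skip the bestSplit change and record the count directly in train, since the number of samples at a node is just the number of labels reaching it. Untested sketch:

// Untested alternative: in train(), y holds the labels that reach this node,
// so its length is the node's sample count (same value as split.samples above).
this.samples = y.length;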
Closed by 5ca902d074257b82be5947fc56478ab6d369f7b0.
Hi, I was looking to get some sort of feature importance out of the RF, at least the basic Mean Decrease in Impurity. I quickly browsed through the source code (including ml-cart), but implementing such a function (as present in scikit-learn) is clearly above my current skills. Any chance it could be implemented in the future? Thanks!