WMJi commented 6 years ago

I want to generate code (Python for now, but ultimately C) from a trained gradient boosted classifier (from sklearn). As far as I understand it, the model starts from an initial prediction and then adds the predictions of sequentially trained regression trees, each scaled by the learning rate. The chosen class is then the class with the highest output value.

This is the code I have so far:

def recursep_gbm(left, right, threshold, features, node, depth, value, out_name, scale):
    """Print Python source that applies one regression tree to ``out_name``.

    Walks the tree rooted at ``node`` depth-first and prints a nested
    ``if``/``else`` chain. A split node prints
    ``if <feature> <= <threshold>:`` followed by its left subtree, then
    ``else:`` followed by its right subtree. A leaf prints
    ``<out_name> += <scale> * <leaf value>``.

    Args:
        left: Per-node index of the left child; -1 means "no child".
        right: Per-node index of the right child; -1 means "no child".
        threshold: Per-node split threshold, printed verbatim via ``str``.
        features: Per-node feature name (only read for split nodes).
        node: Index of the node to print.
        depth: Nesting level of ``node``; each level indents by 4 spaces.
        value: Per-node values, read as ``value[node][0, 0]``.
        out_name: Text of the accumulator to update, e.g. ``"scores[0]"``.
        scale: Multiplier printed in front of every leaf value.

    Returns:
        None. The generated code is written to standard output.
    """
    indent = ' ' * (4 * depth)

    # A node without children is a leaf. Testing the child indices rather
    # than ``threshold == -2`` keeps a genuine split at -2.0 from being
    # mistaken for a leaf.
    if left[node] == -1 and right[node] == -1:
        # Only leaves contribute to the score.
        print(indent + out_name + " += " + str(scale) + " * " + str(value[node][0, 0]))
        return

    child_indent = ' ' * (4 * (depth + 1))

    print(indent + "if " + features[node] + " <= " + str(threshold[node]) + ":")
    if left[node] != -1:
        recursep_gbm(left, right, threshold, features, left[node], depth + 1, value, out_name, scale)
    else:
        # Keep the emitted code syntactically valid if a branch is missing.
        print(child_indent + "pass")

    print(indent + "else:")
    if right[node] != -1:
        recursep_gbm(left, right, threshold, features, right[node], depth + 1, value, out_name, scale)
    else:
        print(child_indent + "pass")

def print_GBM_python(gbm_model, feature_names, X_data, l_rate):
    """Print Python source that reproduces a fitted gradient boosting model.

    The emitted code initialises a ``scores`` array, adds the scaled
    contribution of every tree in ``gbm_model.estimators_`` to the score of
    the class that tree belongs to, and finishes with
    ``return np.argmax(scores)``.

    Args:
        gbm_model: Fitted model exposing ``estimators_`` (iterated as groups
            of per-class trees) and ``predict_log_proba``.
        feature_names: Names used in the emitted conditions, indexed by the
            feature indices stored in each tree.
        X_data: Samples used to compute the initial scores (F0).
        l_rate: Multiplier printed in front of every tree contribution.

    Returns:
        None. The generated code is written to standard output.
    """
    print("PYTHON CODE")

    # F0
    # Use the model passed in, not a global ``clf``.
    # NOTE(review): predict_log_proba presumably reflects the fully fitted
    # model (all estimators), so its mean may not equal the model's initial
    # prediction; deriving F0 from ``gbm_model.init_`` is likely the correct
    # source -- TODO confirm against the installed scikit-learn version.
    f0_probs = np.mean(gbm_model.predict_log_proba(X_data), axis=0)
    probs = ", ".join([str(prob) for prob in f0_probs])
    print("# Initial probabilities (F0)")
    print("scores = np.array([%s])" % probs)

    print("# Update scores for each estimator")
    for tree_group in gbm_model.estimators_:
        # ``k`` selects the entry of ``scores`` that this tree updates.
        for k, tree in enumerate(tree_group):
            fitted = tree.tree_
            # Leaf nodes presumably carry a negative placeholder feature
            # index; skip the name lookup for them so a short
            # ``feature_names`` list cannot raise IndexError.
            features = [feature_names[i] if i >= 0 else None for i in fitted.feature]

            recursep_gbm(
                fitted.children_left,
                fitted.children_right,
                fitted.threshold,
                features,
                0,
                0,
                fitted.value,
                "scores[%i]" % k,
                l_rate,
            )

    print("# Get class with max score")
    print("return np.argmax(scores)")

This is an example of what it generates (with 3 classes, 2 estimators, a max depth of 1 and a learning rate of 0.1):

# Initial probabilities (F0)
scores = np.array([-0.964890, -1.238279, -1.170222])

# Update scores for each estimator
if X1 <= 57.5:
    scores[0] += 0.1 * 1.60943587225
else:
    scores[0] += 0.1 * -0.908433703247
if X2 <= 0.000394500006223:
    scores[1] += 0.1 * -0.900203054177
else:
    scores[1] += 0.1 * 0.221484425933
if X2 <= 0.0340005010366:
    scores[2] += 0.1 * -0.848148803219
else:
    scores[2] += 0.1 * 1.98100820717

if X1 <= 57.5:
    scores[0] += 0.1 * 1.38506104792
else:
    scores[0] += 0.1 * -0.855930587354
if X1 <= 43.5:
    scores[1] += 0.1 * -0.810729087535
else:
    scores[1] += 0.1 * 0.237980820334
if X2 <= 0.027434501797:
    scores[2] += 0.1 * -0.815242297324
else:
    scores[2] += 0.1 * 1.69970863021

# Get class with max score
return np.argmax(scores)

I used the log probability as F0, based on this.

For one estimator it gives me the same predictions as the predict method on the trained model. However, when I add more estimators, the predictions start to deviate. Am I supposed to incorporate the step length (described here)? Also, is my F0 correct: should I be taking the mean, and should I convert the log-probabilities to something else?