neurodata / SPORF

This is the implementation of Sparse Projection Oblique Randomer Forest
https://neurodata.io/forests/
97 stars 46 forks source link

oob score changes test accuracy #343

Open rflperry opened 4 years ago

rflperry commented 4 years ago

When `oob_score=True`, the test-set classification accuracy does not match the accuracy obtained with `oob_score=False`, and it also varies between runs even when `random_state` is fixed.

# Minimal reproduction: fit two classifiers that are constructed identically
# except for ``oob_score`` and train them on the same split.
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

from rerf.rerfClassifier import rerfClassifier

# Load dataset
iris = datasets.load_iris()

# Build the DataFrame that the feature/label selection below indexes into.
# Without this definition the script fails with NameError on ``data``.
data = pd.DataFrame(
    {
        "sepal length": iris.data[:, 0],
        "sepal width": iris.data[:, 1],
        "petal length": iris.data[:, 2],
        "petal width": iris.data[:, 3],
        "species": iris.target,
    }
)
X = data[["sepal length", "sepal width", "petal length", "petal width"]]  # Features
y = data["species"]  # Labels

# Split dataset into training set and test set
# NOTE: no random_state is passed to train_test_split, so the split itself
# changes from run to run; both classifiers still share the same split
# within a single run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3
)  # 70% training and 30% test

# Same estimator count and seed; only the out-of-bag flag differs.
clf1 = rerfClassifier(n_estimators=10, oob_score=False, random_state=2)
clf2 = rerfClassifier(n_estimators=10, oob_score=True, random_state=2)
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
rerfClassifier(feature_combinations=1.5, image_height=None, image_width=None,
               max_depth=None, max_features='auto', min_samples_split=1,
               n_estimators=10, n_jobs=None, oob_score=True,
               patch_height_max=None, patch_height_min=1, patch_width_max=None,
               patch_width_min=1, projection_matrix='RerF', random_state=2)
from sklearn import metrics

# Predict with both fitted models on the same held-out features, in the
# original order (clf1 first, then clf2).
y_pred1, y_pred2 = (model.predict(X_test) for model in (clf1, clf2))

# Report test accuracy for the non-OOB and OOB classifiers respectively.
for label, predictions in (("Accuracy:", y_pred1), ("Accuracy oob:", y_pred2)):
    print(label, metrics.accuracy_score(y_test, predictions))
Accuracy: 0.9555555555555556
Accuracy oob: 0.9333333333333333