Open LuBolin opened 4 years ago
Thank you for reporting this issue.
I checked the demo to reproduce this issue. In this case with warm_start=True
, I found that the population became very homogeneous in later iterations, since the population is carried over between iterations when warm_start=True
, and the point mutation operator then sometimes could not generate new pipelines (which only happens when warm_start=True
and TPOTClassifier is fitted in a loop). My guesses about this issue:
warm_start=True
We need to fix this bug related to warm_start.
I just submitted a PR (#989) fixing the bug mentioned in point 1 above.
Below is an updated demo for testing this issue (because in this PR evaluated_individuals_
is also shared between iterations).
import numpy as np
from tpot import TPOTClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Fix the random state so the demo is reproducible.
np.random.seed(123)

# Synthetic data: 1000 random integer features with random binary labels.
independent = np.random.randint(100, size=1000)
dependent = np.random.randint(2, size=1000)

X_train, X_test, Y_train, Y_test = train_test_split(
    independent, dependent, train_size=0.7, test_size=0.3
)
# TPOT expects a 2-D feature matrix, so promote the 1-D vector to a column.
X_train = X_train.reshape(-1, 1)
# Search space handed to TPOT via config_dict: four classifiers plus a few
# simple preprocessing transformers. Built from two private sub-dicts and
# merged, so the classifier and transformer portions read separately.
_classifier_space = {
    "sklearn.linear_model.LogisticRegression": {
        "penalty": ["l1", "l2"],
        "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0],
        "dual": [False],
    },
    "sklearn.tree.DecisionTreeClassifier": {
        "max_depth": range(1, 21),
        "min_samples_split": range(2, 21),
        "min_samples_leaf": range(1, 21),
    },
    "sklearn.ensemble.RandomForestClassifier": {
        "n_estimators": np.arange(10, 201, 5),
        "max_features": np.arange(0.05, 1.01, 0.05),
        "min_samples_split": range(2, 21),
        "min_samples_leaf": range(1, 21),
        "bootstrap": [True, False],
    },
    "xgboost.XGBClassifier": {
        "objective": ['reg:squarederror'],
        "n_estimators": np.arange(10, 201, 5),
        "max_depth": range(1, 21),
        "learning_rate": [1e-3, 1e-2, 1e-1, 0.5, 1.0],
        "subsample": np.arange(0.05, 1, 0.05),
        "min_child_weight": range(1, 21),
        "nthread": [1],
    },
}
# Transformers
_transformer_space = {
    "sklearn.preprocessing.Binarizer": {"threshold": np.arange(0.0, 1.01, 0.05)},
    "sklearn.preprocessing.MinMaxScaler": {},
    "sklearn.preprocessing.RobustScaler": {},
    "sklearn.preprocessing.StandardScaler": {},
}
medium_config = {**_classifier_space, **_transformer_space}
# In[2]:
# Demo 1: template with a Transformer step. With warm_start=True the same
# TPOT instance is refit in a loop, so evaluated_individuals_ accumulates
# pipelines across iterations.
testC2 = TPOTClassifier(
    generations=2,
    population_size=30,
    verbosity=2,
    config_dict=medium_config,
    n_jobs=2,
    scoring="accuracy",
    random_state=123,
    use_dask=True,
    template="Transformer-Classifier",
    warm_start=True,
)
scatter = []          # per-model lists of cv scores, index-aligned with pipelineNames
pipelineNames = []    # root-operator names in discovery order
uniq_ind_count = []   # number of newly evaluated pipelines per iteration
old_inds = []         # (pipeline, description) pairs seen in prior iterations
for i in range(20):
    testC2.fit(X_train, Y_train)
    all_inds = list(testC2.evaluated_individuals_.items())
    # Pipelines first evaluated during this iteration.
    tuples = [ind for ind in all_inds if ind not in old_inds]
    old_inds = all_inds
    # Best score first, so the first hit per model below is its best new pipeline.
    tuples.sort(key=lambda x: x[1]["internal_cv_score"], reverse=True)
    shownModels = []
    uniq_ind = []
    for ind in tuples:
        # Membership test instead of list.count() — same result, clearer and O(n)
        # per element rather than a full scan via count().
        if ind[0] not in uniq_ind:
            uniq_ind.append(ind[0])
    print('Iteration', i, '# Unique new pipelines', len(uniq_ind))
    uniq_ind_count.append(len(uniq_ind))
    for pipeline, description in tuples:
        # Root operator name, e.g. "LogisticRegression(...)" -> "LogisticRegression".
        name = pipeline[: pipeline.find("(")]
        if name in shownModels:
            continue  # only track the best new pipeline per model per iteration
        # Register any model the first time it is ever seen. The original demo
        # only registered names in iteration 0, so a model first appearing
        # later crashed on pipelineNames.index(name) with ValueError.
        if name not in pipelineNames:
            pipelineNames.append(name)
            scatter.append([])
        shownModels.append(name)
        score = description["internal_cv_score"]
        pipelineNames_idx = pipelineNames.index(name)
        scatter[pipelineNames_idx].append(score)
# In[6]:
fig, ax = plt.subplots(1, 1)
for j in range(len(scatter)):
    ax.plot(range(0, len(scatter[j])), scatter[j], "-x", label=pipelineNames[j])
ax.legend(pipelineNames)
plt.show()
# In[4]:
print('Number of unique/new pipelines in each iteration', uniq_ind_count)
# In[5]:
print('Total number of unique pipelines in 20 iterations', len(list(testC2.evaluated_individuals_.keys())))
# In[7]:
# Demo 2: same run but with a Classifier-only template, for comparison
# against the Transformer-Classifier run above.
testC = TPOTClassifier(
    generations=2,
    population_size=30,
    verbosity=2,
    config_dict=medium_config,
    n_jobs=2,
    scoring="accuracy",
    random_state=123,
    use_dask=True,
    template="Classifier",
    warm_start=True,
)
scatter = []          # per-model lists of cv scores, index-aligned with pipelineNames
pipelineNames = []    # root-operator names in discovery order
uniq_ind_count = []   # number of newly evaluated pipelines per iteration
old_inds = []         # (pipeline, description) pairs seen in prior iterations
for i in range(20):
    testC.fit(X_train, Y_train)
    all_inds = list(testC.evaluated_individuals_.items())
    # Pipelines first evaluated during this iteration.
    tuples = [ind for ind in all_inds if ind not in old_inds]
    old_inds = all_inds
    # Best score first, so the first hit per model below is its best new pipeline.
    tuples.sort(key=lambda x: x[1]["internal_cv_score"], reverse=True)
    shownModels = []
    uniq_ind = []
    for ind in tuples:
        # Membership test instead of list.count() — same result, clearer and O(n)
        # per element rather than a full scan via count().
        if ind[0] not in uniq_ind:
            uniq_ind.append(ind[0])
    print('Iteration', i, '# Unique new pipelines', len(uniq_ind))
    uniq_ind_count.append(len(uniq_ind))
    for pipeline, description in tuples:
        # Root operator name, e.g. "LogisticRegression(...)" -> "LogisticRegression".
        name = pipeline[: pipeline.find("(")]
        if name in shownModels:
            continue  # only track the best new pipeline per model per iteration
        # Register any model the first time it is ever seen. The original demo
        # only registered names in iteration 0, so a model first appearing
        # later crashed on pipelineNames.index(name) with ValueError.
        if name not in pipelineNames:
            pipelineNames.append(name)
            scatter.append([])
        shownModels.append(name)
        score = description["internal_cv_score"]
        pipelineNames_idx = pipelineNames.index(name)
        scatter[pipelineNames_idx].append(score)
# In[8]:
fig, ax = plt.subplots(1, 1)
for j in range(len(scatter)):
    ax.plot(range(0, len(scatter[j])), scatter[j], "-x", label=pipelineNames[j])
ax.legend(pipelineNames)
plt.show()
As the figures above show, when template='Classifier', some models stopped appearing among the newly evaluated pipelines after more than 3 iterations. Similar to my guess above, I think the reason is that point mutations could only affect the transformer step or the hyperparameters of one or two classifiers in this small population, since after a few iterations the population became homogeneous — dominated by one or two classifiers with high accuracy scores.
For testing the demo above, you may install TPOT with patch into your environment via:
pip install --upgrade --no-deps --force-reinstall git+https://github.com/EpistasisLab/tpot.git@development
Note: scikit-learn may need to be updated to 0.22 for using dev branch.
Thank you for the fix. I am still unsure though, what does "mutation could happen on transformer step only in this small-size population" mean? And are you planning to fix that problem anytime soon? Again, thank you.
I meant that point mutations could only happen on the transformer step or on the hyperparameters of one or two classifiers in this small population, since the population in later iterations is full of solutions containing only one or two classifiers. For example, if only XGBClassifier pipelines become dominant with high fitness scores in an iteration, a point mutation will most likely just tune one hyperparameter of XGBClassifier or of the Transformer (if a hyperparameter is available); switching to a new Classifier via point mutation can also happen, but the chance is lower. And unless a solution with the new Classifier has a better or similar fitness score, it cannot survive to the next generation or iteration after the selection step in GP. I will dig into this issue a little more — maybe a better mutation method can be added to TPOT.
Understood, thank you.
Context of the issue
So I am trying to "stalk" the score of pipelines during training with warm start and small runs. I have managed to do so for both the "Classifier" and "Regressor" templates, but pipelines stop appearing in the evaluated pipelines after I add "Transformer-" to the template.
Process to reproduce the issue
Run the following codes and compare testC and testC2's output.
independent=np.random.randint(100,size=1000) dependent=np.random.randint(2,size=1000) X_train, X_test, Y_train, Y_test = train_test_split( independent, dependent, train_size=0.7, test_size=0.3 ) X_train=X_train.reshape(-1,1)
medium_config = { "sklearn.linear_model.LogisticRegression": { "penalty": ["l1", "l2"], "C": [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1.0, 5.0, 10.0, 15.0, 20.0, 25.0], "dual": [False], }, "sklearn.tree.DecisionTreeClassifier": { "max_depth": range(1, 21), "min_samples_split": range(2, 21), "min_samples_leaf": range(1, 21), }, "sklearn.ensemble.RandomForestClassifier": { "n_estimators": np.arange(10, 201, 5), "max_features": np.arange(0.05, 1.01, 0.05), "min_samples_split": range(2, 21), "min_samples_leaf": range(1, 21), "bootstrap": [True, False], }, "xgboost.XGBClassifier": { "objective": ['reg:squarederror'], "n_estimators": np.arange(10, 201, 5), "max_depth": range(1, 21), "learning_rate": [1e-3, 1e-2, 1e-1, 0.5, 1.0], "subsample": np.arange(0.05, 1, 0.05), "min_child_weight": range(1, 21), "nthread": [1], },
Transformers
}
testC = tpotC( generations=2, population_size=30, verbosity=3, config_dict=medium_config, n_jobs=2, scoring="accuracy", random_state=123, use_dask=True, template="Classifier", warm_start=True, )
testC2 = tpotC( generations=2, population_size=30, verbosity=3, config_dict=medium_config, n_jobs=2, scoring="accuracy", random_state=123, use_dask=True, template="Transformer-Classifier", warm_start=True, )
scatter = [] pipelineNames=[]
Change testC to testC2 to see the problem
for i in range(20): testC.fit(X_train,Y_train)
Expected result
template Transformer-Classifier to behave like template Classifier, where every time I request for evaluated_pipelines, all models are in there. Also Regressors for the same thing.
Current result
With Transformer in the template, certain models stop being included in the evaluated pipelines, and which models get left out is not consistent across runs.
Possible fix
No idea actually, help!
Screenshot
Classifier
Transformer-Classifier
Thank you.