Trying to replicate with different data set

huongvu16 commented 6 years ago

Hi Nicolo! Thanks for posting the example of Gradient Boosting in TensorFlow. I am trying to replicate your model using a different data set (the Lending Club data, i.e. the sample data used to run the examples in h2o.ai), and I am customizing your code to fit this data set (named 'processed.csv' here, from which I have deleted all rows that include 'nan' values).

By the way, I am only interested in running the Tensorflow model, not XGBoost.

I am running Python 3.6 in an Anaconda environment, with TensorFlow version 1.4, on Mac OS X 10.12.6.

The pre-processing part is as follows (I have omitted all the imports):

# Column names of the data set, including the 'bad_loan' label column that
# _get_df_from_file() splits off from the features.
# NOTE(review): `cols` is not referenced by the code shown here - presumably it
# records which columns were kept in 'processed.csv'; confirm.
cols=['loan_amnt', 'term','int_rate', 'emp_length','annual_inc','dti', 'delinq_2yrs',
      'bad_loan','revol_util', 'total_acc', 'longest_credit_length',
      'home_ownership','purpose', 'addr_state','verification_status']
def _get_df_from_file(file_name):

```    ```
df = pd.read_csv(file_name)
    labels = df['bad_loan']
    del df['bad_loan']
    return df, labels
if __name__ == '__main__':
    # Split the processed CSV into train/test sets and store them in one .npz.
    df, labels = _get_df_from_file('processed.csv')
    X_train, X_test, y_train, y_test = train_test_split(
        df, labels, test_size=0.25, random_state=42)

    data = dict(
        # Store the names as a plain string array. A pandas Index converts to
        # an object-dtype array, which np.savez pickles and which np.load
        # refuses to read by default in newer NumPy releases.
        feature_names=df.columns.values.astype(str),
        X_train=X_train, y_train=y_train,
        X_test=X_test, y_test=y_test,
    )
    for k, v in data.items():
        print(k, v.shape)

    np.savez('processed.npz', **data)

The result of this is:

X_train (118497, 14) y_train (118497,) feature_names (14,) X_test (39499, 14) y_test (39499,)

Now the model in Tensorflow (all the imports omitted):

# Parsed command-line flags; assigned in the `__main__` section before
# main() is run.
FLAGS = None


def _get_tfbt(output_dir, feature_cols):
    """Build the gradient-boosted-trees classifier configured from FLAGS.

    Args:
        output_dir: Directory passed to the estimator as `model_dir`.
        feature_cols: Feature columns passed to the estimator.

    Returns:
        A `GradientBoostedDecisionTreeClassifier` for two classes.
    """
    learner_config = learner_pb2.LearnerConfig()
    learner_config.learning_rate_tuner.fixed.learning_rate = FLAGS.learning_rate
    learner_config.regularization.l1 = 0.0
    # The --l2 flag is given per batch, so it is divided by the batch size.
    learner_config.regularization.l2 = FLAGS.l2 / FLAGS.batch_size
    learner_config.constraints.max_tree_depth = FLAGS.depth
    learner_config.growing_mode = learner_pb2.LearnerConfig.LAYER_BY_LAYER

    run_config = tf.contrib.learn.RunConfig(save_checkpoints_secs=30)

    estimator = GradientBoostedDecisionTreeClassifier(
        learner_config=learner_config,
        examples_per_layer=FLAGS.examples_per_layer,
        n_classes=2,
        num_trees=FLAGS.num_trees,
        feature_columns=feature_cols,
        model_dir=output_dir,
        config=run_config,
        center_bias=False)
    return estimator

def _matrix_to_dict(matrix, col_names):
        return{
        feat_name: matrix[:,feat_idx,np.newaxis]
        for feat_idx,feat_name in enumerate(col_names)}
def _make_input_fn(which_set):
    """Build the feature columns and an input function for one data split.

    Reads 'processed.npz' and wraps the requested split in a
    `numpy_input_fn`. The training input repeats indefinitely in shuffled
    batches of `FLAGS.batch_size` rows; the test input makes a single
    unshuffled pass.

    Args:
        which_set: Either 'train' or 'test'.

    Returns:
        A `(feature_columns, input_fn)` tuple.

    Raises:
        NotImplementedError: If `which_set` is neither 'train' nor 'test'.
    """
    # np.load keeps the archive open; the context manager closes it once the
    # arrays that are actually needed have been read into memory.
    with np.load('processed.npz') as data:
        feature_names = data['feature_names']
        if which_set == 'train':
            features, labels = data['X_train'], data['y_train']
        elif which_set == 'test':
            features, labels = data['X_test'], data['y_test']
        else:
            raise NotImplementedError()

    feature_columns = [feature_column.real_valued_column(k) for k in feature_names]
    x = _matrix_to_dict(features, feature_names)

    if which_set == 'train':
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x=x,
            y=labels,
            # Use the --batch_size flag so the batches match the batch size
            # that the l2 regularization is normalized by in _get_tfbt().
            batch_size=FLAGS.batch_size,
            num_epochs=None,
            shuffle=True)
    else:
        input_fn = tf.estimator.inputs.numpy_input_fn(
            x=x,
            y=labels,
            num_epochs=1,
            shuffle=False)
    return feature_columns, input_fn

def _make_experiment_fn(output_dir):
    """Build the Experiment that trains on the train split and evaluates on the test split.

    Args:
        output_dir: Directory handed to `_get_tfbt` for the estimator.

    Returns:
        A `tf.contrib.learn.Experiment`.
    """
    feature_columns, train_input_fn = _make_input_fn('train')
    # Both calls build the feature columns from the same 'feature_names'
    # array, so only the input function of the second call is needed.
    _, test_input_fn = _make_input_fn('test')

    return tf.contrib.learn.Experiment(
        estimator=_get_tfbt(output_dir, feature_columns),
        train_input_fn=train_input_fn,
        eval_input_fn=test_input_fn,
        train_steps=None,
        eval_metrics=None,
        eval_steps=None,
    )

def main(unused_argv):
    """Train and evaluate the model, then save its test-set predictions.

    The probability at index 1 of each prediction is collected and written
    to 'prediction_tf.npy' inside FLAGS.output_dir.
    """
    learn_runner.run(
        experiment_fn=_make_experiment_fn,
        output_dir=FLAGS.output_dir,
        schedule='train_and_evaluate')

    columns, predict_input_fn = _make_input_fn('test')
    model = _get_tfbt(FLAGS.output_dir, columns)

    class_one_probs = []
    for prediction in model.predict(input_fn=predict_input_fn):
        class_one_probs.append(prediction['probabilities'][1])

    target_path = os.path.join(FLAGS.output_dir, 'prediction_tf.npy')
    np.save(target_path, np.array(class_one_probs))

if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)

    # One row per command-line flag: (name, type, default, help text).
    flag_specs = (
        ("--batch_size", int, 10000,
         "The batch size for reading data."),
        ("--depth", int, 6,
         "Maximum depth of weak learners."),
        ("--l2", float, 1.0,
         "l2 regularization per batch."),
        ("--learning_rate", float, 0.1,
         "Learning rate (shrinkage weight) with which each new tree is added."),
        ("--examples_per_layer", int, 5000,
         "Number of examples to accumulate stats for per layer."),
        ("--num_trees", int, 10,
         "Number of trees to grow before stopping."),
    )

    parser = argparse.ArgumentParser()
    for flag_name, flag_type, flag_default, flag_help in flag_specs:
        parser.add_argument(
            flag_name, type=flag_type, default=flag_default, help=flag_help)

    # Flags not declared above are forwarded untouched to tf.app.run().
    FLAGS, unparsed = parser.parse_known_args()

    # The output directory name encodes the main hyper-parameters.
    FLAGS.output_dir = 'outputs/tf_t{:03d}_d{:02d}_ex{:05d}'.format(
        FLAGS.num_trees, FLAGS.depth, FLAGS.examples_per_layer)

    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

And the result is:

INFO:tensorflow:Using config: {'_master': '', '_num_worker_replicas': 0, '_save_summary_steps': 100, '_tf_config': gpu_options { per_process_gpu_memory_fraction: 1 } , '_keep_checkpoint_every_n_hours': 10000, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12c509438>, '_save_checkpoints_steps': None, '_task_id': 0, '_environment': 'local', '_log_step_count_steps': 100, '_model_dir': 'outputs/tf_t010_d06_ex05000', '_keep_checkpoint_max': 5, '_evaluation_master': '', '_is_chief': True, '_session_config': None, '_save_checkpoints_secs': 30, '_tf_random_seed': None, '_task_type': None, '_num_ps_replicas': 0} INFO:tensorflow:Active Feature Columns: ['addr_state', 'annual_inc', 'delinq_2yrs', 'dti', 'emp_length', 'home_ownership', 'int_rate', 'loan_amnt', 'longest_credit_length', 'purpose', 'revol_util', 'term', 'total_acc', 'verification_status'] WARNING:tensorflow:Casting <dtype: 'int64'> labels to bool. WARNING:tensorflow:Casting <dtype: 'int64'> labels to bool. INFO:tensorflow:Create CheckpointSaverHook. INFO:tensorflow:Restoring parameters from outputs/tf_t010_d06_ex05000/model.ckpt-0 INFO:tensorflow:Error reported to Coordinator: <class 'tensorflow.python.framework.errors_impl.InternalError'>, Unable to get element as bytes. INFO:tensorflow:Saving checkpoints for 0 into outputs/tf_t010_d06_ex05000/model.ckpt. WARNING:tensorflow:Error encountered when serializing resources. Type is unsupported, or the types of the items don't match field type in CollectionDef. '_Resource' object has no attribute 'name' TypeError Traceback (most recent call last) TypeError: expected bytes, float found

During handling of the above exception, another exception occurred:

SystemError Traceback (most recent call last) ..... InternalError: Unable to get element as bytes.

From the code I pasted above, do you have any pointers as to what could be causing this problem?

Many thanks!

nicolov commented 6 years ago

Did you end up solving this?

huongvu16 commented 6 years ago

Hi, yes, I fixed the issue with the categorical columns.

I'm also trying to produce the prediction array for the training set, using the following code:

def main(unused_argv):
    """Train and evaluate the model, then save its training-set predictions.

    The probability at index 1 of each prediction is collected and written
    to 'train_prediction_tf.npy' inside FLAGS.output_dir.
    """
    learn_runner.run(
        experiment_fn=_make_experiment_fn,
        output_dir=FLAGS.output_dir,
        schedule='train_and_evaluate')
    feature_columns, _ = _make_input_fn('train')

    # The training input_fn returned by _make_input_fn() is built with
    # num_epochs=None and shuffle=True, so predict() would iterate over it
    # forever and in random order. Build a single-pass, unshuffled input_fn
    # over the same rows instead.
    with np.load('processed.npz') as data:
        feature_names = data['feature_names']
        X_train = data['X_train']
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x=_matrix_to_dict(X_train, feature_names),
        num_epochs=1,
        shuffle=False)

    estimator = _get_tfbt(FLAGS.output_dir, feature_columns)
    results = estimator.predict(input_fn=predict_input_fn)

    y_predict = np.array([r['probabilities'][1] for r in results])
    np.save(os.path.join(FLAGS.output_dir, 'train_prediction_tf.npy'), y_predict)
if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)

    # One row per command-line flag: (name, type, default, help text).
    flag_specs = (
        ("--batch_size", int, 10000,
         "The batch size for reading data."),
        ("--depth", int, 6,
         "Maximum depth of weak learners."),
        ("--l2", float, 1.0,
         "l2 regularization per batch."),
        ("--learning_rate", float, 0.1,
         "Learning rate (shrinkage weight) with which each new tree is added."),
        ("--examples_per_layer", int, 5000,
         "Number of examples to accumulate stats for per layer."),
        ("--num_trees", int, 10,
         "Number of trees to grow before stopping."),
    )

    parser = argparse.ArgumentParser()
    for flag_name, flag_type, flag_default, flag_help in flag_specs:
        parser.add_argument(
            flag_name, type=flag_type, default=flag_default, help=flag_help)

    # Flags not declared above are forwarded untouched to tf.app.run().
    FLAGS, unparsed = parser.parse_known_args()

    # The output directory name encodes the main hyper-parameters.
    FLAGS.output_dir = 'outputs/tf_t{:03d}_d{:02d}_ex{:05d}'.format(
        FLAGS.num_trees, FLAGS.depth, FLAGS.examples_per_layer)

    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

The kernel runs but seems to take forever (I left it for an hour, and when I came back it was still running, without any new logs). Do you think this is because the X_train file is too big?

nicolov commented 6 years ago

From my tests, it did seem very finicky, so I wouldn't be surprised if it's acting up. Try raising the verbosity options, maybe you'll get some more output.

nicolov / gradient_boosting_tensorflow_xgboost

Trying to replicate with different data set #1