Error with training - Githubissues

It seems like theres an error when I try to use the module with a custom env, that occurs after the first iter:

Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
Traceback (most recent call last):
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
    return fn(*args)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
    target_list, run_metadata)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
    run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [1024,2] vs. [1024]
         [[{{node gradients/loss/sub_8_grad/BroadcastGradientArgs}}]]

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "train.py", line 184, in <module>
    cli()
  File "train.py", line 179, in cli
    main(args)
  File "train.py", line 118, in main
    model.learn(total_timesteps=int(1e9), callback=[eval_callback], reset_num_timesteps = False, tb_log_name="tb")
  File "/home/selfplay/.local/lib/python3.6/site-packages/stable_baselines/ppo1/pposgd_simple.py", line 297, in learn
    cur_lrmult, sess=self.sess)
  File "/home/selfplay/.local/lib/python3.6/site-packages/stable_baselines/common/tf_util.py", line 330, in __call__
    results = sess.run(self.outputs_update, feed_dict=feed_dict, **kwargs)[:-1]
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 956, in run
    run_metadata_ptr)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1180, in _run
    feed_dict_tensor, options, run_metadata)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
    run_metadata)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
    raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [1024,2] vs. [1024]
         [[node gradients/loss/sub_8_grad/BroadcastGradientArgs (defined at /home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]

Original stack trace for 'gradients/loss/sub_8_grad/BroadcastGradientArgs':
  File "train.py", line 184, in <module>
    cli()
  File "train.py", line 179, in cli
    main(args)
  File "train.py", line 82, in main
    model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params)
  File "/home/selfplay/.local/lib/python3.6/site-packages/stable_baselines/common/base_class.py", line 947, in load
    model.setup_model()
  File "/home/selfplay/.local/lib/python3.6/site-packages/stable_baselines/ppo1/pposgd_simple.py", line 193, in setup_model
    [self.summary, tf_util.flatgrad(total_loss, self.params)] + losses)
  File "/home/selfplay/.local/lib/python3.6/site-packages/stable_baselines/common/tf_util.py", line 381, in flatgrad
    grads = tf.gradients(loss, var_list)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/gradients_impl.py", line 158, in gradients
    unconnected_gradients)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/gradients_util.py", line 679, in _GradientsHelper
    lambda: grad_fn(op, *out_grads))
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/gradients_util.py", line 350, in _MaybeCompile
    return grad_fn()  # Exit early
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/gradients_util.py", line 679, in <lambda>
    lambda: grad_fn(op, *out_grads))
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/math_grad.py", line 1144, in _SubGrad
    SmartBroadcastGradientArgs(x, y, grad))
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/math_grad.py", line 99, in SmartBroadcastGradientArgs
    rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_array_ops.py", line 830, in broadcast_gradient_args
    "BroadcastGradientArgs", s0=s0, s1=s1, name=name)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

...which was originally created as op 'loss/sub_8', defined at:
  File "train.py", line 184, in <module>
    cli()
[elided 2 identical lines from previous traceback]
  File "/home/selfplay/.local/lib/python3.6/site-packages/stable_baselines/common/base_class.py", line 947, in load
    model.setup_model()
  File "/home/selfplay/.local/lib/python3.6/site-packages/stable_baselines/ppo1/pposgd_simple.py", line 147, in setup_model
    vf_loss = tf.reduce_mean(tf.square(self.policy_pi.value_flat - ret))
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/math_ops.py", line 899, in binary_op_wrapper
    return func(x, y, name=name)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_math_ops.py", line 11086, in sub
    "Sub", x=x, y=y, name=name)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/home/selfplay/.local/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()

How could this be caused? I defined my action space as a discrete value with 11 possible, and observation as 2 values with 100 discrete values. I repurpused the Tic Tac Toe model, with a few changes below:

import tensorflow as tf
tf.get_logger().setLevel('INFO')
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from tensorflow.keras.layers import BatchNormalization, Activation, Flatten, Conv2D, Add, Dense, Dropout

from stable_baselines.common.policies import ActorCriticPolicy
from stable_baselines.common.distributions import CategoricalProbabilityDistributionType, CategoricalProbabilityDistribution

class CustomPolicy(ActorCriticPolicy):
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs):
        super(CustomPolicy, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=True)

        with tf.variable_scope("model", reuse=reuse):

            self._policy = policy_head(self.processed_obs)
            self._value_fn, self.q_value = value_head(self.processed_obs)

            self._proba_distribution  = CategoricalProbabilityDistribution(self._policy)

        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=True):
        if deterministic:
            action, value, neglogp = self.sess.run([self.deterministic_action, self.value_flat, self.neglogp],
                                                   {self.obs_ph: obs})
        else:
            action, value, neglogp = self.sess.run([self.action, self.value_flat, self.neglogp],
                                                   {self.obs_ph: obs})
        return action, value[0], self.initial_state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, {self.obs_ph: obs})

def value_head(y):
    vf = dense(y, 2, batch_norm = False, activation = 'tanh', name='vf')
    q = dense(y, 11, batch_norm = False, activation = 'tanh', name='q')
    return vf, q

def policy_head(y):
    policy = dense(y, 11, batch_norm = False, activation = None, name='pi')
    return policy

def resnet_extractor(y, **kwargs):

    y = convolutional(y, 32, 3)
    y = residual(y, 32, 3)

    return y

def convolutional(y, filters, kernel_size):
    y = Conv2D(filters, kernel_size=kernel_size, strides=1, padding='same')(y)
    y = BatchNormalization(momentum = 0.9)(y)
    y = Activation('relu')(y)
    return y

def residual(y, filters, kernel_size):
    shortcut = y

    y = Conv2D(filters, kernel_size=kernel_size, strides=1, padding='same')(y)
    y = BatchNormalization(momentum = 0.9)(y)
    y = Activation('relu')(y)

    y = Conv2D(filters, kernel_size=kernel_size, strides=1, padding='same')(y)
    y = BatchNormalization(momentum = 0.9)(y)
    y = Add()([shortcut, y])
    y = Activation('relu')(y)

    return y

def dense(y, filters, batch_norm = True, activation = 'relu', name = None):

    if batch_norm or activation:
        y = Dense(filters)(y)
    else:
        y = Dense(filters, name = name)(y)

    if batch_norm:
        if activation:
            y = BatchNormalization(momentum = 0.9)(y)
        else:
            y = BatchNormalization(momentum = 0.9, name = name)(y)

    if activation:
        y = Activation(activation, name = name)(y)

    return y

davidADSP / SIMPLE

Error with training #12