tensorflow / agents

TF-Agents: A reliable, scalable and easy to use TensorFlow library for Contextual Bandits and Reinforcement Learning.

Help with Observation Masking #730

Open masterkey2000 opened 2 years ago

masterkey2000 commented 2 years ago

Hello,

I am currently trying to build a DQN with action masking, so that only the legal actions can be selected. To understand the whole thing, I built a small test setup. I also want to use the observation_and_action_constraint_splitter, but I haven't applied it yet because I already get an error message before that point. Maybe someone can help me.

As a starting point I took the example from @ormandi here: https://github.com/tensorflow/agents/issues/397

The agent is a DQN with a QRnnNetwork as the Q-network. The complete code is this:

#!pip install "gym>=0.21.0"
#!pip install tf-agents

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf
import abc

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

from tf_agents.agents.dqn import dqn_agent
from tf_agents.networks import q_rnn_network
from tf_agents.utils import common

class CardGameEnvWithMask(py_environment.PyEnvironment):

  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = {
        'observation': array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, name='observation'),
        'mask': array_spec.ArraySpec(
            shape=(1,), dtype=np.bool_, name='mask')}
    self._state = 0
    self._episode_ended = False

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _obs(self, obs, mask=True):
    return {'observation': obs, 'mask': np.array([mask], dtype=np.bool_)}

  def _reset(self):
    self._state = 0
    self._episode_ended = False
    return ts.restart(self._obs(np.array([self._state], dtype=np.int32)))

  def _step(self, action):

    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      return self.reset()

    # Make sure episodes don't go on forever.
    if action == 1:
      self._episode_ended = True
    elif action == 0:
      new_card = np.random.randint(1, 11)
      self._state += new_card
    else:
      raise ValueError('`action` should be 0 or 1.')

    if self._episode_ended or self._state >= 21:
      reward = self._state - 21 if self._state <= 21 else -21
      return ts.termination(
          self._obs(np.array([self._state], dtype=np.int32), False), reward)
    else:
      return ts.transition(
          self._obs(np.array([self._state], dtype=np.int32), True),
          reward=0.0, discount=1.0)

environment = CardGameEnvWithMask()
utils.validate_py_environment(environment, episodes=5)

get_new_card_action = np.array(0, dtype=np.int32)
end_round_action = np.array(1, dtype=np.int32)

environment = CardGameEnvWithMask()
time_step = environment.reset()
print(time_step)
cumulative_reward = time_step.reward

for _ in range(3):
  time_step = environment.step(get_new_card_action)
  print(time_step)
  cumulative_reward += time_step.reward

time_step = environment.step(end_round_action)
print(time_step)
cumulative_reward += time_step.reward
print('Final Reward = ', cumulative_reward)

environment.action_spec()

environment.observation_spec()

environment.step(1)

learning_rate = 1e-3

# network configuration
input_fc_layer_params = (40,)
lstm_size = (20,)
output_fc_layer_params = (20,)

# as we are using a dictionary observation in our environment, we create preprocessing layers
preprocessing_layers = {
    'mask': tf.keras.layers.Flatten(),
    'observation': tf.keras.layers.Flatten()
    }
preprocessing_combiner = tf.keras.layers.Concatenate(axis=-1)

# create a QRnnNetwork
q_net = q_rnn_network.QRnnNetwork(
    #train_env.time_step_spec(),
    environment.observation_spec(),
    environment.action_spec(),
    preprocessing_layers=preprocessing_layers,
    preprocessing_combiner=preprocessing_combiner,
    input_fc_layer_params=input_fc_layer_params,
    lstm_size=lstm_size,
    output_fc_layer_params=output_fc_layer_params
)    

# create the optimizer
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

# create a global step counter
global_step = tf.compat.v1.train.get_or_create_global_step()

# create the agent
agent = dqn_agent.DqnAgent(
    environment.time_step_spec(),
    environment.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    #observation_and_action_constraint_splitter=CardGameEnvWithMask.observation_action_splitter,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=global_step)

agent.initialize()

# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

This is the error I get:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [10], in <cell line: 8>()
      5 global_step = tf.compat.v1.train.get_or_create_global_step()
      7 #create agent
----> 8 agent = dqn_agent.DqnAgent(
      9     environment.time_step_spec(),
     10     environment.action_spec(),
     11     q_network=q_net,
     12     optimizer=optimizer,
     13     #observation_and_action_constraint_splitter=CardGameEnvWithMask.observation_action_splitter,
     14     td_errors_loss_fn=common.element_wise_squared_loss,
     15     train_step_counter=global_step)
     17 agent.initialize()
     19 # (Optional) Optimize by wrapping some of the code in a graph using TF function.

File ~\Anaconda3\envs\dqn\lib\site-packages\gin\config.py:1605, in _make_gin_wrapper.<locals>.gin_wrapper(*args, **kwargs)
   1603 scope_info = " in scope '{}'".format(scope_str) if scope_str else ''
   1604 err_str = err_str.format(name, fn_or_cls, scope_info)
-> 1605 utils.augment_exception_message_and_reraise(e, err_str)

File ~\Anaconda3\envs\dqn\lib\site-packages\gin\utils.py:41, in augment_exception_message_and_reraise(exception, message)
     39 proxy = ExceptionProxy()
     40 ExceptionProxy.__qualname__ = type(exception).__qualname__
---> 41 raise proxy.with_traceback(exception.__traceback__) from None

File ~\Anaconda3\envs\dqn\lib\site-packages\gin\config.py:1582, in _make_gin_wrapper.<locals>.gin_wrapper(*args, **kwargs)
   1579 new_kwargs.update(kwargs)
   1581 try:
-> 1582   return fn(*new_args, **new_kwargs)
   1583 except Exception as e:  # pylint: disable=broad-except
   1584   err_str = ''

File ~\Anaconda3\envs\dqn\lib\site-packages\tf_agents\agents\dqn\dqn_agent.py:235, in DqnAgent.__init__(self, time_step_spec, action_spec, q_network, optimizer, observation_and_action_constraint_splitter, epsilon_greedy, n_step_update, boltzmann_temperature, emit_log_probability, target_q_network, target_update_tau, target_update_period, td_errors_loss_fn, gamma, reward_scale_factor, gradient_clipping, debug_summaries, summarize_grads_and_vars, train_step_counter, name)
    232 if observation_and_action_constraint_splitter:
    233   net_observation_spec, _ = observation_and_action_constraint_splitter(
    234       net_observation_spec)
--> 235 q_network.create_variables(net_observation_spec)
    236 if target_q_network:
    237   target_q_network.create_variables(net_observation_spec)

File ~\Anaconda3\envs\dqn\lib\site-packages\tf_agents\networks\network.py:217, in Network.create_variables(self, input_tensor_spec, **kwargs)
    212 if input_tensor_spec is None:
    213   raise ValueError(
    214       "Unable to create_variables: no input_tensor_spec provided, and "
    215       "Network did not define one.")
--> 217 random_input = tensor_spec.sample_spec_nest(
    218     input_tensor_spec, outer_dims=(1,))
    219 initial_state = self.get_initial_state(batch_size=1)
    220 step_type = tf.fill((1,), time_step.StepType.FIRST)

File ~\Anaconda3\envs\dqn\lib\site-packages\tf_agents\specs\tensor_spec.py:400, in sample_spec_nest(structure, seed, outer_dims, minimum, maximum)
    397   else:
    398     raise TypeError("Spec type not supported: '{}'".format(spec))
--> 400 return tf.nest.map_structure(sample_fn, structure)

File ~\Anaconda3\envs\dqn\lib\site-packages\tensorflow\python\util\nest.py:914, in map_structure(func, *structure, **kwargs)
    910 flat_structure = (flatten(s, expand_composites) for s in structure)
    911 entries = zip(*flat_structure)
    913 return pack_sequence_as(
--> 914     structure[0], [func(*x) for x in entries],
    915     expand_composites=expand_composites)

File ~\Anaconda3\envs\dqn\lib\site-packages\tensorflow\python\util\nest.py:914, in <listcomp>(.0)
    910 flat_structure = (flatten(s, expand_composites) for s in structure)
    911 entries = zip(*flat_structure)
    913 return pack_sequence_as(
--> 914     structure[0], [func(*x) for x in entries],
    915     expand_composites=expand_composites)

File ~\Anaconda3\envs\dqn\lib\site-packages\tf_agents\specs\tensor_spec.py:378, in sample_spec_nest.<locals>.sample_fn(spec)
    374   return tf.as_string(
    375       sample_bounded_spec(
    376           sample_spec, outer_dims=outer_dims, seed=seed_stream()))
    377 else:
--> 378   bounded_spec = BoundedTensorSpec.from_spec(spec)
    380   spec_max = bounded_spec.maximum
    381   if maximum is not None:

File ~\Anaconda3\envs\dqn\lib\site-packages\tensorflow\python\framework\tensor_spec.py:321, in BoundedTensorSpec.from_spec(cls, spec)
    305 """Returns a `TensorSpec` with the same shape and dtype as `spec`.
    306 
    307 If `spec` is a `BoundedTensorSpec`, then the new spec's bounds are set to
   (...)
    318   spec: The `TypeSpec` used to create the new `BoundedTensorSpec`.
    319 """
    320 dtype = dtypes.as_dtype(spec.dtype)
--> 321 minimum = getattr(spec, "minimum", dtype.min)
    322 maximum = getattr(spec, "maximum", dtype.max)
    323 return BoundedTensorSpec(spec.shape, dtype, minimum, maximum, spec.name)

File ~\Anaconda3\envs\dqn\lib\site-packages\tensorflow\python\framework\dtypes.py:97, in DType.min(self)
     89 """Returns the minimum representable value in this data type.
     90 
     91 Raises:
     92   TypeError: if this is a non-numeric, unordered, or quantized type.
     93 
     94 """
     95 if (self.is_quantized or
     96     self.base_dtype in (bool, string, complex64, complex128)):
---> 97   raise TypeError(f"Cannot find minimum value of {self} with "
     98                   f"{'quantized type' if self.is_quantized else 'type'} "
     99                   f"{self.base_dtype}.")
    101 # there is no simple way to get the min value of a dtype, we have to check
    102 # float and int types separately
    103 try:

TypeError: Cannot find minimum value of <dtype: 'bool'> with type <dtype: 'bool'>.
  In call to configurable 'DqnAgent' (<class 'tf_agents.agents.dqn.dqn_agent.DqnAgent'>)

Can anyone help me, please? Sample code with action masking would also help me understand all of this better.

Thank you!

sguada commented 2 years ago

If you don't pass the observation_and_action_constraint_splitter to the DQN agent, then the observation spec contains both the observation and the mask, and the mask is bool, which cannot be sampled. I'm also not sure what the intended use of the mask is when you only have 2 actions.
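
If you do want to use the mask, the intended wiring looks roughly like this (untested sketch, reusing the names from your code above): the splitter has to return the observation first and the mask second, and the Q-network is built over only the 'observation' part of the spec.

def observation_and_action_constraint_splitter(obs):
  # Observation first, mask second, matching how dqn_agent.py unpacks it.
  return obs['observation'], obs['mask']

# Build the Q-network over the 'observation' part of the spec only, since
# that is all the agent passes to the network once the splitter is set.
q_net = q_rnn_network.QRnnNetwork(
    environment.observation_spec()['observation'],
    environment.action_spec(),
    input_fc_layer_params=input_fc_layer_params,
    lstm_size=lstm_size,
    output_fc_layer_params=output_fc_layer_params)

agent = dqn_agent.DqnAgent(
    environment.time_step_spec(),
    environment.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    observation_and_action_constraint_splitter=observation_and_action_constraint_splitter,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=global_step)

Also note that for the mask to actually constrain the actions, it would need one entry per action, i.e. shape (2,) here rather than (1,).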

sguada commented 2 years ago

Also, can you make sure you have the latest version, since sample_spec_nest can now handle tf.bool: https://github.com/tensorflow/agents/blob/master/tf_agents/specs/tensor_spec.py#L380
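
A quick way to check the installed version from Python:

import tf_agents
print(tf_agents.__version__)

and upgrade with pip install --upgrade tf-agents if it is older than the linked commit.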

masterkey2000 commented 2 years ago

> If you don't pass the observation_and_action_constraint_splitter to the DQN agent, then the observation spec contains both the observation and the mask, and the mask is bool, which cannot be sampled. I'm also not sure what the intended use of the mask is when you only have 2 actions.

Even with the splitter implemented as:

def observation_action_splitter(obs):
    return obs['mask'], obs['observation'] 

and passed to the agent with:

observation_and_action_constraint_splitter=observation_action_splitter

the error message persists.

I know that masking makes no sense in this example; it is just meant to be a simple example for reproducing this issue.
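
Side note: looking at the traceback again, dqn_agent.py unpacks the splitter result as net_observation_spec, _ = observation_and_action_constraint_splitter(net_observation_spec), so with my splitter above the bool mask spec ends up as the network observation and gets sampled, which would trigger the same error. Maybe the return order has to be swapped (untested):

def observation_action_splitter(obs):
    # Observation first, mask second, matching the unpacking in dqn_agent.py.
    return obs['observation'], obs['mask']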

masterkey2000 commented 2 years ago

> Also, can you make sure you have the latest version, since sample_spec_nest can now handle tf.bool: https://github.com/tensorflow/agents/blob/master/tf_agents/specs/tensor_spec.py#L380

The tf-agents version I use is 0.12.0.