
Tensorforce: a TensorFlow library for applied reinforcement learning
Apache License 2.0

Issues with multiple continuous actions #51

Closed. JannesKlaas closed this issue 7 years ago.

JannesKlaas commented 7 years ago

Hi, first of all, thanks for the hard work that is going into this project. You are saving me a ton of work. Second, I encountered some strange behavior when trying to define an agent with multiple continuous actions. All code below was run in a Jupyter notebook with Anaconda and Python 3.5:

#Configuration, adapted from config in readme
config = Configuration(
    batch_size=100,
    states=dict(shape=(4,), type='float'),
    actions=dict(opt_a=dict(continuous=True, min_value=0, max_value=2),
                 opt_b=dict(continuous=True, min_value=0, max_value=2)),
    network=layered_network_builder([dict(type='dense', size=50), dict(type='dense', size=50)])
)

# Create a TRPO agent
agent = TRPOAgent(config=config)

This code crashes with the trace:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-70-b10cf4edc1d7> in <module>()
      1 # Create a VPGA agent
----> 2 agent = TRPOAgent(config=config)

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/agents/batch_agent.py in __init__(self, config)
     48     def __init__(self, config):
     49         config.default(BatchAgent.default_config)
---> 50         super(BatchAgent, self).__init__(config)
     51         self.batch_size = config.batch_size
     52         self.batch = None

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/agents/agent.py in __init__(self, config)
    141         self.actions_config = config.actions
    142 
--> 143         self.model = self.__class__.model(config)
    144 
    145         self.episode = 0

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/models/trpo_model.py in __init__(self, config)
     52     def __init__(self, config):
     53         config.default(TRPOModel.default_config)
---> 54         super(TRPOModel, self).__init__(config)
     55 
     56         self.override_line_search = config.override_line_search

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/models/policy_gradient_model.py in __init__(self, config)
     81             self.baseline = Baseline.from_config(config=config.baseline)
     82 
---> 83         super(PolicyGradientModel, self).__init__(config)
     84 
     85         # advantage estimation

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/models/model.py in __init__(self, config)
    118                 scope = scope_context.__enter__()
    119 
--> 120             self.create_tf_operations(config)
    121 
    122             if config.distributed:

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/models/trpo_model.py in create_tf_operations(self, config)
    117 
    118             gradients = tf.gradients(fixed_kl_divergence, variables)
--> 119             gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]
    120 
    121             self.flat_variable_helper = FlatVarHelper(variables)

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/models/trpo_model.py in <listcomp>(.0)
    117 
    118             gradients = tf.gradients(fixed_kl_divergence, variables)
--> 119             gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]
    120 
    121             self.flat_variable_helper = FlatVarHelper(variables)

/Users/jannes/anaconda/lib/python3.5/site-packages/tensorflow/python/ops/math_ops.py in r_binary_op_wrapper(y, x)
    895   def r_binary_op_wrapper(y, x):
    896     with ops.name_scope(None, op_name, [x, y]) as name:
--> 897       x = ops.convert_to_tensor(x, dtype=y.dtype.base_dtype, name="x")
    898       return func(x, y, name=name)
    899 

/Users/jannes/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/ops.py in convert_to_tensor(value, dtype, name, preferred_dtype)
    649       name=name,
    650       preferred_dtype=preferred_dtype,
--> 651       as_ref=False)
    652 
    653 

/Users/jannes/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/ops.py in internal_convert_to_tensor(value, dtype, name, as_ref, preferred_dtype)
    714 
    715         if ret is None:
--> 716           ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
    717 
    718         if ret is NotImplemented:

/Users/jannes/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py in _constant_tensor_conversion_function(v, dtype, name, as_ref)
    174                                          as_ref=False):
    175   _ = as_ref
--> 176   return constant(v, dtype=dtype, name=name)
    177 
    178 

/Users/jannes/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/constant_op.py in constant(value, dtype, shape, name, verify_shape)
    163   tensor_value = attr_value_pb2.AttrValue()
    164   tensor_value.tensor.CopyFrom(
--> 165       tensor_util.make_tensor_proto(value, dtype=dtype, shape=shape, verify_shape=verify_shape))
    166   dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
    167   const_tensor = g.create_op(

/Users/jannes/anaconda/lib/python3.5/site-packages/tensorflow/python/framework/tensor_util.py in make_tensor_proto(values, dtype, shape, verify_shape)
    358   else:
    359     if values is None:
--> 360       raise ValueError("None values not supported.")
    361     # if dtype is provided, forces numpy array to be the type
    362     # provided if possible.

ValueError: None values not supported.

I tried different agents and encountered another strange behavior:

# Create a VPG agent
agent = VPGAgent(config=config)
state = np.array([1,2,3,4])
agent.act(state)

Crashes with:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-73-565d0bd87882> in <module>()
----> 1 agent.act(state)

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/agents/agent.py in act(self, state, deterministic)
    194 
    195         # model action
--> 196         self.current_action, self.next_internal = self.model.get_action(state=self.current_state, internal=self.current_internal, deterministic=deterministic)
    197 
    198         # exploration

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/models/model.py in get_action(self, state, internal, deterministic)
    219         fetches.update({n: internal_output for n, internal_output in enumerate(self.internal_outputs)})
    220 
--> 221         feed_dict = {state_input: (state[name],) for name, state_input in self.state.items()}
    222         feed_dict.update({internal_input: (internal[n],) for n, internal_input in enumerate(self.internal_inputs)})
    223         feed_dict[self.deterministic] = deterministic

/Users/jannes/AnacondaProjects/tensorforce/tensorforce/models/model.py in <dictcomp>(.0)
    219         fetches.update({n: internal_output for n, internal_output in enumerate(self.internal_outputs)})
    220 
--> 221         feed_dict = {state_input: (state[name],) for name, state_input in self.state.items()}
    222         feed_dict.update({internal_input: (internal[n],) for n, internal_input in enumerate(self.internal_inputs)})
    223         feed_dict[self.deterministic] = deterministic

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

But when I redefine config, that is, I run

#Configuration, adapted from config in readme
config = Configuration(
    batch_size=100,
    states=dict(shape=(4,), type='float'),
    actions=dict(opt_a=dict(continuous=True, min_value=0, max_value=2),
                 opt_b=dict(continuous=True, min_value=0, max_value=2)),
    network=layered_network_builder([dict(type='dense', size=50), dict(type='dense', size=50)])
)

again, it does not crash, but it occasionally outputs negative values for actions even though min_value = 0, for example {'opt_a': 0.28892395, 'opt_b': -0.10657883}. The PPO agent shows the same behavior as the VPG agent.
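
Until bounds are enforced by the library, a possible stopgap (a sketch only, not Tensorforce functionality; clip_actions is a hypothetical helper) is to clip the returned action dict to the declared bounds before passing it to the environment:

import numpy as np

# Hypothetical helper: clip each continuous action to the (min, max)
# bounds that were declared in the Configuration.
def clip_actions(actions, bounds):
    return {name: float(np.clip(value, *bounds[name])) for name, value in actions.items()}

bounds = {'opt_a': (0.0, 2.0), 'opt_b': (0.0, 2.0)}
action = agent.act(state)               # e.g. {'opt_a': 0.289, 'opt_b': -0.107}
action = clip_actions(action, bounds)   # out-of-range values are clipped into [0, 2]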

I have tried this with many slightly different configurations; it seems to be a consistent issue. Please let me know if you need any more code / info / data to reproduce it. Kindly, Jannes

AlexKuhnle commented 7 years ago

Hey,

First, thanks for reporting the issue. The first problem you encounter is most likely due to a bug in our current implementation of TRPO with multiple actions, which should hopefully be fixed in the next 1-2 days. I'll let you know.

I'm not sure about the second exception you get -- what are you redefining afterwards such that it works? Anyway, the min_value and max_value behavior you mention afterwards is currently a general limitation. Although the feature, which I think makes sense in general, is already accepted by the action interface, it is not yet supported by the action distributions etc., so it is essentially just ignored. This is because we so far only provide a Gaussian as the continuous distribution, which does not naturally define min/max values. Does it nevertheless work for you, ignoring the out-of-bound values?
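
For illustration, one common way to obtain bounded continuous actions from an unbounded Gaussian sample is to squash it through a sigmoid and rescale to [min_value, max_value]; this is only a sketch of the general technique, not something Tensorforce currently does:

import numpy as np

# Sketch: map an unbounded Gaussian sample into [min_value, max_value]
# by squashing with a sigmoid and rescaling.
def squash(sample, min_value, max_value):
    return min_value + (max_value - min_value) / (1.0 + np.exp(-sample))

raw = np.random.normal(loc=0.0, scale=1.0)           # unbounded sample
bounded = squash(raw, min_value=0.0, max_value=2.0)  # always within (0, 2)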

JannesKlaas commented 7 years ago

Regarding the second issue, I looked into it further: Configuration objects are modified when they are used to create an agent, which makes them unusable for creating the next agent. Here is how the issue shows itself:

#Define config
config = Configuration(
    batch_size=100,
    states=dict(shape=(10,), type='float'),
    actions=dict(continuous=False, num_actions=2),
    network=layered_network_builder([dict(type='dense', size=50), dict(type='dense', size=50)])
)

#Define first agent (works)
agent = TRPOAgent(config=config)

#Define second agent (also works)
agent1 = TRPOAgent(config=config)

#Define state
state = np.array([1,2,3,4,5,6,7,8,9,10])

#First agent acts (works)
agent.act(state)

#Second agent acts (crashes)
agent1.act(state)

I looked into the agent code and I think I found the issue. The code creating the agent modifies the configuration passed along. Before creating the agent, print(config) prints:

{actions={continuous=False, num_actions=2}, states={type=float, shape=(10,)}, batch_size=100, network=<function layered_network_builder.<locals>.network_builder at 0x111b8d598>}

after

agent = TRPOAgent(config=config)

print(config) outputs:

{device=None, cg_iterations=20, optimizer=None, cg_damping=0.001, log_level=info, network=<function layered_network_builder.<locals>.network_builder at 0x111b8d598>, global_model=False, exploration=None, normalize_advantage=False, max_kl_divergence=0.001, preprocessing=None, discount=0.97, states={state={type=float, shape=(10,)}}, session=None, distributed=False, line_search_steps=20, batch_size=100, actions={action={continuous=False, num_actions=2}}, tf_summary=None, learning_rate=0.0001, generalized_advantage_estimation=False, tf_saver=False, baseline=None, gae_lambda=0.97, override_line_search=False}

At several points, the agent classes directly modify the config that is passed in. This leads to problems when the config is reused later. A better way would probably be to create a copy of the config before modifying it.
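
In the meantime, a user-side workaround (a sketch, assuming the Configuration object deep-copies cleanly) is to hand each agent its own copy:

import copy

# Workaround sketch: give each agent a deep copy so the original
# Configuration object is never mutated and can be reused.
agent = TRPOAgent(config=copy.deepcopy(config))
agent1 = TRPOAgent(config=copy.deepcopy(config))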

AlexKuhnle commented 7 years ago

First, the problem with TRPO should be fixed now. Second, you're absolutely right: this is unexpected behavior, and what you suggest seems like a good solution. I will open an issue to track the config and the min/max value problems, and will close this one.