ray-project / ray-legacy

An experimental distributed execution engine

ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s) #451

Closed · a-python-script closed this issue 3 years ago

a-python-script commented 3 years ago

I am trying to get my own environment running with my own network/model under ray.rllib. The setup is minimal; the code is shown below. The environment is a small 10x10 grid world, and the network has two hidden layers with a policy head and a value head. I am using the PPO algorithm with the default configuration. I have followed the tutorials for a custom environment and a custom network/model, yet I always get the error shown below. The error seems to occur when the rollout workers are initialized. Since my setup is relatively simple and I followed the tutorials exactly, I suspect there is a bug here.

Stacktrace:

(pid=4889) 2021-05-30 16:01:10,586      ERROR worker.py:382 -- Exception raised in creation task: The actor died because of an error raised in its creation task, ray::RolloutWorker.__init__() (pid=4889, ip=172.24.140.227)
(pid=4889)   File "python/ray/_raylet.pyx", line 505, in ray._raylet.execute_task
(pid=4889)   File "python/ray/_raylet.pyx", line 449, in ray._raylet.execute_task.function_executor
(pid=4889)   File "/home/lucci/anaconda3/lib/python3.8/site-packages/ray/_private/function_manager.py", line 556, in actor_method_executor
(pid=4889)     return method(__ray_actor, *args, **kwargs)
(pid=4889)   File "/home/lucci/anaconda3/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 516, in __init__
(pid=4889)     self.policy_map, self.preprocessors = self._build_policy_map(
(pid=4889)   File "/home/lucci/anaconda3/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 1158, in _build_policy_map
(pid=4889)     policy_map[name] = cls(obs_space, act_space, merged_conf)
(pid=4889)   File "/home/lucci/anaconda3/lib/python3.8/site-packages/ray/rllib/policy/policy_template.py", line 266, in __init__
(pid=4889)     self._initialize_loss_from_dummy_batch(
(pid=4889)   File "/home/lucci/anaconda3/lib/python3.8/site-packages/ray/rllib/policy/policy.py", line 631, in _initialize_loss_from_dummy_batch
(pid=4889)     postprocessed_batch = self.postprocess_trajectory(self._dummy_batch)
(pid=4889)   File "/home/lucci/anaconda3/lib/python3.8/site-packages/ray/rllib/policy/policy_template.py", line 290, in postprocess_trajectory
(pid=4889)     return postprocess_fn(self, sample_batch,
(pid=4889)   File "/home/lucci/anaconda3/lib/python3.8/site-packages/ray/rllib/evaluation/postprocessing.py", line 127, in compute_gae_for_sample_batch
(pid=4889)     batch = compute_advantages(
(pid=4889)   File "/home/lucci/anaconda3/lib/python3.8/site-packages/ray/rllib/evaluation/postprocessing.py", line 49, in compute_advantages
(pid=4889)     vpred_t = np.concatenate(
(pid=4889)   File "<__array_function__ internals>", line 5, in concatenate
(pid=4889) ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 2 dimension(s) and the array at index 1 has 1 dimension(s)
(pid=4893) The second rollout worker (pid=4893) raises the identical traceback at 2021-05-30 16:01:10,952.

main.py

import gym, ray
from ray.rllib.agents import ppo
from grid import Grid
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils.typing import Dict, TensorType, List, ModelConfigDict
from ray.rllib.models import ModelCatalog
import torch.nn as nn
import torch

# Custom network
class Network(TorchModelV2, nn.Module):
    def __init__(self, 
            obs_space: gym.spaces.Box,
            action_space: gym.spaces.Discrete,
            num_outputs: int,
            model_config: dict,
            name: str):
        TorchModelV2.__init__(self,
            obs_space=obs_space,
            action_space=action_space,
            num_outputs=num_outputs, 
            model_config=model_config,
            name=name
            )
        nn.Module.__init__(self)

        self.lin1 = nn.Linear(in_features=2, out_features=64, bias=True)
        self.lin2 = nn.Linear(in_features=64, out_features=64, bias=True)
        self.policy = nn.Linear(in_features=64, out_features=4, bias=True)
        self.value = nn.Linear(in_features=64, out_features=1, bias=True)
        self.relu = nn.ReLU()

    @override(TorchModelV2)
    def forward(self, input_dict: Dict[str, TensorType],
                state: List[TensorType],
                seq_lens: TensorType) -> (TensorType, List[TensorType]):
        #ray.util.pdb.set_trace()
        inp = input_dict["obs"]
        relu1 = self.relu(self.lin1(inp))
        relu2 = self.relu(self.lin2(relu1))
        logits = self.policy(relu2)
        self.vf = self.value(relu2)

        return logits, state

    @override(TorchModelV2)
    def value_function(self) -> TensorType:
        #ray.util.pdb.set_trace()
        return self.vf

ModelCatalog.register_custom_model("network", Network)
print("Initialzing ray")
ray.init(ignore_reinit_error=True)

trainer = ppo.PPOTrainer(env=Grid, config={
    "framework": "torch",
    "model": {
        "custom_model": "network",
        "custom_model_config": {
        }},
    "env_config": {}})

import pprint
pp = pprint.PrettyPrinter(indent=4)
while True:
    result = trainer.train()
    pp.pprint(result)

grid.py

import gym
from gym.spaces import Discrete
from gym.spaces import Box
import numpy as np
import torch.nn as nn

class Grid(gym.Env):
    def __init__(self, random_start=True):
        self.action_space = Discrete(4)
        self.observation_space = Box(low=0, high=9, shape=(2,), dtype=np.float32)
        self.map = ["x...x.....",
                    ".x......x.",
                    "...x..x...",
                    "....x...xx",
                    "...x...x..",
                    ".x........",
                    "...x...x..",
                    ".x..x....x",
                    "..x..x.E..",
                    ".........x"]

        # Define start point(s)
        if not random_start:
            self.start = [(1, 0)] 
        else:
            self.start = []
            for y in range(0, 10):
                for x in range(0, 10):
                    if self.map[y][x] == ".":
                        self.start.append((x, y))
        self.s = self.reset()

        # The rewards are chosen according to the FrozenLake environment
        self.reward = {}
        self.reward["."] = 0.0
        #self.reward["x"] = -2.0
        self.reward["x"] = 0.0
        self.reward["E"] = 1.0

    def step(self, action): 

        # Get current x and y value
        x = self.s[0]
        y = self.s[1]

        # Calculate next x and y value (nx, ny)
        if action == 0: # up
            nx = x
            ny = y - 1 if y > 0 else 9
        elif action == 1: # down
            nx = x
            ny = y + 1 if y < 9 else 0
        elif action == 2: # left
            nx = x - 1 if x > 0 else 9    
            ny = y
        else: # action == 3 # right
            nx = x + 1 if x < 9 else 0
            ny = y

        # Set new state
        self.s = np.array([nx, ny]).astype(np.float32)

        # Check if done
        done = True if self.map[int(ny)][int(nx)] in ["x", "E"] else False

        # Get reward
        reward = self.reward[self.map[int(ny)][int(nx)]]
        info = {}

        # Return state, reward, done, info
        return self.s, reward, done, info

    def reset(self):
        idx = np.random.randint(0, len(self.start))
        self.s = np.array(self.start[idx]).astype(np.float32)
        return self.s

    def render(self, mode="human"):
        pass
a-python-script commented 3 years ago

I found the problem. It was a small mistake on my part. The documentation says that value_function should return a tensor of shape [BATCH]. However, my code shown above returns a tensor of shape [BATCH, 1]. That's where the NumPy error came from.
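For reference, a minimal sketch of the fix against the Network class above: drop the trailing dimension before returning the value estimate (equivalently, the squeeze could be applied when self.vf is set in forward()).

    @override(TorchModelV2)
    def value_function(self) -> TensorType:
        # The ModelV2 API expects a value tensor of shape [BATCH]; the value
        # head produces [BATCH, 1], so drop the trailing dimension here.
        return self.vf.squeeze(-1)

With this change the value predictions and the bootstrap value have the same number of dimensions, and the np.concatenate call in compute_advantages no longer fails.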