Wrong output of forward for custom policy

[ ] I have marked all applicable categories:
- [x] exception-raising bug
- [x] RL algorithm bug
- [ ] documentation request (i.e. "X is missing from the documentation.")
- [ ] new feature request
- [ ] design request (i.e. "X should be changed to Y.")
[ ] I have visited the source website
[ ] I have searched through the issue tracker for duplicates

[ ] I have mentioned version numbers, operating system and environment, where applicable:

import tianshou, gymnasium as gym, torch, numpy, sys
print(tianshou.__version__, gym.__version__, torch.__version__, numpy.__version__, sys.version, sys.platform)

import gymnasium as gym
from tianshou.data.batch import Batch
import torch
import numpy as np
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import tianshou as ts
from copy import deepcopy
from tianshou.env import DummyVectorEnv
from torch.optim.lr_scheduler import LambdaLR
import torch.nn.functional as F
import os
import time
import json
import math
from tqdm import tqdm
from env import SDN_Env
from network import conv_mlp_net

cloud_num = 1
edge_num = 1
expn = 'exp1'
config = 'multi-edge'
lr, epoch, batch_size = 1e-6, 1, 1024 * 4
train_num, test_num = 64, 1024
gamma, lr_decay = 0.9, None
buffer_size = 100000
eps_train, eps_test = 0.1, 0.00
step_per_epoch, episode_per_collect = 100 * train_num * 700, train_num
writer = SummaryWriter('tensor-board-log/ppo')  # tensorboard is also supported!
logger = ts.utils.TensorboardLogger(writer)
is_gpu_default = torch.cuda.is_available()  # Check if GPU is available
# ppo
gae_lambda, max_grad_norm = 0.95, 0.5
vf_coef, ent_coef = 0.5, 0.0
rew_norm, action_scaling = False, False
bound_action_method = "clip"
eps_clip, value_clip = 0.2, False
repeat_per_collect = 2
dual_clip, norm_adv = None, 0.0
recompute_adv = 0

INPUT_CH = 67
FEATURE_CH = 512
MLP_CH = 1024

class sdn_net(nn.Module):
  def __init__(self, mode='actor', is_gpu=is_gpu_default):
      super().__init__()
      self.is_gpu = is_gpu
      self.mode = mode

      if self.mode == 'actor':
          self.network = conv_mlp_net(conv_in=INPUT_CH, conv_ch=FEATURE_CH, mlp_in=(edge_num+cloud_num)*FEATURE_CH,\
                                  mlp_ch=MLP_CH, out_ch=edge_num+cloud_num, block_num=3)
      else:
          self.network = conv_mlp_net(conv_in=INPUT_CH, conv_ch=FEATURE_CH, mlp_in=(edge_num+cloud_num)*FEATURE_CH,\
                                  mlp_ch=MLP_CH, out_ch=cloud_num, block_num=3)

  def load_model(self, filename):
      map_location = lambda storage, loc: storage
      self.load_state_dict(torch.load(filename, map_location=map_location))
      print('load model!')

  def save_model(self, filename):
      torch.save(self.state_dict(), filename)
      # print('save model!')

  def forward(self, obs, state=None, info={}):
      state = obs.clone().detach().requires_grad_(True).to(torch.float32)
      if self.is_gpu:
          state = state.cuda()

      logits = self.network(state)
      return Batch(logits=logits, state=state),None

class Actor(nn.Module):
  def __init__(self, is_gpu=is_gpu_default):
      super().__init__()
      self.is_gpu = is_gpu
      self.net = sdn_net(mode='actor')

  def load_model(self, filename):
      map_location = lambda storage, loc: storage
      self.load_state_dict(torch.load(filename, map_location=map_location))
      print('load model!')

  def save_model(self, filename):
      torch.save(self.state_dict(), filename)
      # print('save model!')

  def forward(self, obs, state=None, info={}):
      result, _ = self.net(obs)
      logits, state = result[0]['logits'], result[0]['state']
      # Ensure logits is a PyTorch tensor
      logits = logits.to(torch.float32)
      logits = F.softmax(logits, dim=-1, dtype=torch.float32)

      return Batch(logits=logits, state=state),None
class Critic(nn.Module):
  def __init__(self, is_gpu=is_gpu_default):
      super().__init__()

      self.is_gpu = is_gpu

      self.net = sdn_net(mode='critic')

  def load_model(self, filename):
      map_location = lambda storage, loc: storage
      self.load_state_dict(torch.load(filename, map_location=map_location))
      print('load model!')

  def save_model(self, filename):
      torch.save(self.state_dict(), filename)
      # print('save model!')

  def forward(self, obs, state=None, info={}):
      result, _ = self.net(obs)
      logits, state = result[0]['logits'], result[0]['state']
      # Ensure logits is a PyTorch tensor
      logits = logits.to(torch.float32)
      return Batch(logits=logits, state=state), None

actor = Actor(is_gpu=is_gpu_default)
critic = Critic(is_gpu=is_gpu_default)
actor_critic = ts.utils.net.common.ActorCritic(actor, critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=lr)

dist = torch.distributions.Categorical

action_space = gym.spaces.Discrete(edge_num+cloud_num)

if lr_decay:
  lr_scheduler = LambdaLR(
      optim, lr_lambda=lambda epoch: lr_decay ** (epoch - 1)
  )
else:
  lr_scheduler = None

policy = ts.policy.PPOPolicy(actor, critic, optim, dist,
                           discount_factor=gamma, max_grad_norm=max_grad_norm,
                           eps_clip=eps_clip, vf_coef=vf_coef,
                           ent_coef=ent_coef, reward_normalization=rew_norm,
                           advantage_normalization=norm_adv, recompute_advantage=recompute_adv,
                           dual_clip=dual_clip, value_clip=value_clip,
                           gae_lambda=gae_lambda, action_space=action_space,
                           lr_scheduler=lr_scheduler)

for i in range(101):
  try:
      os.mkdir('save/pth-e%d/' % (edge_num) + 'cloud%d/' % (cloud_num) + expn + '/w%03d' % (i))
  except:
      pass

for wi in range(100, 0 - 1, -2):

  if wi == 100:
      epoch_a = epoch * 10
  else:
      epoch_a = epoch

  train_envs = DummyVectorEnv(
      [lambda: SDN_Env(conf_name=config, w=wi / 100.0, fc=4e9, fe=2e9, edge_num=edge_num, cloud_num=cloud_num) for _ in range(train_num)])
  test_envs = DummyVectorEnv(
      [lambda: SDN_Env(conf_name=config, w=wi / 100.0, fc=4e9, fe=2e9, edge_num=edge_num, cloud_num=cloud_num) for _ in range(test_num)])
  buffer = ts.data.VectorReplayBuffer(buffer_size, train_num)
  def preprocess_fn(**kwargs):
      obs = kwargs.get("obs", np.array([[]]))
      reward = kwargs.get("reward", 0)
      done = kwargs.get("done", {})
      truncated = kwargs.get("truncated", {})
      info = kwargs.get("info", {})
      env_id = kwargs.get("env_id", "default_value")

      # Convert obs to a PyTorch tensor
      obs = torch.tensor(obs, dtype=torch.float32)
      reward = torch.tensor(reward, dtype=torch.float32)

      # Make sure to include 'dim' attribute in the Batch object
      batch = Batch(
          obs=obs,
          reward=reward,
          done=done,
          info=info,
          truncated=truncated,
          env_id=env_id,
      )
      print(batch)

      # Assuming a normal environment step
      return batch

  # Initialize Collector with preprocess_fn
  train_collector = ts.data.Collector(
      policy=policy,
      env=train_envs,
      buffer=buffer,
      preprocess_fn=preprocess_fn,
  )
  print(train_collector)

  test_collector = ts.data.Collector(policy, test_envs)
  train_collector.collect(n_episode=train_num)

  def save_best_fn(policy):
      pass

  def test_fn(epoch, env_step, cloud_num):
      policy.actor.save_model('save/pth-e%d/' % (edge_num) + 'cloud%d/' % (cloud_num) + expn + '/w%03d/ep%02d-actor.pth' % (wi, epoch))
      policy.critic.save_model('save/pth-e%d/' % (edge_num) + 'cloud%d/' % (cloud_num) + expn + '/w%03d/ep%02d-critic.pth' % (wi, epoch))

  def train_fn(epoch, env_step):
      pass

  def reward_metric(rews):
      return rews

  result = ts.trainer.onpolicy_trainer(
      policy=policy,
      train_collector=train_collector,
      test_collector=test_collector,
      max_epoch=epoch_a,
      step_per_epoch=step_per_epoch,
      repeat_per_collect=repeat_per_collect,
      episode_per_test=test_num,
      batch_size=batch_size,
      step_per_collect=None,
      episode_per_collect=episode_per_collect,
      train_fn=train_fn,
      test_fn=test_fn,
      save_best_fn=save_best_fn,
      stop_fn=None,  # You may need to define your own stop function if needed
      save_checkpoint_fn=save_best_fn,
      reward_metric=reward_metric,
      logger=logger,
  )

I have ensure the logic but the result always:

Traceback (most recent call last): File "/home/ad/mec_morl_multipolicy/train.py", line 210, in train_collector.collect(n_episode=train_num) File "/home/ad/.local/lib/python3.10/site-packages/tianshou/data/collector.py", line 279, in collect result = self.policy(self.data, last_state) File "/home/ad/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, *kwargs) File "/home/ad/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl return forward_call(args, **kwargs) File "/home/ad/.local/lib/python3.10/site-packages/tianshou/policy/modelfree/pg.py", line 124, in forward dist = self.dist_fn(logits) File "/home/ad/.local/lib/python3.10/site-packages/torch/distributions/categorical.py", line 57, in init if probs.dim() < 1: File "/home/ad/.local/lib/python3.10/site-packages/tianshou/data/batch.py", line 213, in getattr return getattr(self.dict, key) AttributeError: 'dict' object has no attribute 'dim'

thu-ml / tianshou

Wrong output of forward for custom policy #1029