DLR-RM / stable-baselines3

PyTorch version of Stable Baselines, reliable implementations of reinforcement learning algorithms.
https://stable-baselines3.readthedocs.io
MIT License

It is recommended to give an example of off policy using the feature extractor #982

Closed Zero1366166516 closed 2 years ago

Zero1366166516 commented 2 years ago

Important Note: We do not do technical support, nor consulting and don't answer personal questions per email. Please post your question on the RL Discord, Reddit or Stack Overflow in that case.

If your issue is related to a custom gym environment, please use the custom gym env template.

๐Ÿ› Bug

I want to customize the feature extractor. Following the program written in the example, I get the errors below. I have seen "too many errors when customizing policy, a full example for off policy algorithms should be added in user guide" #425; that issue also mentions that the off-policy network should use the feature extractor. It is recommended to give an example of using a custom feature extractor with the off-policy algorithms. Thank you!

class CustomCombinedExtractor(BaseFeaturesExtractor):

def __init__(self, observation_space: gym.spaces.Dict):
    # We do not know features-dim here before going over all the items,
    # so put something dummy for now. PyTorch requires calling
    # nn.Module.__init__ before adding modules
    super(CustomCombinedExtractor, self).__init__(observation_space, features_dim=1)

    extractors = {}

    total_concat_size = 0
    print(observation_space)
    #print(observation_space.items(0))

    print(observation_space.spaces.items())
    exit()
    # We need to know size of the output of this extractor,
    # so go over all the spaces and compute output feature sizes
    for key, subspace in observation_space.spaces.items():
        if key == "image":
            # We will just downsample one channel of the image by 4x4 and flatten.
            # Assume the image is single-channel (subspace.shape[0] == 0)
            extractors[key] = nn.Sequential(nn.MaxPool2d(4), nn.Flatten())
            total_concat_size += subspace.shape[1] // 4 * subspace.shape[2] // 4
        elif key == "vector":
            # Run through a simple MLP
            extractors[key] = nn.Linear(subspace.shape[0], 16)
            total_concat_size += 16

    self.extractors = nn.ModuleDict(extractors)

    # Update the features dim manually
    self._features_dim = total_concat_size

def forward(self, observations) -> th.Tensor:
    encoded_tensor_list = []

    # self.extractors contain nn.Modules that do all the processing.
    for key, extractor in self.extractors.items():
        encoded_tensor_list.append(extractor(observations[key]))
    # Return a (B, self._features_dim) PyTorch tensor, where B is batch dimension.
    return th.cat(encoded_tensor_list, dim=1)

policy_kwargs = dict(
    features_extractor_class=CustomCombinedExtractor,
    share_features_extractor=False,
    features_extractor_kwargs=dict(features_dim=128))
#policy_kwargs = dict(activation_fn=th.nn.ReLU,
#                     net_arch=[dict(pi=[32, 32], vf=[32, 32])])
def get_model(
    self,
    model_name: str,
    #policy: str = "MlpPolicy",  
    policy: str = "MultiInputPolicy",
    policy_kwargs: dict = policy_kwargs,
    model_kwargs: dict = None,
    verbose: int = 1
) -> Any:

    print("set Debug!")

    if model_name not in MODELS:
        raise NotImplementedError("NotImplementedError")

    if model_kwargs is None:
        model_kwargs = MODEL_KWARGS[model_name]

    if "action_noise" in model_kwargs:
        n_actions = self.env.action_space.shape[-1]                         
        model_kwargs["action_noise"] = NOISE[model_kwargs["action_noise"]](
            mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)
        )
    print(model_kwargs)    
    print(policy, self.env)
    print(model_name)
    model = MODELS[model_name](         
        policy=policy,
        env=self.env,
        tensorboard_log="{}/{}".format(config.TENSORBOARD_LOG_DIR, model_name),
        verbose=verbose,
        policy_kwargs=policy_kwargs,
        **model_kwargs
    )

Traceback (most recent call last):
  File "C:/Users/Administrator/PycharmProjects/demo/utils/models.py", line 419, in <module>
    model_sac = agent.get_model("sac", model_kwargs=SAC_PARAMS)
  File "C:/Users/Administrator/PycharmProjects/demo/utils/models.py", line 328, in get_model
    model = MODELS[model_name](
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\sac.py", line 144, in __init__
    self._setup_model()
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\sac.py", line 147, in _setup_model
    super(SAC, self)._setup_model()
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\common\off_policy_algorithm.py", line 216, in _setup_model
    self.policy = self.policy_class(  # pytype:disable=not-instantiable
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\policies.py", line 498, in __init__
    super(MultiInputPolicy, self).__init__(
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\policies.py", line 292, in __init__
    self._build(lr_schedule)
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\policies.py", line 295, in _build
    self.actor = self.make_actor()
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\policies.py", line 348, in make_actor
    actor_kwargs = self._update_features_extractor(self.actor_kwargs, features_extractor)
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\common\policies.py", line 112, in _update_features_extractor
    features_extractor = self.make_features_extractor()
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\common\policies.py", line 118, in make_features_extractor
    return self.features_extractor_class(self.observation_space, **self.features_extractor_kwargs)
TypeError: __init__() got an unexpected keyword argument 'features_dim'

A clear and concise description of what the bug is.

To Reproduce

Steps to reproduce the behavior.

Please try to provide a minimal example to reproduce the bug. Error messages and stack traces are also helpful.

Please use the markdown code blocks for both code and stack traces.

from stable_baselines3 import ...
Traceback (most recent call last): File ...

Expected behavior

A clear and concise description of what you expected to happen.

System Info

Describe the characteristic of your environment:

You can use sb3.get_system_info() to print relevant packages info:

import stable_baselines3 as sb3
sb3.get_system_info()

Additional context

Add any other context about the problem here.

Checklist

qgallouedec commented 2 years ago

Next time, please help us to help you by taking the necessary time to fill in the issue template.

As the error suggests, your feature extractor does not take the feature dimension as an argument. Then try

def __init__(self, observation_space, features_dim):
araffin commented 2 years ago

> Next time, please help us to help you by taking the necessary time to fill in the issue template.
>
> As the error suggests, your feature extractor does not take the feature dimension as an argument. Then try
>
> def __init__(self, observation_space, features_dim):

The features extractor for off-policy is the same as on-policy and is already documented: https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html#custom-feature-extractor

As @qgallouedec wrote, the error you get is because you pass an argument to the features extractor (features_extractor_kwargs=dict(features_dim=128)) but your class does not accept that parameter (features_dim) in its __init__.
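
For reference, a minimal sketch of an extractor that accepts features_dim and of how it can be passed to an off-policy algorithm such as SAC, following the documented custom feature extractor pattern. The class name, layer sizes and the placeholder environment are illustrative assumptions, not the reporter's actual setup:

```python
import gym
import torch as th
import torch.nn as nn

from stable_baselines3 import SAC
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor


class MyExtractor(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 128):
        # features_dim must be accepted here so that
        # features_extractor_kwargs=dict(features_dim=128) can be forwarded to it
        super().__init__(observation_space, features_dim)
        n_input = int(observation_space.shape[0])
        self.net = nn.Sequential(nn.Linear(n_input, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.net(observations)


policy_kwargs = dict(
    features_extractor_class=MyExtractor,
    features_extractor_kwargs=dict(features_dim=128),
)
# "Pendulum-v1" is only a placeholder environment for illustration
model = SAC("MlpPolicy", "Pendulum-v1", policy_kwargs=policy_kwargs, verbose=1)
```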

Zero1366166516 commented 2 years ago

First of all, thank you very much for your help.

I modified the class CustomCNN as follows:

MODELS = {"a2c": A2C, "ddpg": DDPG, "td3": TD3, "sac": SAC, "ppo": PPO}
MODEL_KWARGS = {x: config.__dict__["{}_PARAMS".format(x.upper())] for x in MODELS.keys()}

NOISE = {
    "normal": NormalActionNoise,
    "ornstein_uhlenbeck": OrnsteinUhlenbeckActionNoise,
}

class CustomCNN(BaseFeaturesExtractor):
    """
    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of units for the last layer.
    """

def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 1):
    super(CustomCNN, self).__init__(observation_space, features_dim)
    # We assume CxHxW images (channels first)
    # Re-ordering will be done by pre-preprocessing or wrapper
    n_input_channels = observation_space.shape[0]

    print("n_input_channels", observation_space.shape[0])
    #print("features = ", features_dim)
    #observation_space = observation_space.T
    self.cnn = nn.Sequential(
        nn.Conv1d(1, n_input_channels, kernel_size=1, stride=1, padding=0),
        nn.ReLU(),
        nn.Conv1d(n_input_channels, 1, kernel_size=1, stride=1, padding=0),
        nn.ReLU(),
        nn.Flatten(),
    )
    print(self.cnn.type)

    # Compute shape by doing one forward pass
    with th.no_grad():

        n_flatten = self.cnn(
            th.as_tensor(observation_space.sample()[None]).float()
        ).shape[1]
        print(observation_space, observation_space.sample()[None])
        print(self.cnn)

    self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.Tanh())
    print(self.linear)
    #self.linear = th.as_tensor(self.linear)
    #exit()

def forward(self, observations: th.Tensor) -> th.Tensor:
    print("go to the forward:", observations)
    return self.linear(self.cnn(observations))

policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    share_features_extractor=False,
    features_extractor_kwargs=dict(features_dim=1),
    net_arch=[dict(pi=[32, 32], qf=[64, 64])]
)
#policy_kwargs = dict(activation_fn=th.nn.ReLU,
#                     net_arch=[dict(pi=[32, 32], vf=[32, 32])])
def get_model(
    self,
    model_name: str,
    policy: str = "MlpPolicy",     
    #policy: str = "MultiInputPolicy",
    policy_kwargs: dict = policy_kwargs,
    model_kwargs: dict = None,
    verbose: int = 1
) -> Any:

    print("set Debug!")

    if model_name not in MODELS:
        raise NotImplementedError("NotImplementedError")

    if model_kwargs is None:
        model_kwargs = MODEL_KWARGS[model_name]

    if "action_noise" in model_kwargs:
        n_actions = self.env.action_space.shape[-1]                          
        model_kwargs["action_noise"] = NOISE[model_kwargs["action_noise"]](
            mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)
        )
    print(model_kwargs)    
    print(policy, self.env)
    print(model_name)
    print(observation)
    print(self.env.observation_space)
    print("dispaly: observation_space")
    model = MODELS[model_name](         
        policy=policy,
        env=self.env,
        tensorboard_log="{}/{}".format(config.TENSORBOARD_LOG_DIR, model_name),
        verbose=verbose,
        policy_kwargs=policy_kwargs,
        **model_kwargs
    )
    print("model display: ", model)
    exit()
    return model

I defined the observation space earlier as: self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.state_space,))

However, there is another error.

File "C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\linear.py", line 96, in init self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs)) TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:

I'm sorry to ask so many questions. I just want to use a CNN feature extractor to extract features for the MlpPolicy policy network. I think the underlying problem is that stable-baselines3 has no complete example of this for the off-policy algorithms, especially DDPG, TD3 and SAC.
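
For reference, a minimal sketch of a fixed 1D-CNN extractor for a flat Box observation like the one defined above. This adapts the documented CustomCNN pattern rather than reproducing it; the class name, channel sizes and default features_dim are illustrative assumptions:

```python
import gym
import torch as th
import torch.nn as nn

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor


class Custom1dCNN(BaseFeaturesExtractor):
    """Illustrative 1D-CNN extractor for a flat Box observation of shape (n_features,)."""

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 64):
        super().__init__(observation_space, features_dim)
        # Treat the flat vector as a single-channel 1D sequence:
        # (batch, n_features) -> (batch, 1, n_features)
        self.cnn = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(16, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Flatten(),
        )
        # Infer the flattened size once, with a dummy forward pass
        with th.no_grad():
            sample = th.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(sample.unsqueeze(1)).shape[1]
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        # The layers are built once in __init__ and stay fixed; only the batch
        # size varies (1 during rollouts, e.g. 128 when the replay buffer is sampled)
        return self.linear(self.cnn(observations.unsqueeze(1)))
```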

qgallouedec commented 2 years ago

Can you try to provide a minimal and functional code example to reproduce the error. (Remove all your print, use a single agent, ...) Please also use the markdown code blocks for code. It will be easier for us to help you.

Zero1366166516 commented 2 years ago

Sincerely, thank you for your help. The problem with the off-policy network has been bothering me for several days. Following the example, I created a class (CustomCNN) as the feature extractor and defined policy_kwargs = dict(features_extractor_class=CustomCNN, net_arch=dict(qf=[256, 256], pi=[256, 256])). A CNN is used as the feature extractor, and the code is as follows:

from typing import Any
from typing import Callable, Dict, List, Optional, Tuple, Type, Union
import time
import pandas as pd
import numpy as np
import gym
from gym import spaces
import torch as th
import torch.nn as nn
from stable_baselines3 import A2C, DDPG, PPO, SAC, TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.policies import register_policy, ActorCriticPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from utils import config
from utils.preprocessors import split_data
from utils.env import StockLearningEnv

MODELS = {"a2c": A2C, "ddpg": DDPG, "td3": TD3, "sac": SAC, "ppo": PPO}
MODEL_KWARGS = {x: config.__dict__["{}_PARAMS".format(x.upper())] for x in MODELS.keys()}

NOISE = {
    "normal": NormalActionNoise,
    "ornstein_uhlenbeck": OrnsteinUhlenbeckActionNoise,
}

This is the class I defined, CustomCNN. Because I want to analyze time-series data, I use a 1-dimensional Conv1d. The input data has 13 columns, and the number of input rows is either 1 or 128, chosen randomly.

class CustomCNN(BaseFeaturesExtractor):
    """
    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of units for the last layer.
    """

def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 1):
    super(CustomCNN, self).__init__(observation_space, features_dim)
    # We assume CxHxW images (channels first)
    # Re-ordering will be done by pre-preprocessing or wrapper
    n_input_channels = observation_space.shape[0]

    self.cnn = nn.Sequential(

        nn.Conv1d(self.features_dim, n_input_channels, kernel_size=1, stride=1, padding=0),
        nn.ReLU(),
        nn.Conv1d(n_input_channels, self.features_dim, kernel_size=1, stride=1, padding=0),
        nn.ReLU(),
        nn.Flatten(),
    )
    with th.no_grad():
        n_flatten = self.cnn(
            th.as_tensor(observation_space.sample()[None]).float()
        ).shape[1]
    self.linear = nn.Sequential(nn.Linear(n_flatten, self.features_dim), nn.Tanh())

When the observation has shape [1, 1, 13], the referenced example works, but when a batch of shape [128, 13] is sampled I thought nn.Sequential needed to be redefined, so I modified the example code in forward, but it still reports an error.

def forward(self, observations: th.Tensor) -> th.Tensor:
    n_flatten = np.array(observations).shape[1]
    features_dim = np.array(observations).shape[0]
    print(features_dim, n_flatten)
    if features_dim != 1:
        self.cnn = nn.Sequential(
            nn.Conv1d(features_dim, n_flatten, kernel_size=1, stride=1, padding=0),
            nn.ReLU(),
            nn.Conv1d(n_flatten, features_dim, kernel_size=1, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )
        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.Tanh())

    return self.linear(self.cnn(observations))


policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    net_arch=dict(qf=[256, 256], pi=[256, 256])
)

Here is the definition of the get_model function:

def get_model(
    self,
    model_name: str,
    policy: str = "MlpPolicy",     
    #policy: str = "MultiInputPolicy",
    policy_kwargs: dict = policy_kwargs,
    model_kwargs: dict = None,
    verbose: int = 1
) -> Any:
    if model_name not in MODELS:
        raise NotImplementedError("NotImplementedError")
    if model_kwargs is None:
        model_kwargs = MODEL_KWARGS[model_name]
    if "action_noise" in model_kwargs:
        n_actions = self.env.action_space.shape[-1]                          
        model_kwargs["action_noise"] = NOISE[model_kwargs["action_noise"]](
            mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)
        )
    model = MODELS[model_name](          
        policy=policy,
        env=self.env,
        tensorboard_log="{}/{}".format(config.TENSORBOARD_LOG_DIR, model_name),
        verbose=verbose,
        policy_kwargs=policy_kwargs,
        **model_kwargs
    )
    return model

This is train_model; the error occurs here, in model.learn:

def train_model(
    self, model: Any, tb_log_name: str, total_timesteps: int = 5000
    ) -> Any:
    """train model"""
    model = model.learn(total_timesteps=total_timesteps, tb_log_name=tb_log_name)
    return model

Testing starts here: pull the data, then initialize the environment and the model.

if __name__ == "__main__":
    from pull_data import Pull_data
    from preprocessors import FeatureEngineer, split_data
    from utils import config
    import time

    # pull data
    #df = Pull_data(config.SSE_50[:2], save_data=False).pull_data()
    df = Pull_data(config.SSE_50[:2]).pull_data()
    df = FeatureEngineer().preprocess_data(df)
    df = split_data(df, '2009-01-01', '2019-01-01')
    print(df.head())

    # 
    stock_dimension = len(df.tic.unique()) # 2
    state_space = 1 + 2*stock_dimension + \
        len(config.TECHNICAL_INDICATORS_LIST)*stock_dimension # 23 
    print("stock_dimension: {}, state_space: {}".format(stock_dimension, state_space))
    env_kwargs = {
        #"stock_dim": stock_dimension,
        "hmax": 100, 
        "initial_amount": 1e6, 
        "buy_cost_pct": 0.001,
        "sell_cost_pct": 0.001,
        #"reward_scaling": 1e-4,
        #"state_space": state_space,
        #"action_space": stock_dimension,
        #"tech_indicator_list": config.TECHNICAL_INDICATORS_LIST
    }

    # test env
    e_train_gym = StockLearningEnv(df=df, **env_kwargs)
    ## mulpt test
    observation = e_train_gym.reset()      
    count = 0
    for t in range(10):
        action = e_train_gym.action_space.sample()  
        observation, reward, done, info = e_train_gym.step(action)  

        if done:
            break
        count+=1
        time.sleep(0.2)      
    print("observation: ", observation)
    print("action: ", action)
    print("reward: {}, done: {},info: {}".format(reward, done, info))

    # test model
    env_train, _ = e_train_gym.get_sb_env()
    print(type(env_train))

    ##register_policy('CustomPolicy', CustomPolicy)
    ##register_policy('CustomActorCriticPolicy', CustomActorCriticPolicy)
    agent = DRL_Agent(env= env_train)
    SAC_PARAMS = {
        "batch_size": 128,
        "buffer_size": 1000000,
        "learning_rate": 0.0001,
        "learning_starts": 100,
        "ent_coef": "auto_0.1"
    }
    model_sac = agent.get_model("sac", model_kwargs=SAC_PARAMS)
    trained_sac = agent.train_model(
        model=model_sac,
        tb_log_name='sac',
        total_timesteps=50000
    )
The error message is as follows:
Traceback (most recent call last):
  File "C:/Users/Administrator/PycharmProjects/demo/utils/models.py", line 477, in <module>
    trained_sac = agent.train_model(
  File "C:/Users/Administrator/PycharmProjects/demo/utils/models.py", line 401, in train_model
    model = model.learn(total_timesteps=total_timesteps, tb_log_name=tb_log_name)
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\sac.py", line 292, in learn
    return super(SAC, self).learn(
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\common\off_policy_algorithm.py", line 366, in learn
    self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\sac.py", line 206, in train
    actions_pi, log_prob = self.actor.action_log_prob(replay_data.observations)
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\policies.py", line 180, in action_log_prob
    mean_actions, log_std, kwargs = self.get_action_dist_params(obs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\stable_baselines3\sac\policies.py", line 163, in get_action_dist_params
    latent_pi = self.latent_pi(features)
  File "C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\container.py", line 139, in forward
    input = module(input)
  File "C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\torch\nn\modules\linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x128 and 1x256)

Thank you again for your help. I edited the question again according to your request. I am building on sunnyswag's code, modifying the feature extractor to see whether SAC, DDPG and TD3 can perform better on a stock portfolio. This problem has bothered me for several days. Thank you again!

qgallouedec commented 2 years ago

I would really like to help you, but you should at least take into consideration the remarks I give you. I need:

  1. A minimal and functional code example. For example, this code is not minimal, because one line can be deleted without removing the error:
import numpy as np

a = np.ones(2)
b = np.ones(2)
c = a / 0

and this code is not functional because the imports are missing:

a = np.ones(2)
c = a / 0

Your code is neither minimal nor functional. So I am not able to reproduce your error.

  2. Properly formatted code, so that I can understand it. For that, I send you again the link from my previous message: format markdown code blocks

From what I can see, it appears to be a shape-related error. You may have made a mistake in the network specification.
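
The reported numbers are consistent with the actor's first linear layer having been built for features_dim=1 while the extractor actually returns 128 features per sample. A minimal sketch that reproduces the same shape error; the sizes below are taken from the error message, not from the actual code:

```python
import torch as th
import torch.nn as nn

# The policy network was built expecting features of size 1 (features_dim=1) ...
latent_pi_first_layer = nn.Linear(1, 256)

# ... but the extractor actually returns 128 features for each of the 128 samples
features = th.randn(128, 128)

latent_pi_first_layer(features)
# RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x128 and 1x256)
```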


araffin commented 2 years ago

Closing, as the basic rules for asking for help were not followed despite being asked multiple times (https://github.com/DLR-RM/stable-baselines3/issues/982#issuecomment-1197044014).