DLR-RM / stable-baselines3

PyTorch version of Stable Baselines, reliable implementations of reinforcement learning algorithms.
https://stable-baselines3.readthedocs.io
MIT License

ValueError: cannot reshape array of size 1792 into shape (1,28) #1369

Closed Mahdiehdn closed 1 year ago

Mahdiehdn commented 1 year ago

🐛 Bug

Hi, I am trying to add a custom policy to the code. When execution reaches "learner = PPO('MlpPolicy', env = env_train,policy_kwargs=policy_kwargs, verbose=1).learn(5000)", the code raises an error. The main code is in this notebook: https://github.com/AI4Finance-Foundation/FinRL-Tutorials/blob/master/2-Advance/FinRL_PortfolioAllocation_Explainable_DRL.ipynb

Could anyone help? Thanks.

Code example

import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gym import spaces
from gym.utils import seeding
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import DummyVecEnv
import torch.nn as nn
import torch.nn.functional as F


class StockPortfolioEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, 
                df,
                stock_dim,
                hmax,
                initial_amount,
                transaction_cost_pct,
                reward_scaling,
                state_space,
                action_space,
                tech_indicator_list,
                turbulence_threshold=None,
                lookback=252,
                day = 0):
        #super(StockEnv, self).__init__()
        #money = 10 , scope = 1
        self.day = day
        self.lookback=lookback
        self.df = df
        self.stock_dim = stock_dim
        self.hmax = hmax
        self.initial_amount = initial_amount
        self.transaction_cost_pct =transaction_cost_pct
        self.reward_scaling = reward_scaling
        self.state_space = state_space
        self.action_space = action_space
        self.tech_indicator_list = tech_indicator_list

        # action_space normalization and shape is self.stock_dim
        self.action_space = spaces.Box(low = 0, high = 1,shape = (self.action_space,)) 
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape = (self.state_space+len(self.tech_indicator_list),self.state_space),dtype=np.float64)

        # load data from a pandas dataframe
        self.data = self.df.loc[self.day,:]
        self.covs = self.data['cov_list'].values[0]
        self.state =  np.append(np.array(self.covs), [self.data[tech].values.tolist() for tech in self.tech_indicator_list ], axis=0)
        self.terminal = False     
        self.turbulence_threshold = turbulence_threshold        
        # initalize state: inital portfolio return + individual stock return + individual weights
        self.portfolio_value = self.initial_amount

    def step(self, actions):
        self.terminal = self.day >= len(self.df.index.unique())-1

        if self.terminal:
            df = pd.DataFrame(self.portfolio_return_memory)
            df.columns = ['daily_return']
            plt.plot(df.daily_return.cumsum(),'r')
            plt.savefig('results/cumulative_reward.png')
            plt.close()

            plt.plot(self.portfolio_return_memory,'r')
            plt.savefig('results/rewards.png')
            plt.close()

            print("=================================")
            print("begin_total_asset:{}".format(self.asset_memory[0]))           
            print("end_total_asset:{}".format(self.portfolio_value))

            df_daily_return = pd.DataFrame(self.portfolio_return_memory)
            df_daily_return.columns = ['daily_return']
            if df_daily_return['daily_return'].std() !=0:
              sharpe = (252**0.5)*df_daily_return['daily_return'].mean()/ \
                       df_daily_return['daily_return'].std()
              print("Sharpe: ",sharpe)
            print("=================================")

            return self.state, self.reward, self.terminal,{}

        else:
            weights = self.softmax_normalization(actions) 
            self.actions_memory.append(weights)
            last_day_memory = self.data

            #load next state
            self.day += 1
            self.data = self.df.loc[self.day,:]
            self.covs = self.data['cov_list'].values[0]
            self.state =  np.append(np.array(self.covs), [self.data[tech].values.tolist() for tech in self.tech_indicator_list ], axis=0)
            portfolio_return = sum(((self.data.close.values / last_day_memory.close.values)-1)*weights)
            log_portfolio_return = np.log(sum((self.data.close.values / last_day_memory.close.values)*weights))
            # update portfolio value
            new_portfolio_value = self.portfolio_value*(1+portfolio_return)
            self.portfolio_value = new_portfolio_value

            # save into memory
            self.portfolio_return_memory.append(portfolio_return)
            self.date_memory.append(self.data.date.unique()[0])            
            self.asset_memory.append(new_portfolio_value)

            # the reward is the new portfolio value or end portfolo value
            self.reward = new_portfolio_value

        return self.state, self.reward, self.terminal, {}

    def reset(self):
        self.asset_memory = [self.initial_amount]
        self.day = 0
        self.data = self.df.loc[self.day,:]
        # load states
        self.covs = self.data['cov_list'].values[0]
        self.state =  np.append(np.array(self.covs), [self.data[tech].values.tolist() for tech in self.tech_indicator_list ], axis=0)
        self.portfolio_value = self.initial_amount
        #self.cost = 0
        #self.trades = 0
        self.terminal = False 
        self.portfolio_return_memory = [0]
        self.actions_memory=[[1/self.stock_dim]*self.stock_dim]
        self.date_memory=[self.data.date.unique()[0]] 
        return self.state

    def render(self, mode='human'):
        return self.state

    def softmax_normalization(self, actions):
        numerator = np.exp(actions)
        denominator = np.sum(np.exp(actions))
        softmax_output = numerator/denominator
        return softmax_output

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def get_sb_env(self):
        e = DummyVecEnv([lambda: self])
        obs = e.reset()
        return e, obs

class Net(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super(Net, self).__init__(observation_space, features_dim)
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.fc3 = nn.Linear(672, features_dim)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = nn.Flatten()(x)
        x = F.relu(self.fc3(x))
        return x
stock_dimension = len(train.tic.unique())
state_space = stock_dimension
tech_indicator_list = ['macd', 'rsi_30', 'cci_30', 'dx_30']
feature_dimension = len(tech_indicator_list)
env_kwargs = {
    "hmax": 100, 
    "initial_amount": 1000000, 
    "transaction_cost_pct": 0, 
    "state_space": state_space, 
    "stock_dim": stock_dimension, 
    "tech_indicator_list": tech_indicator_list, 
    "action_space": stock_dimension, 
    "reward_scaling": 1e-1

}
e_train_gym = StockPortfolioEnv(df=train, **env_kwargs)
env_train, _ = e_train_gym.get_sb_env()
policy_kwargs = {
    'features_extractor_class':Net,
}
learner = PPO('MlpPolicy', env = env_train,policy_kwargs=policy_kwargs, verbose=1).learn(5000)

Relevant log output / Error message

```shell
ValueError                                Traceback (most recent call last)
<ipython-input-42-fb765bbf755b> in <module>
     18     'features_extractor_class':Net,
     19 }
---> 20 learner = PPO('MlpPolicy', env = env_train,policy_kwargs=policy_kwargs, verbose=1).learn(5000)
     21 
     22 learner.policy

3 frames
/usr/local/lib/python3.8/dist-packages/stable_baselines3/ppo/ppo.py in learn(self, total_timesteps, callback, log_interval, tb_log_name, reset_num_timesteps, progress_bar)
    305     ) -> SelfPPO:
    306 
--> 307         return super().learn(
    308             total_timesteps=total_timesteps,
    309             callback=callback,

/usr/local/lib/python3.8/dist-packages/stable_baselines3/common/on_policy_algorithm.py in learn(self, total_timesteps, callback, log_interval, tb_log_name, reset_num_timesteps, progress_bar)
    246         while self.num_timesteps < total_timesteps:
    247 
--> 248             continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
    249 
    250             if continue_training is False:

/usr/local/lib/python3.8/dist-packages/stable_baselines3/common/on_policy_algorithm.py in collect_rollouts(self, env, callback, rollout_buffer, n_rollout_steps)
    202                     rewards[idx] += self.gamma * terminal_value
    203 
--> 204             rollout_buffer.add(self._last_obs, actions, rewards, self._last_episode_starts, values, log_probs)
    205             self._last_obs = new_obs
    206             self._last_episode_starts = dones

/usr/local/lib/python3.8/dist-packages/stable_baselines3/common/buffers.py in add(self, obs, action, reward, episode_start, value, log_prob)
    435 
    436         # Same reshape, for actions
--> 437         action = action.reshape((self.n_envs, self.action_dim))
    438 
    439         self.observations[self.pos] = np.array(obs).copy()

ValueError: cannot reshape array of size 1792 into shape (1,28)
```

System Info

- OS: Linux-5.10.147+-x86_64-with-glibc2.29 # 1 SMP Sat Dec 10 16:00:40 UTC 2022
- Python: 3.8.10
- Stable-Baselines3: 1.7.0
- PyTorch: 1.13.1+cu116
- GPU Enabled: True
- Numpy: 1.22.4
- Gym: 0.21.0

Checklist

araffin commented 1 year ago

> I have checked my env using the env checker

Did you? What was the output?
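
(For reference, a minimal sketch of running the checker on the environment from the reproduction code above; `e_train_gym` is the raw, unwrapped `StockPortfolioEnv` instance, not the `DummyVecEnv`-wrapped `env_train`.)

```python
from stable_baselines3.common.env_checker import check_env

# Run the checker on the raw gym.Env, not on the DummyVecEnv wrapper
check_env(e_train_gym, warn=True)
```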

Mahdiehdn commented 1 year ago

I use Colab. When I run check_env(env, warn=True) for the first time, the output is:

None
/usr/local/lib/python3.8/dist-packages/stable_baselines3/common/env_checker.py:190: UserWarning: Your observation  has an unconventional shape (neither an image, nor a 1D vector). We recommend you to flatten the observation to have only a 1D vector or use a custom policy to properly process the data.
  warnings.warn(
/usr/local/lib/python3.8/dist-packages/stable_baselines3/common/env_checker.py:361: UserWarning: We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) cf https://stable-baselines3.readthedocs.io/en/master/guide/rl_tips.html
  warnings.warn(

But when I run the cell again, the output is only:

None
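
(As an aside, the first warning above suggests flattening the 2D observation instead of using a custom policy. A minimal sketch of that alternative, assuming gym's `FlattenObservation` wrapper around the `e_train_gym` instance from the reproduction code, whose observation here has shape (32, 28):)

```python
import gym
from stable_baselines3.common.vec_env import DummyVecEnv

# Flatten the (32, 28) observation into a 1D vector so that the default
# MlpPolicy (without a custom feature extractor) can process it directly.
flat_env = gym.wrappers.FlattenObservation(e_train_gym)
env_train = DummyVecEnv([lambda: flat_env])
```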
qgallouedec commented 1 year ago

The observations aren’t images, right? If so, why do you want to use a policy based on conv2d layers?

Mahdiehdn commented 1 year ago

Yes, that's right. My data is a time series; I switched to conv1d layers and now there is no error.
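
(The reporter's conv1d version is not shown in the thread; below is a minimal sketch of what such a feature extractor could look like, assuming the (32, 28) observation of the environment above and treating its 32 rows as input channels. `Conv1dNet` is a hypothetical name.)

```python
import gym
import torch as th
import torch.nn as nn
import torch.nn.functional as F
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor


class Conv1dNet(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super().__init__(observation_space, features_dim)
        n_channels = observation_space.shape[0]  # 32 rows: covariances + indicators
        self.conv1 = nn.Conv1d(n_channels, 32, kernel_size=3)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3)
        # Infer the flattened size with a dummy forward pass instead of hard-coding it
        with th.no_grad():
            sample = th.as_tensor(observation_space.sample()[None]).float()
            n_flat = self._cnn(sample).shape[1]
        self.fc = nn.Linear(n_flat, features_dim)

    def _cnn(self, x: th.Tensor) -> th.Tensor:
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        return th.flatten(x, start_dim=1)  # keep the batch dimension intact

    def forward(self, x: th.Tensor) -> th.Tensor:
        return F.relu(self.fc(self._cnn(x)))


policy_kwargs = {"features_extractor_class": Conv1dNet}
```

A Conv1d stack consumes the batched (batch, 32, 28) observation directly, so the batch dimension is preserved all the way to the action head, unlike the Conv2d version above, which appears to have absorbed the batch axis as the image channel and so produced the mismatched action batch seen in the traceback.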