Closed hn2 closed 5 years ago
Please add minimal code for replication.
from issue template : https://github.com/hill-a/stable-baselines/blob/master/.github/ISSUE_TEMPLATE/issue-template.md
# Code example
Please try to provide a minimal example to reproduce the bug. Error messages and stack traces are also helpful.
I was unable to replicate this behavior with a minimal example:
import numpy as np
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO1
from env_minimal import EnvMinimal
# Build the custom environment once and expose it through the vectorized-env
# interface. The factory closes over `base_env`, which is never rebound,
# rather than over the `env` name that is reassigned to the wrapper.
base_env = EnvMinimal()
env = DummyVecEnv([lambda: base_env])

# Train PPO1 with an MLP policy for a short run, then save the model.
model = PPO1(MlpPolicy, env, verbose=0)
model.learn(total_timesteps=1000)
# model_name = str(settings['model_name']) + '_' + str(settings['total_timesteps']) + '_' + str(settings['steps']) + '_' + str(settings['window_length'])
model.save('ppo1_PortfolioEnv_MlpPolicy')
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
class EnvMinimal(gym.Env):
    """
    Minimal custom environment for trying to reproduce NaN actions.

    The action space is a 10-dimensional box in [-1, 1] and the observation space is an
    unbounded float32 box of shape (11, 7, 50). Every step yields a constant observation,
    a reward of 1 and never terminates.
    """

    def __init__(self):
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(10,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(11, 7, 50), dtype=np.float32)
        self.seed()

    def seed(self, seed=None):
        """
        Seed the environment's random number generator.

        :param seed: (int or None) the seed handed to `seeding.np_random`
        :return: ([int]) a one-element list holding the seed actually used
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        """
        Print the received action and return a constant transition.

        :param action: (np.ndarray) the action chosen by the agent (only printed, for debugging)
        :return: (np.ndarray, int, bool, dict) observation, reward of 1, done=False, empty info
        """
        print(action)
        return self.get_obs(), 1, False, {}

    def reset(self):
        """
        Reset the environment.

        :return: (np.ndarray) the initial observation
        """
        self.last_u = None
        return self.get_obs()

    def get_obs(self):
        """
        Build the constant observation.

        Returns an all-ones array with the shape and dtype of `observation_space`
        rather than the bare scalar 1, so that the observation is a valid member
        of the declared space.

        :return: (np.ndarray) array of ones matching `self.observation_space`
        """
        return np.ones(self.observation_space.shape, dtype=self.observation_space.dtype)
I double-checked my original env but still don't know where the problem is. I also looked at console_util.py; line 39 seems to be where this error is thrown: `if (value < 1e-4 or value > 1e+4) and value > 0:`. I am not sure what it does.
def fmt_item(item, min_width):
    """
    Render an item as text, left-padded with spaces up to a minimum width

    :param item: (Any) the value to render; a 0-dimensional numpy array is unwrapped to its scalar
    :param min_width: (int) the minimum width of the resulting string
    :return: (str) the text for 'item', at least 'min_width' characters long
    """
    if isinstance(item, np.ndarray):
        assert item.ndim == 0
        item = item.item()

    if not isinstance(item, (float, np.float32, np.float64)):
        text = str(item)
    else:
        magnitude = abs(item)
        # very small or very large non-zero floats are shown in scientific notation
        use_scientific = magnitude > 0 and (magnitude < 1e-4 or magnitude > 1e+4)
        template = "%7.2e" if use_scientific else "%7.5f"
        text = template % item

    # rjust pads on the left and leaves strings that are already wide enough untouched
    return text.rjust(min_width)
This is my custom env. When I do not allow shorting, the action space is [0, 1] and there is no problem. However, when I allow shorting, the action space is [-1, 1] and then I get NaN.
import gym
import gym.spaces
import numpy as np
import csv
import copy
from gym.utils import seeding
from pprint import pprint
from utils import *
from config import *
class PortfolioEnv(gym.Env):
    """
    Portfolio-management environment driven by a price history loaded from disk.

    The history is indexed as [asset, time, feature]; feature 0 is used as the open price
    and feature 3 as the close price. An action is a vector of portfolio weights
    [cash_bias, w1, w2, ...] and the reward is a scaled log-return of the portfolio value.
    Numbered equations in the comments refer to https://arxiv.org/abs/1706.10059
    """
    metadata = {'render.modes': ['human']}

    def __init__(self,
                 total_steps=730,  # 2 years
                 trading_cost=0.0025,
                 time_cost=0.00,
                 window_length=7,
                 start_idx=0,
                 start_date=None,
                 allow_short=False
                 ):
        """
        :param total_steps: (int) number of steps in one episode
        :param trading_cost: (float) cost factor applied to the change in portfolio weights
        :param time_cost: (float) fraction of the portfolio value charged at every step
        :param window_length: (int) number of time steps contained in one observation
        :param start_idx: (int) offset used when converting between indices and dates
        :param start_date: (Any) fixed episode start handed to `date_to_index`;
            None picks a random start at every reset
        :param allow_short: (bool) if True the action space is [-1, 1], otherwise [0, 1]
        """
        datafile = DATA_DIR
        self.history, self.abbreviation = read_stock_history(filepath=datafile)
        self.total_steps = total_steps
        self.current_step = 0
        self.trading_cost = trading_cost
        self.time_cost = time_cost
        self.window_length = window_length
        self.start_idx = start_idx
        self.start_date = start_date
        self.allow_short = allow_short
        self.w0 = np.array([1.0] + [0.0] * len(self.abbreviation))
        self.p0 = 1.0
        self.out_of_money = False

        # DataGenerator
        # the first axis of the history must have one row per listed asset
        # (the previous check compared the history with itself and could never fail)
        assert self.history.shape[0] == len(self.abbreviation), 'Number of stock is not consistent'
        # working copy of the data; reset() replaces it with the window of the current episode
        self.data = self.history.copy()  # all data

        # openai gym attributes
        # action will be the portfolio weights [cash_bias,w1,w2...] where wn are [0, 1] for each asset
        # ([-1, 1] when shorting is allowed)
        action_low = -1. if self.allow_short else 0.
        self.action_space = gym.spaces.Box(low=action_low, high=1., shape=(len(self.abbreviation) + 1,),
                                           dtype=np.float32)  # include cash
        # flattened (assets + cash) x window x features observation
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf,
            shape=((len(self.abbreviation) + 1) * window_length * self.history.shape[-1],),
            dtype=np.float32)

    def seed(self, seed=None):
        """
        Seed the environment's random number generator.

        :param seed: (int or None) the seed handed to `seeding.np_random`
        :return: ([int]) a one-element list holding the seed actually used
        """
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        """
        Advance the simulation by one step.

        :param action: (np.ndarray) portfolio weights [cash_bias, w1, w2, ...], one entry per asset
            plus cash; every entry must lie within the bounds of `self.action_space`
        :return: (np.ndarray, float, bool, dict) flattened observation (price window with a leading
            cash row of ones), reward, done flag and a dict of per-step statistics
        """
        self.current_step += 1

        np.testing.assert_almost_equal(action.shape, (len(self.abbreviation) + 1,))
        assert ((action >= self.action_space.low) * (action <= self.action_space.high)).all(), \
            'all action values should be between %s and %s. Not %s' % (self.action_space.low, self.action_space.high, action)

        # normalise just in case
        weights = np.clip(action, self.action_space.low, self.action_space.high)
        weights /= (np.sum(np.abs(weights)) + EPS)
        weights[0] += np.clip(1 - np.sum(np.abs(weights)), 0, 1)  # so if weights are all zeros we normalise to [1,0...]
        np.testing.assert_almost_equal(np.sum(np.abs(weights)), 1.0, 3, err_msg='absolute weights should sum to 1. weights="%s"' % weights)

        # DataGenerator
        # price window of the current step, taken from the episode data
        observation = self.data[:, self.current_step:self.current_step + self.window_length, :].copy()
        # prepend a row of ones standing for cash
        cash_observation = np.ones((1, self.window_length, observation.shape[2]))
        observation_with_cash = np.concatenate((cash_observation, observation), axis=0)
        # relative price vector of last observation day (close/open)
        close_price_vector = observation_with_cash[:, -1, 3]
        open_price_vector = observation_with_cash[:, -1, 0]
        y1 = close_price_vector / open_price_vector

        # PortfolioSim
        # w1 - new action of portfolio weights - e.g. [0.1,0.9,0.0]
        # y1 - price relative vector also called return - e.g. [1.0, 0.9, 1.1]
        w1 = weights
        assert w1.shape == y1.shape, 'w1 and y1 must have the same shape'
        assert y1[0] == 1.0, 'y1[0] must be 1'

        w0 = self.w0
        p0 = self.p0

        dw1 = (y1 * w0) / (np.dot(y1, w0) + EPS)  # (eq7) weights evolve into
        mu1 = self.trading_cost * (np.abs(dw1 - w1)).sum()  # (eq16) cost to change portfolio
        # assert mu1 < 1.0, 'Cost is larger than current holding'

        p1 = p0 * (1 - mu1) * np.dot(y1, w1)  # (eq11) final portfolio value
        p1 = p1 * (1 - self.time_cost)  # we can add a cost to holding

        # Floor the portfolio value at zero in every mode. With negative (short) weights
        # np.dot(y1, w1) can be negative, and a negative p1 makes the log-return below NaN,
        # which then poisons the reward and the agent's training. A value of zero ends the
        # episode through `out_of_money`.
        # NOTE(review): eq7/eq11 look like they assume long-only weights; a short-aware
        # return model is probably needed for `allow_short` to be meaningful - confirm.
        p1 = np.clip(p1, 0, np.inf)

        rho1 = p1 / p0 - 1  # rate of returns
        r1 = np.log((p1 + EPS) / (p0 + EPS))  # log rate of return
        reward = r1 / self.total_steps * 1000.  # (22) average logarithmic accumulated return

        # remember for next step
        self.w0 = w1
        self.p0 = p1

        # if we run out of money, we're done (losing all the money)
        self.out_of_money = (p1 == 0)

        info = {
            "reward": reward,
            "log_return": r1,
            "portfolio_value": p1,
            "return": y1.mean(),
            "rate_of_return": rho1,
            "weights_mean": w1.mean(),
            "weights_std": w1.std(),
            "cost": mu1,
        }
        # calculate return for buy and hold a bit of each asset: the product of the mean
        # returns of every previous step and of this one, each counted exactly once
        info['market_value'] = np.cumprod([inf["return"] for inf in self.infos + [info]])[-1]
        # add dates
        info['date'] = index_to_date(self.start_idx + self.idx + self.current_step)
        info['total_steps'] = self.current_step
        # record the step once, after it is complete (it used to be appended twice,
        # which made every return enter `market_value` several times)
        self.infos.append(info)

        self.done = (self.current_step >= self.total_steps) or (self.out_of_money)
        return observation_with_cash.reshape(-1), reward, self.done, info

    def reset(self):
        """
        Start a new episode.

        Resets the step counter, the recorded infos, the portfolio weights and value, then
        selects the episode window from the full history.

        :return: (np.ndarray) flattened first observation (price window with a leading cash row of ones)
        """
        self.current_step = 0
        self.infos = []
        self.w0 = np.array([1.0] + [0.0] * len(self.abbreviation))
        self.p0 = 1.0

        # get data for this episode, each episode might be different.
        # Bounds and slicing use the full history: slicing `self.data` from itself would
        # shrink it to one episode on the first reset and pin every later episode to it.
        n_timesteps = self.history.shape[1]
        if self.start_date is None:
            self.idx = np.random.randint(
                low=self.window_length, high=n_timesteps - self.total_steps)
        else:
            # compute index corresponding to start_date for repeatable sequence
            self.idx = date_to_index(self.start_date) - self.start_idx
            assert self.idx >= self.window_length and self.idx <= n_timesteps - self.total_steps, \
                'Invalid start date, must be window_length day after start date and simulation total_steps day before end date'
        self.data = self.history[:, self.idx - self.window_length:self.idx + self.total_steps + 1, :]

        observation = self.data[:, self.current_step:self.current_step + self.window_length, :].copy()
        cash_observation = np.ones((1, self.window_length, observation.shape[2]))
        observation_with_cash = np.concatenate((cash_observation, observation), axis=0)
        return observation_with_cash.reshape(-1)
ah, you are loading data from a file. Have you checked the validity of the file? It may have NaNs or infs in it.
Add this to the __init__()
function after self.history, self.abbreviation = read_stock_history(filepath=datafile)
:
print("has NaNs: {}, has infs: {}".format(np.any(np.isnan(self.history)), np.any(np.isinf(self.history))))
Furthermore, make sure that your epsilon value EPS
is not too small (it should be larger than 1e-20, for example), as an epsilon that is too small can cause NaNs.
Finally, add this after importing numpy
np.seterr(invalid='raise')
It will crash when NaNs occur, allowing you to see exactly where the issue is coming from, and whether it comes from Stable Baselines, your data, or a simple bug.
I added your code. Data seems ok. has NaNs: False, has infs: False I now get: File "c:\users\hanna\stable-baselines\stable_baselines\ppo1\pposgd_simple.py", line 255, in learn atarg = (atarg - atarg.mean()) / atarg.std() FloatingPointError: invalid value encountered in subtract
Interesting, well this could be either reward being nan, vpred being nan, or gamma not in [0,1].
If it is vpred, I'm not sure we can do much, unfortunately...
Otherwise check the reward before return observation_with_cash.reshape(-1), reward, self.done, info
if np.isnan(reward) or np.isinf(reward):
print("is invalid reward {}".format(reward))
What is vpred?
Value prediction for the PPO1 critic.
I will try to add your code for invalid reward. I did not change the defaults for ppo1.
related: https://github.com/hill-a/stable-baselines/issues/340 this answer was choosing the right hyperparameters.
@hn2 did you ever find a solution for this? I also have a trading environment where all the actions are nan and can't figure out why
You should investigate whether you have any NaN in the observation.
Closing because it does not seem to be related to Stable Baselines but rather the environment. Also there is now documentation and wrapper to debug the NaNs: https://stable-baselines.readthedocs.io/en/master/guide/checking_nan.html
My action space is defined as:
When I run:
print(action)
gives this: First it is ok then gives a warning and then all actions are nan.