quantopian / zipline

Zipline, a Pythonic Algorithmic Trading Library
https://www.zipline.io
Apache License 2.0
17.49k stars 4.71k forks source link

IndexError: index 0 is out of bounds for axis 0 with size 0 #2346

Open mosfiqur-rahman opened 5 years ago

mosfiqur-rahman commented 5 years ago

Dear Zipline Maintainers,

Has anyone faced this kind of error before? I'm not really sure how to deal with it. The script and everything were running successfully several times before, even today. Then suddenly, it started to show this error and I haven't made any changes. Here's the log:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-6-b067148a1524> in <module>()
      9 for i in range(len(sec_lst)):
     10     sector_lst = sec_lst[i]
---> 11     get_ipython().magic(u'zipline --bundle=custom-csvdir-bundle --start 2018-10-22 --end 2018-10-22 --data-frequency=daily --capital-base=500000')
     12 
     13     corr_value, correlated_final_pairs, correlated_final_pairs_name = find_correlated_pairs(prices, value_in_Series)

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2158         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2159         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2160         return self.run_line_magic(magic_name, magic_arg_s)
   2161 
   2162     #-------------------------------------------------------------------------

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2079                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2080             with self.builtin_trap:
-> 2081                 result = fn(*args,**kwargs)
   2082             return result
   2083 

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/zipline/__main__.pyc in zipline_magic(line, cell)
    309             '%s%%zipline' % ((cell or '') and '%'),
    310             # don't use system exit and propogate errors to the caller
--> 311             standalone_mode=False,
    312         )
    313     except SystemExit as e:

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/click/core.pyc in main(self, args, prog_name, complete_var, standalone_mode, **extra)
    695             try:
    696                 with self.make_context(prog_name, args, **extra) as ctx:
--> 697                     rv = self.invoke(ctx)
    698                     if not standalone_mode:
    699                         return rv

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/click/core.pyc in invoke(self, ctx)
    893         """
    894         if self.callback is not None:
--> 895             return ctx.invoke(self.callback, **ctx.params)
    896 
    897 

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/click/core.pyc in invoke(*args, **kwargs)
    533         with augment_usage_errors(self):
    534             with ctx:
--> 535                 return callback(*args, **kwargs)
    536 
    537     def forward(*args, **kwargs):

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/click/decorators.pyc in new_func(*args, **kwargs)
     15     """
     16     def new_func(*args, **kwargs):
---> 17         return f(get_current_context(), *args, **kwargs)
     18     return update_wrapper(new_func, f)
     19 

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/zipline/__main__.pyc in run(ctx, algofile, algotext, define, data_frequency, capital_base, bundle, bundle_timestamp, start, end, output, trading_calendar, print_algo, metrics_set, local_namespace, blotter)
    274         local_namespace=local_namespace,
    275         environ=os.environ,
--> 276         blotter=blotter,
    277     )
    278 

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/zipline/utils/run_algo.pyc in _run(handle_data, initialize, before_trading_start, analyze, algofile, algotext, defines, data_frequency, capital_base, data, bundle, bundle_timestamp, start, end, output, trading_calendar, print_algo, metrics_set, local_namespace, environ, blotter)
    157             trading_calendar=trading_calendar,
    158             trading_day=trading_calendar.day,
--> 159             trading_days=trading_calendar.schedule[start:end].index,
    160         )
    161         first_trading_day =\

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/zipline/finance/trading.pyc in __init__(self, load, bm_symbol, exchange_tz, trading_calendar, trading_day, trading_days, asset_db_path, future_chain_predicates, environ)
    101             trading_day,
    102             trading_days,
--> 103             self.bm_symbol,
    104         )
    105 

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/zipline/data/loader.pyc in load_market_data(trading_day, trading_days, bm_symbol, environ)
    154         last_date,
    155         now,
--> 156         environ,
    157     )
    158 

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/zipline/data/loader.pyc in ensure_treasury_data(symbol, first_date, last_date, now, environ)
    263 
    264     data = _load_cached_data(filename, first_date, last_date, now, 'treasury',
--> 265                              environ)
    266     if data is not None:
    267         return data

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/zipline/data/loader.pyc in _load_cached_data(filename, first_date, last_date, now, resource_name, environ)
    321         try:
    322             data = from_csv(path)
--> 323             if has_data_for_dates(data, first_date, last_date):
    324                 return data
    325 

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/zipline/data/loader.pyc in has_data_for_dates(series_or_df, first_date, last_date)
     84     if not isinstance(dts, pd.DatetimeIndex):
     85         raise TypeError("Expected a DatetimeIndex, but got %s." % type(dts))
---> 86     first, last = dts[[0, -1]]
     87     return (first <= first_date) and (last >= last_date)
     88 

/home/mosfiqur/.conda/envs/env_zipline/lib/python2.7/site-packages/pandas/core/indexes/datetimelike.pyc in __getitem__(self, key)
    294             attribs['freq'] = freq
    295 
--> 296             result = getitem(key)
    297             if result.ndim > 1:
    298                 # To support MPL which performs slicing with 2 dim

IndexError: index 0 is out of bounds for axis 1 with size 0
davidstyers commented 5 years ago

Make sure you are using pandas-datareader==0.6.0.

kayveesin commented 5 years ago

This happened with me as well. The code was running perfectly before. I am trying to run a backtest on custom csv data through pandas panel. @Dstyers I am on pandas-datareader==0.6.0

Below is the code which was running fine earlier.

# Build an OrderedDict of per-ticker daily OHLCV frames from intraday CSV
# dumps, skipping weekends and NSE holidays.
# NOTE(review): assumes glob, datetime, timedelta, pd, np and OrderedDict
# are imported elsewhere in the script.
files1 = glob.glob("/Users/karanveersingh/Downloads/IntradayData_2018/*.txt")
startDate = datetime(2018, 1, 1)
endDate = datetime(2018, 12, 31)
nb_days = (endDate - startDate).days + 1  # + 1 because range is exclusive
# Every calendar day of 2018.
dates = [startDate + timedelta(days=x) for x in range(nb_days)]
holidays = []
# Saturdays (isoweekday 6) and Sundays (7) are non-trading days.
for x in dates:
    if x.isoweekday() in [6, 7]:
        holidays.append(x)

# NSE exchange holidays for 2018 (ISO date strings).
nse_holidays = ['2018-01-26',
    '2018-02-13',
    '2018-03-02',
    '2018-03-29',
    '2018-03-30',
    '2018-05-01',
    '2018-08-15',
    '2018-08-22',
    '2018-09-13',
    '2018-09-20',
    '2018-10-02',
    '2018-10-18',
    #'2018-11-07', NSE is open on diwali holiday
    '2018-11-08',
    '2018-11-23',
    '2018-12-25',]

holidays.extend([datetime.strptime(x, '%Y-%m-%d') for x in nse_holidays])

data = OrderedDict()
# Only files 100..199 are loaded -- presumably to keep the universe small.
for file in files1[100:200]:
    final_data = pd.read_csv(file,
                               names=['Stock', 'Date', 'Time', 'open', 'high', 'low', 'close', 'volume', 'Something'])

    # Drop the leading 'Stock' and trailing 'Something' columns.
    final_data.drop(final_data.columns[[0,8]], inplace=True, axis=1)
    #final_data.rename(columns={'Close': stock[:len(stock) - 7]}, inplace=True)
    #final_data['Date'] = 
    # Index by combined Date+Time, parsed as UTC timestamps.
    final_data.index = pd.to_datetime(final_data['Date'].astype(str) + ' ' + final_data['Time'], utc=True)
    # Drop the now-redundant 'Date' and 'Time' columns.
    final_data.drop(final_data.columns[[0, 1]], inplace=True, axis=1)
    #final_data.index = final_data.index-pd.offsets.Minute(1)
    #final_data = final_data.resample('15Min').apply(custom_sampler)
    final_data['open'] = final_data['open'].astype(float)
    final_data['high'] = final_data['high'].astype(float)
    final_data['low'] = final_data['low'].astype(float)
    final_data['close'] = final_data['close'].astype(float)
    final_data['volume'] = final_data['volume'].astype(int)

    # Aggregate the intraday bars into one daily OHLCV bar.
    final_data = final_data.resample('1D').agg(
    OrderedDict([
        ('open', 'first'),
        ('high', 'max'),
        ('low', 'min'),
        ('close', 'last'),
        ('volume', 'sum'),
    ]))
    #final_data['dividend'] = 0.0
    #final_data['split'] = 1.0
    # Drop weekend/holiday rows introduced by the daily resample.
    # NOTE(review): the index is UTC-aware while `holidays` holds naive
    # datetimes -- confirm the isin() comparison matches as intended.
    mask = np.logical_not(final_data.index.isin(holidays))
    final_data = final_data[mask]
    cols = ['open', 'high', 'low', 'close']
    # Forward-fill prices across days with no trades.
    final_data[cols] = final_data[cols].ffill()
    #final_data['volumne'] = final_data['volume'].fillna(0)

    # Ticker symbol = file name without directory path and extension.
    ticker = file.replace('/Users/karanveersingh/Downloads/IntradayData_2018/', '')
    ticker = ticker.replace('.txt', '')
    data[ticker] = final_data

def before_trading(context, data):
    """Pre-session hook: print the BarData object (debugging aid)."""
    print(data)

def initialize(context):
    """Algorithm setup hook; benchmark setting is currently disabled."""
    #set_benchmark(symbol('NIFTY'))
    pass

def handle_data(context, data):
    """Per-bar hook; the order/record calls are currently commented out."""
    #order(symbol('YESBANK'), 50)
    #record(APOLLOTYRE=data.current(symbol('YESBANK'), fields = 'price'))
    pass

# Run the 2018 daily backtest on the Bombay (XBOM) calendar.
# NOTE(review): `panel` is not defined in this snippet and `pytz` is not
# imported -- both must come from elsewhere in the notebook.
performace = zipline.run_algorithm(start=datetime(2018, 1, 1, 0, 0, 0, tzinfo = pytz.timezone('Asia/Calcutta')),
                                  end = datetime(2018, 12, 31, 0, 0, 0, tzinfo = pytz.timezone('Asia/Calcutta')),
                                  initialize = initialize,
                                  handle_data=handle_data,
                                  capital_base = 1000000,
                                  data_frequency = 'daily',
                                  trading_calendar = get_calendar('XBOM'),
                                  data = panel)
kayveesin commented 5 years ago

A bit of digging in the framework helped me fix this issue. When you call run_algorithm zipline downloads treasury_curves.csv from www.federalreserve.gov in loader.py. If this file is not available locally in the correct format it leads to this error. Mine had column names in it without any actual data.

Referenced from loader.py(33:40):

INDEX_MAPPING = {
    'SPY':
    (treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'),
    '^GSPTSE':
    (treasuries_can, 'treasury_curves_can.csv', 'bankofcanada.ca'),
    '^FTSE':  # use US treasuries until UK bonds implemented
    (treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'),
}

This file is cached for probably obvious reasons, to speed up processing. Deleting this file from .zipline/data and then running again should allow zipline to redownload it, which should fix this issue.

I was at the tipping point of abandoning zipline when I saw this issue.

A format check can be added before loading the cached version of these benchmark files. Something like:

def check_file_format(file):
    #check if it has all required columns and data in it

cached_treasury_file = fetch_treasury_file()

if cached_treasury_file is not None:
    format_ok = check_file_format(cached_treasury_file)

    if not format_ok:
        cached_treasury_file = download_file(file)

@freddiev4 I can add this check if you are open to my pull requests.

surajthorat commented 4 years ago

A bit of digging in the framework helped me fix this issue. When you call run_algorithm zipline downloads treasury_curves.csv from www.federalreserve.gov in loader.py. If this file is not available locally in the correct format it leads to this error. Mine had column names in it without any actual data.

Referenced from loader.py(33:40):

INDEX_MAPPING = {
    'SPY':
    (treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'),
    '^GSPTSE':
    (treasuries_can, 'treasury_curves_can.csv', 'bankofcanada.ca'),
    '^FTSE':  # use US treasuries until UK bonds implemented
    (treasuries, 'treasury_curves.csv', 'www.federalreserve.gov'),
}

This file is cached for probably obvious reasons, to speed up processing. Deleting this file from .zipline/data and then running again should allow zipline to redownload it, which should fix this issue.

I was at the tipping point of abandoning zipline when I saw this issue.

A format check can be added before loading the cached version of these benchmark files. Something like:

def check_file_format(file):
    #check if it has all required columns and data in it

cached_treasury_file = fetch_treasury_file()

if cached_treasury_file is not None:
    format_ok = check_file_format(cached_treasury_file)

    if not format_ok:
        cached_treasury_file = download_file(file)

@freddiev4 I can add this check if you are open to my pull requests.

I have the exact same problem. Can you help me with it? I don't have any treasury_curves.csv in my data folder of Zipline.

This is my code.

from datetime import datetime
from zipline.api import order, symbol, record, order_target, set_benchmark
from zipline.algorithm import TradingAlgorithm
import zipline
from trading_calendars.exchange_calendar_twentyfourhr import TwentyFourHR

ticker = 'TCS'

#code
def initialize(context):
    """Store the traded asset on the context; benchmark is disabled."""
    context.security = symbol(ticker)
#     set_benchmark(symbol(ticker))

#code
def handle_data(context, data):
    """Minute-bar hook: 25/50-period moving-average crossover strategy.

    Buys with all available cash when the fast MA is above the slow MA and
    no position is held; liquidates when the fast MA falls below the slow.
    Records MAs, price, position status and portfolio stats on every bar.
    """
    price_hist_25 = data.history(context.security, 'price', 25, '1m')
    price_hist_50 = data.history(context.security, 'price', 50, '1m')
    MA1 = price_hist_25.mean()  # fast (25-minute) moving average
    MA2= price_hist_50.mean()  # slow (50-minute) moving average
    print(price_hist_25.head())
    print(price_hist_50.head())
    current_price = data.current(context.security, 'price')
    current_positions = context.portfolio.positions[symbol(ticker)].amount
    cash = context.portfolio.cash
    value = context.portfolio.portfolio_value
    current_pnl = context.portfolio.pnl
    #code (this will come under handle_data function only)
    # Golden cross with no open position: invest all available cash.
    if (MA1 > MA2) and current_positions == 0:
        number_of_shares = int(cash/current_price)
        order(context.security, number_of_shares)
        record(MA1 = MA1, MA2 = MA2, Price=current_price,status="buy",shares=number_of_shares,PnL=current_pnl,cash=cash,value=value)
    # Death cross while holding: close the position.
    elif (MA1 < MA2) and current_positions != 0:
         order_target(context.security, 0)
         record(MA1 = MA1, MA2 = MA2, Price= current_price,status="sell",shares="--",PnL=current_pnl,cash=cash,value=value)
    else:
        record(MA1 = MA1, MA2 = MA2, Price= current_price,status="--",shares="--",PnL=current_pnl,cash=cash,value=value)

# Run a two-day minute-frequency backtest on a 24-hour calendar.
# NOTE(review): `panel` is not defined in this snippet and `pytz` is not
# imported here -- both must come from elsewhere.
perf = zipline.run_algorithm(start=datetime(2019, 10, 14, 3, 45, 0, 0, pytz.utc),
                              end=datetime(2019, 10, 15, 9, 59, 0, 0, pytz.utc),
                              initialize=initialize,
                              capital_base=100000,
                              handle_data=handle_data,
                              trading_calendar=TwentyFourHR(),
                              data_frequency ='minute',
                              data=panel)

This is the error I am getting.

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-10-7f6751593a27> in <module>
     45                               trading_calendar=TwentyFourHR(),
     46                               data_frequency ='minute',
---> 47                               data=panel)

/usr/local/lib/python3.5/site-packages/zipline/utils/run_algo.py in run_algorithm(start, end, initialize, capital_base, handle_data, before_trading_start, analyze, data_frequency, data, bundle, bundle_timestamp, trading_calendar, metrics_set, default_extension, extensions, strict_extensions, environ, blotter)
    428         local_namespace=False,
    429         environ=environ,
--> 430         blotter=blotter,
    431     )

/usr/local/lib/python3.5/site-packages/zipline/utils/run_algo.py in _run(handle_data, initialize, before_trading_start, analyze, algofile, algotext, defines, data_frequency, capital_base, data, bundle, bundle_timestamp, start, end, output, trading_calendar, print_algo, metrics_set, local_namespace, environ, blotter)
    186             trading_calendar=trading_calendar,
    187             trading_day=trading_calendar.day,
--> 188             trading_days=trading_calendar.schedule[start:end].index,
    189         )
    190         choose_loader = None

/usr/local/lib/python3.5/site-packages/zipline/finance/trading.py in __init__(self, load, bm_symbol, exchange_tz, trading_calendar, trading_day, trading_days, asset_db_path, future_chain_predicates, environ)
    101             trading_day,
    102             trading_days,
--> 103             self.bm_symbol,
    104         )
    105 

/usr/local/lib/python3.5/site-packages/zipline/data/loader.py in load_market_data(trading_day, trading_days, bm_symbol, environ)
    154         last_date,
    155         now,
--> 156         environ,
    157     )
    158 

/usr/local/lib/python3.5/site-packages/zipline/data/loader.py in ensure_treasury_data(symbol, first_date, last_date, now, environ)
    263 
    264     data = _load_cached_data(filename, first_date, last_date, now, 'treasury',
--> 265                              environ)
    266     if data is not None:
    267         return data

/usr/local/lib/python3.5/site-packages/zipline/data/loader.py in _load_cached_data(filename, first_date, last_date, now, resource_name, environ)
    321         try:
    322             data = from_csv(path)
--> 323             if has_data_for_dates(data, first_date, last_date):
    324                 return data
    325 

/usr/local/lib/python3.5/site-packages/zipline/data/loader.py in has_data_for_dates(series_or_df, first_date, last_date)
     84     if not isinstance(dts, pd.DatetimeIndex):
     85         raise TypeError("Expected a DatetimeIndex, but got %s." % type(dts))
---> 86     first, last = dts[[0, -1]]
     87     return (first <= first_date) and (last >= last_date)
     88 

/usr/local/lib/python3.5/site-packages/pandas/core/indexes/datetimelike.py in __getitem__(self, key)
    294             attribs['freq'] = freq
    295 
--> 296             result = getitem(key)
    297             if result.ndim > 1:
    298                 # To support MPL which performs slicing with 2 dim

IndexError: index 0 is out of bounds for axis 0 with size 0
BinhKieu82 commented 2 years ago

Hi folks, I got quite a similar issue, shown below. Could anyone please share advice on it? I also followed the above conversations, but still no luck. Thank you

Sector class

class Sector(Classifier):
    """Pipeline classifier returning an integer sector code per asset.

    Codes are loaded once from ./data.npy and indexed directly by the
    `assets` array the engine passes to _compute, so the file must cover
    every such index -- an out-of-range value raises the IndexError
    reported in this thread.
    """
    dtype = int64_dtype
    window_length = 0
    inputs = ()
    missing_value = -1

    def __init__(self):
        # Precomputed sector code per asset index, loaded once.
        self.data = np.load('./data.npy')

    def _compute(self, arrays, dates, assets, mask): #mask here is 500 tickers filtered from market
        # Masked-out assets get the missing_value sentinel (-1).
        return np.where(
            mask,
            self.data[assets],
            self.missing_value,
        )

Momentum function

def momentum_1yr(window_length, universe, sector):
    """Sector-neutral momentum factor.

    Computes returns over `window_length` bars (masked to `universe`),
    demeans them within each `sector` group, ranks, and z-scores.
    """
    raw_returns = Returns(window_length=window_length, mask=universe)
    sector_neutral = raw_returns.demean(groupby=sector)
    return sector_neutral.rank().zscore()

Create & run pipeline

# Lookback window: roughly two years ending at universe_end_date.
factor_start_date = universe_end_date - pd.DateOffset(years=2, days=2)
# Universe: 500 most liquid assets by 120-day average dollar volume.
universe = AverageDollarVolume(window_length=120).top(500)
sector = Sector()

pipeline = Pipeline(screen=universe)
pipeline.add(
    momentum_1yr(252, universe, sector),
    'Momentum_1YR')

engine.run_pipeline(pipeline, factor_start_date, universe_end_date)

Then I got the following traceback:

IndexError                                Traceback (most recent call last)
<ipython-input-35-51b9aa5fd536> in <module>
     21 #     overnight_sentiment_smoothed(2, 5, universe),
---> 23 all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)
     24 
     25 all_factors.head()

e:\temp\Python36\lib\site-packages\zipline\pipeline\engine.py in run_pipeline(self, pipeline, start_date, end_date)
    309             dates,
    310             assets,
--> 311             initial_workspace,
    312         )
    313 

e:\temp\Python36\lib\site-packages\zipline\pipeline\engine.py in compute_chunk(self, graph, dates, assets, initial_workspace)
    535                     mask_dates,
    536                     assets,
--> 537                     mask,
    538                 )
    539                 if term.ndim == 2:

in _compute(self, arrays, dates, assets, mask)
     36         return np.where(
     37             mask,
---> 38             self.data[assets],
     39             self.missing_value,
     40         )

IndexError: index 500 is out of bounds for axis 0 with size 500

Versions: Vscode, Win10 Python: 3.6.8 Pandas: 0.18.1 Matplotlib: 3.3.4 Numpy: 1.17.0 Scipy: 1.0.0 Statsmodels: 0.12.2 Zipline: 1.2.0

hanisalah commented 1 year ago

Hi folks, I got quite similar issue as below. Please anyone shares any advice on that. I also followed up the above conversations but still no hope. Thank you

Sector class

class Sector(Classifier):
    """Pipeline classifier returning an integer sector code per asset.

    Codes are loaded once from ./data.npy and indexed directly by the
    `assets` array the engine passes to _compute, so the file must cover
    every such index -- an out-of-range value raises the IndexError
    reported in this thread.
    """
    dtype = int64_dtype
    window_length = 0
    inputs = ()
    missing_value = -1

    def __init__(self):
        # Precomputed sector code per asset index, loaded once.
        self.data = np.load('./data.npy')

    def _compute(self, arrays, dates, assets, mask): #mask here is 500 tickers filtered from market
        # Masked-out assets get the missing_value sentinel (-1).
        return np.where(
            mask,
            self.data[assets],
            self.missing_value,
        )

Momentum function

def momentum_1yr(window_length, universe, sector):
    """Sector-neutral momentum factor.

    Computes returns over `window_length` bars (masked to `universe`),
    demeans them within each `sector` group, ranks, and z-scores.
    """
    raw_returns = Returns(window_length=window_length, mask=universe)
    sector_neutral = raw_returns.demean(groupby=sector)
    return sector_neutral.rank().zscore()

Create & run pipeline

# Lookback window: roughly two years ending at universe_end_date.
factor_start_date = universe_end_date - pd.DateOffset(years=2, days=2)
# Universe: 500 most liquid assets by 120-day average dollar volume.
universe = AverageDollarVolume(window_length=120).top(500)
sector = Sector()

pipeline = Pipeline(screen=universe)
pipeline.add(
    momentum_1yr(252, universe, sector),
    'Momentum_1YR')

engine.run_pipeline(pipeline, factor_start_date, universe_end_date)

Then I got the following traceback:

IndexError                                Traceback (most recent call last)
<ipython-input-35-51b9aa5fd536> in <module>
     21 #     overnight_sentiment_smoothed(2, 5, universe),
---> 23 all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)
     24 
     25 all_factors.head()

e:\temp\Python36\lib\site-packages\zipline\pipeline\engine.py in run_pipeline(self, pipeline, start_date, end_date)
    309             dates,
    310             assets,
--> 311             initial_workspace,
    312         )
    313 

e:\temp\Python36\lib\site-packages\zipline\pipeline\engine.py in compute_chunk(self, graph, dates, assets, initial_workspace)
    535                     mask_dates,
    536                     assets,
--> 537                     mask,
    538                 )
    539                 if term.ndim == 2:

in _compute(self, arrays, dates, assets, mask)
     36         return np.where(
     37             mask,
---> 38             self.data[assets],
     39             self.missing_value,
     40         )

IndexError: index 500 is out of bounds for axis 0 with size 500

Versions: Vscode, Win10 Python: 3.6.8 Pandas: 0.18.1 Matplotlib: 3.3.4 Numpy: 1.17.0 Scipy: 1.0.0 Statsmodels: 0.12.2 Zipline: 1.2.0

I am facing exactly the same issue, with the only difference that I already build my 'sectors.npy' as a numpy array of 500 entries (500 symbols), and I earlier defined my pipeline universe as AverageDollarVolume(window_length=120).top(500). However, when I print the size of mask in the Sector class, I get 3175 columns! I have explained my issue in detail in https://stackoverflow.com/questions/76931864/zipline-reloaded-demean-by-sector-is-not-working-correctly Hope someone can help us out with this.

hanisalah commented 1 year ago

Hi folks, I got quite similar issue as below. Please anyone shares any advice on that. I also followed up the above conversations but still no hope. Thank you Sector class

class Sector(Classifier):
    """Pipeline classifier returning an integer sector code per asset.

    Codes are loaded once from ./data.npy and indexed directly by the
    `assets` array the engine passes to _compute, so the file must cover
    every such index -- an out-of-range value raises the IndexError
    reported in this thread.
    """
    dtype = int64_dtype
    window_length = 0
    inputs = ()
    missing_value = -1

    def __init__(self):
        # Precomputed sector code per asset index, loaded once.
        self.data = np.load('./data.npy')

    def _compute(self, arrays, dates, assets, mask): #mask here is 500 tickers filtered from market
        # Masked-out assets get the missing_value sentinel (-1).
        return np.where(
            mask,
            self.data[assets],
            self.missing_value,
        )

Momentum function

def momentum_1yr(window_length, universe, sector):
    """Sector-neutral momentum factor.

    Computes returns over `window_length` bars (masked to `universe`),
    demeans them within each `sector` group, ranks, and z-scores.
    """
    raw_returns = Returns(window_length=window_length, mask=universe)
    sector_neutral = raw_returns.demean(groupby=sector)
    return sector_neutral.rank().zscore()

Create & run pipeline

# Lookback window: roughly two years ending at universe_end_date.
factor_start_date = universe_end_date - pd.DateOffset(years=2, days=2)
# Universe: 500 most liquid assets by 120-day average dollar volume.
universe = AverageDollarVolume(window_length=120).top(500)
sector = Sector()

pipeline = Pipeline(screen=universe)
pipeline.add(
    momentum_1yr(252, universe, sector),
    'Momentum_1YR')

engine.run_pipeline(pipeline, factor_start_date, universe_end_date)

Then I got the following traceback:

IndexError                                Traceback (most recent call last)
<ipython-input-35-51b9aa5fd536> in <module>
     21 #     overnight_sentiment_smoothed(2, 5, universe),
---> 23 all_factors = engine.run_pipeline(pipeline, factor_start_date, universe_end_date)
     24 
     25 all_factors.head()

e:\temp\Python36\lib\site-packages\zipline\pipeline\engine.py in run_pipeline(self, pipeline, start_date, end_date)
    309             dates,
    310             assets,
--> 311             initial_workspace,
    312         )
    313 

e:\temp\Python36\lib\site-packages\zipline\pipeline\engine.py in compute_chunk(self, graph, dates, assets, initial_workspace)
    535                     mask_dates,
    536                     assets,
--> 537                     mask,
    538                 )
    539                 if term.ndim == 2:

in _compute(self, arrays, dates, assets, mask)
     36         return np.where(
     37             mask,
---> 38             self.data[assets],
     39             self.missing_value,
     40         )

IndexError: index 500 is out of bounds for axis 0 with size 500

Versions: Vscode, Win10 Python: 3.6.8 Pandas: 0.18.1 Matplotlib: 3.3.4 Numpy: 1.17.0 Scipy: 1.0.0 Statsmodels: 0.12.2 Zipline: 1.2.0

I am facing exactly the same issue, with the only difference that I already build my 'sectors.npy' as a numpy array of 500 entries (500 symbols), and I earlier defined my pipeline universe as AverageDollarVolume(window_length=120).top(500). However, when I print the size of mask in the Sector class, I get 3175 columns! I have explained my issue in detail in https://stackoverflow.com/questions/76931864/zipline-reloaded-demean-by-sector-is-not-working-correctly Hope someone can help us out with this.

I think I found the answer, and I posted also in https://stackoverflow.com/questions/76931864/zipline-reloaded-demean-by-sector-is-not-working-correctly

I think I found the answer, and I hope someone can verify. The solution lies in the Sector() print functions. Those prints tell us that the Sector(Classifier) is being passed assets of shape (3175,) and mask of shape (505,3175). This means that all assets (not just the top 500) are being passed to the classifier, and it would be logical then that the compute function returns an index error.

So, what I did is that I re-wrote the part to get the Sector information as will be shown below. Another trick there was that the assets while of shape (3175,), the maximum value in the assets was 3198. Indeed though my sector acquiring function is looking only for the top 500 symbols, their index ranged from 0 to 3197. So the line np.zeros(max(ix)+1) returns an np array of size 3198 (indexed from 0 to 3197). If maximum assets index is 3198 and is used in the Sector(classifier), it will return error then because data[3198] is out of bounds. So, I simply modify the line to be np.zeros(max(ix)+2), so that indices work.

Below are my sector acquiring function, and the classifier class that don't produce an error any more. I hope if someone with more zipline insight to have a look and advise if indexing in this manner (while it works syntactically) would be semantically correct.

Sector Acquiring Function

def get_sector(sym):
    """Resolve the sector name for one symbol.

    Parameters
    ----------
    sym : list
        Two-element [symbol_seq, symbol_name] pair.

    Returns
    -------
    list
        [symbol_seq, symbol_name, sector_name]; the sector is the string
        'NoSector' when the lookup fails for any reason.
    """
    try:
        sector = yf.Ticker(sym[1]).info['sector'] #use yahoo finance to get sector information
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; Exception still covers network errors and a missing
        # 'sector' key.
        sector = 'NoSector'
    return [sym[0], sym[1], sector]

def build_sectors(prices, short_list=True):
    """Build an integer sector-id array for the assets in ``prices``.

    Sector ids are read from a ``sectors.csv`` cache when it matches the
    current dataset; any read or validation failure triggers a fresh
    threaded Yahoo Finance fetch (via ``get_sector``) and rewrites the
    cache.

    Parameters
    ----------
    prices : DataFrame whose columns render as 'Equity(n [SYMBOL])';
        both the index ``n`` and the symbol name are parsed from the
        column's string form.
    short_list : bool
        True  -> one sector id per column, in column order.
        False -> array indexed by equity index ``n`` and sized
                 ``max(n) + 2``, with -1 where no sector is known.

    Returns
    -------
    np.ndarray of int sector ids; -1 marks an unknown ('NoSector') sector.
    """
    tickers = prices.columns.values.tolist() #tickers from prices columns in the form of 'Equity n ([symbol])'
    tick_seq = [int(str(t).split('(')[1].split('[')[0]) for t in tickers] #get list of equity index (n)
    tick_name = [str(t).split('[')[1][:-2] for t in tickers] #get list of symbol names
    tick_tuple = list(zip(tick_seq,tick_name)) #collect index and symbol name to list of tuples
    tick_list = [list(i) for i in tick_tuple] #convert list of tuples to list of lists

    try:
        # Fast path: reuse the cached sector table if it matches this dataset.
        df = pd.read_csv('sectors.csv')
        ix = df['symbol_seq'].tolist()
        val = df['sec_id'].tolist()
        sym_name = df['symbol'].tolist()
        if sym_name != tick_name:
            raise Exception('Symbol names stored on file are different from your dataset')
        if short_list == False:
            sym_seq = df['symbol_seq'].tolist()
            if sym_seq != tick_seq:
                raise Exception('Symbol sequences stored on file are different from your dataset')
    # NOTE(review): bare except also hides KeyboardInterrupt/SystemExit and
    # any unexpected bug above -- consider narrowing.
    except:
        with ThreadPoolExecutor() as t:
            sectors = list(t.map(get_sector, tick_list)) #returns list of lists, each inner list is [index, symbol, sector]

        sectors_set = set([s[2] for s in sectors]) #get set of sectors
        # NOTE(review): raises KeyError if every lookup succeeded -- confirm.
        sectors_set.remove('NoSector')
        sectors_set = {v:i for i,v in enumerate(sorted(sectors_set))} #assign a number for each sector
        sectors_set['NoSector']=-1 # identify an unfound sector with -1
        tmp = [s.append(sectors_set[s[2]]) for s in sectors] #append sector id number to each list; it becomes [index, symbol, sector, sec_no]
        ix = [int(s[0]) for s in sectors] #extract again the indices of the symbols
        val = [int(s[-1]) for s in sectors] #extract the sec_no for each index
        df = pd.DataFrame({'symbol_seq':[s[0] for s in sectors], 'symbol':[s[1] for s in sectors],
                   'sector':[s[2] for s in sectors], 'sec_id': [s[3] for s in sectors]})
        df.to_csv('sectors.csv')

    if short_list:
        sectors_np = np.array(val)
    else:
        sectors_np = np.zeros(max(ix)+2) #since equity indices are not sequential, we take the max found
        sectors_np[:]=-1 #fill all as not found
        sectors_np[ix]=val #fill the found sectors at their corresponding symbol index
    sectors_np = sectors_np.astype(int) #convert the array to integer
    return sectors_np

Sector Classifier Class

class Sector(Classifier):
    """Pipeline classifier returning a per-asset integer sector code.

    Uses build_sectors(..., short_list=False), which sizes the array by
    the maximum equity index (+2), so any index the engine passes in
    `assets` stays in bounds.
    """
    dtype = int64_dtype
    window_length = 0
    inputs = ()
    missing_value = -1

    def __init__(self):
        # Full index-addressed sector array, not just the top-500 subset.
        self.data = build_sectors(prices, short_list=False)

    def _compute(self, arrays, dates, assets, mask):
        # Assets outside the screen mask get missing_value (-1).
        return np.where(mask, self.data[assets], self.missing_value)