Can you show the output of
ptdump -av
Also, please test with master/0.13 to see if it reproduces there.
Edit: will try a new build of 0.13; see the dump below
ptdump -av iteration_test3.h5
/ (RootGroup) ''
/._v_attrs (AttributeSet), 4 attributes:
[CLASS := 'GROUP',
PYTABLES_FORMAT_VERSION := '2.1',
TITLE := '',
VERSION := '1.0']
/store_0 (Group) ''
/store_0._v_attrs (AttributeSet), 14 attributes:
[CLASS := 'GROUP',
TITLE := '',
VERSION := '1.0',
data_columns := ['a1007', 'a1014', 'a1006', 'a1005', 'a1004', 'a1003', 'a1012', 'a1013', 'r6', 'o', 'a_n_8', 'r1', 'r2', 'r3', 'r4', 'r5'],
encoding := None,
index_cols := [(0, 'index')],
info := {'index': {}},
levels := 1,
nan_rep := 'nan',
non_index_axes := [(1, ['a1007', 'a1014', 'a1006', 'a1005', 'a1004', 'a1003', 'a1012', 'a1013', 'r6', 'o', 'a_n_8', 'r1', 'r2', 'r3', 'r4', 'r5', u'm102', u'm103', u'm101', u'm106', u'm107', u'm104', u'm108', u'm109', u'm110', u'm105_v', u'm105_b'])],
pandas_type := 'frame_table',
pandas_version := '0.10.1',
table_type := 'appendable_frame',
values_cols := ['values_block_0', 'a1007', 'a1014', 'a1006', 'a1005', 'a1004', 'a1003', 'a1012', 'a1013', 'r6', 'o', 'a_n_8', 'r1', 'r2', 'r3', 'r4', 'r5']]
/store_0/table (Table(6708525,), shuffle, blosc(9)) ''
description := {
"index": Int64Col(shape=(), dflt=0, pos=0),
"values_block_0": Float64Col(shape=(11,), dflt=0.0, pos=1),
"a1007": Int64Col(shape=(), dflt=0, pos=2),
"a1014": Int64Col(shape=(), dflt=0, pos=3),
"a1006": Int64Col(shape=(), dflt=0, pos=4),
"a1005": Int64Col(shape=(), dflt=0, pos=5),
"a1004": Int64Col(shape=(), dflt=0, pos=6),
"a1003": Int64Col(shape=(), dflt=0, pos=7),
"a1012": Int64Col(shape=(), dflt=0, pos=8),
"a1013": Int64Col(shape=(), dflt=0, pos=9),
"r6": Int64Col(shape=(), dflt=0, pos=10),
"o": Int64Col(shape=(), dflt=0, pos=11),
"a_n_8": Int64Col(shape=(), dflt=0, pos=12),
"r1": Int64Col(shape=(), dflt=0, pos=13),
"r2": Int64Col(shape=(), dflt=0, pos=14),
"r3": Int64Col(shape=(), dflt=0, pos=15),
"r4": Int64Col(shape=(), dflt=0, pos=16),
"r5": Int64Col(shape=(), dflt=0, pos=17)}
byteorder := 'little'
chunkshape := (2340,)
autoindex := True
colindexes := {
"r5": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"a_n_8": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"a1003": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"a1007": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"a1006": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"a1005": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"a1004": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"a1014": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"a1012": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"a1013": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"r4": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"index": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"r6": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"r1": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"r2": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"r3": Index(6, medium, shuffle, zlib(1)).is_csi=False,
"o": Index(6, medium, shuffle, zlib(1)).is_csi=False}
/store_0/table._v_attrs (AttributeSet), 75 attributes:
[CLASS := 'TABLE',
FIELD_0_FILL := 0,
FIELD_0_NAME := 'index',
FIELD_10_FILL := 0,
FIELD_10_NAME := 'r6',
FIELD_11_FILL := 0,
FIELD_11_NAME := 'o',
FIELD_12_FILL := 0,
FIELD_12_NAME := 'a_n_8',
FIELD_13_FILL := 0,
FIELD_13_NAME := 'r1',
FIELD_14_FILL := 0,
FIELD_14_NAME := 'r2',
FIELD_15_FILL := 0,
FIELD_15_NAME := 'r3',
FIELD_16_FILL := 0,
FIELD_16_NAME := 'r4',
FIELD_17_FILL := 0,
FIELD_17_NAME := 'r5',
FIELD_1_FILL := 0.0,
FIELD_1_NAME := 'values_block_0',
FIELD_2_FILL := 0,
FIELD_2_NAME := 'a1007',
FIELD_3_FILL := 0,
FIELD_3_NAME := 'a1014',
FIELD_4_FILL := 0,
FIELD_4_NAME := 'a1006',
FIELD_5_FILL := 0,
FIELD_5_NAME := 'a1005',
FIELD_6_FILL := 0,
FIELD_6_NAME := 'a1004',
FIELD_7_FILL := 0,
FIELD_7_NAME := 'a1003',
FIELD_8_FILL := 0,
FIELD_8_NAME := 'a1012',
FIELD_9_FILL := 0,
FIELD_9_NAME := 'a1013',
NROWS := 6708525,
TITLE := '',
VERSION := '2.7',
a1003_dtype := 'int64',
a1003_kind := ['a1003'],
a1004_dtype := 'int64',
a1004_kind := ['a1004'],
a1005_dtype := 'int64',
a1005_kind := ['a1005'],
a1006_dtype := 'int64',
a1006_kind := ['a1006'],
a1007_dtype := 'int64',
a1007_kind := ['a1007'],
a1012_dtype := 'int64',
a1012_kind := ['a1012'],
a1013_dtype := 'int64',
a1013_kind := ['a1013'],
a1014_dtype := 'int64',
a1014_kind := ['a1014'],
a_n_8_dtype := 'int64',
a_n_8_kind := ['a_n_8'],
index_kind := 'integer',
o_dtype := 'int64',
o_kind := ['o'],
r1_dtype := 'int64',
r1_kind := ['r1'],
r2_dtype := 'int64',
r2_kind := ['r2'],
r3_dtype := 'int64',
r3_kind := ['r3'],
r4_dtype := 'int64',
r4_kind := ['r4'],
r5_dtype := 'int64',
r5_kind := ['r5'],
r6_dtype := 'int64',
r6_kind := ['r6'],
values_block_0_dtype := 'float64',
values_block_0_kind := [u'm101', u'm102', u'm103', u'm104', u'm105_b', u'm105_v', u'm106', u'm107', u'm108', u'm109', u'm110']]
So at a glance it looks like the iteration is selecting incorrect indices and thus incorrect values (which is what you see). Let me see if I can create an example that reproduces it.
I'm building the latest master at the moment to test with that; if you want, I can mail you an sftp link to the .h5 file.
sure
jeff@reback.net
Small FYI: the 0.13 master gives an error because of where = [('o', '=', [-15534, -16280, -17113, -14786, -14790, -18074])]
File "/srv/www/li/venv/local/lib/python2.7/site-packages/pandas-0.13.0_120_gdd89ce4-py2.7-linux-x86_64.egg/pandas/computation/pytables.py", line 507, in __init__
where = ' & ' .join(["(%s)" % w for w in where])
TypeError: not all arguments converted during string formatting
So I changed it to:
where = 'o == [-15534, -16280, -17113, -14786, -14790, -18074]'
It still gives the same faulty selection in the 6th iteration, however, while the other iterations come out correct.
I've mailed the sftp access
Got it, will take a look.
FYI, your 0.12 way of specifying the where condition would have worked if it had been part of a Term.
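For reference, a rough sketch of the two spellings being discussed; the Term constructor shown is the assumed 0.12-era form and may not match later versions exactly:

import pandas as pd

values = [-15534, -16280, -17113, -14786, -14790, -18074]

# 0.13-style string expression (the form that worked above)
where_str = 'o == %s' % values

# assumed 0.12-era Term wrapping of the same condition (signature may differ by version)
where_term = pd.Term('o', '=', values)

# either would then be passed to select, e.g.
# fact_hdf.select('store_0', columns=['o', 'm101'], where=where_str)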
This might be a bug in PyTables... I will create an issue there. I have a small program that needs your data to reproduce the issue (I can't create it otherwise).
will link back in a minute
linked issue: https://github.com/PyTables/PyTables/issues/319
@CarstVaartjes If you could provide a login/password to the file, it would be appreciated (the PyTables guys might post on either issue).
You can share the login/password I gave you with the PyTables guys no problem.
Can you provide details on how you constructed this file in the first place?
Please enumerate the steps/code as much as you can.
Hi,
Basically, what I made is a way to handle DataFrames automatically as the backend for an analytics system. To give an idea: I started a company that tries to solve very specific issues for consumer goods manufacturers and retailers (such as trade promotion and basket analyses) in a (very cost-effective) SAAS environment. The chunked read is important for us, as some files can be really big (1+ billion records). The file you have is based on actual customer data (I cannot share the original file because it contains margins and other sensitive information). So how does it work:
In itself it is quite straightforward and normally quite efficient and reliable (the behaviour of this file is not something we run into regularly). I can reproduce the problem when creating the .h5 file from the source csv, but you can do it too: basically read the file into a pandas dataframe and save it with blosc at complevel 9, and the error reproduces. I tried some checks with fewer columns and then the problem did not recur. But as you can see, when you read the file all at once, or with a chunksize that fits the entire table, you do not get the error but correct data. So I think we really have to use this file as-is to understand what is going wrong there.
p.s. (not sure if you remember, but I'm also looking into saving a dataframe as a columnar table type; it would really help me too, as my data is really well suited to it)
Can you give some more specific code? As much as possible; I am trying to see why constructing the index fails.
This should help you reproduce it; this is exactly how I write the file now:
import pandas as pd


def df_to_hdf(store_path, store_name, input_df, standard_index=True, expected_rows=10000000):
    '''
    expected_rows is important
    We now expect dataframes to be consistent input (no demixing of string columns needed)
    '''
    # natural name enforcing
    store_name = store_name.replace('-', '_n_')
    input_df = df_to_natural_name(input_df)
    # save
    with pd.get_store(path=store_path, mode='a', complib='blosc', complevel=9) as store_hdf:
        # check which columns need indexes
        if standard_index:
            index_type = []
            for column in list(input_df.columns):
                if column[0:1] != 'm':
                    index_type.append(column)
        else:
            index_type = True
        # up the index numeration to ensure uniqueness across appends
        try:
            nrows = store_hdf.get_storer(store_name).nrows
        except:
            nrows = 0
        input_df = input_df.reset_index(drop=True)
        input_df.index = pd.Series(input_df.index) + nrows
        # append to the hdf5 table
        store_hdf.append(store_name, input_df, data_columns=index_type, expectedrows=expected_rows)
def df_to_natural_name(input_df):
    column_rename = {}
    for column in list(input_df.columns):
        if '-' in column:
            column_rename[column] = column.replace('-', '_n_')
    if column_rename:
        input_df = input_df.rename(columns=column_rename)
    return input_df
def copy_hdf5(input_file, output_file):
    # input
    print 'Retrieve'
    fact_hdf = pd.HDFStore(input_file, mode='r')
    store_name = 'store_0'
    copy_df = fact_hdf[store_name]
    fact_hdf.close()
    # output
    # clean store
    print 'Clean'
    fact_hdf = pd.HDFStore(output_file, mode='w')
    fact_hdf.close()
    # save df
    print 'Save'
    df_to_hdf(output_file, store_name, copy_df)
def check_hdf5(input_file):
    # prepare
    fact_hdf = pd.HDFStore(input_file, mode='r')
    store_name = 'store_0'
    column_list = ['o', 'm101']
    where = [('o', '=', [-15534, -16280, -17113, -14786, -14790, -18074])]
    # describe
    print 'Contents'
    print fact_hdf[store_name]
    # non iterated
    print 'Non Iterated:'
    print fact_hdf.select(store_name, columns=column_list, where=where)['o'].unique()
    # iterated by 1 mln
    chunksize = 1000000  # 1 mln
    print 'Iteration Size: ', chunksize
    for sub_df in fact_hdf.select(store_name, columns=column_list, where=where, iterator=True, chunksize=chunksize):
        print sub_df['o'].unique()
    # iterated by 10 mln (which fits the entire dataframe)
    chunksize = 10000000  # 10 mln
    print 'Iteration Size: ', chunksize
    for sub_df in fact_hdf.select(store_name, columns=column_list, where=where, iterator=True, chunksize=chunksize):
        print sub_df['o'].unique()
    # finish
    fact_hdf.close()
def test():
    print 'Copy'
    copy_hdf5('iteration_test3.h5', 'iteration_test4.h5')
    print 'Check input'
    check_hdf5('iteration_test3.h5')
    print 'Check output'
    check_hdf5('iteration_test4.h5')
@CarstVaartjes thanks....I found a possible cause and created a reproducible example (see the pytables issue).
Don't pass expectedrows
and it should work just fine. There must be a very subtle bug in the interaction between a larger-than-actual table size at creation time and index creation.
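Concretely, the workaround is just dropping the keyword from the append in df_to_hdf above (names taken from that function):

# same append as in df_to_hdf, minus expectedrows
store_hdf.append(store_name, input_df, data_columns=index_type)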
@CarstVaartjes If these are 'write-once' type files, I would just not pass expectedrows. Instead, use ptrepack
to compute the chunkshape after the fact (you have to run this as an external process, though).
ptrepack --chunkshape=auto --propindexes --keep-source-filters in.h5 out.h5
This works especially well if you are appending small amounts at a time.
YMMV with this; it may not actually do much (nor may setting expectedrows). A lot depends on exactly how you are querying.
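Since ptrepack has to run as an external process, a minimal sketch of driving it from Python (the file names are placeholders):

import subprocess

# repack so PyTables recomputes the chunkshape and rebuilds the indexes with the same filters
subprocess.check_call([
    'ptrepack', '--chunkshape=auto', '--propindexes', '--keep-source-filters',
    'in.h5', 'out.h5'])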
Furthermore, you might want to think about splitting your table up into several sub-tables; select coordinates in the 'main' table and just select the data from all of them (this is very fast). Again, it depends on what you are selecting (or your current solution may be good enough).
Thanks so much, Jeff. For the large files there was a significant difference with expectedrows set, but they tend to be a bit slow regardless :)
With the sub-tables, do you mean saving each column as a separate datastore in the h5 file and combining the indices? I have to see whether I can combine it with the columnar storage exploration (issue 2252, which I still have to find time for); making this scale/perform better would be really nice. Right now I load hot datasets into memory as pandas dataframes for performance, but it would be great if that were less necessary.
If you know the number of rows before you write your table, make expectedrows LESS than that number (I think the bug only occurs when it is GREATER).
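As a tiny sketch of that, reusing the names from df_to_hdf above:

# if the final row count is known up front, cap expectedrows at that number
n_rows = len(input_df)
store_hdf.append(store_name, input_df, data_columns=index_type, expectedrows=n_rows)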
What I mean is: try http://pandas.pydata.org/pandas-docs/dev/io.html#multiple-table-queries
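A minimal sketch of that pattern with the documented append_to_multiple / select_as_multiple API; the split into 'df_dims' / 'df_measures', the column choices, and input_df (the dataframe from df_to_hdf above) are illustrative assumptions:

import pandas as pd

with pd.get_store('multi_table.h5', mode='w', complib='blosc', complevel=9) as store:
    # write the indexed/data columns to a selector table, the measures to a second table
    store.append_to_multiple(
        {'df_dims': ['o', 'r1', 'r2'], 'df_measures': None},  # None = all remaining columns
        input_df, selector='df_dims', data_columns=['o', 'r1', 'r2'])

    # run the where against the selector table, then fetch matching rows from both tables
    result = store.select_as_multiple(
        ['df_dims', 'df_measures'], selector='df_dims',
        where='o == [-15534, -16280, -17113]')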
This was closed by the linked PyTables issue.
Linked Issue: https://github.com/PyTables/PyTables/issues/319
Hi,
I have a dataframe saved in HDF5 with 6.7 million records (about 425MB). As you can see below, it gives an incorrect result when it has to do multiple iterations (which is a serious issue for me actually). ->
Reproducing the bug can be hard (not all files have the same issue), but this specific case happens consistently (it's not a corrupted HDF5 file; if you write the dataframe out again, the problem recurs).
I can share the file with pandas/pytables developers if needed (it has to stay private though / not uploaded to a public location; message me for an sftp link)
Some technical stuff
Kind regards,
Carst