blaze / castra

Partitioned storage system based on blosc. **No longer actively maintained.**
BSD 3-Clause "New" or "Revised" License
153 stars 21 forks source link

df.to_castra breaking with pandas v0.18.0rc1 #58

Closed jreback closed 8 years ago

jreback commented 8 years ago

This reproduction is taken almost verbatim from the dask tutorial notebook: https://github.com/dask/dask-tutorial/blob/master/03b-DataFrame-Storage.ipynb

note I am using python 3.5

In [18]: dask.__version__
Out[18]: '0.8.0'

In [19]: pd.__version__
Out[19]: u'0.18.0rc1'

In [22]: castra.__version__
Out[22]: '0.1.6'

In [6]: from prep import accounts_csvs

In [7]: accounts_csvs(3, 1000000, 500)

In [8]: import dask.dataframe as dd

In [10]: import os

In [11]: filename = os.path.join('data', 'accounts.*.csv')

In [12]: filename
Out[12]: 'data/accounts.*.csv'

In [13]: df = dd.read_csv(filename)

In [14]: df.head()
Out[14]: 
    id   names  amount
0  171   Laura     533
1   69   Alice     112
2  130   Sarah     259
3  313  George     -56
4  202     Ray    2205

In [15]: c = df.to_castra('accounts.castra', categories=['names'])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-15-4e0e632fd837> in <module>()
----> 1 c = df.to_castra('accounts.castra', categories=['names'])

/Users/jreback/miniconda/lib/python2.7/site-packages/dask/dataframe/core.pyc in to_castra(self, fn, categories, sorted_index_column, compute)
   1440         from .io import to_castra
   1441         return to_castra(self, fn, categories, sorted_index_column,
-> 1442                          compute=compute)
   1443 
   1444     def to_bag(self, index=False):

/Users/jreback/miniconda/lib/python2.7/site-packages/dask/dataframe/io.pyc in to_castra(df, fn, categories, sorted_index_column, compute)
    798     keys = [(name, -1), (name, df.npartitions - 1)]
    799     if compute:
--> 800         c, _ = DataFrame._get(dsk, keys, get=get_sync)
    801         return c
    802     else:

/Users/jreback/miniconda/lib/python2.7/site-packages/dask/base.pyc in _get(cls, dsk, keys, get, **kwargs)
     41         get = get or _globals['get'] or cls._default_get
     42         dsk2 = cls._optimize(dsk, keys, **kwargs)
---> 43         return get(dsk2, keys, **kwargs)
     44 
     45     @classmethod

/Users/jreback/miniconda/lib/python2.7/site-packages/dask/async.pyc in get_sync(dsk, keys, **kwargs)
/Users/jreback/miniconda/lib/python2.7/site-packages/dask/async.pyc in fire_task()
    456         # Submit
    457         apply_async(execute_task, args=[key, dsk[key], data, queue,
--> 458                                         get_id, raise_on_exception])
    459 
    460     # Seed initial tasks into the thread pool

/Users/jreback/miniconda/lib/python2.7/site-packages/dask/async.pyc in apply_sync(func, args, kwds)
    506 def apply_sync(func, args=(), kwds={}):
    507     """ A naive synchronous version of apply_async """
--> 508     return func(*args, **kwds)
    509 
    510 

/Users/jreback/miniconda/lib/python2.7/site-packages/dask/async.pyc in execute_task(key, task, data, queue, get_id, raise_on_exception)
    262     """
    263     try:
--> 264         result = _execute_task(task, data)
    265         id = get_id()
    266         result = key, result, None, id

/Users/jreback/miniconda/lib/python2.7/site-packages/dask/async.pyc in _execute_task(arg, cache, dsk)
    244         func, args = arg[0], arg[1:]
    245         args2 = [_execute_task(a, cache) for a in args]
--> 246         return func(*args2)
    247     elif not ishashable(arg):
    248         return arg

/Users/jreback/miniconda/lib/python2.7/site-packages/castra/core.pyc in __init__(self, path, template, categories, readonly)
    118 
    119             self.partitions = pd.Series([], dtype='O',
--> 120                                         index=template2.index.__class__([]))
    121             self.minimum = None
    122 

/Users/jreback/miniconda/lib/python2.7/site-packages/pandas/indexes/range.py in __new__(cls, start, stop, step, name, dtype, fastpath, copy, **kwargs)
     70             start = 0
     71         else:
---> 72             start = _ensure_int(start, 'start')
     73         if stop is None:
     74             stop = start

/Users/jreback/miniconda/lib/python2.7/site-packages/pandas/indexes/range.py in _ensure_int(value, field)
     56         def _ensure_int(value, field):
     57             try:
---> 58                 new_value = int(value)
     59                 assert(new_value == value)
     60             except (ValueError, AssertionError):

TypeError: int() argument must be a string or a number, not 'list'

jreback commented 8 years ago

cc @mrocklin @jcrist

jcrist commented 8 years ago

This was fixed in #55. Try git master; it should work fine there.

jreback commented 8 years ago

Thanks @jcrist, it works nicely.

Can you do a release? (Making it available as a conda package would be great as well.)

jreback commented 8 years ago

@jcrist can you do a release of castra?

jcrist commented 8 years ago

Castra 0.1.7 has been released. Closing.

jreback commented 8 years ago

thanks Jim!