blaze / odo

Data Migration for the Blaze Project
http://odo.readthedocs.org/
BSD 3-Clause "New" or "Revised" License
1.01k stars 138 forks source link

Categoricals in pandas dataframe choke when inserting with sqlalchemy #584

Open makmanalp opened 7 years ago

makmanalp commented 7 years ago

Even though pandas to_sql handles them OK. Currently pandas converts them to a string type, eg TEXT in sqlite.

Backtrace:

NotImplementedError Traceback (most recent call last) in () ----> 1 odo("hdfstore://data.h5::/country_partner_year", "sqlite:///test.sqlite::asdf") /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/odo.py in odo(source, target, **kwargs) 89 odo.append.append - Add things onto existing things 90 """ ---> 91 return into(target, source, **kwargs) /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/multipledispatch/dispatcher.py in __call__(self, *args, **kwargs) 162 self._cache[types] = func 163 try: --> 164 return func(*args, **kwargs) 165 166 except MDNotImplementedError: /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/into.py in wrapped(*args, **kwargs) 41 raise TypeError('dshape argument is not an instance of DataShape') 42 kwargs['dshape'] = dshape ---> 43 return f(*args, **kwargs) 44 return wrapped 45 /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/into.py in into_string_string(a, b, **kwargs) 147 @validate 148 def into_string_string(a, b, **kwargs): --> 149 return into(a, resource(b, **kwargs), **kwargs) 150 151 /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/multipledispatch/dispatcher.py in __call__(self, *args, **kwargs) 162 self._cache[types] = func 163 try: --> 164 return func(*args, **kwargs) 165 166 except MDNotImplementedError: /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/into.py in wrapped(*args, **kwargs) 41 raise TypeError('dshape argument is not an instance of DataShape') 42 kwargs['dshape'] = dshape ---> 43 return f(*args, **kwargs) 44 return wrapped 45 /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/into.py in into_string(uri, b, dshape, **kwargs) 140 resource_ds = 0 * dshape.subshape[0] if isdimension(dshape[0]) else dshape 141 --> 142 a = resource(uri, dshape=resource_ds, expected_dshape=dshape, **kwargs) 143 return into(a, b, dshape=dshape, **kwargs) 144 /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/regex.py in __call__(self, s, *args, **kwargs) 89 90 def __call__(self, s, *args, **kwargs): ---> 91 return self.dispatch(s)(s, *args, **kwargs) 92 93 @property /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/resource.py in resource_split(uri, *args, **kwargs) 104 def resource_split(uri, *args, **kwargs): 105 uri, other = uri.rsplit('::', 1) --> 106 return resource(uri, other, *args, **kwargs) /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/regex.py in __call__(self, s, *args, **kwargs) 89 90 def __call__(self, s, *args, **kwargs): ---> 91 return self.dispatch(s)(s, *args, **kwargs) 92 93 @property /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/backends/sql.py in resource_sql(uri, *args, **kwargs) 606 t = dshape_to_table(table_name, ds, metadata=metadata, 607 foreign_keys=foreign_keys, --> 608 primary_key=primary_key) 609 t.create() 610 return t /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/backends/sql.py in dshape_to_table(name, ds, metadata, foreign_keys, primary_key) 386 validate_foreign_keys(ds, foreign_keys) 387 --> 388 cols = dshape_to_alchemy(ds, primary_key=primary_key or frozenset()) 389 cols.extend(sa.ForeignKeyConstraint([column_name], [referent]) 390 for column_name, referent in foreign_keys.items()) /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/backends/sql.py in dshape_to_alchemy(dshape, primary_key) 443 if isinstance(dshape, datashape.DataShape): 444 if isdimension(dshape[0]): --> 445 return dshape_to_alchemy(dshape[1], primary_key=primary_key) 446 else: 447 return dshape_to_alchemy(dshape[0], primary_key=primary_key) /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/backends/sql.py in dshape_to_alchemy(dshape, primary_key) 440 primary_key=name in primary_key, 441 nullable=isinstance(typ[0], Option)) --> 442 for name, typ in dshape.parameters[0]] 443 if isinstance(dshape, datashape.DataShape): 444 if isdimension(dshape[0]): /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/backends/sql.py in (.0) 440 primary_key=name in primary_key, 441 nullable=isinstance(typ[0], Option)) --> 442 for name, typ in dshape.parameters[0]] 443 if isinstance(dshape, datashape.DataShape): 444 if isdimension(dshape[0]): /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/backends/sql.py in dshape_to_alchemy(dshape, primary_key) 458 return sa.NUMERIC(dshape.precision, dshape.scale) 459 raise NotImplementedError("No SQLAlchemy dtype match for datashape: %s" --> 460 % dshape) 461 462 NotImplementedError: No SQLAlchemy dtype match for datashape: categorical[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, ...], type=int64, ordered=False] > /nfs/projects_nobackup/c/cidgrowlab/Mali/intl-atlas-api/env/lib/python3.4/site-packages/odo/backends/sql.py(460)dshape_to_alchemy() 458 return sa.NUMERIC(dshape.precision, dshape.scale) 459 raise NotImplementedError("No SQLAlchemy dtype match for datashape: %s" --> 460 % dshape) 461 462

I tried to dig further in the pandas code to see how they handle things but didn't get very far:

https://github.com/pandas-dev/pandas/blob/72c7a396fbd10559f0862e59f55a93beb52c35db/pandas/io/sql.py#L1148

Workaround is to just convert them to string right now. A better candidate could be SQLAlchemy's enum, which does native enum if possible (which comes with space savings) or otherwise a TEXT column with check constraints:

http://docs.sqlalchemy.org/en/latest/core/type_basics.html#sqlalchemy.types.Enum