yhat / pandasql

sqldf for pandas
MIT License
1.33k stars 185 forks source link

selecting on large dataframes #65

Open dfolch opened 6 years ago

dfolch commented 6 years ago

I'm getting an OperationalError when selecting from a large dataframe (full error message at bottom). The error seems to be related to the size of the dataframe not the size of the selection.

import numpy as np
import pandas as pd
from pandasql import sqldf

# works
df = pd.DataFrame(np.random.rand(45000,5))
select = sqldf('SELECT * FROM df')

# fails
df = pd.DataFrame(np.random.rand(60000,5))
select = sqldf('SELECT * FROM df')

# works
df = pd.DataFrame(np.random.rand(60000,2))
select = sqldf('SELECT * FROM df')

#works
df = pd.DataFrame(np.random.rand(45000,5))
select = sqldf('SELECT "1", "2" FROM df')

# fails
df = pd.DataFrame(np.random.rand(60000,5))
select = sqldf('SELECT "1", "2" FROM df')
Error message

```python --------------------------------------------------------------------------- OperationalError Traceback (most recent call last) ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args) 1192 parameters, -> 1193 context) 1194 except BaseException as e: ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/engine/default.py in do_execute(self, cursor, statement, parameters, context) 506 def do_execute(self, cursor, statement, parameters, context=None): --> 507 cursor.execute(statement, parameters) 508 OperationalError: too many SQL variables The above exception was the direct cause of the following exception: OperationalError Traceback (most recent call last) in () 1 df = pd.DataFrame(np.random.randint(0,10,size=(60000,5))) ----> 2 select = sqldf('SELECT "1", "2" FROM df') ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/pandasql/sqldf.py in sqldf(query, env, db_uri) 154 >>> sqldf("select avg(x) from df;", locals()) 155 """ --> 156 return PandaSQL(db_uri)(query, env) ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/pandasql/sqldf.py in __call__(self, query, env) 56 continue 57 self.loaded_tables.add(table_name) ---> 58 write_table(env[table_name], table_name, conn) 59 60 try: ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/pandasql/sqldf.py in write_table(df, tablename, conn) 119 message='The provided table name \'%s\' is not found exactly as such in the database' % tablename) 120 to_sql(df, name=tablename, con=conn, --> 121 index=not any(name is None for name in df.index.names)) # load index into db if all levels are named 122 123 ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/pandas/io/sql.py in to_sql(frame, name, con, schema, if_exists, index, index_label, chunksize, dtype) 448 pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index, 449 index_label=index_label, schema=schema, --> 450 chunksize=chunksize, dtype=dtype) 451 452 ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/pandas/io/sql.py in to_sql(self, frame, name, if_exists, index, index_label, schema, chunksize, dtype) 1147 schema=schema, dtype=dtype) 1148 table.create() -> 1149 table.insert(chunksize) 1150 if (not name.isdigit() and not name.islower()): 1151 # check for potentially case sensitivity issues (GH7815) ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/pandas/io/sql.py in insert(self, chunksize) 661 662 chunk_iter = zip(*[arr[start_i:end_i] for arr in data_list]) --> 663 self._execute_insert(conn, keys, chunk_iter) 664 665 def _query_iterator(self, result, chunksize, columns, coerce_float=True, ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/pandas/io/sql.py in _execute_insert(self, conn, keys, data_iter) 636 """Insert data into this table with database connection""" 637 data = [{k: v for k, v in zip(keys, row)} for row in data_iter] --> 638 conn.execute(*self.insert_statement(data, conn)) 639 640 def insert(self, chunksize=None): ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/engine/base.py in execute(self, object, *multiparams, **params) 946 raise exc.ObjectNotExecutableError(object) 947 else: --> 948 return meth(self, multiparams, params) 949 950 def _execute_function(self, func, multiparams, params): ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/sql/elements.py in _execute_on_connection(self, connection, multiparams, params) 267 def _execute_on_connection(self, connection, multiparams, params): 268 if self.supports_execution: --> 269 return connection._execute_clauseelement(self, multiparams, params) 270 else: 271 raise exc.ObjectNotExecutableError(self) ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_clauseelement(self, elem, multiparams, params) 1058 compiled_sql, 1059 distilled_params, -> 1060 compiled_sql, distilled_params 1061 ) 1062 if self._has_events or self.engine._has_events: ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args) 1198 parameters, 1199 cursor, -> 1200 context) 1201 1202 if self._has_events or self.engine._has_events: ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _handle_dbapi_exception(self, e, statement, parameters, cursor, context) 1411 util.raise_from_cause( 1412 sqlalchemy_exception, -> 1413 exc_info 1414 ) 1415 else: ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/util/compat.py in raise_from_cause(exception, exc_info) 201 exc_type, exc_value, exc_tb = exc_info 202 cause = exc_value if exc_value is not exception else None --> 203 reraise(type(exception), exception, tb=exc_tb, cause=cause) 204 205 if py3k: ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/util/compat.py in reraise(tp, value, tb, cause) 184 value.__cause__ = cause 185 if value.__traceback__ is not tb: --> 186 raise value.with_traceback(tb) 187 raise value 188 ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/engine/base.py in _execute_context(self, dialect, constructor, statement, parameters, *args) 1191 statement, 1192 parameters, -> 1193 context) 1194 except BaseException as e: 1195 self._handle_dbapi_exception( ~/anaconda2/envs/urban_prac4/lib/python3.6/site-packages/sqlalchemy/engine/default.py in do_execute(self, cursor, statement, parameters, context) 505 506 def do_execute(self, cursor, statement, parameters, context=None): --> 507 cursor.execute(statement, parameters) 508 509 def do_execute_no_params(self, cursor, statement, context=None): OperationalError: (sqlite3.OperationalError) too many SQL variables [SQL: 'INSERT INTO df ("0", "1", "2", "3", "4") VALUES (?, ?, ?, ?, ? ... ... (Background on this error at: http://sqlalche.me/e/e3q8) ```

jbf232 commented 5 years ago

I am getting this same error when queries a dataframe with around 72,000 rows.

rugg2 commented 4 years ago

yes performance on even modest data size is bad

zbrookle commented 4 years ago

@dfolch Part of the performance issue with this package is that it uses SqLite as a backend, which means that you're essentially not using one of the main benefits of pandas, which is in memory computation. As a solution to this problem, a few months back I created a package called dataframe_sql, which solves this problem by parsing the sql and translating it to native pandas operations. Hope this helps!