mymarilyn / clickhouse-driver

ClickHouse Python Driver with native interface support
https://clickhouse-driver.readthedocs.io
Other
1.21k stars 214 forks source link

Inserting data into LowCardinality(String) raises TypeError: Unsupported column type: <class 'pandas.core.arrays.categorical.Categorical'>. ndarray/DatetimeIndex is expected. #364

Open Katiii opened 1 year ago

Katiii commented 1 year ago

Describe the bug There is no single, constant 'use_numpy' client setting that works for both loading and inserting columns of type LowCardinality(String).

To Reproduce On the one hand, with 'use_numpy' enabled in the client settings, the insert fails:

import pandas as pd
from clickhouse_driver import Client

# Client with NumPy mode switched on for every query it runs.
# NOTE: host, user, password and db are not defined in this snippet; the
# reader is expected to supply them.
client1 = Client(
    host=host,
    user=user,
    password=password,
    settings={'use_numpy': True}
)

# Create a table with a single LowCardinality(String) column.
_ = client1.execute(f'''
CREATE TABLE IF NOT EXISTS {db}.test
(
    low_cardinality_col LowCardinality(String)
)
engine = MergeTree
ORDER BY low_cardinality_col
''')

# Read one row back as a DataFrame, then insert that same DataFrame again.
# The insert is the call that raises the TypeError in the traceback below.
df = client1.query_dataframe(f'SELECT low_cardinality_col FROM {db}.test limit 1')
client1.insert_dataframe(f'INSERT INTO {db}.test VALUES', df)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[42], line 30
     19 _ = client1.execute(f'''
     20 CREATE TABLE IF NOT EXISTS {db}.test
     21 (
   (...)
     25 ORDER BY low_cardinality_col
     26 ''')
     29 df = client1.query_dataframe(f'SELECT low_cardinality_col FROM {db}.test limit 1')
---> 30 client1.insert_dataframe(f'INSERT INTO {db}.test VALUES', df)

File ~/.local/lib/python3.8/site-packages/clickhouse_driver/client.py:447, in Client.insert_dataframe(self, query, dataframe, external_tables, query_id, settings)
    445     columns = [x[0] for x in sample_block.columns_with_types]
    446     data = [dataframe[column].values for column in columns]
--> 447     rv = self.send_data(sample_block, data, columnar=True)
    448     self.receive_end_of_query()
    450 self.last_query.store_elapsed(time() - start_time)

File ~/.local/lib/python3.8/site-packages/clickhouse_driver/client.py:559, in Client.send_data(self, sample_block, data, types_check, columnar)
    556 else:
    557     slicer = column_chunks if columnar else chunks
--> 559 for chunk in slicer(data, client_settings['insert_block_size']):
    560     block = block_cls(sample_block.columns_with_types, chunk,
    561                       types_check=types_check)
    562     self.connection.send_data(block)

File ~/.local/lib/python3.8/site-packages/clickhouse_driver/numpy/helpers.py:8, in column_chunks(columns, n)
      6 for column in columns:
      7     if not isinstance(column, (np.ndarray, pd.DatetimeIndex)):
----> 8         raise TypeError(
      9             'Unsupported column type: {}. '
     10             'ndarray/DatetimeIndex is expected.'
     11             .format(type(column))
     12         )
     14 # create chunk generator for every column
     15 chunked = [
     16     iter(np.array_split(c, len(c) // n) if len(c) > n else [c])
     17     for c in columns
     18 ]

TypeError: Unsupported column type: <class 'pandas.core.arrays.categorical.Categorical'>. ndarray/DatetimeIndex is expected.

On the other hand, with 'use_numpy' disabled in the client settings and enabled only for the insert call:

# Client with NumPy mode switched off by default.
client2 = Client(
    host=host,
    user=user,
    password=password,
    settings={'use_numpy': False}
)

# Read with the client default (use_numpy=False), then enable use_numpy
# for the insert call only, through its per-query settings.
df = client2.query_dataframe(f'SELECT low_cardinality_col FROM {db}.test limit 3')
client2.insert_dataframe(f'INSERT INTO {db}.test VALUES', df, settings={'use_numpy': True})

This works and returns 3.

Expected behavior No errors or exceptions: a pandas.DataFrame can be both fetched and inserted correctly.

Versions

metelitsaas commented 1 year ago

It looks like the problem is not with LowCardinality itself, but with the pandas `category` dtype. It is not mapped to a NumPy type, so the driver crashes when it tries to convert the column.

To Reproduce

import os
import pandas as pd
from clickhouse_driver import Client

def main():
    """Insert a DataFrame with a ``category`` column into a ``String`` column.

    Connection parameters are read from the ``HOST``, ``USER`` and
    ``PASSWORD`` environment variables. The insert is sent with
    ``use_numpy`` enabled. According to the report, this reproduces the
    failure without any LowCardinality column being involved.

    Raises:
        KeyError: if one of the required environment variables is not set.
    """
    client = Client(
        host=os.environ["HOST"],
        user=os.environ["USER"],
        password=os.environ["PASSWORD"],
    )

    # Plain String column: no LowCardinality type in the table definition.
    client.execute("""
    CREATE TEMPORARY TABLE test
    (
        column1 String
    )
    engine = Null
    """)

    # dtype="category" makes the single column a pandas categorical.
    df = pd.DataFrame([["value1"]], columns=["column1"], dtype="category")
    client.insert_dataframe(
        # Plain string literal: the query has no placeholders, so an
        # f-string prefix is unnecessary.
        query="INSERT INTO test VALUES",
        dataframe=df,
        settings={"use_numpy": True},
    )

# Run the reproduction only when this file is executed as a script.
if __name__ == "__main__":
    main()

Versions