ADBond / splinkclickhouse

Allows Clickhouse to be used as the execution engine for Splink
MIT License
5 stars 0 forks source link

`chdb` data ingestion from python does not always retain `NULL` values #17

Open ADBond opened 2 months ago

ADBond commented 2 months ago

This is true whether we go via pandas or pyarrow.

Coerces instead to empty strings. Numeric seem to be fine.

If we are accepting data of this type, we should probably do something about it, or at very least give a warning.

import pandas as pd
import pyarrow as pa
from chdb import dbapi

df_pd = pd.DataFrame(
    [
        {"id": 1, "name": "john", "city": "london", "count": None, "height": 12.42},
        {"id": 2, "name": None, "city": "glasgow", "count": 5, "height": 13.20},
        {"id": 3, "name": "tom", "city": None, "count": None, "height": None},
        {"id": 4, "name": None, "city": "birmingham", "count": 3, "height": 15.23},
        {"id": 5, "name": "hyacinth", "city": None, "count": None, "height": None},
    ]
)
df_pa = pa.Table.from_pandas(df_pd)
con = dbapi.connect()

cursor = con.cursor()

sql = "CREATE DATABASE IF NOT EXISTS db"
cursor.execute(sql)

sql = (
    "CREATE OR REPLACE TABLE db.pd "
    "ORDER BY tuple() "
    "AS SELECT * FROM Python(df_pd);"
)
cursor.execute(sql)
sql = (
    "CREATE OR REPLACE TABLE db.pa "
    "ORDER BY tuple() "
    "AS SELECT * FROM Python(df_pa);"
)
cursor.execute(sql)

cursor.execute("SELECT * FROM db.pd")
print(cursor.fetchall())

cursor.execute("SELECT * FROM db.pa")
print(cursor.fetchall())