deephaven / deephaven-core

Deephaven Community Core
Other
254 stars 80 forks source link

Missing support of certain Arrow types and issues with converting supported Arrow data types to DH ones #3223

Open jmao-denver opened 1 year ago

jmao-denver commented 1 year ago

These problems were discovered while testing the new Python arrow module against the Arrow data types that are supposedly supported. In addition, a 'timestamp' round trip (Arrow -> DH -> Arrow) does not work when it starts from an Arrow column with a time zone other than UTC.

    @unittest.skip("Not correctly widened")
    def test_arrow_types_unsigned_integers(self):
        """Check conversion of an unsigned 16-bit Arrow column.

        Skipped for now; the skip reason records that the values are not
        correctly widened.
        """
        with self.subTest("unsigned integers"):
            pa_types = [
                pa.uint16(),
            ]
            pa_data = [
                # Largest value representable in 16 unsigned bits, plus zero.
                pa.array([2 ** 16 - 1, 0]),
            ]
            # Without this call the test only builds its inputs and verifies
            # nothing once the skip is removed.
            self.verify_type_conversion(pa_types=pa_types, pa_data=pa_data)

    @unittest.skip("Not correctly converted by DH")
    def test_arrow_types_time(self):
        """Check conversion of time, date and zoned-timestamp Arrow columns.

        Skipped for now; the skip reason records that DH does not convert
        these correctly.
        """
        column_types = [pa.time64('ns'), pa.date32(), pa.timestamp('ns', tz='Europe/Paris')]

        time_values = pa.array([1_000_001, 1_000_002])
        date_values = pa.array([datetime(2022, 12, 7), datetime(2022, 12, 30)])
        # One timestamp is given in UTC and the other in Europe/Paris.
        timestamp_values = pa.array([
            pd.Timestamp('2017-01-01T12:01:01', tz='UTC'),
            pd.Timestamp('2017-01-01T11:01:01', tz='Europe/Paris'),
        ])

        self.verify_type_conversion(
            pa_types=column_types,
            pa_data=[time_values, date_values, timestamp_values],
        )

Also, the following Arrow types appear reasonable to support:

            pa.null(),
            pa.uint8(),
            pa.uint32(),
            pa.uint64(),
            pa.float16(),
            pa.time32('s'),
            pa.time32('ms'),
            pa.time64('us'),
            pa.timestamp('s', tz=None),
            pa.timestamp('ms', tz=None),
            pa.timestamp('us', tz=None),
            pa.duration('s'),
            pa.duration('ms'),
            pa.duration('us'),
            pa.duration('ns'),
            pa.month_day_nano_interval(),
            pa.binary(),
            pa.large_binary(),
            pa.large_string(),
            pa.large_utf8(),
jmao-denver commented 1 year ago
from datetime import datetime
from typing import List, Any

import numpy as np
import pandas as pd
import pyarrow as pa
from deephaven import arrow as dharrow, dtypes, new_table, time_table

def verify_type_conversion(pa_types: List[pa.DataType], pa_data: List[Any]) -> None:
    """Round-trip Arrow data through a Deephaven table and print the outcome.

    Builds a PyArrow table whose columns are named ``f0``, ``f1``, ... from
    the given types and data, converts it to a Deephaven table and back to
    Arrow, then prints whether the result equals the original table.

    Args:
        pa_types: Arrow data type for each column.
        pa_data: Column data, one entry per type in ``pa_types``.
    """
    # This is a free function that the script calls without an instance (see
    # the call at the bottom), so it must not take a leading ``self``
    # parameter; with one, that call raises TypeError before converting
    # anything.
    fields = [pa.field(f"f{i}", ty) for i, ty in enumerate(pa_types)]
    schema = pa.schema(fields)
    pa_table = pa.table(pa_data, schema=schema)
    dh_table = dharrow.to_table(pa_table)
    arrow_table = dharrow.to_arrow(dh_table)
    print(pa_table.equals(arrow_table))


# Column types under test: nanosecond time-of-day, date, and a timestamp
# carrying a non-UTC time zone.
pa_types = [
    pa.time64('ns'),
    pa.date32(),
    pa.timestamp('ns', tz='Europe/Paris'),
]

# Column data, listed in the same order as pa_types.
pa_data = [
    pa.array([1_000_001, 1_000_002]),
    pa.array([datetime(2022, 12, 7), datetime(2022, 12, 30)]),
    pa.array([pd.Timestamp('2017-01-01T12:01:01', tz='UTC'),
              pd.Timestamp('2017-01-01T11:01:01', tz='Europe/Paris')]),
]
verify_type_conversion(pa_types=pa_types, pa_data=pa_data)

This should work in the WebUI's Python console with PR #3216 checked out.

chipkent commented 1 year ago

Timestamps should be supported both with and without a time zone specified.