unionai-oss / pandera

A light-weight, flexible, and expressive statistical data testing library
https://www.union.ai/pandera
MIT License
3.42k stars 311 forks source link

Handling columns with multiple timezones #1309

Open ludaavics opened 1 year ago

ludaavics commented 1 year ago

Validating a column of time-zone-aware datetimes fails when the column contains more than one time zone.

Note: Please read this guide detailing how to provide the necessary information for us to reproduce your bug.

Code Sample, a copy-pastable example

import pandera as pa
from pandera.typing import Series
import pandas as pd

# Minimal schema for the reproduction: a single column, ``timestamp``, declared
# as a timezone-aware datetime (nanosecond unit, America/Chicago).
class Model(pa.SchemaModel):
    timestamp: Series[pd.DatetimeTZDtype] = pa.Field(
        dtype_kwargs={"unit": "ns", "tz": "America/Chicago"}
    )

    class Config:
        # Coercion is enabled; the traceback below fails inside coerce_dtype.
        coerce = True
        # NOTE(review): presumably permits columns not declared on the model;
        # confirm against the pandera docs.
        strict = False

# Two rows holding the same wall-clock time localized to two different zones
# (America/Chicago and America/New_York), so the column mixes time zones.
df = pd.DataFrame(
    [
        [pd.to_datetime("2023-03-01 13:00:00").tz_localize("America/Chicago")],
        [pd.to_datetime("2023-03-01 13:00:00").tz_localize("America/New_York")],
    ],
    columns=["timestamp"],
)
# Reported to raise SchemaError (see the traceback below) instead of
# converting every value to the target time zone.
Model.validate(df)

SchemaErrors                              Traceback (most recent call last)
File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/container.py:81, in DataFrameSchemaBackend.validate(self, check_obj, schema, head, tail, sample, random_state, lazy, inplace)
     80 try:
---> 81     check_obj = parser(check_obj, *args)
     82 except SchemaError as exc:

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/container.py:532, in DataFrameSchemaBackend.coerce_dtype(self, check_obj, schema)
    529 if error_handler.collected_errors:
    530     # raise SchemaErrors if this method is called without an
    531     # error_handler
--> 532     raise SchemaErrors(
    533         schema=schema,
    534         schema_errors=error_handler.collected_errors,
    535         data=check_obj,
    536     )
    538 return check_obj

SchemaErrors: Schema Model: A total of 1 schema errors were found.

Error Counts
------------
- SchemaErrorReason.SCHEMA_COMPONENT_CHECK: 1

Schema Error Summary
--------------------
Empty DataFrame
Columns: [failure_cases, n_failure_cases]
Index: []

Usage Tip
---------

Directly inspect all errors by catching the exception:

``
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
``

The above exception was the direct cause of the following exception:

SchemaError                               Traceback (most recent call last)
Cell In[1], line 23
     13         strict = False
     16 df = pd.DataFrame(
     17     [
     18         [pd.to_datetime("2023-03-01 13:00:00").tz_localize("America/Chicago")],
   (...)
     21     columns=["timestamp"],
     22 )
---> 23 Model.validate(df)

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/api/pandas/model.py:306, in DataFrameModel.validate(cls, check_obj, head, tail, sample, random_state, lazy, inplace)
    291 @classmethod
    292 @docstring_substitution(validate_doc=DataFrameSchema.validate.__doc__)
    293 def validate(
   (...)
    301     inplace: bool = False,
    302 ) -> DataFrameBase[TDataFrameModel]:
    303     """%(validate_doc)s"""
    304     return cast(
    305         DataFrameBase[TDataFrameModel],
--> 306         cls.to_schema().validate(
    307             check_obj, head, tail, sample, random_state, lazy, inplace
    308         ),
    309     )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/api/pandas/container.py:366, in DataFrameSchema.validate(self, check_obj, head, tail, sample, random_state, lazy, inplace)
    354     check_obj = check_obj.map_partitions(  # type: ignore [operator]
    355         self._validate,
    356         head=head,
   (...)
    362         meta=check_obj,
    363     )
    364     return check_obj.pandera.add_schema(self)
--> 366 return self._validate(
    367     check_obj=check_obj,
    368     head=head,
    369     tail=tail,
    370     sample=sample,
    371     random_state=random_state,
    372     lazy=lazy,
    373     inplace=inplace,
    374 )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/api/pandas/container.py:395, in DataFrameSchema._validate(self, check_obj, head, tail, sample, random_state, lazy, inplace)
    386 if self._is_inferred:
    387     warnings.warn(
    388         f"This {type(self)} is an inferred schema that hasn't been "
    389         "modified. It's recommended that you refine the schema "
   (...)
    392         UserWarning,
    393     )
--> 395 return self.get_backend(check_obj).validate(
    396     check_obj,
    397     schema=self,
    398     head=head,
    399     tail=tail,
    400     sample=sample,
    401     random_state=random_state,
    402     lazy=lazy,
    403     inplace=inplace,
    404 )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/container.py:85, in DataFrameSchemaBackend.validate(self, check_obj, schema, head, tail, sample, random_state, lazy, inplace)
     83         error_handler.collect_error(exc.reason_code, exc)
     84     except SchemaErrors as exc:
---> 85         error_handler.collect_errors(exc)
     87 # We may have modified columns, for example by
     88 # add_missing_columns, so regenerate column info
     89 column_info = self.collect_column_info(check_obj, schema)

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/error_handlers.py:63, in SchemaErrorHandler.collect_errors(self, schema_errors, original_exc)
     56 """Collect schema errors from a SchemaErrors exception.
     57 
     58 :param reason_code: string representing reason for error.
     59 :param schema_error: ``SchemaError`` object.
     60 :param original_exc: original exception associated with the SchemaError.
     61 """
     62 for schema_error in schema_errors.schema_errors:
---> 63     self.collect_error(
     64         schema_error.reason_code,
     65         schema_error,
     66         original_exc or schema_errors,
     67     )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/error_handlers.py:38, in SchemaErrorHandler.collect_error(self, reason_code, schema_error, original_exc)
     31 """Collect schema error, raising exception if lazy is False.
     32 
     33 :param reason_code: string representing reason for error.
     34 :param schema_error: ``SchemaError`` object.
     35 :param original_exc: original exception associated with the SchemaError.
     36 """
     37 if not self._lazy:
---> 38     raise schema_error from original_exc
     40 # delete data of validated object from SchemaError object to prevent
     41 # storing copies of the validated DataFrame/Series for every
     42 # SchemaError collected.
     43 del schema_error.data

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/container.py:576, in DataFrameSchemaBackend._coerce_dtype_helper.<locals>._try_coercion(coerce_fn, obj)
    574 def _try_coercion(coerce_fn, obj):
    575     try:
--> 576         return coerce_fn(obj)
    577     except SchemaError as exc:
    578         error_handler.collect_error(
    579             SchemaErrorReason.DATATYPE_COERCION,
    580             exc,
    581         )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/api/pandas/array.py:146, in ArraySchema.coerce_dtype(self, check_obj)
    136 def coerce_dtype(
    137     self,
    138     check_obj: Union[pd.Series, pd.Index],
    139 ) -> Union[pd.Series, pd.Index]:
    140     """Coerce type of a pd.Series by type specified in dtype.
    141 
    142     :param pd.Series series: One-dimensional ndarray with axis labels
    143         (including time series).
    144     :returns: ``Series`` with coerced data type
    145     """
--> 146     return self.get_backend(check_obj).coerce_dtype(check_obj, schema=self)

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/components.py:194, in ColumnBackend.coerce_dtype(self, check_obj, schema)
    190 # pylint: disable=super-with-arguments
    191 # pylint: disable=fixme
    192 # TODO: use singledispatchmethod here
    193 if is_field(check_obj) or is_index(check_obj):
--> 194     return super(ColumnBackend, self).coerce_dtype(
    195         check_obj,
    196         schema=schema,
    197     )
    198 return check_obj.apply(
    199     lambda x: super(ColumnBackend, self).coerce_dtype(
    200         x,
   (...)
    203     axis="columns",
    204 )

File ~/miniconda3/envs/dev/lib/python3.9/site-packages/pandera/backends/pandas/array.py:177, in ArraySchemaBackend.coerce_dtype(self, check_obj, schema)
    175     return schema.dtype.try_coerce(check_obj)
    176 except ParserError as exc:
--> 177     raise SchemaError(
    178         schema=schema,
    179         data=check_obj,
    180         message=(
    181             f"Error while coercing '{schema.name}' to type "
    182             f"{schema.dtype}: {exc}:\n{exc.failure_cases}"
    183         ),
    184         failure_cases=exc.failure_cases,
    185         check=f"coerce_dtype('{schema.dtype}')",
    186     ) from exc

SchemaError: Error while coercing 'timestamp' to type datetime64[ns, America/Chicago]: Could not coerce <class 'pandas.core.series.Series'> data_container into type datetime64[ns, America/Chicago]:
Empty DataFrame
Columns: [index, failure_case]
Index: []

Expected behavior

All the time zones get converted to the target time zone.

Desktop (please complete the following information):

Screenshots

If applicable, add screenshots to help explain your problem.

Additional context

Add any other context about the problem here.

ddp111 commented 1 day ago

Bump on this. Here is another example:

import pandas as pd

# Eight ISO-8601 timestamp strings with mixed UTC offsets: the first four are
# +01:00 and the last four are +02:00.
df_1 = pd.DataFrame({"ts":["2023-10-30T11:27:20.082372+01:00",
"2023-10-30T10:02:24.800916+01:00",
"2023-10-30T07:37:01.052617+01:00",
"2023-10-30T07:03:02.975448+01:00",
"2023-10-27T15:37:25.562608+02:00",
"2023-10-27T15:10:05.190293+02:00",
"2023-10-27T12:44:23.609281+02:00",
"2023-10-27T12:32:16.41568+02:00"]})
# Baseline: plain pandas parsing of the mixed-offset column, per the reporter.
pd.to_datetime(df_1['ts']) # succeeds

from pandera import Column, DataFrameSchema, Timestamp

# Schema with one column, "ts", typed as Timestamp, with coercion enabled.
schema = DataFrameSchema(
            {
                "ts": Column(Timestamp)
            },
            coerce=True
)
# Validating the mixed-offset frame is the failing case being reported.
schema.validate(df_1) # raises

# Control case: only the four +01:00 rows from df_1, i.e. a single UTC offset.
df_2 = pd.DataFrame({"ts":["2023-10-30T11:27:20.082372+01:00",
"2023-10-30T10:02:24.800916+01:00",
"2023-10-30T07:37:01.052617+01:00",
"2023-10-30T07:03:02.975448+01:00",]})
# With a uniform offset the same schema validates, per the reporter.
schema.validate(df_2) # succeeds