ibis-project / ibis

the portable Python dataframe library
https://ibis-project.org
Apache License 2.0
5.28k stars 595 forks source link

bug: Date.to_pandas() errors, having trouble repro-ing #8129

Closed NickCrews closed 6 months ago

NickCrews commented 9 months ago

What happened?

In my app I am running into this. On ibis main, I can't repro. I'm guessing this has something to do with the combo of other libraries I have installed, eg pandas and duckdb. I figure this is worth pointing out to you because others might also have this incompatible version of a 3rd party lib installed, so it would be great if

  1. ibis added a version constraint to avoid that version
  2. or (better?) we added some compatibility wrapper to just make it work

Do you have any tips on what you think the cause could be? what libs I should start bisecting to try to pin down the cause?

import ibis

ibis.date("2019-01-01").to_pandas()
```python-traceback --------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[1], line 3 1 import ibis ----> 3 ibis.date("2019-01-01").to_pandas() File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/expr/types/generic.py:1202, in Value.to_pandas(self, **kwargs) 1180 def to_pandas(self, **kwargs) -> pd.Series: 1181 """Convert a column expression to a pandas Series or scalar object. 1182 1183 Parameters (...) 1200 [5 rows x 8 columns] 1201 """ -> 1202 return self.execute(**kwargs) File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/expr/types/core.py:322, in Expr.execute(self, limit, timecontext, params, **kwargs) 295 def execute( 296 self, 297 limit: int | str | None = "default", (...) 300 **kwargs: Any, 301 ): 302 """Execute an expression against its backend if one exists. 303 304 Parameters (...) 320 Keyword arguments 321 """ --> 322 return self._find_backend(use_default=True).execute( 323 self, limit=limit, timecontext=timecontext, params=params, **kwargs 324 ) File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/backends/base/sql/__init__.py:343, in BaseSQLBackend.execute(self, expr, params, limit, **kwargs) 340 schema = expr.as_table().schema() 342 with self._safe_raw_sql(sql, **kwargs) as cursor: --> 343 result = self.fetch_from_cursor(cursor, schema) 345 return expr.__pandas_result__(result) File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/backends/duckdb/__init__.py:1201, in Backend.fetch_from_cursor(self, cursor, schema) 1183 table = cursor.cursor.fetch_arrow_table() 1185 df = pd.DataFrame( 1186 { 1187 name: ( (...) 
1199 } 1200 ) -> 1201 df = PandasData.convert_table(df, schema) 1202 if not df.empty and geospatial_supported: 1203 return self._to_geodataframe(df, schema) File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/formats/pandas.py:118, in PandasData.convert_table(cls, df, schema) 113 raise ValueError( 114 "schema column count does not match input data column count" 115 ) 117 for (name, series), dtype in zip(df.items(), schema.types): --> 118 df[name] = cls.convert_column(series, dtype) 120 # return data with the schema's columns which may be different than the 121 # input columns 122 df.columns = schema.names File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/formats/pandas.py:135, in PandasData.convert_column(cls, obj, dtype) 132 method_name = f"convert_{dtype.__class__.__name__}" 133 convert_method = getattr(cls, method_name, cls.convert_default) --> 135 result = convert_method(obj, dtype, pandas_type) 136 assert not isinstance(result, np.ndarray), f"{convert_method} -> {type(result)}" 137 return result File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/ibis/formats/pandas.py:201, in PandasData.convert_Date(cls, s, dtype, pandas_type) 199 s = s.dt.tz_convert("UTC").dt.tz_localize(None) 200 try: --> 201 return s.astype(pandas_type).dt.date 202 except (TypeError, pd._libs.tslibs.OutOfBoundsDatetime): 204 def try_date(v): File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/generic.py:6637, in NDFrame.astype(self, dtype, copy, errors) 6631 results = [ 6632 ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.items() 6633 ] 6635 else: 6636 # else, only a single dtype is given -> 6637 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) 6638 res = self._constructor_from_mgr(new_data, axes=new_data.axes) 6639 return res.__finalize__(self, method="astype") File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:431, in BaseBlockManager.astype(self, dtype, copy, errors) 428 
elif using_copy_on_write(): 429 copy = False --> 431 return self.apply( 432 "astype", 433 dtype=dtype, 434 copy=copy, 435 errors=errors, 436 using_cow=using_copy_on_write(), 437 ) File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:364, in BaseBlockManager.apply(self, f, align_keys, **kwargs) 362 applied = b.apply(f, **kwargs) 363 else: --> 364 applied = getattr(b, f)(**kwargs) 365 result_blocks = extend_blocks(applied, result_blocks) 367 out = type(self).from_blocks(result_blocks, self.axes) File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/internals/blocks.py:758, in Block.astype(self, dtype, copy, errors, using_cow, squeeze) 755 raise ValueError("Can not squeeze with more than one column.") 756 values = values[0, :] # type: ignore[call-overload] --> 758 new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) 760 new_values = maybe_coerce_values(new_values) 762 refs = None File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:237, in astype_array_safe(values, dtype, copy, errors) 234 dtype = dtype.numpy_dtype 236 try: --> 237 new_values = astype_array(values, dtype, copy=copy) 238 except (ValueError, TypeError): 239 # e.g. 
_astype_nansafe can fail on object-dtype of strings 240 # trying to convert to float 241 if errors == "ignore": File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:182, in astype_array(values, dtype, copy) 179 values = values.astype(dtype, copy=copy) 181 else: --> 182 values = _astype_nansafe(values, dtype, copy=copy) 184 # in pandas we don't store numpy str dtypes, so convert to object 185 if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:110, in _astype_nansafe(arr, dtype, copy, skipna) 107 if lib.is_np_dtype(dtype, "M"): 108 from pandas.core.arrays import DatetimeArray --> 110 dta = DatetimeArray._from_sequence(arr, dtype=dtype) 111 return dta._ndarray 113 elif lib.is_np_dtype(dtype, "m"): File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:327, in DatetimeArray._from_sequence(cls, scalars, dtype, copy) 325 @classmethod 326 def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): --> 327 return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:354, in DatetimeArray._from_sequence_not_strict(cls, data, dtype, copy, tz, freq, dayfirst, yearfirst, ambiguous) 351 else: 352 tz = timezones.maybe_get_tz(tz) --> 354 dtype = _validate_dt64_dtype(dtype) 355 # if dtype has an embedded tz, capture it 356 tz = _validate_tz_from_dtype(dtype, tz, explicit_tz_none) File ~/code/scg/atlas/.venv/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:2550, in _validate_dt64_dtype(dtype) 2544 raise ValueError(msg) 2546 if ( 2547 isinstance(dtype, np.dtype) 2548 and (dtype.kind != "M" or not is_supported_dtype(dtype)) 2549 ) or not isinstance(dtype, (np.dtype, DatetimeTZDtype)): -> 2550 raise ValueError( 2551 f"Unexpected value for 'dtype': '{dtype}'. 
" 2552 "Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', " 2553 "'datetime64[ns]' or DatetimeTZDtype'." 2554 ) 2556 if getattr(dtype, "tz", None): 2557 # https://github.com/pandas-dev/pandas/issues/18595 2558 # Ensure that we have a standard timezone for pytz objects. 2559 # Without this, things like adding an array of timedeltas and 2560 # a tz-aware Timestamp (with a tz specific to its datetime) will 2561 # be incorrect(ish?) for the array as a whole 2562 dtype = cast(DatetimeTZDtype, dtype) ValueError: Unexpected value for 'dtype': 'datetime64[D]'. Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]' or DatetimeTZDtype'. ```

What version of ibis are you using?

my app using 8.0.0.dev210, released Jan 27, has this bug.

In the ibis repo, I can't repro using commits from jan 27th. Is there a way to see the exact commit SHA that went into 8.0.0.dev210 on pypi?

Full output of pip freeze in my app:

altair==5.2.0 anyio==4.2.0 appnope==0.1.3 asttokens==2.4.1 attrs==23.1.0 beautifulsoup4==4.12.2 cachetools==5.3.2 certifi==2023.11.17 chardet==5.2.0 charset-normalizer==3.3.2 click==8.1.7 click-default-group==1.2.4 colorama==0.4.6 comm==0.2.1 contourpy==1.2.0 cycler==0.12.1 debugpy==1.8.0 decorator==5.1.1 distlib==0.3.8 duckdb==0.9.3.dev1045+g9c91b3a329 executing==2.0.1 fastjsonschema==2.19.1 filelock==3.13.1 fonttools==4.45.1 gdown==4.7.1 h11==0.14.0 humanize==4.9.0 idna==3.6 igraph==0.11.3 ipykernel==6.28.0 ipython==8.19.0 ipyvue==1.10.1 ipyvuetify==1.8.10 ipywidgets==8.1.1 jedi==0.19.1 Jinja2==3.1.2 jsonschema==4.20.0 jsonschema-specifications==2023.12.1 jupyter_client==8.6.0 jupyter_core==5.7.0 jupyterlab-widgets==3.0.9 kiwisolver==1.4.5 Markdown==3.5.1 markdown-it-py==3.0.0 MarkupSafe==2.1.3 matplotlib==3.8.2 matplotlib-inline==0.1.6 mdurl==0.1.2 nbformat==5.9.2 nest-asyncio==1.5.8 networkx==3.2.1 numpy==1.26.2 optree==0.10.0 packaging==23.2 pandas==2.2.0 parso==0.8.3 pexpect==4.9.0 Pillow==10.1.0 platformdirs==4.1.0 pluggy==1.3.0 prompt-toolkit==3.0.43 protobuf==4.25.2 psutil==5.9.7 psycopg2==2.9.9 ptyprocess==0.7.0 pure-eval==0.2.2 pyarrow==15.0.0 Pygments==2.17.2 pymdown-extensions==10.7 pyparsing==3.1.1 pyproject-api==1.6.1 PySocks==1.7.1 pytask==0.4.2 python-dateutil==2.8.2 pytz==2023.3.post1 PyYAML==6.0.1 pyzmq==25.1.2 reacton==1.8.2 referencing==0.32.1 requests==2.31.0 rich==13.7.0 rich-click==1.7.3 rpds-py==0.16.2 six==1.16.0 sniffio==1.3.0 solara @ git+https://github.com/NickCrews/solara@b4f7eee9d1292dd69eaa32b31f91901545d22a17 soupsieve==2.5 SQLAlchemy==2.0.23 stack-data==0.6.3 starlette==0.34.0 texttable==1.7.0 tomli==2.0.1 toolz==0.12.0 tornado==6.4 tox==4.12.1 tqdm==4.66.1 traitlets==5.14.1 typing_extensions==4.8.0 tzdata==2023.4 urllib3==2.1.0 uvicorn==0.25.0 vegafusion==1.6.1 vegafusion-python-embed==1.6.1 virtualenv==20.25.0 vl-convert-python==1.2.2 watchdog==3.0.0 watchfiles==0.21.0 wcwidth==0.2.13 websockets==12.0 widgetsnbextension==4.0.9

What backend(s) are you using, if any?

duckdb and pandas

Relevant log output

No response

Code of Conduct

cpcloud commented 9 months ago

This looks to be caused by pandas==2.2.0, which breaks a bunch of timestamp related functionality. We have a bot PR (#8056) that I am slowly working through to try to get pandas 2.2.0 working

It's getting more and more difficult for us to preserve compatibility with pandas 1.x, and apparently even between 2.1 and 2.2 there were some disruptive changes.

cpcloud commented 9 months ago

It seems like there was a bunch of churn in supported datetime64 units:

I'm not sure how to create a compatibility layer for that off the top of my head.

gforsyth commented 9 months ago

Here's the commit SHA for the 210th commit since the last release, which should correspond to the prerelease build you have from PyPI:

 ~/g/i/ibis  git rev-list 7.2.0..HEAD --count                                                14  12:33 
210
 ~/g/i/ibis  git rev-parse --short HEAD                                                      15  12:33 
0f4366743
NickCrews commented 9 months ago

@gforsyth ahh, thanks for the explanation of how those prerelease numbers work! Now in the future I can find the exact SHA myself. PS, would it be possible to include the SHA into the build, eg ibis.__sha__ or something? Not sure if there is some convention around that, or if there is already an extension for build tooling that does this.

@cpcloud I noticed how I had pandas 2.2.x in my environment, and you had 2.1.x in ibis, but I dismissed that as a cause because semantic versioning says it shouldn't break. Buttttttt, we all know how much to trust semantic versioning 😉 I ran python -m pip install pandas==2.1.4 in my app's environment, and the error went away. Thanks for the quick unblock!

2.2.x: datetime64[D] no longer supported

Do you know if this was an explicit choice, or a mistake — did they accidentally leave this unit out of that conversion/verification logic? I would love to see their reasoning for not supporting it. I would really like them to support it — how else are we supposed to represent dates in pandas?

tswast commented 7 months ago

2.2.x: datetime64[D] no longer supported

FWIW: this affected me too in https://github.com/googleapis/python-bigquery-dataframes/pull/492

stack trace
___________________ test_remote_function_stringify_with_ibis ___________________
[gw1] linux -- Python 3.11.6 /tmpfs/src/github/python-bigquery-dataframes/.nox/e2e/bin/python

session = 
scalars_table_id = 'bigframes-load-testing.bigframes_testing.scalars_269e578a0cb35c2ee0eedfef3d91d3fc'
ibis_client = 
dataset_id = 'bigframes-load-testing.bigframes_tests_system_20240322001149_109284_dataset_id'
bq_cf_connection = 'bigframes-rf-conn'

    @pytest.mark.flaky(retries=2, delay=120)
    def test_remote_function_stringify_with_ibis(
        session,
        scalars_table_id,
        ibis_client,
        dataset_id,
        bq_cf_connection,
    ):
        try:

            @session.remote_function(
                [int],
                str,
                dataset_id,
                bq_cf_connection,
                reuse=False,
            )
            def stringify(x):
                return f"I got {x}"

            project_id, dataset_name, table_name = scalars_table_id.split(".")
            if not ibis_client.dataset:
                ibis_client.dataset = dataset_name

            col_name = "int64_col"
            table = ibis_client.tables[table_name]
            table = table.filter(table[col_name].notnull()).order_by("rowindex").head(10)
>           pandas_df_orig = table.execute()

[tests/system/large/test_remote_function.py:197](https://cs.corp.google.com/piper///depot/google3/tests/system/large/test_remote_function.py?l=197): 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
[.nox/e2e/lib/python3.11/site-packages/ibis/expr/types/core.py:324](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/expr/types/core.py?l=324): in execute
    return self._find_backend(use_default=True).execute(
[.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py:698](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py?l=698): in execute
    result = self.fetch_from_cursor(cursor, expr.as_table().schema())
[.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py:707](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/backends/bigquery/__init__.py?l=707): in fetch_from_cursor
    return PandasData.convert_table(df, schema)
[.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py:118](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py?l=118): in convert_table
    df[name] = cls.convert_column(series, dtype)
[.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py:135](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py?l=135): in convert_column
    result = convert_method(obj, dtype, pandas_type)
[.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py:201](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/ibis/formats/pandas.py?l=201): in convert_Date
    return s.astype(pandas_type).dt.date
[.nox/e2e/lib/python3.11/site-packages/pandas/core/generic.py:6640](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/generic.py?l=6640): in astype
    new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py:430](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py?l=430): in astype
    return self.apply(
[.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py:363](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/managers.py?l=363): in apply
    applied = getattr(b, f)(**kwargs)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/blocks.py:758](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/internals/blocks.py?l=758): in astype
    new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:237](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py?l=237): in astype_array_safe
    new_values = astype_array(values, dtype, copy=copy)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:182](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py?l=182): in astype_array
    values = _astype_nansafe(values, dtype, copy=copy)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py:110](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/dtypes/astype.py?l=110): in _astype_nansafe
    dta = DatetimeArray._from_sequence(arr, dtype=dtype)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:327](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py?l=327): in _from_sequence
    return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy)
[.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:354](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py?l=354): in _from_sequence_not_strict
    dtype = _validate_dt64_dtype(dtype)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

dtype = dtype('               raise ValueError(
                    f"Unexpected value for 'dtype': '{dtype}'. "
                    "Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', "
                    "'datetime64[ns]' or DatetimeTZDtype'."
                )
E               ValueError: Unexpected value for 'dtype': 'datetime64[D]'. Must be 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]' or DatetimeTZDtype'.

[.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py:2550](https://cs.corp.google.com/piper///depot/google3/.nox/e2e/lib/python3.11/site-packages/pandas/core/arrays/datetimes.py?l=2550): ValueError
=============================== warnings summary ===============================

I'm working around it by replacing table.execute() with sql = table.compile() ; pandas_df_orig = bigquery_client.query(sql).to_dataframe() which does the conversion to pandas in a different way.

cpcloud commented 6 months ago

Fixed by #8758.