alteryx / woodwork

Woodwork is a Python library that provides robust methods for managing and communicating data typing information.
https://woodwork.alteryx.com
BSD 3-Clause "New" or "Revised" License
145 stars 20 forks source link

Spike: Investigate PySpark bug with pandas 1.4.X #1423

Closed gsheni closed 2 years ago

gsheni commented 2 years ago

I installed pandas from the `main` branch together with the latest PySpark release (3.2.1) and ran the Woodwork unit tests. Two unit tests failed:

========================================================================= FAILURES =========================================================================
_________________________________________________________ test_value_counts[categorical_df_spark] __________________________________________________________

categorical_df =    ints categories1  bools categories2 categories3
0     1           1   True        test        test
1     2         ... <NA>        <NA>
8  <NA>           3  False        <NA>        <NA>
9  <NA>         100  False        <NA>        <NA>

    def test_value_counts(categorical_df):
        logical_types = {
            "ints": IntegerNullable,
            "categories1": Categorical,
            "bools": Boolean,
            "categories2": Categorical,
            "categories3": Categorical,
        }
        categorical_df.ww.init(logical_types=logical_types)
        val_cts = categorical_df.ww.value_counts()
        for col in categorical_df.ww.columns:
            if col in ["ints", "bools"]:
                assert col not in val_cts
            else:
                assert col in val_cts

        expected_cat1 = [
            {"value": 200, "count": 4},
            {"value": 100, "count": 3},
            {"value": 1, "count": 2},
            {"value": 3, "count": 1},
        ]
        # Spark converts numeric categories to strings, so we need to update the expected values for this
        # Spark will result in `None` instead of `np.nan` in categorical columns
        if _is_spark_dataframe(categorical_df):
            updated_results = []
            for items in expected_cat1:
                updated_results.append(
                    {k: (str(v) if k == "value" else v) for k, v in items.items()}
                )
            expected_cat1 = updated_results

        assert val_cts["categories1"] == expected_cat1
>       assert val_cts["categories2"] == [
            {"value": np.nan, "count": 6},
            {"value": "test", "count": 3},
            {"value": "test2", "count": 1},
        ]

woodwork/tests/accessor/test_statistics.py:1295:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

>   ???
E   TypeError: boolean value of NA is ambiguous

pandas/_libs/missing.pyx:382: TypeError
______________________________________________ test_get_invalid_schema_message_index_checks[sample_df_pandas] ______________________________________________

sample_df =    id         full_name                   email  ... datetime_with_NaT                                  url           ...               NaT                    http://google.com                                      NaN

[4 rows x 16 columns]

    def test_get_invalid_schema_message_index_checks(sample_df):
        if not isinstance(sample_df, pd.DataFrame):
            pytest.xfail("Index validation not performed for Dask or Spark DataFrames")

        schema_df = sample_df.copy()
        schema_df.ww.init(
            name="test_schema",
            index="id",
            logical_types={"id": "Double", "full_name": "PersonFullName"},
        )
        schema = schema_df.ww.schema

        different_underlying_index_df = schema_df.copy()
        different_underlying_index_df["id"] = pd.Series([9, 8, 7, 6], dtype="float64")
        assert (
            get_invalid_schema_message(different_underlying_index_df, schema)
            == "Index mismatch between DataFrame and typing information"
        )

        not_unique_df = schema_df.replace({3: 1})
        not_unique_df.index = not_unique_df["id"]
        not_unique_df.index.name = None
        assert (
            get_invalid_schema_message(not_unique_df, schema)
            == "Index column is not unique"
        )

        df = pd.DataFrame(
            {
                "id": pd.Series([5, 4, 3, 2], dtype="float64"),
                "col": pd.Series(["b", "b", "b", "d"], dtype="category"),
            }
        )
        df.ww.init(index="id")
        df_schema = df.ww.schema

        nan_df = df.replace({3: None})
        nan_df["id"] = nan_df["id"].astype("float64")
        nan_df = nan_df.set_index("id", drop=False)
        actual = get_invalid_schema_message(nan_df, df_schema)
>       assert actual == "Index contains null values"
E       AssertionError: assert 'dtype mismat...ype, category' == 'Index contains null values'
E         - Index contains null values
E         + dtype mismatch for column col between DataFrame dtype, object, and Categorical dtype, category

woodwork/tests/utils/test_accessor_utils.py:276: AssertionError