I installed pandas from main alongside the latest release of PySpark (3.2.1) and ran the Woodwork unit tests. Two failing unit tests came up:
========================================================================= FAILURES =========================================================================
_________________________________________________________ test_value_counts[categorical_df_spark] __________________________________________________________
categorical_df = ints categories1 bools categories2 categories3
0 1 1 True test test
1 2 ... <NA> <NA>
8 <NA> 3 False <NA> <NA>
9 <NA> 100 False <NA> <NA>
def test_value_counts(categorical_df):
    logical_types = {
        "ints": IntegerNullable,
        "categories1": Categorical,
        "bools": Boolean,
        "categories2": Categorical,
        "categories3": Categorical,
    }
    categorical_df.ww.init(logical_types=logical_types)
    val_cts = categorical_df.ww.value_counts()
    for col in categorical_df.ww.columns:
        if col in ["ints", "bools"]:
            assert col not in val_cts
        else:
            assert col in val_cts
    expected_cat1 = [
        {"value": 200, "count": 4},
        {"value": 100, "count": 3},
        {"value": 1, "count": 2},
        {"value": 3, "count": 1},
    ]
    # Spark converts numeric categories to strings, so we need to update the expected values for this
    # Spark will result in `None` instead of `np.nan` in categorical columns
    if _is_spark_dataframe(categorical_df):
        updated_results = []
        for items in expected_cat1:
            updated_results.append(
                {k: (str(v) if k == "value" else v) for k, v in items.items()}
            )
        expected_cat1 = updated_results
    assert val_cts["categories1"] == expected_cat1
>   assert val_cts["categories2"] == [
        {"value": np.nan, "count": 6},
        {"value": "test", "count": 3},
        {"value": "test2", "count": 1},
    ]
woodwork/tests/accessor/test_statistics.py:1295:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> ???
E TypeError: boolean value of NA is ambiguous
pandas/_libs/missing.pyx:382: TypeError
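This looks like what happens when a `pd.NA` value ends up being compared against `np.nan` inside the expected list of dicts: dict equality forces `bool(pd.NA)`, which raises exactly this `TypeError` from `pandas/_libs/missing.pyx`. A minimal sketch of that failure mode (whether this is the precise code path Spark triggers here is an assumption):

```python
import numpy as np
import pandas as pd

# Comparing a dict holding pd.NA against one holding np.nan makes Python
# evaluate the truthiness of (pd.NA == np.nan), which is pd.NA, and
# bool(pd.NA) raises "boolean value of NA is ambiguous".
try:
    [{"value": pd.NA, "count": 6}] == [{"value": np.nan, "count": 6}]
except TypeError as err:
    print(err)  # boolean value of NA is ambiguous
```

If the Spark backend now returns `pd.NA` (or `None`) instead of `np.nan` for missing categorical values, the hard-coded expected value in the test would hit this comparison.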
______________________________________________ test_get_invalid_schema_message_index_checks[sample_df_pandas] ______________________________________________
sample_df = id full_name email ... datetime_with_NaT url ... NaT http://google.com NaN
[4 rows x 16 columns]
def test_get_invalid_schema_message_index_checks(sample_df):
    if not isinstance(sample_df, pd.DataFrame):
        pytest.xfail("Index validation not performed for Dask or Spark DataFrames")
    schema_df = sample_df.copy()
    schema_df.ww.init(
        name="test_schema",
        index="id",
        logical_types={"id": "Double", "full_name": "PersonFullName"},
    )
    schema = schema_df.ww.schema
    different_underlying_index_df = schema_df.copy()
    different_underlying_index_df["id"] = pd.Series([9, 8, 7, 6], dtype="float64")
    assert (
        get_invalid_schema_message(different_underlying_index_df, schema)
        == "Index mismatch between DataFrame and typing information"
    )
    not_unique_df = schema_df.replace({3: 1})
    not_unique_df.index = not_unique_df["id"]
    not_unique_df.index.name = None
    assert (
        get_invalid_schema_message(not_unique_df, schema)
        == "Index column is not unique"
    )
    df = pd.DataFrame(
        {
            "id": pd.Series([5, 4, 3, 2], dtype="float64"),
            "col": pd.Series(["b", "b", "b", "d"], dtype="category"),
        }
    )
    df.ww.init(index="id")
    df_schema = df.ww.schema
    nan_df = df.replace({3: None})
    nan_df["id"] = nan_df["id"].astype("float64")
    nan_df = nan_df.set_index("id", drop=False)
    actual = get_invalid_schema_message(nan_df, df_schema)
>   assert actual == "Index contains null values"
E AssertionError: assert 'dtype mismat...ype, category' == 'Index contains null values'
E - Index contains null values
E + dtype mismatch for column col between DataFrame dtype, object, and Categorical dtype, category
woodwork/tests/utils/test_accessor_utils.py:276: AssertionError
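The assertion message suggests that on this pandas build, `df.replace({3: None})` coerces the categorical `col` column to `object` dtype, so the dtype-mismatch check fires before the null-index check is ever reached. A small sketch of what appears to be happening (the exact pandas behavior change is an assumption):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "id": pd.Series([5, 4, 3, 2], dtype="float64"),
        "col": pd.Series(["b", "b", "b", "d"], dtype="category"),
    }
)

# On the pandas build under test, replace() seems to return "col" as object
# dtype instead of keeping it categorical, which matches the
# "dtype mismatch for column col" message in the assertion error.
nan_df = df.replace({3: None})
print(nan_df.dtypes)
```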