MrPowers / chispa

PySpark test helper methods with beautiful error messages
https://mrpowers.github.io/chispa/
MIT License
595 stars 65 forks source link

underline_cells failing if dataframes are different lengths #83

Open aodj opened 11 months ago

aodj commented 11 months ago

When the underline_cells flag is set in assert_df_equality and the two dataframes have different numbers of rows, the assertion function raises a TypeError instead of the expected DataFramesNotEqualError.

from decimal import Decimal

from pyspark.sql import SparkSession
from pyspark.sql import types as T

from chispa.dataframe_comparer import assert_df_equality

spark = SparkSession.builder.getOrCreate()

schema = T.StructType(
    [
        T.StructField("id", T.StringType(), nullable=False),
        T.StructField("balance", T.DecimalType(38,6), nullable=True),
    ]
)

df1 = spark.createDataFrame(
    [
        [1, None],
        [2, Decimal(1.0)],
    ],
    schema=schema,
)
df2 = spark.createDataFrame(
    [
        [1, None],
        [2, Decimal(1.0)],
        [3, Decimal(100)],
    ],
    schema=schema,
)

This gives two dataframes, with different row counts:

df1.show()
+---+--------+
| id| balance|
+---+--------+
|  1|    null|
|  2|1.000000|
+---+--------+

df2.show()
+---+----------+
| id|   balance|
+---+----------+
|  1|      null|
|  2|  1.000000|
|  3|100.000000|
+---+----------+

Calling assert_df_equality without underline_cells raises the expected DataFramesNotEqualError with a side-by-side comparison:

assert_df_equality(df1, df2)
---------------------------------------------------------------------------
DataFramesNotEqualError                   Traceback (most recent call last)
Cell In [16], line 1
----> 1 assert_df_equality(df1, df2)

File /opt/conda/lib/python3.9/site-packages/chispa/dataframe_comparer.py:27, in assert_df_equality(df1, df2, ignore_nullable, transforms, allow_nan_equality, ignore_column_order, ignore_row_order, underline_cells, ignore_metadata)
     24     assert_generic_rows_equality(
     25         df1.collect(), df2.collect(), are_rows_equal_enhanced, [True], underline_cells=underline_cells)
     26 else:
---> 27     assert_basic_rows_equality(
     28         df1.collect(), df2.collect(), underline_cells=underline_cells)

File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:25, in assert_basic_rows_equality(rows1, rows2, underline_cells)
     23         else:
     24             t.add_row([r1, r2])
---> 25 raise chispa.DataFramesNotEqualError("\n" + t.get_string())

DataFramesNotEqualError: 
+------------------------------------------+--------------------------------------------+
|                   df1                    |                    df2                     |
+------------------------------------------+--------------------------------------------+
|        Row(id='1', balance=None)         |         Row(id='1', balance=None)          |
| Row(id='2', balance=Decimal('1.000000')) |  Row(id='2', balance=Decimal('1.000000'))  |
|                   None                   | Row(id='3', balance=Decimal('100.000000')) |
+------------------------------------------+--------------------------------------------+

However, passing underline_cells=True raises a TypeError instead of the comparison:

assert_df_equality(df1, df2, underline_cells=True)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In [17], line 1
----> 1 assert_df_equality(df1, df2, underline_cells=True)

File /opt/conda/lib/python3.9/site-packages/chispa/dataframe_comparer.py:27, in assert_df_equality(df1, df2, ignore_nullable, transforms, allow_nan_equality, ignore_column_order, ignore_row_order, underline_cells, ignore_metadata)
     24     assert_generic_rows_equality(
     25         df1.collect(), df2.collect(), are_rows_equal_enhanced, [True], underline_cells=underline_cells)
     26 else:
---> 27     assert_basic_rows_equality(
     28         df1.collect(), df2.collect(), underline_cells=underline_cells)

File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:21, in assert_basic_rows_equality(rows1, rows2, underline_cells)
     19 else:
     20     if underline_cells:
---> 21         t.add_row(__underline_cells_in_row(
     22             r1=r1, r2=r2, row_column_names=row_column_names, num_columns=num_columns))
     23     else:
     24         t.add_row([r1, r2])

File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:73, in __underline_cells_in_row(r1, r2, row_column_names, num_columns)
     70 else:
     71     append_str = ", "
---> 73 if r1[column] != r2[column]:
     74     r1_string += underline_text(
     75         f"{column}='{r1[column]}'") + f"{append_str}"
     76     r2_string += underline_text(
     77         f"{column}='{r2[column]}'") + f"{append_str}"

TypeError: 'NoneType' object is not subscriptable
aodj commented 11 months ago

For reference, this is using chispa 0.9.4 with Spark 3.3.2 on Python 3.9.13.