coiled / benchmarks

BSD 3-Clause "New" or "Revised" License
28 stars 17 forks source link

[TPC-H] Query 21 returns wrong (empty) result #1366

Closed hendrikmakait closed 7 months ago

hendrikmakait commented 7 months ago

Failing CI job: https://github.com/coiled/benchmarks/actions/runs/7822955063/job/21343035360#step:12:225

____________________________ test_dask_results[21] _____________________________
[gw0] linux -- Python 3.11.7 /usr/share/miniconda3/envs/test/bin/python3.11

query = 21, local = False
answers_path = PosixPath('/tmp/pytest-of-runner/pytest-0/popen-gw0/answers0/answers/scale-1')
client = <Client: 'tls://10.0.23.114:8786' processes=2 threads=4, memory=14.30 GiB>

    @pytest.mark.tpch_correctness
    @pytest.mark.parametrize(
        "query",
        [
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10,
            11,
            12,
            13,
            14,
            15,
            16,
            17,
            18,
            19,
            20,
            21,
            22,
        ],
    )
    def test_dask_results(query, local, answers_path, client):
        from . import dask_queries

        func = getattr(dask_queries, f"query_{query}")
        result = func(get_dataset_path(local, VERIFICATION_SCALE), None).compute()
>       verify_result(result, query, answers_path)

tests/tpch/test_correctness.py:143: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

result = Empty DataFrame
Columns: [s_name, numwait]
Index: [], query = 21
answer_dir = PosixPath('/tmp/pytest-of-runner/pytest-0/popen-gw0/answers0/answers/scale-1')

    def verify_result(result: pd.DataFrame, query: int, answer_dir: pathlib.Path):
        expected = get_expected_answer(query, answer_dir)
        result = result.reset_index(drop=True)

        # The expected answers are provided as whitespace-padded pipe-separated data.
        # We must therefore strip both the expected as well as the actual answer.
        for column, dtype in expected.dtypes.items():
            if pd.api.types.is_object_dtype(dtype):
                expected[column] = expected[column].apply(lambda x: x.strip())
                result[column] = result[column].apply(lambda x: x.strip())
>       pd.testing.assert_frame_equal(result, expected, check_dtype=False, atol=1e-3)
E       AssertionError: DataFrame are different
E       
E       DataFrame shape mismatch
E       [left]:  (0, 2)
E       [right]: (100, 2)

tests/tpch/test_correctness.py:107: AssertionError
hendrikmakait commented 7 months ago

cc @phofl

phofl commented 7 months ago

This should be fixed now.