test_kill_before_submit_is_finished fails

sondreso commented 1 month ago

Describe the bug

=================================== FAILURES ===================================
_____________________ test_kill_before_submit_is_finished ______________________

tmp_path = PosixPath('/private/f_scout_ci/pytest-tmp/tmp.MZAdampEHE/test_kill_before_submit_is_fin0')
monkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x7fa6170e2d30>
caplog = <_pytest.logging.LogCaptureFixture object at 0x7fa6170fb6a0>
pytestconfig = <_pytest.config.Config object at 0x7fa63c4f3970>

    async def test_kill_before_submit_is_finished(
        tmp_path, monkeypatch, caplog, pytestconfig
    ):
        os.chdir(tmp_path)

        if pytestconfig.getoption("lsf"):
            # Allow more time when tested on a real compute cluster to avoid false positives.
            job_kill_window = 10
            test_grace_time = 20
        elif sys.platform.startswith("darwin"):
            # Mitigate flakiness on low-power test nodes
            job_kill_window = 5
            test_grace_time = 10
        else:
            job_kill_window = 1
            test_grace_time = 2

        bin_path = tmp_path / "bin"
        bin_path.mkdir()
        monkeypatch.setenv("PATH", f"{bin_path}:{os.environ['PATH']}")
        bsub_path = bin_path / "slow_bsub"
        bsub_path.write_text(
            "#!/bin/sh\nsleep 0.1\nbsub $@",
            encoding="utf-8",
        )
        bsub_path.chmod(bsub_path.stat().st_mode | stat.S_IEXEC)

        caplog.set_level(logging.DEBUG)
        driver = LsfDriver(bsub_cmd="slow_bsub")

        # Allow submit and kill to be interleaved by asyncio by issuing
        # submit() in its own asyncio Task:
        asyncio.create_task(
            driver.submit(
                # The sleep is the time window in which we can kill the job before
                # the unwanted finish message appears on disk.
                0,
                "sh",
                "-c",
                f"sleep {job_kill_window}; touch {tmp_path}/survived",
            )
        )
        await asyncio.sleep(0.01)  # Allow submit task to start executing
        await driver.kill(0)  # This will wait until the submit is done and then kill

        async def finished(iens: int, returncode: int):
            SIGTERM = 15
            assert iens == 0
            # If the kill is issued before the job really starts, you will not
            # get SIGTERM but rather LSF_FAILED_JOB. We should accept both.
            assert returncode in (SIGNAL_OFFSET + SIGTERM, LSF_FAILED_JOB)

>       await poll(driver, {0}, finished=finished)

/tmp/f_scout_ci/actions-runner-07/_temp/test_root/tests/integration_tests/scheduler/test_lsf_driver.py:281: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/tmp/f_scout_ci/actions-runner-07/_temp/test_root/tests/utils.py:134: in poll
    await finished(event.iens, event.returncode)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

iens = 0, returncode = 15

    async def finished(iens: int, returncode: int):
        SIGTERM = 15
        assert iens == 0
        # If the kill is issued before the job really starts, you will not
        # get SIGTERM but rather LSF_FAILED_JOB. We should accept both.
>       assert returncode in (SIGNAL_OFFSET + SIGTERM, LSF_FAILED_JOB)
E       assert 15 in (143, 193)

/tmp/f_scout_ci/actions-runner-07/_temp/test_root/tests/integration_tests/scheduler/test_lsf_driver.py:279: AssertionError

Full log: https://github.com/equinor/komodo-releases/actions/runs/9491604303/job/26158039055

To reproduce

Steps to reproduce the behaviour:

TBD

Expected behaviour

The test should not fail.

Screenshots

N/A

Environment

OS: RHEL8
ERT/Komodo release: bleeding
Python version: 3.8
Remote/HPC execution involved: yes

Additional context

N/A

sondreso commented 1 month ago

Passed after retry

berland commented 1 month ago

@JHolba, could this be due to get_exit_code_from_bhist being in use, in which case SIGNAL_OFFSET is not added?

berland commented 1 month ago

> @JHolba, could this be due to get_exit_code_from_bhist being in use, in which case SIGNAL_OFFSET is not added?

Not super likely: both _get_exit_code and _get_exit_code_from_bhist return the raw number from LSF, and the offset is never added manually by the driver.

equinor / ert

test_kill_before_submit_is_finished fails #8142