Investigate Flaky fuzz testing

Recent CI runs have revealed a flaky test in tests/test_fuzz.py.

Goal: Stabilize the test. A flaky test is worse than a consistent failure: a failing test is always visible, whereas a flaky test passes intermittently and can hide a real problem behind a false success.

Details:

The flaky test is test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input()

The failure scenarios can be reproduced with:

@reproduce_failure('6.119.4', b'AAEAAQABAAA=')

Logs:

hypothesis.errors.DeadlineExceeded: Test took 411.69ms, which exceeds the deadline of 300.00ms
Falsifying example: test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input(
    self=<tests.test_fuzz.HypothesisTestSuite testMethod=test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input>,
    text='000',
)

You can reproduce this example by temporarily adding @reproduce_failure('6.119.4', b'AAEAAQABAAA=') as a decorator on your test case
self = <tests.test_fuzz.HypothesisTestSuite testMethod=test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input>

    @given(st.text(alphabet=string.ascii_letters + string.digits, min_size=3, max_size=15))
>   @settings(deadline=300)

tests/test_fuzz.py:129: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

args = (<tests.test_fuzz.HypothesisTestSuite testMethod=test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input>, '000')
kwargs = {}, arg_drawtime = 0.0009458039999117318, arg_stateful = 0.0
arg_gctime = 0.08574491100171144, start = 3222.139485013, result = None
finish = 3222.551171842, in_drawtime = 0.0, in_stateful = 0.0, in_gctime = 0.0
runtime = 0.41168682899979103

    @proxies(self.test)
    def test(*args, **kwargs):
        arg_drawtime = math.fsum(data.draw_times.values())
        arg_stateful = math.fsum(data._stateful_run_times.values())
        arg_gctime = gc_cumulative_time()
        start = time.perf_counter()
        try:
            with unwrap_markers_from_group(), ensure_free_stackframes():
                result = self.test(*args, **kwargs)
        finally:
            finish = time.perf_counter()
            in_drawtime = math.fsum(data.draw_times.values()) - arg_drawtime
            in_stateful = (
                math.fsum(data._stateful_run_times.values()) - arg_stateful
            )
            in_gctime = gc_cumulative_time() - arg_gctime
            runtime = finish - start - in_drawtime - in_stateful - in_gctime
            self._timing_features = {
                "execute:test": runtime,
                "overall:gc": in_gctime,
                **data.draw_times,
                **data._stateful_run_times,
            }

        if (current_deadline := self.settings.deadline) is not None:
            if not is_final:
                current_deadline = (current_deadline // 4) * 5
            if runtime >= current_deadline.total_seconds():
>               raise DeadlineExceeded(
                    datetime.timedelta(seconds=runtime), self.settings.deadline
                )
E               hypothesis.errors.DeadlineExceeded: Test took 411.69ms, which exceeds the deadline of 300.00ms
E               Falsifying example: test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input(
E                   self=<tests.test_fuzz.HypothesisTestSuite testMethod=test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input>,
E                   text='000',
E               )
E               
E               You can reproduce this example by temporarily adding @reproduce_failure('6.119.4', b'AAEAAQABAAA=') as a decorator on your test case

../.local/lib/python3.12/site-packages/hypothesis/core.py:906: DeadlineExceeded

https://app.circleci.com/pipelines/github/reactive-firewall/multicast/593/workflows/421365ec-59b3-4205-8b6e-4e2c6d029bd3/jobs/2278/tests

@reproduce_failure('6.119.4', b'AAERASwBCgA=')

Logs:

hypothesis.errors.DeadlineExceeded: Test took 335.45ms, which exceeds the deadline of 300.00ms
Falsifying example: test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input(
    self=<tests.test_fuzz.HypothesisTestSuite testMethod=test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input>,
    text='HiA',
)

You can reproduce this example by temporarily adding @reproduce_failure('6.119.4', b'AAERASwBCgA=') as a decorator on your test case
self = <tests.test_fuzz.HypothesisTestSuite testMethod=test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input>

    @given(st.text(alphabet=string.ascii_letters + string.digits, min_size=3, max_size=15))
>   @settings(deadline=300)

tests/test_fuzz.py:129: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

args = (<tests.test_fuzz.HypothesisTestSuite testMethod=test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input>, 'HiA')
kwargs = {}, arg_drawtime = 0.00046866299999237526, arg_stateful = 0.0
arg_gctime = 0.027832944000692805, start = 6909.080621266, result = None
finish = 6909.416068751, in_drawtime = 0.0, in_stateful = 0.0, in_gctime = 0.0
runtime = 0.3354474850002589

    @proxies(self.test)
    def test(*args, **kwargs):
        arg_drawtime = math.fsum(data.draw_times.values())
        arg_stateful = math.fsum(data._stateful_run_times.values())
        arg_gctime = gc_cumulative_time()
        start = time.perf_counter()
        try:
            with unwrap_markers_from_group(), ensure_free_stackframes():
                result = self.test(*args, **kwargs)
        finally:
            finish = time.perf_counter()
            in_drawtime = math.fsum(data.draw_times.values()) - arg_drawtime
            in_stateful = (
                math.fsum(data._stateful_run_times.values()) - arg_stateful
            )
            in_gctime = gc_cumulative_time() - arg_gctime
            runtime = finish - start - in_drawtime - in_stateful - in_gctime
            self._timing_features = {
                "execute:test": runtime,
                "overall:gc": in_gctime,
                **data.draw_times,
                **data._stateful_run_times,
            }

        if (current_deadline := self.settings.deadline) is not None:
            if not is_final:
                current_deadline = (current_deadline // 4) * 5
            if runtime >= current_deadline.total_seconds():
>               raise DeadlineExceeded(
                    datetime.timedelta(seconds=runtime), self.settings.deadline
                )
E               hypothesis.errors.DeadlineExceeded: Test took 335.45ms, which exceeds the deadline of 300.00ms
E               Falsifying example: test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input(
E                   self=<tests.test_fuzz.HypothesisTestSuite testMethod=test_invalid_Error_WHEN_cli_called_GIVEN_invalid_fuzz_input>,
E                   text='HiA',
E               )
E               
E               You can reproduce this example by temporarily adding @reproduce_failure('6.119.4', b'AAERASwBCgA=') as a decorator on your test case

../.local/lib/python3.12/site-packages/hypothesis/core.py:906: DeadlineExceeded

reactive-firewall / multicast

Investigate Flaky fuzz testing #217