arviz-devs / arviz-plots

ArviZ modular plotting
https://arviz-plots.readthedocs.io
Apache License 2.0

Unexpected test_plot_dist slowness #50

Open · imperorrp opened 3 months ago

imperorrp commented 3 months ago

`test_plot_dist` seems to run unexpectedly slowly sometimes. Here is the log:

```
================================================================================ FAILURES =================================================================================
_____________________________________________________________________________ test_plot_dist ______________________________________________________________________________

datatree = DataTree('None', parent=None)
├── DataTree('posterior')
│       Dimensions:  (chain: 3, draw: 50, hierarchy: 2, grou...n_library: ArviZ
    creation_library_version:   0.1.0
    creation_library_language:  Python
kind = 'kde', ci_kind = 'eti', point_estimate = 'mean', plot_kwargs = {}

    @given(
>       plot_kwargs=st.fixed_dictionaries(
            {},
            optional={
                "kind": plot_kwargs_value,
                "credible_interval": plot_kwargs_value,
                "point_estimate": plot_kwargs_value,
                "point_estimate_text": plot_kwargs_value,
                "title": plot_kwargs_value,
                "remove_axis": st.just(False),
            },
        ),
        kind=kind_value,
        ci_kind=ci_kind_value,
        point_estimate=point_estimate_value,
    )

tests\test_hypothesis_plots.py:47:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

args = (DataTree('None', parent=None)
├── DataTree('posterior')
│       Dimensions:  (chain: 3, draw: 50, hierarchy: 2, gro... creation_library_version: 0.1.0 creation_library_language: Python, 'kde', 'eti', 'mean', {})
kwargs = {}, arg_drawtime = 0.0021275999606586993, start = 313391.6916042, result = None, finish = 313392.9458841, in_drawtime = 0.0
runtime = datetime.timedelta(seconds=1, microseconds=254280), current_deadline = datetime.timedelta(seconds=1, microseconds=250000)

    @proxies(self.test)
    def test(*args, **kwargs):
        arg_drawtime = math.fsum(data.draw_times.values())
        start = time.perf_counter()
        try:
            result = self.test(*args, **kwargs)
        finally:
            finish = time.perf_counter()
            in_drawtime = math.fsum(data.draw_times.values()) - arg_drawtime
            runtime = datetime.timedelta(seconds=finish - start - in_drawtime)
            self._timing_features = {
                "execute:test": finish - start - in_drawtime,
                **data.draw_times,
                **data._stateful_run_times,
            }

        if (current_deadline := self.settings.deadline) is not None:
            if not is_final:
                current_deadline = (current_deadline // 4) * 5
            if runtime >= current_deadline:
>               raise DeadlineExceeded(runtime, self.settings.deadline)
E               hypothesis.errors.DeadlineExceeded: Test took 1254.28ms, which exceeds the deadline of 1000.00ms

.tox\py311\Lib\site-packages\hypothesis\core.py:846: DeadlineExceeded

The above exception was the direct cause of the following exception:

datatree = DataTree('None', parent=None)
├── DataTree('posterior')
│       Dimensions:  (chain: 3, draw: 50, hierarchy: 2, grou...n_library: ArviZ
    creation_library_version:   0.1.0
    creation_library_language:  Python

    @given(
>       plot_kwargs=st.fixed_dictionaries(
            {},
            optional={
                "kind": plot_kwargs_value,
                "credible_interval": plot_kwargs_value,
                "point_estimate": plot_kwargs_value,
                "point_estimate_text": plot_kwargs_value,
                "title": plot_kwargs_value,
                "remove_axis": st.just(False),
            },
        ),
        kind=kind_value,
        ci_kind=ci_kind_value,
        point_estimate=point_estimate_value,
    )

tests\test_hypothesis_plots.py:47:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = , data = ConjectureData(VALID, 4 bytes, frozen)

    def execute_once(
        self,
        data,
        *,
        print_example=False,
        is_final=False,
        expected_failure=None,
        example_kwargs=None,
    ):
        """Run the test function once, using ``data`` as input.

        If the test raises an exception, it will propagate through to the
        caller of this method. Depending on its type, this could represent an
        ordinary test failure, or a fatal error, or a control exception. If
        this method returns normally, the test might have passed, or it might
        have placed ``data`` in an unsuccessful state and then swallowed the
        corresponding control exception.
        """
        self.ever_executed = True
        data.is_find = self.is_find
        self._string_repr = ""
        text_repr = None
        if self.settings.deadline is None and not TESTCASE_CALLBACKS:
            test = self.test
        else:

            @proxies(self.test)
            def test(*args, **kwargs):
                arg_drawtime = math.fsum(data.draw_times.values())
                start = time.perf_counter()
                try:
                    result = self.test(*args, **kwargs)
                finally:
                    finish = time.perf_counter()
                    in_drawtime = math.fsum(data.draw_times.values()) - arg_drawtime
                    runtime = datetime.timedelta(seconds=finish - start - in_drawtime)
                    self._timing_features = {
                        "execute:test": finish - start - in_drawtime,
                        **data.draw_times,
                        **data._stateful_run_times,
                    }

                if (current_deadline := self.settings.deadline) is not None:
                    if not is_final:
                        current_deadline = (current_deadline // 4) * 5
                    if runtime >= current_deadline:
                        raise DeadlineExceeded(runtime, self.settings.deadline)
                return result

        def run(data):
            # Set up dynamic context needed by a single test run.
            if self.stuff.selfy is not None:
                data.hypothesis_runner = self.stuff.selfy
            # Generate all arguments to the test function.
            args = self.stuff.args
            kwargs = dict(self.stuff.kwargs)
            if example_kwargs is None:
                kw, argslices = context.prep_args_kwargs_from_strategies(
                    self.stuff.given_kwargs
                )
            else:
                kw = example_kwargs
                argslices = {}
            kwargs.update(kw)
            if expected_failure is not None:
                nonlocal text_repr
                text_repr = repr_call(test, args, kwargs)
                if text_repr in self.xfail_example_reprs:
                    warnings.warn(
                        f"We generated {text_repr}, which seems identical "
                        "to one of your `@example(...).xfail()` cases. "
                        "Revise the strategy to avoid this overlap?",
                        HypothesisWarning,
                        # Checked in test_generating_xfailed_examples_warns!
                        stacklevel=6,
                    )

            if print_example or current_verbosity() >= Verbosity.verbose:
                printer = RepresentationPrinter(context=context)
                if print_example:
                    printer.text("Falsifying example:")
                else:
                    printer.text("Trying example:")
                if self.print_given_args:
                    printer.text(" ")
                    printer.repr_call(
                        test.__name__,
                        args,
                        kwargs,
                        force_split=True,
                        arg_slices=argslices,
                        leading_comment=(
                            "# " + context.data.slice_comments[(0, 0)]
                            if (0, 0) in context.data.slice_comments
                            else None
                        ),
                    )
                report(printer.getvalue())

            if TESTCASE_CALLBACKS:
                printer = RepresentationPrinter(context=context)
                printer.repr_call(
                    test.__name__,
                    args,
                    kwargs,
                    force_split=True,
                    arg_slices=argslices,
                    leading_comment=(
                        "# " + context.data.slice_comments[(0, 0)]
                        if (0, 0) in context.data.slice_comments
                        else None
                    ),
                )
                self._string_repr = printer.getvalue()
                data._observability_arguments = {
                    **dict(enumerate(map(to_jsonable, args))),
                    **{k: to_jsonable(v) for k, v in kwargs.items()},
                }

            try:
                return test(*args, **kwargs)
            except TypeError as e:
                # If we sampled from a sequence of strategies, AND failed with a
                # TypeError, *AND that exception mentions SearchStrategy*, add a note:
                if "SearchStrategy" in str(e) and hasattr(
                    data, "_sampled_from_all_strategies_elements_message"
                ):
                    msg, format_arg = data._sampled_from_all_strategies_elements_message
                    add_note(e, msg.format(format_arg))
                raise
            finally:
                if parts := getattr(data, "_stateful_repr_parts", None):
                    self._string_repr = "\n".join(parts)

        # self.test_runner can include the execute_example method, or setup/teardown
        # _example, so it's important to get the PRNG and build context in place first.
        with local_settings(self.settings):
            with deterministic_PRNG():
                with BuildContext(data, is_final=is_final) as context:
                    # providers may throw in per_case_context_fn, and we'd like
                    # `result` to still be set in these cases.
                    result = None
                    with data.provider.per_test_case_context_manager():
                        # Run the test function once, via the executor hook.
                        # In most cases this will delegate straight to `run(data)`.
                        result = self.test_runner(data, run)

        # If a failure was expected, it should have been raised already, so
        # instead raise an appropriate diagnostic error.
        if expected_failure is not None:
            exception, traceback = expected_failure
            if isinstance(exception, DeadlineExceeded) and (
                runtime_secs := math.fsum(
                    v
                    for k, v in self._timing_features.items()
                    if k.startswith("execute:")
                )
            ):
                report(
                    "Unreliable test timings! On an initial run, this "
                    "test took %.2fms, which exceeded the deadline of "
                    "%.2fms, but on a subsequent run it took %.2f ms, "
                    "which did not. If you expect this sort of "
                    "variability in your test timings, consider turning "
                    "deadlines off for this test by setting deadline=None."
                    % (
                        exception.runtime.total_seconds() * 1000,
                        self.settings.deadline.total_seconds() * 1000,
                        runtime_secs * 1000,
                    )
                )
            else:
                report("Failed to reproduce exception. Expected: \n" + traceback)
>           raise Flaky(
                f"Hypothesis {text_repr} produces unreliable results: "
                "Falsified on the first call but did not on a subsequent one"
            ) from exception
E           hypothesis.errors.Flaky: Hypothesis test_plot_dist(datatree=DataTree('None', parent=None)
E           ├── DataTree('posterior')
E           │       Dimensions:    (chain: 3, draw: 50, hierarchy: 2, group: 3)
E           │       Coordinates:
E           │         * chain      (chain) int32 12B 0 1 2
E           │         * draw       (draw) int32 200B 0 1 2 3 4 5 6 7 8 ... 42 43 44 45 46 47 48 49
E           │         * hierarchy  (hierarchy) int32 8B 0 1
E           │         * group      (group) int32 12B 0 1 2
E           │       Data variables:
E           │           mu         (chain, draw) float64 1kB -0.3953 0.2639 0.6071 ... 1.442 -0.1601
E           │           theta      (chain, draw, hierarchy, group) float64 7kB 1.696 ... -0.3651
E           │           tau        (chain, draw, hierarchy) float64 2kB -0.3359 1.419 ... 0.8632
E           │       Attributes:
E           │           created_at:                 2024-06-04T10:51:37.302582+00:00
E           │           creation_library:           ArviZ
E           │           creation_library_version:   0.1.0
E           │           creation_library_language:  Python
E           └── DataTree('sample_stats')
E                   Dimensions:    (chain: 3, draw: 50)
E                   Coordinates:
E                     * chain      (chain) int32 12B 0 1 2
E                     * draw       (draw) int32 200B 0 1 2 3 4 5 6 7 8 ... 42 43 44 45 46 47 48 49
E                   Data variables:
E                       diverging  (chain, draw) bool 150B True False False ... False False False
E                   Attributes:
E                       created_at:                 2024-06-04T10:51:37.303584+00:00
E                       creation_library:           ArviZ
E                       creation_library_version:   0.1.0
E                       creation_library_language:  Python, kind='kde', ci_kind='eti', point_estimate='mean', plot_kwargs={}) produces unreliable results: Falsified on the first call but did not on a subsequent one
E           Falsifying example: test_plot_dist(
E               datatree=DataTree('None', parent=None)
E           ├── DataTree('posterior')
E           │       Dimensions:    (chain: 3, draw: 50, hierarchy: 2, group: 3)
E           │       Coordinates:
E           │         * chain      (chain) int32 12B 0 1 2
E           │         * draw       (draw) int32 200B 0 1 2 3 4 5 6 7 8 ... 42 43 44 45 46 47 48 49
E           │         * hierarchy  (hierarchy) int32 8B 0 1
E           │         * group      (group) int32 12B 0 1 2
E           │       Data variables:
E           │           mu         (chain, draw) float64 1kB -0.3953 0.2639 0.6071 ... 1.442 -0.1601
E           │           theta      (chain, draw, hierarchy, group) float64 7kB 1.696 ... -0.3651
E           │           tau        (chain, draw, hierarchy) float64 2kB -0.3359 1.419 ... 0.8632
E           │       Attributes:
E           │           created_at:                 2024-06-04T10:51:37.302582+00:00
E           │           creation_library:           ArviZ
E           │           creation_library_version:   0.1.0
E           │           creation_library_language:  Python
E           └── DataTree('sample_stats')
E                   Dimensions:    (chain: 3, draw: 50)
E                   Coordinates:
E                     * chain      (chain) int32 12B 0 1 2
E                     * draw       (draw) int32 200B 0 1 2 3 4 5 6 7 8 ... 42 43 44 45 46 47 48 49
E                   Data variables:
E                       diverging  (chain, draw) bool 150B True False False ... False False False
E                   Attributes:
E                       created_at:                 2024-06-04T10:51:37.303584+00:00
E                       creation_library:           ArviZ
E                       creation_library_version:   0.1.0
E                       creation_library_language:  Python,
E               plot_kwargs={},
E               kind='kde',
E               ci_kind='eti',
E               point_estimate='mean',
E           )
E           Unreliable test timings! On an initial run, this test took 1254.28ms, which exceeded the deadline of 1000.00ms, but on a subsequent run it took 238.95 ms, which did not. If you expect this sort of variability in your test timings, consider turning deadlines off for this test by setting deadline=None.

.tox\py311\Lib\site-packages\hypothesis\core.py:974: Flaky
========================================================================== slowest 10 durations ===========================================================================
5.28s call     tests/test_hypothesis_plots.py::test_plot_dist
2.05s call     tests/test_plots.py::TestPlots::test_plot_dist_models[kde-matplotlib]
1.39s call     tests/test_plots.py::TestPlots::test_plot_dist_models[ecdf-matplotlib]
1.01s call     tests/test_plots.py::TestPlots::test_plot_dist_sample[hist-matplotlib]
0.93s call     tests/test_plots.py::TestPlots::test_plot_dist[kde-matplotlib]
0.84s call     tests/test_plots.py::TestPlots::test_plot_dist[hist-matplotlib]
0.83s call     tests/test_plots.py::TestPlots::test_plot_dist[ecdf-matplotlib]
0.67s call     tests/test_plots.py::TestPlots::test_plot_dist_sample[kde-matplotlib]
0.67s call     tests/test_plots.py::TestPlots::test_plot_dist_sample[ecdf-matplotlib]
0.62s call     tests/test_plots.py::TestPlots::test_plot_dist_models[kde-none]
========================================================================= short test summary info =========================================================================
FAILED tests/test_hypothesis_plots.py::test_plot_dist - hypothesis.errors.Flaky: Hypothesis test_plot_dist(datatree=DataTree('None', parent=None)
========================================================= 1 failed, 16 passed, 8 skipped, 91 deselected in 24.90s =========================================================
```
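
Two details from the log are worth noting. Hypothesis grants non-final runs a 25% grace margin (`current_deadline = (current_deadline // 4) * 5` turns the 1000ms deadline into 1250ms), and the first run's 1254.28ms still exceeded it, while the rerun took only 238.95ms. That gap, together with `test_plot_dist` topping the durations table at 5.28s, suggests the first example is paying one-off setup cost rather than the test being genuinely slow. The log's own suggestion is to relax or disable the per-example deadline via `settings`. A minimal sketch of that workaround follows; the strategy and test body are stand-ins, not the actual code in tests/test_hypothesis_plots.py:

```python
from hypothesis import given, settings, strategies as st

# Stand-in strategy; the real test draws plot_kwargs, kind, ci_kind, and
# point_estimate from dedicated strategies.
kind_choices = st.sampled_from(["kde", "hist", "ecdf"])

@settings(deadline=None)  # or e.g. @settings(deadline=2000) for a looser bound
@given(kind=kind_choices)
def test_plot_dist_deadline_free(kind):
    # Placeholder body standing in for the real plot_dist(...) call.
    assert kind in ("kde", "hist", "ecdf")
```

An integer `deadline` is interpreted as milliseconds, and `deadline=None` disables the check entirely, which is what Hypothesis itself recommends when this kind of timing variability is expected.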