huggingface / evaluate

🤗 Evaluate: A library for easily evaluating machine learning models and datasets.
https://huggingface.co/docs/evaluate
Apache License 2.0

LocalModuleTest.test_load_metric_code_eval fails with "The "code_eval" metric executes untrusted model-generated code in Python." #597

Open · jpodivin opened this issue 4 months ago

jpodivin commented 4 months ago

Environment:

Python: 3.8
evaluate: a4bdc10c48a450b978d91389a48dbb5297835c7d
OS: Ubuntu 24

Trace:


=========================================================== FAILURES ============================================================
__________________________________________ LocalModuleTest.test_load_metric_code_eval ___________________________________________
[gw0] linux -- Python 3.8.19 /var/git/repos/evaluate/.venv/bin/python

self = <tests.test_metric_common.LocalModuleTest testMethod=test_load_metric_code_eval>, evaluation_module_name = 'code_eval'
evaluation_module_type = 'metric'

    def test_load(self, evaluation_module_name, evaluation_module_type):
        doctest.ELLIPSIS_MARKER = "[...]"
        evaluation_module = importlib.import_module(
            evaluate.loading.evaluation_module_factory(
                os.path.join(evaluation_module_type + "s", evaluation_module_name), module_type=evaluation_module_type
            ).module_path
        )
        evaluation_instance = evaluate.loading.import_main_class(evaluation_module.__name__)
        # check parameters
        parameters = inspect.signature(evaluation_instance._compute).parameters
        self.assertTrue(all([p.kind != p.VAR_KEYWORD for p in parameters.values()]))  # no **kwargs
        # run doctest
        with self.patch_intensive_calls(evaluation_module_name, evaluation_module.__name__):
            with self.use_local_metrics(evaluation_module_type):
                try:
>                   results = doctest.testmod(evaluation_module, verbose=True, raise_on_error=True)

tests/test_metric_common.py:117: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/lib/python3.8/doctest.py:1956: in testmod
    runner.run(test)
/usr/lib/python3.8/doctest.py:1844: in run
    r = DocTestRunner.run(self, test, compileflags, out, False)
/usr/lib/python3.8/doctest.py:1483: in run
    return self.__run(test, compileflags, out)
/usr/lib/python3.8/doctest.py:1388: in __run
    self.report_unexpected_exception(out, test, example,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <doctest.DebugRunner object at 0x785042c7ebe0>
out = <built-in method write of _io.TextIOWrapper object at 0x7851f6fd0520>
test = <DocTest evaluate_modules.metrics.code_eval.78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176.code_eval...dules/metrics/code_eval/78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176/code_eval.py:144 (5 examples)>
example = <doctest.Example object at 0x785042c7efd0>
exc_info = (<class 'ValueError'>, ValueError('\n################################################################################\...##############################################################################'), <traceback object at 0x785043fb5680>)

    def report_unexpected_exception(self, out, test, example, exc_info):
>       raise UnexpectedException(test, example, exc_info)
E       doctest.UnexpectedException: <DocTest evaluate_modules.metrics.code_eval.78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176.code_eval.CodeEval from /tmp/pytest-of-jpodivin/pytest-1/popen-gw0/cache/modules/evaluate_modules/metrics/code_eval/78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176/code_eval.py:144 (5 examples)>

/usr/lib/python3.8/doctest.py:1850: UnexpectedException

During handling of the above exception, another exception occurred:

self = <tests.test_metric_common.LocalModuleTest testMethod=test_load_metric_code_eval>, evaluation_module_name = 'code_eval'
evaluation_module_type = 'metric'

    def test_load(self, evaluation_module_name, evaluation_module_type):
        doctest.ELLIPSIS_MARKER = "[...]"
        evaluation_module = importlib.import_module(
            evaluate.loading.evaluation_module_factory(
                os.path.join(evaluation_module_type + "s", evaluation_module_name), module_type=evaluation_module_type
            ).module_path
        )
        evaluation_instance = evaluate.loading.import_main_class(evaluation_module.__name__)
        # check parameters
        parameters = inspect.signature(evaluation_instance._compute).parameters
        self.assertTrue(all([p.kind != p.VAR_KEYWORD for p in parameters.values()]))  # no **kwargs
        # run doctest
        with self.patch_intensive_calls(evaluation_module_name, evaluation_module.__name__):
            with self.use_local_metrics(evaluation_module_type):
                try:
                    results = doctest.testmod(evaluation_module, verbose=True, raise_on_error=True)
                except doctest.UnexpectedException as e:
>                   raise e.exc_info[1]  # raise the exception that doctest caught

tests/test_metric_common.py:119: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/lib/python3.8/doctest.py:1336: in __run
    exec(compile(example.source, filename, "single",
<doctest evaluate_modules.metrics.code_eval.78d307ea938083398db7d9815f03ed661e9c15f60d77880ce007a8a02648f176.code_eval.CodeEval[3]>:1: in <module>
    ???
.venv/lib/python3.8/site-packages/evaluate/module.py:467: in compute
    output = self._compute(**inputs, **compute_kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = EvaluationModule(name: "code_eval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='str... predictions=candidates, k=[1, 2])
    >>> print(pass_at_k)
    {'pass@1': 0.5, 'pass@2': 1.0}
""", stored examples: 0)
predictions = [['def add(a,b): return a*b', 'def add(a, b): return a+b']], references = ['assert add(2,3)==5'], k = [1, 2]
num_workers = 4, timeout = 3.0

    def _compute(self, predictions, references, k=[1, 10, 100], num_workers=4, timeout=3.0):
        """Returns the scores"""

        if os.getenv("HF_ALLOW_CODE_EVAL", 0) != "1":
>           raise ValueError(_WARNING)
E           ValueError: 
E           ################################################################################
E                                             !!!WARNING!!!
E           ################################################################################
E           The "code_eval" metric executes untrusted model-generated code in Python.
E           Although it is highly unlikely that model-generated code will do something
E           overtly malicious in response to this test suite, model-generated code may act
E           destructively due to a lack of model capability or alignment.
E           Users are strongly encouraged to sandbox this evaluation suite so that it
E           does not perform destructive actions on their host or network. For more
E           information on how OpenAI sandboxes its code, see the paper "Evaluating Large
E           Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
E           
E           Once you have read this disclaimer and taken appropriate precautions,
E           set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can to this
E           with:
E           
E           >>> import os
E           >>> os.environ["HF_ALLOW_CODE_EVAL"] = "1"
E           
E           ################################################################################
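For reference, the failing doctest boils down to the snippet below, which only runs once HF_ALLOW_CODE_EVAL="1" is set, exactly as the warning in the traceback says. This is just an illustrative sketch, not the repository's existing test machinery: it scopes the variable with unittest.mock.patch.dict so that untrusted-code execution is not enabled for the whole session, and it reuses the inputs and expected output shown in the code_eval docstring above.

```python
import os
from unittest.mock import patch

import evaluate

# Sketch only: enable code_eval's untrusted-code execution just for this block.
with patch.dict(os.environ, {"HF_ALLOW_CODE_EVAL": "1"}):
    code_eval = evaluate.load("code_eval")
    pass_at_k, results = code_eval.compute(
        references=["assert add(2,3)==5"],
        predictions=[["def add(a,b): return a*b", "def add(a, b): return a+b"]],
        k=[1, 2],
    )
    print(pass_at_k)  # expected per the docstring: {'pass@1': 0.5, 'pass@2': 1.0}
```

For the test suite itself, either exporting HF_ALLOW_CODE_EVAL=1 before running pytest or scoping the variable to test_load_metric_code_eval inside tests/test_metric_common.py (e.g. with a patch like the one above) would presumably make the test pass; I'll leave it to the maintainers which of the two is preferred.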