Closed: bilgehanertan closed this 6 months ago
Hi! Thanks a lot, it's super useful :) It looks like some of the tests failed because the results were initially stored at a lower precision than what the comparison expects - could you edit them so they match?
In case you don't have access to the test logs, the results are below; the file you need to edit is tests/reference_scores/reference_task_scores.py. You can then check the scores locally by running pytest . to make sure they pass.
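For context before the logs: these failures are purely a precision issue. The stored reference values were rounded to four decimal places, and that rounding error is larger than the rel=1e-4 tolerance the test allows. A minimal sketch, using the values from the first failure below:

from pytest import approx

source = 0.16666666666666666  # full-precision value produced by the eval run
prediction = 0.1667           # rounded value currently stored in the reference file

# approx computes the relative tolerance from the expected value:
# 1e-4 * 0.1667 ~= 1.7e-05, but the rounding error is ~3.3e-05.
assert abs(source - prediction) > 1e-4 * prediction
assert source != approx(prediction, rel=1e-4)  # hence the AssertionError below
assert source == approx(prediction, rel=1e-3)  # a looser tolerance would pass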
model_input = ('gpt2', 'lite', 'lighteval|bigbench:causal_judgment|3', 'acc_stderr', 0.16666666666666666, 0.1667)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval lighteval|bigbench:causal_judgment|3, metric acc_stderr incorrect
E assert 0.16666666666666666 == 0.1667 ± 1.7e-05
E comparison failed
E Obtained: 0.16666666666666666
E Expected: 0.1667 ± 1.7e-05
tests/test_main.py:123: AssertionError
_____________________ test_model_prediction[model_input97] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:causal_judgment|3', 'acc_norm_stderr', 0.16666666666666666, 0.1667)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:causal_judgment|3, metric acc_norm_stderr incorrect
E assert 0.16666666666666666 == 0.1667 ± 1.7e-05
E comparison failed
E Obtained: 0.16666666666666666
E Expected: 0.1667 ± 1.7e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input101] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:date_understanding|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:date_understanding|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input103] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:date_understanding|3', 'acc_norm_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:date_understanding|3, metric acc_norm_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input105] _____________________
model_input = ('gpt2', 'lite', 'lighteval|bigbench:disambiguation_qa|3', 'acc_stderr', 0.15275252316519466, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval lighteval|bigbench:disambiguation_qa|3, metric acc_stderr incorrect
E assert 0.15275252316519466 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519466
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input107] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:disambiguation_qa|3', 'acc_stderr', 0.15275252316519466, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:disambiguation_qa|3, metric acc_stderr incorrect
E assert 0.15275252316519466 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519466
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input109] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:disambiguation_qa|3', 'acc_norm_stderr', 0.15275252316519464, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:disambiguation_qa|3, metric acc_norm_stderr incorrect
E assert 0.15275252316519464 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519464
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input111] _____________________
model_input = ('gpt2', 'lite', 'lighteval|bigbench:geometric_shapes|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval lighteval|bigbench:geometric_shapes|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input115] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:geometric_shapes|3', 'acc_norm_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:geometric_shapes|3, metric acc_norm_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input119] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:logical_deduction_five_objects|3', 'acc_stderr', 0.15275252316519464, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:logical_deduction_five_objects|3, metric acc_stderr incorrect
E assert 0.15275252316519464 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519464
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input121] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:logical_deduction_five_objects|3', 'acc_norm_stderr', 0.15275252316519464, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:logical_deduction_five_objects|3, metric acc_norm_stderr incorrect
E assert 0.15275252316519464 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519464
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input123] _____________________
model_input = ('gpt2', 'lite', 'lighteval|bigbench:logical_deduction_seven_objects|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval lighteval|bigbench:logical_deduction_seven_objects|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input125] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:logical_deduction_seven_objects|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:logical_deduction_seven_objects|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input127] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:logical_deduction_seven_objects|3', 'acc_norm_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:logical_deduction_seven_objects|3, metric acc_norm_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input131] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:logical_deduction_three_objects|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:logical_deduction_three_objects|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input133] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:logical_deduction_three_objects|3', 'acc_norm_stderr', 0.15275252316519466, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:logical_deduction_three_objects|3, metric acc_norm_stderr incorrect
E assert 0.15275252316519466 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519466
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input135] _____________________
model_input = ('gpt2', 'lite', 'lighteval|bigbench:movie_recommendation|3', 'acc_stderr', 0.15275252316519466, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval lighteval|bigbench:movie_recommendation|3, metric acc_stderr incorrect
E assert 0.15275252316519466 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519466
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input137] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:movie_recommendation|3', 'acc_stderr', 0.16666666666666666, 0.1667)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:movie_recommendation|3, metric acc_stderr incorrect
E assert 0.16666666666666666 == 0.1667 ± 1.7e-05
E comparison failed
E Obtained: 0.16666666666666666
E Expected: 0.1667 ± 1.7e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input139] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:movie_recommendation|3', 'acc_norm_stderr', 0.15275252316519464, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:movie_recommendation|3, metric acc_norm_stderr incorrect
E assert 0.15275252316519464 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519464
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input147] _____________________
model_input = ('gpt2', 'lite', 'lighteval|bigbench:reasoning_about_colored_objects|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval lighteval|bigbench:reasoning_about_colored_objects|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input149] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:reasoning_about_colored_objects|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:reasoning_about_colored_objects|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input153] _____________________
model_input = ('gpt2', 'lite', 'lighteval|bigbench:ruin_names|3', 'acc_stderr', 0.15275252316519464, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval lighteval|bigbench:ruin_names|3, metric acc_stderr incorrect
E assert 0.15275252316519464 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519464
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input155] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:ruin_names|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:ruin_names|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input157] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:ruin_names|3', 'acc_norm_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:ruin_names|3, metric acc_norm_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input183] _____________________
model_input = ('gpt2', 'lite', 'lighteval|bigbench:tracking_shuffled_objects_five_objects|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval lighteval|bigbench:tracking_shuffled_objects_five_objects|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input185] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:tracking_shuffled_objects_five_objects|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:tracking_shuffled_objects_five_objects|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input189] _____________________
model_input = ('gpt2', 'lite', 'lighteval|bigbench:tracking_shuffled_objects_seven_objects|3', 'acc_stderr', 0.15275252316519464, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval lighteval|bigbench:tracking_shuffled_objects_seven_objects|3, metric acc_stderr incorrect
E assert 0.15275252316519464 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519464
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input197] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:tracking_shuffled_objects_three_objects|3', 'acc_stderr', 0.13333333333333333, 0.1333)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:tracking_shuffled_objects_three_objects|3, metric acc_stderr incorrect
E assert 0.13333333333333333 == 0.1333 ± 1.3e-05
E comparison failed
E Obtained: 0.13333333333333333
E Expected: 0.1333 ± 1.3e-05
tests/test_main.py:123: AssertionError
____________________ test_model_prediction[model_input199] _____________________
model_input = ('gpt2', 'lite', 'harness|bigbench:tracking_shuffled_objects_three_objects|3', 'acc_norm_stderr', 0.15275252316519466, 0.1528)
def test_model_prediction(model_input: tuple):
"""Evaluates a model on a full task - is parametrized using pytest_generate_test"""
model_name, test_type, eval_name, metric, source, prediction = model_input
> assert source == approx(
prediction, rel=1e-4
), f"Model {model_name} on {test_type} samples, for eval {eval_name}, metric {metric} incorrect"
E AssertionError: Model gpt2 on lite samples, for eval harness|bigbench:tracking_shuffled_objects_three_objects|3, metric acc_norm_stderr incorrect
E assert 0.15275252316519466 == 0.1528 ± 1.5e-05
E comparison failed
E Obtained: 0.15275252316519466
E Expected: 0.1528 ± 1.5e-05
tests/test_main.py:123: AssertionError
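Concretely, the fix is to replace each rounded reference with the full-precision "Obtained" value from the logs above. A hypothetical excerpt of tests/reference_scores/reference_task_scores.py (the dict name and nesting here are assumptions for illustration; match whatever structure the file actually uses):

RESULTS_LITE = {  # hypothetical name, for illustration only
    "gpt2": {
        "lighteval|bigbench:causal_judgment|3": {
            "acc_stderr": 0.16666666666666666,  # was 0.1667
        },
        "harness|bigbench:date_understanding|3": {
            "acc_stderr": 0.13333333333333333,       # was 0.1333
            "acc_norm_stderr": 0.13333333333333333,  # was 0.1333
        },
        # ...and likewise for the remaining bigbench entries listed above
    },
}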
Tests are now passing :)
Resolves #125