beir-cellar / beir

A Heterogeneous Benchmark for Information Retrieval. Easy to use, evaluate your models across 15+ diverse IR datasets.

Incorrect Evaluation Metrics Produced #187

Open NikhielRahulSingh opened 1 month ago

NikhielRahulSingh commented 1 month ago

```python
from beir.retrieval.evaluation import EvaluateRetrieval

def beir_evaluation():
    # qrels: query id -> {doc id: relevance grade}
    actual_contexts_dict = {
        '0': {'0': 1, '1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1, '7': 1, '8': 1, '9': 1}
    }
    # retrieval results: query id -> {doc id: score}
    results_dict = {
        '0': {'0': 1, '1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1, '7': 1, '8': 1, '9': 1}
    }

    # Evaluate retrieval metrics
    ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
        actual_contexts_dict, results_dict, k_values=[10]
    )
    mrr = EvaluateRetrieval.evaluate_custom(actual_contexts_dict, results_dict, [10], metric="mrr")

    # Print evaluation results
    print(f"\nRecall@10   : {recall['Recall@10']:.2f}")
    print(f"Precision@10: {precision['P@10']:.2f}")
    print(f"\nNDCG@10     : {ndcg['NDCG@10']:.2f}")
    print(f"MAP@10      : {map_score['MAP@10']:.2f}")
    print(f"MRR@10      : {mrr['MRR@10']:.2f}")

beir_evaluation()
```

```
Recall@10   : 0.90
Precision@10: 0.90

NDCG@10     : 0.94
MAP@10      : 0.90
MRR@10      : 1.00
```

This is clearly incorrect, as every metric should be 1.00. Can this please be resolved?
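
For reference, here is a quick hand check of the values I would expect for this toy example, using the standard set-based definitions of Precision@k and Recall@k (the small helper below is only illustrative, it is not beir code):

```python
# Hand check of the expected values for the toy example above (illustrative only, not beir code).
qrels = {'0': {str(i): 1 for i in range(10)}}    # 10 relevant documents for query '0'
run   = {'0': {str(i): 1.0 for i in range(10)}}  # the same 10 documents are retrieved

k = 10
for qid, retrieved in run.items():
    relevant = {doc for doc, rel in qrels[qid].items() if rel > 0}
    top_k = sorted(retrieved, key=retrieved.get, reverse=True)[:k]
    hits = sum(1 for doc in top_k if doc in relevant)
    print(f"P@{k}      = {hits / k:.2f}")              # expected: 1.00
    print(f"Recall@{k} = {hits / len(relevant):.2f}")  # expected: 1.00
```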

NikhielRahulSingh commented 1 month ago
```python
from beir.retrieval.evaluation import EvaluateRetrieval

def beir_evaluation():

    actual_contexts_dict = {
        '0': {'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9},
        '1': {'10': 10, '11': 11, '12': 12, '13': 13, '14': 14, '15': 15, '16': 16, '17': 17, '18': 18, '19': 19},
    }
    results_dict = {
        '0': {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0, '7': 0, '8': 0, '9': 0},
        '1': {'10': 10, '11': 11, '12': 12, '13': 13, '14': 14, '15': 15, '16': 16, '17': 17, '18': 18, '19': 19},
    }

    # Evaluate retrieval metrics
    ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
        actual_contexts_dict, results_dict, k_values=[10]
    )
    mrr = EvaluateRetrieval.evaluate_custom(actual_contexts_dict, results_dict, [10], metric="mrr")

    # Print evaluation results
    print(f"\nRecall@10   : {recall['Recall@10']:.2f}")
    print(f"Precision@10: {precision['P@10']:.2f}")
    print(f"\nNDCG@10     : {ndcg['NDCG@10']:.2f}")
    print(f"MAP@10      : {map_score['MAP@10']:.2f}")
    print(f"MRR@10      : {mrr['MRR@10']:.2f}")

# Run the evaluation
beir_evaluation()
```

```
Recall@10   : 1.00
Precision@10: 0.95

NDCG@10     : 1.00
MAP@10      : 1.00
MRR@10      : 1.00
```

Why is Precision@10 not 1.00?
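
For reference, the 0.95 can be reproduced by hand if one assumes trec_eval's usual convention that only relevance grades above 0 count as relevant: doc '0' has grade 0 in the qrels of query '0' (and the default ignore_identical_ids=True would also drop it, since its id equals the query id), so that query can contribute at most 9/10:

```python
# Per-query P@10 by hand for the two-query example above
# (illustrative only; assumes grade > 0 means relevant).
qrels = {
    '0': {str(i): i for i in range(10)},      # doc '0' has grade 0 -> not counted as relevant
    '1': {str(i): i for i in range(10, 20)},
}
run = {
    '0': {str(i): 0 for i in range(10)},
    '1': {str(i): i for i in range(10, 20)},
}

k = 10
per_query = []
for qid, retrieved in run.items():
    relevant = {doc for doc, rel in qrels[qid].items() if rel > 0}
    top_k = sorted(retrieved, key=retrieved.get, reverse=True)[:k]
    per_query.append(sum(1 for doc in top_k if doc in relevant) / k)

print(per_query)                        # [0.9, 1.0]
print(sum(per_query) / len(per_query))  # 0.95
```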

shivareddy0117 commented 2 weeks ago

```python
from beir.retrieval.evaluation import EvaluateRetrieval

def beir_evaluation():
    # qrels: query id -> {doc id: relevance grade}
    actual_contexts_dict = {
        '0': {'0': 1, '1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1, '7': 1, '8': 1, '9': 1}
    }
    # retrieval results: query id -> {doc id: score}
    results_dict = {
        '0': {'0': 1, '1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1, '7': 1, '8': 1, '9': 1}
    }

    # Set ignore_identical_ids=False so that documents whose id matches the
    # query id are not dropped from the results before scoring.
    ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(
        actual_contexts_dict, results_dict, k_values=[10], ignore_identical_ids=False
    )
    mrr = EvaluateRetrieval.evaluate_custom(actual_contexts_dict, results_dict, [10], metric="mrr")

    # Print evaluation results
    print(f"\nRecall@10   : {recall['Recall@10']:.2f}")
    print(f"Precision@10: {precision['P@10']:.2f}")
    print(f"\nNDCG@10     : {ndcg['NDCG@10']:.2f}")
    print(f"MAP@10      : {map_score['MAP@10']:.2f}")
    print(f"MRR@10      : {mrr['MRR@10']:.2f}")

beir_evaluation()
```

Please make sure to set the ignore_identical_ids parameter to False so that all documents are considered, or use query IDs that are different from the document IDs.
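
For example, keeping the default ignore_identical_ids=True but giving queries and documents non-overlapping ids should also bring every metric in the first toy example up to 1.00 (the 'q0' / 'd0'..'d9' names below are just made up for illustration):

```python
from beir.retrieval.evaluation import EvaluateRetrieval

# Same toy example, but the query id ('q0') no longer collides with any document id ('d0'..'d9').
qrels = {'q0': {f'd{i}': 1 for i in range(10)}}
results = {'q0': {f'd{i}': 1.0 for i in range(10)}}

ndcg, map_score, recall, precision = EvaluateRetrieval.evaluate(qrels, results, k_values=[10])
print(ndcg, map_score, recall, precision)  # every @10 value should now come out as 1.00
```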

Please go through this code for a better understanding:

https://github.com/beir-cellar/beir/blob/main/beir/retrieval/evaluation.py
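
For context, my rough understanding of the ignore_identical_ids handling in that file is sketched below; this is a paraphrase, not a verbatim copy, so please check the link above for the exact code:

```python
# Sketch of the default id filtering (paraphrased from evaluation.py as I understand it).
def drop_identical_ids(results):
    """Remove every document whose id is identical to the id of its query."""
    for qid, doc_scores in results.items():
        for doc_id in list(doc_scores):
            if doc_id == qid:
                doc_scores.pop(doc_id)
    return results

# In the first toy example this removes doc '0' from the results of query '0',
# which is why Recall@10, P@10, NDCG@10 and MAP@10 drop below 1.00 with the defaults.
```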