cvangysel / pytrec_eval

pytrec_eval is an Information Retrieval evaluation tool for Python, based on the popular trec_eval.
http://ilps.science.uva.nl/
MIT License

pytrec_eval and sklearn ndcg_score differ, and the score changes when a dict key name is changed #51

Closed relic-yuexi closed 2 months ago

relic-yuexi commented 3 months ago

Hello, I have encountered some strange results. The code to reproduce them is below.

import pytrec_eval
from sklearn.metrics import ndcg_score
import numpy as np

# Create simulated data
qrels_dict = {
    'q1': {
        '1677128#0': 1,
        'doc4': 1,
    }
}

results_dict = {
    'q1': {
        '1677128#0': 2.921875,
        '5712855#212': 2.921875,
        '4200727#1': 2.890625,
        '3750848#3': 2.875,
        'doc4': 2.875,
        '512030#0': 2.8125,
        '512030#1': 2.8125,
        '5708398#5': 2.8125,
        '6615597#2': 2.796875,
        '512030#6': 2.765625,
    }
}

# Calculate metrics using pytrec_eval
evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, {'ndcg_cut.10'})
pytrec_results = evaluator.evaluate(results_dict)

# Prepare data for sklearn
specific_qid = 'q1'
specific_relevances = [
    qrels_dict[specific_qid].get(docid, 0)
    for docid in results_dict[specific_qid].keys()
]
print(specific_relevances)
specific_scores = [
    results_dict[specific_qid][docid] for docid in results_dict[specific_qid].keys()
]
print(specific_scores)

sklearn_specific_ndcg_10 = ndcg_score(
    [specific_relevances], [specific_scores], k=10, ignore_ties=False
)
pytrec_specific_ndcg_10 = pytrec_results[specific_qid]['ndcg_cut_10']

# Print results
print(f"qrels_dict['{specific_qid}']:", qrels_dict[specific_qid])
print(f"results_dict['{specific_qid}']:", results_dict[specific_qid])
print(
    f"Specific QID: {specific_qid} | pytrec_eval NDCG@10: {pytrec_specific_ndcg_10} | sklearn NDCG@10: {sklearn_specific_ndcg_10}"
)
Output:

[1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[2.921875, 2.921875, 2.890625, 2.875, 2.875, 2.8125, 2.8125, 2.8125, 2.796875, 2.765625]
qrels_dict['q1']: {'1677128#0': 1, 'doc4': 1}
results_dict['q1']: {'1677128#0': 2.921875, '5712855#212': 2.921875, '4200727#1': 2.890625, '3750848#3': 2.875, 'doc4': 2.875, '512030#0': 2.8125, '512030#1': 2.8125, '5708398#5': 2.8125, '6615597#2': 2.796875, '512030#6': 2.765625}
Specific QID: q1 | pytrec_eval NDCG@10: 0.6509209298071326 | sklearn NDCG@10: 0.7506329176709435
Changing only the document ID 'doc4' to '1677128#1' in both qrels and results (all scores unchanged) gives a different pytrec_eval NDCG:

import pytrec_eval
from sklearn.metrics import ndcg_score
import numpy as np

# Create simulated data
qrels_dict = {
    'q1': {
        '1677128#0': 1,
        '1677128#1': 1,
    }
}

results_dict = {
    'q1': {
        '1677128#0': 2.921875,
        '5712855#212': 2.921875,
        '4200727#1': 2.890625,
        '3750848#3': 2.875,
        '1677128#1': 2.875,  # only the key name changed, yet the NDCG differs
        '512030#0': 2.8125,
        '512030#1': 2.8125,
        '5708398#5': 2.8125,
        '6615597#2': 2.796875,
        '512030#6': 2.765625,
    }
}

# Calculate metrics using pytrec_eval
evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, {'ndcg_cut.10'})
pytrec_results = evaluator.evaluate(results_dict)

# Prepare data for sklearn
specific_qid = 'q1'
specific_relevances = [
    qrels_dict[specific_qid].get(docid, 0)
    for docid in results_dict[specific_qid].keys()
]
print(specific_relevances)
specific_scores = [
    results_dict[specific_qid][docid] for docid in results_dict[specific_qid].keys()
]
print(specific_scores)

sklearn_specific_ndcg_10 = ndcg_score(
    [specific_relevances], [specific_scores], k=10, ignore_ties=False
)
pytrec_specific_ndcg_10 = pytrec_results[specific_qid]['ndcg_cut_10']

# Print results
print(f"qrels_dict['{specific_qid}']:", qrels_dict[specific_qid])
print(f"results_dict['{specific_qid}']:", results_dict[specific_qid])
print(
    f"Specific QID: {specific_qid} | pytrec_eval NDCG@10: {pytrec_specific_ndcg_10} | sklearn NDCG@10: {sklearn_specific_ndcg_10}"
)
Output:

[1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[2.921875, 2.921875, 2.890625, 2.875, 2.875, 2.8125, 2.8125, 2.8125, 2.796875, 2.765625]
qrels_dict['q1']: {'1677128#0': 1, '1677128#1': 1}
results_dict['q1']: {'1677128#0': 2.921875, '5712855#212': 2.921875, '4200727#1': 2.890625, '3750848#3': 2.875, '1677128#1': 2.875, '512030#0': 2.8125, '512030#1': 2.8125, '5708398#5': 2.8125, '6615597#2': 2.796875, '512030#6': 2.765625}
Specific QID: q1 | pytrec_eval NDCG@10: 0.6240505200038379 | sklearn NDCG@10: 0.7506329176709435
Reverting to the original 'doc4' key but setting ignore_ties=True makes sklearn match pytrec_eval:

import pytrec_eval
from sklearn.metrics import ndcg_score
import numpy as np

# Create simulated data
qrels_dict = {
    'q1': {
        '1677128#0': 1,
        'doc4': 1,
    }
}

results_dict = {
    'q1': {
        '1677128#0': 2.921875,
        '5712855#212': 2.921875,
        '4200727#1': 2.890625,
        '3750848#3': 2.875,
        'doc4': 2.875,
        '512030#0': 2.8125,
        '512030#1': 2.8125,
        '5708398#5': 2.8125,
        '6615597#2': 2.796875,
        '512030#6': 2.765625,
    }
}

# Calculate metrics using pytrec_eval
evaluator = pytrec_eval.RelevanceEvaluator(qrels_dict, {'ndcg_cut.10'})
pytrec_results = evaluator.evaluate(results_dict)

# Prepare data for sklearn
specific_qid = 'q1'
specific_relevances = [
    qrels_dict[specific_qid].get(docid, 0)
    for docid in results_dict[specific_qid].keys()
]
print(specific_relevances)
specific_scores = [
    results_dict[specific_qid][docid] for docid in results_dict[specific_qid].keys()
]
print(specific_scores)

sklearn_specific_ndcg_10 = ndcg_score(
    [specific_relevances], [specific_scores], k=10, ignore_ties=True  # changed from ignore_ties=False
)
pytrec_specific_ndcg_10 = pytrec_results[specific_qid]['ndcg_cut_10']

# Print results
print(f"qrels_dict['{specific_qid}']:", qrels_dict[specific_qid])
print(f"results_dict['{specific_qid}']:", results_dict[specific_qid])
print(
    f"Specific QID: {specific_qid} | pytrec_eval NDCG@10: {pytrec_specific_ndcg_10} | sklearn NDCG@10: {sklearn_specific_ndcg_10}"
)
Output:

[1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
[2.921875, 2.921875, 2.890625, 2.875, 2.875, 2.8125, 2.8125, 2.8125, 2.796875, 2.765625]
qrels_dict['q1']: {'1677128#0': 1, 'doc4': 1}
results_dict['q1']: {'1677128#0': 2.921875, '5712855#212': 2.921875, '4200727#1': 2.890625, '3750848#3': 2.875, 'doc4': 2.875, '512030#0': 2.8125, '512030#1': 2.8125, '5708398#5': 2.8125, '6615597#2': 2.796875, '512030#6': 2.765625}
Specific QID: q1 | pytrec_eval NDCG@10: 0.6509209298071326 | sklearn NDCG@10: 0.6509209298071325
relic-yuexi commented 3 months ago

I now understand what causes the difference between pytrec_eval and ndcg_score, but I still don't know why changing the key name leads to a different score.
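
For reference: sklearn's ndcg_score with the default ignore_ties=False handles tied scores by averaging the gain over the positions a tie group occupies (following McSherry & Najork), so its result does not depend on how tied documents are ordered. That would explain why sklearn reports 0.7506 for both key namings, and why ignore_ties=True changes its answer. A minimal sketch of that tie-averaged NDCG@k, written from the documented behavior rather than sklearn's actual code:

import math
from itertools import groupby

def tie_averaged_dcg_at_k(relevances, scores, k=10):
    # Sort by score descending; within a tie group, every document
    # receives the average discount over the positions the group spans.
    pairs = sorted(zip(scores, relevances), reverse=True)
    dcg, pos = 0.0, 0
    for _, group in groupby(pairs, key=lambda p: p[0]):
        group = list(group)
        discounts = [1 / math.log2(p + 2) for p in range(pos, pos + len(group)) if p < k]
        dcg += sum(discounts) / len(group) * sum(rel for _, rel in group)
        pos += len(group)
    return dcg

def tie_averaged_ndcg_at_k(relevances, scores, k=10):
    ideal = tie_averaged_dcg_at_k(relevances, relevances, k)  # relevances as their own scores
    return tie_averaged_dcg_at_k(relevances, scores, k) / ideal if ideal else 0.0

relevances = [1, 0, 0, 0, 1, 0, 0, 0, 0, 0]
scores = [2.921875, 2.921875, 2.890625, 2.875, 2.875,
          2.8125, 2.8125, 2.8125, 2.796875, 2.765625]
print(tie_averaged_ndcg_at_k(relevances, scores))  # ~0.7506, matching sklearn above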

seanmacavaney commented 3 months ago

An example that isolates the difference more clearly would help us understand what's happening here. I suspect it comes down to how each handles tie-breaking in the run scores, given that the document ID being changed is tied with another document and would be sorted at a different position under the new ID.
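
For what it's worth, trec_eval is usually described as sorting a run by score descending and breaking ties by document ID in descending lexicographic order. A minimal sketch assuming that tie-break rule reproduces both pytrec_eval numbers above (this illustrates the suspected behavior; it is not pytrec_eval's actual implementation):

import math

def trec_style_ndcg_at_k(qrels, run, k=10):
    # Assumed behavior: score descending, ties broken by document ID
    # in descending lexicographic order.
    ranked = sorted(run, key=lambda doc: (run[doc], doc), reverse=True)[:k]
    dcg = sum(qrels.get(doc, 0) / math.log2(i + 2) for i, doc in enumerate(ranked))
    ideal = sorted(qrels.values(), reverse=True)[:k]
    idcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ideal))
    return dcg / idcg if idcg else 0.0

run = {'1677128#0': 2.921875, '5712855#212': 2.921875, '4200727#1': 2.890625,
       '3750848#3': 2.875, 'doc4': 2.875, '512030#0': 2.8125, '512030#1': 2.8125,
       '5708398#5': 2.8125, '6615597#2': 2.796875, '512030#6': 2.765625}
print(trec_style_ndcg_at_k({'1677128#0': 1, 'doc4': 1}, run))  # ~0.6509

run2 = dict(run); run2['1677128#1'] = run2.pop('doc4')
print(trec_style_ndcg_at_k({'1677128#0': 1, '1677128#1': 1}, run2))  # ~0.6241

Under this rule, renaming 'doc4' to '1677128#1' moves the relevant document from rank 4 to rank 5 within its 2.875 tie group ('doc4' sorts above '3750848#3', '1677128#1' sorts below it), which accounts for the drop from 0.6509 to 0.6241.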