valor version checks
[X] I have confirmed this bug exists on the latest version of valor.
Reproducible Example
The test below was taken from `test_evaluate_detection_functional_test_with_rasters` and converted to use bounding boxes instead of rasters. It fails on the first assertion.
```python
# imports assumed from valor_lite's detection module; adjust to match your install
import pytest
from valor_lite.detection import BoundingBox, DataLoader, Detection, MetricType


@pytest.fixture
def test_case_input():
    # box tuples are (xmin, xmax, ymin, ymax), matching the unpacking below
    gts = {
        "boxes": [
            (2, 10, 2, 10),
            (2, 10, 2, 10),
            (2, 10, 2, 10),
        ],
        "label_values": ["label1", "label2", "label3"],
    }
    # labels 1 and 2 have IOU==1, labels 3 and 4 have IOU==0
    preds = {
        "boxes": [
            (2, 10, 2, 10),
            (2, 10, 2, 10),
            (0, 1, 0, 1),
            (0, 1, 0, 1),
        ],
        "label_values": ["label1", "label2", "label3", "label4"],
        "scores": [
            0.3,
            0.93,
            0.92,
            0.94,
        ],
    }
    groundtruths = [
        BoundingBox(
            xmin=xmin,
            xmax=xmax,
            ymin=ymin,
            ymax=ymax,
            labels=[("class", label_value)],
        )
        for (xmin, xmax, ymin, ymax), label_value in zip(
            gts["boxes"], gts["label_values"]
        )
    ]
    predictions = [
        BoundingBox(
            xmin=xmin,
            xmax=xmax,
            ymin=ymin,
            ymax=ymax,
            labels=[("class", label_value)],
            scores=[score],
        )
        for (xmin, xmax, ymin, ymax), label_value, score in zip(
            preds["boxes"], preds["label_values"], preds["scores"]
        )
    ]
    return Detection(
        uid="uid1", groundtruths=groundtruths, predictions=predictions
    )


def test_case(test_case_input):
    manager = DataLoader()
    manager.add_data(detections=[test_case_input])
    evaluator = manager.finalize()
    metrics = evaluator.evaluate(iou_thresholds=[0.5, 0.75])

    ap_metrics = [m.to_dict() for m in metrics[MetricType.AP]]
    expected_ap_metrics = [
        {
            "parameters": {
                "iou": 0.5,
                "label": {"key": "class", "value": "label1"},
            },
            "value": 1.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.75,
                "label": {"key": "class", "value": "label1"},
            },
            "value": 1.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.5,
                "label": {"key": "class", "value": "label2"},
            },
            "value": 1.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.75,
                "label": {"key": "class", "value": "label2"},
            },
            "value": 1.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.5,
                "label": {"key": "class", "value": "label3"},
            },
            "value": 0.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.75,
                "label": {"key": "class", "value": "label3"},
            },
            "value": 0.0,
            "type": "AP",
        },
    ]
    # fails here: AP for label2 comes back as 0.0 instead of the expected 1.0
    for m in ap_metrics:
        assert m in expected_ap_metrics
    for m in expected_ap_metrics:
        assert m in ap_metrics
    # mAP at each IoU threshold = mean AP over the three groundtruth labels:
    # (1.0 + 1.0 + 0.0) / 3 ≈ 0.667
    map_metrics = [m.to_dict() for m in metrics[MetricType.mAP]]
    expected_map_metrics = [
        {
            "parameters": {"label_key": "class", "iou": 0.5},
            "value": 0.667,
            "type": "mAP",
        },
        {
            "parameters": {"label_key": "class", "iou": 0.75},
            "value": 0.667,
            "type": "mAP",
        },
    ]
    for m in map_metrics:
        assert m in expected_map_metrics
    for m in expected_map_metrics:
        assert m in map_metrics
    ap_averaged_over_iou_metrics = [
        m.to_dict() for m in metrics[MetricType.APAveragedOverIOUs]
    ]
    expected_ap_averaged_over_iou_metrics = [
        {
            "parameters": {
                "label": {"key": "class", "value": "label1"},
                "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
            },
            "value": 1.0,
            "type": "APAveragedOverIOUs",
        },
        {
            "parameters": {
                "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
                "label": {"key": "class", "value": "label2"},
            },
            "value": 1.0,
            "type": "APAveragedOverIOUs",
        },
        {
            "parameters": {
                "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
                "label": {"key": "class", "value": "label3"},
            },
            "value": 0.0,
            "type": "APAveragedOverIOUs",
        },
    ]
    for m in ap_averaged_over_iou_metrics:
        assert m in expected_ap_averaged_over_iou_metrics
    for m in expected_ap_averaged_over_iou_metrics:
        assert m in ap_averaged_over_iou_metrics
    map_averaged_over_iou_metrics = [
        m.to_dict() for m in metrics[MetricType.mAPAveragedOverIOUs]
    ]
    expected_map_averaged_over_iou_metrics = [
        {
            "parameters": {
                "label_key": "class",
                "ious": [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
            },
            "value": 0.667,
            "type": "mAPAveragedOverIOUs",
        },
    ]
    for m in map_averaged_over_iou_metrics:
        assert m in expected_map_averaged_over_iou_metrics
    for m in expected_map_averaged_over_iou_metrics:
        assert m in map_averaged_over_iou_metrics
```
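The snippet above should be self-contained once the imports resolve: saving the fixture and test into a single file and running it with pytest (for example `pytest test_bbox_ap.py -k test_case`, where the file name is arbitrary) reproduces the failure on the first assertion.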
Issue Description
In this example, the groundtruths for label1 and label2 each have a perfectly matching prediction (i.e., IOU == 1), so we'd expect AR and AP for both label1 and label2 to equal 1. In the returned `ap_metrics`, however, label2 has an incorrect AP score of 0, while label1 has the correct AP score of 1.

I think the issue lies in `_compute_ranked_pairs_for_datum`, since the filtered output for `data` doesn't include the row where the groundtruth label equals the prediction label.
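To make the expectation concrete, here is a small standalone sketch (plain numpy, no valor imports) that recomputes the pairwise IoUs for the boxes in the fixture and the per-label APs and mAP we would therefore expect. The `iou` helper is written from scratch for illustration and is not valor's implementation.

```python
import numpy as np


def iou(box_a, box_b):
    # boxes are (xmin, xmax, ymin, ymax), matching the fixture above
    axmin, axmax, aymin, aymax = box_a
    bxmin, bxmax, bymin, bymax = box_b
    iw = max(0.0, min(axmax, bxmax) - max(axmin, bxmin))
    ih = max(0.0, min(aymax, bymax) - max(aymin, bymin))
    inter = iw * ih
    union = (
        (axmax - axmin) * (aymax - aymin)
        + (bxmax - bxmin) * (bymax - bymin)
        - inter
    )
    return inter / union if union > 0 else 0.0


gt_boxes = [(2, 10, 2, 10), (2, 10, 2, 10), (2, 10, 2, 10)]       # label1..label3
pd_boxes = [(2, 10, 2, 10), (2, 10, 2, 10), (0, 1, 0, 1), (0, 1, 0, 1)]  # label1..label4

ious = np.array([[iou(g, p) for p in pd_boxes] for g in gt_boxes])
print(ious)
# [[1. 1. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 0. 0.]]
#
# The label1 and label2 predictions overlap their groundtruths perfectly (IOU == 1),
# while the label3/label4 predictions overlap nothing (IOU == 0). With one groundtruth
# and one prediction per matched label, AP(label1) = AP(label2) = 1.0 and
# AP(label3) = 0.0 at every threshold in [0.5, 0.95], so
# mAP = (1.0 + 1.0 + 0.0) / 3 ≈ 0.667.
```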
Expected Behavior
Fix the computations so that the test above passes.