Striveworks / valor

Valor is a centralized evaluation store which makes it easy to measure, explore, and rank model performance.
https://striveworks.github.io/valor/
BUG: `lite` fails an AP test from `core`, likely due to an issue with selecting the "best pairs" #754

Closed: ntlind closed this issue 2 months ago

ntlind commented 2 months ago

valor version checks

Reproducible Example

The test was adapted from `test_evaluate_detection_functional_test_with_rasters` in `core`, converted to use bounding boxes instead of rasters. It fails on the first assertion.

# Imports assume the valor_lite detection API exposes these names;
# the module path may differ between versions.
import pytest

from valor_lite.detection import (
    BoundingBox,
    DataLoader,
    Detection,
    MetricType,
)


@pytest.fixture
def test_case_input():

    gts = {
        "boxes": [
            (2, 10, 2, 10),
            (2, 10, 2, 10),
            (2, 10, 2, 10),
        ],
        "label_values": ["label1", "label2", "label3"],
    }

    # predictions for label1 and label2 match their ground truths exactly (IoU == 1);
    # predictions for label3 and label4 have no overlap with any ground truth (IoU == 0)
    preds = {
        "boxes": [
            (2, 10, 2, 10),
            (2, 10, 2, 10),
            (0, 1, 0, 1),
            (0, 1, 0, 1),
        ],
        "label_values": ["label1", "label2", "label3", "label4"],
        "scores": [
            0.3,
            0.93,
            0.92,
            0.94,
        ],
    }

    groundtruths = [
        BoundingBox(
            xmin=xmin,
            xmax=xmax,
            ymin=ymin,
            ymax=ymax,
            labels=[("class", label_value)],
        )
        for (xmin, xmax, ymin, ymax), label_value in zip(
            gts["boxes"], gts["label_values"]
        )
    ]

    predictions = [
        BoundingBox(
            xmin=xmin,
            xmax=xmax,
            ymin=ymin,
            ymax=ymax,
            labels=[("class", label_value)],
            scores=[score],
        )
        for (xmin, xmax, ymin, ymax), label_value, score in zip(
            preds["boxes"], preds["label_values"], preds["scores"]
        )
    ]

    return Detection(
        uid="uid1", groundtruths=groundtruths, predictions=predictions
    )

def test_case(test_case_input):
    manager = DataLoader()
    manager.add_data(detections=[test_case_input])
    evaluator = manager.finalize()
    metrics = evaluator.evaluate(iou_thresholds=[0.5, 0.75])
    ap_metrics = [m.to_dict() for m in metrics[MetricType.AP]]
    expected_ap_metrics = [
        {
            "parameters": {
                "iou": 0.5,
                "label": {"key": "class", "value": "label1"},
            },
            "value": 1.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.75,
                "label": {"key": "class", "value": "label1"},
            },
            "value": 1.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.5,
                "label": {"key": "class", "value": "label2"},
            },
            "value": 1.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.75,
                "label": {"key": "class", "value": "label2"},
            },
            "value": 1.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.5,
                "label": {"key": "class", "value": "label3"},
            },
            "value": 0.0,
            "type": "AP",
        },
        {
            "parameters": {
                "iou": 0.75,
                "label": {"key": "class", "value": "label3"},
            },
            "value": 0.0,
            "type": "AP",
        },
    ]

    for m in ap_metrics:
        assert m in expected_ap_metrics
    for m in expected_ap_metrics:
        assert m in ap_metrics

    map_metrics = [m.to_dict() for m in metrics[MetricType.mAP]]
    expected_map_metrics = [
        {
            "parameters": {"label_key": "class", "iou": 0.5},
            "value": 0.667,
            "type": "mAP",
        },
        {
            "parameters": {"label_key": "class", "iou": 0.75},
            "value": 0.667,
            "type": "mAP",
        },
    ]

    for m in map_metrics:
        assert m in expected_map_metrics
    for m in expected_map_metrics:
        assert m in map_metrics

    ap_averaged_over_iou_metrics = [
        m.to_dict() for m in metrics[MetricType.APAveragedOverIOUs]
    ]
    expected_ap_averaged_over_iou_metrics = [
        {
            "parameters": {
                "label": {"key": "class", "value": "label1"},
                "ious": [
                    0.5,
                    0.55,
                    0.6,
                    0.65,
                    0.7,
                    0.75,
                    0.8,
                    0.85,
                    0.9,
                    0.95,
                ],
            },
            "value": 1.0,
            "type": "APAveragedOverIOUs",
        },
        {
            "parameters": {
                "ious": [
                    0.5,
                    0.55,
                    0.6,
                    0.65,
                    0.7,
                    0.75,
                    0.8,
                    0.85,
                    0.9,
                    0.95,
                ],
                "label": {"key": "class", "value": "label2"},
            },
            "value": 1.0,
            "type": "APAveragedOverIOUs",
        },
        {
            "parameters": {
                "ious": [
                    0.5,
                    0.55,
                    0.6,
                    0.65,
                    0.7,
                    0.75,
                    0.8,
                    0.85,
                    0.9,
                    0.95,
                ],
                "label": {"key": "class", "value": "label3"},
            },
            "value": 0.0,
            "type": "APAveragedOverIOUs",
        },
    ]

    for m in ap_averaged_over_iou_metrics:
        assert m in expected_ap_averaged_over_iou_metrics
    for m in expected_ap_averaged_over_iou_metrics:
        assert m in ap_averaged_over_iou_metrics

    map_averaged_over_iou_metrics = [
        m.to_dict() for m in metrics[MetricType.mAPAveragedOverIOUs]
    ]
    expected_map_averaged_over_iou_metrics = [
        {
            "parameters": {
                "label_key": "class",
                "ious": [
                    0.5,
                    0.55,
                    0.6,
                    0.65,
                    0.7,
                    0.75,
                    0.8,
                    0.85,
                    0.9,
                    0.95,
                ],
            },
            "value": 0.667,
            "type": "mAPAveragedOverIOUs",
        },
    ]

    for m in map_averaged_over_iou_metrics:
        assert m in expected_map_averaged_over_iou_metrics
    for m in expected_map_averaged_over_iou_metrics:
        assert m in map_averaged_over_iou_metrics

Issue Description

In this example, the ground truths for label1 and label2 each have a perfectly matching prediction (IoU == 1), so we'd expect both AR and AP for label1 and label2 to equal 1. Here is what gets returned in ap_metrics:

[
    {'type': 'AP', 'value': np.float64(1.0), 'parameters': {'iou': 0.5, 'label': {'key': 'class', 'value': 'label1'}}},
    {'type': 'AP', 'value': np.float64(0.0), 'parameters': {'iou': 0.5, 'label': {'key': 'class', 'value': 'label2'}}},
    {'type': 'AP', 'value': np.float64(0.0), 'parameters': {'iou': 0.5, 'label': {'key': 'class', 'value': 'label3'}}},
    {'type': 'AP', 'value': np.float64(1.0), 'parameters': {'iou': 0.75, 'label': {'key': 'class', 'value': 'label1'}}},
    {'type': 'AP', 'value': np.float64(0.0), 'parameters': {'iou': 0.75, 'label': {'key': 'class', 'value': 'label2'}}},
    {'type': 'AP', 'value': np.float64(0.0), 'parameters': {'iou': 0.75, 'label': {'key': 'class', 'value': 'label3'}}},
]

In this output, label2 has an incorrect AP score of 0, while label1 has the correct AP score of 1.
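
As a sanity check on that expectation, here is a minimal by-hand AP computation (not valor code) for label2: it has one ground truth and one prediction with IoU == 1.0 at score 0.93, so the prediction is a true positive at both thresholds and AP should be 1.0. The 101-point interpolation below is the COCO-style convention and is only an assumption about how valor computes AP.

import numpy as np

# by-hand AP for label2 under the pairing we expect: a single (gt, pd) pair
# with IoU == 1.0 (the prediction's score, 0.93, doesn't matter with one pair)
ious = np.array([1.0])  # IoU of the single label2 pair, sorted by score
n_gt = 1                # number of label2 ground truths

for threshold in (0.5, 0.75):
    tp = (ious >= threshold).astype(float)
    cum_tp = np.cumsum(tp)
    precision = cum_tp / np.arange(1, len(tp) + 1)
    recall = cum_tp / n_gt
    # 101-point interpolation: max precision at recall >= r for r in 0, 0.01, ..., 1
    interpolated = [
        precision[recall >= r].max() if (recall >= r).any() else 0.0
        for r in np.linspace(0.0, 1.0, 101)
    ]
    print(f"AP@{threshold} for label2 = {np.mean(interpolated):.3f}")  # prints 1.000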

I think the issue lies in `_compute_ranked_pairs_for_datum`, since the filtered output for `data` doesn't include the row where the ground-truth label equals the prediction label:

array([[0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.3 ],
       [0.  , 1.  , 0.  , 1.  , 1.  , 0.  , 0.3 ],
       [0.  , 2.  , 0.  , 1.  , 2.  , 0.  , 0.3 ],
       [0.  , 0.  , 1.  , 1.  , 0.  , 1.  , 0.93], # this row gets kept...
       [0.  , 1.  , 1.  , 1.  , 1.  , 1.  , 0.93], # when (I believe) this row actually should be kept
       [0.  , 2.  , 1.  , 1.  , 2.  , 1.  , 0.93],
       [0.  , 0.  , 2.  , 0.  , 0.  , 2.  , 0.92],
       [0.  , 1.  , 2.  , 0.  , 1.  , 2.  , 0.92],
       [0.  , 2.  , 2.  , 0.  , 2.  , 2.  , 0.92],
       [0.  , 0.  , 3.  , 0.  , 0.  , 3.  , 0.94],
       [0.  , 1.  , 3.  , 0.  , 1.  , 3.  , 0.94],
       [0.  , 2.  , 3.  , 0.  , 2.  , 3.  , 0.94]])
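
For reference, here is a hedged sketch of one selection rule that would keep the label-matching row in this example. It is not the actual valor-lite implementation: the column layout [datum_id, gt_id, pd_id, iou, gt_label, pd_label, score] is inferred from the array above, and `select_best_pairs` is a hypothetical helper.

import numpy as np

# assumed column layout, inferred from the printed pairs array:
# [datum_id, gt_id, pd_id, iou, gt_label, pd_label, score]
DATUM, GT, PD, IOU, GT_LABEL, PD_LABEL, SCORE = range(7)

def select_best_pairs(pairs: np.ndarray) -> np.ndarray:
    """Keep one row per (datum, prediction): prefer a ground truth whose label
    matches the prediction's label, then break ties by the highest IoU."""
    best = {}
    for row in pairs:
        key = (row[DATUM], row[PD])
        # rank candidate pairs by (label match, IoU) so a same-label ground truth wins
        rank = (float(row[GT_LABEL] == row[PD_LABEL]), row[IOU])
        if key not in best or rank > best[key][0]:
            best[key] = (rank, row)
    return np.stack([row for _, row in best.values()])

Under a rule like this, prediction 1 (pd label 1, score 0.93) would keep the pair with ground truth 1 (matching label, IoU 1.0) rather than ground truth 0, so label2's true positive would survive into the ranked pairs and its AP would come out as 1.0.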

Expected Behavior

Fix the computations so that the test above passes.