Striveworks / valor

Valor is a centralized evaluation store which makes it easy to measure, explore, and rank model performance.
https://striveworks.github.io/valor/

BUG: Add test to `lite` checking for classification bug #763

Closed ntlind closed 1 month ago

ntlind commented 2 months ago


Reproducible Example

I have a basically perfect classifier for a three-class classification problem, but I'm getting things like ROC AUC 0.5145382939500587. Any ideas?

import datetime
import random
from copy import deepcopy

from tqdm import tqdm

# imports assumed from the valor Python client; a connection to a running
# valor API is also assumed
from valor import (
    Annotation,
    Dataset,
    Datum,
    GroundTruth,
    Label,
    Model,
    Prediction,
)

dataset = Dataset.create(
    "wildfire-dataset", metadata={"description": "wildfire prediction dataset"}
)
model = Model.create("wildfire-predictor")

def create_gt_and_preds(n_datums=100):

    low = Label(key="class", value="LOW_RISK")
    med = Label(key="class", value="MEDIUM_RISK")
    high = Label(key="class", value="HIGH_RISK")

    labels = [low, med, high]

    for i in tqdm(range(n_datums)):
        datum = Datum(
            uid=str(i),
            metadata={
                "day": datetime.date(
                    2024, random.choice(range(1, 13)), random.choice(range(1, 29))
                )
            },
        )

        label = random.choice(labels)

        dataset.add_groundtruth(
            GroundTruth(
                datum=datum,
                annotations=[
                    Annotation(
                        labels=[label],
                    ),
                ],
            )
        )

        # random.uniform(0, 1) < 2 is always true, so the predicted label
        # always matches the ground truth (a "perfect" classifier)
        if random.uniform(0, 1) < 2:
            pred_label = label
        else:
            pred_label = random.choice(labels)

        pred_label = deepcopy(pred_label)

        # only two of the three labels get explicit scores; the third class
        # is left with an implicit zero
        other_label = deepcopy(random.choice([la for la in labels if la != pred_label]))
        pred_label.score = 0.99
        other_label.score = 0.01

        model.add_prediction(
            dataset,
            Prediction(
                datum=datum, annotations=[Annotation(labels=[pred_label, other_label])]
            ),
        )

create_gt_and_preds()

dataset.finalize()

eval_job = model.evaluate_classification(dataset)
eval_job.wait_for_completion()
print(eval_job.metrics)

Issue Description

Accuracy, precision, etc. are equal to 1 using the API service, but ROC AUC seems off.

Making the code adjustment below seems to fix the issue:

        if random.uniform(0, 1) < 0.8:
            pred_label = label
        else:
            pred_label = random.choice(labels)

        pred_label = deepcopy(pred_label)

        other_label = deepcopy(random.choice([la for la in labels if la != pred_label]))
        last_label = deepcopy(
            [la for la in labels if la != pred_label and la != other_label][0]
        )

        # every label now gets an explicit score, so no class is left at an
        # implicit zero
        pred_label.score = 0.9
        other_label.score = 0.05
        last_label.score = 0.05

Expected Behavior

Check to make sure we're handling zeroes (labels with no explicit score) correctly in `lite`.
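
For reference, a minimal sklearn-only sketch of the same situation (the 0.99/0.01 split mirrors the reproduction above, with the third class written out as an explicit 0.0 instead of an implicit zero); a perfect classifier should come out at 1.0, not ~0.51:

import random

from sklearn.metrics import roc_auc_score

rng = random.Random(0)
n_classes = 3

y_true, y_score = [], []
for _ in range(100):
    true_idx = rng.randrange(n_classes)  # ground-truth class
    other_idx = rng.choice([i for i in range(n_classes) if i != true_idx])  # runner-up
    row = [0.0] * n_classes              # unpredicted class kept at an explicit 0.0
    row[true_idx] = 0.99                 # prediction always matches the ground truth
    row[other_idx] = 0.01
    y_true.append(true_idx)
    y_score.append(row)

# a perfect classifier should score 1.0 one-vs-rest, not ~0.51
print(roc_auc_score(y_true, y_score, multi_class="ovr"))  # -> 1.0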

ntlind commented 1 month ago

Wrote the following tests to confirm that we get the same output as sklearn:


# imports assumed from the valor_lite classification module
from valor_lite.classification import Classification, DataLoader, MetricType


def test_sklearn_output():
    from sklearn.metrics import roc_auc_score

    n_datums = 100

    # the single positive datum has the lowest score, so sklearn returns 0
    y_true = [0] * (n_datums - 1) + [1]
    y_score = [0.99] * (n_datums - 1) + [0.01]
    output = roc_auc_score(y_true, y_score, multi_class="ovr")

    assert output == 0

def test_perfect_matches():

    n_datums = 100

    classifications = []

    # every datum is labelled "low_risk" and predicted "low_risk" with score
    # 0.99 ("high_risk" gets the remaining 0.01)
    for i in range(n_datums):
        classifications.append(
            Classification(
                uid=f"uid{i}",
                groundtruths=["low_risk"],
                predictions=["low_risk", "high_risk"],
                scores=[0.99, 0.01],
            )
        )

    loader = DataLoader()
    loader.add_data(classifications)
    evaluator = loader.finalize()

    metrics = evaluator.evaluate(as_dict=True)

    assert metrics[MetricType.ROCAUC][0]["value"] == 0
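
A non-degenerate fixture could also be pinned against sklearn. The sketch below (a suggestion, not part of the fix) only computes the sklearn reference value; a matching valor_lite test would build Classification objects from the same rows and compare against it:

def test_mixed_reference_value():
    from sklearn.metrics import roc_auc_score

    # three classes, six datums, a mix of correct and incorrect predictions
    y_true = [0, 0, 1, 1, 2, 2]
    y_score = [
        [0.90, 0.05, 0.05],  # correct, confident
        [0.40, 0.50, 0.10],  # wrong
        [0.10, 0.80, 0.10],  # correct
        [0.30, 0.60, 0.10],  # correct
        [0.05, 0.05, 0.90],  # correct
        [0.70, 0.20, 0.10],  # wrong
    ]

    output = roc_auc_score(y_true, y_score, multi_class="ovr")

    # macro one-vs-rest AUC for this fixture works out to 0.895833...
    assert round(output, 4) == 0.8958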