voxel51 / fiftyone-brain

Open source AI/ML capabilities for the FiftyOne ecosystem
https://fiftyone.ai/brain.html
Apache License 2.0
128 stars 3 forks source link

Fixing similarity index bugs #171

Closed brimoor closed 9 months ago

brimoor commented 10 months ago

Fixes issues identified in https://github.com/voxel51/fiftyone-brain/pull/168 and https://github.com/voxel51/fiftyone-brain/pull/170

mongodb backend

# Demo script: builds a similarity index with the "mongodb" backend on a
# 10-sample "quickstart" dataset, then adds to it, removes from it, and
# switches its view. The trailing `# N` comments on the print statements
# record the values reported for each step.
#
# NOTE(review): judging by the reported values, `total_index_size` appears to
# count every embedding stored in the index, while `index_size` counts only
# those belonging to the index's current view -- confirm against the
# fiftyone-brain documentation.
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.zoo as foz

# `drop_existing_dataset=True` replaces any existing dataset named "zzz", so
# the script can be re-run from a clean state
dataset = foz.load_zoo_dataset(
    "quickstart",
    max_samples=10,
    dataset_name="zzz",
    drop_existing_dataset=True,
)

# Only the first 5 of the 10 samples are used to build the index
view = dataset[:5]

# optional: precompute embeddings
# They are stored in the "embeddings" field that is passed to
# `compute_similarity()` below
model = foz.load_zoo_model("clip-vit-base32-torch")
embeddings = view.compute_embeddings(model)
view.set_values("embeddings", embeddings.tolist())

index = fob.compute_similarity(
    view,  # although not recommended, this does work
    model="clip-vit-base32-torch",
    backend="mongodb",
    brain_key="img_sim",
    embeddings="embeddings",
)

print(index.total_index_size)  # 5
print(index.index_size)  # 5

# index is created even though embeddings already existed
assert index._index

# issues warnings for skipped IDs
# dataset[3:6] overlaps the indexed view (dataset[:5]) in two samples; with
# `overwrite=False` only the one new sample is added, hence 5 -> 6 below
embeddings, sample_ids, _ = index.compute_embeddings(dataset[3:6])
index.add_to_index(embeddings, sample_ids, overwrite=False, warn_existing=True)

print(index.total_index_size)  # 6
print(index.index_size)  # 5

# skips existing but no warnings
# dataset[5:] contains one sample that was added in the previous step
embeddings, sample_ids, _ = index.compute_embeddings(dataset[5:])
index.add_to_index(embeddings, sample_ids, overwrite=False)

print(index.total_index_size)  # 10
print(index.index_size)  # 5

# Point the index at a 2-sample view; the totals are unchanged and only the
# view-scoped size changes
del_view = dataset[4:6]
index.use_view(del_view)

print(index.total_index_size)  # 10
print(index.index_size)  # 2

# works even though some IDs are not in the index's original view
index.remove_from_index(sample_ids=del_view.values("id"))

print(index.total_index_size)  # 8
print(index.index_size)  # 0

# Switch to the full dataset so every remaining embedding is in view
index.use_view(dataset)

print(index.total_index_size)  # 8
print(index.index_size)  # 8

assert index.ready

# issues a warning rather than raising an error
# The comment line after the query records the text of that warning
puppies = dataset.take(5).sort_by_similarity("puppies")
# The MongoDB backend does not yet support views; the full index will instead be queried, which may result in fewer matches in your current view

sklearn backend

# Demo script: the same add / remove / view-switching sequence as the mongodb
# example, run with the "sklearn" backend. The trailing `# N` comments on the
# print statements record the values reported for each step.
#
# NOTE(review): judging by the reported values, `total_index_size` appears to
# count every embedding stored in the index, while `index_size` counts only
# those belonging to the index's current view -- confirm against the
# fiftyone-brain documentation.
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.zoo as foz

# `drop_existing_dataset=True` replaces any existing dataset named "zzz", so
# the script can be re-run from a clean state
dataset = foz.load_zoo_dataset(
    "quickstart",
    max_samples=10,
    dataset_name="zzz",
    drop_existing_dataset=True,
)

# Only the first 5 of the 10 samples are used to build the index
view = dataset[:5]

# No embeddings are precomputed in this script.
# NOTE(review): presumably `compute_similarity()` computes them with `model`
# and stores them in the "embeddings" field -- confirm.
index = fob.compute_similarity(
    view,  # although not recommended, this does work
    model="clip-vit-base32-torch",
    backend="sklearn",
    brain_key="img_sim",
    embeddings="embeddings",
)

print(index.total_index_size)  # 5
print(index.index_size)  # 5

# issues warnings for skipped IDs
# dataset[3:6] overlaps the indexed view (dataset[:5]) in two samples; with
# `overwrite=False` only the one new sample is added, hence 5 -> 6 below
embeddings, sample_ids, _ = index.compute_embeddings(dataset[3:6])
index.add_to_index(embeddings, sample_ids, overwrite=False, warn_existing=True)

print(index.total_index_size)  # 6
print(index.index_size)  # 5

# skips existing but no warnings
# dataset[5:] contains one sample that was added in the previous step
embeddings, sample_ids, _ = index.compute_embeddings(dataset[5:])
index.add_to_index(embeddings, sample_ids, overwrite=False)

print(index.total_index_size)  # 10
print(index.index_size)  # 5

# Point the index at a 2-sample view; the totals are unchanged and only the
# view-scoped size changes
del_view = dataset[4:6]
index.use_view(del_view)

print(index.total_index_size)  # 10
print(index.index_size)  # 2

# works even though some IDs are not in the index's original view
index.remove_from_index(sample_ids=del_view.values("id"))

print(index.total_index_size)  # 8
print(index.index_size)  # 0

# Switch to the full dataset so every remaining embedding is in view
index.use_view(dataset)

print(index.total_index_size)  # 8
print(index.index_size)  # 8

# Text-prompt similarity query on a random 5-sample view of the dataset
puppies = dataset.take(5).sort_by_similarity("puppies")
brimoor commented 9 months ago

@allenleetc ah yes, you are correct! Added your fix in https://github.com/voxel51/fiftyone-brain/pull/171/commits/bd2fffe489ccc837e48faae0e3154af9db98927a 👍