milvus-io / milvus

A cloud-native vector database, storage for next generation AI applications
https://milvus.io
Apache License 2.0

[Bug]: Search results give incorrect output when using text embeddings and image embeddings. #34582

Closed: arian360 closed this issue 1 month ago

arian360 commented 1 month ago

Is there an existing issue for this?

Environment

- Milvus version: milvus:v2.4.1
- Deployment mode (standalone or cluster): standalone
- MQ type (rocksmq, pulsar or kafka):
- SDK version (e.g. pymilvus v2.0.0rc2):
- OS (Ubuntu or CentOS): Windows
- CPU/Memory: 21 GB
- GPU:
- Others:

Current Behavior

Query: whose this number 9948809447?
id: ./train/banana\n07753592_14549.JPEG, distance: 1.0, entity: {'path': './train/banana\n07753592_14549.JPEG', 'text': ''}
id: ./train/banana\n07753592_16664.JPEG, distance: 1.0, entity: {'path': './train/banana\n07753592_16664.JPEG', 'text': ''}
id: ./train/banana\n07753592_18.JPEG, distance: 1.0, entity: {'path': './train/banana\n07753592_18.JPEG', 'text': ''}
id: ./train/banana\n07753592_3043.JPEG, distance: 1.0, entity: {'path': './train/banana\n07753592_3043.JPEG', 'text': ''}
id: 9948809447, distance: 1.1029038429260254, entity: {'path': '9948809447', 'text': 'Phone: 9948809447, First Name: chandu.k, Last Name: komirisetty, Email: chandu.k@ciera.ai, Address: AP, Pin:500050, Job: Hardware, Version: 1, Status: Active, Description: Hardware,Aadhar Number:22334455600,Expiry Date:22-06-2025,PAN nuber :BRRKU7298G,Expiry Date:20-07-2025'}

Expected Behavior

Query: whose this number 9948809447?
id: 9948809447, distance: 1.1029038429260254, entity: {'path': '9948809447', 'text': 'Phone: 9948809447, First Name: chandu.k, Last Name: komirisetty, Email: chandu.k@ciera.ai, Address: AP, Pin:500050, Job: Hardware, Version: 1, Status: Active, Description: Hardware,Aadhar Number:22334455600,Expiry Date:22-06-2025,PAN nuber :BRRKU7298G,Expiry Date:20-07-2025'}

Steps To Reproduce

I am inserting image embeddings and text embeddings into separate vector fields of the same collection. When I search with text, the results list all of the image entries first and only then the matching text entry. When I search with an image, I get a dimension mismatch error.

Milvus Log

No response

Anything else?

Here is the code sample I am using:

import csv
from glob import glob
from pathlib import Path

from towhee import pipe, ops, DataCollection
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from openai import OpenAI
from PIL import Image
import numpy as np

# Towhee parameters
MODEL = 'resnet50'
DEVICE = None  # if None, use default device (cuda is enabled if available)

# Milvus parameters
HOST = '127.0.0.1'
PORT = '19530'
TOPK = 10
DIM = 2048  # dimension of embedding extracted by MODEL
COLLECTION_NAME = 'image_text_collection'
INDEX_TYPE = 'IVF_FLAT'
METRIC_TYPE = 'L2'

# OpenAI parameters
openai_client = OpenAI(api_key="<>")
MODEL_NAME = "text-embedding-3-small"
TEXT_DIMENSION = 1536

# Image path for direct insertion
IMAGE_PATHS = './train/banana/*.JPEG'


def get_text_embedding(text):
    """Retrieve text embeddings from OpenAI."""
    result = openai_client.embeddings.create(input=text, model=MODEL_NAME)
    return result.data[0].embedding


def load_image(x):
    """Load image paths from a CSV file or a glob pattern."""
    if x.endswith('csv'):
        with open(x) as f:
            reader = csv.reader(f)
            next(reader)
            for item in reader:
                yield item[1]
    else:
        for item in glob(x):
            yield item


def create_embedding_pipeline():
    """Create an embedding pipeline."""
    return (
        pipe.input('src')
        .flat_map('src', 'img_path', load_image)
        .map('img_path', 'img', ops.image_decode())
        .map('img', 'vec', ops.image_embedding.timm(model_name=MODEL, device=DEVICE))
        .map(('img_path', 'vec'), ('path', 'text', 'image_embedding', 'text_embedding'),
             lambda img_path, vec: (img_path, "", vec, np.zeros(TEXT_DIMENSION)))
    )


def display_embeddings():
    """Display embedding results."""
    p_embed = create_embedding_pipeline()
    p_display = p_embed.output('img_path', 'img', 'vec')
    DataCollection(p_display('./test/banana/*.JPEG')).show()


def create_milvus_collection(collection_name, dim, text_dim):
    """Create a Milvus collection."""
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)

    fields = [
        FieldSchema(name='path', dtype=DataType.VARCHAR, description='path to image', max_length=500, is_primary=True, auto_id=False),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=1000),
        FieldSchema(name='image_embedding', dtype=DataType.FLOAT_VECTOR, description='image embedding vectors', dim=dim),
        FieldSchema(name='text_embedding', dtype=DataType.FLOAT_VECTOR, description='text embedding vectors', dim=text_dim)
    ]
    schema = CollectionSchema(fields=fields, description='reverse image search')
    collection = Collection(name=collection_name, schema=schema)

    index_params = {
        'metric_type': METRIC_TYPE,
        'index_type': INDEX_TYPE,
        'params': {"nlist": 2048}
    }
    collection.create_index(field_name='image_embedding', index_params=index_params)
    collection.create_index(field_name='text_embedding', index_params=index_params)
    return collection


def insert_image_data(image_paths):
    """Insert image data into Milvus."""
    p_embed = create_embedding_pipeline()
    p_insert = (
        p_embed.map(('path', 'text', 'image_embedding', 'text_embedding'), 'mr',
                    ops.ann_insert.milvus_client(
                        host=HOST,
                        port=PORT,
                        collection_name=COLLECTION_NAME))
        .output('mr')
    )
    p_insert(image_paths)


def insert_text_data():
    """Insert additional text data into Milvus."""
    collection = Collection(name=COLLECTION_NAME)
    sample_data = {
        "user_id": 1004560010,
        "username": "9948809447",
        "email": "chandu.k@ciera.ai",
        "biographical_details": "John Doe is a Software Engineer...",
        "contact_information": "Phone: 9948809447, First Name: chandu.k, Last Name: komirisetty,"
                               " Email: chandu.k@ciera.ai, Address: AP, Pin:500050, "
                               "Job: Hardware, Version: 1, Status: Active, Description: Hardware,"
                               "Aadhar Number:22334455600,Expiry Date:22-06-2025,"
                               "PAN nuber :BRRKU7298G,Expiry Date:20-07-2025"
    }

    # Get embeddings for contact information
    contact_vector = get_text_embedding(sample_data["contact_information"])

    # Insert data into Milvus collection
    collection.insert([
        {
            "path": sample_data["username"],  # Use 'username' as path for consistency
            "text": sample_data["contact_information"],
            "image_embedding": [0] * DIM,  # Placeholder vector for image embedding
            "text_embedding": contact_vector,
        }
    ])
    print("Text data saved successfully")
    collection.load()


def data_fetcher():
    """Fetch data from Milvus."""
    queries = ["whose this number 9948809447?"]

    query_vectors = [get_text_embedding(query) for query in queries]
    collection = Collection(name=COLLECTION_NAME)
    # collection.load()

    res = collection.search(
        data=query_vectors,  # query vectors
        anns_field="text_embedding",
        limit=5,  # number of returned entities
        output_fields=["path", "text"],  # specifies fields to be returned
        param={"metric_type": "L2", "params": {"nprobe": 10}},
        # partition_names=['identity_information']
    )

    for q, r in zip(queries, res):
        print('Query:', q)
        for hit in r:
            print(hit)


def search_images():
    """Create a search pipeline."""
    p_embed = create_embedding_pipeline()
    p_search_pre = (
        p_embed.map('vec', 'search_res', ops.ann_search.milvus_client(
            host=HOST, port=PORT, limit=TOPK,
            collection_name=COLLECTION_NAME))
        .map('search_res', 'pred', lambda x: [str(Path(y[0]).resolve()) for y in x])
    )

    import cv2
    from towhee.types.image import Image

    def read_images(img_paths):
        imgs = []
        for p in img_paths:
            imgs.append(Image(cv2.imread(p), 'BGR'))
        return imgs

    p_search_img = (
        p_search_pre.map('pred', 'pred_images', read_images)
        .output('img', 'pred_images')
    )
    p_search = p_search_pre.output('img_path', 'pred')
    collection = Collection(name=COLLECTION_NAME)
    # Search for example query image(s)
    collection.load()
    dc = p_search('test/banana/*.JPEG')

    # Display search results with image paths
    DataCollection(dc).show()


if __name__ == '__main__':
    connections.connect(host=HOST, port=PORT)

    # Insert image data
    # insert_image_data(IMAGE_PATHS)

    # Insert text data
    # insert_text_data()
    data_fetcher()
    # search_images()
yanliang567 commented 1 month ago

/assign @jaelgu
please help to take a look
/unassign

jaelgu commented 1 month ago

@arian360

  1. When you search for text, the image entries come first because their text_embedding is [0]*dim, which yields a smaller L2 distance than any real, unrelated text embedding.
  2. When you search for an image, you need to specify anns_field='image_embedding' in ops.ann_search.milvus_client. Otherwise, the towhee search operator uses the first detected embedding field (i.e. text_embedding in your case) as the anns field by default. A sketch of both fixes follows this list.
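
For reference, a minimal sketch of both fixes, reusing the names from the code sample above (collection, query_vectors, p_embed). OpenAI embeddings are unit-length, so the L2 distance from a query vector to the all-zeros placeholder is 1.0, which is why those rows rank first. The expr filter below is one possible workaround for point 1 (it assumes image rows can be recognized by their empty text field, as in the reporter's schema); the anns_field argument is the fix described in point 2.

# Point 1 workaround (assumption: image rows were inserted with text="",
# so filtering on the scalar 'text' field keeps their zero-vector
# text_embedding out of text searches entirely):
res = collection.search(
    data=query_vectors,
    anns_field="text_embedding",
    expr='text != ""',  # skip entities that carry no text
    limit=5,
    output_fields=["path", "text"],
    param={"metric_type": "L2", "params": {"nprobe": 10}},
)

# Point 2: name the anns field explicitly in the towhee operator so image
# queries run against 'image_embedding' instead of the default field:
p_search_pre = (
    p_embed.map('vec', 'search_res', ops.ann_search.milvus_client(
        host=HOST, port=PORT, limit=TOPK,
        collection_name=COLLECTION_NAME,
        anns_field='image_embedding'))
    .map('search_res', 'pred', lambda x: [str(Path(y[0]).resolve()) for y in x])
)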
arian360 commented 1 month ago

@jaelgu,

Thank you for your response.

Could you please provide a solution for point 1? Specifically, I need to insert images and text into the same collection but in different vector fields. Could you suggest an implementation for this?

Here is my requirement: GitHub Milvus Discussion

Thank you!

jaelgu commented 1 month ago

@arian360