Closed arian360 closed 1 month ago
/assign @jaelgu please help to take a look /unassign
@arian360
anns_field='image_embedding'
in ops.ann_search.milvus_client
. Otherwise, the towhee search operator will use the first detected embedding field (i.e. text_embedding
in your case) as the anns field by default.

@jaelgu,
Thank you for your response.
Could you please provide a solution for point 1? Specifically, I need to insert images and text into the same collection but in different vector fields. Could you suggest an implementation for this?
Here is my requirement: GitHub Milvus Discussion
Thank you!
@arian360
Is there an existing issue for this?
Environment
Current Behavior
Query: whose this number 9948809447? id: ./train/banana\n07753592_14549.JPEG, distance: 1.0, entity: {'path': './train/banana\n07753592_14549.JPEG', 'text': ''} id: ./train/banana\n07753592_16664.JPEG, distance: 1.0, entity: {'path': './train/banana\n07753592_16664.JPEG', 'text': ''} id: ./train/banana\n07753592_18.JPEG, distance: 1.0, entity: {'path': './train/banana\n07753592_18.JPEG', 'text': ''} id: ./train/banana\n07753592_3043.JPEG, distance: 1.0, entity: {'path': './train/banana\n07753592_3043.JPEG', 'text': ''} id: 9948809447, distance: 1.1029038429260254, entity: {'path': '9948809447', 'text': 'Phone: 9948809447, First Name: chandu.k, Last Name: komirisetty, Email: chandu.k@ciera.ai, Address: AP, Pin:500050, Job: Hardware, Version: 1, Status: Active, Description: Hardware,Aadhar Number:22334455600,Expiry Date:22-06-2025,PAN nuber :BRRKU7298G,Expiry Date:20-07-2025'}
Expected Behavior
Query: whose this number 9948809447? id: 9948809447, distance: 1.1029038429260254, entity: {'path': '9948809447', 'text': 'Phone: 9948809447, First Name: chandu.k, Last Name: komirisetty, Email: chandu.k@ciera.ai, Address: AP, Pin:500050, Job: Hardware, Version: 1, Status: Active, Description: Hardware,Aadhar Number:22334455600,Expiry Date:22-06-2025,PAN nuber :BRRKU7298G,Expiry Date:20-07-2025'}
Steps To Reproduce
Milvus Log
No response
Anything else?
Here is the code sample I am using: import csv from glob import glob from pathlib import Path
from towhee import pipe, ops, DataCollection from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility from openai import OpenAI from PIL import Image import numpy as np
Towhee parameters
# Towhee image-embedding parameters.
MODEL = 'resnet50'  # timm model name used for image embeddings
DEVICE = None  # if None, use default device (cuda is enabled if available)
Milvus parameters
# Milvus connection / collection parameters.
HOST = '127.0.0.1'
PORT = '19530'
TOPK = 10  # number of nearest neighbours returned per search
DIM = 2048  # dimension of embedding extracted by MODEL
COLLECTION_NAME = 'image_text_collection'
INDEX_TYPE = 'IVF_FLAT'
METRIC_TYPE = 'L2'
OpenAI parameters
# OpenAI text-embedding parameters.
openai_client = OpenAI(api_key="<>")  # TODO: replace placeholder API key before running
MODEL_NAME = "text-embedding-3-small"
TEXT_DIMENSION = 1536  # output dimension of text-embedding-3-small
Image path for direct insertion
# Glob pattern of images inserted directly into the collection.
IMAGE_PATHS = './train/banana/*.JPEG'
def get_text_embedding(text):
    """Return the OpenAI embedding vector for *text* using MODEL_NAME."""
    response = openai_client.embeddings.create(input=text, model=MODEL_NAME)
    return response.data[0].embedding
def load_image(x):
    """Yield image paths from either a CSV listing or a glob pattern.

    A path ending in 'csv' is read as a CSV file whose second column holds
    the image path (the header row is skipped); anything else is treated as
    a glob pattern and every match is yielded.
    """
    if not x.endswith('csv'):
        yield from glob(x)
        return
    with open(x) as f:
        rows = csv.reader(f)
        next(rows)  # skip the header row
        for row in rows:
            yield row[1]
def create_embedding_pipeline():
    """Build a towhee pipeline turning an image source into insert-ready rows.

    The 'src' input (glob pattern or CSV) is expanded to image paths, each
    image is decoded and embedded with the timm MODEL, and the final map
    emits the four collection fields (path, text, image_embedding,
    text_embedding).  'text' is left empty and 'text_embedding' is a zero
    vector because this pipeline only processes images.
    """
    return (
        pipe.input('src')
        .flat_map('src', 'img_path', load_image)
        .map('img_path', 'img', ops.image_decode())
        .map('img', 'vec', ops.image_embedding.timm(model_name=MODEL, device=DEVICE))
        # Fan the single image vector out into the full row: empty text plus a
        # zero text_embedding placeholder so both vector fields are populated.
        .map(('img_path', 'vec'), ('path', 'text', 'image_embedding', 'text_embedding'),
             lambda img_path, vec: (img_path, "", vec, np.zeros(TEXT_DIMENSION)))
    )
def display_embeddings():
    """Run the embedding pipeline on the test images and render the results."""
    p_embed = create_embedding_pipeline()
    p_display = p_embed.output('img_path', 'img', 'vec')
    DataCollection(p_display('./test/banana/*.JPEG')).show()
def create_milvus_collection(collection_name, dim, text_dim):
    """Create a Milvus collection, dropping any existing one first.

    NOTE(review): only the drop-if-exists step is visible here; the schema
    and index creation that should use `dim` and `text_dim` appear truncated
    in this paste — confirm against the full source.
    """
    if utility.has_collection(collection_name):
        utility.drop_collection(collection_name)
def insert_image_data(image_paths):
    """Embed the images at *image_paths* and insert the rows into Milvus.

    Each row carries (path, text, image_embedding, text_embedding) as
    produced by create_embedding_pipeline().
    """
    p_embed = create_embedding_pipeline()
    p_insert = (
        p_embed.map(('path', 'text', 'image_embedding', 'text_embedding'), 'mr',
                    ops.ann_insert.milvus_client(
                        host=HOST,
                        port=PORT,
                        collection_name=COLLECTION_NAME
                    ))
        .output('mr')
    )
    p_insert(image_paths)
def insert_text_data():
    """Insert an additional text-only record into the shared collection.

    NOTE(review): only the sample payload is visible here; the text-embedding
    call and the collection.insert(...) step appear truncated in this paste —
    confirm against the full source.
    """
    collection = Collection(name=COLLECTION_NAME)
    sample_data = {
        "user_id": 1004560010,
        "username": "9948809447",
        "email": "chandu.k@ciera.ai",
        "biographical_details": "John Doe is a Software Engineer...",
        "contact_information": "Phone: 9948809447, First Name: chandu.k, Last Name: komirisetty,"
                               " Email: chandu.k@ciera.ai, Address: AP, Pin:500050, "
                               "Job: Hardware, Version: 1, Status: Active, Description: Hardware,"
                               "Aadhar Number:22334455600,Expiry Date:22-06-2025,"
                               "PAN nuber :BRRKU7298G,Expiry Date:20-07-2025"
    }
def data_fetcher():
    """Fetch data from Milvus for the sample queries.

    NOTE(review): only the query list is visible here; the search/fetch logic
    appears truncated in this paste — confirm against the full source.
    """
    queries = ["whose this number 9948809447?"]
def search_images():
    """Create a search pipeline that queries the image-embedding field.

    Fix: the collection holds two vector fields (image_embedding and
    text_embedding).  Without an explicit anns_field, the towhee search
    operator defaults to the first detected embedding field (text_embedding
    here), which produced the wrong matches in the reported behavior — so
    anns_field='image_embedding' is passed explicitly.

    NOTE(review): the pipeline is built but never output/returned in the
    visible code; the tail of this function appears truncated in this paste.
    """
    p_embed = create_embedding_pipeline()
    p_search_pre = (
        p_embed.map('vec', 'search_res', ops.ann_search.milvus_client(
            host=HOST,
            port=PORT,
            limit=TOPK,
            collection_name=COLLECTION_NAME,
            anns_field='image_embedding'))  # search image vectors, not text
        .map('search_res', 'pred', lambda x: [str(Path(y[0]).resolve()) for y in x])
    )
# Fix: markdown stripped the dunder underscores — `if name == 'main':` raises
# NameError and never matches; the standard entry-point guard is restored.
if __name__ == '__main__':
    connections.connect(host=HOST, port=PORT)