Closed zhuwenxing closed 3 months ago
- Milvus version:master - Deployment mode(standalone or cluster): - MQ type(rocksmq, pulsar or kafka): - SDK version(e.g. pymilvus v2.0.0rc2): - OS(Ubuntu or CentOS): - CPU/Memory: - GPU: - Others:
[2024-05-22T10:12:44.871Z] + python3 scripts/second_recall_test.py --host 10.255.242.203 [2024-05-22T10:12:45.432Z] 2024-05-22 10:12:45.251 | INFO | __main__:search_test:53 - recall test for index type HNSW [2024-05-22T10:12:46.003Z] 2024-05-22 10:12:45.944 | INFO | __main__:search_test:63 - [2024-05-22T10:12:46.003Z] Search... [2024-05-22T10:12:52.529Z] 2024-05-22 10:12:51.367 | INFO | __main__:search_test:69 - search cost 5.4230 seconds [2024-05-22T10:12:52.530Z] Traceback (most recent call last): [2024-05-22T10:12:52.530Z] File "scripts/second_recall_test.py", line 103, in <module> [2024-05-22T10:12:52.530Z] search_test(host, index_type) [2024-05-22T10:12:52.530Z] File "scripts/second_recall_test.py", line 82, in search_test [2024-05-22T10:12:52.530Z] assert len(item) == len(true_ids[index]), f"get {len(item)} but expect {len(true_ids[index])}" [2024-05-22T10:12:52.530Z] AssertionError: get 99 but expect 100
import h5py
import numpy as np
import time
import sys
import threading
from pathlib import Path
from loguru import logger
from pymilvus import connections, Collection

# Index types covered by this recall test suite.
all_index_types = ["IVF_FLAT", "IVF_SQ8", "HNSW"]

dim = 128      # vector dimensionality of the sift-128-euclidean dataset
TIMEOUT = 200  # per-search timeout passed to pymilvus, in seconds


def read_benchmark_hdf5(file_path):
    """Load train/test/ground-truth arrays from an ann-benchmarks HDF5 file.

    Args:
        file_path: path to a dataset in ann-benchmarks layout, containing
            the datasets "train", "test" and "neighbors".

    Returns:
        Tuple of numpy arrays ``(train, test, neighbors)``.
    """
    # Context manager guarantees the file handle is closed even if one of
    # the np.array() copies raises (the original leaked the handle on error).
    with h5py.File(file_path, 'r') as f:
        train = np.array(f["train"])
        test = np.array(f["test"])
        neighbors = np.array(f["neighbors"])
    return train, test, neighbors


def gen_search_param(index_type, metric_type="L2"):
    """Build the search-parameter dict for the given index type.

    Args:
        index_type: one of FLAT / IVF_* / BIN_* / HNSW / ANNOY.
        metric_type: distance metric for float indexes; binary indexes
            always use HAMMING regardless of this argument.

    Returns:
        A single pymilvus search-params dict (the first generated candidate).

    Raises:
        ValueError: if ``index_type`` is not recognized.
    """
    search_params = []
    if index_type in ["FLAT", "IVF_FLAT", "IVF_SQ8", "IVF_PQ"]:
        for nprobe in [10]:
            search_params.append({"metric_type": metric_type, "params": {"nprobe": nprobe}})
    elif index_type in ["BIN_FLAT", "BIN_IVF_FLAT"]:
        for nprobe in [10]:
            search_params.append({"metric_type": "HAMMING", "params": {"nprobe": nprobe}})
    elif index_type in ["HNSW"]:
        for ef in [150]:
            search_params.append({"metric_type": metric_type, "params": {"ef": ef}})
    elif index_type == "ANNOY":
        for search_k in [1000]:
            search_params.append({"metric_type": metric_type, "params": {"search_k": search_k}})
    else:
        logger.info("Invalid index_type.")
        # ValueError is more specific than the original bare Exception;
        # callers catching Exception are unaffected.
        raise ValueError("Invalid index_type.")
    return search_params[0]


def search_test(host="127.0.0.1", index_type="HNSW"):
    """Run a recall test against a pre-built Milvus collection.

    Searches the sift-128-euclidean test vectors three times and compares
    each result set against the dataset's ground-truth neighbors.

    Args:
        host: Milvus server address (port is fixed at 19530).
        index_type: index type of the target collection; the collection is
            expected to be named ``sift_128_euclidean_<index_type>``.

    Raises:
        AssertionError: if any per-query result set is truncated, or the
            measured recall falls outside the accepted range.
    """
    logger.info(f"recall test for index type {index_type}")
    file_path = f"{str(Path(__file__).absolute().parent.parent.parent)}/assets/ann_hdf5/sift-128-euclidean.hdf5"
    train, test, neighbors = read_benchmark_hdf5(file_path)
    connections.connect(host=host, port="19530")
    collection = Collection(name=f"sift_128_euclidean_{index_type}")
    nq = 10000
    topK = 100
    search_params = gen_search_param(index_type)
    for _ in range(3):
        t0 = time.time()
        logger.info(f"\nSearch...")
        # define output_fields of search result
        res = collection.search(
            test[:nq], "float_vector", search_params, topK,
            output_fields=["int64"],
            timeout=TIMEOUT,
        )
        t1 = time.time()
        logger.info(f"search cost {t1 - t0:.4f} seconds")
        # Collect the returned primary keys per query.
        result_ids = [[hit.entity.get("int64") for hit in hits] for hits in res]
        # Ground truth: top-K neighbor ids for each of the nq queries.
        true_ids = neighbors[:nq, :topK]
        sum_ratio = 0.0
        for index, item in enumerate(result_ids):
            # A truncated result set (see issue log: got 99, expected 100)
            # would silently inflate recall, so fail fast here.
            assert len(item) == len(true_ids[index]), f"get {len(item)} but expect {len(true_ids[index])}"
            overlap = set(true_ids[index]).intersection(item)
            sum_ratio += len(overlap) / len(item)
        recall = round(sum_ratio / len(result_ids), 6)
        logger.info(f"recall={recall}")
        if index_type in ["IVF_PQ", "ANNOY"]:
            assert recall >= 0.6, f"recall={recall} < 0.6"
        else:
            # Exactly 1.0 is treated as suspicious (likely a ground-truth
            # mix-up), hence the half-open interval.
            assert 0.95 <= recall < 1.0, f"recall={recall} is outside the expected range [0.95, 1.0)"


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='config for recall test')
    parser.add_argument('--host', type=str, default="127.0.0.1", help='milvus server ip')
    args = parser.parse_args()
    host = args.host
    for index_type in ["HNSW"]:
        search_test(host, index_type)
No response
failed job: https://qa-jenkins.milvus.io/blue/organizations/jenkins/deploy_test_cron/detail/deploy_test_cron/2319/pipeline log: artifacts-rocksmq-standalone-reinstall-2319-server-logs.tar.gz
This issue can be stably reproduced in master-latest.
/assign @congqixia /unassign
This shall be fixed by #33359 — could you please verify? /assign @zhuwenxing
not reproduced in master-20240530-589d4dfd-amd64
master-20240530-589d4dfd-amd64
Is there an existing issue for this?
Environment
Current Behavior
Expected Behavior
No response
Steps To Reproduce
No response
Milvus Log
failed job: https://qa-jenkins.milvus.io/blue/organizations/jenkins/deploy_test_cron/detail/deploy_test_cron/2319/pipeline log: artifacts-rocksmq-standalone-reinstall-2319-server-logs.tar.gz
Anything else?
This issue can be stably reproduced in master-latest.