Open syang1997 opened 2 months ago
Even the proxy MQ latency increased. (Screenshot failed to upload.)
This means it is most likely a proxy issue.
Do we have host machine monitoring metrics?
@xiaofan-luan proxy node monitoring
There is also another problem: the proxy's memory usage (including cache) stays at a high level and is not being reclaimed by GC.
querynode-71
log:
[2024/08/06 20:12:05.108 +00:00] [DEBUG] [querynodev2/services.go:69] ["QueryNode current state"] [NodeID=71] [StateCode=Healthy]
[2024/08/06 20:12:05.413 +00:00] [DEBUG] [config/etcd_source.go:142] ["etcd refreshConfigurations"] [prefix=milvus-c-customer-service/config] [endpoints="[milvus-c-customer-service-etcd.shein-paas-component:2379]"]
[2024/08/06 20:12:06.143 +00:00] [DEBUG] [querynodev2/services.go:69] ["QueryNode current state"] [NodeID=71] [StateCode=Healthy]
[2024/08/06 20:12:06.728 +00:00] [DEBUG] [pipeline/stream_pipeline.go:59] ["stream pipeline fetch msg"] [sum=0]
[2024/08/06 20:12:06.729 +00:00] [DEBUG] [querynodev2/services.go:696] ["start to search segments on worker"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msgID=451667590983188481] [collectionID=451243423475883453] [channel=milvus-c-customer-service-rootcoord-dml_11_451243423475883453v0] [scope=Historical] [segmentIDs="[451243423471022002]"]
[2024/08/06 20:12:06.729 +00:00] [DEBUG] [querynodev2/services.go:703] ["search segments..."] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msgID=451667590983188481] [collectionID=451243423475883453] [channel=milvus-c-customer-service-rootcoord-dml_11_451243423475883453v0] [scope=Historical]
[2024/08/06 20:12:06.729 +00:00] [DEBUG] [segments/validate.go:50] ["read target partitions"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [collectionID=451243423475883453] [partitionIDs="[451243423475883454]"]
[2024/08/06 20:12:06.729 +00:00] [DEBUG] [segments/segment.go:364] ["search segment..."] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [collectionID=451243423475883453] [segmentID=451243423471022002] [segmentType=Sealed] [withIndex=true]
[2024/08/06 20:12:06.751 +00:00] [DEBUG] [segments/segment.go:386] ["search segment done"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [collectionID=451243423475883453] [segmentID=451243423471022002] [segmentType=Sealed] [withIndex=true]
[2024/08/06 20:12:06.751 +00:00] [DEBUG] [querynodev2/services.go:727] [tr/searchSegments] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg="search segments done, channel = milvus-c-customer-service-rootcoord-dml_11_451243423475883453v0, segmentIDs = [451243423471022002]"] [duration=22.349629ms]
[2024/08/06 20:12:07.108 +00:00] [DEBUG] [querynodev2/services.go:69] ["QueryNode current state"] [NodeID=71] [StateCode=Healthy]
[2024/08/06 20:12:07.530 +00:00] [DEBUG] [querynodev2/services.go:696] ["start to search segments on worker"] [traceID=521a93e3eee743d974984af280207faf] [msgID=451667591821787137] [collectionID=451243423475883453] [channel=milvus-c-customer-service-rootcoord-dml_11_451243423475883453v0] [scope=Historical] [segmentIDs="[451243423471022002]"]
[2024/08/06 20:12:07.530 +00:00] [DEBUG] [querynodev2/services.go:703] ["search segments..."] [traceID=521a93e3eee743d974984af280207faf] [msgID=451667591821787137] [collectionID=451243423475883453] [channel=milvus-c-customer-service-rootcoord-dml_11_451243423475883453v0] [scope=Historical]
[2024/08/06 20:12:07.530 +00:00] [DEBUG] [segments/validate.go:50] ["read target partitions"] [traceID=521a93e3eee743d974984af280207faf] [collectionID=451243423475883453] [partitionIDs="[451243423475883454]"]
[2024/08/06 20:12:07.530 +00:00] [DEBUG] [segments/segment.go:364] ["search segment..."] [traceID=521a93e3eee743d974984af280207faf] [collectionID=451243423475883453] [segmentID=451243423471022002] [segmentType=Sealed] [withIndex=true]
[2024/08/06 20:12:07.550 +00:00] [DEBUG] [segments/segment.go:386] ["search segment done"] [traceID=521a93e3eee743d974984af280207faf] [collectionID=451243423475883453] [segmentID=451243423471022002] [segmentType=Sealed] [withIndex=true]
[2024/08/06 20:12:07.550 +00:00] [DEBUG] [querynodev2/services.go:727] [tr/searchSegments] [traceID=521a93e3eee743d974984af280207faf] [msg="search segments done, channel = milvus-c-customer-service-rootcoord-dml_11_451243423475883453v0, segmentIDs = [451243423471022002]"] [duration=20.633616ms]
[2024/08/06 20:12:07.857 +00:00] [DEBUG] [querynodev2/services.go:69] ["QueryNode current state"] [NodeID=71] [StateCode=Healthy]
proxy-66
[2024/08/06 20:12:04.300 +00:00] [DEBUG] [proxy/impl.go:2591] ["Search received"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [role=proxy] [db=ai_customer_service] [collection=tk_en_240731] [partitions="[]"] [dsl=] [len(PlaceholderGroup)=3084] [OutputFields="[standard_question_id]"] [search_params="[{\"key\":\"anns_field\",\"value\":\"embedding\"},{\"key\":\"topk\",\"value\":\"1000\"},{\"key\":\"metric_type\",\"value\":\"COSINE\"},{\"key\":\"round_decimal\",\"value\":\"-1\"},{\"key\":\"ignore_growing\",\"value\":\"false\"},{\"key\":\"offset\",\"value\":\"0\"},{\"key\":\"params\",\"value\":\"{\\\"nprobe\\\":64}\"}]"] [guarantee_timestamp=1]
[2024/08/06 20:12:04.301 +00:00] [DEBUG] [proxy/impl.go:2609] [tr/Search] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg="search request enqueue"] [duration=272.433µs]
[2024/08/06 20:12:04.301 +00:00] [DEBUG] [proxy/impl.go:2611] ["Search enqueued"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [role=proxy] [db=ai_customer_service] [collection=tk_en_240731] [partitions="[]"] [dsl=] [len(PlaceholderGroup)=3084] [OutputFields="[standard_question_id]"] [search_params="[{\"key\":\"anns_field\",\"value\":\"embedding\"},{\"key\":\"topk\",\"value\":\"1000\"},{\"key\":\"metric_type\",\"value\":\"COSINE\"},{\"key\":\"round_decimal\",\"value\":\"-1\"},{\"key\":\"ignore_growing\",\"value\":\"false\"},{\"key\":\"offset\",\"value\":\"0\"},{\"key\":\"params\",\"value\":\"{\\\"nprobe\\\":64}\"}]"] [guarantee_timestamp=1] [timestamp=451667590983188481]
[2024/08/06 20:12:04.301 +00:00] [DEBUG] [proxy/task_search.go:250] ["translate output fields"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [collID=451243423475883453] [collName=tk_en_240731] ["output fields"="[standard_question_id]"]
[2024/08/06 20:12:04.301 +00:00] [DEBUG] [proxy/task_search.go:314] ["create query plan"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [collID=451243423475883453] [collName=tk_en_240731] [nq=1] [dsl=] ["anns field"=embedding] ["query info"="topk:1000 metric_type:\"COSINE\" search_params:\"{\\\"nprobe\\\":64}\" round_decimal:-1 "]
[2024/08/06 20:12:04.301 +00:00] [DEBUG] [proxy/task_search.go:355] [Proxy::searchTask::PreExecute] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [collID=451243423475883453] [collName=tk_en_240731] [nq=1] [plan.OutputFieldIds="[107]"] [plan="vector_anns:<vector_type:FloatVector field_id:101 query_info:<topk:1000 metric_type:\"COSINE\" search_params:\"{\\\"nprobe\\\":64}\" round_decimal:-1 > placeholder_tag:\"$0\" > output_field_ids:107 "]
[2024/08/06 20:12:04.301 +00:00] [DEBUG] [proxy/task_search.go:402] ["search PreExecute done."] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [collID=451243423475883453] [collName=tk_en_240731] [nq=1] [guarantee_ts=451667589672468481] [use_default_consistency=true] ["consistency level"=Bounded] [timeout_ts=0]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:433] ["Search Execute done."] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [nq=1] [collection=451243423475883453] [partitionIDs="[]"]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:436] ["tr/proxy execute search 451667590983188481"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg=done] [duration=2.451684558s]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:712] ["all searches are finished or canceled"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:715] ["proxy receives one search result"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [sourceID=0]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:472] ["tr/searchTask PostExecute"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg=decodeResultStart] [duration=14.826µs]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:739] [tr/decodeSearchResults] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg="decodeSearchResults done"] [duration=119.946µs]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:487] ["proxy search post execute reduce"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [nq=1] [collection=451243423475883453] [partitionIDs="[]"] ["number of valid search results"=1]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:491] ["tr/searchTask PostExecute"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg=reduceResultStart] [duration=7.98µs]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:801] [reduceSearchResultData] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [len(subSearchResultData)=1] [nq=1] [offset=0] [limit=1000] [metricType=COSINE]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:839] [subSearchResultData] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] ["result No."=0] [nq=1] [topk=1000] ["length of pks"=1000] ["length of FieldsData"=1]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:931] ["skip duplicated search result"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [count=0]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:797] [tr/reduceSearchResultData] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg=done] [duration=249.528µs]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:518] ["Search post execute done"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [nq=1] [collection=451243423475883453] [partitionIDs="[]"]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/task_search.go:445] ["tr/searchTask PostExecute"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg=done] [duration=439.406µs]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/impl.go:2634] [tr/Search] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg="wait search result"] [duration=2.452387093s]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/impl.go:2640] [tr/Search] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [msg="wait search result"] [duration=17.01µs]
[2024/08/06 20:12:06.753 +00:00] [DEBUG] [proxy/impl.go:2641] ["Search done"] [traceID=d4e71f132bc05b66cd3f63063b7e04ed] [role=proxy] [db=ai_customer_service] [collection=tk_en_240731] [partitions="[]"] [dsl=] [len(PlaceholderGroup)=3084] [OutputFields="[standard_question_id]"] [search_params="[{\"key\":\"anns_field\",\"value\":\"embedding\"},{\"key\":\"topk\",\"value\":\"1000\"},{\"key\":\"metric_type\",\"value\":\"COSINE\"},{\"key\":\"round_decimal\",\"value\":\"-1\"},{\"key\":\"ignore_growing\",\"value\":\"false\"},{\"key\":\"offset\",\"value\":\"0\"},{\"key\":\"params\",\"value\":\"{\\\"nprobe\\\":64}\"}]"] [guarantee_timestamp=1]
It seems that by the time the querynode receives this request, about 2 seconds have already elapsed.
Is there an existing issue for this?
Environment
Current Behavior
While request traffic was steady, the p99 latency suddenly increased to about 15,000 ms (15 s), even though resources were sufficient and CPU and memory usage were low. What was the reason?
The following is the monitoring data:
The following is the querynode log:
Expected Behavior
No response
Steps To Reproduce
Milvus Log
No response
Anything else?
No response