Open yanliang567 opened 2 years ago
I can reproduce @yanliang567's issue:
CPU run
[58.885 s] glove-200-angular | IVF_PQ | nlist=1024
================================================================================
nprobe = 1, nq = 10000, k = 100, elapse = 0.152s, R@ = 0.1351
nprobe = 2, nq = 10000, k = 100, elapse = 0.156s, R@ = 0.1545
nprobe = 4, nq = 10000, k = 100, elapse = 0.258s, R@ = 0.1654
nprobe = 8, nq = 10000, k = 100, elapse = 0.239s, R@ = 0.1707
nprobe = 16, nq = 10000, k = 100, elapse = 0.338s, R@ = 0.1733
nprobe = 32, nq = 10000, k = 100, elapse = 0.576s, R@ = 0.1750
nprobe = 64, nq = 10000, k = 100, elapse = 0.895s, R@ = 0.1757
nprobe = 128, nq = 10000, k = 100, elapse = 1.545s, R@ = 0.1758
nprobe = 256, nq = 10000, k = 100, elapse = 3.048s, R@ = 0.1758
nprobe = 512, nq = 10000, k = 100, elapse = 6.047s, R@ = 0.1758
================================================================================
[72.803 s] Test 'glove-200-angular/IVF_PQ' done
GPU run
[4.379 s] glove-200-angular | IVF_PQ | nlist=1024
================================================================================
nprobe = 1, nq = 10000, k = 100, elapse = 0.029s, R@ = 0.1170
nprobe = 2, nq = 10000, k = 100, elapse = 0.045s, R@ = 0.1367
nprobe = 4, nq = 10000, k = 100, elapse = 0.083s, R@ = 0.1483
nprobe = 8, nq = 10000, k = 100, elapse = 0.134s, R@ = 0.1548
nprobe = 16, nq = 10000, k = 100, elapse = 0.246s, R@ = 0.1582
nprobe = 32, nq = 10000, k = 100, elapse = 1.246s, R@ = 0.1601
nprobe = 64, nq = 10000, k = 100, elapse = 2.438s, R@ = 0.1610
nprobe = 128, nq = 10000, k = 100, elapse = 4.862s, R@ = 0.1613
nprobe = 256, nq = 10000, k = 100, elapse = 9.730s, R@ = 0.1613
nprobe = 512, nq = 10000, k = 100, elapse = 19.627s, R@ = 0.1613
================================================================================
[43.516 s] Test 'glove-200-angular/IVF_PQ' done
==34089== NVPROF is profiling process 34089, command: ./test
==34089== Profiling application: ./test
==34089== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 25.56% 2.32929s 1250 1.8634ms 1.5279ms 2.3547ms void faiss::gpu::pqCodeDistances<float, float, int=32, bool=1>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, int, faiss::gpu::pqCodeDistances<float, float, int=32, bool=1, float, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqCodeDistances<float, float, int=32, bool=1, float, int=3, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqCodeDistances<float, float, int=32, bool=1, int, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqCodeDistances<float, float, int=32, bool=1, float, int=4, bool=1, int, faiss::gpu::traits>)
17.12% 1.56048s 29608 52.704us 26.144us 81.952us void faiss::gpu::l2SelectMin1<float, int=8, int=256>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::l2SelectMin1<float, int=8, int=256, faiss::gpu::Tensor, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::l2SelectMin1<float, int=8, int=256, unsigned char, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::traits::DefaultPtrTraits, faiss::gpu::l2SelectMin1<float, int=8, int=256, int, int=2, bool=1, int, faiss::gpu::traits>)
17.10% 1.55805s 29628 52.586us 11.360us 182.88us volta_sgemm_128x32_tn
13.76% 1.25412s 1250 1.0033ms 722.82us 1.5675ms void faiss::gpu::pass1SelectLists
==34437== NVPROF is profiling process 34437, command: ./test
==34437== Profiling application: ./test
==34437== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 62.46% 13.6743s 67254 203.32us 672ns 11.811ms [CUDA memcpy DtoH]
7.19% 1.57450s 29608 53.178us 26.049us 75.840us void faiss::gpu::l2SelectMin1<float, int=8, int=256>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::l2SelectMin1<float, int=8, int=256, faiss::gpu::Tensor, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::l2SelectMin1<float, int=8, int=256, unsigned char, int=1, bool=1, int, faiss::gpu::traits>, faiss::gpu::traits::DefaultPtrTraits, faiss::gpu::l2SelectMin1<float, int=8, int=256, int, int=2, bool=1, int, faiss::gpu::traits>)
5.04% 1.10281s 1250 882.25us 874.05us 889.00us void faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4>(faiss::gpu::Tensor<float, int=2, bool=1, int, faiss::gpu::traits::DefaultPtrTraits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=3, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, int, int=2, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=4, bool=1, int, faiss::gpu::traits>, void, int*, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=3, bool=1, int, faiss::gpu::traits>, faiss::gpu::pqScanNoPrecomputedMultiPass<int=8, float, float4, float, int=1, bool=1, int, faiss::gpu::traits>)
4.38% 958.22ms 1250 766.58us 698.53us 858.66us void faiss::gpu::pass1SelectLists
Update the baseline first.
Comparing the logs below, there are actually two issues: