yahoojapan / NGT

Nearest Neighbor Search with Neighborhood Graph and Tree for High-dimensional Data
Apache License 2.0
1.22k stars 112 forks source link

bugs #143

Closed lilothar closed 7 months ago

lilothar commented 1 year ago

I found that in https://github.com/yahoojapan/NGT/blob/main/lib/NGT/PrimitiveComparator.h some of the compare functions do not handle the corner case where the array length is not a multiple of 8; if the vector length is only a multiple of 4, this will cause an error.

masajiro commented 1 year ago

The internal vectors in QBG are padded to avoid such errors you mentioned. Could you show me the errors?

lilothar commented 1 year ago

@masajiro got it, like this

inline static double compareL2(const float16 *a, const float16 *b, size_t size) {
            const float16 *last = a + size;
#if defined(NGT_AVX512)
            __m512 sum512 = _mm512_setzero_ps();
            while (a < last) {
          __m512 v = _mm512_sub_ps(_mm512_cvtph_ps(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(a))),
                       _mm512_cvtph_ps(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(b))));
          sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v, v));
          a += 16;
          b += 16;
            }

            __m256 sum256 = _mm256_add_ps(_mm512_extractf32x8_ps(sum512, 0), _mm512_extractf32x8_ps(sum512, 1));
            __m128 sum128 = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
#elif defined(NGT_AVX2)
            __m256 sum256 = _mm256_setzero_ps();
            __m256 v;
            while (a < last) {
                v = _mm256_sub_ps(_mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(a))),
                                  _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(b))));
                sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v, v));
                a += 8;
                b += 8;
                v = _mm256_sub_ps(_mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(a))),
                                  _mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(b))));
                sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v, v));
                a += 8;
                b += 8;
            }
            __m128 sum128 = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
#else
            __m128 sum128 = _mm_setzero_ps();
            __m128 v;
            while (a < last) {
          __m128i va = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
          __m128i vb = _mm_load_si128(reinterpret_cast<const __m128i*>(b));
          v = _mm_sub_ps(_mm_cvtph_ps(va), _mm_cvtph_ps(vb));
          sum128 = _mm_add_ps(sum128, _mm_mul_ps(v, v));
          va = _mm_srli_si128(va, 8);
          vb = _mm_srli_si128(vb, 8);
          v = _mm_sub_ps(_mm_cvtph_ps(va), _mm_cvtph_ps(vb));
          sum128 = _mm_add_ps(sum128, _mm_mul_ps(v, v));
              a += 8;
              b += 8;
            }
#endif

            __attribute__((aligned(32))) float f[4];
            _mm_store_ps(f, sum128);
            double s = f[0] + f[1] + f[2] + f[3];
            return sqrt(s);
        }

If the input data are not aligned, it will cause a core dump due to an out-of-bounds memory access. This only works because of some hidden prerequisites; I think those hidden prerequisites should be eliminated for smoother development.

masajiro commented 7 months ago

Even if the specified vectors are not aligned, the stored vectors are automatically padded and aligned inside the index.