Closed lilothar closed 7 months ago
The internal vectors in QBG are padded to avoid such errors you mentioned. Could you show me the errors?
@masajiro got it, like this
inline static double compareL2(const float16 *a, const float16 *b, size_t size) {
const float16 *last = a + size;
#if defined(NGT_AVX512)
__m512 sum512 = _mm512_setzero_ps();
while (a < last) {
__m512 v = _mm512_sub_ps(_mm512_cvtph_ps(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(a))),
_mm512_cvtph_ps(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(b))));
sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v, v));
a += 16;
b += 16;
}
__m256 sum256 = _mm256_add_ps(_mm512_extractf32x8_ps(sum512, 0), _mm512_extractf32x8_ps(sum512, 1));
__m128 sum128 = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
#elif defined(NGT_AVX2)
__m256 sum256 = _mm256_setzero_ps();
__m256 v;
while (a < last) {
v = _mm256_sub_ps(_mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(a))),
_mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(b))));
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v, v));
a += 8;
b += 8;
v = _mm256_sub_ps(_mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(a))),
_mm256_cvtph_ps(_mm_loadu_si128(reinterpret_cast<const __m128i *>(b))));
sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v, v));
a += 8;
b += 8;
}
__m128 sum128 = _mm_add_ps(_mm256_extractf128_ps(sum256, 0), _mm256_extractf128_ps(sum256, 1));
#else
__m128 sum128 = _mm_setzero_ps();
__m128 v;
while (a < last) {
__m128i va = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
__m128i vb = _mm_load_si128(reinterpret_cast<const __m128i*>(b));
v = _mm_sub_ps(_mm_cvtph_ps(va), _mm_cvtph_ps(vb));
sum128 = _mm_add_ps(sum128, _mm_mul_ps(v, v));
va = _mm_srli_si128(va, 8);
vb = _mm_srli_si128(vb, 8);
v = _mm_sub_ps(_mm_cvtph_ps(va), _mm_cvtph_ps(vb));
sum128 = _mm_add_ps(sum128, _mm_mul_ps(v, v));
a += 8;
b += 8;
}
#endif
__attribute__((aligned(32))) float f[4];
_mm_store_ps(f, sum128);
double s = f[0] + f[1] + f[2] + f[3];
return sqrt(s);
}
if the input data are not aligned, it will case a core about memory access, but it has some hidden prerequisites, I think it should be Eliminates hidden prerequisites for smoother development
Even if the specified vectors are not aligned, the stored vectors are automatically padded and aligned inside the index.
I found that https://github.com/yahoojapan/NGT/blob/main/lib/NGT/PrimitiveComparator.h some compare have not deal with the corner that araray not aligned with
8
, if the vector is align4
will cause error