Closed makarr closed 1 month ago
This should get you most of the way there. Maybe clean up the reduce function.
/* Forward declaration: the horizontal-sum helper is defined after this kernel,
 * and calling an undeclared function is invalid since C99. */
static inline simsimd_f32_t _mm256_reduce_add_ps(__m256 vec);

/**
 *  Dot product of two single-precision vectors using 256-bit AVX loads and FMA.
 *
 *  @param a       First input vector, at least `n` elements; no alignment required.
 *  @param b       Second input vector, at least `n` elements; no alignment required.
 *  @param n       Number of elements in each vector.
 *  @param result  Output; receives the sum of a[i] * b[i] for i in [0, n).
 *
 *  The sum is accumulated in single precision and only converted to the
 *  output type on the final store.
 */
SIMSIMD_PUBLIC void simsimd_dot_f32_haswell(simsimd_f32_t const* a, simsimd_f32_t const* b,
                                            simsimd_size_t n, simsimd_distance_t* result) {
    __m256 ab_vec = _mm256_setzero_ps();
    simsimd_size_t i = 0;

    // Main loop: eight lanes per iteration, products fused into the accumulator.
    for (; i + 8 <= n; i += 8) {
        __m256 a_vec = _mm256_loadu_ps(a + i);
        __m256 b_vec = _mm256_loadu_ps(b + i);
        ab_vec = _mm256_fmadd_ps(a_vec, b_vec, ab_vec);
    }

    // Collapse the eight partial sums into one scalar.
    simsimd_f32_t ab = _mm256_reduce_add_ps(ab_vec);

    // Scalar tail for the remaining (n % 8) elements: these must be products,
    // not sums, to match what the vector loop accumulates.
    for (; i < n; ++i) {
        ab += a[i] * b[i];
    }

    *result = ab;
}
/**
 *  Horizontal sum of the eight single-precision lanes of `vec`.
 *
 *  Declared `static inline`: a plain `inline` definition in C provides no
 *  external definition, so a call that the compiler chooses not to inline
 *  would fail to link.
 *
 *  NOTE(review): the `_mm256_` prefix lives in the namespace used by the
 *  compiler's intrinsic headers; consider a project-prefixed name to avoid
 *  a future clash.
 *
 *  @param vec  Vector ( x7, x6, x5, x4, x3, x2, x1, x0 ).
 *  @return     x0 + x1 + ... + x7, added pairwise as shown in the comments below.
 */
static inline simsimd_f32_t _mm256_reduce_add_ps(__m256 vec) {
    // Upper half ( x7, x6, x5, x4 ) and lower half ( x3, x2, x1, x0 ).
    __m128 upper_half = _mm256_extractf128_ps(vec, 1);
    __m128 lower_half = _mm256_castps256_ps128(vec);

    // ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
    __m128 quad = _mm_add_ps(lower_half, upper_half);

    // Move the two high lanes down: ( -, -, x3 + x7, x2 + x6 )
    __m128 quad_high = _mm_movehl_ps(quad, quad);

    // ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
    __m128 pair = _mm_add_ps(quad, quad_high);

    // Bring lane 1 down to lane 0: ( -, -, -, x1 + x3 + x5 + x7 )
    __m128 pair_high = _mm_shuffle_ps(pair, pair, 0x1);

    // Only lane 0 matters now: ( -, -, -, x0 + x1 + ... + x7 )
    __m128 total = _mm_add_ss(pair, pair_high);

    // Extract the lowest lane as a scalar.
    return _mm_cvtss_f32(total);
}
Thank you @makarr! I've added the kernels and am looking forward to merging them in a day.
simsimd_dot_f32_haswell declared but not implemented