Closed lemire closed 7 years ago
You added the naive AVX2 algorithm, but without loop unrolling. Consider adding something like this:
size_t naiveavx2_unrolled(const char* s, size_t text_size, const char* needle, size_t needle_size) { assert(text_size % 32 == 0); // todo: fix it so we can handle variable-length inputs and // can catch matches at the end of the data. for (size_t i = 0; i < text_size - needle_size; i += 32) { uint32_t found = 0xFFFFFFFF; // 32 1-bits size_t j = 0; for (; (j + 3 < needle_size) && (found != 0) ; j += 4) { __m256i textvector1 = _mm256_loadu_si256((const __m256i *)(s + i + j)); __m256i needlevector1 = _mm256_set1_epi8(needle[j]); __m256i textvector2 = _mm256_loadu_si256((const __m256i *)(s + i + j + 1)); __m256i needlevector2 = _mm256_set1_epi8(needle[j + 1]); __m256i cmp1 = _mm256_cmpeq_epi8(textvector1, needlevector1); __m256i cmp2 = _mm256_cmpeq_epi8(textvector2, needlevector2); __m256i textvector3 = _mm256_loadu_si256((const __m256i *)(s + i + j + 2)); __m256i needlevector3 = _mm256_set1_epi8(needle[j + 2]); __m256i textvector4 = _mm256_loadu_si256((const __m256i *)(s + i + j + 3)); __m256i needlevector4 = _mm256_set1_epi8(needle[j + 3]); __m256i cmp3 = _mm256_cmpeq_epi8(textvector3, needlevector3); __m256i cmp4 = _mm256_cmpeq_epi8(textvector4, needlevector4); __m256i cmp12 = _mm256_and_si256(cmp1,cmp2); __m256i cmp34 = _mm256_and_si256(cmp3,cmp4); uint32_t bitmask = _mm256_movemask_epi8(_mm256_and_si256(cmp12,cmp34)); found = found & bitmask; } for (; (j < needle_size) && (found != 0) ; ++j) { __m256i textvector = _mm256_loadu_si256((const __m256i *)(s + i + j)); __m256i needlevector = _mm256_set1_epi8(needle[j]); uint32_t bitmask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(textvector, needlevector)); found = found & bitmask; } if(found != 0) { // got a match... maybe return i + __builtin_ctz(found); } } return text_size; }
Done! thanks
You added the naive AVX2 algorithm, but without loop unrolling. Consider adding something like this: