ashvardanian / StringZilla

Up to 10x faster strings for C, C++, Python, Rust, and Swift, leveraging NEON, AVX2, AVX-512, and SWAR to accelerate search, sort, edit distances, alignment scores, etc 🦖
https://ashvardanian.com/posts/stringzilla/
Apache License 2.0
2.05k stars 66 forks source link

SWAR acceleration for UTF8 Hamming Distance #93

Open ashvardanian opened 6 months ago

ashvardanian commented 6 months ago

The upcoming implementation contains only a serial variant. Once the demand for a more efficient implementation grows, one can add a SWAR accelerated implementation.

Draft ```c SZ_PUBLIC sz_size_t sz_hamming_distance_utf8( // sz_cptr_t a, sz_size_t a_length, // sz_cptr_t b, sz_size_t b_length, // sz_size_t bound) { sz_size_t const min_length = sz_min_of_two(a_length, b_length); sz_size_t const max_length = sz_max_of_two(a_length, b_length); sz_cptr_t const a_min_end = a + min_length; sz_size_t distance = 0; bound = bound == 0 ? max_length : bound; #if SZ_USE_MISALIGNED_LOADS && !SZ_DETECT_BIG_ENDIAN if (min_length >= SZ_SWAR_THRESHOLD) { sz_u64_vec_t a_vec, b_vec; sz_u64_vec_t a_2byte_vec, b_2byte_vec; sz_u64_vec_t prefixes_2byte_vec, match_vec, prefix_resolution_mask_vec; prefixes_2byte_vec.u64 = 0xC080C080C080C080ull; // Walk through both strings using SWAR and counting the number of differing characters. // At every point check if the next 8 bytes of content contain characters of equal length. for (; a + 8 <= a_min_end && distance < bound; a += 8, b += 8) { a_vec = sz_u64_load(a); b_vec = sz_u64_load(b); // Check if the prefixes contain up to 8x codepoints of length 1. // UTF8 1-byte codepoints look like: 0xxxxxxx. sz_size_t a_1byte_chars = sz_u64_ctz(a_vec.u64 & 0x8080808080808080ull) / 8; sz_size_t b_1byte_chars = sz_u64_ctz(b_vec.u64 & 0x8080808080808080ull) / 8; sz_size_t max_1byte_chars = sz_max_of_two(a_1byte_chars, b_1byte_chars); sz_size_t min_1byte_chars = sz_min_of_two(a_1byte_chars, b_1byte_chars); // Check if at least one of the strings starts with a 1-byte character. if (max_1byte_chars) { // One of the strings doesn't start with 1-byte characters. if (!a_1byte_chars && !b_1byte_chars) { a += min_1byte_chars, b += min_1byte_chars; continue; } // Both strings start with `min_1byte_chars`-many 1-byte characters. // Compare them using SWAR and skip. else { match_vec = _sz_u64_each_byte_equal(a_vec, b_vec); prefix_resolution_mask_vec = ...; sz_size_t matching_prefix_length = sz_u64_popcount((~match_vec.u64) & 0x8080808080808080ull); ... } } // Check if the prefixes contain up to 4x codepoints of length 2. // UTF8 2-byte codepoints look like: 110xxxxx 10xxxxxx. // Those bits will be matches with a 0xE0C0 repeated mask, matching the 0xC080 pattern. a_2byte_vec.u64 = a_vec.u64 & 0xE0C0E0C0E0C0E0C0ull; b_2byte_vec.u64 = b_vec.u64 & 0xE0C0E0C0E0C0E0C0ull; sz_size_t a_2byte_chars = sz_u64_ctz(_sz_u64_each_2byte_equal(a_2byte_vec, prefixes_2byte_vec).u64) / 16; sz_size_t b_2byte_chars = sz_u64_ctz(_sz_u64_each_2byte_equal(b_2byte_vec, prefixes_2byte_vec).u64) / 16; } } #endif sz_rune_t a_rune, b_rune; sz_rune_length_t a_rune_length, b_rune_length; for (; a != a_min_end && distance < bound; a += a_rune_length, b += b_rune_length) { _sz_extract_utf8_rune(a, &a_rune_length, &a_rune); _sz_extract_utf8_rune(b, &b_rune_length, &b_rune); distance += (a_rune != b_rune); } return sz_min_of_two(distance, bound); } ```