Open · terrelln opened this issue 2 years ago
> Pay attention that gear hash seems to be hard-wired to a rolling hash length of 64, that may be good enough for --rsyncable, but if there is a need for larger lengths, it will become a blocker.
--rsyncable is hardwired to a 32-byte hash, which could likely be increased to a 64-byte hash without any issues.
I quickly tried to change it out, and saw a 2x speed improvement. Will try to test & land it for our next release.
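For readers unfamiliar with the two hash functions, here is a minimal standalone sketch contrasting the update rules (not the actual zstd code; `gearTab` is a stand-in for `ZSTD_ldm_gearTab`, which holds 256 random 64-bit constants). The multiplicative hash has to subtract the outgoing byte's contribution, which means tracking that byte and precomputing `primePower = prime^(length-1)`. The gear hash needs no removal step: each update shifts every earlier byte's contribution one bit to the left, so a byte stops affecting the 64-bit hash after 64 updates, giving an implicit 64-byte window.

```c
#include <stdint.h>

/* Stand-in for ZSTD_ldm_gearTab: 256 (pseudo-)random 64-bit constants. */
static uint64_t gearTab[256];

/* Multiplicative rolling hash over an explicit window: the oldest byte must
 * be removed using a precomputed primePower = prime^(windowLength - 1).
 * The +10 mirrors ZSTD_ROLL_HASH_CHAR_OFFSET in the removed code. */
static uint64_t mulRotate(uint64_t hash, uint8_t toRemove, uint8_t toAdd,
                          uint64_t prime, uint64_t primePower)
{
    hash -= (uint64_t)(toRemove + 10) * primePower; /* drop the outgoing byte */
    hash *= prime;                                  /* advance the window */
    hash += (uint64_t)toAdd + 10;                   /* add the incoming byte */
    return hash;
}

/* Gear hash: one shift, one table lookup, one add per byte, and no explicit
 * removal. After 64 updates a byte's contribution has been shifted entirely
 * out of the 64-bit state. */
static uint64_t gearRotate(uint64_t hash, uint8_t toAdd)
{
    return (hash << 1) + gearTab[toAdd];
}
```

The diff below swaps the former for the latter in ZSTD_rollingHash_append() and ZSTD_rollingHash_rotate(), bumps RSYNC_LENGTH from 32 to 64, and drops the now-unneeded ZSTD_ipow() / ZSTD_rollingHash_primePower() helpers along with the primePower field in rsyncState_t.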
```diff
diff --git i/lib/compress/zstd_compress_internal.h w/lib/compress/zstd_compress_internal.h
index bbae5330..5c5257dc 100644
--- i/lib/compress/zstd_compress_internal.h
+++ w/lib/compress/zstd_compress_internal.h
@@ -17,20 +17,21 @@
/*-*************************************
* Dependencies
***************************************/
#include "../common/zstd_internal.h"
#include "zstd_cwksp.h"
#ifdef ZSTD_MULTITHREAD
# include "zstdmt_compress.h"
#endif
#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */
+#include "zstd_ldm_geartab.h"
#if defined (__cplusplus)
extern "C" {
#endif
/*-*************************************
* Constants
***************************************/
#define kSearchStrength 8
#define HASH_READ_SIZE 8
@@ -775,76 +776,47 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
{
default:
case 4: return ZSTD_hash4Ptr(p, hBits);
case 5: return ZSTD_hash5Ptr(p, hBits);
case 6: return ZSTD_hash6Ptr(p, hBits);
case 7: return ZSTD_hash7Ptr(p, hBits);
case 8: return ZSTD_hash8Ptr(p, hBits);
}
}
-/** ZSTD_ipow() :
- * Return base^exponent.
- */
-static U64 ZSTD_ipow(U64 base, U64 exponent)
-{
- U64 power = 1;
- while (exponent) {
- if (exponent & 1) power *= base;
- exponent >>= 1;
- base *= base;
- }
- return power;
-}
-
-#define ZSTD_ROLL_HASH_CHAR_OFFSET 10
-
/** ZSTD_rollingHash_append() :
* Add the buffer to the hash value.
*/
static U64 ZSTD_rollingHash_append(U64 hash, void const* buf, size_t size)
{
BYTE const* istart = (BYTE const*)buf;
size_t pos;
for (pos = 0; pos < size; ++pos) {
- hash *= prime8bytes;
- hash += istart[pos] + ZSTD_ROLL_HASH_CHAR_OFFSET;
+ hash = (hash << 1) + ZSTD_ldm_gearTab[istart[pos] & 0xff];
}
return hash;
}
/** ZSTD_rollingHash_compute() :
* Compute the rolling hash value of the buffer.
*/
MEM_STATIC U64 ZSTD_rollingHash_compute(void const* buf, size_t size)
{
return ZSTD_rollingHash_append(0, buf, size);
}
-/** ZSTD_rollingHash_primePower() :
- * Compute the primePower to be passed to ZSTD_rollingHash_rotate() for a hash
- * over a window of length bytes.
- */
-MEM_STATIC U64 ZSTD_rollingHash_primePower(U32 length)
-{
- return ZSTD_ipow(prime8bytes, length - 1);
-}
-
/** ZSTD_rollingHash_rotate() :
* Rotate the rolling hash by one byte.
*/
-MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 primePower)
+MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toAdd)
{
- hash -= (toRemove + ZSTD_ROLL_HASH_CHAR_OFFSET) * primePower;
- hash *= prime8bytes;
- hash += toAdd + ZSTD_ROLL_HASH_CHAR_OFFSET;
- return hash;
+ return (hash << 1) + ZSTD_ldm_gearTab[toAdd & 0xff];
}
/*-*************************************
* Round buffer management
***************************************/
#if (ZSTD_WINDOWLOG_MAX_64 > 31)
# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX"
#endif
/* Max current allowed */
#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
diff --git i/lib/compress/zstdmt_compress.c w/lib/compress/zstdmt_compress.c
index 0c10eb60..b397d411 100644
--- i/lib/compress/zstdmt_compress.c
+++ w/lib/compress/zstdmt_compress.c
@@ -807,35 +807,34 @@ typedef struct {
size_t capacity; /* The capacity of buffer. */
size_t pos; /* The position of the current inBuff in the round
* buffer. Updated past the end if the inBuff once
* the inBuff is sent to the worker thread.
* pos <= capacity.
*/
} roundBuff_t;
static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};
-#define RSYNC_LENGTH 32
+#define RSYNC_LENGTH 64
/* Don't create chunks smaller than the zstd block size.
* This stops us from regressing compression ratio too much,
* and ensures our output fits in ZSTD_compressBound().
*
* If this is shrunk < ZSTD_BLOCKSIZELOG_MIN then
* ZSTD_COMPRESSBOUND() will need to be updated.
*/
#define RSYNC_MIN_BLOCK_LOG ZSTD_BLOCKSIZELOG_MAX
#define RSYNC_MIN_BLOCK_SIZE (1<<RSYNC_MIN_BLOCK_LOG)
typedef struct {
U64 hash;
U64 hitMask;
- U64 primePower;
} rsyncState_t;
struct ZSTDMT_CCtx_s {
POOL_ctx* factory;
ZSTDMT_jobDescription* jobs;
ZSTDMT_bufferPool* bufPool;
ZSTDMT_CCtxPool* cctxPool;
ZSTDMT_seqPool* seqPool;
ZSTD_CCtx_params params;
size_t targetSectionSize;
@@ -1268,21 +1267,20 @@ size_t ZSTDMT_initCStream_internal(
if (params.rsyncable) {
/* Aim for the targetsectionSize as the average job size. */
U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10);
U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10);
/* We refuse to create jobs < RSYNC_MIN_BLOCK_SIZE bytes, so make sure our
* expected job size is at least 4x larger. */
assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2);
DEBUGLOG(4, "rsyncLog = %u", rsyncBits);
mtctx->rsync.hash = 0;
mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1;
- mtctx->rsync.primePower = ZSTD_rollingHash_primePower(RSYNC_LENGTH);
}
if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize; /* job size must be >= overlap size */
DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), (U32)params.jobSize);
DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10));
ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
{
/* If ldm is enabled we need windowSize space. */
size_t const windowSize = mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable ? (1U << mtctx->params.cParams.windowLog) : 0;
/* Two buffers of slack, plus extra space for the overlap
* This is the minimum slack that LDM works with. One extra because
@@ -1682,21 +1680,20 @@ typedef struct {
/**
* Searches through the input for a synchronization point. If one is found, we
* will instruct the caller to flush, and return the number of bytes to load.
* Otherwise, we will load as many bytes as possible and instruct the caller
* to continue as normal.
*/
static syncPoint_t
findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
{
BYTE const* const istart = (BYTE const*)input.src + input.pos;
- U64 const primePower = mtctx->rsync.primePower;
U64 const hitMask = mtctx->rsync.hitMask;
syncPoint_t syncPoint;
U64 hash;
BYTE const* prev;
size_t pos;
syncPoint.toLoad = MIN(input.size - input.pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
syncPoint.flush = 0;
if (!mtctx->params.rsyncable)
@@ -1756,27 +1753,26 @@ findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
/* Starting with the hash of the previous RSYNC_LENGTH bytes, roll
* through the input. If we hit a synchronization point, then cut the
* job off, and tell the compressor to flush the job. Otherwise, load
* all the bytes and continue as normal.
* If we go too long without a synchronization point (targetSectionSize)
* then a block will be emitted anyways, but this is okay, since if we
* are already synchronized we will remain synchronized.
*/
assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
for (; pos < syncPoint.toLoad; ++pos) {
- BYTE const toRemove = pos < RSYNC_LENGTH ? prev[pos] : istart[pos - RSYNC_LENGTH];
/* This assert is very expensive, and Debian compiles with asserts enabled.
* So disable it for now. We can get similar coverage by checking it at the
* beginning & end of the loop.
* assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
*/
- hash = ZSTD_rollingHash_rotate(hash, toRemove, istart[pos], primePower);
+ hash = ZSTD_rollingHash_rotate(hash, istart[pos]);
assert(mtctx->inBuff.filled + pos >= RSYNC_MIN_BLOCK_SIZE);
if ((hash & hitMask) == hitMask) {
syncPoint.toLoad = pos + 1;
syncPoint.flush = 1;
++pos; /* for assert */
break;
}
}
assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
return syncPoint;
```
> --rsyncable is hardwired to a 32-byte hash,
Such a length should be compatible with gearhash. It's designed for a 64-byte window, but can also accept 32 or 16. It just can't accept, say, 48, or any non-power-of-2 value.
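One plausible reading of that power-of-2 constraint (an illustrative sketch only, not taken from the zstd sources): a shorter window can be emulated by shifting more than one bit per update, so each byte leaves the 64-bit state sooner, and 64 splits evenly only into power-of-2 window lengths such as 32 or 16.

```c
#include <assert.h>
#include <stdint.h>

static uint64_t gearTab[256]; /* stand-in for ZSTD_ldm_gearTab */

/* Hypothetical generalization of the gear update to a window of `length`
 * bytes: shift 64/length bits per update, so a byte's contribution is
 * shifted out of the 64-bit state after exactly `length` updates.
 * length = 64 -> shift 1 (the default), 32 -> shift 2, 16 -> shift 4.
 * A length like 48 would need a fractional shift, hence the restriction. */
static uint64_t gearRotateWindow(uint64_t hash, uint8_t toAdd, unsigned length)
{
    assert(length == 16 || length == 32 || length == 64);
    {
        unsigned const shift = 64 / length;
        return (hash << shift) + gearTab[toAdd];
    }
}
```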
> I quickly tried to change it out, and saw a 2x speed improvement.
The observed performance improvement is indeed within expected territory. A really pleasant surprise is how easy it seems to integrate.
👍🏻 for this
Rsyncable mode still uses the multiplicative rolling hash. It would likely get a large speedup by switching to the gear hash.
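To make the overall mechanism concrete, here is a self-contained sketch of content-defined cut-point detection with the gear hash, simplified from findSynchronizationPoint() above (it omits the hash warm-up over the previous window and the RSYNC_MIN_BLOCK_SIZE handling; `gearTab` again stands in for `ZSTD_ldm_gearTab`). A cut point is declared whenever the low rsyncBits bits of the rolling hash are all ones, which happens roughly once every 2^rsyncBits bytes on average, so identical content tends to produce identical chunk boundaries regardless of where it sits in the stream.

```c
#include <stddef.h>
#include <stdint.h>

static uint64_t gearTab[256]; /* stand-in for ZSTD_ldm_gearTab */

/* Return the number of bytes to consume before the next content-defined cut
 * point, or `size` if none is found. `rsyncBits` controls the average chunk
 * size (roughly 2^rsyncBits bytes). */
static size_t findCutPoint(const uint8_t* data, size_t size, unsigned rsyncBits)
{
    uint64_t const hitMask = ((uint64_t)1 << rsyncBits) - 1;
    uint64_t hash = 0;
    size_t pos;
    for (pos = 0; pos < size; ++pos) {
        hash = (hash << 1) + gearTab[data[pos]];  /* gear update */
        if ((hash & hitMask) == hitMask)
            return pos + 1;                       /* cut just after this byte */
    }
    return size;
}
```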