Open · terrelln opened this issue 2 years ago
> Pay attention that gear hash seems to be hard-wired to a rolling hash length of 64, that may be good enough for --rsyncable, but if there is a need for larger lengths, it will become a blocker.
--rsyncable is hardwired to a 32-byte hash, which could likely be increased to a 64-byte hash without any issues.
I quickly tried to change it out, and saw a 2x speed improvement. Will try to test & land it for our next release.
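For readers unfamiliar with the two hash functions, here is a minimal standalone sketch contrasting the update rules (not the actual zstd code; `gearTab` is a stand-in for `ZSTD_ldm_gearTab`, which holds 256 random 64-bit constants). The multiplicative hash has to subtract the outgoing byte's contribution, which means tracking that byte and precomputing `primePower = prime^(length-1)`. The gear hash needs no removal step: each update shifts every earlier byte's contribution one bit to the left, so a byte stops affecting the 64-bit hash after 64 updates, giving an implicit 64-byte window.

```c
#include <stdint.h>

/* Stand-in for ZSTD_ldm_gearTab: 256 (pseudo-)random 64-bit constants. */
static uint64_t gearTab[256];

/* Multiplicative rolling hash over an explicit window: the oldest byte must
 * be removed using a precomputed primePower = prime^(windowLength - 1).
 * The +10 mirrors ZSTD_ROLL_HASH_CHAR_OFFSET in the removed code. */
static uint64_t mulRotate(uint64_t hash, uint8_t toRemove, uint8_t toAdd,
                          uint64_t prime, uint64_t primePower)
{
    hash -= (uint64_t)(toRemove + 10) * primePower; /* drop the outgoing byte */
    hash *= prime;                                  /* advance the window */
    hash += (uint64_t)toAdd + 10;                   /* add the incoming byte */
    return hash;
}

/* Gear hash: one shift, one table lookup, one add per byte, and no explicit
 * removal. After 64 updates a byte's contribution has been shifted entirely
 * out of the 64-bit state. */
static uint64_t gearRotate(uint64_t hash, uint8_t toAdd)
{
    return (hash << 1) + gearTab[toAdd];
}
```

The diff below swaps the former for the latter in ZSTD_rollingHash_append() and ZSTD_rollingHash_rotate(), bumps RSYNC_LENGTH from 32 to 64, and drops the now-unneeded ZSTD_ipow() / ZSTD_rollingHash_primePower() helpers along with the primePower field in rsyncState_t.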
```diff
diff --git i/lib/compress/zstd_compress_internal.h w/lib/compress/zstd_compress_internal.h
index bbae5330..5c5257dc 100644
--- i/lib/compress/zstd_compress_internal.h
+++ w/lib/compress/zstd_compress_internal.h
@@ -17,20 +17,21 @@
/*-*************************************
* Dependencies
***************************************/
#include "../common/zstd_internal.h"
#include "zstd_cwksp.h"
#ifdef ZSTD_MULTITHREAD
# include "zstdmt_compress.h"
#endif
#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */
+#include "zstd_ldm_geartab.h"
#if defined (__cplusplus)
extern "C" {
#endif
/*-*************************************
* Constants
***************************************/
#define kSearchStrength 8
#define HASH_READ_SIZE 8
@@ -775,76 +776,47 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
{
default:
case 4: return ZSTD_hash4Ptr(p, hBits);
case 5: return ZSTD_hash5Ptr(p, hBits);
case 6: return ZSTD_hash6Ptr(p, hBits);
case 7: return ZSTD_hash7Ptr(p, hBits);
case 8: return ZSTD_hash8Ptr(p, hBits);
}
}
-/** ZSTD_ipow() :
- * Return base^exponent.
- */
-static U64 ZSTD_ipow(U64 base, U64 exponent)
-{
- U64 power = 1;
- while (exponent) {
- if (exponent & 1) power *= base;
- exponent >>= 1;
- base *= base;
- }
- return power;
-}
-
-#define ZSTD_ROLL_HASH_CHAR_OFFSET 10
-
/** ZSTD_rollingHash_append() :
* Add the buffer to the hash value.
*/
static U64 ZSTD_rollingHash_append(U64 hash, void const* buf, size_t size)
{
BYTE const* istart = (BYTE const*)buf;
size_t pos;
for (pos = 0; pos < size; ++pos) {
- hash *= prime8bytes;
- hash += istart[pos] + ZSTD_ROLL_HASH_CHAR_OFFSET;
+ hash = (hash << 1) + ZSTD_ldm_gearTab[istart[pos] & 0xff];
}
return hash;
}
/** ZSTD_rollingHash_compute() :
* Compute the rolling hash value of the buffer.
*/
MEM_STATIC U64 ZSTD_rollingHash_compute(void const* buf, size_t size)
{
return ZSTD_rollingHash_append(0, buf, size);
}
-/** ZSTD_rollingHash_primePower() :
- * Compute the primePower to be passed to ZSTD_rollingHash_rotate() for a hash
- * over a window of length bytes.
- */
-MEM_STATIC U64 ZSTD_rollingHash_primePower(U32 length)
-{
- return ZSTD_ipow(prime8bytes, length - 1);
-}
-
/** ZSTD_rollingHash_rotate() :
* Rotate the rolling hash by one byte.
*/
-MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 primePower)
+MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toAdd)
{
- hash -= (toRemove + ZSTD_ROLL_HASH_CHAR_OFFSET) * primePower;
- hash *= prime8bytes;
- hash += toAdd + ZSTD_ROLL_HASH_CHAR_OFFSET;
- return hash;
+ return (hash << 1) + ZSTD_ldm_gearTab[toAdd & 0xff];
}
/*-*************************************
* Round buffer management
***************************************/
#if (ZSTD_WINDOWLOG_MAX_64 > 31)
# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX"
#endif
/* Max current allowed */
#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
diff --git i/lib/compress/zstdmt_compress.c w/lib/compress/zstdmt_compress.c
index 0c10eb60..b397d411 100644
--- i/lib/compress/zstdmt_compress.c
+++ w/lib/compress/zstdmt_compress.c
@@ -807,35 +807,34 @@ typedef struct {
size_t capacity; /* The capacity of buffer. */
size_t pos; /* The position of the current inBuff in the round
* buffer. Updated past the end if the inBuff once
* the inBuff is sent to the worker thread.
* pos <= capacity.
*/
} roundBuff_t;
static const roundBuff_t kNullRoundBuff = {NULL, 0, 0};
-#define RSYNC_LENGTH 32
+#define RSYNC_LENGTH 64
/* Don't create chunks smaller than the zstd block size.
* This stops us from regressing compression ratio too much,
* and ensures our output fits in ZSTD_compressBound().
*
* If this is shrunk < ZSTD_BLOCKSIZELOG_MIN then
* ZSTD_COMPRESSBOUND() will need to be updated.
*/
#define RSYNC_MIN_BLOCK_LOG ZSTD_BLOCKSIZELOG_MAX
#define RSYNC_MIN_BLOCK_SIZE (1<<RSYNC_MIN_BLOCK_LOG)
typedef struct {
U64 hash;
U64 hitMask;
- U64 primePower;
} rsyncState_t;
struct ZSTDMT_CCtx_s {
POOL_ctx* factory;
ZSTDMT_jobDescription* jobs;
ZSTDMT_bufferPool* bufPool;
ZSTDMT_CCtxPool* cctxPool;
ZSTDMT_seqPool* seqPool;
ZSTD_CCtx_params params;
size_t targetSectionSize;
@@ -1268,21 +1267,20 @@ size_t ZSTDMT_initCStream_internal(
if (params.rsyncable) {
/* Aim for the targetsectionSize as the average job size. */
U32 const jobSizeKB = (U32)(mtctx->targetSectionSize >> 10);
U32 const rsyncBits = (assert(jobSizeKB >= 1), ZSTD_highbit32(jobSizeKB) + 10);
/* We refuse to create jobs < RSYNC_MIN_BLOCK_SIZE bytes, so make sure our
* expected job size is at least 4x larger. */
assert(rsyncBits >= RSYNC_MIN_BLOCK_LOG + 2);
DEBUGLOG(4, "rsyncLog = %u", rsyncBits);
mtctx->rsync.hash = 0;
mtctx->rsync.hitMask = (1ULL << rsyncBits) - 1;
- mtctx->rsync.primePower = ZSTD_rollingHash_primePower(RSYNC_LENGTH);
}
if (mtctx->targetSectionSize < mtctx->targetPrefixSize) mtctx->targetSectionSize = mtctx->targetPrefixSize; /* job size must be >= overlap size */
DEBUGLOG(4, "Job Size : %u KB (note : set to %u)", (U32)(mtctx->targetSectionSize>>10), (U32)params.jobSize);
DEBUGLOG(4, "inBuff Size : %u KB", (U32)(mtctx->targetSectionSize>>10));
ZSTDMT_setBufferSize(mtctx->bufPool, ZSTD_compressBound(mtctx->targetSectionSize));
{
/* If ldm is enabled we need windowSize space. */
size_t const windowSize = mtctx->params.ldmParams.enableLdm == ZSTD_ps_enable ? (1U << mtctx->params.cParams.windowLog) : 0;
/* Two buffers of slack, plus extra space for the overlap
* This is the minimum slack that LDM works with. One extra because
@@ -1682,21 +1680,20 @@ typedef struct {
/**
* Searches through the input for a synchronization point. If one is found, we
* will instruct the caller to flush, and return the number of bytes to load.
* Otherwise, we will load as many bytes as possible and instruct the caller
* to continue as normal.
*/
static syncPoint_t
findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
{
BYTE const* const istart = (BYTE const*)input.src + input.pos;
- U64 const primePower = mtctx->rsync.primePower;
U64 const hitMask = mtctx->rsync.hitMask;
syncPoint_t syncPoint;
U64 hash;
BYTE const* prev;
size_t pos;
syncPoint.toLoad = MIN(input.size - input.pos, mtctx->targetSectionSize - mtctx->inBuff.filled);
syncPoint.flush = 0;
if (!mtctx->params.rsyncable)
@@ -1756,27 +1753,26 @@ findSynchronizationPoint(ZSTDMT_CCtx const* mtctx, ZSTD_inBuffer const input)
/* Starting with the hash of the previous RSYNC_LENGTH bytes, roll
* through the input. If we hit a synchronization point, then cut the
* job off, and tell the compressor to flush the job. Otherwise, load
* all the bytes and continue as normal.
* If we go too long without a synchronization point (targetSectionSize)
* then a block will be emitted anyways, but this is okay, since if we
* are already synchronized we will remain synchronized.
*/
assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
for (; pos < syncPoint.toLoad; ++pos) {
- BYTE const toRemove = pos < RSYNC_LENGTH ? prev[pos] : istart[pos - RSYNC_LENGTH];
/* This assert is very expensive, and Debian compiles with asserts enabled.
* So disable it for now. We can get similar coverage by checking it at the
* beginning & end of the loop.
* assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
*/
- hash = ZSTD_rollingHash_rotate(hash, toRemove, istart[pos], primePower);
+ hash = ZSTD_rollingHash_rotate(hash, istart[pos]);
assert(mtctx->inBuff.filled + pos >= RSYNC_MIN_BLOCK_SIZE);
if ((hash & hitMask) == hitMask) {
syncPoint.toLoad = pos + 1;
syncPoint.flush = 1;
++pos; /* for assert */
break;
}
}
assert(pos < RSYNC_LENGTH || ZSTD_rollingHash_compute(istart + pos - RSYNC_LENGTH, RSYNC_LENGTH) == hash);
return syncPoint;
```
> --rsyncable is hardwired to a 32-byte hash,
Such a length should be compatible with gearhash. It's designed for a 64-byte window, but can also accept 32 or 16. It just can't accept, say, 48, or any non-power-of-2 value.
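One plausible reading of that power-of-2 constraint (an illustrative sketch only, not taken from the zstd sources): a shorter window can be emulated by shifting more than one bit per update, so each byte leaves the 64-bit state sooner, and 64 splits evenly only into power-of-2 window lengths such as 32 or 16.

```c
#include <assert.h>
#include <stdint.h>

static uint64_t gearTab[256]; /* stand-in for ZSTD_ldm_gearTab */

/* Hypothetical generalization of the gear update to a window of `length`
 * bytes: shift 64/length bits per update, so a byte's contribution is
 * shifted out of the 64-bit state after exactly `length` updates.
 * length = 64 -> shift 1 (the default), 32 -> shift 2, 16 -> shift 4.
 * A length like 48 would need a fractional shift, hence the restriction. */
static uint64_t gearRotateWindow(uint64_t hash, uint8_t toAdd, unsigned length)
{
    assert(length == 16 || length == 32 || length == 64);
    {
        unsigned const shift = 64 / length;
        return (hash << shift) + gearTab[toAdd];
    }
}
```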
> I quickly tried to change it out, and saw a 2x speed improvement.
The observed performance improvement is indeed within expected territory. A really pleasant surprise is how easy it seems to integrate.
👍🏻 for this
Rsyncable mode still uses the multiplicative rolling hash. It would likely get a large speedup by switching to the gear hash.
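To make the overall mechanism concrete, here is a self-contained sketch of content-defined cut-point detection with the gear hash, simplified from findSynchronizationPoint() above (it omits the hash warm-up over the previous window and the RSYNC_MIN_BLOCK_SIZE handling; `gearTab` again stands in for `ZSTD_ldm_gearTab`). A cut point is declared whenever the low rsyncBits bits of the rolling hash are all ones, which happens roughly once every 2^rsyncBits bytes on average, so identical content tends to produce identical chunk boundaries regardless of where it sits in the stream.

```c
#include <stddef.h>
#include <stdint.h>

static uint64_t gearTab[256]; /* stand-in for ZSTD_ldm_gearTab */

/* Return the number of bytes to consume before the next content-defined cut
 * point, or `size` if none is found. `rsyncBits` controls the average chunk
 * size (roughly 2^rsyncBits bytes). */
static size_t findCutPoint(const uint8_t* data, size_t size, unsigned rsyncBits)
{
    uint64_t const hitMask = ((uint64_t)1 << rsyncBits) - 1;
    uint64_t hash = 0;
    size_t pos;
    for (pos = 0; pos < size; ++pos) {
        hash = (hash << 1) + gearTab[data[pos]];  /* gear update */
        if ((hash & hitMask) == hitMask)
            return pos + 1;                       /* cut just after this byte */
    }
    return size;
}
```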