Closed tersec closed 3 weeks ago
12 files ±0 1 810 suites ±0 56m 4s :stopwatch: +9s 5 227 tests ±0 4 879 :heavy_check_mark: ±0 348 :zzz: ±0 0 :x: ±0 29 057 runs ±0 28 605 :heavy_check_mark: ±0 452 :zzz: ±0 0 :x: ±0
Results for commit 5789e3c7. ± Comparison against base commit 58a34e00.
:recycle: This comment has been updated with latest results.
It would arguably be better
what you can do instead is:
var tmp: uint64 {.noinit.}
staticFor i, 0..<sizeof(mdigest)/sizeof(tmp):
copyMem(addr tmp, addr mdigest(i) ...)
if tmp == 0: ...
the compiler will then understand what's going on and use at least uint64 alignment, and if it detects something better, use that (this is based on gut feeling, needs measuring of course)
It would arguably be better
what you can do instead is:
var tmp: uint64 {.noinit.} staticFor i, 0..<sizeof(mdigest)/sizeof(tmp): copyMem(addr tmp, addr mdigest(i) ...) if tmp == 0: ...
the compiler will then understand what's going on and use at least uint64 alignment, and if it detects something better, use that (this is based on gut feeling, needs measuring of course)
https://github.com/status-im/nimbus-eth2/pull/6682/commits/4a8289f563235586d7fc29f070887a3753248374
It's faster:
PR: 1.48 microseconds/10k zero Eth2Digest comparisons
PR: 1.35 microseconds/10k zero Eth2Digest comparisons
PR: 0.631 microseconds/10k zero Eth2Digest comparisons
PR: 1.31 microseconds/10k zero Eth2Digest comparisons
PR: 1.33 microseconds/10k zero Eth2Digest comparisons
PR: 1.32 microseconds/10k zero Eth2Digest comparisons
PR: 0.621 microseconds/10k zero Eth2Digest comparisons
PR: 1.35 microseconds/10k zero Eth2Digest comparisons
PR: 1.31 microseconds/10k zero Eth2Digest comparisons
PR: 1.32 microseconds/10k zero Eth2Digest comparisons
PR: 1.36 microseconds/10k zero Eth2Digest comparisons
PR: 0.621 microseconds/10k zero Eth2Digest comparisons
PR: 1.35 microseconds/10k zero Eth2Digest comparisons
PR: 1.35 microseconds/10k zero Eth2Digest comparisons
PR: 0.621 microseconds/10k zero Eth2Digest comparisons
PR: 0.611 microseconds/10k zero Eth2Digest comparisons
PR: 1.29 microseconds/10k zero Eth2Digest comparisons
PR: 1.34 microseconds/10k zero Eth2Digest comparisons
PR: 1.29 microseconds/10k zero Eth2Digest comparisons
PR: 0.631 microseconds/10k zero Eth2Digest comparisons
PR: 0.611 microseconds/10k zero Eth2Digest comparisons
PR: 1.28 microseconds/10k zero Eth2Digest comparisons
PR: 1.33 microseconds/10k zero Eth2Digest comparisons
PR: 1.37 microseconds/10k zero Eth2Digest comparisons
PR: 0.631 microseconds/10k zero Eth2Digest comparisons
And the codegen looks reasonable:
static N_INLINE(void,
_ZN6system7copyMemE7pointer7pointer25range09223372036854775807)(
void *dest_p0, void *source_p1, NI size_p2);
static N_INLINE(void, nimCopyMem)(void *dest_p0, void *source_p1, NI size_p2);
static N_INLINE(void, nimCopyMem)(void *dest_p0, void *source_p1, NI size_p2) {
void *T1_;
T1_ = (void *)0;
T1_ = memcpy(dest_p0, source_p1, ((size_t)(size_p2)));
}
static N_INLINE(void,
_ZN6system7copyMemE7pointer7pointer25range09223372036854775807)(
void *dest_p0, void *source_p1, NI size_p2) {
nimCopyMem(dest_p0, source_p1, size_p2);
}
N_LIB_PRIVATE N_NIMCALL(NIM_BOOL, _ZN6digest6isZeroE7MDigestI6staticI3intEE)(
tyObject_MDigest__sonz1Q4qEjQxue9bhMfa9bfg *x_p0) {
NIM_BOOL result;
NU64 tmp;
{
result = (NIM_BOOL)0;
{
_ZN6system7copyMemE7pointer7pointer25range09223372036854775807(
((void *)((&tmp))), ((void *)((&(*x_p0).data[(((NI)0))-0]))),
((NI)8));
{
if (!!((tmp == 0ULL)))
goto LA4_;
result = NIM_FALSE;
goto BeforeRet_;
}
LA4_:;
}
{
_ZN6system7copyMemE7pointer7pointer25range09223372036854775807(
((void *)((&tmp))), ((void *)((&(*x_p0).data[(((NI)8))-0]))),
((NI)8));
{
if (!!((tmp == 0ULL)))
goto LA9_;
result = NIM_FALSE;
goto BeforeRet_;
}
LA9_:;
}
{
_ZN6system7copyMemE7pointer7pointer25range09223372036854775807(
((void *)((&tmp))), ((void *)((&(*x_p0).data[(((NI)16))-0]))),
((NI)8));
{
if (!!((tmp == 0ULL)))
goto LA14_;
result = NIM_FALSE;
goto BeforeRet_;
}
LA14_:;
}
{
_ZN6system7copyMemE7pointer7pointer25range09223372036854775807(
((void *)((&tmp))), ((void *)((&(*x_p0).data[(((NI)24))-0]))),
((NI)8));
{
if (!!((tmp == 0ULL)))
goto LA19_;
result = NIM_FALSE;
goto BeforeRet_;
}
LA19_:;
}
result = NIM_TRUE;
}
BeforeRet_:;
return result;
}
I also tried a staticFor
/equalMem
-based approach and it was not as fast; presumably the comparison to 0 specifically is something compilers and CPUs handle well:
PR: 2.75 microseconds/10k zero Eth2Digest comparisons
PR: 1.16 microseconds/10k zero Eth2Digest comparisons
PR: 1.18 microseconds/10k zero Eth2Digest comparisons
PR: 1.19 microseconds/10k zero Eth2Digest comparisons
PR: 2.37 microseconds/10k zero Eth2Digest comparisons
PR: 2.60 microseconds/10k zero Eth2Digest comparisons
PR: 2.66 microseconds/10k zero Eth2Digest comparisons
PR: 1.20 microseconds/10k zero Eth2Digest comparisons
PR: 1.19 microseconds/10k zero Eth2Digest comparisons
PR: 1.16 microseconds/10k zero Eth2Digest comparisons
PR: 1.12 microseconds/10k zero Eth2Digest comparisons
PR: 1.18 microseconds/10k zero Eth2Digest comparisons
PR: 1.19 microseconds/10k zero Eth2Digest comparisons
PR: 1.20 microseconds/10k zero Eth2Digest comparisons
PR: 1.21 microseconds/10k zero Eth2Digest comparisons
PR: 1.18 microseconds/10k zero Eth2Digest comparisons
PR: 1.18 microseconds/10k zero Eth2Digest comparisons
PR: 1.20 microseconds/10k zero Eth2Digest comparisons
PR: 2.67 microseconds/10k zero Eth2Digest comparisons
PR: 2.55 microseconds/10k zero Eth2Digest comparisons
PR: 2.56 microseconds/10k zero Eth2Digest comparisons
PR: 2.63 microseconds/10k zero Eth2Digest comparisons
PR: 2.61 microseconds/10k zero Eth2Digest comparisons
PR: 1.19 microseconds/10k zero Eth2Digest comparisons
PR: 1.18 microseconds/10k zero Eth2Digest comparisons
During Holesky syncing, Nimbus is spending 25% of its time in isZeroMem
uh, maybe a good question to ask is what is calling isZeroMem? the hashlist cache should be checking the first 8 bytes only
During Holesky syncing, Nimbus is spending 25% of its time in isZeroMem
uh, maybe a good question to ask is what is calling isZeroMem? the hashlist cache should be checking the first 8 bytes only
oh. oof, that old code is starting to show its age .. it was written for 20k validators, not almost 2M - not bad that it has held on for so long
oh. oof, that old code is starting to show its age .. it was written for 20k validators, not almost 2M - not bad that it has held on for so long
ie instead of comparing hashes, it could keep a bit list or any number of other solutions - anyway, that's unrelated to this pr, which certainly is a nice little improvement
Via profiling by @etan-status
During Holesky syncing, Nimbus is spending 25% of its time in
isZeroMemory
. This PR improves this.using
It would arguably be better to use cmpMem (i.e.
memcmp()
in thin disguise) toHowever, attempting to do this in a safe and efficient way triggers various combinations of:
So this can function as a stopgap approach.