Closed brentp closed 5 years ago
also, maybe I'm missing some impl details, but since it's actually using an int8, wouldn't it be faster to count the set bits on an int8 rather than converting to int32 and counting 4X as many bits?
here's a diff that counts just the 8 bits. It is a little faster:
diff --git a/lib/system/sets.nim b/lib/system/sets.nim
index a5f6c5d..c53097b 100644
--- a/lib/system/sets.nim
+++ b/lib/system/sets.nim
@@ -13,8 +13,8 @@ type
NimSet = array[0..4*2048-1, uint8]
proc countBits32(n: int32): int {.compilerproc.} =
- var v = n
- v = v -% ((v shr 1'i32) and 0x55555555'i32)
+ #var v = n
+ var v = n -% ((n shr 1'i32) and 0x55555555'i32)
v = (v and 0x33333333'i32) +% ((v shr 2'i32) and 0x33333333'i32)
result = ((v +% (v shr 4'i32) and 0xF0F0F0F'i32) *% 0x1010101'i32) shr 24'i32
@@ -22,7 +22,13 @@ proc countBits64(n: int64): int {.compilerproc.} =
result = countBits32(toU32(n and 0xffffffff'i64)) +
countBits32(toU32(n shr 32'i64))
-proc cardSet(s: NimSet, len: int): int {.compilerproc.} =
- result = 0
+proc countBits8(n: uint8): int {.compilerproc.} =
+ var v = n
+ while v != 0:
+ result.inc
+ v = v and (v - 1)
+
+proc cardSet(s: NimSet, len: int): int {.compilerproc, inline.} =
for i in countup(0, len-1):
- inc(result, countBits32(int32(s[i])))
+ if likely(s[i] == 0): continue
+ inc(result, countBits8(s[i]))
For the code here: https://gist.github.com/brentp/7d440560968c3ac97c40de17200e404a
running with
nim c -d:release -r
show that the system.sets are much faster than either hashsets or intsets as expected.however, when running with
nim c -d:card -d:release -r
they become ~20X slower.Possible Solution
This change:
improves the card speed almost 4X, but perhaps using a builtin popcount would be useful.