nim-lang / Nim

Nim is a statically typed compiled systems programming language. It combines successful concepts from mature languages like Python, Ada and Modula. Its design focuses on efficiency, expressiveness, and elegance (in that order of priority).
https://nim-lang.org
Other
16.6k stars 1.47k forks source link

system.sets `card` is slow #10617

Closed brentp closed 5 years ago

brentp commented 5 years ago

For the code here: https://gist.github.com/brentp/7d440560968c3ac97c40de17200e404a

running with nim c -d:release -r show that the system.sets are much faster than either hashsets or intsets as expected.

however, when running with nim c -d:card -d:release -r they become ~20X slower.

Possible Solution

This change:

diff --git a/lib/system/sets.nim b/lib/system/sets.nim
index a5f6c5d..ce30c1c 100644
--- a/lib/system/sets.nim
+++ b/lib/system/sets.nim
@@ -22,7 +22,7 @@ proc countBits64(n: int64): int {.compilerproc.} =
   result = countBits32(toU32(n and 0xffffffff'i64)) +
            countBits32(toU32(n shr 32'i64))

-proc cardSet(s: NimSet, len: int): int {.compilerproc.} =
-  result = 0
+proc cardSet(s: NimSet, len: int): int {.compilerproc, inline.} =
   for i in countup(0, len-1):
+    if s[i] == 0: continue
     inc(result, countBits32(int32(s[i])))

improves the card speed almost 4X, but perhaps using a builtin popcount would be useful.

brentp commented 5 years ago

also, maybe I'm missing some impl details, but since it's actually using an int8, wouldn't it be faster to count the set bits on an int8 rather than converting to int32 and counting 4X as many bits?

brentp commented 5 years ago

here's a diff that counts just the 8 bits. It is a little faster:

diff --git a/lib/system/sets.nim b/lib/system/sets.nim
index a5f6c5d..c53097b 100644
--- a/lib/system/sets.nim
+++ b/lib/system/sets.nim
@@ -13,8 +13,8 @@ type
   NimSet = array[0..4*2048-1, uint8]

 proc countBits32(n: int32): int {.compilerproc.} =
-  var v = n
-  v = v -% ((v shr 1'i32) and 0x55555555'i32)
+  #var v = n
+  var v = n -% ((n shr 1'i32) and 0x55555555'i32)
   v = (v and 0x33333333'i32) +% ((v shr 2'i32) and 0x33333333'i32)
   result = ((v +% (v shr 4'i32) and 0xF0F0F0F'i32) *% 0x1010101'i32) shr 24'i32

@@ -22,7 +22,13 @@ proc countBits64(n: int64): int {.compilerproc.} =
   result = countBits32(toU32(n and 0xffffffff'i64)) +
            countBits32(toU32(n shr 32'i64))

-proc cardSet(s: NimSet, len: int): int {.compilerproc.} =
-  result = 0
+proc countBits8(n: uint8): int {.compilerproc.} =
+  var v = n
+  while v != 0:
+    result.inc
+    v = v and (v - 1)
+
+proc cardSet(s: NimSet, len: int): int {.compilerproc, inline.} =
   for i in countup(0, len-1):
-    inc(result, countBits32(int32(s[i])))
+    if likely(s[i] == 0): continue
+    inc(result, countBits8(s[i]))