phuslu / lru

High performance LRU cache
MIT License
182 stars 6 forks source link

About hash collisions #10

Closed fufuok closed 3 months ago

fufuok commented 3 months ago
func TestCollision_RuntimeHasher(t *testing.T) {
    n := 20_000_000
    sHasher := getRuntimeHasher[string]()
    iHasher := getRuntimeHasher[int]()
    seed := uintptr(fastrand64())
    ms := make(map[uint32]string)
    mi := make(map[uint32]int)
    for i := 0; i < n; i++ {
        s := strconv.Itoa(i)
        hs := uint32(sHasher(noescape(unsafe.Pointer(&s)), seed))
        hi := uint32(iHasher(noescape(unsafe.Pointer(&i)), seed))
        if v, ok := ms[hs]; ok {
            t.Fatalf("Collision(seed: %d): %s(%d) == %s(%d)", seed, v, hs, s, hs)
        }
        if v, ok := mi[hi]; ok {
            t.Fatalf("Collision(seed: %d): %d(%d) == %d(%d)", seed, v, hi, i, hi)
        }
        ms[hs] = s
        mi[hi] = i
    }

    i := 7
    hi := uint32(iHasher(noescape(unsafe.Pointer(&i)), seed))
    if _, ok := mi[hi]; !ok {
        t.Fatalf("Number 7 should exist")
    }
    if len(ms) != len(mi) || len(ms) != n {
        t.Fatalf("Hash count: %d, %d, %d", len(ms), len(mi), n)
    }
}

func TestCollision_RuntimeHasher_Uint64(t *testing.T) {
    n := 20_000_000
    sHasher := getRuntimeHasher[string]()
    iHasher := getRuntimeHasher[int]()
    seed := uintptr(fastrand64())
    ms := make(map[uint64]string)
    mi := make(map[uint64]int)
    for i := 0; i < n; i++ {
        s := strconv.Itoa(i)
        hs := uint64(sHasher(noescape(unsafe.Pointer(&s)), seed))
        hi := uint64(iHasher(noescape(unsafe.Pointer(&i)), seed))
        if v, ok := ms[hs]; ok {
            t.Fatalf("Collision(seed: %d): %s(%d) == %s(%d)", seed, v, hs, s, hs)
        }
        if v, ok := mi[hi]; ok {
            t.Fatalf("Collision(seed: %d): %d(%d) == %d(%d)", seed, v, hi, i, hi)
        }
        ms[hs] = s
        mi[hi] = i
    }

    i := 7
    hi := uint64(iHasher(noescape(unsafe.Pointer(&i)), seed))
    if _, ok := mi[hi]; !ok {
        t.Fatalf("Number 7 should exist")
    }
    if len(ms) != len(mi) || len(ms) != n {
        t.Fatalf("Hash count: %d, %d, %d", len(ms), len(mi), n)
    }
}
# go version
go version go1.22.1 linux/amd64

# go test -count=1 -run=TestCollision_RuntimeHasher_Uint64
PASS
ok      github.com/phuslu/lru   9.913s

# go test -count=1 -run=TestCollision_RuntimeHasher
--- FAIL: TestCollision_RuntimeHasher (0.02s)
    runtime_test.go:21: Collision(seed: 852041415638190986): 55718(684107357) == 75813(684107357)
FAIL
exit status 1
FAIL    github.com/phuslu/lru   9.920s
phuslu commented 3 months ago

this is intentional.

  1. uint32 saves memory usage and benefits locality
  2. phuslu/lru shard.table is a rhh hash, it can handle the collisions
  3. lru.shard size is not too large, so I suppose/expect the uint32 hash collisions is not heavy.
fufuok commented 3 months ago

ok, ths.