phuslu / lru

High performance LRU cache
MIT License

performance evaluation on cpu usage #8

Closed ouvaa closed 3 months ago

ouvaa commented 3 months ago

Because this is on the hot path and it is our most-used feature at the moment, I was checking CPU utilization and found the following.

It's great and it's fast, but I'm not sure what's wrong here: on my 12-core laptop I can only push it to a max of about 660% CPU, meaning roughly 55% utilization, while using phuslu/lru.

  1. Is there anything wrong with my program?
  2. I tried the sharded version and it seemed to work best without the shard option.

Can you try it out and see what I mean? I hope to push for 90% CPU utilization. If you comment out the cache.Set below, the empty for loop uses around 99% CPU.

// +build linux

package main

/*
#define _GNU_SOURCE
#include <sched.h>
#include <pthread.h>
#include <string.h> // Include string.h for strerror

// Wrap CPU_ZERO and CPU_SET in functions to avoid macro expansion issues.
void my_CPU_ZERO(cpu_set_t *cs) { CPU_ZERO(cs); }
void my_CPU_SET(int cpu, cpu_set_t *cs) { CPU_SET(cpu, cs); }
*/
import "C"

import (
        "fmt"
        "log"
        "runtime"
        //"sync"
        "syscall"
        "time"
        //"unsafe"

        cxcputhread "github.com/cloudxaas/gocpu/thread"
        phuslu "github.com/phuslu/lru"
)

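// shardcount rounds runtime.GOMAXPROCS(0)*2 up to the next power of two; it is
// only used by the commented-out WithShards variant below.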
var shardcount = func() int {
        n := runtime.GOMAXPROCS(0) * 2
        k := 1
        for k < n {
                k = k * 2
        }
        return k
}()

var (
        cache = phuslu.NewLRUCache[string, string](32000000)
        //cache = phuslu.NewLRUCache[string, string](32000000, phuslu.WithShards[string, string](uint32(shardcount)))

        //cache = phuslu.NewLRUCache[string, string](32000,phuslu.WithShards(256))

)
/*
// setCPUAffinity sets the current thread's affinity to the specified CPU core.
func setCPUAffinity(cpu int) error {
        var cpuset C.cpu_set_t
        C.my_CPU_ZERO(&cpuset)
        C.my_CPU_SET(C.int(cpu), &cpuset)

        // pthread_setaffinity_np returns 0 on success, or an error number on failure.
        // The previous implementation incorrectly interpreted the return value.
        result := C.pthread_setaffinity_np(C.pthread_self(), C.size_t(unsafe.Sizeof(cpuset)), &cpuset)
        if result != 0 {
                // Convert the result (errno) to an error message using strerror.
                return fmt.Errorf("error setting thread affinity: %s", C.GoString(C.strerror(result)))
        }
        return nil
}
*/

func main() {
        numCores := runtime.NumCPU()
        //      runtime.GOMAXPROCS(numCores)

        //var wg sync.WaitGroup
        for i := 0; i < numCores; i++ {
                //              wg.Add(1)
                go func(coreID int) {
                        //                      defer wg.Done()
                        runtime.LockOSThread() // Lock the goroutine to the current thread.

                        //err := setCPUAffinity(coreID)

                        err := cxcputhread.SetCPUAffinity(uint16(coreID)+1)
                        if err != nil {
                                fmt.Printf("Failed to set CPU affinity for core %d: %v\n", coreID, err)
                                return
                        }

                        // Your code here. This goroutine is now running with affinity set to a specific CPU core.
                        for {
                                //tid := C.pthread_self()
                                //for z:=0;z<10000000000;z++{

                                //}
                                for {

                                        cache.Set("1231", "32141")
                                        //cache.Get("1231")

                                }
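                                // NOTE: the inner loop above never returns, so
                                // the logging and sleep below are unreachable.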

                                tid := syscall.Gettid()
                                log.Printf("Thread pointer: %v == Core ID: %d", tid, coreID)
                                time.Sleep(time.Second)
                        }
                        // Simulate some work...
                }(i)
        }
        //      wg.Wait()
        select {}
}
phuslu commented 3 months ago

I suspect the root cause is mutex/lock contention (not entirely sure). I guess switching to a spinlock might consume more, or even full, CPU, but it would not increase throughput (more likely decrease it).

Let me investigate it later.
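
One way to check the lock-contention hypothesis is Go's built-in mutex profiler. The sketch below is a minimal, self-contained variant of the benchmark above, assuming the shards are guarded by sync.Mutex (which is what the mutex profiler records); the capacity and iteration count are arbitrary.

package main

import (
        "os"
        "runtime"
        "runtime/pprof"
        "sync"

        phuslu "github.com/phuslu/lru"
)

func main() {
        // Record every mutex contention event (larger values sample less often).
        runtime.SetMutexProfileFraction(1)

        cache := phuslu.NewLRUCache[string, string](1 << 20)

        var wg sync.WaitGroup
        for i := 0; i < runtime.NumCPU(); i++ {
                wg.Add(1)
                go func() {
                        defer wg.Done()
                        for j := 0; j < 5_000_000; j++ {
                                cache.Set("1231", "32141") // same single key as the benchmark above
                        }
                }()
        }
        wg.Wait()

        // Dump the profile; inspect with: go tool pprof mutex.prof
        f, err := os.Create("mutex.prof")
        if err != nil {
                panic(err)
        }
        defer f.Close()
        if err := pprof.Lookup("mutex").WriteTo(f, 0); err != nil {
                panic(err)
        }
}

If most of the recorded contention sits under cache.Set, that supports the hypothesis; if the profile is nearly empty, the lost CPU is going somewhere else (for example cache-line bouncing rather than blocked mutexes).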

phuslu commented 3 months ago

In principle, the lock-free implementations (e.g. otter, ristretto, ccache) should avoid this problem.

Maybe I'll try this approach someday, but it obviously introduces a lot of complexity and more memory usage / GC pressure.
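
Before going lock-free, it may also be worth checking how much of the stall comes from every goroutine writing the same key: in a sharded design a single key presumably hashes to a single shard, so all writers serialize on that one shard's lock regardless of the shard count (an assumption about the hashing, not verified against the source). A hedged variant of the benchmark where each goroutine uses its own key:

package main

import (
        "fmt"
        "runtime"
        "sync"
        "sync/atomic"
        "time"

        phuslu "github.com/phuslu/lru"
)

func main() {
        cache := phuslu.NewLRUCache[string, string](1 << 20)

        var ops atomic.Int64
        var wg sync.WaitGroup
        deadline := time.Now().Add(5 * time.Second)

        for i := 0; i < runtime.NumCPU(); i++ {
                wg.Add(1)
                go func(id int) {
                        defer wg.Done()
                        key := fmt.Sprintf("key-%d", id) // distinct key per goroutine
                        for time.Now().Before(deadline) {
                                cache.Set(key, "32141")
                                ops.Add(1) // counter overhead is the same in both variants
                        }
                }(i)
        }
        wg.Wait()
        fmt.Printf("total Sets in 5s: %d\n", ops.Load())
}

Comparing this against the single-key version should show whether per-shard lock contention, rather than the cache itself, is what caps CPU usage around 55%.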

phuslu commented 3 months ago

UPDATE: I added gcscan tests to measure the GC time of Go caches; the results are here: https://github.com/phuslu/lru?tab=readme-ov-file#gc-scan

Considering the significantly longer GC scan time of the lock-free implementations, I'd like to keep the current architecture of phuslu/lru.
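
For reference, a rough standalone sketch of the same idea as the gcscan test (populate a cache, then time a forced collection) could look like the following; the methodology of the linked benchmark may well differ, and the numbers are only indicative.

package main

import (
        "fmt"
        "runtime"
        "time"

        phuslu "github.com/phuslu/lru"
)

func main() {
        const n = 1_000_000

        cache := phuslu.NewLRUCache[string, string](n)
        for i := 0; i < n; i++ {
                k := fmt.Sprintf("key-%d", i)
                cache.Set(k, k)
        }

        // Settle the heap, then time a full forced GC cycle over the populated cache.
        runtime.GC()
        start := time.Now()
        runtime.GC()
        fmt.Printf("GC over %d entries took %v\n", n, time.Since(start))

        runtime.KeepAlive(cache)
}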