StackExchange / StackExchange.Redis

General purpose redis client
https://stackexchange.github.io/StackExchange.Redis/
Other
5.88k stars 1.51k forks source link

Get data from Redis became slow #1605

Closed gruan01 closed 3 years ago

gruan01 commented 3 years ago

I don't know how to describe my confusion...

I set up a cluster with redis-6.0.9 on 2 CentOS 7 servers. The cluster has 4 masters and 8 replicas.

Client driver use StackExchange.Redis 2.1.58....

I have an internal service, deployed as 10 websites on 2 Windows servers, with nginx as the entry point; about 20,000 requests per minute go through nginx.

Every request needs to get some data from Redis.

When I restart IIS, Redis responds in 1–2 ms, and CPU usage is smooth and low. But after 1 to 1.5 hours, Redis becomes slow — it takes 10–100 ms to respond — and the CPU gets busy, until I restart IIS again...

I have try :

  1. Use ConnectionMultiplexer pool.
  2. Custom SocketManager specify worker count.
  3. Specify SocketManagerOptions.
  4. ThreadPool.SetMinThreads
  5. Rewrite my code....
  6. Set server's tcp_max_syn_backlog to 10240 and set net.core.somaxconn to 10240

But none of these worked; I still need to restart IIS every hour...

Exported redis config:

rdbchecksum=yes
daemonize=yes
io-threads-do-reads=no
lua-replicate-commands=yes
always-show-logo=yes
protected-mode=yes
rdbcompression=yes
rdb-del-sync-files=no
activerehashing=yes
stop-writes-on-bgsave-error=yes
dynamic-hz=yes
lazyfree-lazy-eviction=no
lazyfree-lazy-expire=no
lazyfree-lazy-server-del=no
lazyfree-lazy-user-del=no
repl-disable-tcp-nodelay=no
repl-diskless-sync=yes
gopher-enabled=no
aof-rewrite-incremental-fsync=yes
no-appendfsync-on-rewrite=no
cluster-require-full-coverage=yes
rdb-save-incremental-fsync=yes
aof-load-truncated=yes
aof-use-rdb-preamble=yes
cluster-replica-no-failover=no
cluster-slave-no-failover=no
replica-lazy-flush=no
slave-lazy-flush=no
replica-serve-stale-data=yes
slave-serve-stale-data=yes
replica-read-only=yes
slave-read-only=yes
replica-ignore-maxmemory=yes
slave-ignore-maxmemory=yes
jemalloc-bg-thread=yes
activedefrag=no
syslog-enabled=no
cluster-enabled=yes
appendonly=no
cluster-allow-reads-when-down=no
oom-score-adj=no
aclfile=
unixsocket=
pidfile=/var/run/redis_4000.pid
replica-announce-ip=
slave-announce-ip=
masteruser=

cluster-announce-ip=
syslog-ident=redis
dbfilename=dump.rdb
appendfilename=appendonly.aof
server_cpulist=
bio_cpulist=
aof_rewrite_cpulist=
bgsave_cpulist=
supervised=no
syslog-facility=local0
repl-diskless-load=disabled
loglevel=notice
maxmemory-policy=noeviction
appendfsync=everysec
databases=16

io-threads=1
auto-aof-rewrite-percentage=100
cluster-replica-validity-factor=10
cluster-slave-validity-factor=10
list-max-ziplist-size=-2
tcp-keepalive=300
cluster-migration-barrier=1
active-defrag-cycle-min=1
active-defrag-cycle-max=25
active-defrag-threshold-lower=10
active-defrag-threshold-upper=100
lfu-log-factor=10
lfu-decay-time=1
replica-priority=100
slave-priority=100
repl-diskless-sync-delay=5
maxmemory-samples=5
timeout=0
replica-announce-port=0
slave-announce-port=0
tcp-backlog=511
cluster-announce-bus-port=0
cluster-announce-port=0
repl-timeout=60
repl-ping-replica-period=10
repl-ping-slave-period=10
list-compress-depth=0
rdb-key-save-delay=0
key-load-delay=0
active-expire-effort=1
hz=10
min-replicas-to-write=0
min-slaves-to-write=0
min-replicas-max-lag=10
min-slaves-max-lag=10
maxclients=10000
active-defrag-max-scan-fields=1000
slowlog-max-len=128
acllog-max-len=128
lua-time-limit=5000
cluster-node-timeout=15000
slowlog-log-slower-than=10000
latency-monitor-threshold=0
proto-max-bulk-len=536870912
stream-node-max-entries=100
repl-backlog-size=1048576
maxmemory=0
hash-max-ziplist-entries=512
set-max-intset-entries=512
zset-max-ziplist-entries=128
active-defrag-ignore-bytes=104857600
hash-max-ziplist-value=64
stream-node-max-bytes=4096
zset-max-ziplist-value=64
hll-sparse-max-bytes=3000
tracking-table-max-keys=1000000
repl-backlog-ttl=3600
auto-aof-rewrite-min-size=67108864
logfile=/home/q/redis/cluster/4000/log
client-query-buffer-limit=1073741824
watchdog-period=0

save=900 1 300 10 60 10000
client-output-buffer-limit=normal 0 0 0 slave 268435456 67108864 60 pubsub 33554432 8388608 60
unixsocketperm=0
slaveof=
notify-keyspace-events=

oom-score-adj-values=0 200 800

ConnectionPool

    /// <summary>
    /// 
    /// </summary>
    internal class ConnectionPool
    {
....
        public int PoolSize => Math.Max(this.future?.PoolSize ?? 1, 1);
        public int? WorkerCount => this.future?.WorkerCount;
        private readonly ConcurrentBag<Lazy<ConnectionMultiplexer>> conns;

        /// <summary>
        /// Builds the pool, pre-registering <see cref="PoolSize"/> lazily-created
        /// multiplexers so that no Redis connection is opened until first use.
        /// </summary>
        /// <param name="name">Logical name of this pool (used in error messages).</param>
        /// <param name="connection">StackExchange.Redis connection string.</param>
        /// <param name="future">Optional tuning settings (pool size, socket workers).</param>
        public ConnectionPool(string name, string connection, Future future)
        {
            this.name = name;
            this.connection = connection;
            this.future = future;

            this.conns = new ConcurrentBag<Lazy<ConnectionMultiplexer>>();
            // Each slot defers the actual connect until the Lazy is first realized.
            for (var slot = 0; slot < this.PoolSize; slot++)
            {
                this.conns.Add(new Lazy<ConnectionMultiplexer>(this.Connect));
            }
        }
...
        /// <summary>
        /// Opens one <see cref="ConnectionMultiplexer"/> from the configured
        /// connection string, optionally installing a dedicated SocketManager.
        /// </summary>
        /// <returns>A connected multiplexer.</returns>
        /// <exception cref="Exception">Wraps any connect failure, tagged with the pool name.</exception>
        private ConnectionMultiplexer Connect()
        {
            try
            {
                // Second argument `true` = ignore unknown config tokens in the string.
                var opt = ConfigurationOptions.Parse(this.connection, true);
#if NETCORE || NET461
                // Only create a dedicated SocketManager when explicitly tuned;
                // otherwise StackExchange.Redis uses its shared default manager.
                if (this.future?.WorkerCount > 1 || this.future?.Option != null)
                {
                    // NOTE(review): passes 0 workers when WorkerCount is null but Option
                    // is set — confirm the SocketManager ctor treats 0 as "default".
                    opt.SocketManager = new SocketManager(
                        $"SocketManager-{this.name}",
                        this.future?.WorkerCount ?? 0,
                        this.future?.Option ?? SocketManager.SocketManagerOptions.None
                        );
                }
#endif
                var conn = ConnectionMultiplexer.Connect(opt, this.future?.Log);
                return conn;
            }
            catch (Exception e)
            {
                // Message is Chinese for "Failed to connect to Redis: {pool name}".
                throw new Exception(string.Format("连接 Redis 失败 : {0}", this.name), e);
            }
        }

        /// <summary>
        /// Returns a multiplexer from the pool: the next not-yet-connected slot if
        /// one exists (so connections spin up on demand), otherwise the live
        /// connection with the fewest outstanding operations.
        /// </summary>
        /// <returns>A connected (or newly connecting) multiplexer.</returns>
        public ConnectionMultiplexer Get()
        {
            // Original code checked Count() == conns.Count and then called
            // First(lazy => !lazy.IsValueCreated), which races: another thread can
            // realize the last pending Lazy between the check and the pick, making
            // First() throw InvalidOperationException. FirstOrDefault with a
            // fallback is race-safe and enumerates the bag only once.
            var pending = this.conns.FirstOrDefault(lazy => !lazy.IsValueCreated);
            if (pending != null)
                return pending.Value;

            // All slots are live: pick the least-loaded connection.
            return this.conns
                .OrderBy(c => c.Value.GetCounters().TotalOutstanding)
                .First()
                .Value;
        }
    }

Get data from redis:

        /// <summary>
        /// Fetches hotel items from Redis for the given conditions, batching keys
        /// per cluster hash slot so each multi-key GET stays on a single node
        /// (a cross-slot MGET would be rejected by the cluster).
        /// </summary>
        /// <param name="datas">Query conditions; entries whose key is invalid are skipped.</param>
        /// <param name="isOversea">Selects the oversea cluster when true.</param>
        /// <returns>Deserialized items; keys with no value are silently dropped.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="datas"/> is null.</exception>
        public async Task<IEnumerable<HotelItemBase>> GetRedis_HotelAsync(IEnumerable<HotelGetCondition> datas, bool isOversea)
        {
            if (datas == null)
                throw new ArgumentNullException(nameof(datas));

            var redisName = isOversea ? RedisNames.OverseaHotel : RedisNames.Cluster;
            var conn = RedisExchangeManager.GetConnection(redisName);
            var db = conn.GetDatabase();

            // Group keys by hash slot so each StringGetAsync batch maps to one node.
            var slotGroups = datas
                .Where(d => d.IsKeyValid)
                .SelectMany(d => d.GetKeys(redisName))
                .Select(k => new { Key = (RedisKey)k, Slot = conn.HashSlot(k) })
                .GroupBy(t => t.Slot);

            var tasks = new List<Task<RedisValue[]>>();
            foreach (var grp in slotGroups)
            {
                // PreferReplica spreads read load across the replicas.
                var batch = grp.Select(t => t.Key).ToArray();
                tasks.Add(db.StringGetAsync(batch, CommandFlags.PreferReplica));
            }

            // Await all batches in parallel; WhenAll never yields null, so the
            // original `vs?.Count() > 0` guard (and its extra enumeration) is gone.
            var values = await Task.WhenAll(tasks);

            var result = new List<HotelItemBase>();
            foreach (var value in values.SelectMany(v => v))
            {
                if (!value.HasValue)
                    continue;

                result.Add(ProtobufHelper.Deserialize<HotelItemBase>(value));
            }

            return result;
        }

Business code:

// Supplier prices; default to an empty list so AddRange below is always safe.
var prices = rst?.PriceList?.ToList() ?? new List<SupSynPrice>();

// Close rooms for the dates that returned no price.
var swClose = Stopwatch.StartNew();
var closer = new AutoCloser(supplier, criteria, this.PubFun);
await closer.Init();
// BuildCloseData reads from Redis (per the issue author) to decide which dates to close.
var closeDatas = await closer.BuildCloseData(prices);
if (closeDatas?.Count() > 0)
    prices.AddRange(closeDatas.Distinct());

swClose.Stop();

// NOTE(review): rst may be null here (it was accessed with `rst?.` above) —
// confirm a null rst cannot reach this assignment, or it will throw.
rst.PriceList = prices;

closer.BuildCloseData will invoke redis to get data

When redis become slow

The above screenshot shows the IIS connection performance counters when Redis becomes slow. Requests can't be processed in time, so connections can't be released, the CPU becomes busy, and all work is delayed.

When restart IIS

The above screenshot shows the IIS connection performance counters when I restart IIS. Redis responds quickly; there is no work delay and no hanging connections.

Can somebody tell me why ?

gruan01 commented 3 years ago

Status when become slow:

10.89.70.14:4003: Cluster v6.0.9, replica; 16 databases; keep-alive: 00:01:00; int: ConnectedEstablished; sub: ConnectedEstablished, 1 active 10.89.70.14:4003: int ops=14079, qu=0, qs=0, qc=0, wr=0, socks=1; sub ops=53, qu=0, qs=0, qc=0, wr=0, subs=1, socks=1 Circular op-count snapshot; int: 14067+4+6+2=14079 (1.20 ops/s; spans 10s); sub: 53 (0.00 ops/s; spans 10s) 10.89.70.14:4004: Cluster v6.0.9, replica; 16 databases; keep-alive: 00:01:00; int: ConnectedEstablished; sub: ConnectedEstablished, 1 active 10.89.70.14:4004: int ops=14080, qu=0, qs=0, qc=0, wr=0, socks=1; sub ops=53, qu=0, qs=0, qc=0, wr=0, subs=1, socks=1 Circular op-count snapshot; int: 14067+4+6+3=14080 (1.30 ops/s; spans 10s); sub: 53 (0.00 ops/s; spans 10s) 10.89.70.14:4005: Cluster v6.0.9, replica; 16 databases; keep-alive: 00:01:00; int: ConnectedEstablished; sub: ConnectedEstablished, 1 active 10.89.70.14:4005: int ops=14858, qu=0, qs=0, qc=0, wr=0, socks=1; sub ops=53, qu=0, qs=0, qc=0, wr=0, subs=1, socks=1 Circular op-count snapshot; int: 14846+1+9+2=14858 (1.20 ops/s; spans 10s); sub: 53 (0.00 ops/s; spans 10s) Sync timeouts: 0; async timeouts: 0; fire and forget: 0; last heartbeat: 0s ago

gruan01 commented 3 years ago

I used NProfiler to gather some runtime information — my mistake, it was not caused by Redis. Sorry.