redis / ioredis

🚀 A robust, performance-focused, and full-featured Redis client for Node.js.

Retry logic in ioredis returns incorrect results #965

Closed: stanley115 closed this issue 4 years ago

stanley115 commented 5 years ago

Hi,

I am using ioredis version 4.2.0 and I found that ioredis may respond with unexpected results when "failover primary" is triggered on Amazon ElastiCache.

To verify this issue, I wrote a program to do the following steps:

  1. insert 1000000 key-value pairs into Redis, where key = value
  2. get 200 random keys from Redis and check whether each result matches its key
  3. at the same time, perform some multi-set operations

The source code is as follows:

const Redis = require('ioredis');

const redis = new Redis({
    'host': xxx,
    'port': xxx,
});

function isEqual(a, b) {
    if (!Array.isArray(a) || !Array.isArray(b)) {
        return false;
    }

    if (a.length !== b.length) {
        return false;
    }

    for (let i = 0; i < a.length; i++) {
        if (a[i] !== b[i]) {
            return false;
        }
    }

    return true;
}

let counter = 0;

async function test() {
    while (true) {
        counter++;

        if (counter % 1000 === 0) {
            console.log('test', counter);
        }
        let keys = [];

        for (let i = 0; i < 200; i++) {
            let value = `${Math.floor(Math.random() * 1000000)}`;

            keys.push(value);
        }

        try {
            let values;

            values = await redis.mget(keys);

            if (!isEqual(keys, values)) {
                console.error('Keys and values do not match');
                console.error(keys, values);

                return;
            }

            // queue the SET commands in a MULTI transaction
            // (redis.multi() is synchronous, so no await is needed here or when queueing commands on it)
            const multi = redis.multi();

            for (let i = 0; i < keys.length; i++) {
                multi.set(keys[i], values[i]);
            }

            try {
                await multi.exec();
            } catch (err) {

                // discard the transaction if any error is encountered
                await multi.discard();
            }

            values = await redis.mget(keys);

            if (!isEqual(keys, values)) {
                console.error('Keys and values do not match');
                console.error(keys, values);

                return;
            }

        } catch (err) {
            console.error('Redis connection error', err);
        }
    }
}

(async () => {
    let promises = [];

    // Set 1000000 key-value pairs into Redis
    for (let i = 0; i < 1000000; i++) {
        promises.push(
            redis.set(i, i)
        );
    }

    await Promise.all(promises);

    promises = [];

    for (let i = 0; i < 200; i++) {
        // Test redis in parallel
        promises.push(test());
    }

    await Promise.all(promises);
})();

When the program is executed, it initially runs without any errors:

test 1000
test 2000
test 3000
test 4000
test 5000
test 6000
test 7000
test 8000

However, when I trigger "failover primary" on Amazon ElastiCache, the program reports the following error:

test 31000
[ioredis] Unhandled error event: Error: write EPIPE
    at WriteWrap.afterWrite [as oncomplete] (net.js:788:14)
Keys and values do not match
[ '243409',
  '137016',
  '601542',
  '734081',
  '210151',
  '68771',
  '895837',
  '876693',
  '588017',
  '29828',
  '17298',
  '670564',
  '836440',
  '270865',
  '306435',
  '931699',
  '775613',
  '217668',
  '581266',
  '976507',
  '832352',
  '679326',
  '52561',
  '53287',
  '104417',
  '146403',
  '39159',
  '114127',
  '871857',
  '132837',
  '387522',
  '562921',
  '42104',
  '199721',
  '739081',
  '521383',
  '662032',
  '113916',
  '548452',
  '439403',
  '444550',
  '893604',
  '848739',
  '140389',
  '515737',
  '876296',
  '173461',
  '564650',
  '390476',
  '798553',
  '76405',
  '902405',
  '446459',
  '759290',
  '427111',
  '988981',
  '967138',
  '137452',
  '520179',
  '485755',
  '720760',
  '845674',
  '872862',
  '821507',
  '924065',
  '574757',
  '128519',
  '906625',
  '632835',
  '21495',
  '83587',
  '777786',
  '823747',
  '171798',
  '683516',
  '127883',
  '78898',
  '956534',
  '559139',
  '773756',
  '319490',
  '610275',
  '407368',
  '531517',
  '469918',
  '427556',
  '261721',
  '430777',
  '877009',
  '298908',
  '564349',
  '804059',
  '671345',
  '953904',
  '531551',
  '590455',
  '979441',
  '463816',
  '407925',
  '633564',
  ... 100 more items ] 'QUEUED'

and the program terminates with the following error:

Error: Command queue state error. If you can reproduce this, please report it. Last reply: 995929,435542,106821,513330,236068,695023,745357,618648,185801,633124,656521,310435,940466,479575,808160,416420,19858,211146,425218,140598,924085,961997,664625,578621,328188,615778,89132,125432,741458,739122,400620,758320,634350,352547,600585,682298,933154,188276,897557,811743,646589,987422,540335,681971,659102,156246,827271,37592,152314,833971,688430,877638,839522,79903,126904,827111,486631,499505,445872,165188,957601,820105,748310,935720,805906,735996,941498,243421,787074,261476,261790,473453,341959,986679,192225,625257,135429,47694,25263,436500,937899,843472,111746,446841,626126,321981,112825,900243,258222,887488,434748,812479,458536,683031,226176,621812,152860,194738,508094,749085,982258,678053,113761,956206,374257,626489,863173,293197,424767,636834,742311,959087,914309,918353,559260,774120,681847,737399,213218,437021,176342,725435,295235,729465,343807,960928,127085,827722,878959,584385,535367,506862,57615,168427,12160,59708,852020,465299,846000,301785,567768,834208,978330,611490,92358,759136,358587,819213,394238,571914,638338,776079,94375,232995,948415,89949,869999,910308,407192,405751,938798,510490,814789,913200,861669,264338,93524,371432,919094,774245,587445,460332,844889,165153,199058,542872,571002,274567,508553,924620,780545,835010,187975,654759,401789,41298,723730,889871,266452,765027,797533,517217,961999,178726,25432,418646,287127,679359,897524,62380
    at Redis.exports.returnReply (xxxxxx/node_modules/ioredis/built/redis/parser.js:143:39)
    at JavascriptRedisParser.returnReply (xxxxxx/node_modules/ioredis/built/redis/parser.js:23:19)
    at JavascriptRedisParser.execute (xxxxxx/node_modules/ioredis/node_modules/redis-parser/lib/parser.js:544:14)
    at Socket.<anonymous> (xxxxxx/node_modules/ioredis/built/redis/event_handler.js:107:26)
    at Socket.emit (events.js:198:13)
    at addChunk (_stream_readable.js:288:12)
    at readableAddChunk (_stream_readable.js:269:11)
    at Socket.Readable.push (_stream_readable.js:224:10)
    at TCP.onStreamRead [as onread] (internal/stream_base_commons.js:94:17)
Emitted 'error' event at:
    at Redis.exports.returnReply (xxxxxx/node_modules/ioredis/built/redis/parser.js:143:25)
    at JavascriptRedisParser.returnReply (xxxxxx/node_modules/ioredis/built/redis/parser.js:23:19)
    [... lines matching original stack trace ...]
    at TCP.onStreamRead [as onread] (internal/stream_base_commons.js:94:17)

PS: I found that this issue cannot be reproduced when maxRetriesPerRequest is set to 0, and I strongly suspect that some state gets corrupted when a retry happens.
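
A minimal sketch of that workaround, for reference; the host and port below are placeholders, not the actual cluster endpoint:

const Redis = require('ioredis');

// Disable per-command retries: when the connection drops, pending commands are
// rejected instead of being retried after the reconnect.
const redis = new Redis({
    'host': '127.0.0.1',        // placeholder
    'port': 6379,               // placeholder
    'maxRetriesPerRequest': 0,
});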

Here are the ElastiCache cluster details:

Node type: cache.m3.medium
Engine: Redis
Engine Version Compatibility: 4.0.10
Shards: 1
Number of Nodes: 2 (1 primary, 1 replica)
NezzarClp commented 5 years ago

Any update on this issue? I am using ioredis and facing the same issue as well.

stale[bot] commented 5 years ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed after 7 days if no further activity occurs, but feel free to re-open a closed issue if needed.

alavers commented 4 years ago

I'm encountering the same issue when testing an ElastiCache failover/upgrade (ElastiCache Redis 5.0.0, ioredis 4.14.0).

My client is running a mix of individual commands and multi command blocks. When an elasticache resource upgrade occurs, the connection is closed and reestablished. However, the ioredis command queue falls out of sync, and I begin seeing individual redis commands being resolved with the reply result of a different command.

My debugging so far has shown that during the connection reset, commands sent inside of a multi context are getting put individually on the offline queue, which on its own doesn't seem right.

Just before a failover takes place, I send a multi block that looks like this:

multi
del
hset
expire
setex
exec
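
In ioredis terms, that block is roughly the following; the keys, fields, and values here are made up purely for illustration:

// `redis` is an ioredis client instance; keys/fields/values are placeholders.
redis.multi()
    .del('some:key')
    .hset('some:hash', 'field', 'value')
    .expire('some:hash', 60)
    .setex('another:key', 60, 'value')
    .exec((err, results) => {
        // `results` is an array of [error, reply] pairs, one per queued command
    });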

All of these commands get queued up by ioredis, but then I observe the following:

  1. Receive a reply 'OK' (in response to multi)
  2. Receive a READONLY error in response to del, due to the ongoing ElastiCache resource upgrade.
  3. del command is put on the offline queue
  4. The connection is reestablished with a new IP address
  5. del is sent from the offline queue
  6. Receive a reply '1' (in response to del). We were expecting QUEUED here, but the multi context was lost in the reconnect.
  7. ioredis resolves a pending hset command with the reply value '1'. The command queue is now out of sync
  8. The next command I send through this client has its reply resolved to the pending expire command, and so on.

It seems that if a connection reset occurs while a multi block is still in flight, we might need to either flush or re-send any multi'd commands that might still be in the queue. I'll try running this experiment in a fork and see if it resolves this issue.
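
On a related note (step 2 above), ioredis exposes a reconnectOnError option that is commonly suggested for ElastiCache failovers, forcing a reconnect as soon as a READONLY reply shows up. It is not a fix for the queue desync itself, but a rough sketch for context:

const Redis = require('ioredis');

// Reconnect (and resend the failed command) when the node starts answering
// with READONLY errors, e.g. after an ElastiCache failover.
const redis = new Redis({
    reconnectOnError(err) {
        if (err.message.includes('READONLY')) {
            return 2; // 2 = reconnect and resend the command that failed
        }
        return false;
    },
});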

alavers commented 4 years ago

To start, I think perhaps transacted commands found in prevCommandQueue or offlineQueue should be rejected. A quick experiment trying this appears to resolve the issue I'm facing.

In the meantime, @stanley115, are you able to reproduce the issue with autoResendUnfulfilledCommands set to false?
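
For reference, that option is passed in the constructor; a minimal sketch with placeholder connection details:

const Redis = require('ioredis');

// With autoResendUnfulfilledCommands disabled, commands that were still waiting
// for a reply when the connection dropped are not resent after reconnecting.
const redis = new Redis({
    'host': '127.0.0.1',                    // placeholder
    'port': 6379,                           // placeholder
    'autoResendUnfulfilledCommands': false,
});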

luishdz1010 commented 4 years ago

This just hit us in production: after recovering from a failover, our sites started showing content from different users mixed together. Any workarounds?

alavers commented 4 years ago

As far as workarounds go, we simply restart all connected services whenever we perform an ElastiCache upgrade. I'm not aware of a better workaround, but I do have a potential fix working locally that I've been meaning to PR; I've just been sidetracked since then.

ioredis-robot commented 4 years ago

🎉 This issue has been resolved in version 4.16.1 🎉

The release is available on:

Your semantic-release bot 📦🚀