# Redis 3.2 image whose entrypoint writes replication directives into
# /etc/redis.conf from environment variables before starting redis-server.
FROM redis:3.2

# MAINTAINER is deprecated; the same information is kept as a label.
LABEL maintainer="ZhangShuangyan <Zhangshuangyan@gridsum.com>"

# COPY is preferred over ADD for a plain local file (no archive extraction or URL fetch needed).
COPY docker-entrypoint.sh /
RUN chmod +x /docker-entrypoint.sh

# Default value written as slave-announce-ip by the entrypoint; override at run time.
# NOTE: if this name resolves to more than one address, Sentinel can treat the
# addresses as separate instances. Prefer overriding it with one fixed IP.
ENV ANNOUNCE_IP=redis-pro.internal.gridsumdissector.com

EXPOSE 6379
ENTRYPOINT ["/docker-entrypoint.sh"]
docker-entrypoint.sh
#!/bin/bash
# Entrypoint: translate environment variables into Redis replication
# directives in the configuration file, then replace this shell with redis-server.
#
# Environment variables (all optional):
#   ANNOUNCE_IP        written as "slave-announce-ip <value>"
#   ANNOUNCE_PORT      written as "slave-announce-port <value>"
#   REDIS_MASTER       written as "slaveof <value> <port>"
#   REDIS_MASTER_PORT  port used in the slaveof directive (default: 6379)
set -e

CONFIGURATION_FILE=/etc/redis.conf

# Make sure the file exists even when none of the variables is set;
# redis-server cannot start with a configuration path it cannot open.
touch "$CONFIGURATION_FILE"

# Append a directive unless that exact line is already present, so that
# restarting the same container does not pile up duplicate lines.
append_directive() {
    if ! grep -qxF -- "$1" "$CONFIGURATION_FILE"; then
        printf '%s\n' "$1" >> "$CONFIGURATION_FILE"
    fi
}

if [ -n "$ANNOUNCE_IP" ]; then
    append_directive "slave-announce-ip $ANNOUNCE_IP"
fi

if [ -n "$ANNOUNCE_PORT" ]; then
    append_directive "slave-announce-port $ANNOUNCE_PORT"
fi

if [ -n "$REDIS_MASTER" ]; then
    # The master port stays 6379 unless REDIS_MASTER_PORT overrides it.
    append_directive "slaveof $REDIS_MASTER ${REDIS_MASTER_PORT:-6379}"
fi

# exec so that redis-server takes over this process and receives signals directly.
exec redis-server "$CONFIGURATION_FILE"
1:S 13 Nov 05:20:48.274 * Connecting to MASTER 10.202.40.102:44983
1:S 13 Nov 05:20:48.274 * MASTER <-> SLAVE sync started
1:S 13 Nov 05:20:48.278 * Non blocking connect for SYNC fired the event.
1:S 13 Nov 05:20:48.278 * Master replied to PING, replication can continue...
1:S 13 Nov 05:20:48.280 * Partial resynchronization not possible (no cached master)
1:S 13 Nov 05:20:48.280 * Master does not support PSYNC or is in error state (reply: -ERR Can't SYNC while not connected with my master)
1:S 13 Nov 05:20:48.280 * Retrying with SYNC...
1:S 13 Nov 05:20:48.281 # MASTER aborted replication with an error: ERR Can't SYNC while not connected with my master
1:S 13 Nov 05:20:49.276 * Connecting to MASTER 10.202.40.102:44983
1:S 13 Nov 05:20:49.276 * MASTER <-> SLAVE sync started
1:S 13 Nov 05:20:49.277 * Non blocking connect for SYNC fired the event.
1:S 13 Nov 05:20:49.278 * Master replied to PING, replication can continue...
1:S 13 Nov 05:20:49.282 * Partial resynchronization not possible (no cached master)
1:S 13 Nov 05:20:49.283 * Master does not support PSYNC or is in error state (reply: -ERR Can't SYNC while not connected with my master)
1:S 13 Nov 05:20:49.283 * Retrying with SYNC...
1:S 13 Nov 05:20:49.284 # MASTER aborted replication with an error: ERR Can't SYNC while not connected with my master
1:S 13 Nov 05:20:50.279 * Connecting to MASTER 10.202.40.102:44983
1:S 13 Nov 05:20:50.279 * MASTER <-> SLAVE sync started
1:S 13 Nov 05:20:50.280 * Non blocking connect for SYNC fired the event.
1:S 13 Nov 05:20:50.281 * Master replied to PING, replication can continue...
1:S 13 Nov 05:20:50.282 * Partial resynchronization not possible (no cached master)
1:S 13 Nov 05:20:50.283 * Master does not support PSYNC or is in error state (reply: -ERR Can't SYNC while not connected with my master)
1:S 13 Nov 05:20:50.283 * Retrying with SYNC...
1:S 13 Nov 05:20:50.283 # MASTER aborted replication with an error: ERR Can't SYNC while not connected with my master
由于Redis本身不支持端口映射,直接将Redis Sentinel部署到k8s集群上时,Redis主节点INFO信息中slave节点的ip和port分别是pod所在node的ip和容器的port;而要访问对应的从节点实例,应该使用nodeip+nodeport,而不是nodeip+containerport。此外,Redis Sentinel会从主节点的INFO信息中获取从节点的ip和port,并更新到自身的配置文件中,因此Sentinel获取到的从节点实例地址也是错误的。为此,尝试将从节点的slave-announce-ip配置为域名,将slave-announce-port配置为从节点service对应的nodeport暴露端口,镜像文件如下:
Dockerfile
docker-entrypoint.sh
redis-slave-RC.yaml
运行后sentinel日志如下:
可以看到,sentinel识别到的从节点地址是17.0.112.4:6379,经测试可以正常进行主备切换。
但是本周测试发现,现在sentinel从master解析到的slave地址不再是17.0.112.4:6379这种形式的了,而是如下:
从sentinel配置文件的更新内容中可以看到,slave的地址被解析为10.202.40.103:44983。接着将master节点缩容到0,观察到的结果如下:
这里出现了一个新选出的主节点10.202.40.102:44983,还出现了一个从节点10.202.40.103:44983。而实际上,通过执行nslookup命令可以知道,域名redis-pro.internal.gridsumdissector.com对应的正是这两个地址:10.202.40.102和10.202.40.103。
但是我们的sentinel以为它们是两个实例,所以sentinel对10.202.40.102:44983执行了slaveof 10.202.40.103 44983的命令,接下来我们看到结果如下:
这个实例进入了slave的状态,Redis集群中没有了主节点
解决办法是:从域名解析出的ip地址中固定选择一个(比如10.202.40.102)作为slave-announce-ip,而不是直接使用域名。