详解 Redis 哨兵模式(三)
哨兵模式是提高 Redis 可用性的一种方式。本文是哨兵模式系列的第三篇文章,主要介绍哨兵模式下主观下线与客观下线的判定,以及主从切换的过程。本来这篇文章想直接按源码顺序进行分析,但是越写越陷入细节,内容也写得很分散,最终还是放弃了。所以这篇文章不会执着于源码的实现顺序,而是旨在讲清楚哨兵模式下故障切换的过程。
客观下线
判断客观下线的函数是 sentinelCheckObjectivelyDown
,哨兵只会对主节点判断是否客观下线,对从节点不会。
/* Re-evaluate whether `master` is objectively down (O_DOWN).
 *
 * The master is O_DOWN when this sentinel already flags it SRI_S_DOWN and
 * the number of sentinels agreeing (this one plus every known sentinel
 * flagged SRI_MASTER_DOWN) reaches master->quorum. Entering the state emits
 * "+odown" and records the time; leaving it emits "-odown". */
void sentinelCheckObjectivelyDown(sentinelRedisInstance *master) {
    unsigned int votes = 0;
    int reached = 0;

    /* Other sentinels are only counted once we consider the master
     * subjectively down ourselves. */
    if (master->flags & SRI_S_DOWN) {
        dictIterator *iter = dictGetIterator(master->sentinels);
        dictEntry *entry;

        votes = 1; /* Our own opinion. */
        for (entry = dictNext(iter); entry != NULL; entry = dictNext(iter)) {
            sentinelRedisInstance *peer = dictGetVal(entry);

            /* This peer reported the master as down. */
            if (peer->flags & SRI_MASTER_DOWN) votes++;
        }
        dictReleaseIterator(iter);

        if (votes >= master->quorum) reached = 1;
    }

    int was_odown = (master->flags & SRI_O_DOWN) != 0;

    if (reached && !was_odown) {
        /* Transition into O_DOWN: announce it, set the flag, remember when. */
        sentinelEvent(LL_WARNING,"+odown",master,"%@ #quorum %d/%d",
            votes, master->quorum);
        master->flags |= SRI_O_DOWN;
        master->o_down_since_time = mstime();
    } else if (!reached && was_odown) {
        /* Quorum no longer holds: leave O_DOWN. */
        sentinelEvent(LL_WARNING,"-odown",master,"%@");
        master->flags &= ~SRI_O_DOWN;
    }
}
sentinelCheckObjectivelyDown
函数会获取所有的哨兵实例的状态,将判定当前节点为主观下线的哨兵(也就是状态中含有 SRI_MASTER_DOWN
的哨兵)个数累加起来,如果个数大于等于 master->quorum
也就是主节点配置的 quorum
值,就会被判定为客观下线。SRI_MASTER_DOWN
这个状态是如何设置的呢,后面我们会分析到。当被判定为客观下线时,将会进行故障转移。
故障转移
sentinelAskMasterStateToOtherSentinels
哨兵每次处理主节点时都会调用 sentinelAskMasterStateToOtherSentinels
函数,但只有当它自己已经认为主节点主观下线时才会真正发出询问。sentinelAskMasterStateToOtherSentinels
主要是遍历监控同一主节点的其他哨兵,挑选出连接正常且需要询问的哨兵,向这些哨兵发送 sentinel is-master-down-by-addr
命令,并将回调函数设置为 sentinelReceiveIsMasterDownReply
。其他哨兵的回复既是判断客观下线的依据,也用于后面的哨兵选举。
#define SENTINEL_ASK_FORCED (1<<0)
/* Send "SENTINEL is-master-down-by-addr" to the other sentinels known for
 * `master`, registering sentinelReceiveIsMasterDownReply as the reply
 * callback.
 *
 * With SENTINEL_ASK_FORCED set in `flags` the per-sentinel rate limit
 * (SENTINEL_ASK_PERIOD) is bypassed. The last argument of the command is
 * our own run ID when a failover is in progress for this master, and "*"
 * otherwise. */
void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int flags) {
    dictIterator *iter = dictGetIterator(master->sentinels);
    dictEntry *entry;

    for (entry = dictNext(iter); entry != NULL; entry = dictNext(iter)) {
        sentinelRedisInstance *peer = dictGetVal(entry);
        mstime_t reply_age = mstime() - peer->last_master_down_reply_time;
        char portstr[32];
        int rc;

        /* The last answer from this sentinel is too old to be trusted:
         * forget both its "master is down" opinion and its vote. */
        if (reply_age > SENTINEL_ASK_PERIOD*5) {
            peer->flags &= ~SRI_MASTER_DOWN;
            sdsfree(peer->leader);
            peer->leader = NULL;
        }

        /* Skip this sentinel unless all of the following hold:
         *  1) we consider the master subjectively down;
         *  2) the link to the sentinel is connected;
         *  3) the request is forced, or the last reply is at least
         *     SENTINEL_ASK_PERIOD ms old. */
        if (!(master->flags & SRI_S_DOWN) ||
            peer->link->disconnected ||
            (!(flags & SENTINEL_ASK_FORCED) &&
             mstime() - peer->last_master_down_reply_time < SENTINEL_ASK_PERIOD))
        {
            continue;
        }

        /* Issue the request: master ip, port, our current epoch and
         * either our run ID or "*". */
        ll2string(portstr,sizeof(portstr),master->addr->port);
        rc = redisAsyncCommand(peer->link->cc,
                    sentinelReceiveIsMasterDownReply, peer,
                    "%s is-master-down-by-addr %s %s %llu %s",
                    sentinelInstanceMapCommand(peer,"SENTINEL"),
                    master->addr->ip, portstr,
                    sentinel.current_epoch,
                    (master->failover_state > SENTINEL_FAILOVER_STATE_NONE) ?
                    sentinel.myid : "*");
        if (rc == C_OK) peer->link->pending_commands++;
    }
    dictReleaseIterator(iter);
}
从上面的代码中我们可以看出 sentinel is-master-down-by-addr
命令会发送主节点的 ip 和端口以及哨兵的 epoch
和实例 ID。如果当前主节点已经开始故障切换,实例 ID 就是哨兵自身的 ID,否则实例 ID 为 *
。完整的命令为:
SENTINEL IS-MASTER-DOWN-BY-ADDR <ip> <port> <current-epoch> <runid>
是否开始故障切换的状态是由 master->failover_state
记录的,它的默认值是 SENTINEL_FAILOVER_STATE_NONE
也就是 0,当开始故障转移时,它的状态值就会大于 SENTINEL_FAILOVER_STATE_NONE
。
处理 sentinel is-master-down-by-addr
命令
我们接下来看一下其他哨兵是如何处理 sentinel is-master-down-by-addr
命令的。处理这个命令的是 sentinelCommand
函数中的一个分支,下面我们来看一下:
/* Excerpt of the SENTINEL command handler; only the
 * "is-master-down-by-addr" branch is shown. The elided parts ("...")
 * declare and fill in ri, isdown, leader, leader_epoch and req_epoch. */
void sentinelCommand(client *c) {
...
if (!strcasecmp(c->argv[1]->ptr,"is-master-down-by-addr")) {
...
/* It exists? Is actually a master? Is subjectively down? It's down.
* Note: if we are in tilt mode we always reply with "0". */
// Report "down" only when this sentinel itself flags the master SRI_S_DOWN.
if (!sentinel.tilt && ri && (ri->flags & SRI_S_DOWN) &&
(ri->flags & SRI_MASTER))
isdown = 1;
/* Vote for the master (or fetch the previous vote) if the request
* includes a runid, otherwise the sender is not seeking for a vote. */
// argv[5] other than "*" means the sender is asking for a vote.
if (ri && ri->flags & SRI_MASTER && strcasecmp(c->argv[5]->ptr,"*")) {
leader = sentinelVoteLeader(ri,(uint64_t)req_epoch,
c->argv[5]->ptr,
&leader_epoch);
}
/* Reply with a three-elements multi-bulk reply:
* down state, leader, vote epoch. */
addReplyMultiBulkLen(c,3);
// Element 1: whether this sentinel considers the master subjectively down.
addReply(c, isdown ? shared.cone : shared.czero);
// Element 2: the leader run ID, or "*" when there is no vote to report.
addReplyBulkCString(c, leader ? leader : "*");
// Element 3: the epoch of the reported vote.
addReplyLongLong(c, (long long)leader_epoch);
// sentinelVoteLeader returned a fresh copy that must be released here.
if (leader) sdsfree(leader);
}
...
}
我们首先看一下哨兵收到其他哨兵回复后的操作,对于 sentinelVoteLeader
投票环节我们后面再讲。
sentinelReceiveIsMasterDownReply
前面我们说过发送 sentinel is-master-down-by-addr
命令的回调函数为 sentinelReceiveIsMasterDownReply
。我们接下来分析一下收到回复后做了什么。
/* Reply callback for "SENTINEL is-master-down-by-addr" (excerpt). The
 * elided part ("...") sets up ri and r; presumably ri is the sentinel
 * instance passed as privdata and r is the reply -- verify against the
 * full source. */
void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *privdata) {
...
// Only accept a well-formed reply: [integer, string, integer].
if (r->type == REDIS_REPLY_ARRAY && r->elements == 3 &&
r->element[0]->type == REDIS_REPLY_INTEGER &&
r->element[1]->type == REDIS_REPLY_STRING &&
r->element[2]->type == REDIS_REPLY_INTEGER)
{
ri->last_master_down_reply_time = mstime();
// Record whether the replying sentinel reported the master as down:
// 1 sets SRI_MASTER_DOWN on its instance, anything else clears it.
if (r->element[0]->integer == 1) {
ri->flags |= SRI_MASTER_DOWN;
} else {
ri->flags &= ~SRI_MASTER_DOWN;
}
// Store the reported leader run ID and epoch when a vote was included.
if (strcmp(r->element[1]->str,"*")) {
/* If the runid in the reply is not "*" the Sentinel actually
* replied with a vote. */
sdsfree(ri->leader);
// Log only when the reported epoch differs from the one stored.
if ((long long)ri->leader_epoch != r->element[2]->integer)
serverLog(LL_WARNING,
"%s voted for %s %llu", ri->name,
r->element[1]->str,
(unsigned long long) r->element[2]->integer);
ri->leader = sdsnew(r->element[1]->str);
ri->leader_epoch = r->element[2]->integer;
}
}
}
我们可以看到这个函数会进行判断 r->element[0]->integer == 1
,也就是回复命令的那个哨兵是否认为主节点主观下线:如果是,则给该哨兵实例新增状态 SRI_MASTER_DOWN
,否则清除该状态。这就是前面提到的 SRI_MASTER_DOWN 状态的来源。
哨兵选举
我们来看一下进行故障转移的流程。
/* Per-instance handler (excerpt; the "..." stands for elided code). Only
 * the master-specific part that drives O_DOWN detection and failover is
 * shown. */
void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
    ...
    if (ri->flags & SRI_MASTER) {
        /* Refresh the O_DOWN flag from the opinions collected so far. */
        sentinelCheckObjectivelyDown(ri);
        /* If a failover has just been started, ask the other sentinels
         * right away, bypassing the ask-period rate limit. */
        if (sentinelStartFailoverIfNeeded(ri)) {
            sentinelAskMasterStateToOtherSentinels(ri,SENTINEL_ASK_FORCED);
        }
        /* Advance the failover state machine; it returns immediately when
         * no failover is in progress. */
        sentinelFailoverStateMachine(ri);
        /* Regular, rate-limited query of the other sentinels' view of the
         * master. */
        sentinelAskMasterStateToOtherSentinels(ri,SENTINEL_NO_FLAGS);
    }
}
我们首先来看 sentinelStartFailoverIfNeeded
函数,它会判断是否要进行故障转移,它的判断条件有三个:
- 主节点的
flag
已经标记了SRI_O_DOWN
- 主节点没有在执行故障转移
- 如果已经开始故障转移,那么开始时间距离当前时间需要超过
sentinel.conf
文件中的 sentinel failover-timeout
配置项的2倍。下面是代码实现:
/* Start a failover for `master` if all preconditions hold.
 *
 * Returns 1 after calling sentinelStartFailover(), 0 when the failover is
 * not started. The "..." inside the timeout branch stands for code elided
 * from this excerpt. */
int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) {
/* We can't failover if the master is not in O_DOWN state. */
// Condition 1: the master must already be flagged SRI_O_DOWN.
if (!(master->flags & SRI_O_DOWN)) return 0;
/* Failover already in progress? */
// Condition 2: no failover may currently be running for this master.
if (master->flags & SRI_FAILOVER_IN_PROGRESS) return 0;
/* Last failover attempt started too little time ago? */
// Condition 3: at least twice master->failover_timeout must have elapsed
// since failover_start_time.
if (mstime() - master->failover_start_time <
master->failover_timeout*2)
{
...
return 0;
}
sentinelStartFailover(master);
return 1;
}
这三个条件都校验通过说明需要进行故障转移,则会调用 sentinelStartFailover
函数来设置一些进行故障转移的状态。主要将 failover_state
设置为 SENTINEL_FAILOVER_STATE_WAIT_START
同时将 flags
设置 SRI_FAILOVER_IN_PROGRESS
状态表示故障转移开始。
/* Put `master` into the initial failover state.
 *
 * Sets the state to SENTINEL_FAILOVER_STATE_WAIT_START, raises
 * SRI_FAILOVER_IN_PROGRESS, bumps the sentinel's current epoch and uses
 * the new value as the failover epoch, emits "+new-epoch" and
 * "+try-failover", and stamps the start / state-change times. */
void sentinelStartFailover(sentinelRedisInstance *master) {
    serverAssert(master->flags & SRI_MASTER);

    master->flags |= SRI_FAILOVER_IN_PROGRESS;
    master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START;

    /* Every failover attempt runs under a fresh epoch. */
    sentinel.current_epoch += 1;
    master->failover_epoch = sentinel.current_epoch;

    sentinelEvent(LL_WARNING,"+new-epoch",master,"%llu",
        (unsigned long long) sentinel.current_epoch);
    sentinelEvent(LL_WARNING,"+try-failover",master,"%@");

    /* The start time gets a random offset below SENTINEL_MAX_DESYNC. */
    master->failover_start_time = mstime() + rand() % SENTINEL_MAX_DESYNC;
    master->failover_state_change_time = mstime();
}
sentinelVoteLeader
/* Vote for the sentinel `req_runid` as failover leader of `master` in
 * epoch `req_epoch`, or report the vote already recorded.
 *
 * The epoch of the recorded vote is stored in *leader_epoch. The return
 * value is a newly allocated copy of the recorded leader run ID (to be
 * released by the caller with sdsfree), or NULL when no leader is
 * recorded. */
char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) {
    /* Catch up with a newer epoch seen in the request. */
    if (sentinel.current_epoch < req_epoch) {
        sentinel.current_epoch = req_epoch;
        sentinelFlushConfig();
        sentinelEvent(LL_WARNING,"+new-epoch",master,"%llu",
            (unsigned long long) sentinel.current_epoch);
    }

    /* A vote is granted only when the vote recorded for this master is
     * from an older epoch than the request, and our current epoch is not
     * ahead of the request. */
    int grant = master->leader_epoch < req_epoch &&
                sentinel.current_epoch <= req_epoch;

    if (grant) {
        sdsfree(master->leader);
        master->leader = sdsnew(req_runid);
        master->leader_epoch = sentinel.current_epoch;
        sentinelFlushConfig();
        sentinelEvent(LL_WARNING,"+vote-for-leader",master,"%s %llu",
            master->leader, (unsigned long long) master->leader_epoch);

        /* We voted for a sentinel other than ourselves: move our own
         * failover start time to now plus a random offset. */
        if (strcasecmp(master->leader,sentinel.myid) != 0) {
            master->failover_start_time = mstime() + rand() % SENTINEL_MAX_DESYNC;
        }
    }

    *leader_epoch = master->leader_epoch;
    if (master->leader == NULL) return NULL;
    return sdsnew(master->leader);
}
sentinelFailoverStateMachine
这个函数是哨兵故障转移的状态机,从下面的代码可以看到,它会根据 ri->failover_state
不同的状态调用不同的函数。
/* Failover state machine for a master instance: dispatches to the handler
 * matching ri->failover_state. It does nothing unless a failover is in
 * progress, and nothing for states not listed below. */
void sentinelFailoverStateMachine(sentinelRedisInstance *ri) {
    serverAssert(ri->flags & SRI_MASTER);

    if ((ri->flags & SRI_FAILOVER_IN_PROGRESS) == 0) return;

    if (ri->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) {
        sentinelFailoverWaitStart(ri);
    } else if (ri->failover_state == SENTINEL_FAILOVER_STATE_SELECT_SLAVE) {
        sentinelFailoverSelectSlave(ri);
    } else if (ri->failover_state == SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE) {
        sentinelFailoverSendSlaveOfNoOne(ri);
    } else if (ri->failover_state == SENTINEL_FAILOVER_STATE_WAIT_PROMOTION) {
        sentinelFailoverWaitPromotion(ri);
    } else if (ri->failover_state == SENTINEL_FAILOVER_STATE_RECONF_SLAVES) {
        sentinelFailoverReconfNextSlave(ri);
    }
}
转载自:https://juejin.cn/post/7179525164496289852