raft概述
Raft 通过选举出一个领导人,然后给予他全部的管理复制日志的责任来实现一致性。领导人从客户端接收日志条目(log entries),把日志条目复制到其他服务器上,并告诉其他的服务器什么时候可以安全地将日志条目应用到他们的状态机中。
raft存在的三种状态:
- Follower - 每个节点的初始状态
- Candidate - 当Follower的ElectionTime到期时,成为Candidate开始选举
- Leader - 成为Candidate后获取到大多数节点的投票,成为Leader,向其余节点发送心跳包
需要完成的方法:
选举相关
- RequestVote
- startElection
日志相关
- sendHeartbeats
- AppendEntries
- applyLogs
持久化相关
- persist
- readPersist
快照相关
- Snapshot
- InstallSnapshot
初始化
// Make creates a Raft peer and starts its background goroutines.
//
// peers holds the RPC endpoints of all servers (this peer is peers[me]),
// persister stores this peer's persisted state and snapshot, and applyCh is
// the channel on which ApplyMsg values are delivered.
func Make(peers []*labrpc.ClientEnd, me int, persister *Persister, applyCh chan ApplyMsg) *Raft {
	rf := &Raft{}
	rf.peers = peers
	rf.persister = persister
	rf.me = me

	// Volatile and persistent state start from their "fresh boot" values.
	rf.dead = 0
	rf.state = FOLLOWER
	rf.currentTerm = 0
	rf.votedFor = -1
	// Index 0 is a placeholder entry, so real log indices start at 1.
	rf.log = make([]Entry, 1)
	rf.commitIndex = 0
	rf.lastApplied = 0
	rf.lastIncludeTerm = 0
	rf.snapshot = make([]byte, 0)
	rf.nextIndex = make([]int, len(peers))
	for i := range rf.nextIndex {
		rf.nextIndex[i] = 1
	}
	rf.matchIndex = make([]int, len(peers))
	rf.applyCh = applyCh
	rf.applyCond = sync.NewCond(&rf.mu)

	// Initialize from state persisted before a crash. readPersist does not
	// load the snapshot bytes, so they are read here.
	rf.readPersist(persister.ReadRaftState())
	rf.snapshot = rf.persister.ReadSnapshot()

	// Everything covered by the snapshot is already committed and applied.
	rf.lastApplied = rf.lastIncludeIndex
	if rf.commitIndex < rf.lastIncludeIndex {
		rf.commitIndex = rf.lastIncludeIndex
	}

	// Arm the election timer before the ticker starts; with its zero value
	// every peer would time out on the very first tick.
	rf.resetElectionTimer()

	go rf.ticker()
	go rf.applyLogs()
	return rf
}
// ticker polls every 10ms until the peer is killed. A leader whose heartbeat
// timer has expired broadcasts heartbeats; any other peer whose election timer
// has expired starts an election.
func (rf *Raft) ticker() {
	for !rf.killed() {
		now := time.Now()

		// Read state and timers under the lock: RPC handlers modify them
		// concurrently.
		rf.mu.Lock()
		isLeader := rf.state == LEADER
		heartbeatDue := isLeader && now.After(rf.heartbeatTimer)
		electionDue := !isLeader && now.After(rf.electionTimer)
		if heartbeatDue {
			rf.resetHeartbeatTimer()
		}
		rf.mu.Unlock()

		// Both calls below must run without rf.mu held: startElection takes
		// the lock itself.
		switch {
		case heartbeatDue:
			rf.sendHeartbeats()
		case electionDue:
			rf.startElection()
			rf.mu.Lock()
			rf.resetElectionTimer()
			rf.mu.Unlock()
		}

		time.Sleep(10 * time.Millisecond)
	}
}
选举
startElection
// startElection turns this peer into a candidate for a new term, votes for
// itself, and asks every other peer for its vote in parallel. The peer becomes
// leader once it has collected votes from a majority while still being a
// candidate in the term the election was started for.
//
// Must be called without rf.mu held.
func (rf *Raft) startElection() {
	rf.mu.Lock()
	rf.state = CANDIDATE
	rf.currentTerm++
	rf.votedFor = rf.me
	rf.persistStateAndSnapshot()
	// Reset under the lock so the ticker never observes a half-updated timer.
	rf.resetElectionTimer()

	// Snapshot everything the RequestVote arguments need while locked.
	currentTerm := rf.currentTerm
	lastLogIndex := len(rf.log) + rf.lastIncludeIndex - 1
	lastLogTerm := rf.log[len(rf.log)-1].Term
	rf.mu.Unlock()

	// votes is only touched with rf.mu held; it starts at 1 for our own vote.
	votes := 1

	for i := range rf.peers {
		if i == rf.me {
			continue
		}
		go func(server int) {
			args := &RequestVoteArgs{
				Term:         currentTerm,
				CandidateID:  rf.me,
				LastLogIndex: lastLogIndex,
				LastLogItem:  lastLogTerm,
			}
			reply := &RequestVoteReply{}

			// Skip the RPC if the election is already over.
			rf.mu.Lock()
			electionLive := rf.state == CANDIDATE && rf.currentTerm == currentTerm
			rf.mu.Unlock()
			if !electionLive {
				return
			}

			if !rf.sendRequestVote(server, args, reply) {
				return
			}

			rf.mu.Lock()
			defer rf.mu.Unlock()

			// Compare with the live term, not the captured one, so the term
			// can never move backwards.
			if reply.Term > rf.currentTerm {
				rf.currentTerm = reply.Term
				rf.state = FOLLOWER
				rf.votedFor = -1
				rf.persistStateAndSnapshot()
				return
			}

			// Ignore rejections and replies that belong to a finished election.
			if !reply.VoteGranted || rf.state != CANDIDATE || rf.currentTerm != currentTerm {
				return
			}

			votes++
			if votes <= len(rf.peers)/2 {
				return
			}

			// Majority reached: become leader and reinitialize replication state.
			rf.state = LEADER
			for p := range rf.peers {
				rf.nextIndex[p] = len(rf.log) + rf.lastIncludeIndex
				rf.matchIndex[p] = rf.lastIncludeIndex
			}
			rf.persistStateAndSnapshot()
			// Announce leadership immediately. sendHeartbeats is called with
			// rf.mu held here, so it must not lock synchronously.
			rf.sendHeartbeats()
		}(i)
	}
}
- 开始选举后,更新自身的状态和任期,并投票给自己
- 发送RPC投票请求给其他节点,累计得票数大于 `len(rf.peers)/2` 时,成为Leader,并更新自身的 `nextIndex[]` 和 `matchIndex[]`
- 成为Leader后开始发送心跳包
- 若 `reply` 返回比自己更高的任期,则转为Follower,并更新任期
在等待投票的时候,候选人可能会从其他的服务器接收到声明它是领导人的附加条目(AppendEntries)RPC。如果这个领导人的任期号(包含在此次的 RPC中)不小于候选人当前的任期号,那么候选人会承认领导人合法并回到跟随者状态。 (所以成为Leader前会判断是否还是Candidate以及任期是否已经发生变化)如果此次 RPC 中的任期号比自己小,那么候选人就会拒绝这次的 RPC 并且继续保持候选人状态。
RequestVote
// RequestVote is the RPC handler invoked by candidates to gather votes.
//
// The vote is granted when the candidate's term is not stale, this peer has
// not voted for someone else in the current term, and the candidate's log is
// at least as up-to-date as the local log. reply.Term carries this peer's
// current term (or -1 if the peer has been killed).
func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) {
	if rf.killed() {
		reply.Term = -1
		reply.VoteGranted = false
		return
	}

	// The lock must be taken before the deferred Unlock runs; unlocking an
	// unlocked mutex is a fatal runtime error.
	rf.mu.Lock()
	defer rf.mu.Unlock()

	// Our term is newer: reject the stale candidate.
	if args.Term < rf.currentTerm {
		reply.VoteGranted = false
		reply.Term = rf.currentTerm
		return
	}

	// The candidate's term is newer: step down and adopt it first, which
	// also clears any vote cast in the older term.
	if args.Term > rf.currentTerm {
		rf.state = FOLLOWER
		rf.currentTerm = args.Term
		rf.votedFor = -1
		rf.persistStateAndSnapshot()
	}

	lastLocalIndex := rf.lastIncludeIndex + len(rf.log) - 1
	lastLocalTerm := rf.log[len(rf.log)-1].Term

	canVote := rf.votedFor == -1 || rf.votedFor == args.CandidateID
	// Up-to-date check: a higher last term wins; on equal terms the longer
	// (or equal) log wins.
	logUpToDate := args.LastLogItem > lastLocalTerm ||
		(args.LastLogItem == lastLocalTerm && args.LastLogIndex >= lastLocalIndex)

	if canVote && logUpToDate {
		rf.votedFor = args.CandidateID
		rf.state = FOLLOWER
		reply.VoteGranted = true
		// Granting a vote is one of the events that resets the election timer.
		rf.resetElectionTimer()
		rf.persistStateAndSnapshot()
	} else {
		reply.VoteGranted = false
	}
	reply.Term = rf.currentTerm
}
- 参数
- Raft 通过随机选举定时器来阻止选举分裂的发生,即使选举分裂发生也可以很快地被解决。选举超时将在 `[150,300]` ms 之间随机生成,这样大概率保证集群中会有一个机器先超时,避免所有机器同时超时,从而降低选举分裂发生的概率。
- Student's Guide: if you have already voted in the current term, and an incoming `RequestVote` RPC has a higher term than you, you should _first_ step down and adopt their term (thereby resetting `votedFor`), and _then_ handle the RPC, which will result in you granting the vote. 即在收到任期更大的RPC请求时,应该先把自己的 `votedFor` 重置为 -1。

投票规则:
- 若candidate的最新日志的term大于节点最新日志的任期
- 或者两个任期相等,且candidate的最新日志的index大于等于节点最新日志的index
- 在上面的基础上,节点还未投票或者已经投票给该candidate(由于网络原因,在同一任期又给该节点发送了投票请求)
日志相关
sendHeartbeats
// sendHeartbeats starts one goroutine per follower that brings the follower up
// to date: it sends InstallSnapshot when the entries the follower needs have
// already been compacted away, and AppendEntries (possibly with no entries,
// i.e. a pure heartbeat) otherwise.
//
// The synchronous part never takes rf.mu, so the function may be called both
// with and without the lock held. Each goroutine takes the lock to build its
// arguments, releases it for the RPC, and takes it again to process the reply.
func (rf *Raft) sendHeartbeats() {
	// stepDown adopts a newer term seen in a reply. Caller must hold rf.mu.
	stepDown := func(term int) {
		rf.currentTerm = term
		rf.state = FOLLOWER
		rf.votedFor = -1 // a new term means no vote has been cast yet
		rf.persistStateAndSnapshot()
	}

	for i := range rf.peers {
		if i == rf.me {
			continue
		}
		go func(server int) {
			rf.mu.Lock()
			if rf.state != LEADER {
				rf.mu.Unlock()
				return
			}
			term := rf.currentTerm
			lastIndex := rf.lastIncludeIndex + len(rf.log) - 1
			nextIndex := rf.nextIndex[server]
			if nextIndex > lastIndex+1 {
				nextIndex = lastIndex + 1
			}
			prevLogIndex := nextIndex - 1

			// The follower needs entries that only exist in the snapshot.
			if prevLogIndex < rf.lastIncludeIndex {
				snapArgs := &InstallSnapshotArgs{
					LeaderId:         rf.me,
					Term:             term,
					LastIncludeIndex: rf.lastIncludeIndex,
					LastIncludeTerm:  rf.lastIncludeTerm,
					Data:             rf.snapshot,
				}
				rf.mu.Unlock()

				snapReply := &InstallSnapshotReply{}
				if !rf.sendInstallSnapshot(server, snapArgs, snapReply) {
					return
				}

				rf.mu.Lock()
				defer rf.mu.Unlock()
				if snapReply.Term > rf.currentTerm {
					stepDown(snapReply.Term)
					return
				}
				// Stale reply: we are no longer the leader that sent it.
				if rf.state != LEADER || rf.currentTerm != term {
					return
				}
				// Derive progress from the arguments that were sent and
				// never move matchIndex backwards.
				if snapArgs.LastIncludeIndex > rf.matchIndex[server] {
					rf.matchIndex[server] = snapArgs.LastIncludeIndex
				}
				rf.nextIndex[server] = rf.matchIndex[server] + 1
				return
			}

			args := &AppendEntryArgs{
				Term:         term,
				LeaderId:     rf.me,
				PreLogIndex:  prevLogIndex,
				PreLogItem:   rf.log[prevLogIndex-rf.lastIncludeIndex].Term,
				Entries:      nil,
				LeaderCommit: rf.commitIndex,
			}
			if offset := nextIndex - rf.lastIncludeIndex; offset < len(rf.log) {
				// Copy so the RPC layer does not read rf.log while it is
				// being modified after the lock is released.
				args.Entries = append([]Entry(nil), rf.log[offset:]...)
			}
			rf.mu.Unlock()

			reply := &AppendEntryReply{}
			if !rf.sendAppendEntry(server, args, reply) {
				return
			}

			rf.mu.Lock()
			defer rf.mu.Unlock()

			// A newer term exists: this peer is a stale leader.
			if reply.Term > rf.currentTerm {
				stepDown(reply.Term)
				return
			}
			// The term or role changed while the RPC was in flight; the
			// reply is stale, so do nothing.
			if rf.state != LEADER || rf.currentTerm != term {
				return
			}

			if reply.Success {
				// Use the values that were sent, not the current ones, which
				// may have changed since the RPC was issued.
				if match := args.PreLogIndex + len(args.Entries); match > rf.matchIndex[server] {
					rf.matchIndex[server] = match
				}
				rf.nextIndex[server] = rf.matchIndex[server] + 1

				// The median of the sorted match indices is replicated on a
				// majority. The leader's own slot is its last log index.
				sorted := make([]int, len(rf.peers))
				copy(sorted, rf.matchIndex)
				sorted[rf.me] = rf.lastIncludeIndex + len(rf.log) - 1
				sort.Ints(sorted)
				newCommit := sorted[(len(sorted)-1)/2]

				// Only entries from the current term are committed by counting.
				pos := newCommit - rf.lastIncludeIndex
				if newCommit > rf.commitIndex && pos >= 0 && pos < len(rf.log) &&
					rf.log[pos].Term == rf.currentTerm {
					rf.commitIndex = newCommit
					rf.applyCond.Signal()
				}
				return
			}

			// Rejected: back up nextIndex using the follower's conflict hint.
			next := reply.ConflictIndex
			if reply.ConflictTerm != -1 {
				start := args.PreLogIndex - rf.lastIncludeIndex
				if start > len(rf.log)-1 {
					start = len(rf.log) - 1
				}
				// Find our last entry of ConflictTerm; if present, retry
				// right after it. Offsets are converted to absolute indices.
				for pos := start; pos > 0; pos-- {
					if rf.log[pos].Term == reply.ConflictTerm {
						next = pos + rf.lastIncludeIndex + 1
						break
					}
				}
			}
			if next < 1 {
				next = 1
			}
			rf.nextIndex[server] = next
		}(i)
	}
}
易错:在返回Success后先更新 `matchIndex[]`,再由 `matchIndex[]` 更新 `nextIndex[]`,否则测试时可能会报错:
`rf.matchIndex[server] = args.PreLogIndex + len(args.Entries)`
`rf.nextIndex[server] = rf.matchIndex[server] + 1`
A good example of this is setting `matchIndex = nextIndex - 1`, or `matchIndex = len(log)` when you receive a response to an RPC. This is _not_ safe, because both of those values could have been updated since when you sent the RPC. Instead, the correct thing to do is update `matchIndex` to be `prevLogIndex + len(entries[])` from the arguments you sent in the RPC originally.
AppendEntries
// AppendEntries is the RPC handler invoked by the leader to replicate log
// entries; with no entries it serves as a heartbeat.
//
// On rejection because of a log mismatch, reply.ConflictIndex and
// reply.ConflictTerm tell the leader where to retry:
//   - ConflictTerm == -1: no usable entry at PreLogIndex; retry at ConflictIndex.
//   - otherwise: ConflictTerm is the term of the mismatching entry and
//     ConflictIndex is the absolute index of the first retained entry of
//     that term.
func (rf *Raft) AppendEntries(args *AppendEntryArgs, reply *AppendEntryReply) {
	if rf.killed() {
		reply.Term = -1
		reply.Success = false
		return
	}
	rf.mu.Lock()
	defer rf.mu.Unlock()

	reply.Success = false

	// Reject a stale leader.
	if args.Term < rf.currentTerm {
		reply.Term = rf.currentTerm
		return
	}

	// The sender is a legitimate leader: adopt its term if newer, remain (or
	// become) a follower, and reset the election timer.
	if args.Term > rf.currentTerm {
		rf.currentTerm = args.Term
		rf.votedFor = -1
		rf.persistStateAndSnapshot()
	}
	rf.state = FOLLOWER
	rf.resetElectionTimer()
	reply.Term = rf.currentTerm

	// PreLogIndex is already covered by our snapshot: ask the leader to
	// continue right after it.
	if args.PreLogIndex < rf.lastIncludeIndex {
		reply.ConflictIndex = rf.lastIncludeIndex + 1
		reply.ConflictTerm = -1
		return
	}

	// Our log is too short to contain PreLogIndex.
	lastLocalIndex := rf.lastIncludeIndex + len(rf.log) - 1
	if lastLocalIndex < args.PreLogIndex {
		reply.ConflictIndex = lastLocalIndex + 1
		reply.ConflictTerm = -1
		return
	}

	// The entry exists but its term differs. Position 0 is the snapshot
	// boundary and is not checked.
	prevPos := args.PreLogIndex - rf.lastIncludeIndex
	if prevPos > 0 && rf.log[prevPos].Term != args.PreLogItem {
		reply.ConflictTerm = rf.log[prevPos].Term
		// Walk back to the first retained entry of that term, never past
		// position 1, so the hint stays above the snapshot boundary.
		first := prevPos
		for first > 1 && rf.log[first-1].Term == reply.ConflictTerm {
			first--
		}
		reply.ConflictIndex = first + rf.lastIncludeIndex
		return
	}

	// Append new entries. Truncate only at the first real conflict so that a
	// delayed RPC cannot drop entries that were appended later.
	for i := range args.Entries {
		pos := prevPos + 1 + i
		if pos < len(rf.log) && rf.log[pos].Term == args.Entries[i].Term {
			continue
		}
		rf.log = append(rf.log[:pos], args.Entries[i:]...)
		rf.persistStateAndSnapshot()
		break
	}

	// Advance commitIndex to min(LeaderCommit, index of last new entry),
	// never moving it backwards.
	if args.LeaderCommit > rf.commitIndex {
		lastNewIndex := args.PreLogIndex + len(args.Entries)
		if newCommit := min(args.LeaderCommit, lastNewIndex); newCommit > rf.commitIndex {
			rf.commitIndex = newCommit
			rf.applyCond.Signal()
		}
	}
	reply.Success = true
}
- `rf.commitIndex = min(args.LeaderCommit, maxLocalLogIndex)`:取两者中的较小值
- 在Student's Guide中提到:如果追随者的日志中存在 `prevLogIndex` 处的条目,但是任期(term)不匹配,它应该返回 `conflictTerm = log[prevLogIndex].Term`,然后在其日志中搜索任期等于 `conflictTerm` 的第一个条目的索引,作为 `conflictIndex` 返回。
applyLogs
// applyLogs is the long-running applier goroutine. It waits on applyCond until
// commitIndex is ahead of lastApplied, collects the newly committed entries
// while holding the lock, and then delivers them on applyCh without the lock.
func (rf *Raft) applyLogs() {
	for !rf.killed() {
		rf.mu.Lock()
		for rf.lastApplied >= rf.commitIndex {
			rf.applyCond.Wait()
		}

		// Entries up to and including lastIncludeIndex live in the snapshot
		// and must not be applied again; rf.log[0] is only the boundary.
		if rf.lastApplied < rf.lastIncludeIndex {
			rf.lastApplied = rf.lastIncludeIndex
		}

		// Copy what needs applying so the channel send can happen unlocked.
		var pending []ApplyMsg
		for idx := rf.lastApplied + 1; idx <= rf.commitIndex; idx++ {
			pending = append(pending, ApplyMsg{
				CommandValid: true,
				Command:      rf.log[idx-rf.lastIncludeIndex].Command,
				CommandIndex: idx,
			})
			rf.lastApplied = idx
		}
		rf.mu.Unlock()

		// Send outside the lock: holding it during the send can deadlock when
		// the receiver calls back into Raft.
		for _, msg := range pending {
			rf.applyCh <- msg
		}
	}
}
- 在前面的测试中可以在持有锁时 `apply` 日志,但在3D中应在锁外发送 `applyMsg`,否则测试时会因死锁而报错
持久化相关
persist
这是3D的实现。前面的实验中不需要持久化后两个字段(`lastIncludeIndex`、`lastIncludeTerm`),并且把最后的 `rf.persister.Save(state, rf.snapshot)` 中的 `snapshot` 换成 `nil` 就可以。
// persistStateAndSnapshot encodes currentTerm, votedFor, log, lastIncludeIndex
// and lastIncludeTerm (in that order) with labgob and saves the result together
// with the current snapshot through the persister.
func (rf *Raft) persistStateAndSnapshot() {
	buf := new(bytes.Buffer)
	enc := labgob.NewEncoder(buf)
	// The order here defines the on-disk layout that readPersist must follow.
	for _, field := range []interface{}{
		rf.currentTerm,
		rf.votedFor,
		rf.log,
		rf.lastIncludeIndex,
		rf.lastIncludeTerm,
	} {
		enc.Encode(field)
	}
	rf.persister.Save(buf.Bytes(), rf.snapshot)
}
readPersist
func (rf *Raft) readPersist(data []byte) {
if data == nil || len(data) < 1 { // bootstrap without any state?
return
}
// Your code here (3C).
// Example:
r := bytes.NewBuffer(data)
d := labgob.NewDecoder(r)
var currentTerm int
var votedFor int
var log []Entry
var lastIncludedIndex int
var lastIncludedTerm int
var lastApplies int
if d.Decode(¤tTerm) != nil || d.Decode(&votedFor) != nil || d.Decode(&log) != nil || d.Decode(&lastIncludedIndex) != nil || d.Decode(&lastIncludedTerm) != nil ||d.Decode(&lastApplies) != nil{
// fmt.Println("decode error")
} else {
rf.currentTerm = currentTerm
rf.votedFor = votedFor
rf.log = log
rf.lastIncludeIndex = lastIncludedIndex
rf.lastIncludeTerm = lastIncludedTerm
rf.lastApplied =lastApplies
}
}
我在 `readPersist` 中没实现快照的读取,所以在 `Make` 中读取快照
快照相关
快照是最简单的压缩方法。快照方法下,当前整个系统状态将会写到存储在稳定存储介质的一个快照。这样这个点之前的日志就可以删除。
Snapshot
// Snapshot is called by the service to report that it has created a snapshot
// containing all state up to and including index. Raft drops the log entries
// through index, keeps the entry at index as the new boundary entry rf.log[0],
// and persists the trimmed log together with the snapshot.
//
// Requests for an index that is already compacted, not yet committed, or
// outside the log are ignored.
func (rf *Raft) Snapshot(index int, snapshot []byte) {
	if rf.killed() {
		return
	}
	rf.mu.Lock()
	defer rf.mu.Unlock()

	offset := index - rf.lastIncludeIndex
	if offset <= 0 || index > rf.commitIndex || offset >= len(rf.log) {
		return
	}

	// Copy the retained suffix into a fresh slice so the backing array of the
	// discarded prefix can be garbage collected.
	kept := rf.log[offset:]
	rf.log = append(make([]Entry, 0, len(kept)), kept...)

	rf.lastIncludeIndex = index
	rf.lastIncludeTerm = rf.log[0].Term
	rf.snapshot = snapshot

	// Persist while still holding the lock so the encoded state is consistent.
	rf.persistStateAndSnapshot()
}
正常情况下每台机器都是独立地进行快照,但是当 leader 已经删除了下一条需要发送给跟随者的日志记录时,也会把快照发送给那些落后的机器。leader 会使用一种叫做 InstallSnapshot 的新 RPC 把快照拷贝到那些远远落后的机器上。通常快照会包含接收者日志中没有的新信息,这种情况下,跟随者会丢弃整个日志并用快照取代;如果跟随者接收到的快照只覆盖了它日志的一个前缀(例如由于重传),那么被快照覆盖的日志记录可以删除,但是快照之后的日志记录仍然合法,需要被保留。
InstallSnapShot
// InstallSnapShot is the RPC handler invoked by the leader to send a snapshot
// to a follower that has fallen behind the leader's compacted log.
//
// If the snapshot is newer than anything this peer has committed, the peer
// adopts it, trims or discards its log, and delivers the snapshot on applyCh.
// reply.Term carries this peer's current term.
func (rf *Raft) InstallSnapShot(args *InstallSnapshotArgs, reply *InstallSnapshotReply) {
	rf.mu.Lock()

	// Reject a stale leader.
	if rf.currentTerm > args.Term {
		reply.Term = rf.currentTerm
		rf.mu.Unlock()
		return
	}

	if args.Term > rf.currentTerm {
		rf.currentTerm, rf.votedFor = args.Term, -1
	}
	reply.Term = rf.currentTerm
	rf.state = FOLLOWER
	// Hearing from a legitimate leader resets the election timer. The
	// heartbeat timer is only meaningful on a leader.
	rf.resetElectionTimer()

	// Nothing to install: the snapshot is not ahead of what is committed.
	if rf.lastIncludeIndex >= args.LastIncludeIndex || args.LastIncludeIndex <= rf.commitIndex {
		rf.persistStateAndSnapshot()
		rf.mu.Unlock()
		return
	}

	pos := args.LastIncludeIndex - rf.lastIncludeIndex
	if pos < len(rf.log) && rf.log[pos].Term == args.LastIncludeTerm {
		// The snapshot describes a prefix of our log: keep the entries that
		// follow it, in a fresh slice so the old prefix can be collected.
		kept := rf.log[pos:]
		rf.log = append(make([]Entry, 0, len(kept)), kept...)
	} else {
		// The snapshot goes beyond or conflicts with our log: discard the
		// log and keep only a boundary entry.
		rf.log = []Entry{
			{Index: args.LastIncludeIndex, Term: args.LastIncludeTerm},
		}
	}

	rf.lastIncludeIndex = args.LastIncludeIndex
	rf.lastIncludeTerm = args.LastIncludeTerm
	rf.commitIndex = args.LastIncludeIndex
	rf.lastApplied = args.LastIncludeIndex
	rf.snapshot = args.Data
	rf.persistStateAndSnapshot()
	rf.mu.Unlock()

	// Deliver the snapshot outside the lock so a slow receiver cannot block
	// other RPC handlers.
	rf.applyCh <- ApplyMsg{
		SnapshotValid: true,
		Snapshot:      args.Data,
		SnapshotTerm:  args.LastIncludeTerm,
		SnapshotIndex: args.LastIncludeIndex,
	}
}
遇见的问题
- `matchIndex`、`nextIndex` 的更新,在上文中已经提到了(成为Leader时,和Leader收到AppendEntries的reply时)
- 死锁问题:在未使用 `defer rf.mu.Unlock` 的地方,记得在所有退出的地方前都释放锁
- 3D中出现了出界问题,根据报错改就可以了
- 不要忘了读取快照,否则3D测试会报错
- 在每个需持久化的数据更新时,一定要使用 `persist`,不要遗漏

重置选举计时器的时机只有以下三种,不要在其他地方重置选举计时器:
- 从leader 处收到一个AppendEntries RPC(或InstallSnapshot),且 leader 的任期必须大于等于节点的任期。
- 给一个节点投票。
- 当前节点选举超时。
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。