InfluxDB集群 -- 添加DataNode源码分析

a朋

influxdb集群中,client在node1上执行add dataNode node3的命令:

influxd_ctl add-data node3:8088

整体流程如下:

  • node1收到CLI命令,向自己的8084端口发送GET /add-data请求,节点地址通过URL查询参数传递: node=node3:8088;
  • node1向node3的8088端口发送TCP消息AddDataNode;
  • node3收到AddDataNode后,主动将自己加入集群;

CLI命令处理

命令行解析的代码入口:

// cmd/influxd_ctl/cli/cli.go
// Run dispatches on the command name. Only the "add-data" branch is shown: it
// passes the CommandLine to do_add_data and returns that function's error.
// NOTE(review): cmd is not defined in this excerpt and nothing is returned
// after the switch — presumably the rest of the function was elided; confirm
// against the full file.
func (c *CommandLine) Run() error {
    switch cmd {
    case "add-data":
        return do_add_data(c)
    }
}

向本机的集群管理端口8084,发送GET /add-data:

// do_add_data issues the "add-data" HTTP GET for the node named by c.CMD[1].
// The node address is handed to c.getURL as the "node" parameter; an error
// from http.Get is returned to the caller.
// NOTE(review): c.CMD[1] is indexed without a length check, and the handling
// of the response body is elided ("......") in this excerpt.
func do_add_data(c *CommandLine) error {
    var node string
    node = c.CMD[1]
    // Send the HTTP add-data request to port 8084 on this host.
    url := c.getURL("add-data", map[string]string{"node": node})
    resp, err := http.Get(url)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    body, err := ioutil.ReadAll(resp.Body)
    ......
    return nil
}

node1向node3发送AddDataNode

node1处理GET /add-data的handler:

// services/admin_cluster/handler.go
// ServeHTTP routes admin requests. In this excerpt only GET /add-data is
// handled: it is wrapped via h.WrapHandler under the name "add-data" and
// served by h.addDataNode. Any other method or path is left untouched.
func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
    if r.Method != "GET" {
        return
    }
    if r.URL.Path == "/add-data" {
        wrapped := h.WrapHandler("add-data", h.addDataNode)
        wrapped.ServeHTTP(w, r)
    }
}

构造了1个AddDataNode的request,然后通过TCP发送给node3:

// addDataNode handles the add-data request: it reads the single "node" query
// parameter, sends an AddDataNode request to that node through h.call, and
// reports the outcome with h.sendResponse.
//
// A request that does not carry exactly one "node" value is rejected with
// 400 Bad Request and nothing is sent to any node.
func (h *handler) addDataNode(w http.ResponseWriter, r *http.Request) {
    ps := r.URL.Query()
    if len(ps["node"]) != 1 {
        http.Error(w, "", http.StatusBadRequest)
        // http.Error only writes the reply; without returning here a missing
        // parameter would panic on ps["node"][0] below, and a repeated one
        // would still be forwarded and answered a second time.
        return
    }
    node := ps["node"][0]
    req := Request{AddDataNode, "", 0}
    err := h.call(node, req)
    h.sendResponse(err, w)
}

h.call()发送TCP消息:AddDataNode

// call sends req to node as a TCP message: it dials node with tcp.Dial using
// MuxHeader, JSON-encodes req onto the connection, and then reads the reply
// with h.readResponse. Dial and encode failures are returned wrapped with
// context. The connection is closed when the function returns.
// NOTE(review): the handling of resp/err after readResponse is elided ("...")
// in this excerpt.
func (h *handler) call(node string, req Request) error {
    conn, err := tcp.Dial("tcp", node, MuxHeader)
    if err != nil {
        return fmt.Errorf("tcp dial to node %s failed. %v", node, err)
    }
    defer conn.Close()
    if err = json.NewEncoder(conn).Encode(req); err != nil {
        return fmt.Errorf("encode and send request failed. %v", err)
    }
    // read response
    resp, err := h.readResponse(conn)
    ...
    return nil
}

node3将自己加入集群

node3接收到TCP消息:AddDataNode

// services/admin_cluster/tcphandler.go
// handleConn decodes one JSON-encoded Request from conn and dispatches it by
// type. In this excerpt only AddDataNode is handled, via h.handleAddDataNode.
//
// If the request cannot be decoded, the failure is logged and the request is
// dropped, so a malformed message is never dispatched on a zero-valued Request.
func (h *TCPHandler) handleConn(conn net.Conn) {
    r := &Request{}
    if err := json.NewDecoder(conn).Decode(r); err != nil {
        h.Logger.Info("Decode admin tcp request failed. " + err.Error())
        return
    }
    switch r.Type {
    case AddDataNode:
        h.handleAddDataNode(r, conn)
    }
}

由h.Server.DataServerJoin()处理节点加入集群:

// handleAddDataNode runs h.Server.DataServerJoin and writes a JSON Response
// to conn: an empty Response on success, or one carrying
// "DataServerJoin failed. <err>" on failure. A failure to encode the reply is
// logged and otherwise ignored.
func (h *TCPHandler) handleAddDataNode(r *Request, conn net.Conn) {
    // The zero value is the success reply; it is replaced only when the join fails.
    var reply Response
    if joinErr := h.Server.DataServerJoin(); joinErr != nil {
        reply = Response{fmt.Sprintf("DataServerJoin failed. %v", joinErr)}
    }

    encErr := json.NewEncoder(conn).Encode(reply)
    if encErr != nil {
        h.Logger.Info("Encode admin tcp resposne failed.", zap.Error(encErr))
    }
}

节点加入集群:

  • 由metaClient向集群发送添加DataNode的消息;
  • 本机更新节点信息node.json文件;
// cmd/influxd/run/server.go
// DataServerJoin registers this server as a data node and saves the assigned
// node ID locally.
//
// It returns an error when the data service is disabled in the config or when
// saving the node fails. If the meta client already has a data node for this
// server's data ID, it returns nil without doing anything. Creation is retried
// once per second, without limit, until it succeeds.
func (s *Server) DataServerJoin() (err error) {
    if !s.config.Data.Enabled {
        return fmt.Errorf("Data node is not enabled")
    }

    // If we've already created a data node for our id, we're done.
    if _, lookupErr := s.MetaClient.DataNode(s.Node.GetDataID()); lookupErr == nil {
        return nil
    }

    // Send the request to add a data node to the leader, retrying until it succeeds.
    for {
        created, createErr := s.MetaClient.CreateDataNode(s.HTTPAddr(), s.TCPAddr())
        if createErr == nil {
            s.Node.ID = created.ID
            break
        }
        log.Printf("Unable to create data node. retry in 1s: %s", createErr.Error())
        time.Sleep(time.Second)
    }

    // Update the contents of the local node.json.
    if saveErr := s.Node.Save(); saveErr != nil {
        return saveErr
    }
    return nil
}

metaClient.CreateDataNode()实际是向Leader发送Command_CreateDataNodeCommand:

// CreateDataNode creates a new data node in the metastore. It submits a
// CreateDataNodeCommand carrying the given HTTP and TCP addresses through
// retryUntilExec, then looks the node up by its TCP address, stores its ID as
// the client's nodeID, and returns the node. Any error from either step is
// returned with a nil node.
func (c *Client) CreateDataNode(httpAddr, tcpAddr string) (*NodeInfo, error) {
    createCmd := &internal.CreateDataNodeCommand{
        HTTPAddr: proto.String(httpAddr),
        TCPAddr:  proto.String(tcpAddr),
    }

    execErr := c.retryUntilExec(
        internal.Command_CreateDataNodeCommand,
        internal.E_CreateDataNodeCommand_Command,
        createCmd,
    )
    if execErr != nil {
        return nil, execErr
    }

    // Fetch the node by TCP address to learn the ID it was given.
    node, lookupErr := c.DataNodeByTCPHost(tcpAddr)
    if lookupErr != nil {
        return nil, lookupErr
    }

    c.nodeID = node.ID
    return node, nil
}

Raft Leader收到Command_CreateDataNodeCommand:

// services/meta/store_fsm.go
// Apply unmarshals the raft log entry's Data into an internal.Command and
// dispatches on the command type. It panics if the data cannot be
// unmarshalled. Only the CreateDataNodeCommand case is shown, which returns
// the result of fsm.applyCreateDataNodeCommand.
// NOTE(review): nothing is returned after the switch — presumably the other
// cases and the tail of the function were elided; confirm against the full file.
func (fsm *storeFSM) Apply(l *raft.Log) interface{} {
    var cmd internal.Command
    if err := proto.Unmarshal(l.Data, &cmd); err != nil {
        // NOTE(review): the message says "marshal" although the failing
        // operation is Unmarshal.
        panic(fmt.Errorf("cannot marshal command: %x", l.Data))
    }
    switch cmd.GetType() {
    case internal.Command_CreateDataNodeCommand:
            return fsm.applyCreateDataNodeCommand(&cmd)
    }
}

将新节点更新到状态机:

// applyCreateDataNodeCommand extracts the CreateDataNodeCommand extension from
// cmd, applies it to a clone of the FSM data, and installs the clone as the
// new FSM data. It always returns nil.
func (fsm *storeFSM) applyCreateDataNodeCommand(cmd *internal.Command) interface{} {
    // The lookup error is discarded; the unchecked type assertion below panics
    // if the extension is not a *internal.CreateDataNodeCommand.
    payload, _ := proto.GetExtension(cmd, internal.E_CreateDataNodeCommand_Command)
    createCmd := payload.(*internal.CreateDataNodeCommand)

    // Work on a clone and swap it in afterwards; the current data is not modified here.
    updated := fsm.data.Clone()
    updated.CreateDataNode(createCmd.GetHTTPAddr(), createCmd.GetTCPAddr())
    fsm.data = updated

    return nil
}
阅读 232
10 声望
5 粉丝
0 条评论
10 声望
5 粉丝
文章目录
宣传栏