1

简介

相对于udp协议来说,tcp更加复杂。为了给应用层提供可靠性传输,tcp协议引入了标志位,选项,序列号,滑动窗口等特性。

TCP标志位

/* TCP header flag bits. */
#define TCPHDR_FIN 0x01 /* sender has finished sending */
#define TCPHDR_SYN 0x02 /* synchronise sequence numbers; used to open a connection */
#define TCPHDR_RST 0x04 /* reset the connection */
#define TCPHDR_PSH 0x08 /* receiver should hand the segment to the application promptly */
#define TCPHDR_ACK 0x10 /* acknowledgement number is valid */
#define TCPHDR_URG 0x20 /* urgent pointer is valid */
#define TCPHDR_ECE 0x40 /* ECN echo */
#define TCPHDR_CWR 0x80 /* congestion window reduced */

URG 紧急指针(urgent pointer)有效(见20.8节)。
ACK 确认序号有效。
PSH 接收方应该尽快将这个报文段交给应用层。
RST 重建连接。
SYN 同步序号用来发起一个连接。这个标志和下一个标志将在第18章介绍。
FIN 发端完成发送任务。
ECE:ECN响应标志被用来在TCP3次握手时表明一个TCP端是具备ECN功能的,并且表明接收到的TCP包的IP头部的ECN被设置为11。更多信息请参考RFC 3168。
CWR:拥塞窗口减少标志被发送主机设置,用来表明它接收到了设置ECE标志的TCP包。拥塞窗口是被TCP维护的一个内部变量,用来管理发送窗口大小。

当两个支持ECN的TCP端进行TCP连接时,它们交换SYN,SYN-ACK和ACK包。对于支持ECN的TCP端来说,SYN包的ECE和CWR标志都被设置了,而SYN-ACK只设置ECE标志。

tcp状态变迁

连接跟踪会根据tcp交互报文的标志位进行状态跟踪。由于防火墙处于客户端和服务器的中间,所以连接跟踪的状态机与客户端和服务器端并不完全一样。

下图是客户端和服务器端的状态变迁图:
image.png

linux连接跟踪的状态变迁:

/*
 * The TCP state transition table needs a few words...
 *
 * We are the man in the middle. All the packets go through us
 * but might get lost in transit to the destination.
 * It is assumed that the destinations can't receive segments
 * we haven't seen.
 *
 * The checked segment is in window, but our windows are *not*
 * equivalent with the ones of the sender/receiver. We always
 * try to guess the state of the current sender.
 *
 * The meaning of the states are:
 *
 * NONE:    initial state
 * SYN_SENT:    SYN-only packet seen
 * SYN_SENT2:    SYN-only packet seen from reply dir, simultaneous open
 * SYN_RECV:    SYN-ACK packet seen
 * ESTABLISHED:    ACK packet seen
 * FIN_WAIT:    FIN packet seen
 * CLOSE_WAIT:    ACK seen (after FIN)
 * LAST_ACK:    FIN seen (after FIN)
 * TIME_WAIT:    last ACK seen
 * CLOSE:    closed connection (RST)
 *
 * Packets marked as IGNORED (sIG):
 *    if they may be either invalid or valid
 *    and the receiver may send back a connection
 *    closing RST or a SYN/ACK.
 *
 * Packets marked as INVALID (sIV):
 *    if we regard them as truly invalid packets
 */
/*
 * Indexing: tcp_conntracks[direction][packet type][current state] yields
 * the next state. Direction 0 is the ORIGINAL half of the table and
 * direction 1 the REPLY half; the six rows per direction are, in order,
 * syn, synack, fin, ack, rst and none; the columns follow the state order
 * given in the header comment above each row.
 */
static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
    {
/* ORIGINAL */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*syn*/       { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
/*
 *    sNO -> sSS    Initialize a new connection
 *    sSS -> sSS    Retransmitted SYN
 *    sS2 -> sS2    Late retransmitted SYN
 *    sSR -> sIG
 *    sES -> sIG    Error: SYNs in window outside the SYN_SENT state
 *            are errors. Receiver will reply with RST
 *            and close the connection.
 *            Or we are not in sync and hold a dead connection.
 *    sFW -> sIG
 *    sCW -> sIG
 *    sLA -> sIG
 *    sTW -> sSS    Reopened connection (RFC 1122).
 *    sCL -> sSS
 */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR },
/*
 *    sNO -> sIV    Too late and no reason to do anything
 *    sSS -> sIV    Client can't send SYN and then SYN/ACK
 *    sS2 -> sSR    SYN/ACK sent to SYN2 in simultaneous open
 *    sSR -> sSR    Late retransmitted SYN/ACK in simultaneous open
 *    sES -> sIV    Invalid SYN/ACK packets sent by the client
 *    sFW -> sIV
 *    sCW -> sIV
 *    sLA -> sIV
 *    sTW -> sIV
 *    sCL -> sIV
 */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *    sNO -> sIV    Too late and no reason to do anything...
 *    sSS -> sIV    Client might not send FIN in this state:
 *            we enforce waiting for a SYN/ACK reply first.
 *    sS2 -> sIV
 *    sSR -> sFW    Close started.
 *    sES -> sFW
 *    sFW -> sLA    FIN seen in both directions, waiting for
 *            the last ACK.
 *            Might be a retransmitted FIN as well...
 *    sCW -> sLA
 *    sLA -> sLA    Retransmitted FIN. Remain in the same state.
 *    sTW -> sTW
 *    sCL -> sCL
 */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*ack*/       { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
/*
 *    sNO -> sES    Assumed.
 *    sSS -> sIV    ACK is invalid: we haven't seen a SYN/ACK yet.
 *    sS2 -> sIV
 *    sSR -> sES    Established state is reached.
 *    sES -> sES    :-)
 *    sFW -> sCW    Normal close request answered by ACK.
 *    sCW -> sCW
 *    sLA -> sTW    Last ACK detected (RFC5961 challenged)
 *    sTW -> sTW    Retransmitted last ACK. Remain in the same state.
 *    sCL -> sCL
 */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
    },
    {
/* REPLY */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*syn*/       { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sSS, sIV, sS2 },
/*
 *    sNO -> sIV    Never reached.
 *    sSS -> sS2    Simultaneous open
 *    sS2 -> sS2    Retransmitted simultaneous SYN
 *    sSR -> sIV    Invalid SYN packets sent by the server
 *    sES -> sIV
 *    sFW -> sIV
 *    sCW -> sIV
 *    sLA -> sIV
 *    sTW -> sSS    Reopened connection, but server may have switched role
 *    sCL -> sIV
 */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
/*
 *    sSS -> sSR    Standard open.
 *    sS2 -> sSR    Simultaneous open
 *    sSR -> sIG    Retransmitted SYN/ACK, ignore it.
 *    sES -> sIG    Late retransmitted SYN/ACK?
 *    sFW -> sIG    Might be SYN/ACK answering ignored SYN
 *    sCW -> sIG
 *    sLA -> sIG
 *    sTW -> sIG
 *    sCL -> sIG
 */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
/*
 *    sSS -> sIV    Server might not send FIN in this state.
 *    sS2 -> sIV
 *    sSR -> sFW    Close started.
 *    sES -> sFW
 *    sFW -> sLA    FIN seen in both directions.
 *    sCW -> sLA
 *    sLA -> sLA    Retransmitted FIN.
 *    sTW -> sTW
 *    sCL -> sCL
 */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*ack*/       { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
/*
 *    sSS -> sIG    Might be a half-open connection.
 *    sS2 -> sIG
 *    sSR -> sSR    Might answer late resent SYN.
 *    sES -> sES    :-)
 *    sFW -> sCW    Normal close request answered by ACK.
 *    sCW -> sCW
 *    sLA -> sTW    Last ACK detected (RFC5961 challenged)
 *    sTW -> sTW    Retransmitted last ACK.
 *    sCL -> sCL
 */
/*          sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2    */
/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
    }
};

tcp滑动窗口和窗口扩大因子

连接跟踪实现了对tcp传输协议的窗口的检测,通过对报文的发送序列号和应答序列号的检测,过滤掉非法的tcp报文,减轻DDOS攻击。

滑动窗口

image.png
如上图所示,发送端在发送报文时,会根据客户端发布的窗口大小,以及确认的字节数,来判断当前能够发送的最大的报文序列号。从上图来看,发送端最大能发送的合理的序列号为9,最小能发送的序列号是4(可能会重传)。对于正常主机来说,可以限定当前发送的发送序列号在4到9之间即为合法。

​ 防火墙位于客户端和服务器之间,为了实现对seq的过滤,防火墙不能阻塞任意一个由正常主机发送的报文。为了准确的猜出发送端的seq范围,要求客户端和服务器端的所有报文都要通过防火墙。

在论文Real Stateful TCP Packet Filtering in IP Filter中作者定义了如下四个不等式来校验发送序列号和应答序列号:

  • 发送序列号上限值:从上图可知,发送方的发送序列号的上限值是应答方应答的最大序列号+应答方发送的最新窗口值
  • 发送序列号下限值:从上图可知,发送方的发送序列号的下限值是应答方已经应答的最大序列号,但是由于可能存在应答报文经过防火墙后,并没有立即到达请求方,或者应答报文出现了丢失的情况,所以发送方可能会重传一些防火墙认为已经应答的报文,所以发送序列号的下限需要放宽。目前linux实现的下限为发送方已经发送的最大序号减去接收方的最新窗口值
  • 应答序列号上限值:应答序列号是接收方对发送方已经发送的字节的应答,所以其上限必然是发送方已经发送的最大序列号,不能应答发送方没有发送的字节数。
  • 应答序列号下限值:应答序列号的下限值比较难以确定,因为应答报文可能会同时携带发送数据,既然携带发送数据就有可能重传。也就是说,即使接收方已经发送过了一个应答,并且发送方已经接收了该应答,也有可能发送方应答携带数据的应答报文时,出现应答丢失的情况,导致应答方再次发送该应答报文。种种原因应答序列号的下限值放宽到如下:
对方发送的最大字节序列号 - MAXACKWINDOW。其中MAXACKWINDOW的值为65536。在linux实现中使用MAXACKWINCONST,其值为66000。

linux在实现的时候对上面四个不等式进行了调整:

The boundaries and the conditions are changed according to RFC793:
   //报文必须与窗口相交,即报文的最大发送序列号(seq+len)可以大于sender.td_maxend,但是seq必须小于
   //sender.td_maxend。这个与论文不同,论文是seq+len < sender.td_maxend。
   //报文的最大发送序列号(seq+len)必须大于下边界sender.td_end - receiver.td_maxwin。
   the packet must intersect the window (i.e. segments may be
   after the right or before the left edge) and thus receivers may ACK
   segments after the right edge of the window.
    //td_maxend为本端能够发送的最大字节序列号,其等于对端发送的最大的应答序列号 + 当前对端的窗口号
    td_maxend = max(sack + max(win,1)) seen in reply packets
    //td_maxwin为本端的窗口+已经选择性确定的报文字节数,这个有一定的偏差,不是很严谨,但是不会阻塞合法报文。
    td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
    td_maxwin += seq + len - sender.td_maxend
            if seq + len > sender.td_maxend
    //本端发送的最大的序列号。        
    td_end    = max(seq + len) seen in sent packets
   //发送序列号上边界,seq需要小于等于sender.td_maxend
   I.   Upper bound for valid data:    seq <= sender.td_maxend
   //发送序列号下边界,要求报文的最后一个字节的序列号大于sender.td_end - receiver.td_maxwin即可。
   //对论文中进行了边界放宽。
   II.  Lower bound for valid data:    seq + len >= sender.td_end - receiver.td_maxwin
   //应答序列号上边界,小于对端发送的最大的字节序列号,这个与论文中是一致的。
   III.    Upper bound for valid (s)ack:   sack <= receiver.td_end
   //应答序列号的下边界,与论文中是一致的。
   IV.    Lower bound for valid (s)ack:    sack >= receiver.td_end - MAXACKWINDOW
   //sack是所有右边界中最大的一个数值,它会大于等于ack。在没有sack的时候该值等于ack。
   where sack is the highest right edge of sack block found in the packet
   or ack in the case of packet without SACK option.

   The upper bound limit for a valid (s)ack is not ignored -
   we doesn't have to deal with fragments.

数据结构

/* Window-tracking state kept for one direction of a TCP connection. */
struct ip_ct_tcp_state {
    u_int32_t    td_end;        /* max of seq + len: highest end-of-segment sequence number seen from this side, i.e. the sequence number of the first byte it would send next */
    u_int32_t    td_maxend;    /* max of ack + max(win, 1): upper bound on the sequence numbers this side may send, derived from the peer's acknowledgement number plus the peer's window; the 1 admits a one-byte probe when the window is 0 */
    u_int32_t    td_maxwin;    /* max(win): the maximum window seen from this side */
    u_int32_t    td_maxack;    /* max of ack: highest acknowledgement number sent by this side */
    u_int8_t    td_scale;    /* window scale factor */
    u_int8_t    flags;        /* per direction options (IP_CT_TCP_FLAG_* bits) */
};
/* TCP-specific tracking state stored in a conntrack entry. */
struct ip_ct_tcp {
    struct ip_ct_tcp_state seen[2];    /* connection parameters per direction (sequence/window state for each side) */
    u_int8_t    state;        /* state of the connection (enum tcp_conntrack) */
    /* For detecting stale connections */
    u_int8_t    last_dir;    /* Direction of the last packet (enum ip_conntrack_dir) */
    u_int8_t    retrans;    /* Number of retransmitted packets */
    u_int8_t    last_index;    /* Index of the last packet (its flag-set index) */
    u_int32_t    last_seq;    /* Last sequence number seen in dir */
    u_int32_t    last_ack;    /* Last sequence number seen in opposite dir (acknowledgement number of the last packet) */
    u_int32_t    last_end;    /* Last seq + len */
    u_int16_t    last_win;    /* Last window advertisement seen in dir */
    /* For SYN packets while we may be out-of-sync */
    u_int8_t    last_wscale;    /* Last window scaling factor seen */
    u_int8_t    last_flags;    /* Last flags set (TCP option flags of the packet) */
};

初始化

正常情况下,第一个SYN报文到来时会创建会话,进行初始化,初始化后必须让接下来的报文通过序列号检查。接下来的报文正常情况下是SYN/ACK报文,或者是SYN报文重传。还有一种情况是,防火墙设备重启后,丢失了很多会话的信息,会话首包不是SYN包,而是一些中间报文,这种异常情况需要特殊处理。

初始化使用tcp_new函数:

/*
 * segment_seq_plus_len - sequence number just past the end of a segment.
 *
 * @seq:     sequence number of the segment (host byte order)
 * @len:     packet length as supplied by the caller (callers pass skb->len)
 * @dataoff: offset of the TCP header within the packet
 * @tcph:    TCP header of the segment
 *
 * The payload length is len minus the offset of the TCP header minus the
 * TCP header length (doff is counted in 32-bit words). SYN and FIN each
 * occupy one sequence number, so each adds one to the result.
 */
static inline __u32 segment_seq_plus_len(__u32 seq,
                     size_t len,
                     unsigned int dataoff,
                     const struct tcphdr *tcph)
{
    /* XXX Should I use payload length field in IP/IPv6 header ?
     * - YK */
    size_t end = seq + len - dataoff - tcph->doff*4;

    if (tcph->syn)
        end += 1;
    if (tcph->fin)
        end += 1;

    return end;
}

/* Fixme: what about big packets? */
/* Floor used for the width of the acceptable-ACK window (bound IV). */
#define MAXACKWINCONST            66000
/*
 * Width of the acceptable-ACK window for a direction: the larger of that
 * direction's td_maxwin and MAXACKWINCONST. Note that the argument is
 * evaluated more than once.
 */
#define MAXACKWINDOW(sender)                        \
    ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin    \
                          : MAXACKWINCONST)

/* Called when a new connection for this protocol found. */
/* 新连接到来时,调用该函数进行检查 */              
static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
            unsigned int dataoff, unsigned int *timeouts)
{
    enum tcp_conntrack new_state;
    const struct tcphdr *th;
    struct tcphdr _tcph;
    struct net *net = nf_ct_net(ct);
    struct nf_tcp_net *tn = tcp_pernet(net);
    //发送方向状态
    const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
    //接收方向状态
    const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
    //获取tcp头部地址
    th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
    BUG_ON(th == NULL);

    /* Don't need lock here: this conntrack not in circulation yet 获取下一个状态*/
    /* 根据报文的方向(请求方向)和标志位计算tcp的下一个状态,默认首包的起始状态为TCP_CONNTRACK_NONE */
    new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];

    /* Invalid: delete conntrack 不合适状态,直接放弃连接跟踪 */
    if (new_state >= TCP_CONNTRACK_MAX) {
        pr_debug("nf_ct_tcp: invalid new deleting.\n");
        return false;
    }

    //初始化state
    if (new_state == TCP_CONNTRACK_SYN_SENT) {/* 首包是syn报文的话,一般是这个状态,初始化相关信息 */
        //状态清0 
        memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
        /* SYN packet */
        ct->proto.tcp.seen[0].td_end = /* td_end = s + n 即报文的发送序列号 + 报文的长度*/
            segment_seq_plus_len(ntohl(th->seq), skb->len,
                         dataoff, th);
        //本报文携带的窗口信息
        ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
        if (ct->proto.tcp.seen[0].td_maxwin == 0)
            ct->proto.tcp.seen[0].td_maxwin = 1;
        ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end;
        //tcp选项处理。
        tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
    } else if (tn->tcp_loose == 0) {/* 严格情况,直接不让通过,可以通过sysctl进行设置,synproxy会设置该标志 */
        /* Don't try to pick up connections. */
        return false;
    } else {//宽松环境下,这里是异常情况,即首包不是syn报文,而是中间报文,窗口扩大因子选项只能在syn和synack中携带,丢失了很多信息,这里只能简单处理
        memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
        /*
         * We are in the middle of a connection,
         * its history is lost for us.
         * Let's try to use the data from the packet.
         */
        ct->proto.tcp.seen[0].td_end =
            segment_seq_plus_len(ntohl(th->seq), skb->len,
                         dataoff, th);
        ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
        if (ct->proto.tcp.seen[0].td_maxwin == 0)
            ct->proto.tcp.seen[0].td_maxwin = 1;//最小为1,因为窗口为0时需要接受一个字节的持续窗口探测报文。     
        ct->proto.tcp.seen[0].td_maxend =/* td_maxend=ack+win,但是这里还没有收到对方的信息,所以仅仅初始化为 */
            ct->proto.tcp.seen[0].td_end +
            ct->proto.tcp.seen[0].td_maxwin;

        /* We assume SACK and liberal window checking to handle
         * window scaling 设置IP_CT_TCP_FLAG_BE_LIBERAL标志不再处理序列号的问题 */
        ct->proto.tcp.seen[0].flags =
        ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
                          IP_CT_TCP_FLAG_BE_LIBERAL;
    }

    /* tcp_packet will set them,由tcp_packet函数设置该值 */
    ct->proto.tcp.last_index = TCP_NONE_SET;

    pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
         "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
         sender->td_end, sender->td_maxend, sender->td_maxwin,
         sender->td_scale,
         receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
         receiver->td_scale);
    return true;
}

序列号校验

/* 返回0表示报文不接受,返回1表示报文接受 */
static bool tcp_in_window(const struct nf_conn *ct,/* 报文所属的连接跟踪控制块 */
              struct ip_ct_tcp *state,/* 连接跟踪的tcp控制块 */
              enum ip_conntrack_dir dir,/* 报文方向 */
              unsigned int index,/* 报文的标志索引 */
              const struct sk_buff *skb,/* 报文 */
              unsigned int dataoff,/*tcp头偏移*/
              const struct tcphdr *tcph)/* tcp头部指针 */
{
    struct net *net = nf_ct_net(ct);
    struct nf_tcp_net *tn = tcp_pernet(net);
    struct ip_ct_tcp_state *sender = &state->seen[dir];
    struct ip_ct_tcp_state *receiver = &state->seen[!dir];
    const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
    __u32 seq, ack, sack, end, win, swin;
    s32 receiver_offset;
    bool res, in_recv_win;

    /*
     * Get the required data from the packet.
     */
    seq = ntohl(tcph->seq);/* 获取发送序列号 */
    ack = sack = ntohl(tcph->ack_seq);/* 获取应答序列号 */
    win = ntohs(tcph->window);/* 获取窗口大小 */
    /* 计算新的序列号,即end等于最后一个字节的下一个字节的序列号 */
    end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
    /* 如果接收方使能了sack,那么需要处理sack,这里获取sack中最大的有边界值。如果没有sack选项,则sack的值等于ack */
    if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
        tcp_sack(skb, dataoff, tcph, &sack);

    /* Take into account NAT sequence number mangling */
    /* nat可能会导致报文负载发生变化(如alg,synproxy),这里进行序列号调整,获取报文的累计偏移,这里判断的是确认偏移。 */
    receiver_offset = nf_ct_seq_offset(ct, !dir, ack - 1);
    ack -= receiver_offset;/* 得到真实的序列号 */
    sack -= receiver_offset;/* 得到真实的序列号 */

    pr_debug("tcp_in_window: START\n");
    pr_debug("tcp_in_window: ");
    nf_ct_dump_tuple(tuple);
    pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
         seq, ack, receiver_offset, sack, receiver_offset, win, end);
    pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
         "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
         sender->td_end, sender->td_maxend, sender->td_maxwin,
         sender->td_scale,
         receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
         receiver->td_scale);
    
    if (sender->td_maxwin == 0) {/* 窗口为0,可能是syn-ack报文(被动打开),或者syn报文(同时打开),或者会话丢失,中间报文创建的会话 */
        /*
         * Initialize sender data.
         * syn-ack报文,这边是应答方发送的第一个报文,进行应答方初始化。
         */
        if (tcph->syn) {
            /*
             * SYN-ACK in reply to a SYN
             * or SYN from reply direction in simultaneous open.
             */
            sender->td_end =
            sender->td_maxend = end;/* seq + 1 */
            sender->td_maxwin = (win == 0 ? 1 : win);/* 初始化窗口大小,后面会加上窗口扩大因子的影响,+1表示要处理持续报文 */
            /* 处理tcp选项,获取窗口扩大因子 */
            tcp_options(skb, dataoff, tcph, sender);
            /*
             * RFC 1323:
             * Both sides must send the Window Scale option
             * to enable window scaling in either direction.
             * 只有双方都发送了窗口扩大选项,才能开启窗口扩大功能
             */
            if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
                  && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
                sender->td_scale =/* 关闭窗口扩大因子 */
                receiver->td_scale = 0;
            //没有ack说明同时打开,接受。
            if (!tcph->ack)
                return true;
        } else {//没有syn标志,说明是中间报文,由于窗口扩大因子只能在syn报文中携带,会话丢失历史消息,不再校验。
            /*
             * We are in the middle of a connection,
             * its history is lost for us.
             * Let's try to use the data from the packet.
             */
            sender->td_end = end;
            //窗口扩大调整
            swin = win << sender->td_scale;
            sender->td_maxwin = (swin == 0 ? 1 : swin);
            sender->td_maxend = end + sender->td_maxwin;
            /*
             * We haven't seen traffic in the other direction yet
             * but we have to tweak window tracking to pass III
             * and IV until that happens.
             */
            if (receiver->td_maxwin == 0)
                receiver->td_end = receiver->td_maxend = sack;
        }
    } else if (((state->state == TCP_CONNTRACK_SYN_SENT//syn报文重传
             && dir == IP_CT_DIR_ORIGINAL)
           || (state->state == TCP_CONNTRACK_SYN_RECV//syn-ack重传
             && dir == IP_CT_DIR_REPLY))
           && after(end, sender->td_end)) {
        /*
         * RFC 793: "if a TCP is reinitialized ... then it need
         * not wait at all; it must only be sure to use sequence
         * numbers larger than those recently used."
         */
        sender->td_end =
        sender->td_maxend = end;
        sender->td_maxwin = (win == 0 ? 1 : win);

        tcp_options(skb, dataoff, tcph, sender);
    }

    if (!(tcph->ack)) {
        /*
         * If there is no ACK, just pretend it was set and OK.
         * 报文没有ack标志,也就是ack序列号无效,这里直接获取对端的最大发送序列号作为应答序列号。
         */
        ack = sack = receiver->td_end;
    } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==/* rst包 */
            (TCP_FLAG_ACK|TCP_FLAG_RST))
           && (ack == 0)) {
        /*
         * Broken TCP stacks, that set ACK in RST packets as well
         * with zero ack value.
         */
        ack = sack = receiver->td_end;
    }
    /* 第一个reset报文,即客户端发送请求,服务器端收到第一个报,立即发送的rst报文 */
    if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)
        /*
         * RST sent answering SYN.
         */
        seq = end = sender->td_end;

    pr_debug("tcp_in_window: ");
    nf_ct_dump_tuple(tuple);
    pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
         seq, ack, receiver_offset, sack, receiver_offset, win, end);
    pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
         "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
         sender->td_end, sender->td_maxend, sender->td_maxwin,
         sender->td_scale,
         receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
         receiver->td_scale);

    /* Is the ending sequence in the receive window (if available)? */
    /* 判断等式(II)是否满足,即发送序列号的下边界 */
    in_recv_win = !receiver->td_maxwin ||
              after(end, sender->td_end - receiver->td_maxwin - 1);/* end > td_end -td_maxwin */
    //|--------------|
    //               td_end

    pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
         before(seq, sender->td_maxend + 1),
         (in_recv_win ? 1 : 0),
         before(sack, receiver->td_end + 1),
         after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));

    if (before(seq, sender->td_maxend + 1) && // 检查发送序列号上边界,等式(I)
        in_recv_win &&                         //检查发送序列号下边界。等式(II)
        before(sack, receiver->td_end + 1) &&  // 检查应答序列号上边界,等式(III)
        after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {// 检查应答序列号下边界,等式(IV)
        /*  |---------------------------| */
        //  |----sack-------------------|td_end
        /*
         * Take into account window scaling (RFC 1323).
         * 根据发送方发布的窗口和窗口缩放系数计算真实的发送方的接收端口
         */
        if (!tcph->syn)
            win <<= sender->td_scale;

        /*
         * Update sender data.
         * 更新发送方数据。
         */
        // 本端选择性确认了一些报文,说明本端可以接收更多字节。
        swin = win + (sack - ack);
        if (sender->td_maxwin < swin)/* 窗口更新 */
            sender->td_maxwin = swin;
        if (after(end, sender->td_end)) {/* 更新本端已经发送的最大的字节序列号 */
            sender->td_end = end;
            sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;/* 发送端新增未被确认的字节 */
        }
        
        if (tcph->ack) {/* 存在确认标志,则需要更新最大确认序列号 */
            if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
                sender->td_maxack = ack;
                sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
            } else if (after(ack, sender->td_maxack))/* 本次确认了新的数据,更新本地最大确认序列号 */
                sender->td_maxack = ack;/* 更新本端已经确认收到对端的报文字节数 */
        }

        /*
         * Update receiver data.
         * 更新对端数据
         */
        // 这里没看明白,本次发送超过了最大允许发送的字节,为什么要刷新接收方的窗口呢?
        if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)
            receiver->td_maxwin += end - sender->td_maxend;
        // 计算td_maxend。td_maxend接收端的。本次确认的序列号,加上本端发布的窗口,就是对端能发送的最大序列号。
        if (after(sack + win, receiver->td_maxend - 1)) {/* 本次报文sack确认的序列号 */
            receiver->td_maxend = sack + win;/* td_maxend = ack + win */
            if (win == 0)//在窗口为0时,持续定时器的探测报文1字节允许通过,所以这里加1
                receiver->td_maxend++;
        }
        
        if (ack == receiver->td_end)/* 本次发送的报文已经确认对端发送的报文已经全部收到 */
            receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;/* 对端发送的报文已经被本段全部确认 */

        /*
         * Check retransmissions.
         * 检查重传
         */
        if (index == TCP_ACK_SET) {
            if (state->last_dir == dir/* 方向相同 */
                && state->last_seq == seq/* 序列号相同 */
                && state->last_ack == ack
                && state->last_end == end
                && state->last_win == win)
                state->retrans++;//上一个报文的重传报文,累计
            else {//非重传报文,刷新上一次纪录。
                state->last_dir = dir;
                state->last_seq = seq;
                state->last_ack = ack;
                state->last_end = end;
                state->last_win = win;
                state->retrans = 0;
            }
        }
        res = true;
    } else {//如果seq大于上边界,即报文没有与窗口相交,则有误。
        res = false;
        if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
            tn->tcp_be_liberal)//中间报文构建的会话,丢失了一些历史信息,不进行校验。
            res = true;
        if (!res) {
            nf_ct_l4proto_log_invalid(skb, ct,
            "%s",
            before(seq, sender->td_maxend + 1) ?  //值为0打印 "SEQ is over the upper bound (over the window of the receiver)"
            in_recv_win ?                         //值为0打印     "SEQ is under the lower bound (already ACKed data retransmitted)"
            before(sack, receiver->td_end + 1) ?  //值为0打印     "ACK is over the upper bound (ACKed data not seen yet)"
            after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" //值为0打印  "ACK is under the lower bound (possible overly delayed ACK)"
            : "ACK is under the lower bound (possible overly delayed ACK)"    //值全为1打印“BUG”
            : "ACK is over the upper bound (ACKed data not seen yet)"
            : "SEQ is under the lower bound (already ACKed data retransmitted)"
            : "SEQ is over the upper bound (over the window of the receiver)");
        }
    }

    pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
         "receiver end=%u maxend=%u maxwin=%u\n",
         res, sender->td_end, sender->td_maxend, sender->td_maxwin,
         receiver->td_end, receiver->td_maxend, receiver->td_maxwin);

    return res;
}

实现

连接跟踪控制块

/*
 * Layer-4 protocol descriptor for TCP over PF_INET: registers the TCP
 * handlers (tuple extraction/inversion, packet processing, new-connection
 * setup, error checking, ...) as the callbacks of this protocol.
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
{
    .l3proto        = PF_INET,
    .l4proto         = IPPROTO_TCP,
    .pkt_to_tuple         = tcp_pkt_to_tuple,
    .invert_tuple         = tcp_invert_tuple,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
    .print_conntrack     = tcp_print_conntrack,
#endif
    .packet         = tcp_packet,
    .get_timeouts        = tcp_get_timeouts,
    .new             = tcp_new,
    .error            = tcp_error,
    .can_early_drop        = tcp_can_early_drop,/* whether the conntrack entry may be reclaimed early */
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
    .to_nlattr        = tcp_to_nlattr,
    .from_nlattr        = nlattr_to_tcp,
    .tuple_to_nlattr    = nf_ct_port_tuple_to_nlattr,
    .nlattr_to_tuple    = nf_ct_port_nlattr_to_tuple,
    .nlattr_tuple_size    = tcp_nlattr_tuple_size,
    .nlattr_size        = TCP_NLATTR_SIZE,
    .nla_policy        = nf_ct_port_nla_policy,
#endif
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
    .ctnl_timeout        = {
        .nlattr_to_obj    = tcp_timeout_nlattr_to_obj,
        .obj_to_nlattr    = tcp_timeout_obj_to_nlattr,
        .nlattr_max    = CTA_TIMEOUT_TCP_MAX,
        .obj_size    = sizeof(unsigned int) *
                    TCP_CONNTRACK_TIMEOUT_MAX,
        .nla_policy    = tcp_timeout_nla_policy,
    },
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
    .init_net        = tcp_init_net,
    .get_net_proto        = tcp_get_net_proto,
};

常见标志

/* Window scaling is advertised by the sender */
#define IP_CT_TCP_FLAG_WINDOW_SCALE        0x01   // window scaling supported

/* SACK is permitted by the sender */
#define IP_CT_TCP_FLAG_SACK_PERM        0x02

/* This sender sent FIN first */
#define IP_CT_TCP_FLAG_CLOSE_INIT        0x04

/* Be liberal in window checking (set when the entry was created from a
 * mid-stream packet and the connection history is unknown) */
#define IP_CT_TCP_FLAG_BE_LIBERAL        0x08

/* Has unacknowledged data: not everything this side sent has been
 * acknowledged yet */
#define IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED    0x10

/* The field td_maxack has been set */
#define IP_CT_TCP_FLAG_MAXACK_SET        0x20

/* Marks possibility for expected RFC5961 challenge ACK */
#define IP_CT_EXP_CHALLENGE_ACK         0x40

/* Simultaneous open initialized */
#define IP_CT_TCP_SIMULTANEOUS_OPEN        0x80

/*
 * A flags byte paired with a mask byte.
 * NOTE(review): presumably the mask selects which bits of flags apply -
 * confirm against the users of this struct.
 */
struct nf_ct_tcp_flags {
    __u8 flags;
    __u8 mask;
};

tcp_error

该函数是在连接跟踪过程中第一个调用的函数,用于错误检查,主要是头部合法性校验,flag标志位校验,校验码校验。

/*
 * tcp_error - protect conntrack against broken packets.
 * Code taken from ipt_unclean.c.
 *
 * Checks, in order: that a full basic TCP header can be read, that the
 * header length field is sane and fits in the packet, the checksum
 * (only when net->ct.sysctl_checksum is set and the packet is seen at
 * NF_INET_PRE_ROUTING), and that the flag combination is listed in
 * tcp_valid_flags.
 *
 * Returns NF_ACCEPT when every check passes, -NF_ACCEPT after logging
 * the reason otherwise.
 */
static int tcp_error(struct net *net, struct nf_conn *tmpl,
             struct sk_buff *skb,
             unsigned int dataoff,
             u_int8_t pf,
             unsigned int hooknum)
{
    struct tcphdr hdr_buf;
    const struct tcphdr *th;
    unsigned int seg_len = skb->len - dataoff;
    unsigned int hdr_len;
    u_int8_t flags;

    /* Smaller than minimal TCP header? */
    th = skb_header_pointer(skb, dataoff, sizeof(hdr_buf), &hdr_buf);
    if (!th) {
        tcp_error_log(skb, net, pf, "short packet");
        return -NF_ACCEPT;
    }

    /* Not whole TCP header or malformed packet */
    hdr_len = th->doff*4;
    if (hdr_len < sizeof(struct tcphdr) || seg_len < hdr_len) {
        tcp_error_log(skb, net, pf, "truncated packet");
        return -NF_ACCEPT;
    }

    /* Checksum invalid? Ignore.
     * We skip checking packets on the outgoing path
     * because the checksum is assumed to be correct.
     */
    /* FIXME: Source route IP option packets --RR */
    if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING) {
        if (nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
            tcp_error_log(skb, net, pf, "bad checksum");
            return -NF_ACCEPT;
        }
    }

    /* Check TCP flags: ECE, CWR and PSH never affect validity, so drop
     * them before looking the combination up. */
    flags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
    if (tcp_valid_flags[flags])
        return NF_ACCEPT;

    tcp_error_log(skb, net, pf, "invalid tcp flag combination");
    return -NF_ACCEPT;
}
/* table of valid flag combinations - PUSH, ECE and CWR are always valid */
/* Indexed by the flag byte with PSH/ECE/CWR cleared; a non-zero entry
 * marks an accepted combination, every combination not listed is 0. */
static const u8 tcp_valid_flags[(TCPHDR_FIN|TCPHDR_SYN|TCPHDR_RST|TCPHDR_ACK|
                 TCPHDR_URG) + 1] =
{
    [TCPHDR_SYN]                = 1,// a SYN may carry the SYN flag alone
    [TCPHDR_SYN|TCPHDR_URG]            = 1,
    [TCPHDR_SYN|TCPHDR_ACK]            = 1,// SYN+ACK carries both flags
    [TCPHDR_RST]                = 1,// an RST may carry the RST flag alone
    [TCPHDR_RST|TCPHDR_ACK]            = 1,// an RST may also acknowledge, i.e. carry ACK
    [TCPHDR_FIN|TCPHDR_ACK]            = 1,// a FIN is only accepted together with ACK
    [TCPHDR_FIN|TCPHDR_ACK|TCPHDR_URG]    = 1,
    [TCPHDR_ACK]                = 1,// ordinary mid-connection segments may carry ACK alone
    [TCPHDR_ACK|TCPHDR_URG]            = 1,
};

tcp_pkt_to_tuple

获取报文的源目的端口,填充到tuple中。

static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
                 struct net *net, struct nf_conntrack_tuple *tuple)
{
    const struct tcphdr *hp;
    struct tcphdr _hdr;

    /* Actually only need first 4 bytes to get ports. */
    hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
    if (hp == NULL)
        return false;

    tuple->src.u.tcp.port = hp->source;
    tuple->dst.u.tcp.port = hp->dest;

    return true;
}

tcp_invert_tuple

求反向路径的传输层tuple信息,将报文的源目的端口交换即可。

/*
 * tcp_invert_tuple - build the transport-layer part of the reverse tuple.
 *
 * Writes orig's destination port into tuple's source port and orig's
 * source port into tuple's destination port. Always returns true.
 */
static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
                 const struct nf_conntrack_tuple *orig)
{
    tuple->src.u.tcp.port = orig->dst.u.tcp.port;
    tuple->dst.u.tcp.port = orig->src.u.tcp.port;
    return true;
}

tcp_options

简化版tcp选项处理函数,重点关注sack选项和窗口缩放因子选项

/*
 * Simplified tcp_parse_options routine from tcp_input.c.
 *
 * Walks the TCP options of a segment and records in @state the only two
 * options handled here: SACK-permitted and window scale.
 *
 * @skb:     packet being inspected
 * @dataoff: offset of the TCP header inside @skb
 * @tcph:    TCP header; its doff field gives the total header length
 * @state:   receives td_scale and flags; both are reset to 0 before the
 *           options are walked
 *
 * @state is left untouched when the header carries no options or when the
 * option bytes cannot be read from @skb.
 */
static void tcp_options(const struct sk_buff *skb,
            unsigned int dataoff,
            const struct tcphdr *tcph,
            struct ip_ct_tcp_state *state)
{
    unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
    const unsigned char *ptr;
    int length = (tcph->doff*4) - sizeof(struct tcphdr);

    if (!length)
        return;

    ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
                 length, buff);
    /* Unreadable option bytes are no reason to crash the kernel: bail out. */
    if (!ptr)
        return;

    state->td_scale = 0;
    state->flags = 0;

    while (length > 0) {
        int opcode = *ptr++;
        int opsize;

        switch (opcode) {
        case TCPOPT_EOL:
            return;
        case TCPOPT_NOP:    /* Ref: RFC 793 section 3.1 */
            length--;
            continue;
        default:
            if (length < 2)
                return;
            opsize = *ptr++;
            if (opsize < 2) /* "silly options" */
                return;
            if (opsize > length)
                return;    /* don't parse partial options */

            if (opcode == TCPOPT_SACK_PERM
                && opsize == TCPOLEN_SACK_PERM) {
                /* Sender announced SACK support. */
                state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
            } else if (opcode == TCPOPT_WINDOW
                   && opsize == TCPOLEN_WINDOW) {
                /* Window scale option: one byte of shift count. */
                state->td_scale = *(u_int8_t *)ptr;

                /* Clamp the announced factor to the supported maximum. */
                if (state->td_scale > TCP_MAX_WSCALE)
                    state->td_scale = TCP_MAX_WSCALE;

                state->flags |=
                    IP_CT_TCP_FLAG_WINDOW_SCALE;
            }
            /* Skip the option payload (kind and length already consumed). */
            ptr += opsize - 2;
            length -= opsize;
        }
    }
}

/*
 * Scan the TCP options of a segment for a SACK option and raise *@sack to
 * the largest right edge found among its blocks.
 *
 * @skb:     packet being inspected
 * @dataoff: offset of the TCP header inside @skb
 * @tcph:    TCP header; its doff field gives the total header length
 * @sack:    in/out; only updated by a right edge that is after() its
 *           current value
 *
 * Only the first well-formed SACK option is examined. *@sack is left
 * untouched when there are no options, when the option bytes cannot be read
 * from @skb, or when no SACK option is present.
 */
static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
                     const struct tcphdr *tcph, __u32 *sack)
{
    unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
    const unsigned char *ptr;
    int length = (tcph->doff*4) - sizeof(struct tcphdr);
    __u32 tmp;

    if (!length)
        return;

    /* Get a pointer to the start of the options. */
    ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
                 length, buff);
    /* Unreadable option bytes are no reason to crash the kernel: bail out. */
    if (!ptr)
        return;

    /* Fast path for timestamp-only option */
    if (length == TCPOLEN_TSTAMP_ALIGNED
        && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
                       | (TCPOPT_NOP << 16)
                       | (TCPOPT_TIMESTAMP << 8)
                       | TCPOLEN_TIMESTAMP))
        return;

    while (length > 0) {
        int opcode = *ptr++;
        int opsize, i;

        switch (opcode) {
        case TCPOPT_EOL:
            return;
        case TCPOPT_NOP:    /* Ref: RFC 793 section 3.1 */
            length--;
            continue;
        default:
            if (length < 2)
                return;
            opsize = *ptr++;
            if (opsize < 2) /* "silly options" */
                return;
            if (opsize > length)
                return;    /* don't parse partial options */

            /* A SACK option must hold a whole number (>= 1) of blocks. */
            if (opcode == TCPOPT_SACK
                && opsize >= (TCPOLEN_SACK_BASE
                      + TCPOLEN_SACK_PERBLOCK)
                && !((opsize - TCPOLEN_SACK_BASE)
                 % TCPOLEN_SACK_PERBLOCK)) {
                for (i = 0;
                     i < (opsize - TCPOLEN_SACK_BASE);
                     i += TCPOLEN_SACK_PERBLOCK) {
                    /* Second 32-bit word of the block: its right edge. */
                    tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
                    /* Keep the largest right edge seen so far. */
                    if (after(tmp, *sack))
                        *sack = tmp;
                }
                return;
            }
            /* Skip the option payload (kind and length already consumed). */
            ptr += opsize - 2;
            length -= opsize;
        }
    }
}

tcp_packet

对报文进行合法性校验,状态转换和状态合法性检查,窗口合法性检查。

/* Returns verdict for packet, or -1 for invalid. */
/* 返回报文的裁决,-1表示非法 */
static int tcp_packet(struct nf_conn *ct,
              const struct sk_buff *skb,
              unsigned int dataoff,
              enum ip_conntrack_info ctinfo,
              unsigned int *timeouts)
{
    struct net *net = nf_ct_net(ct);
    struct nf_tcp_net *tn = tcp_pernet(net);
    struct nf_conntrack_tuple *tuple;
    enum tcp_conntrack new_state, old_state;
    enum ip_conntrack_dir dir;
    const struct tcphdr *th;
    struct tcphdr _tcph;
    unsigned long timeout;
    unsigned int index;

    th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
    BUG_ON(th == NULL);

    spin_lock_bh(&ct->lock);
    old_state = ct->proto.tcp.state;
    dir = CTINFO2DIR(ctinfo);
    index = get_conntrack_index(th);
    /* 获取tcp状态机的下一个状态 */
    new_state = tcp_conntracks[dir][index][old_state];
    tuple = &ct->tuplehash[dir].tuple;

    switch (new_state) {
    case TCP_CONNTRACK_SYN_SENT:/* syn报文,重传syn */
        if (old_state < TCP_CONNTRACK_TIME_WAIT)//重新打开,特殊处理。
            break;
        /* RFC 1122: "When a connection is closed actively,
         * it MUST linger in TIME-WAIT state for a time 2xMSL
         * (Maximum Segment Lifetime). However, it MAY accept
         * a new SYN from the remote TCP to reopen the connection
         * directly from TIME-WAIT state, if..."
         * We ignore the conditions because we are in the
         * TIME-WAIT state anyway.
         *
         * Handle aborted connections: we and the server
         * think there is an existing connection but the client
         * aborts it and starts a new one.
         */
        if (((ct->proto.tcp.seen[dir].flags
              | ct->proto.tcp.seen[!dir].flags)
             & IP_CT_TCP_FLAG_CLOSE_INIT)//双方主动发送
            || (ct->proto.tcp.last_dir == dir
                && ct->proto.tcp.last_index == TCP_RST_SET)) {
            /* Attempt to reopen a closed/aborted connection.
             * Delete this connection and look up again. */
            spin_unlock_bh(&ct->lock);

            /* Only repeat if we can actually remove the timer.
             * Destruction may already be in progress in process
             * context and we must give it a chance to terminate.
             */
            if (nf_ct_kill(ct))
                return -NF_REPEAT;//这里返回REPEAT,重新进行会话创建。
            return NF_DROP;
        }
        /* Fall through */
    case TCP_CONNTRACK_IGNORE:
        /* Ignored packets:
         *
         * Our connection entry may be out of sync, so ignore
         * packets which may signal the real connection between
         * the client and the server.
         *
         * a) SYN in ORIGINAL
         * b) SYN/ACK in REPLY
         * c) ACK in reply direction after initial SYN in original.
         *
         * If the ignored packet is invalid, the receiver will send
         * a RST we'll catch below.
         */
        if (index == TCP_SYNACK_SET
            && ct->proto.tcp.last_index == TCP_SYN_SET
            && ct->proto.tcp.last_dir != dir
            && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {/* syn报文的应答,这个不应该是TCP_CONNTRACK_IGNORE状态啊? */
            /* b) This SYN/ACK acknowledges a SYN that we earlier
             * ignored as invalid. This means that the client and
             * the server are both in sync, while the firewall is
             * not. We get in sync from the previously annotated
             * values.
             * 此SYN/ACK确认了一个我们先前忽略为无效的SYN。这意味着客户机和服务器都是同步的,
             * 而防火墙则不是同步的。我们与以前注释过的值同步。更新防火墙的状态为正确的状态。
             */
            old_state = TCP_CONNTRACK_SYN_SENT;
            new_state = TCP_CONNTRACK_SYN_RECV;
            ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
                ct->proto.tcp.last_end;
            ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
                ct->proto.tcp.last_end;
            ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
                ct->proto.tcp.last_win == 0 ?
                    1 : ct->proto.tcp.last_win;
            ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
                ct->proto.tcp.last_wscale;
            ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
            ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
                ct->proto.tcp.last_flags;
            memset(&ct->proto.tcp.seen[dir], 0,
                   sizeof(struct ip_ct_tcp_state));
            break;
        }
        ct->proto.tcp.last_index = index;/* 更新上一个报文的标志索引 */
        ct->proto.tcp.last_dir = dir;/* 更新报文方向 */
        ct->proto.tcp.last_seq = ntohl(th->seq);/* 更新发送序列号 */
        ct->proto.tcp.last_end =
            segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);/* 更新长度与序列号之和,可以反推出上一个报文的长度 */
        ct->proto.tcp.last_win = ntohs(th->window);

        /* a) This is a SYN in ORIGINAL. The client and the server
         * may be in sync but we are not. In that case, we annotate
         * the TCP options and let the packet go through. If it is a
         * valid SYN packet, the server will reply with a SYN/ACK, and
         * then we'll get in sync. Otherwise, the server potentially
         * responds with a challenge ACK if implementing RFC5961.
         * 这是请求方向的SYN。客户端和服务器可能是同步的,但我们不是。
         * 在这种情况下,我们注释TCP选项并让数据包通过。如果它是
         * 一个有效的SYN数据包,服务器将使用SYN/ACK进行应答,然后
         * 我们将同步。
         */
        if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
            struct ip_ct_tcp_state seen = {};

            ct->proto.tcp.last_flags =
            ct->proto.tcp.last_wscale = 0;
            tcp_options(skb, dataoff, th, &seen);
            if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
                ct->proto.tcp.last_flags |=
                    IP_CT_TCP_FLAG_WINDOW_SCALE;
                ct->proto.tcp.last_wscale = seen.td_scale;
            }
            if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
                ct->proto.tcp.last_flags |=
                    IP_CT_TCP_FLAG_SACK_PERM;
            }
            /* Mark the potential for RFC5961 challenge ACK,
             * this pose a special problem for LAST_ACK state
             * as ACK is intrepretated as ACKing last FIN.
             */
            if (old_state == TCP_CONNTRACK_LAST_ACK)
                ct->proto.tcp.last_flags |=
                    IP_CT_EXP_CHALLENGE_ACK;
        }
        spin_unlock_bh(&ct->lock);
        nf_ct_l4proto_log_invalid(skb, ct, "invalid packet ignored in "
                      "state %s ", tcp_conntrack_names[old_state]);
        return NF_ACCEPT;
    case TCP_CONNTRACK_MAX:
        /* Special case for SYN proxy: when the SYN to the server or
         * the SYN/ACK from the server is lost, the client may transmit
         * a keep-alive packet while in SYN_SENT state. This needs to
         * be associated with the original conntrack entry in order to
         * generate a new SYN with the correct sequence number.
         * SYN代理的特例:当SYN到服务器或服务器的SYN/ACK丢失时,客户端可
         * 以在SYN_SENT状态下发送一个保持活动的数据包。这需要与原始连接项
         * 相关联,以便生成具有正确序列号的新SYN。
         */
        if (nfct_synproxy(ct) && old_state == TCP_CONNTRACK_SYN_SENT &&
            index == TCP_ACK_SET && dir == IP_CT_DIR_ORIGINAL &&
            ct->proto.tcp.last_dir == IP_CT_DIR_ORIGINAL &&
            ct->proto.tcp.seen[dir].td_end - 1 == ntohl(th->seq)) {
            pr_debug("nf_ct_tcp: SYN proxy client keep alive\n");
            spin_unlock_bh(&ct->lock);
            return NF_ACCEPT;
        }

        /* Invalid packet */
        pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
             dir, get_conntrack_index(th), old_state);
        spin_unlock_bh(&ct->lock);
        nf_ct_l4proto_log_invalid(skb, ct, "invalid state");
        return -NF_ACCEPT;
    case TCP_CONNTRACK_TIME_WAIT:/* 客户端收到对端的fin报文后,发送ack后进入该状态,所以本报文一定是一个ack报文 */
        /* RFC5961 compliance cause stack to send "challenge-ACK"
         * e.g. in response to spurious SYNs.  Conntrack MUST
         * not believe this ACK is acking last FIN.
         */
        if (old_state == TCP_CONNTRACK_LAST_ACK &&
            index == TCP_ACK_SET &&
            ct->proto.tcp.last_dir != dir &&
            ct->proto.tcp.last_index == TCP_SYN_SET &&
            (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
            /* Detected RFC5961 challenge ACK */
            ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
            spin_unlock_bh(&ct->lock);
            nf_ct_l4proto_log_invalid(skb, ct, "challenge-ack ignored");
            return NF_ACCEPT; /* Don't change state */
        }
        break;
    case TCP_CONNTRACK_SYN_SENT2:/* 发送sync报文后,收到对端sync报文,是同时打开情况 */
        /* tcp_conntracks table is not smart enough to handle
         * simultaneous open.
         */
        ct->proto.tcp.last_flags |= IP_CT_TCP_SIMULTANEOUS_OPEN;
        break;
    case TCP_CONNTRACK_SYN_RECV:/* 同时打开情况下,收到对端发送来的ack报文,则认为连接建立 */
        if (dir == IP_CT_DIR_REPLY && index == TCP_ACK_SET &&
            ct->proto.tcp.last_flags & IP_CT_TCP_SIMULTANEOUS_OPEN)
            new_state = TCP_CONNTRACK_ESTABLISHED;
        break;
    case TCP_CONNTRACK_CLOSE:
        if (index == TCP_RST_SET
            && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
            && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
            /* Invalid RST  */
            spin_unlock_bh(&ct->lock);
            nf_ct_l4proto_log_invalid(skb, ct, "invalid rst");
            return -NF_ACCEPT;
        }
        if (index == TCP_RST_SET
            && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
             && ct->proto.tcp.last_index == TCP_SYN_SET)
            || (!test_bit(IPS_ASSURED_BIT, &ct->status)
                && ct->proto.tcp.last_index == TCP_ACK_SET))
            && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
            /* RST sent to invalid SYN or ACK we had let through
             * at a) and c) above:
             *
             * a) SYN was in window then
             * c) we hold a half-open connection.
             *
             * Delete our connection entry.
             * We skip window checking, because packet might ACK
             * segments we ignored. */
            goto in_window;
        }
        /* Just fall through */
    default:
        /* Keep compilers happy. */
        break;
    }
    /* 判断报文是否在窗口中,在窗口处理中处理报文重传问题 */
    if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,/* 返回1表示报文在窗口中,返回0表示报文 */
               skb, dataoff, th)) {
        spin_unlock_bh(&ct->lock);
        return -NF_ACCEPT;
    }
     in_window:
    /* From now on we have got in-window packets */
    ct->proto.tcp.last_index = index;
    ct->proto.tcp.last_dir = dir;

    pr_debug("tcp_conntracks: ");
    nf_ct_dump_tuple(tuple);
    pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
         (th->syn ? 1 : 0), (th->ack ? 1 : 0),
         (th->fin ? 1 : 0), (th->rst ? 1 : 0),
         old_state, new_state);

    ct->proto.tcp.state = new_state;//更新状态
    if (old_state != new_state
        && new_state == TCP_CONNTRACK_FIN_WAIT)
        ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;/* 发送方发送了第一个fin包 */

    if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&/* 修改重传状态下超时时间 */
        timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
        timeout = timeouts[TCP_CONNTRACK_RETRANS];
    else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
         IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
         timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
        timeout = timeouts[TCP_CONNTRACK_UNACK];
    else if (ct->proto.tcp.last_win == 0 &&
         timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
        timeout = timeouts[TCP_CONNTRACK_RETRANS];
    else
        timeout = timeouts[new_state];
    spin_unlock_bh(&ct->lock);

    if (new_state != old_state)/* 状态发生变化,通知协议发生变化 */
        nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
    /* 该连接还没有见到应答方向的报文 */
    if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
        /* If only reply is a RST, we can consider ourselves not to
           have an established connection: this is a fairly common
           problem case, so we can delete the conntrack
           immediately.  --RR */
        if (th->rst) {
            nf_ct_kill_acct(ct, ctinfo, skb);
            return NF_ACCEPT;
        }
        /* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
         * pickup with loose=1. Avoid large ESTABLISHED timeout.
         */
        if (new_state == TCP_CONNTRACK_ESTABLISHED &&
            timeout > timeouts[TCP_CONNTRACK_UNACK])
            timeout = timeouts[TCP_CONNTRACK_UNACK];
    } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
           && (old_state == TCP_CONNTRACK_SYN_RECV
               || old_state == TCP_CONNTRACK_ESTABLISHED)
           && new_state == TCP_CONNTRACK_ESTABLISHED) {
        /* Set ASSURED if we see see valid ack in ESTABLISHED
           after SYN_RECV or a valid answer for a picked up
           connection. 连接已经建立,设置不能过早超时标志 */
        set_bit(IPS_ASSURED_BIT, &ct->status);
        nf_conntrack_event_cache(IPCT_ASSURED, ct);
    }
    /* 更新链接超时时间 */
    nf_ct_refresh_acct(ct, ctinfo, skb, timeout);

    return NF_ACCEPT;
}

tcp_get_timeouts

获取tcp协议的超时时间数组。tcp的超时时间根据连接的状态不同而不同。

/* Table of TCP conntrack timeout values: one entry per connection state,
 * plus the RETRANS and UNACK entries.
 */
static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
    [TCP_CONNTRACK_SYN_SENT]    = 2 MINS,
    [TCP_CONNTRACK_SYN_RECV]    = 60 SECS,
    [TCP_CONNTRACK_ESTABLISHED]    = 5 DAYS,
    [TCP_CONNTRACK_FIN_WAIT]    = 2 MINS,
    [TCP_CONNTRACK_CLOSE_WAIT]    = 60 SECS,
    [TCP_CONNTRACK_LAST_ACK]    = 30 SECS,
    [TCP_CONNTRACK_TIME_WAIT]    = 2 MINS,
    [TCP_CONNTRACK_CLOSE]        = 10 SECS,
    [TCP_CONNTRACK_SYN_SENT2]    = 2 MINS,
/* RFC1122 says the R2 limit should be at least 100 seconds.
   Linux uses 15 packets as limit, which corresponds
   to ~13-30min depending on RTO. */
    [TCP_CONNTRACK_RETRANS]        = 5 MINS,
    [TCP_CONNTRACK_UNACK]        = 5 MINS,
};

/*
 * Return the TCP timeout array kept in the per-net TCP data of @net
 * (as obtained through tcp_pernet()).
 */
static unsigned int *tcp_get_timeouts(struct net *net)
{
    unsigned int *table = tcp_pernet(net)->timeouts;

    return table;
}

tcp_can_early_drop

判断该连接跟踪项是否可以被提前销毁,用于垃圾回收。

/*
 * Tell whether conntrack entry @ct may be dropped early.
 *
 * Returns true when the TCP state of @ct is FIN_WAIT, LAST_ACK, TIME_WAIT,
 * CLOSE or CLOSE_WAIT, and false for every other state.
 */
static bool tcp_can_early_drop(const struct nf_conn *ct)
{
    const unsigned int state = ct->proto.tcp.state;

    return state == TCP_CONNTRACK_FIN_WAIT ||
           state == TCP_CONNTRACK_LAST_ACK ||
           state == TCP_CONNTRACK_TIME_WAIT ||
           state == TCP_CONNTRACK_CLOSE ||
           state == TCP_CONNTRACK_CLOSE_WAIT;
}

ouyangxibao
189 声望162 粉丝

不生产代码,只是代码的搬运工