clipboard.png

1.首包在prerouting或者在output节点上,在函数resolve_normal_ct中会创建连接跟踪。如果存在期望连接的话,则关联其对应的master连接,并且设置新创建的连接跟踪的状态为IP_CT_RELATED,如果没有对应的期望连接,则设置其状态为IP_CT_NEW。正常情况下,首包只有这两种状态。

2.对于第一个应答包和后续应答包也是在prerouting或者在output节点上,一般在函数resolve_normal_ct中可以找到对应的处于IP_CT_RELATED或者IP_CT_NEW状态的连接跟踪,找到后设置其状态为IP_CT_ESTABLISHED_REPLY

3.对于非首个请求报文,在prerouting或者在output节点上,函数resolve_normal_ct中可以找到对应的连接跟踪,一般会将其设置为IP_CT_ESTABLISHED。

4.对于ICMP错误报文(比如源抑制,ttl超时,不可达报文)到达netfilter后,会根据ICMP携带的原始报文查找其所属的CT,如果该ICMP差错报文是CT的请求方向报文产生的,那么设置其状态为IP_CT_RELATED,如果是应答方向的报文产生的则设置为IP_CT_RELATED_REPLY。

5.很重要的一点,对于子连接的首包,会在函数init_conntrack中创建连接跟踪,并查找到其对应的期望连接(先创建连接跟踪,然后查找期望连接进行关联),在离开init_conntrack函数之前执行对应的expectfn函数。

我们以典型的ftp的期望连接为例,分析expectfn的作用。

假设没有nat

主动模式

客户端发送PORT xxx,xxx,xxx,xxx,ppp,ppp给服务器,连接跟踪通过help结构捕获了该消息,然后生成了请求方向的期望连接跟踪(假设从PORT命令中拿到的ip地址为dataip,端口为dataport)。假设主链接的应答方向ip为rip

dip/mask      = dataip/0xffffffff
dport/mask    = dataport/0xffff
sip/mask      = rip/0xffffffff(由于源IP可以由服务器端重新指定,所以这里直接设置母连接服务器的IP地址也是不准确的,但是大多数正常情况是这样的)
sport/mask    = 0/0
protocol = tcp

// 可以从函数nf_ct_expect_init的调用处分析得出。
/* 初始化期望连接,使用反方向的源地址和目的地址作为源目的地址,使用内容中的端口作为目的端口 */
    nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num,
              &ct->tuplehash[!dir].tuple.src.u3, daddr,
              IPPROTO_TCP, NULL, &cmd.u.tcp.port);
/*
 * Fill in the tuple and mask of an expectation.
 *
 * exp:    expectation to initialise; flags, expectfn and helper are reset.
 * class:  stored in exp->class.
 * family: address family; AF_INET uses 4 address bytes, anything else 16.
 * saddr:  expected source address, or NULL to wildcard it (zero mask).
 * daddr:  expected destination address; dereferenced unconditionally.
 * proto:  layer-4 protocol number stored in the tuple.
 * src:    expected source port, or NULL to wildcard it (zero mask).
 * dst:    expected destination port; dereferenced unconditionally.
 */
void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
               u_int8_t family,
               const union nf_inet_addr *saddr,
               const union nf_inet_addr *daddr,
               u_int8_t proto, const __be16 *src, const __be16 *dst)
{
    int len;
    /* In the FTP call site discussed here: the source and destination
     * addresses of the reverse-direction tuple become the expectation's
     * source and destination, and the port parsed from the payload becomes
     * the destination port. */

    if (family == AF_INET)
        len = 4;
    else
        len = 16;

    exp->flags = 0;
    exp->class = class;
    exp->expectfn = NULL;
    exp->helper = NULL;
    exp->tuple.src.l3num = family;
    exp->tuple.dst.protonum = proto;
// Example used by the article: FTP active mode, NAT device in front of the client
    if (saddr) {/* in that example: the server's public source IP, since in active mode the server opens the data connection */
        memcpy(&exp->tuple.src.u3, saddr, len);
        if (sizeof(exp->tuple.src.u3) > len)
            /* address needs to be cleared for nf_ct_tuple_equal */
            memset((void *)&exp->tuple.src.u3 + len, 0x00,
                   sizeof(exp->tuple.src.u3) - len);
        memset(&exp->mask.src.u3, 0xFF, len);
        if (sizeof(exp->mask.src.u3) > len)
            memset((void *)&exp->mask.src.u3 + len, 0x00,
                   sizeof(exp->mask.src.u3) - len);
    } else {/* no source address given: wildcard it (zero tuple, zero mask) */
        memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
        memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
    }

    if (src) {/* source port; the FTP helper passes NULL here because the data connection has not been opened yet */
        exp->tuple.src.u.all = *src;
        exp->mask.src.u.all = htons(0xFFFF);
    } else {
        exp->tuple.src.u.all = 0;
        exp->mask.src.u.all = 0;
    }
    /* In the FTP example the destination address is the reverse-direction
     * tuple's destination IP, i.e. the client's address after NAT. */
    memcpy(&exp->tuple.dst.u3, daddr, len);
    if (sizeof(exp->tuple.dst.u3) > len)
        /* address needs to be cleared for nf_ct_tuple_equal */
        memset((void *)&exp->tuple.dst.u3 + len, 0x00,
               sizeof(exp->tuple.dst.u3) - len);
    /* Destination port is whatever the caller passes; for FTP it is the port
     * taken from the PORT command.  The original note calls it the
     * "rewritten" port -- NOTE(review): any NAT rewrite is not visible in
     * this function; confirm where it happens. */
    exp->tuple.dst.u.all = *dst;

#ifdef CONFIG_NF_NAT_NEEDED
    memset(&exp->saved_addr, 0, sizeof(exp->saved_addr));
    memset(&exp->saved_proto, 0, sizeof(exp->saved_proto));
#endif
}
/*
 * FTP conntrack helper callback.
 *
 * Scans the TCP payload of a control-connection packet for one of the
 * search[dir][] patterns and, on a full match, allocates and initialises an
 * expectation for the data connection.  The expectation is then registered
 * either by the NAT hook (when the master connection has NAT status bits
 * set) or directly via nf_ct_expect_related().
 *
 * skb:     packet being inspected.
 * protoff: offset of the TCP header inside skb.
 * ct:      conntrack entry of the control connection.
 * ctinfo:  conntrack state of this packet; only IP_CT_ESTABLISHED and
 *          IP_CT_ESTABLISHED_REPLY packets are inspected.
 *
 * Returns NF_ACCEPT, NF_DROP (partial pattern match, allocation failure,
 * expectation could not be added), or the verdict of the NAT hook.
 */
static int help(struct sk_buff *skb,
        unsigned int protoff,
        struct nf_conn *ct,
        enum ip_conntrack_info ctinfo)
{
    unsigned int dataoff, datalen;
    const struct tcphdr *th;
    struct tcphdr _tcph;
    const char *fb_ptr;
    int ret;
    u32 seq;
    int dir = CTINFO2DIR(ctinfo);
    unsigned int uninitialized_var(matchlen), uninitialized_var(matchoff);
    struct nf_ct_ftp_master *ct_ftp_info = nfct_help_data(ct);
    struct nf_conntrack_expect *exp;
    union nf_inet_addr *daddr;
    struct nf_conntrack_man cmd = {};
    unsigned int i;
    int found = 0, ends_in_nl;
    typeof(nf_nat_ftp_hook) nf_nat_ftp;

    /* Until there's been traffic both ways, don't look in packets. */
    if (ctinfo != IP_CT_ESTABLISHED &&
        ctinfo != IP_CT_ESTABLISHED_REPLY) {
        pr_debug("ftp: Conntrackinfo = %u\n", ctinfo);
        return NF_ACCEPT;
    }

    th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
    if (th == NULL)
        return NF_ACCEPT;

    dataoff = protoff + th->doff * 4;
    /* No data? */
    if (dataoff >= skb->len) {
        pr_debug("ftp: dataoff(%u) >= skblen(%u)\n", dataoff,
             skb->len);
        return NF_ACCEPT;
    }
    datalen = skb->len - dataoff;

    /* nf_ftp_lock is held from here until the "out" label. */
    spin_lock_bh(&nf_ftp_lock);
    fb_ptr = skb_header_pointer(skb, dataoff, datalen, ftp_buffer);
    BUG_ON(fb_ptr == NULL);

    ends_in_nl = (fb_ptr[datalen - 1] == '\n');
    seq = ntohl(th->seq) + datalen;

    /* Look up to see if we're just after a \n. */
    if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
        /* We're picking up this, clear flags and let it continue */
        if (unlikely(ct_ftp_info->flags[dir] & NF_CT_FTP_SEQ_PICKUP)) {
            ct_ftp_info->flags[dir] ^= NF_CT_FTP_SEQ_PICKUP;
            goto skip_nl_seq;
        }

        /* Now if this ends in \n, update ftp info. */
        pr_debug("nf_conntrack_ftp: wrong seq pos %s(%u) or %s(%u)\n",
             ct_ftp_info->seq_aft_nl_num[dir] > 0 ? "" : "(UNSET)",
             ct_ftp_info->seq_aft_nl[dir][0],
             ct_ftp_info->seq_aft_nl_num[dir] > 1 ? "" : "(UNSET)",
             ct_ftp_info->seq_aft_nl[dir][1]);
        ret = NF_ACCEPT;
        goto out_update_nl;
    }

skip_nl_seq:
    /* Initialize IP/IPv6 addr to expected address (it's not mentioned
       in EPSV responses) */
    cmd.l3num = nf_ct_l3num(ct);
    memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
           sizeof(cmd.u3.all));

    /* Try each pattern for this direction; stop at the first non-zero
     * result (1 = match, -1 = partial match). */
    for (i = 0; i < ARRAY_SIZE(search[dir]); i++) {
        found = find_pattern(fb_ptr, datalen,
                     search[dir][i].pattern,
                     search[dir][i].plen,
                     search[dir][i].skip,
                     search[dir][i].term,
                     &matchoff, &matchlen,
                     &cmd,
                     search[dir][i].getnum);
        if (found) break;
    }
    if (found == -1) {
        /* We don't usually drop packets.  After all, this is
           connection tracking, not packet filtering.
           However, it is necessary for accurate tracking in
           this case. */
        nf_ct_helper_log(skb, ct, "partial matching of `%s'",
                     search[dir][i].pattern);
        ret = NF_DROP;
        goto out;
    } else if (found == 0) { /* No match */
        ret = NF_ACCEPT;
        goto out_update_nl;
    }

    pr_debug("conntrack_ftp: match `%.*s' (%u bytes at %u)\n",
         matchlen, fb_ptr + matchoff,
         matchlen, ntohl(th->seq) + matchoff);

    exp = nf_ct_expect_alloc(ct);
    if (exp == NULL) {
        nf_ct_helper_log(skb, ct, "cannot alloc expectation");
        ret = NF_DROP;
        goto out;
    }

    /* We refer to the reverse direction ("!dir") tuples here,
     * because we're expecting something in the other direction.
     * Doesn't matter unless NAT is happening.
     * Take the destination address of the reverse-direction tuple.
     */
    daddr = &ct->tuplehash[!dir].tuple.dst.u3;

    /* Update the ftp info */
    if ((cmd.l3num == nf_ct_l3num(ct)) &&
        memcmp(&cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all,
             sizeof(cmd.u3.all))) {
        /* Enrico Scholz's passive FTP to partially RNAT'd ftp
           server: it really wants us to connect to a
           different IP address.  Simply don't record it for
           NAT. */
        if (cmd.l3num == PF_INET) {
            pr_debug("NOT RECORDING: %pI4 != %pI4\n",
                 &cmd.u3.ip,
                 &ct->tuplehash[dir].tuple.src.u3.ip);
        } else {
            pr_debug("NOT RECORDING: %pI6 != %pI6\n",
                 cmd.u3.ip6,
                 ct->tuplehash[dir].tuple.src.u3.ip6);
        }

        /* Thanks to Cristiano Lincoln Mattos
           <lincoln@cesar.org.br> for reporting this potential
           problem (DMZ machines opening holes to internal
           networks, or the packet filter itself). */
        if (!loose) {
            ret = NF_ACCEPT;
            goto out_put_expect;
        }
        daddr = &cmd.u3;
    }
    /* Initialise the expectation: source address is the reverse-direction
     * tuple's source, destination address is daddr chosen above, source
     * port is wildcarded (NULL), destination port is the one parsed from
     * the payload. */
    nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, cmd.l3num,
              &ct->tuplehash[!dir].tuple.src.u3, daddr,
              IPPROTO_TCP, NULL, &cmd.u.tcp.port);

    /* Now, NAT might want to mangle the packet, and register the
     * (possibly changed) expectation itself. */
    nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook);
    if (nf_nat_ftp && ct->status & IPS_NAT_MASK)
        ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype,
                 protoff, matchoff, matchlen, exp);
    else {
        /* Can't expect this?  Best to drop packet now. */
        if (nf_ct_expect_related(exp) != 0) {
            nf_ct_helper_log(skb, ct, "cannot add expectation");
            ret = NF_DROP;
        } else
            ret = NF_ACCEPT;
    }

out_put_expect:
    /* Drop the reference taken by nf_ct_expect_alloc() above. */
    nf_ct_expect_put(exp);

out_update_nl:
    /* Now if this ends in \n, update ftp info.  Seq may have been
     * adjusted by NAT code. */
    if (ends_in_nl)
        update_nl_seq(ct, seq, ct_ftp_info, dir, skb);
 out:
    spin_unlock_bh(&nf_ftp_lock);
    return ret;
}

从下面语句:

/* Now, NAT might want to mangle the packet, and register the
     * (possibly changed) expectation itself. */
    nf_nat_ftp = rcu_dereference(nf_nat_ftp_hook);
    if (nf_nat_ftp && ct->status & IPS_NAT_MASK)
        ret = nf_nat_ftp(skb, ctinfo, search[dir][i].ftptype,
                 protoff, matchoff, matchlen, exp);

可以看出,如果主连接没有进行NAT的话,是不会设置exp->expectfn函数的。

被动模式

同样对于被动方式,客户端发送PASV命令,然后服务器端发送xxx,xxx,xxx,xxx,ppp,ppp给客户端。help函数捕获了该消息,然后创建expect连接。假设客户端主链接的IP为cip。

dip/mask      = dataip/0xffffffff
dport/mask    = dataport/0xffff
sip/mask      = cip/0xffffffff
sport/mask    = 0/0
protocol = tcp

好了,这里说一句,exp都是子连接请求方向的。

如果有NAT,那help和expect还会做什么呢?

NAT在客户端侧,这种场景很常见

主动模式

客户端发送PORT xxx,xxx,xxx,xxx,ppp,ppp。NAT设备通过help函数捕获了该消息。因为客户端是一个私网地址,其port命令中的地址也是一个私网地址,如果直接发送给服务器端,那么服务器端将不能连接该地址。所以,NAT设备需要为数据连接选择一个公网地址(可以是和主链接一样的地址,也可以是别的地址)和一个新的端口号,进行地址转换,将转换后的地址重新填充到PORT命令中。假设客户端发送的PORT命令为PORT 10.10.10.10 10000。经过NAT设备时,给其替换成1.1.1.1 10000。那么服务器将会连接客户端的(1.1.1.1 10000)。NAT设备必须为子连接的请求方向构建期望连接,即为:

dip/mask      = 1.1.1.1/0xffffffff
dport/mask    = 10000/0xffff
sip/mask      = rip/0xffffffff(由于源IP可以由服务器重新指定,所以这里直接设置母连接服务器端的IP地址也是不准确的,但是大多数正常情况是这样的)
sport/mask    = 0/0
protocol = tcp

假设服务器端以2.2.2.2 20连接客户端1.1.1.1 10000。那么NAT将会为数据通道创建新的连接跟踪为:

请求方向
dip      = 1.1.1.1
dport    = 10000
sip      = 2.2.2.2
sport    = 20
protocol = tcp

应答方向:
dip      = 2.2.2.2
dport    = 20
sip      = 1.1.1.1
sport    = 10000
protocol = tcp

这样的连接跟踪创建后,请求方向的报文都可以找到对应的连接跟踪,但是客户端收到的请求报文是经过dnat方向操作后的报文:

dip      = 10.10.10.10
dport    = 10000
sip      = 2.2.2.2
sport    = 20
protocol = tcp

发送的回应报文为:

dip      = 2.2.2.2
dport    = 20
sip      = 10.10.10.10
sport    = 10000
protocol = tcp

是无法命中连接跟踪的。

所以对于这种情况下的子连接需要做两件事:

1.为子连接构建nat信息,在nat模块中将根据这些信息进行nat操作。

2.修正连接跟踪的应答方向五元组,使客户端报文能命中连接跟踪。

这两件事由谁来做呢?

答案就是expectfn函数

if (exp) {/* an expectation matched: run its callback, if one is set */
    if (exp->expectfn)
        exp->expectfn(ct, exp);
    /* release the expectation reference */
    nf_ct_expect_put(exp);
}

对于ftp来说,该函数为nf_nat_follow_master。从上面可以知道,数据通道请求方向需要做DNAT。其中exp->dir的值为主连接设置的,该值为help函数收到PORT命令时的方向的反方向(主动模式为请求方向,那么反方向为应答方向),所以exp->dir的值为IP_CT_DIR_REPLY。

/* Setup NAT on this expected conntrack so it follows master. */
/* If we fail to get a free NAT slot, we'll get dropped on confirm */
/*
 * ct:  the newly created (expected) conntrack; must not have any NAT
 *      "done" status bit set yet.
 * exp: the expectation that matched; exp->dir selects which master tuple
 *      the addresses are taken from and exp->saved_proto supplies the
 *      port for the destination manipulation.
 */
void nf_nat_follow_master(struct nf_conn *ct,
              struct nf_conntrack_expect *exp)
{
    struct nf_nat_range range;

    /* This must be a fresh one. */
    BUG_ON(ct->status & IPS_NAT_DONE_MASK);

    /* Change src to where master sends to */
    range.flags = NF_NAT_RANGE_MAP_IPS;
    range.min_addr = range.max_addr
        = ct->master->tuplehash[!exp->dir].tuple.dst.u3;
    nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);

    /* For DST manip, map port here to where it's expected. */
    /* Set up the destination NAT mapping */
    range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
    range.min_proto = range.max_proto = exp->saved_proto;
    range.min_addr = range.max_addr
        = ct->master->tuplehash[!exp->dir].tuple.src.u3;//active-mode case (exp->dir == IP_CT_DIR_REPLY): the master's original-direction source IP, i.e. the client's control-connection IP
    nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);//build the NAT info for the destination manipulation
}

重点分析nat信息的构建

/*
 * Rewrite the conntrack's tuples according to the given NAT type and range.
 *
 * ct:        conntrack to set up; nothing is done if it is already confirmed.
 * range:     address/port range the manipulated side must map into.
 * maniptype: NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST.
 *
 * Returns NF_ACCEPT on success (or for a confirmed ct), NF_DROP if NAT of
 * this type was already initialised or the seqadj extension cannot be added.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
          const struct nf_nat_range *range,
          enum nf_nat_manip_type maniptype)
{
    struct net *net = nf_ct_net(ct);/* network namespace this conntrack belongs to */
    struct nf_conntrack_tuple curr_tuple, new_tuple;

    /* Can't setup nat info for confirmed ct. */
    /* An already confirmed connection is not set up again */
    if (nf_ct_is_confirmed(ct))
        return NF_ACCEPT;

    WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
        maniptype != NF_NAT_MANIP_DST);

    if (WARN_ON(nf_nat_initialized(ct, maniptype)))
        return NF_DROP;

    /* What we've got will look like inverse of reply. Normally
     * this is what is in the conntrack, except for prior
     * manipulations (future optimization: if num_manips == 0,
     * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
     * Derive the current original-direction tuple.
     */
    nf_ct_invert_tuplepr(&curr_tuple,
                 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
    /* From the current original-direction tuple, compute the post-NAT
     * original-direction tuple */
    get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
    /* If the new tuple differs from the current one, the reply-direction
     * tuple has to be changed as well */
    if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
        struct nf_conntrack_tuple reply;

        /* Alter conntrack table so will recognize replies. */
        /* Invert the new tuple to get the new reply-direction tuple */
        nf_ct_invert_tuplepr(&reply, &new_tuple);
        /* Replace the conntrack's reply-direction tuple */
        nf_conntrack_alter_reply(ct, &reply);

        /* Non-atomic: we own this at the moment. */
        if (maniptype == NF_NAT_MANIP_SRC)
            ct->status |= IPS_SRC_NAT;
        else
            ct->status |= IPS_DST_NAT;
        /* If the connection has a helper but no seqadj extension yet,
         * one must be added; failing to add it drops the packet */
        if (nfct_help(ct) && !nfct_seqadj(ct))
            if (!nfct_seqadj_ext_add(ct))
                return NF_DROP;
    }
    /* For source NAT, link the conntrack into the nf_nat_bysource hash
     * table, keyed by its original-direction tuple */
    if (maniptype == NF_NAT_MANIP_SRC) {
        unsigned int srchash;
        spinlock_t *lock;

        srchash = hash_by_src(net,
                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
        spin_lock_bh(lock);
        hlist_add_head_rcu(&ct->nat_bysource,
                   &nf_nat_bysource[srchash]);
        spin_unlock_bh(lock);
    }

    /* It's done. NAT setup for this manip type is complete */
    if (maniptype == NF_NAT_MANIP_DST)
        ct->status |= IPS_DST_NAT_DONE;
    else
        ct->status |= IPS_SRC_NAT_DONE;

    return NF_ACCEPT;
}

传入的参数ct为:

请求方向
dip      = 1.1.1.1
dport    = 10000
sip      = 2.2.2.2
sport    = 20
protocol = tcp

应答方向:
dip      = 2.2.2.2
dport    = 20
sip      = 1.1.1.1
sport    = 10000
protocol = tcp

语句:

    nf_ct_invert_tuplepr(&curr_tuple,
                 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

的意思是求ct的应答方向的反连接,即:

应答方向:
dip      = 2.2.2.2
dport    = 20
sip      = 1.1.1.1
sport    = 10000
protocol = tcp

的反连接,curr_tuple即为请求方向:

请求方向
dip      = 1.1.1.1
dport    = 10000
sip      = 2.2.2.2
sport    = 20
protocol = tcp

语句:

get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet. */
/*
 * tuple:      output, the manipulated tuple.
 * orig_tuple: input tuple to start from.
 * range:      NAT range the manipulated side must fall into.
 * ct:         conntrack the tuple belongs to.
 * maniptype:  which side (source or destination) is manipulated.
 * The whole body runs inside an RCU read-side critical section.
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
         const struct nf_conntrack_tuple *orig_tuple,
         const struct nf_nat_range *range,//in the article's example: holds the client's IP
         struct nf_conn *ct,
         enum nf_nat_manip_type maniptype)//in the article's example: destination NAT
{
    const struct nf_conntrack_zone *zone;
    const struct nf_nat_l3proto *l3proto;
    const struct nf_nat_l4proto *l4proto;
    struct net *net = nf_ct_net(ct);

    zone = nf_ct_zone(ct);

    rcu_read_lock();
    l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
    l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
                    orig_tuple->dst.protonum);

    /* 1) If this srcip/proto/src-proto-part is currently mapped,
     * and that same mapping gives a unique tuple within the given
     * range, use that.
     *
     * This is only required for source (ie. NAT/masq) mappings.
     * So far, we don't do local source mappings, so multiple
     * manips not an issue.
     */
    if (maniptype == NF_NAT_MANIP_SRC &&
        !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        /* try the original tuple first */
        if (in_range(l3proto, l4proto, orig_tuple, range)) {
            if (!nf_nat_used_tuple(orig_tuple, ct)) {
                *tuple = *orig_tuple;
                goto out;
            }
        } else if (find_appropriate_src(net, zone, l3proto, l4proto,
                        orig_tuple, tuple, range)) {
            pr_debug("get_unique_tuple: Found current src map\n");
            if (!nf_nat_used_tuple(tuple, ct))
                goto out;
        }
    }

    /* 2) Select the least-used IP/proto combination in the given range */
    /* In the article's destination-NAT example this step rewrites the
     * tuple's destination IP */
    *tuple = *orig_tuple;
    find_best_ips_proto(zone, tuple, range, ct, maniptype);

    /* 3) The per-protocol part of the manip is made to map into
     * the range to make a unique tuple.
     */

    /* Only bother mapping if it's not already in range and unique */
    if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
            if (l4proto->in_range(tuple, maniptype,
                          &range->min_proto,
                          &range->max_proto) &&
                (range->min_proto.all == range->max_proto.all ||
                 !nf_nat_used_tuple(tuple, ct)))
                goto out;
        } else if (!nf_nat_used_tuple(tuple, ct)) {
            goto out;
        }
    }

    /* Last change: get protocol to try to obtain unique tuple. */
    /* The original note says the port is not modified in the article's
     * scenario -- NOTE(review): presumably because the checks above already
     * jumped to "out" there; this call itself may pick a new port. */
    l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
    rcu_read_unlock();
}

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 * Picks an IP address from the range for the manipulated side of the tuple.
 *
 * zone:      conntrack zone; its id is mixed into the hash seed.
 * tuple:     tuple to modify in place (source or destination address).
 * range:     NAT range; nothing is done unless NF_NAT_RANGE_MAP_IPS is set.
 * ct:        conntrack, used here only for its layer-3 protocol number.
 * maniptype: selects whether the source or destination address is rewritten.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
            struct nf_conntrack_tuple *tuple,
            const struct nf_nat_range *range,
            const struct nf_conn *ct,
            enum nf_nat_manip_type maniptype)
{
    union nf_inet_addr *var_ipp;
    unsigned int i, max;
    /* Host order */
    u32 minip, maxip, j, dist;
    bool full_range;

    /* No IP mapping?  Do nothing. */
    if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
        return;

    if (maniptype == NF_NAT_MANIP_SRC)
        var_ipp = &tuple->src.u3;
    else
        var_ipp = &tuple->dst.u3;//destination manip: point at the tuple's destination address, which is rewritten below

    /* Fast path: only one choice. If the range holds a single address, use
     * it; in the article's example that single address is the client IP */
    if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
        *var_ipp = range->min_addr;
        return;
    }

    /* max = index of the last 32-bit word of the address */
    if (nf_ct_l3num(ct) == NFPROTO_IPV4)
        max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
    else
        max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

    /* Hashing source and destination IPs gives a fairly even
     * spread in practice (if there are a small number of IPs
     * involved, there usually aren't that many connections
     * anyway).  The consistency means that servers see the same
     * client coming from the same IP (some Internet Banking sites
     * like this), even across reboots.
     */
    j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
           range->flags & NF_NAT_RANGE_PERSISTENT ?
            0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

    full_range = false;
    for (i = 0; i <= max; i++) {
        /* If first bytes of the address are at the maximum, use the
         * distance. Otherwise use the full range.
         */
        if (!full_range) {
            minip = ntohl((__force __be32)range->min_addr.all[i]);
            maxip = ntohl((__force __be32)range->max_addr.all[i]);
            dist  = maxip - minip + 1;
        } else {
            minip = 0;
            dist  = ~0;
        }

        var_ipp->all[i] = (__force __u32)
            htonl(minip + reciprocal_scale(j, dist));
        if (var_ipp->all[i] != range->max_addr.all[i])
            full_range = true;

        if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
            j ^= (__force u32)tuple->dst.u3.all[i];
    }
}

对curr_tuple进行nat构建,替换掉curr_tuple中的目的IP后,得到new_tuple:

dip      = 10.10.10.10
dport    = 10000
sip      = 2.2.2.2
sport    = 20
protocol = tcp
if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {//in this scenario new_tuple and curr_tuple necessarily differ
        struct nf_conntrack_tuple reply;

        /* Alter conntrack table so will recognize replies. */
        /* Invert the new tuple to get the new reply-direction tuple */
        nf_ct_invert_tuplepr(&reply, &new_tuple);//invert new_tuple to obtain reply
        /* Replace the conntrack's reply-direction tuple */
        nf_conntrack_alter_reply(ct, &reply);//reply becomes the child connection's reply-direction tuple

        /* Non-atomic: we own this at the moment. */
        if (maniptype == NF_NAT_MANIP_SRC)
            ct->status |= IPS_SRC_NAT;
        else
            ct->status |= IPS_DST_NAT;//marks the connection as needing destination NAT; per the article, the NAT module then rewrites the original-direction destination IP to the reply-direction source IP
        /* If the connection has a helper, the seqadj extension must be added;
         * per the article the data channel has no helper, so no seqadj is needed here. */
        if (nfct_help(ct) && !nfct_seqadj(ct))
            if (!nfct_seqadj_ext_add(ct))
                return NF_DROP;
    }

reply为:

dip      = 2.2.2.2 
dport    = 20
sip      = 10.10.10.10
sport    = 10000
protocol = tcp

这个时候的reply就能命中客户端数据通道的应答了,bingo!

同时通过 ct->status |= IPS_DST_NAT; 设置需要进行目的NAT的标志。nat模块看到该标志后,会将请求方向报文的目的IP改为应答方向的源IP,完成报文的处理。应答方向的报文看到该标志后,进行目的NAT的反操作,将应答报文的源IP改为请求方向的目的IP。

被动模式

服务器端发送 xxx,xxx,xxx,xxx,ppp,ppp。NAT设备通过help函数捕获了该消息。该地址是一个公网地址,NAT设备不需要对ftp的内容进行NAT转换(注意是不对内容进行nat转换的,控制通道依然需要进行SNAT转换),NAT设备会直接将该报文的内容发送给客户端。假设服务器端发送命令为2.2.2.2 10000。NAT设备必须为子连接的请求方向构建期望连接,即为:

dip/mask      = 2.2.2.2/0xffffffff
dport/mask    = 10000/0xffff
sip/mask      = cip/0xffffffff //cip为主连接的请求方向源IP,即客户端IP
sport/mask    = 0/0   //源端口暂时不知道
protocol = tcp

假设客户端以10.10.10.10 5000连接服务器端2.2.2.2 10000。那么NAT将会为数据通道创建新的连接跟踪为:

请求方向
dip      = 2.2.2.2
dport    = 10000
sip      = 10.10.10.10
sport    = 5000
protocol = tcp

应答方向:
dip      = 10.10.10.10
dport    = 5000
sip      = 2.2.2.2
sport    = 10000
protocol = tcp

这样的连接跟踪创建后,是无法命中应答方向的报文的:

dip      = 1.1.1.1
dport    = 5000
sip      = 2.2.2.2
sport    = 10000
protocol = tcp

所以需要修正应答方向的五元组。

其中exp->dir的值为主连接设置的,该值为help函数收到服务器发送的端口应答时的方向的反方向(被动模式为应答方向,那么反方向为请求方向),所以exp->dir的值为IP_CT_DIR_ORIGINAL。

/* Setup NAT on this expected conntrack so it follows master. */
/* If we fail to get a free NAT slot, we'll get dropped on confirm */
/*
 * ct:  the newly created (expected) conntrack; must not have any NAT
 *      "done" status bit set yet.
 * exp: the expectation that matched; exp->dir selects which master tuple
 *      the addresses are taken from and exp->saved_proto supplies the
 *      port for the destination manipulation.
 */
void nf_nat_follow_master(struct nf_conn *ct,
              struct nf_conntrack_expect *exp)
{
    struct nf_nat_range range;

    /* This must be a fresh one. */
    BUG_ON(ct->status & IPS_NAT_DONE_MASK);

    /* Change src to where master sends to */
    range.flags = NF_NAT_RANGE_MAP_IPS;//source NAT: map the IP only
    range.min_addr = range.max_addr
        = ct->master->tuplehash[!exp->dir].tuple.dst.u3;//passive-mode case (exp->dir == IP_CT_DIR_ORIGINAL): this is the master's reply tuple, and its destination IP is used
    nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);//set up the source NAT mapping

    /* For DST manip, map port here to where it's expected. */
    /* Set up the destination NAT mapping */
    range.flags = (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED);
    range.min_proto = range.max_proto = exp->saved_proto;
    range.min_addr = range.max_addr
        = ct->master->tuplehash[!exp->dir].tuple.src.u3;//passive-mode case: !exp->dir is the reply tuple, so this is the master's reply-direction source IP; the original note called it the client IP, which only holds for active mode -- NOTE(review): confirm
    nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST);//build the NAT info for the destination manipulation
}

后面的推理跟主动模式差不多。


ouyangxibao
189 声望161 粉丝

不生产代码,只是代码的搬运工