icmp协议相比于tcp,udp有其独特性,它介于网络层和传输层之间,它没有传输层的源目的端口。所以在创建连接跟踪时需要进行特殊处理。还有ICMP属于差错报文,并不是所有icmp报文都是成对出现的,这些不同造成了icmp的处理与tcp,udp处理的不同。

ICMP简介

icmp报文有如下种类:

image.png

连接跟踪实现

报文类型

最多有18种icmp报文,每一种icmp报文可能会有一些子类。只有下面四种icmp报文是成对出现的。

/* Lookup table indexed by ICMP type. A 1 marks a type named here; every
 * slot not named by a designated initializer is implicitly 0.
 * The four named types are the request halves of the request/reply pairs.
 */
static const u_int8_t valid_new[] = {
    [ICMP_ECHO] = 1,
    [ICMP_TIMESTAMP] = 1,
    [ICMP_INFO_REQUEST] = 1,
    [ICMP_ADDRESS] = 1
};
// Request/reply pairing between ICMP types.
/* Add 1; spaces filled with 0. Each entry stores the partner type plus 1 so
 * that the value 0 is free to mean "this type has no paired message" (slots
 * not named below are zero-initialised). icmp_invert_tuple subtracts the 1
 * again when it builds the inverted tuple.
 * NOTE(review): the original note attributes the +1 to ICMP_ECHO being 0;
 * it is presumably ICMP_ECHOREPLY that has the value 0 - confirm against
 * the ICMP type definitions.
 */
static const u_int8_t invmap[] = {
    [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
    [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
    [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
    [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
    [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
    [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
    [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
    [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
};

因为只有这四对消息是成对的,所以连接跟踪只会为这四对消息进行连接跟踪。

五元组

icmp报文没有源目的端口,采用什么来填充tuple呢?

从下面代码可以看出,连接跟踪使用一个__be16 id来替代tuple中的源port,这里的id是icmp报文中的标识符,这四类消息都有,其中ping消息一般在其中填充ping程序的pid,所以同一台设备启动两个不同的ping程序ping同一个ip会生成两个会话:

/* The protocol-specific manipulable parts of the tuple: always in
 * network order
 */
union nf_conntrack_man_proto {
    /* Add other protocols here. */
    /* Protocol-independent view of the same 16 bits. */
    __be16 all;

    struct {
        __be16 port;
    } tcp;
    struct {
        __be16 port;
    } udp;
    /* ICMP has no port; its 16-bit id field occupies this slot instead. */
    struct {
        __be16 id;
    } icmp;
    struct {
        __be16 port;
    } dccp;
    struct {
        __be16 port;
    } sctp;
    struct {
        __be16 key;    /* GRE key is 32bit, PPtP only uses 16bit */
    } gre;
};

从下面代码可以看出,连接跟踪使用一个u_int8_t type, code;来替代tuple中的目的port:

/* This contains the information to distinguish a connection. */
struct nf_conntrack_tuple {
    /* Source side of the tuple (struct nf_conntrack_man is declared elsewhere). */
    struct nf_conntrack_man src;

    /* These are the parts of the tuple which are fixed. */
    struct {
        /* Destination address. */
        union nf_inet_addr u3;
        /* Protocol-specific destination part; every member overlays the same storage. */
        union {
            /* Add other protocols here. */
            __be16 all;

            struct {
                __be16 port;
            } tcp;
            struct {
                __be16 port;
            } udp;
            /* ICMP: message type and code sit where other protocols keep a port/key. */
            struct {
                u_int8_t type, code;
            } icmp;
            struct {
                __be16 port;
            } dccp;
            struct {
                __be16 port;
            } sctp;
            struct {
                __be16 key;
            } gre;
        } u;

        /* The protocol. */
        u_int8_t protonum;

        /* The direction (for tuplehash) */
        u_int8_t dir;
    } dst;
};

下面我们看一下,icmp如何求一个tuple的反转tuple,是否像tcp,udp将源目的端口调换一样呢?

/* Build the inverted (reply-direction) ICMP part of a tuple.
 *
 * Only the type changes: it is replaced by its partner from invmap. The id
 * stays in the source slot and the code is copied through unchanged.
 * Returns false when the type lies outside invmap or has no partner.
 */
static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
                  const struct nf_conntrack_tuple *orig)
{
    const unsigned int type = orig->dst.u.icmp.type;

    /* Beyond the table: not a type that can be paired. */
    if (type >= sizeof(invmap))
        return false;

    /* A zero slot marks a type without a partner message. */
    if (invmap[type] == 0)
        return false;

    /* Table entries are stored as partner + 1, so undo the offset here. */
    tuple->dst.u.icmp.type = invmap[type] - 1;
    tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
    /* The id is not swapped into the type/code slot; it stays where it was. */
    tuple->src.u.icmp.id = orig->src.u.icmp.id;

    return true;
}

从上面可以看出,icmp求反转tuple时,只会将type替换成对应的type(我们这里不涉及IP地址)

非成对icmp报文处理

icmp更多的是差错报文,它是用来通知源主机的一些错误信息的。它的产生往往是某台设备发送的报文在传输过程中出现了差错,在传输路径中设备或者目标主机设备检测到了差错,从而生成一个ICMP差错报文通知源主机。差错报文会在icmp报文头后添加一段导致该icmp报文的原始报文头信息。所以说icmp差错报文是一个连接的附属,连接跟踪将差错报文视为一个子连接(不会真实创建CT,而是依附于主连接,设置该报文的状态为IP_CT_RELATED或者IP_CT_RELATED_REPLY)。

连接跟踪处理如下几种差错报文:

ICMP_DEST_UNREACH   //目的不可达
ICMP_SOURCE_QUENCH  //源抑制,向源主机发送源抑制报文通知源主机减慢发送速度
ICMP_TIME_EXCEEDED  //TTL超时,
ICMP_PARAMETERPROB  //参数问题,
ICMP_REDIRECT       //重定向,收到该差错的主机需要更新路由的下一跳,或者邻居(直连主机)

连接跟踪对于这几种差错报文,需要正确交给目标主机。处理的主要原因是NAT,后续详细说明。

ICMP协议控制块

/* Operations table registering the ICMP handlers with connection tracking
 * for l3proto PF_INET / l4proto IPPROTO_ICMP. The netlink and netlink-timeout
 * members are only present when the matching config options are enabled.
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
{
    .l3proto        = PF_INET,
    .l4proto        = IPPROTO_ICMP,
    .pkt_to_tuple        = icmp_pkt_to_tuple,
    .invert_tuple        = icmp_invert_tuple,
    .packet            = icmp_packet,
    .get_timeouts        = icmp_get_timeouts,
    .new            = icmp_new,
    .error            = icmp_error,
    .destroy        = NULL,
    .me            = NULL,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
    .tuple_to_nlattr    = icmp_tuple_to_nlattr,
    .nlattr_tuple_size    = icmp_nlattr_tuple_size,
    .nlattr_to_tuple    = icmp_nlattr_to_tuple,
    .nla_policy        = icmp_nla_policy,
#endif
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
    .ctnl_timeout        = {
        .nlattr_to_obj    = icmp_timeout_nlattr_to_obj,
        .obj_to_nlattr    = icmp_timeout_obj_to_nlattr,
        .nlattr_max    = CTA_TIMEOUT_ICMP_MAX,
        .obj_size    = sizeof(unsigned int),
        .nla_policy    = icmp_timeout_nla_policy,
    },
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
    .init_net        = icmp_init_net,
    .get_net_proto        = icmp_get_net_proto,
};

连接跟踪首先进行错误检验,执行error函数,对于icmp来说就是icmp_error函数

/* Small and modified version of icmp_rcv */
/* Sanity-check an ICMP packet for connection tracking and hand ICMP error
 * messages over to icmp_error_message().
 *
 * Returns -NF_ACCEPT for a truncated header, a failed checksum or a type
 * above NR_ICMP_TYPES; NF_ACCEPT for any type that is not one of the five
 * error types handled below; otherwise whatever icmp_error_message() returns.
 * Per the original note, tmpl is usually NULL.
 */
static int
icmp_error(struct net *net, struct nf_conn *tmpl,
       struct sk_buff *skb, unsigned int dataoff,
       u8 pf, unsigned int hooknum)
{
    struct icmphdr hdr_copy;
    const struct icmphdr *hdr;

    /* Not enough header? */
    hdr = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(hdr_copy), &hdr_copy);
    if (!hdr) {
        icmp_error_log(skb, net, pf, "short packet");
        return -NF_ACCEPT;
    }

    /* See ip_conntrack_proto_tcp.c */
    /* The checksum is verified only when the sysctl is on and only at
     * PRE_ROUTING, i.e. not for packets sent by the local host.
     */
    if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING) {
        if (nf_ip_checksum(skb, hooknum, dataoff, 0)) {
            icmp_error_log(skb, net, pf, "bad hw icmp checksum");
            return -NF_ACCEPT;
        }
    }

    /*
     *    18 is the highest 'known' ICMP type. Anything else is a mystery
     *
     *    RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently
     *          discarded.
     */
    if (hdr->type > NR_ICMP_TYPES) {
        icmp_error_log(skb, net, pf, "invalid icmp type");
        return -NF_ACCEPT;
    }

    /* Need to track icmp error message? */
    switch (hdr->type) {
    case ICMP_DEST_UNREACH:
    case ICMP_SOURCE_QUENCH:
    case ICMP_TIME_EXCEEDED:
    case ICMP_PARAMETERPROB:
    case ICMP_REDIRECT:
        /* An error message: relate it to the connection it refers to. */
        return icmp_error_message(net, tmpl, skb, hooknum);
    default:
        /* Not an error message: passes the check as-is. */
        return NF_ACCEPT;
    }
}

/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
/* icmp差错报文处理,主要是根据内层携带的原始报文头找到对应的主连接。
** 然后设置该报文依附于主链接,是一个RELATE报文
*/             
static int
icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
         unsigned int hooknum)
{
    struct nf_conntrack_tuple innertuple, origtuple;
    const struct nf_conntrack_l4proto *innerproto;
    const struct nf_conntrack_tuple_hash *h;
    const struct nf_conntrack_zone *zone;
    enum ip_conntrack_info ctinfo;
    struct nf_conntrack_zone tmp;

    WARN_ON(skb_nfct(skb));
    zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

    /* Are they talking about one of our connections? */
    /* 根据内层报文信息获取对应的五元组到origtuple中 */
    if (!nf_ct_get_tuplepr(skb,
                   skb_network_offset(skb) + ip_hdrlen(skb)
                               + sizeof(struct icmphdr),
                   PF_INET, net, &origtuple)) {
        pr_debug("icmp_error_message: failed to get tuple\n");
        return -NF_ACCEPT;
    }

    /* rcu_read_lock()ed by nf_hook_thresh */
    /* 获取内层报文的传输层控制块 */               
    innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);

    /* Ordinarily, we'd expect the inverted tupleproto, but it's
       been preserved inside the ICMP. 
    ** 获取内层报文的反向五元组
    */
    if (!nf_ct_invert_tuple(&innertuple, &origtuple,
                &nf_conntrack_l3proto_ipv4, innerproto)) {
        pr_debug("icmp_error_message: no match\n");
        return -NF_ACCEPT;
    }
    //设置报文的状态为子连接,这是报文的状态。
    ctinfo = IP_CT_RELATED;
    //根据反向五元组获取对应的主CT。为什么是反向呢?
    //因为icmp报文是对源报文的一个响应,所以应该根据源报文的信息去获取其所属连接。
    h = nf_conntrack_find_get(net, zone, &innertuple);
    if (!h) {
        pr_debug("icmp_error_message: no match\n");
        return -NF_ACCEPT;
    }
    //如果是应答方向,则设置其状态为
    if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
        ctinfo += IP_CT_IS_REPLY;

    /* Update skb to refer to this connection */
    /* 将该报文关联到 主CT     ,其状态为IP_CT_RELATED or IP_CT_RELATED_REPLY*/
    nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
    /* 修改内部的 */
    return NF_ACCEPT;
}
//差错报文返回NF_ACCEPT后,因为设置了报文的CT,报文的连接跟踪处理就结束了。详细情况见nf_conntrack_in函数。

icmp_pkt_to_tuple

/* 提取icmp的五元组,只有成对报文才会 */
static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
                  struct net *net, struct nf_conntrack_tuple *tuple)
{
    const struct icmphdr *hp;
    struct icmphdr _hdr;

    hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
    if (hp == NULL)
        return false;

    tuple->dst.u.icmp.type = hp->type;/* 类型 */
    tuple->src.u.icmp.id = hp->un.echo.id;/* id号,ping报文为进程id */
    tuple->dst.u.icmp.code = hp->code;/* 代码,一般为0 */

    return true;
}

icmp_invert_tuple

/* Build the inverted (reply-direction) ICMP part of a tuple.
 *
 * Only the type changes: it is replaced by its partner from invmap. The id
 * stays in the source slot and the code is copied through unchanged.
 * Returns false when the type lies outside invmap or has no partner.
 */
static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
                  const struct nf_conntrack_tuple *orig)
{
    const unsigned int type = orig->dst.u.icmp.type;

    /* Beyond the table: not a type that can be paired. */
    if (type >= sizeof(invmap))
        return false;

    /* A zero slot marks a type without a partner message. */
    if (invmap[type] == 0)
        return false;

    /* Table entries are stored as partner + 1, so undo the offset here. */
    tuple->dst.u.icmp.type = invmap[type] - 1;
    tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
    /* The id is not swapped into the type/code slot; it stays where it was. */
    tuple->src.u.icmp.id = orig->src.u.icmp.id;

    return true;
}

icmp_new

非差错报文的请求方向报文会被该函数处理,主要是进行合法性校验,icmp_error函数已经处理过了,这里多余。

/* Called when a new connection for this protocol found. */
/* Accept the new connection only if the ICMP type of its first tuple
 * (tuplehash[0]) is one of the four types flagged in valid_new.
 * Returns true when it is; otherwise logs the tuple and returns false.
 * skb, dataoff and timeouts are not used.
 */
static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
             unsigned int dataoff, unsigned int *timeouts)
{
    /* Indexed by ICMP type; only these four types may open a connection. */
    static const u_int8_t valid_new[] = {
        [ICMP_ECHO] = 1,
        [ICMP_TIMESTAMP] = 1,
        [ICMP_INFO_REQUEST] = 1,
        [ICMP_ADDRESS] = 1
    };
    const unsigned int type = ct->tuplehash[0].tuple.dst.u.icmp.type;

    if (type < sizeof(valid_new) && valid_new[type])
        return true;

    /* Can't create a new ICMP `conn' with this. */
    pr_debug("icmp: can't create new conn with type %u\n", type);
    nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple);
    return false;
}

icmp_packet

非差错报文的应答方向报文会被该函数处理,主要是进行超时更新和报文统计。

/* Returns verdict for packet, or -1 for invalid. */
/* Per-packet handler of the ICMP tracker. Its only work is the call to
 * nf_ct_refresh_acct() with the timeout value pointed to by @timeout (by
 * that helper's name: timeout refresh plus accounting); the verdict is
 * always NF_ACCEPT. dataoff is not used.
 */
static int icmp_packet(struct nf_conn *ct,
               const struct sk_buff *skb,
               unsigned int dataoff,
               enum ip_conntrack_info ctinfo,
               unsigned int *timeout)
{
    /* Do not immediately delete the connection after the first
       successful reply to avoid excessive conntrackd traffic
       and also to handle correctly ICMP echo reply duplicates. */
    nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);

    return NF_ACCEPT;
}

icmp_get_timeouts

icmp 连接跟踪超时时间获取,一般是30秒。

/* Return a pointer to the ICMP conntrack timeout kept in the per-namespace
 * ICMP data that icmp_pernet(net) yields.
 */
static unsigned int *icmp_get_timeouts(struct net *net)
{
    return &icmp_pernet(net)->timeout;
}

/* Per-namespace initialisation of the ICMP tracker: seed the namespace's
 * timeout with the default nf_ct_icmp_timeout, then let
 * icmp_kmemdup_sysctl_table() finish the setup and return its result.
 * proto is not used.
 */
static int icmp_init_net(struct net *net, u_int16_t proto)
{
    struct nf_icmp_net *icmp_net = icmp_pernet(net);

    icmp_net->timeout = nf_ct_icmp_timeout;

    return icmp_kmemdup_sysctl_table(&icmp_net->pn, icmp_net);
}
/* Default timeout of an ICMP conntrack entry: 30*HZ, i.e. 30 seconds
 * expressed in timer ticks.
 */
static const unsigned int nf_ct_icmp_timeout = 30*HZ;

ICMP对NAT的支持

非差错报文对nat的支持

icmp报文对nat的支持实际更多的是网络层的支持,对于icmp报文本身来说只有一个标识符可以改变,不过很少有场景要改变标识符的。下面就代码简单的分析一下.

icmp nat控制块
/* NAT operations table for IPPROTO_ICMP: wires up the packet rewrite,
 * range check and unique-tuple selection handlers defined for ICMP. The
 * netlink range parser is only present with CONFIG_NF_CT_NETLINK.
 */
const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
    .l4proto        = IPPROTO_ICMP,
    .manip_pkt        = icmp_manip_pkt,
    .in_range        = icmp_in_range,
    .unique_tuple        = icmp_unique_tuple,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
    .nlattr_to_range    = nf_nat_l4proto_nlattr_to_range,
#endif
};
icmp_in_range

判断icmp的标识符是否在指定的范围中。

/* Report whether the tuple's ICMP id lies within [min, max], both bounds
 * inclusive. All three ids are converted from network to host order before
 * they are compared. maniptype is not used.
 */
static bool
icmp_in_range(const struct nf_conntrack_tuple *tuple,
          enum nf_nat_manip_type maniptype,
          const union nf_conntrack_man_proto *min,
          const union nf_conntrack_man_proto *max)
{
    const unsigned int id = ntohs(tuple->src.u.icmp.id);

    /* Below the lower bound: the upper bound need not be looked at. */
    if (id < ntohs(min->icmp.id))
        return false;

    return id <= ntohs(max->icmp.id);
}
icmp_unique_tuple

分配一个标识符,使得五元组唯一。

/* Choose an ICMP id for @tuple so that the tuple is not already in use.
 *
 * Candidates are min id + (counter % span), where the counter is a static
 * that persists across calls. The search stops at the first candidate that
 * nf_nat_used_tuple() reports as free, or after span attempts, in which
 * case the last candidate tried is left in the tuple. l3proto and
 * maniptype are not used.
 */
static void
icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
          struct nf_conntrack_tuple *tuple,
          const struct nf_nat_range *range,
          enum nf_nat_manip_type maniptype,
          const struct nf_conn *ct)
{
    static u_int16_t id;
    unsigned int span;
    unsigned int tries = 0;

    if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)
        span = ntohs(range->max_proto.icmp.id) -
               ntohs(range->min_proto.icmp.id) + 1;
    else
        /* If no range specified... fall back to a span of 0xFFFF. */
        span = 0xFFFF;

    for (;;) {
        tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
                         (id % span));
        if (++tries == span || !nf_nat_used_tuple(tuple, ct))
            return;
        ++id;
    }
}
icmp_manip_pkt

将选择的标识符替换掉原来的标识符,更新校验码。

/* Replace the ICMP identifier in the packet with the one from @tuple and
 * patch the ICMP checksum accordingly.
 *
 * Returns false when the header region cannot be made writable, true
 * otherwise. l3proto, iphdroff and maniptype are not used here.
 */
static bool
icmp_manip_pkt(struct sk_buff *skb,
           const struct nf_nat_l3proto *l3proto,
           unsigned int iphdroff, unsigned int hdroff,
           const struct nf_conntrack_tuple *tuple,
           enum nf_nat_manip_type maniptype)
{
    struct icmphdr *hdr;

    /* Write access is needed up to the end of the ICMP header. */
    if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
        return false;

    hdr = (struct icmphdr *)(skb->data + hdroff);
    /* Patch the checksum first: the call takes the old id, which the
     * assignment below overwrites.
     */
    inet_proto_csum_replace2(&hdr->checksum, skb,
                 hdr->un.echo.id, tuple->src.u.icmp.id, false);
    hdr->un.echo.id = tuple->src.u.icmp.id;
    return true;
}

差错报文对NAT的支持,常说的ICMP ALG

差错报文的内层报文信息来自于产生差错的报文。当一个主机发送一个报文经过NAT后,其报文头发生了改变。也就是说,检测到该报文有差错的设备看到的报文是经过NAT后的报文,所以NAT需要将内层报文还原回原来的报文再转发给源主机。

/* IPv4 NAT hook body (excerpt: the listing elides the rest of the function
 * after the ICMP branch). For a tracked packet whose state is
 * IP_CT_RELATED or IP_CT_RELATED_REPLY and whose IP protocol is ICMP, the
 * verdict comes straight from nf_nat_icmp_reply_translation(): NF_DROP when
 * it returns 0, NF_ACCEPT otherwise. Untracked packets are accepted as-is.
 */
unsigned int
nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
           const struct nf_hook_state *state,
           unsigned int (*do_chain)(void *priv,
                    struct sk_buff *skb,
                    const struct nf_hook_state *state,
                    struct nf_conn *ct))
{
    struct nf_conn *ct;
    enum ip_conntrack_info ctinfo;
    struct nf_conn_nat *nat;
    /* maniptype == SRC for postrouting. */
    enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);

    ct = nf_ct_get(skb, &ctinfo);
    /* Can't track?  It's not due to stress, or conntrack would
     * have dropped it.  Hence it's the user's responsibilty to
     * packet filter it out, or implement conntrack/NAT for that
     * protocol. 8) --RR
     */
    if (!ct)
        return NF_ACCEPT;

    nat = nfct_nat(ct);

    switch (ctinfo) {
    case IP_CT_RELATED:// ICMP error messages arrive in one of these two states
    case IP_CT_RELATED_REPLY:
        // Special case for ICMP: a packet in this state is an ICMP error message.
        // Its connection was chosen from the original packet it carries, and the
        // embedded original-packet part is translated accordingly.
        if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
            // Both the inner and the outer packet are translated; the function returns right after.
            if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
                               state->hook))
                return NF_DROP;
            else
                return NF_ACCEPT;
        }
     ...
}

/* Apply NAT to an ICMP error packet that was attached to an existing
 * connection: first the packet header embedded behind the ICMP header is
 * rewritten, then the ICMP checksum is recomputed, then the outer header
 * is rewritten.
 *
 * Returns 1 when the packet may proceed (including the case where the
 * connection carries no NAT of the relevant kind) and 0 on failure.
 */
int nf_nat_icmp_reply_translation(struct sk_buff *skb,
                  struct nf_conn *ct,
                  enum ip_conntrack_info ctinfo,
                  unsigned int hooknum)
{
    /* Overlay for the ICMP header followed by the embedded IP header. */
    struct {
        struct icmphdr    icmp;
        struct iphdr    ip;
    } *inside;
    enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
    enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
    unsigned int hdrlen = ip_hdrlen(skb);
    const struct nf_nat_l4proto *l4proto;
    struct nf_conntrack_tuple target;
    unsigned long statusbit;

    WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);

    /* Outer IP header plus the overlay above must be writable. */
    if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
        return 0;

    if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
        return 0;
    // Locate the start of the ICMP header.
    inside = (void *)skb->data + hdrlen;
    if (inside->icmp.type == ICMP_REDIRECT) {// ICMP redirect error message
        /* Fail unless every IPS_NAT_DONE_MASK bit is set ... */
        if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
            return 0;
        /* ... and no IPS_NAT_MASK bit is set in the connection status. */
        if (ct->status & IPS_NAT_MASK)
            return 0;
    }

    if (manip == NF_NAT_MANIP_SRC)
        statusbit = IPS_SRC_NAT;
    else
        statusbit = IPS_DST_NAT;

    /* Invert if this is reply direction */
    if (dir == IP_CT_DIR_REPLY)
        statusbit ^= IPS_NAT_MASK;
    // The owning connection has no NAT of this kind: nothing to do.
    if (!(ct->status & statusbit))
        return 1;
    // NAT handler for the layer-4 protocol of the embedded packet.
    l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
    // Translate the embedded packet (network and transport layer), using
    // the opposite direction's tuple and the opposite manipulation type.
    if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
                   l4proto, &ct->tuplehash[!dir].tuple, !manip))
        return 0;
    // Recompute the ICMP checksum over the modified payload.
    if (skb->ip_summed != CHECKSUM_PARTIAL) {
        /* Reloading "inside" here since manip_pkt may reallocate */
        inside = (void *)skb->data + hdrlen;
        inside->icmp.checksum = 0;
        inside->icmp.checksum =
            csum_fold(skb_checksum(skb, hdrlen,
                           skb->len - hdrlen, 0));
    }

    /* Change outer to look like the reply to an incoming packet */
    // Translate the outer packet.
    nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
    l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
    if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
        return 0;

    return 1;
}

ouyangxibao
189 声望161 粉丝

不生产代码,只是代码的搬运工