icmp协议相比于tcp,udp有其独特性:它介于网络层和传输层之间,没有传输层的源目的端口,所以在创建连接跟踪时需要进行特殊处理。另外ICMP主要用于传递差错报文,并不是所有icmp报文都是成对出现的,这些不同造成了icmp的处理与tcp,udp处理的不同。
ICMP简介
icmp报文有如下种类:
连接跟踪实现
报文类型
最多有18种icmp报文,每一种icmp报文可能会有一些子类。只有下面四种icmp报文是成对出现的。
/*
 * Lookup table indexed by ICMP type. A nonzero entry marks a request type
 * that has a matching reply; every type not listed here is implicitly 0.
 * icmp_new() carries its own copy of this table and uses it to decide
 * whether a packet of a given type may start a new tracked connection.
 */
static const u_int8_t valid_new[] = {
[ICMP_ECHO] = 1,
[ICMP_TIMESTAMP] = 1,
[ICMP_INFO_REQUEST] = 1,
[ICMP_ADDRESS] = 1
};
/* The request/reply pairing between ICMP types is described by this table. */
/*
 * Add 1; spaces filled with 0.
 *
 * The table is indexed by ICMP type and each entry holds the paired type
 * plus 1, so that 0 can mean "this type has no paired message". The offset
 * is needed because ICMP_ECHOREPLY has the value 0: without it, the inverse
 * of ICMP_ECHO would be indistinguishable from an unfilled slot.
 * icmp_invert_tuple() subtracts the 1 again when it builds the reply tuple.
 */
static const u_int8_t invmap[] = {
[ICMP_ECHO] = ICMP_ECHOREPLY + 1,
[ICMP_ECHOREPLY] = ICMP_ECHO + 1,
[ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
[ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
[ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
[ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
[ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
[ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1
};
因为只有这四对消息是成对的,所以连接跟踪只会为这四对消息进行连接跟踪。
五元组
icmp报文没有源目的端口,采用什么来填充tuple呢?
从下面代码可以看出,连接跟踪使用一个__be16 id来替代tuple中的源port,这里的id是icmp报文中的标识符,这四类消息都有,其中ping消息一般在其中填充ping程序的pid,所以同一台设备启动两个不同的ping程序ping同一个ip会生成两个会话:
/* The protocol-specific manipulable parts of the tuple: always in
 * network order.
 *
 * Every member is 16 bits wide, so `all` can be used to access the value
 * regardless of which protocol it belongs to.
 */
union nf_conntrack_man_proto {
/* Add other protocols here. */
__be16 all;
struct {
__be16 port;
} tcp;
struct {
__be16 port;
} udp;
/* ICMP has no ports; the 16-bit identifier from the ICMP header
 * (see icmp_pkt_to_tuple) is stored in this slot instead. */
struct {
__be16 id;
} icmp;
struct {
__be16 port;
} dccp;
struct {
__be16 port;
} sctp;
struct {
__be16 key; /* GRE key is 32bit, PPtP only uses 16bit */
} gre;
};
从下面代码可以看出,连接跟踪使用一个u_int8_t type, code;来替代tuple中的目的port:
/* This contains the information to distinguish a connection. */
struct nf_conntrack_tuple {
/* Source side of the tuple. */
struct nf_conntrack_man src;
/* These are the parts of the tuple which are fixed. */
struct {
/* Destination address. */
union nf_inet_addr u3;
union {
/* Add other protocols here. */
__be16 all;
struct {
__be16 port;
} tcp;
struct {
__be16 port;
} udp;
/* ICMP has no destination port; the message type and code
 * (see icmp_pkt_to_tuple) are stored in this slot instead. */
struct {
u_int8_t type, code;
} icmp;
struct {
__be16 port;
} dccp;
struct {
__be16 port;
} sctp;
struct {
__be16 key;
} gre;
} u;
/* The protocol. */
u_int8_t protonum;
/* The direction (for tuplehash) */
u_int8_t dir;
} dst;
};
下面我们看一下,icmp如何求一个tuple的反转tuple,是否像tcp,udp那样将源目的端口调换呢?
/* 反转五元组 */
static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig)
{
if (orig->dst.u.icmp.type >= sizeof(invmap) ||//判断类型是否超过了最大值,非法
!invmap[orig->dst.u.icmp.type])//判断该类型的icmp消息是否是成对的,使用0表示不成对,不成对则不处理。
return false;
//id依然填写到id的位置,没有被调换到type,code位置
tuple->src.u.icmp.id = orig->src.u.icmp.id;
//只是替换了type到其对应的type。
tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;//这里减了1,因为invmap中都加了1
//code不会变。因为这四对消息的code只有一个值0。详细请看前面的图片。
tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
return true;
}
从上面可以看出,icmp求反转tuple时,只会将type替换成对应的type(我们这里不涉及IP地址)
非成对icmp报文处理
icmp更多的是差错报文,它是用来通知源主机的一些错误信息的。它的产生往往是某台设备发送的报文在传输过程中出现了差错,在传输路径中设备或者目标主机设备检测到了差错,从而生成一个ICMP差错报文通知源主机。差错报文会在icmp报文头后添加一段导致该icmp报文的原始报文头信息。所以说icmp差错报文是一个连接的附属,连接跟踪将差错报文视为一个子连接(不会真实创建CT,而是依附于主连接,设置该报文的状态为IP_CT_RELATED或者IP_CT_RELATED_REPLY)。
连接跟踪处理如下几种差错报文:
ICMP_DEST_UNREACH //目的不可达
ICMP_SOURCE_QUENCH //源抑制,向源主机发送源抑制报文通知源主机减慢发送速度
ICMP_TIME_EXCEEDED //TTL超时,
ICMP_PARAMETERPROB //参数问题,
ICMP_REDIRECT //重定向,收到该差错的主机需要更新路由的下一跳,或者邻居(直连主机)
连接跟踪对于这几种差错报文,需要正确交给目标主机。处理的主要原因是NAT,后续详细说明。
ICMP协议控制块
/*
 * Layer-4 protocol descriptor that plugs ICMP (over IPv4) into connection
 * tracking. Each member points at the ICMP-specific handler for one step
 * of the conntrack pipeline.
 */
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
{
.l3proto = PF_INET,
.l4proto = IPPROTO_ICMP,
/* Extract type/code/id from a packet into a tuple. */
.pkt_to_tuple = icmp_pkt_to_tuple,
/* Derive the reply-direction tuple from the original one. */
.invert_tuple = icmp_invert_tuple,
/* Per-packet handling for an existing connection. */
.packet = icmp_packet,
.get_timeouts = icmp_get_timeouts,
/* Validation performed when a new connection is about to be created. */
.new = icmp_new,
/* Sanity checks and ICMP error-message handling. */
.error = icmp_error,
/* No destroy callback and no owning module are supplied. */
.destroy = NULL,
.me = NULL,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = icmp_tuple_to_nlattr,
.nlattr_tuple_size = icmp_nlattr_tuple_size,
.nlattr_to_tuple = icmp_nlattr_to_tuple,
.nla_policy = icmp_nla_policy,
#endif
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
.ctnl_timeout = {
.nlattr_to_obj = icmp_timeout_nlattr_to_obj,
.obj_to_nlattr = icmp_timeout_obj_to_nlattr,
.nlattr_max = CTA_TIMEOUT_ICMP_MAX,
.obj_size = sizeof(unsigned int),
.nla_policy = icmp_timeout_nla_policy,
},
#endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */
.init_net = icmp_init_net,
.get_net_proto = icmp_get_net_proto,
};
连接跟踪首先进行错误检验,执行error函数,对于icmp来说就是icmp_error函数
/* Small and modified version of icmp_rcv */
/*
 * Sanity-check an ICMP packet for connection tracking and hand ICMP error
 * messages over to icmp_error_message().
 *
 * Returns -NF_ACCEPT when the header is truncated, the checksum is bad or
 * the type is unknown; NF_ACCEPT for any ICMP type that is not one of the
 * handled error types; otherwise whatever icmp_error_message() returns.
 */
static int
icmp_error(struct net *net, struct nf_conn *tmpl,
	   struct sk_buff *skb, unsigned int dataoff,
	   u8 pf, unsigned int hooknum)
{
	struct icmphdr hdr_buf;
	const struct icmphdr *hdr;

	/* Not enough header? */
	hdr = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(hdr_buf),
				 &hdr_buf);
	if (!hdr) {
		icmp_error_log(skb, net, pf, "short packet");
		return -NF_ACCEPT;
	}

	/* See ip_conntrack_proto_tcp.c */
	/* The checksum is verified only when the sysctl enables it and only
	 * at the PRE_ROUTING hook. */
	if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING) {
		if (nf_ip_checksum(skb, hooknum, dataoff, 0)) {
			icmp_error_log(skb, net, pf, "bad hw icmp checksum");
			return -NF_ACCEPT;
		}
	}

	/*
	 *	18 is the highest 'known' ICMP type. Anything else is a mystery
	 *
	 *	RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently
	 *		  discarded.
	 */
	if (hdr->type > NR_ICMP_TYPES) {
		icmp_error_log(skb, net, pf, "invalid icmp type");
		return -NF_ACCEPT;
	}

	/* Need to track icmp error message? */
	switch (hdr->type) {
	case ICMP_DEST_UNREACH:
	case ICMP_SOURCE_QUENCH:
	case ICMP_TIME_EXCEEDED:
	case ICMP_PARAMETERPROB:
	case ICMP_REDIRECT:
		/* ICMP error message: associate it with its parent connection. */
		return icmp_error_message(net, tmpl, skb, hooknum);
	default:
		/* Not an error message: nothing more to check here. */
		return NF_ACCEPT;
	}
}
/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
/*
 * Handle an ICMP error message: use the original packet header embedded
 * after the ICMP header to find the connection that triggered the error,
 * then attach this skb to that connection as a RELATED packet.
 *
 * Returns -NF_ACCEPT when the embedded tuple cannot be extracted or
 * inverted, or when no matching connection exists; NF_ACCEPT once the skb
 * has been associated with the parent connection.
 */
static int
icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
unsigned int hooknum)
{
struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_l4proto *innerproto;
const struct nf_conntrack_tuple_hash *h;
const struct nf_conntrack_zone *zone;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
/* The skb is not expected to carry a conntrack entry yet. */
WARN_ON(skb_nfct(skb));
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */
/* Parse the embedded (inner) packet, which starts right after the outer
 * IP header and the ICMP header, into origtuple. */
if (!nf_ct_get_tuplepr(skb,
skb_network_offset(skb) + ip_hdrlen(skb)
+ sizeof(struct icmphdr),
PF_INET, net, &origtuple)) {
pr_debug("icmp_error_message: failed to get tuple\n");
return -NF_ACCEPT;
}
/* rcu_read_lock()ed by nf_hook_thresh */
/* Look up the layer-4 protocol handler of the inner packet. */
innerproto = __nf_ct_l4proto_find(PF_INET, origtuple.dst.protonum);
/* Ordinarily, we'd expect the inverted tupleproto, but it's
 * been preserved inside the ICMP.
 * Compute the inverse of the inner packet's tuple.
 */
if (!nf_ct_invert_tuple(&innertuple, &origtuple,
&nf_conntrack_l3proto_ipv4, innerproto)) {
pr_debug("icmp_error_message: no match\n");
return -NF_ACCEPT;
}
/* This is a per-packet state: the packet is related to a connection. */
ctinfo = IP_CT_RELATED;
/* Look up the parent connection using the inverted tuple. The ICMP error
 * is a response to the embedded packet, so relative to this ICMP packet
 * the embedded header describes the opposite direction. */
h = nf_conntrack_find_get(net, zone, &innertuple);
if (!h) {
pr_debug("icmp_error_message: no match\n");
return -NF_ACCEPT;
}
/* If the matched hash entry is the reply-direction one, the state
 * becomes IP_CT_RELATED_REPLY. */
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
ctinfo += IP_CT_IS_REPLY;
/* Update skb to refer to this connection */
/* Attach the skb to the parent connection with state IP_CT_RELATED or
 * IP_CT_RELATED_REPLY. */
nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
/* The packet is now tagged with its parent connection; accept it. */
return NF_ACCEPT;
}
/* Once NF_ACCEPT is returned for an error message, the skb already carries
 * a conntrack entry, so connection tracking of this packet is finished.
 * See nf_conntrack_in() for the details. */
icmp_pkt_to_tuple
/* 提取icmp的五元组,只有成对报文才会 */
static bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
struct net *net, struct nf_conntrack_tuple *tuple)
{
const struct icmphdr *hp;
struct icmphdr _hdr;
hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (hp == NULL)
return false;
tuple->dst.u.icmp.type = hp->type;/* 类型 */
tuple->src.u.icmp.id = hp->un.echo.id;/* id号,ping报文为进程id */
tuple->dst.u.icmp.code = hp->code;/* 代码,一般为0 */
return true;
}
icmp_invert_tuple
/* 反转五元组 */
static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig)
{
if (orig->dst.u.icmp.type >= sizeof(invmap) ||//判断类型是否超过了最大值,非法
!invmap[orig->dst.u.icmp.type])//判断该类型的icmp消息是否是成对的,使用0表示不成对,不成对则不处理。
return false;
//id依然填写到id的位置,没有被调换到type,code位置
tuple->src.u.icmp.id = orig->src.u.icmp.id;
//只是替换了type到其对应的type。
tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;//这里减了1,因为invmap中都加了1
//code不会变。因为这四对消息的code只有一个值0。详细请看前面的图片。
tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
return true;
}
icmp_new
非差错报文的请求方向报文会被该函数处理,主要是进行合法性校验:只有四种请求类型的报文才允许新建连接。注意icmp_error函数只检查了类型是否超过最大值,并没有检查这一点。
/* Called when a new connection for this protocol found. */
/*
 * Decide whether the packet may start a new ICMP "connection".
 *
 * Only the four request types listed in valid_new qualify. For any other
 * type the tuple is dumped for debugging and false is returned.
 */
static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
		     unsigned int dataoff, unsigned int *timeouts)
{
	/* Indexed by ICMP type; nonzero marks a type allowed to open a
	 * connection. Unlisted types are implicitly 0. */
	static const u_int8_t valid_new[] = {
		[ICMP_ECHO] = 1,
		[ICMP_TIMESTAMP] = 1,
		[ICMP_INFO_REQUEST] = 1,
		[ICMP_ADDRESS] = 1
	};
	const u_int8_t type = ct->tuplehash[0].tuple.dst.u.icmp.type;

	if (type < sizeof(valid_new) && valid_new[type])
		return true;

	/* Can't create a new ICMP `conn' with this. */
	pr_debug("icmp: can't create new conn with type %u\n", type);
	nf_ct_dump_tuple_ip(&ct->tuplehash[0].tuple);
	return false;
}
icmp_packet
非差错报文的应答方向报文会被该函数处理,主要是进行超时更新和报文统计。
/* Returns verdict for packet, or -1 for invalid. */
/*
 * Per-packet handler for a tracked ICMP connection: refresh the
 * connection's timeout (with accounting) and accept the packet.
 */
static int icmp_packet(struct nf_conn *ct,
		       const struct sk_buff *skb,
		       unsigned int dataoff,
		       enum ip_conntrack_info ctinfo,
		       unsigned int *timeout)
{
	const unsigned int lifetime = *timeout;

	/* Do not immediately delete the connection after the first
	   successful reply to avoid excessive conntrackd traffic
	   and also to handle correctly ICMP echo reply duplicates. */
	nf_ct_refresh_acct(ct, ctinfo, skb, lifetime);

	return NF_ACCEPT;
}
icmp_get_timeouts
icmp 连接跟踪超时时间获取,一般是30秒。
/* 获取该命名空间的icmp连接跟踪的超时时间 */
static unsigned int *icmp_get_timeouts(struct net *net)
{
return &icmp_pernet(net)->timeout;
}
/*
 * Per-namespace initialisation for ICMP conntrack: set the namespace's
 * timeout to the default and duplicate the sysctl table.
 * Returns the result of icmp_kmemdup_sysctl_table().
 */
static int icmp_init_net(struct net *net, u_int16_t proto)
{
	struct nf_icmp_net *in = icmp_pernet(net);

	in->timeout = nf_ct_icmp_timeout;

	return icmp_kmemdup_sysctl_table(&in->pn, in);
}
/* Default lifetime of an ICMP conntrack entry: 30 seconds, in jiffies. */
static const unsigned int nf_ct_icmp_timeout = 30*HZ;
ICMP对NAT的支持
非差错报文对nat的支持
icmp报文对nat的支持实际更多的是网络层的支持,对于icmp报文本身来说只有一个标识符可以改变,不过很少有场景要改变标识符的。下面就代码简单的分析一下.
icmp nat控制块
/*
 * Layer-4 NAT descriptor for ICMP. For ICMP the only field NAT can
 * rewrite is the identifier stored in the tuple's source slot.
 */
const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
.l4proto = IPPROTO_ICMP,
/* Write the chosen identifier into the packet and fix the checksum. */
.manip_pkt = icmp_manip_pkt,
/* Test whether a tuple's identifier lies within a given range. */
.in_range = icmp_in_range,
/* Pick an identifier that makes the tuple unique. */
.unique_tuple = icmp_unique_tuple,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
#endif
};
icmp_in_range
判断icmp的标识符是否在指定的范围中。
static bool
icmp_in_range(const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype,
const union nf_conntrack_man_proto *min,
const union nf_conntrack_man_proto *max)
{
return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
}
icmp_unique_tuple
分配一个标识符,使得五元组唯一。
/*
 * Choose an ICMP identifier for @tuple so that the tuple is not already
 * in use.
 *
 * Candidates are taken from the configured range (or from a span of
 * 0xFFFF values when no protocol range was specified), starting at a
 * position remembered across calls in the function-static counter. The
 * search stops at the first unused tuple, or after one attempt per value
 * in the span, in which case the last candidate is left in @tuple.
 */
static void
icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
		  struct nf_conntrack_tuple *tuple,
		  const struct nf_nat_range *range,
		  enum nf_nat_manip_type maniptype,
		  const struct nf_conn *ct)
{
	static u_int16_t id;
	const unsigned int base = ntohs(range->min_proto.icmp.id);
	unsigned int span;
	unsigned int tries = 0;

	/* If no range specified, fall back to a span of 0xFFFF. */
	if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)
		span = ntohs(range->max_proto.icmp.id) - base + 1;
	else
		span = 0xFFFF;

	for (;;) {
		tuple->src.u.icmp.id = htons(base + (id % span));

		/* Every value in the span has been tried: give up. */
		if (++tries == span)
			return;

		/* Found an identifier that yields an unused tuple. */
		if (!nf_nat_used_tuple(tuple, ct))
			return;

		++id;
	}
}
icmp_manip_pkt
将选择的标识符替换掉原来的标识符,更新校验码。
static bool
icmp_manip_pkt(struct sk_buff *skb,
const struct nf_nat_l3proto *l3proto,
unsigned int iphdroff, unsigned int hdroff,
const struct nf_conntrack_tuple *tuple,
enum nf_nat_manip_type maniptype)
{
struct icmphdr *hdr;
if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
return false;
hdr = (struct icmphdr *)(skb->data + hdroff);
inet_proto_csum_replace2(&hdr->checksum, skb,
hdr->un.echo.id, tuple->src.u.icmp.id, false);
hdr->un.echo.id = tuple->src.u.icmp.id;
return true;
}
差错报文对NAT的支持,常说的ICMP ALG
差错报文的内层报文信息来自于产生差错的报文。当一个主机发送一个报文经过NAT后,其报文头发生了改变。也就是说,检测到该报文有差错的设备看到的报文是经过NAT后的报文,所以NAT需要将内层报文还原回原来的报文再转发给源主机。
/*
 * IPv4 NAT hook entry point (excerpt: the remainder of the function is
 * elided in this listing). The part shown covers packets whose conntrack
 * state is RELATED, which is how ICMP error messages arrive here.
 */
unsigned int
nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state,
unsigned int (*do_chain)(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state,
struct nf_conn *ct))
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
ct = nf_ct_get(skb, &ctinfo);
/* Can't track? It's not due to stress, or conntrack would
 * have dropped it. Hence it's the user's responsibilty to
 * packet filter it out, or implement conntrack/NAT for that
 * protocol. 8) --RR
 */
if (!ct)
return NF_ACCEPT;
nat = nfct_nat(ct);
switch (ctinfo) {
case IP_CT_RELATED: /* ICMP error messages carry one of these two states */
case IP_CT_RELATED_REPLY:
/* Special handling for ICMP: a packet in this state is an ICMP error
 * message. Its conntrack entry was chosen from the original packet it
 * embeds, and that embedded packet has to be translated as well. */
if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
/* Both the embedded (inner) packet and the outer packet are translated
 * here; processing of this packet ends with the return below. */
if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
state->hook))
return NF_DROP;
else
return NF_ACCEPT;
}
...
}
/*
 * Apply NAT to an ICMP error message that is RELATED to connection @ct:
 * first the packet header embedded in the ICMP payload is translated,
 * then the ICMP checksum is recomputed, then the outer header is
 * translated.
 *
 * Returns 1 on success or when the connection has no NAT of the relevant
 * kind, and 0 on failure; the caller in nf_nat_ipv4_fn() drops the packet
 * when 0 is returned.
 */
int nf_nat_icmp_reply_translation(struct sk_buff *skb,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int hooknum)
{
/* Layout of the data following the outer IP header: the ICMP header
 * immediately followed by the embedded IP header. */
struct {
struct icmphdr icmp;
struct iphdr ip;
} *inside;
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
unsigned int hdrlen = ip_hdrlen(skb);
const struct nf_nat_l4proto *l4proto;
struct nf_conntrack_tuple target;
unsigned long statusbit;
WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
return 0;
if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
return 0;
/* Point at the start of the ICMP header. */
inside = (void *)skb->data + hdrlen;
if (inside->icmp.type == ICMP_REDIRECT) { /* ICMP redirect message */
/* Fail unless NAT setup has completed for the connection... */
if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
return 0;
/* ...and fail if the connection is actually NATed. */
if (ct->status & IPS_NAT_MASK)
return 0;
}
if (manip == NF_NAT_MANIP_SRC)
statusbit = IPS_SRC_NAT;
else
statusbit = IPS_DST_NAT;
/* Invert if this is reply direction */
if (dir == IP_CT_DIR_REPLY)
statusbit ^= IPS_NAT_MASK;
/* The parent connection has no NAT of this kind: nothing to do. */
if (!(ct->status & statusbit))
return 1;
/* Look up the layer-4 NAT handler for the embedded packet's protocol. */
l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
/* Translate the embedded packet, which starts right after the ICMP
 * header, using the opposite direction's tuple and the opposite
 * manipulation type. */
if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
l4proto, &ct->tuplehash[!dir].tuple, !manip))
return 0;
/* Recompute the ICMP checksum over the modified payload. */
if (skb->ip_summed != CHECKSUM_PARTIAL) {
/* Reloading "inside" here since manip_pkt may reallocate */
inside = (void *)skb->data + hdrlen;
inside->icmp.checksum = 0;
inside->icmp.checksum =
csum_fold(skb_checksum(skb, hdrlen,
skb->len - hdrlen, 0));
}
/* Change outer to look like the reply to an incoming packet */
/* Translate the outer header; layer-4 protocol 0 is looked up so the
 * ICMP header itself is handled by that handler rather than the ICMP
 * one. NOTE(review): exact behaviour of the protocol-0 handler is not
 * visible here -- confirm. */
nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
return 0;
return 1;
}
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。