简介
目前一个连接跟踪的五元组为源目的IP,传输层协议,源目的端口。多租户环境下,租户的私有地址网络可能存在重叠,如果只用这五个元素来区分一个CT的话,无法满足多租户的需求。所以引入zone的概念,zone是一个16bit的整型数,不同用户使用不同的id,从而保证租户之间的隔离。
实现
连接跟踪控制块中的zone成员:
struct nf_conn {
/* Usage count in here is 1 for hash table, 1 per skb,
* plus 1 for any connection(s) we are `master' for
*
* Hint, SKB address this struct and refcnt via skb->_nfct and
* helpers nf_conntrack_get() and nf_conntrack_put().
* Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt,
* beware nf_ct_get() is different and don't inc refcnt.
*/
struct nf_conntrack ct_general;
spinlock_t lock;
u16 cpu;
// Conntrack zone of this entry; the member only exists when
// CONFIG_NF_CONNTRACK_ZONES is enabled.
#ifdef CONFIG_NF_CONNTRACK_ZONES
struct nf_conntrack_zone zone;
#endif
// (remaining members are omitted in this excerpt)
...
};
zone定义
// Describes the conntrack zone a connection belongs to.
struct nf_conntrack_zone {
u16 id;// zone identifier
u8 flags;// flags; per the original note NF_CT_FLAG_MARK is currently the only one: when set, skb->mark is used as the zone id, otherwise the id member is used
u8 dir;// direction bitmask the zone applies to; the default covers both directions, i.e. request and reply packets received on an interface use the same zone id (the most common case)
// See the NF_CT_DEFAULT_ZONE_DIR macro.
};
// Default direction mask: the zone applies to both the original and the reply direction.
#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
/*
 * Pick the zone to use for a packet that may carry a conntrack template.
 *
 * With CONFIG_NF_CONNTRACK_ZONES: a missing template yields the built-in
 * default zone; a template flagged NF_CT_FLAG_MARK yields a zone built in
 * @tmp from skb->mark and the template's direction mask; any other template
 * yields its own zone via nf_ct_zone().
 * Without zone support the result is whatever nf_ct_zone() returns.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb,
		struct nf_conntrack_zone *tmp)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
	if (tmpl) {
		/* The zone id comes from the packet mark, not from the template. */
		if (tmpl->zone.flags & NF_CT_FLAG_MARK)
			return nf_ct_zone_init(tmp, skb->mark,
					       tmpl->zone.dir, 0);
	} else {
		return &nf_ct_zone_dflt;
	}
#endif
	return nf_ct_zone(tmpl);
}
/*
 * Return the zone of conntrack entry @ct. When CONFIG_NF_CONNTRACK_ZONES is
 * not set, the built-in default zone is returned and @ct is not accessed.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone(const struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;

#ifdef CONFIG_NF_CONNTRACK_ZONES
	zone = &ct->zone;
#endif
	return zone;
}
/*
 * Fill in a caller-provided zone with the given id, direction mask and
 * flags, and return that same zone pointer.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags)
{
	zone->dir = dir;
	zone->flags = flags;
	zone->id = id;

	return zone;
}
默认的连接跟踪zone定义
/* Built-in default zone used e.g. by modules. */
/* It carries zone id NF_CT_DEFAULT_ZONE_ID and the direction mask NF_CT_DEFAULT_ZONE_DIR. */
const struct nf_conntrack_zone nf_ct_zone_dflt = {
.id = NF_CT_DEFAULT_ZONE_ID,
.dir = NF_CT_DEFAULT_ZONE_DIR,
};
EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
/* Zone id of the default zone. */
#define NF_CT_DEFAULT_ZONE_ID 0
/* Direction bits: one bit per conntrack direction, shifted by the direction's enum value. */
#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL)
#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY)
/* Default direction mask: both the original and the reply direction. */
#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
zone的常见操作函数
/*
 * Return the zone of conntrack entry @ct. When CONFIG_NF_CONNTRACK_ZONES is
 * not set, the built-in default zone is returned and @ct is not accessed.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone(const struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;

#ifdef CONFIG_NF_CONNTRACK_ZONES
	zone = &ct->zone;
#endif
	return zone;
}
/*
 * Fill in a caller-provided zone with the given id, direction mask and
 * flags, and return that same zone pointer.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags)
{
	zone->dir = dir;
	zone->flags = flags;
	zone->id = id;

	return zone;
}
/*
 * Pick the zone to use for a packet that may carry a conntrack template.
 *
 * With CONFIG_NF_CONNTRACK_ZONES: a missing template yields the built-in
 * default zone; a template flagged NF_CT_FLAG_MARK yields a zone built in
 * @tmp from skb->mark and the template's direction mask; any other template
 * yields its own zone via nf_ct_zone().
 * Without zone support the result is whatever nf_ct_zone() returns.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb,
		struct nf_conntrack_zone *tmp)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
	if (tmpl) {
		/* The zone id comes from the packet mark, not from the template. */
		if (tmpl->zone.flags & NF_CT_FLAG_MARK)
			return nf_ct_zone_init(tmp, skb->mark,
					       tmpl->zone.dir, 0);
	} else {
		return &nf_ct_zone_dflt;
	}
#endif
	return nf_ct_zone(tmpl);
}
// Store a copy of *zone in the conntrack entry ct.
// Does nothing when CONFIG_NF_CONNTRACK_ZONES is not set.
static inline void nf_ct_zone_add(struct nf_conn *ct,
const struct nf_conntrack_zone *zone)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
ct->zone = *zone;
#endif
}
/*
 * Tell whether @zone applies to direction @dir, i.e. whether the bit for
 * @dir is set in the zone's direction mask.
 */
static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone,
					  enum ip_conntrack_dir dir)
{
	return (zone->dir & (1 << dir)) != 0;
}
/*
 * Get the zone id that applies to direction @dir: zone->id when the zone
 * covers that direction, NF_CT_DEFAULT_ZONE_ID otherwise. Without
 * CONFIG_NF_CONNTRACK_ZONES the result is always NF_CT_DEFAULT_ZONE_ID.
 */
static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone,
				enum ip_conntrack_dir dir)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
	if (nf_ct_zone_matches_dir(zone, dir))
		return zone->id;
#endif
	return NF_CT_DEFAULT_ZONE_ID;
}
/*
 * Check whether the zone of conntrack @a and the zone @b resolve to the same
 * zone id for direction @dir. Always true without CONFIG_NF_CONNTRACK_ZONES.
 */
static inline bool nf_ct_zone_equal(const struct nf_conn *a,
				    const struct nf_conntrack_zone *b,
				    enum ip_conntrack_dir dir)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
	const u16 id_a = nf_ct_zone_id(nf_ct_zone(a), dir);
	const u16 id_b = nf_ct_zone_id(b, dir);

	return id_a == id_b;
#else
	return true;
#endif
}
/*
 * Check whether the zone of conntrack @a and the zone @b have the same id,
 * ignoring the direction masks. Always true without
 * CONFIG_NF_CONNTRACK_ZONES.
 */
static inline bool nf_ct_zone_equal_any(const struct nf_conn *a,
					const struct nf_conntrack_zone *b)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
	const struct nf_conntrack_zone *zone_a = nf_ct_zone(a);

	return zone_a->id == b->id;
#else
	return true;
#endif
}
ZONE的使用
通过将设备映射到不同的zone,实现租户流量到zone的映射,也可以使用iptables的mark功能设置流量的zone。Linux使用CT target来设置流量的zone。CT target规则会在内核中创建一个连接跟踪模板,命中该规则的报文会被关联到这个模板CT上;首包在创建CT的时候会以模板为参考进行初始化,从而将我们在CT target中设置的参数传递给连接跟踪。
CT
The CT target sets parameters for a packet or its associated connection. The target attaches a "template" connection tracking entry to the packet, which is then used by the conntrack core
when initializing a new ct entry. This target is thus only valid in the "raw" table.
--notrack
Disables connection tracking for this packet.
--helper name
Use the helper identified by name for the connection. This is more flexible than loading the conntrack helper modules with preset ports.
--ctevents event[,...]
Only generate the specified conntrack events for this connection. Possible event types are: new, related, destroy, reply, assured, protoinfo, helper, mark (this refers to the ctmark,
not nfmark), natseqinfo, secmark (ctsecmark).
--expevents event[,...]
Only generate the specified expectation events for this connection. Possible event types are: new.
--zone-orig {id|mark}
For traffic coming from ORIGINAL direction, assign this packet to zone id and only have lookups done in that zone. If mark is used instead of id, the zone is derived from the packet
nfmark.
--zone-reply {id|mark}
For traffic coming from REPLY direction, assign this packet to zone id and only have lookups done in that zone. If mark is used instead of id, the zone is derived from the packet
nfmark.
--zone {id|mark}
Assign this packet to zone id and only have lookups done in that zone. If mark is used instead of id, the zone is derived from the packet nfmark. By default, packets have zone 0.
This option applies to both directions.
--timeout name
Use the timeout policy identified by name for the connection. This provides more flexible timeout policy definition than global timeout values available at
/proc/sys/net/netfilter/nf_conntrack_*_timeout_*.
案例
sudo iptables -t raw -A PREROUTING -i ens39 -j CT --zone 2
#该命令将ens39网口收到的报文映射到zone 2中,实现不同接口收到的流量连接跟踪隔离。
CT target实现分析
// Rule data of the CT target.
struct xt_ct_target_info_v1 {
__u16 flags;// XT_CT_* flag bits, see the enum below
__u16 zone;// zone id
__u32 ct_events;
__u32 exp_events;
char helper[16];
char timeout[32];// user-defined timeout setting; NOTE(review): the CT man page describes --timeout as taking a policy name, so this presumably holds that name
/* Used internally by the kernel */
/* conntrack template */
struct nf_conn *ct __attribute__((aligned(8)));
};
// Flag bits stored in xt_ct_target_info_v1.flags.
enum {
XT_CT_NOTRACK = 1 << 0,// --notrack was given
XT_CT_NOTRACK_ALIAS = 1 << 1,// original note: also means --notrack was given; NOTE(review): how it differs from XT_CT_NOTRACK is not visible here
XT_CT_ZONE_DIR_ORIG = 1 << 2,// the zone applies to the original (request) direction
XT_CT_ZONE_DIR_REPL = 1 << 3,// the zone applies to the reply direction; both direction flags may be set together
XT_CT_ZONE_MARK = 1 << 4,// the zone id is taken from the packet nfmark
XT_CT_MASK = XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS |
XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL |
XT_CT_ZONE_MARK,// all flag bits defined above
};
构建模板
/*
 * Validate the flags of a CT rule, then let xt_ct_tg_check() do the rest.
 * Returns -EINVAL when the rule carries any flag bit outside XT_CT_MASK,
 * otherwise whatever xt_ct_tg_check() returns.
 */
static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par)
{
	struct xt_ct_target_info_v1 *rule = par->targinfo;

	/* A bit outside XT_CT_MASK is a flag this code does not know about. */
	if ((rule->flags & ~XT_CT_MASK) != 0)
		return -EINVAL;

	return xt_ct_tg_check(par, rule);
}
// Build the conntrack template for a CT rule from the rule's flags and zone
// settings. This is an excerpt: the tail of the function, including the
// out/err1/err2 labels, is elided.
static int xt_ct_tg_check(const struct xt_tgchk_param *par,
struct xt_ct_target_info_v1 *info)
{
struct nf_conntrack_zone zone;
struct nf_conn_help *help;
struct nf_conn *ct;
int ret = -EOPNOTSUPP;
// --notrack: no template is allocated, ct stays NULL.
if (info->flags & XT_CT_NOTRACK) {
ct = NULL;
goto out;
}
// Without zone support any zone-related option is rejected; ret still
// holds -EOPNOTSUPP at this point.
#ifndef CONFIG_NF_CONNTRACK_ZONES
if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG |
XT_CT_ZONE_DIR_REPL |
XT_CT_ZONE_MARK))
goto err1;
#endif
ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
goto err1;
// Describe the requested zone: id and direction mask come from the rule,
// and NF_CT_FLAG_MARK is set when the zone is to be taken from the mark.
memset(&zone, 0, sizeof(zone));
zone.id = info->zone;
zone.dir = xt_ct_flags_to_dir(info);
if (info->flags & XT_CT_ZONE_MARK)
zone.flags |= NF_CT_FLAG_MARK;
// Allocate the conntrack template for that zone.
ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL);
if (!ct) {
ret = -ENOMEM;
goto err2;
}
...
return ret;
}
执行target
/*
 * Target handler: take the conntrack template stored in the matched rule
 * and apply it to the packet via xt_ct_target().
 */
static unsigned int xt_ct_target_v1(struct sk_buff *skb,
				    const struct xt_action_param *par)
{
	/* Rule data attached to this target invocation. */
	const struct xt_ct_target_info_v1 *rule = par->targinfo;

	/* rule->ct is the rule's conntrack template. */
	return xt_ct_target(skb, rule->ct);
}
/*
 * Attach the conntrack template @ct to @skb, or mark the packet untracked
 * when there is no template. Packets that already carry conntrack state are
 * left alone. Always returns XT_CONTINUE.
 */
static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
{
	/* Only touch packets not seen before (e.g. not looped back). */
	if (skb->_nfct == 0) {
		if (!ct) {
			nf_ct_set(skb, ct, IP_CT_UNTRACKED);
		} else {
			/* The skb holds its own reference on the template. */
			atomic_inc(&ct->ct_general.use);
			nf_ct_set(skb, ct, IP_CT_NEW);
		}
	}

	return XT_CONTINUE;
}
连接跟踪对模板的处理
// Conntrack entry point for a packet (excerpt: only the handling of a
// pre-attached template / untracked state is shown, the rest is elided).
unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
struct sk_buff *skb)
{
const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
struct nf_conn *ct, *tmpl;
enum ip_conntrack_info ctinfo;
unsigned int *timeouts;
unsigned int dataoff;
u_int8_t protonum;
int ret;
tmpl = nf_ct_get(skb, &ctinfo);// fetch the template (if any) attached to the skb
if (tmpl || ctinfo == IP_CT_UNTRACKED) {
/* Previously seen (loopback or untracked)? Ignore. */
/* Original note: a packet the host sends to itself goes through lo and
** re-enters the stack at PREROUTING; it was already tracked on the
** output path, so it is accepted here directly.
** Only such loopback packets (pinging any local address) carry a
** conntrack that is not a template.
** Packets set to IP_CT_UNTRACKED by the CT action also return here.
*/
if ((tmpl && !nf_ct_is_template(tmpl)) ||// if a zone was set, tmpl exists and nf_ct_is_template() is true
ctinfo == IP_CT_UNTRACKED) {
NF_CT_STAT_INC_ATOMIC(net, ignore);
return NF_ACCEPT;
}
// Clear the skb's conntrack reference; the template stays reachable through tmpl.
skb->_nfct = 0;
}
...
return ret;
}
/* On success, returns 0, sets skb->_nfct | ctinfo */
/* Excerpt: the code after the lookup/creation step is elided. */
static int
resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb,
unsigned int dataoff,
u_int16_t l3num,
u_int8_t protonum,
const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
struct nf_conn *ct;
u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
dataoff, l3num, protonum, net, &tuple, l3proto,
l4proto)) {
pr_debug("Can't get tuple\n");
return 0;
}
/* look for tuple match; the lookup uses the zone derived from the template */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
hash = hash_conntrack_raw(&tuple, net);
h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
// Not found: set up a new conntrack via init_conntrack()
// (original note: expectations are looked up on this path).
h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
skb, dataoff, hash);
if (!h)
return 0;
if (IS_ERR(h))
return PTR_ERR(h);
}
...
return 0;
}
/* Allocate a new conntrack: we return -ENOMEM if classification
failed due to stress. Otherwise it really is unclassifiable. */
/* Excerpt: everything after the allocation is elided. */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
unsigned int dataoff, u32 hash)
{
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
struct nf_conntrack_ecache *ecache;
struct nf_conntrack_expect *exp = NULL;
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
struct nf_conntrack_zone tmp;
unsigned int *timeouts;
if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
pr_debug("Can't invert tuple.\n");
return NULL;
}
// The new conntrack is also initialised with the zone derived from the template.
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
// Allocate the conntrack entry.
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
hash);
...
}
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。