简介
ovs分类器中的前缀树是一个过滤器,将规则的网络层的源目的IP构建前缀树。在查找时,先使用Trie进行匹配,如果不能命中的话,说明整个子表都不会命中,跳过该子表,加速匹配过程。其数据结构就是一个二叉树。
Trie属于分类器classifier,一个分类器classifier可以有多个Trie,典型的是两个:IPV4源目的IP。
数据结构
typedef OVSRCU_TYPE(struct trie_node *) rcu_trie_ptr;
/* A longest-prefix match tree. */
/* 一个最长匹配的前缀树,前缀树节点 */
struct trie_node {
uint32_t prefix; /* Prefix bits for this node, MSB first. 前缀,msb优先*/
uint8_t n_bits; /* Never zero, except for the root node. 前缀长度,用于限制上面的prefix */
unsigned int n_rules; /* Number of rules that have this prefix. 拥有这个前缀的规则数 */
rcu_trie_ptr edges[2]; /* Both NULL if leaf. 叶子的话,两边的都为空,
树的两个节点,左右节点,因为一个bit只有两个值,
要么0,要么1,所以是二叉树 */
};
/* Prefix trie for a 'field' */
/* 为field域构建前缀树 */
struct cls_trie {
const struct mf_field *field; /* Trie field, or NULL. 构成该树的报文域 */
rcu_trie_ptr root; /* NULL if none. 树根 */
};
函数
trie_init
该函数用于初始化一个Trie
/* 进行分类器的前缀树的初始化:1计算前缀长度,2将已经存在的规则加入该前缀树 */
static void
trie_init(struct classifier *cls, int trie_idx, const struct mf_field *field)
{
struct cls_trie *trie = &cls->tries[trie_idx];/* 从前缀树数组中获取指定的树描述控制块 */
struct cls_subtable *subtable;/* 子表 */
/* 先将以前的树进行删除 */
if (trie_idx < cls->n_tries) {
trie_destroy(&trie->root);
} else {
ovsrcu_set_hidden(&trie->root, NULL);
}
//设置该前缀树对应的域
trie->field = field;
/* Add existing rules to the new trie. */
/* 添加已经存在的规则到新树中 */
CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {/* 遍历每一个子表 */
unsigned int plen;/* 前缀长度局部变量 */
//计算前缀树的前缀长度
plen = field ? minimask_get_prefix_len(&subtable->mask, field) : 0;/* 获取匹配域的掩码的长度 */
if (plen) {/* 存在前缀长度,那么将该子表下的规则插入该前缀树 */
struct cls_match *head;
CMAP_FOR_EACH (head, cmap_node, &subtable->rules) {/* 遍历每一个规则 */
trie_insert(trie, head->cls_rule, plen);/* 将规则插入前缀树 */
}
}
/* Initialize subtable's prefix length on this field. This will
* allow readers to use the trie. */
atomic_thread_fence(memory_order_release);
subtable->trie_plen[trie_idx] = plen; /* 设置子表的前缀树的前缀长度,子表的前缀长度是确定的 */
}
}
trie_insert
插入一个规则到前缀树
/* 插入的是前缀掩码 */
/* edge为当前节点 */
static void
trie_insert_prefix(rcu_trie_ptr *edge, const ovs_be32 *prefix, int mlen)/* 前缀与前缀长度 */
{
struct trie_node *node;
int ofs = 0;
/* Walk the tree. */
/* 遍历整棵树 */
for (; (node = ovsrcu_get_protected(struct trie_node *, edge));/* 从根节点开始 */
edge = trie_next_edge(node, prefix, ofs)) {/* 根据前缀bit的值为1还是0获取下一个节点 */
unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);/* */
ofs += eqbits;/* 得到需要插入的前缀与本节点相同的bit个数 */
if (eqbits < node->n_bits) {/* bit相等的个数比该节点的前缀bit个数要小,说明两者不相等 */
/* Mismatch, new node needs to be inserted above. 所以需要新建一个节点*/
int old_branch = get_bit_at(node->prefix, eqbits);/* 从相等的位置开始一个新的分支 */
struct trie_node *new_parent;
/* 创建一个新的分支 */
new_parent = trie_branch_create(prefix, ofs - eqbits, eqbits,
ofs == mlen ? 1 : 0);
/* Copy the node to modify it. */
/* 修改一个节点 */
node = trie_node_rcu_realloc(node);
/* Adjust the new node for its new position in the tree. */
node->prefix <<= eqbits; /* 去掉相同的几个bit,因为这几个bit分给新的父节点 */
node->n_bits -= eqbits; /* 前缀个数减小,因为新的节点已经占用了一部分 */
ovsrcu_set_hidden(&new_parent->edges[old_branch], node);/* 原来的节点作为这个新的节点的子节点 */
/* Check if need a new branch for the new rule. */
if (ofs < mlen) {
ovsrcu_set_hidden(&new_parent->edges[!old_branch],
trie_branch_create(prefix, ofs, mlen - ofs,
1));
}
ovsrcu_set(edge, new_parent); /* Publish changes. */
return;
}
/* Full match so far. */
if (ofs == mlen) {
/* Full match at the current node, rule needs to be added here. */
node->n_rules++;
return;
}
}
/* Must insert a new tree branch for the new rule. */
ovsrcu_set(edge, trie_branch_create(prefix, ofs, mlen - ofs, 1));
}
/*
* This is called only when mask prefix is known to be CIDR and non-zero.
* Relies on the fact that the flow and mask have the same map, and since
* the mask is CIDR, the storage for the flow field exists even if it
* happened to be zeros.
* 获取指定的前缀。
*/
static const ovs_be32 *
minimatch_get_prefix(const struct minimatch *match, const struct mf_field *mf)
{
size_t u64_ofs = mf->flow_be32ofs / 2;/* 4字节单位偏移转换成8个字节单位偏移 */
return (OVS_FORCE const ovs_be32 *)miniflow_get__(match->flow, u64_ofs)
+ (mf->flow_be32ofs & 1);
}
/* Insert rule in to the prefix tree.
* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
* in 'rule'.
* 插入一个规则到前缀树,前缀掩码必须不为0
*/
static void
trie_insert(struct cls_trie *trie/* 需要插入的树 */, const struct cls_rule *rule/* 需要插入的规则 */, int mlen/* 掩码长度 */)
{
trie_insert_prefix(&trie->root,/* 根节点 */
minimatch_get_prefix(&rule->match, trie->field)/* 获取匹配域的掩码起始地址 */, mlen/* 掩码长度 */);
}
trie_remove
从前缀树中删除一个规则
/* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
* in 'rule'. 移除规则cls_rule对应的前缀*/
static void
trie_remove(struct cls_trie *trie,/* 规则的前缀树 */
const struct cls_rule *rule, /* 要移除的规则描述 */
int mlen)/* 掩码长度 */
{
trie_remove_prefix(&trie->root,
minimatch_get_prefix(&rule->match, trie->field), mlen);
}
/* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
* in 'rule'. */
static void
trie_remove_prefix(rcu_trie_ptr *root, const ovs_be32 *prefix, int mlen)
{
struct trie_node *node;
rcu_trie_ptr *edges[sizeof(union trie_prefix) * CHAR_BIT];
int depth = 0, ofs = 0;
/* Walk the tree. */
for (edges[0] = root;
(node = ovsrcu_get_protected(struct trie_node *, edges[depth]));
edges[++depth] = trie_next_edge(node, prefix, ofs)) {/* 根据前缀获取下一个节点 */
unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);/* 判断两个节点前缀相等的位置 */
if (eqbits < node->n_bits) {/* 不相等,说明不能命中 */
/* Mismatch, nothing to be removed. This should never happen, as
* only rules in the classifier are ever removed. */
break; /* Log a warning. */
}
/* Full match so far. 全匹配,到目前位置 */
ofs += eqbits;
if (ofs == mlen) {/* 全匹配了 */
/* Full prefix match at the current node, remove rule here. */
if (!node->n_rules) {/* 没有规则,直接退出,一般是不可能的 */
break; /* Log a warning. */
}
node->n_rules--;/* 规则数减掉1 */
/* Check if can prune the tree. 检查是否能阶段这个树,一般来说,最后一个规则被删除的话,是可以截断这棵树的 */
while (!node->n_rules) {
struct trie_node *next,/* 获取当前节点的左右分支 */
*edge0 = ovsrcu_get_protected(struct trie_node *,
&node->edges[0]),
*edge1 = ovsrcu_get_protected(struct trie_node *,
&node->edges[1]);
if (edge0 && edge1) {/* 左右分支都存在不能截断 */
break; /* A branching point, cannot prune. */
}
/* Else have at most one child node, remove this node. 只有一个分支,截断他,这个分支就是最后一个规则的分支 */
next = edge0 ? edge0 : edge1;
if (next) {
if (node->n_bits + next->n_bits > TRIE_PREFIX_BITS) {
break; /* Cannot combine. */
}
next = trie_node_rcu_realloc(next); /* Modify. */
/* Combine node with next. */
next->prefix = node->prefix | next->prefix >> node->n_bits;
next->n_bits += node->n_bits;
}
/* Update the parent's edge. */
ovsrcu_set(edges[depth], next); /* Publish changes. 更新变化*/
trie_node_destroy(node);/* 销毁当前节点 */
if (next || !depth) {
/* Branch not pruned or at root, nothing more to do. */
break;
}
node = ovsrcu_get_protected(struct trie_node *,
edges[--depth]);
}
return;
}
}
/* Cannot go deeper. This should never happen, since only rules
* that actually exist in the classifier are ever removed. */
}
trie_lookup
在前缀树中查找是否命中规则
/* Returns the number of bits in the prefix mask necessary to determine a
* mismatch, in case there are longer prefixes in the tree below the one that
* matched.
* '*plens' will have a bit set for each prefix length that may have matching
* rules. The caller is responsible for clearing the '*plens' prior to
* calling this.
* 前缀树查找函数
* trie:树根
* value:需要匹配的内容,一般是ip地址(ipv4或者ipv6)
* plens:命中结果存储的位置,它跟value是等长的,用于记录存在规则的bit,即多长的前缀
* 存在规则的命中。
* n_bits:本域的匹配长度(bit数),要么是32要么是128。
* 返回值为匹配过的bit数,如果最后一个node miss-match,返回值为match_len + 1
*/
static unsigned int
trie_lookup_value(const rcu_trie_ptr *trie, const ovs_be32 value[],
ovs_be32 plens[]/* 返回存在命中规则的bit,用于记录命中 */, unsigned int n_bits/* 本区域最长的bit数 */)
{
const struct trie_node *prev = NULL;
const struct trie_node *node = ovsrcu_get(struct trie_node *, trie);
unsigned int match_len = 0; /* Number of matching bits. */
/* 从树根开始遍历节点 */
for (; node; prev = node, node = trie_next_node(node, value, match_len)) {
unsigned int eqbits;
/* Check if this edge can be followed. 判断下一个节点的前缀与需要查找的前缀相等的个数 */
eqbits = prefix_equal_bits(node->prefix, node->n_bits, value,
match_len);
match_len += eqbits;/* 得到相等的个数 */
if (eqbits < node->n_bits) { /* Mismatch, nothing more to be found. */
/* Bit at offset 'match_len' differed. 如果相等的bit个数不一致的话,说明这两个节点代表的掩码不一样 */
return match_len + 1; /* Includes the first mismatching bit. 返回的长度还包含了第一个不匹配的bit */
}
/* Full match, check if rules exist at this prefix length.
* 如果两者的前缀掩码完全一样,判断该节点是否存在规则 */
/* 如果存在的话,说明已经命中了规则,设置当前位置命中了规则
*/
if (node->n_rules > 0) {
be_set_bit_at(plens, match_len - 1);
}
if (match_len >= n_bits) {/* 全匹配了 */
return n_bits; /* Full prefix. */
}
}
/* node == NULL. Full match so far, but we tried to follow an
* non-existing branch. Need to exclude the other branch if it exists
* (it does not if we were called on an empty trie or 'prev' is a leaf
* node).
* 能走到这里说明整个前缀树都匹配完毕了,即node == NULL跳出了循环。
* 没有全前缀的命中。
* prev为NULL,说明整棵树为空。
* prev为leaf,说明整棵树匹配完毕。
* 其它情况应该不会走到这里,所以不会存在返回match_len + 1的情况。
* match_len + 1表示包含了第一个不匹配的bit。
*/
return !prev || trie_is_leaf(prev) ? match_len : match_len + 1;
}
/* 使用分类器的前缀树进行查找flow */
/* trie:分类器前缀树
** flow:需要匹配的报文的flow
** plens:返回的匹配的前缀
*/
static unsigned int
trie_lookup(const struct cls_trie *trie, const struct flow *flow,
union trie_prefix *plens)
{
const struct mf_field *mf = trie->field;/* 树的匹配元数据 */
/* Check that current flow matches the prerequisites for the trie
* field. Some match fields are used for multiple purposes, so we
* must check that the trie is relevant for this flow.
* 检查当前流的匹配先决条件,一些匹配域可能用于多个目的。比如我们需要查找的
* ipv4的话,那么在Ethernet头部的ethertype字段必须要等于ipv4
*/
if (mf_are_prereqs_ok(mf, flow, NULL)) {/* 进行查找 */
return trie_lookup_value(&trie->root,/* 树根节点 */
&((ovs_be32 *)flow)[mf->flow_be32ofs],/* 报文在该域的值 */
&plens->be32, mf->n_bits);//该域的长度
}
//不满足条件,说明本报文不需要参与前缀树过滤,所以对任何subtable都是符合条件的。
//不需要跳过,继续查找subtable。
memset(plens, 0xff, sizeof *plens); /* All prefixes, no skipping. */
return 0; /* Value not used in this case. 这种情况下,该报文不需要参与Trie过滤 */
}
过滤器check_tries
该函数通过Trie进行过滤,报文不可能匹配的话,返回True,否则返回False。
/* Return 'true' if can skip rest of the subtable based on the prefix trie
* lookup results.
* 基于前缀树查找结果,前缀树一般用于匹配源目的ip字段。用于快速过滤。
*/
static inline bool
check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
const unsigned int field_plen[CLS_MAX_TRIES],
const struct flowmap range_map, const struct flow *flow,//报文内容
struct flow_wildcards *wc)//报文域bit位
{
int j;
/* Check if we could avoid fully unwildcarding the next level of
* fields using the prefix tries. The trie checks are done only as
* needed to avoid folding in additional bits to the wildcards mask.
* */
for (j = 0; j < n_tries; j++) {/* 遍历每一棵树 */
/* Is the trie field relevant for this subtable, and
is the trie field within the current range of fields? */
if (field_plen[j] &&/* 前缀长度存在并且在当前的stage中,则需要进行匹配 */
flowmap_is_set(&range_map, trie_ctx[j].be32ofs / 2)) {
struct trie_ctx *ctx = &trie_ctx[j];/* 树匹配上下文 */
/* On-demand trie lookup. */
/* 对需要查找的树进行查找,整个报文每个trie只需要查找一次,后续子表直接拿前面的结果来用即可 */
if (!ctx->lookup_done) {/* 树还没查找过,进行查找 */
memset(&ctx->match_plens, 0, sizeof ctx->match_plens);/* 清空匹配了的长度为0 */
/* 返回的是参与了匹配的bit数,命中 */
ctx->maskbits = trie_lookup(ctx->trie, flow, &ctx->match_plens);
ctx->lookup_done = true;/* 设置本树已经查找完了 */
}
/* Possible to skip the rest of the subtable if subtable's
* prefix on the field is not included in the lookup result.
* 查看本subtable的前缀长度是否有规则命中,没有的话,说明该子表不存在
* 命中的可能,直接跳过。没看明白里面的两个if的作用。
*/
if (!be_get_bit_at(&ctx->match_plens.be32, field_plen[j] - 1)) {
/* We want the trie lookup to never result in unwildcarding
* any bits that would not be unwildcarded otherwise.
* Since the trie is shared by the whole classifier, it is
* possible that the 'maskbits' contain bits that are
* irrelevant for the partition relevant for the current
* packet. Hence the checks below.
* Trie是整个分类器共享的。它是可能的maskbits可能包含一些与这个报文不相关的部分。
* 因此需要进行下面的检查。
*/
/* Check that the trie result will not unwildcard more bits
* than this subtable would otherwise.
* 这棵树最长*/
if (ctx->maskbits <= field_plen[j]) {//没有全匹配
/* Unwildcard the bits and skip the rest. */
/* 设置非通配bit,用于跳过其子表其他 */
mask_set_prefix_bits(wc, ctx->be32ofs, ctx->maskbits);
/* Note: Prerequisite already unwildcarded, as the only
* prerequisite of the supported trie lookup fields is
* the ethertype, which is always unwildcarded. */
return true;
}
/* Can skip if the field is already unwildcarded. */
if (mask_prefix_bits_set(wc, ctx->be32ofs, ctx->maskbits)) {
return true;
}
}
}
}
return false;
}
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。