Linux kernel networking: RPS

RPS (Receive Packet Steering) is a scaling mechanism implemented in software in the kernel: it computes a hash over the packet's flow data and then enqueues the skb onto the backlog queue of the CPU selected by that hash. The code is as follows:

static int netif_rx_internal(struct sk_buff *skb)
{
    int ret;

    net_timestamp_check(netdev_tstamp_prequeue, skb);

    trace_netif_rx(skb);
#ifdef CONFIG_RPS
    if (static_key_false(&rps_needed)) {
        struct rps_dev_flow voidflow, *rflow = &voidflow;
        int cpu;

        preempt_disable();
        rcu_read_lock();

        cpu = get_rps_cpu(skb->dev, skb, &rflow); /* pick the target CPU */
        if (cpu < 0)
            cpu = smp_processor_id();

        ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); /* enqueue onto that CPU's backlog queue */

        rcu_read_unlock();
        preempt_enable();
    } else
#endif
    {
        unsigned int qtail;
        ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
        put_cpu();
    }
    return ret;
}
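
RPS only takes effect when the rps_needed static key checked above is enabled, which happens once a non-zero CPU mask is written to a queue's rps_cpus file in sysfs. A minimal sketch of doing that from C (the device name, queue and mask are illustrative):

#include <stdio.h>

int main(void)
{
    /* Allow CPUs 0-3 to be chosen as RPS targets for eth0's first RX queue.
     * "eth0", "rx-0" and the mask "f" are example values only. */
    const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus";
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return 1;
    }
    fprintf(f, "f\n");
    fclose(f);
    return 0;
}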

The key piece is get_rps_cpu(), whose implementation does the following:

  1. If the skb has a receive queue recorded, use that index to locate the rxqueue in dev->_rx; otherwise use the first queue.
  2. Dereference rxqueue->rps_map. If neither a map nor a flow table is configured, return -1; as the netif_rx_internal() analysis above shows, the skb is then enqueued directly onto the backlog queue of the CPU currently processing it. If the map contains exactly one CPU and there is no flow table, return that CPU directly.
  3. Otherwise enter the core path, skb_get_hash(), which computes the skb's hash value.
  4. If both the per-queue rps_dev_flow_table and the global rps_sock_flow_table are configured (the RFS path; see the configuration sketch after the function), prefer the CPU on which the flow's last recvmsg() ran, switching CPUs only when doing so cannot reorder packets of the flow.
  5. Otherwise fall back to plain RPS and pick map->cpus[reciprocal_scale(hash, map->len)].

/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
               struct rps_dev_flow **rflowp)
{
    struct netdev_rx_queue *rxqueue;
    struct rps_map *map;
    struct rps_dev_flow_table *flow_table;
    struct rps_sock_flow_table *sock_flow_table;
    int cpu = -1;
    u16 tcpu;
    u32 hash;

    if (skb_rx_queue_recorded(skb)) {
        u16 index = skb_get_rx_queue(skb);
        if (unlikely(index >= dev->real_num_rx_queues)) {
            WARN_ONCE(dev->real_num_rx_queues > 1,
                  "%s received packet on queue %u, but number "
                  "of RX queues is %u\n",
                  dev->name, index, dev->real_num_rx_queues);
            goto done;
        }
        rxqueue = dev->_rx + index;
    } else
        rxqueue = dev->_rx;

    map = rcu_dereference(rxqueue->rps_map);
    if (map) {
        if (map->len == 1 &&
            !rcu_access_pointer(rxqueue->rps_flow_table)) {
            tcpu = map->cpus[0];
            if (cpu_online(tcpu))
                cpu = tcpu;
            goto done;
        }
    } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
        goto done;
    }

    skb_reset_network_header(skb);
    hash = skb_get_hash(skb);
    if (!hash)
        goto done;

    flow_table = rcu_dereference(rxqueue->rps_flow_table);
    sock_flow_table = rcu_dereference(rps_sock_flow_table);
    if (flow_table && sock_flow_table) {
        u16 next_cpu;
        struct rps_dev_flow *rflow;

        rflow = &flow_table->flows[hash & flow_table->mask];
        tcpu = rflow->cpu;

        next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];

        /*
         * If the desired CPU (where last recvmsg was done) is
         * different from current CPU (one in the rx-queue flow
         * table entry), switch if one of the following holds:
         *   - Current CPU is unset (equal to RPS_NO_CPU).
         *   - Current CPU is offline.
         *   - The current CPU's queue tail has advanced beyond the
         *     last packet that was enqueued using this table entry.
         *     This guarantees that all previous packets for the flow
         *     have been dequeued, thus preserving in order delivery.
         */
        if (unlikely(tcpu != next_cpu) &&
            (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
             ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
              rflow->last_qtail)) >= 0)) {
            tcpu = next_cpu;
            rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
        }

        if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
            *rflowp = rflow;
            cpu = tcpu;
            goto done;
        }
    }

    if (map) {
        tcpu = map->cpus[reciprocal_scale(hash, map->len)];
        if (cpu_online(tcpu)) {
            cpu = tcpu;
            goto done;
        }
    }

done:
    return cpu;
}
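
The flow_table / sock_flow_table branch above is the RFS (Receive Flow Steering) path. It can only run once both tables have been sized by the administrator: the global socket flow table via the rps_sock_flow_entries sysctl and the per-queue device flow table via rps_flow_cnt in sysfs. A minimal sketch of that configuration from C (the device name, queue and table sizes are illustrative):

#include <stdio.h>

/* Write a string to a sysctl/sysfs control file. */
static int write_str(const char *path, const char *val)
{
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return -1;
    }
    fprintf(f, "%s\n", val);
    fclose(f);
    return 0;
}

int main(void)
{
    /* global rps_sock_flow_table, indexed by hash in get_rps_cpu() */
    write_str("/proc/sys/net/core/rps_sock_flow_entries", "32768");
    /* per-queue rps_dev_flow_table for eth0, queue rx-0 */
    write_str("/sys/class/net/eth0/queues/rx-0/rps_flow_cnt", "4096");
    return 0;
}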

Analysis of the skb_get_hash() path: skb_get_hash() first checks whether l4_hash or sw_hash has already been set on the skb; if so, it simply returns the existing hash (some drivers, e.g. the hyperv and vmware virtual NICs as well as certain hardware NIC drivers, set the hash at the driver level, in which case there is no need to compute it again). Otherwise it calls __skb_get_hash() to compute and store the hash. The core of that path is __skb_flow_dissect(), analyzed below.


static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
                          struct flow_keys *flow,
                          unsigned int flags)
{
    memset(flow, 0, sizeof(*flow));
    return __skb_flow_dissect(skb, &flow_keys_dissector, flow,
                  NULL, 0, 0, 0, flags);
}

static inline u32 ___skb_get_hash(const struct sk_buff *skb,
                  struct flow_keys *keys, u32 keyval)
{
    skb_flow_dissect_flow_keys(skb, keys,
                   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

    return __flow_hash_from_keys(keys, keyval);
}

void __skb_get_hash(struct sk_buff *skb)
{
    struct flow_keys keys;
    u32 hash;

    __flow_hash_secret_init();

    hash = ___skb_get_hash(skb, &keys, hashrnd);

    __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}

static inline __u32 skb_get_hash(struct sk_buff *skb)
{
    if (!skb->l4_hash && !skb->sw_hash)
        __skb_get_hash(skb);

    return skb->hash;
}
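
As mentioned above, a driver that already knows a per-packet hash (for example from the NIC's RSS engine) can record it on the skb so that skb_get_hash() returns immediately instead of dissecting the packet. A minimal driver-side sketch (the helper name and the hw_hash value are hypothetical; skb_set_hash() is the real kernel helper):

#include <linux/skbuff.h>

/* Hypothetical RX-path helper: hw_hash stands in for whatever hash the
 * hardware reports per packet. skb_set_hash() stores it and marks the skb
 * as l4-hashed, so the skb_get_hash() check above short-circuits. */
static void example_rx_record_hw_hash(struct sk_buff *skb, u32 hw_hash)
{
    skb_set_hash(skb, hw_hash, PKT_HASH_TYPE_L4);
}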

Here __skb_flow_dissect() is called with flow_keys_dissector as the dissector. That dissector is defined by the flow_keys_dissector_keys[] array below together with skb_flow_dissector_init(); it enables KEY_CONTROL, KEY_BASIC, KEY_IPV4_ADDRS, KEY_IPV6_ADDRS, KEY_TIPC_ADDRS, KEY_PORTS, KEY_VLAN, KEY_FLOW_LABEL and KEY_GRE_KEYID. With that in mind, __skb_flow_dissect() proceeds as follows:

  1. Determine proto from the skb (skb->vlan_proto if a vlan tag is present, otherwise skb->protocol) and fetch the key_control and key_basic pointers inside the flow_keys target.
  2. For an IP packet, since KEY_IPV4_ADDRS is enabled in the dissector, key_control->addr_type is set to FLOW_DISSECTOR_KEY_IPV4_ADDRS and key_addrs->v4addrs is filled from iph->saddr/iph->daddr, i.e. the source and destination addresses are used.
  3. Check whether the packet is an IP fragment; for a fragment the function bails out at this point (only the first fragment could be parsed further, and only if FLOW_DISSECTOR_F_PARSE_1ST_FRAG were set, which it is not here).
  4. Otherwise check whether dissection should stop at L3; since the only flag passed in from ___skb_get_hash() is FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (not STOP_AT_L3), it goes on to extract the source and destination ports from the skb into key_ports.

static const struct flow_dissector_key flow_keys_dissector_keys[] = {
    {
        .key_id = FLOW_DISSECTOR_KEY_CONTROL,
        .offset = offsetof(struct flow_keys, control),
    },
    {
        .key_id = FLOW_DISSECTOR_KEY_BASIC,
        .offset = offsetof(struct flow_keys, basic),
    },
    {
        .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
        .offset = offsetof(struct flow_keys, addrs.v4addrs),
    },
    {
        .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
        .offset = offsetof(struct flow_keys, addrs.v6addrs),
    },
    {
        .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS,
        .offset = offsetof(struct flow_keys, addrs.tipcaddrs),
    },
    {
        .key_id = FLOW_DISSECTOR_KEY_PORTS,
        .offset = offsetof(struct flow_keys, ports),
    },
    {
        .key_id = FLOW_DISSECTOR_KEY_VLAN,
        .offset = offsetof(struct flow_keys, vlan),
    },
    {
        .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
        .offset = offsetof(struct flow_keys, tags),
    },
    {
        .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
        .offset = offsetof(struct flow_keys, keyid),
    },
};
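
This key array is turned into the flow_keys_dissector used above at boot time; roughly (a sketch of what the init code in net/core/flow_dissector.c does), it is passed to skb_flow_dissector_init(), which records each enabled key's offset inside struct flow_keys and sets the matching bit in used_keys, the bit later tested by dissector_uses_key():

static int __init init_default_flow_dissectors(void)
{
    /* Build flow_keys_dissector from the key/offset table above. */
    skb_flow_dissector_init(&flow_keys_dissector,
                            flow_keys_dissector_keys,
                            ARRAY_SIZE(flow_keys_dissector_keys));
    return 0;
}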


bool __skb_flow_dissect(const struct sk_buff *skb,
            struct flow_dissector *flow_dissector,
            void *target_container,
            void *data, __be16 proto, int nhoff, int hlen,
            unsigned int flags)
{
    struct flow_dissector_key_control *key_control;
    struct flow_dissector_key_basic *key_basic;
    struct flow_dissector_key_addrs *key_addrs;
    struct flow_dissector_key_ports *key_ports;
    struct flow_dissector_key_icmp *key_icmp;
    struct flow_dissector_key_tags *key_tags;
    struct flow_dissector_key_vlan *key_vlan;
    bool skip_vlan = false;
    u8 ip_proto = 0;
    bool ret;

    if (!data) {
        data = skb->data;
        proto = skb_vlan_tag_present(skb) ?
             skb->vlan_proto : skb->protocol;
        nhoff = skb_network_offset(skb);
        hlen = skb_headlen(skb);
    }

    /* It is ensured by skb_flow_dissector_init() that control key will
     * be always present.
     */
    key_control = skb_flow_dissector_target(flow_dissector,
                        FLOW_DISSECTOR_KEY_CONTROL,
                        target_container);

    /* It is ensured by skb_flow_dissector_init() that basic key will
     * be always present.
     */
    key_basic = skb_flow_dissector_target(flow_dissector,
                          FLOW_DISSECTOR_KEY_BASIC,
                          target_container);

    if (dissector_uses_key(flow_dissector,
                   FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
        struct ethhdr *eth = eth_hdr(skb);
        struct flow_dissector_key_eth_addrs *key_eth_addrs;

        key_eth_addrs = skb_flow_dissector_target(flow_dissector,
                              FLOW_DISSECTOR_KEY_ETH_ADDRS,
                              target_container);
        memcpy(key_eth_addrs, &eth->h_dest, sizeof(*key_eth_addrs));
    }

proto_again:
    switch (proto) {
    case htons(ETH_P_IP): {
        const struct iphdr *iph;
        struct iphdr _iph;
ip:
        iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
        if (!iph || iph->ihl < 5)
            goto out_bad;
        nhoff += iph->ihl * 4;

        ip_proto = iph->protocol;

        if (dissector_uses_key(flow_dissector,
                       FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
            key_addrs = skb_flow_dissector_target(flow_dissector,
                                  FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                  target_container);

            memcpy(&key_addrs->v4addrs, &iph->saddr,
                   sizeof(key_addrs->v4addrs));
            key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
        }

        if (ip_is_fragment(iph)) {
            key_control->flags |= FLOW_DIS_IS_FRAGMENT;

            if (iph->frag_off & htons(IP_OFFSET)) {
                goto out_good;
            } else {
                key_control->flags |= FLOW_DIS_FIRST_FRAG;
                if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
                    goto out_good;
            }
        }

        __skb_flow_dissect_ipv4(skb, flow_dissector,
                    target_container, data, iph);

        if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
            goto out_good;

        break;
    }
    ...

    if (dissector_uses_key(flow_dissector,
                   FLOW_DISSECTOR_KEY_PORTS)) {
        key_ports = skb_flow_dissector_target(flow_dissector,
                              FLOW_DISSECTOR_KEY_PORTS,
                              target_container);
        key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
                            data, hlen);
    }

    if (dissector_uses_key(flow_dissector,
                   FLOW_DISSECTOR_KEY_ICMP)) {
        key_icmp = skb_flow_dissector_target(flow_dissector,
                             FLOW_DISSECTOR_KEY_ICMP,
                             target_container);
        key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
    }

out_good:
    ret = true;

    key_control->thoff = (u16)nhoff;
out:
    key_basic->n_proto = proto;
    key_basic->ip_proto = ip_proto;

    return ret;

out_bad:
    ret = false;
    key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
    goto out;
}

In summary: for fragmented packets the hash is computed from the IP source and destination addresses only, while for non-fragmented packets the source and destination ports are additionally extracted and folded into the hash.
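
To make that concrete, here is a toy userspace sketch: the mixing function below is made up and only stands in for the kernel's jhash-based __flow_hash_from_keys(). The point it illustrates is that for a fragment the ports field is never filled in, so every fragment of a flow hashes (and therefore steers) identically from the addresses alone:

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for __flow_hash_from_keys(); illustrative only. */
static uint32_t toy_flow_hash(uint32_t saddr, uint32_t daddr, uint32_t ports,
                              uint32_t seed)
{
    uint32_t h = seed;

    h ^= saddr; h *= 0x9e3779b1u;
    h ^= daddr; h *= 0x85ebca77u;
    h ^= ports; h *= 0xc2b2ae3du;
    return h ? h : 1;   /* mirror the kernel's habit of never returning 0 */
}

int main(void)
{
    uint32_t saddr = 0xc0a80001u;           /* 192.168.0.1 */
    uint32_t daddr = 0xc0a80002u;           /* 192.168.0.2 */
    uint32_t ports = (51000u << 16) | 443u; /* sport and dport packed into one word */
    uint32_t seed  = 0x12345678u;

    /* Non-fragmented packet: addresses and ports all feed the hash. */
    printf("full hash:     %08x\n", toy_flow_hash(saddr, daddr, ports, seed));

    /* Fragment: dissection stopped before the ports, so they stay 0 and
     * all fragments of the flow map to the same value (and thus CPU). */
    printf("fragment hash: %08x\n", toy_flow_hash(saddr, daddr, 0, seed));
    return 0;
}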
