记录一下linux 5.14 中 listen 中 backlog参数 的实现

enjolras1205

概述

面试时候选人提到了半连接,全连接队列分别由内核中的常量,backlog参数决定。此前只知道一个队列,于是看代码一探究竟。

源码剖析

源码版本:
https://github.com/torvalds/linux
d992fe5318d8d7af9510b879439a3c7f283da442
代码里有大量的函数指针,对应不同的协议簇,在看源码的时候必然有很多猜测成分,但最终的结论是可信的,主要是这个实现很合理。

入口

搜索"sys_listen", 可以看到,backlog确实是取了用户传入和sysctl_somaxconn的最小值。

/*
 * Put the socket referenced by @fd into the listening state.
 *
 * @backlog is capped at the sysctl_somaxconn value read from the socket's
 * network namespace. The comparison is made on the unsigned value of
 * @backlog, so a negative @backlog compares as a very large number. The
 * capped value is handed to security_socket_listen() and, only if that
 * returns 0, to the socket's ->listen() operation.
 *
 * Returns whatever sockfd_lookup_light() stored in err when no socket was
 * found; otherwise the result of the security hook or of ->listen().
 */
int __sys_listen(int fd, int backlog)
{
    int err, fput_needed;
    int somaxconn;
    struct socket *sock;

    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (!sock)
        return err;

    /* Clamp the caller's request to the per-namespace upper bound. */
    somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
    if ((unsigned int)backlog > somaxconn)
        backlog = somaxconn;

    err = security_socket_listen(sock, backlog);
    if (err == 0)
        err = sock->ops->listen(sock, backlog);

    /* Pairs with the successful sockfd_lookup_light() above. */
    fput_light(sock->file, fput_needed);
    return err;
}

listen的实现

backlog 写入到 sk_max_ack_backlog

可以看到sock->ops是一个函数指针结构体,搜索"sock->ops = ",可以找到一系列proto_ops。挑base_sock_ops继续看。

/*
 * proto_ops table for the PF_ISDN family.
 *
 * Only release, bind and ioctl point at base_sock_* handlers; every other
 * operation, listen included, is wired to a sock_no_* handler
 * (presumably the generic "operation not supported" stubs).
 */
static const struct proto_ops base_sock_ops = {
    /* Identity */
    .family     = PF_ISDN,
    .owner      = THIS_MODULE,

    /* Operations with a base_sock_* implementation */
    .release    = base_sock_release,
    .bind       = base_sock_bind,
    .ioctl      = base_sock_ioctl,

    /* Everything else is routed to the sock_no_* handlers */
    .connect    = sock_no_connect,
    .socketpair = sock_no_socketpair,
    .accept     = sock_no_accept,
    .getname    = sock_no_getname,
    .listen     = sock_no_listen,
    .shutdown   = sock_no_shutdown,
    .sendmsg    = sock_no_sendmsg,
    .recvmsg    = sock_no_recvmsg,
    .mmap       = sock_no_mmap,
};

发现大部分的proto_ops都没有定义listen的实现。
搜索".listen =",可以找到有 svc_listen, dn_listen, unix_listen, vsock_listen, x25_listen等. 从命名和文件头的描述,猜测inet_listen是实际用的listen,即socket_addr中指定的协议。继续看inet_listen。
可以看到backlog赋值给了sk->sk_max_ack_backlog,搜索发现其它listen函数也是如此。

int inet_listen(struct socket *sock, int backlog)
{
    struct sock *sk = sock->sk;
    unsigned char old_state;
    int err, tcp_fastopen;

    lock_sock(sk);

    err = -EINVAL;
    if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
        goto out;

    old_state = sk->sk_state;
    if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
        goto out;

    WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
    /* Really, if the socket is already in listen state
     * we can only allow the backlog to be adjusted.
     */

sk_max_ack_backlog 的含义

搜索 sk_max_ack_backlog,可以看到sk_max_ack_backlog控制的就是established队列的大小:
include/net/sock.h:930

/* Decrement sk->sk_ack_backlog by one.
 *
 * The new value is stored with WRITE_ONCE(); the old value is read with a
 * plain access.
 * NOTE(review): presumably callers serialize updates to this counter - confirm.
 */
static inline void sk_acceptq_removed(struct sock *sk)
{
    WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
}

/* Increment sk->sk_ack_backlog by one.
 *
 * The new value is stored with WRITE_ONCE(); the old value is read with a
 * plain access.
 * NOTE(review): presumably callers serialize updates to this counter - confirm.
 */
static inline void sk_acceptq_added(struct sock *sk)
{
    WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
}

/* Report whether the accept backlog counter has exceeded its limit.
 *
 * Returns true only when sk_ack_backlog is strictly greater than
 * sk_max_ack_backlog, so a counter equal to the limit is not yet "full".
 * Both fields are sampled with READ_ONCE().
 *
 * Note: If you think the test should be:
 *    return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
 * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.")
 */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
    return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}

此外还有其它的代码说明这一点:如
inet_connection_sock.c:501 在accept后,即调用 reqsk_queue_remove 将 sk_ack_backlog减1。
tcp_check_req (由SYN_RECV -> ESTABLISHED) -> inet_csk_complete_hashdance -> inet_csk_reqsk_queue_add 中将 sk_ack_backlog 加1。
net/ipv4/tcp_minisocks.c:772
在收到握手的ACK时,会调用函数指针 syn_recv_sock,判断accept队列是否已满

    /* OK, ACK is valid, create big socket and
     * feed this segment to it. It will repeat all
     * the tests. THIS SEGMENT MUST MOVE SOCKET TO
     * ESTABLISHED STATE. If it will be dropped after
     * socket is created, wait for troubles.
     */
    child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
                             req, &own_req);
    if (!child)
        goto listen_overflow;

net/ipv4/tcp_input.c:6772
可以看到,如果accept队列超过上限,有新的握手完成包时,会触发listen_overflow
listen_overflow的处理是:记录状态;在未开启 sysctl_tcp_abort_on_overflow 时,仅将请求标记为 acked 并返回 NULL,对客户端不通知。
net/ipv4/tcp_minisocks.c:788

listen_overflow:
    if (sk != req->rsk_listener)
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);

    if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) {
        inet_rsk(req)->acked = 1;
        return NULL;
    }

如果正常,会调用 inet_csk_complete_hashdance,并加到established队列中。

真的有 syn_recv 队列么?

处理第一步SYN握手请求时,仅仅是将request_sock_queue结构体的qlen加1

从SYN握手请求包开始:
net/ipv4/tcp_input.c:6357

    case TCP_LISTEN:
        if (th->ack)
            return 1;

        if (th->rst)
            goto discard;

        if (th->syn) {
            if (th->fin)
                goto discard;
            /* It is possible that we process SYN packets from backlog,
             * so we need to make sure to disable BH and RCU right there.
             */
            rcu_read_lock();
            local_bh_disable();
            acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;

可以看到当established队列满的时候,不会接受新的SYN请求,有意思的是reqsk_queue判断,用的是同一个sk_max_ack_backlog作为上限值:
net/ipv4/tcp_input.c:6772

int tcp_conn_request(struct request_sock_ops *rsk_ops,
             const struct tcp_request_sock_ops *af_ops,
             struct sock *sk, struct sk_buff *skb)
{
    struct tcp_fastopen_cookie foc = { .len = -1 };
    __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
    struct tcp_options_received tmp_opt;
    struct tcp_sock *tp = tcp_sk(sk);
    struct net *net = sock_net(sk);
    struct sock *fastopen_sk = NULL;
    struct request_sock *req;
    bool want_cookie = false;
    struct dst_entry *dst;
    struct flowi fl;

    /* TW buckets are converted to open requests without
     * limitations, they conserve resources and peer is
     * evidently real one.
     */
    if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
         inet_csk_reqsk_queue_is_full(sk)) && !isn) {
        want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
        if (!want_cookie)
            goto drop;
    }

    if (sk_acceptq_is_full(sk)) {
        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
        goto drop;
    }

另外也和sysctl_max_syn_backlog值做了比较:在未开启syncookies时,当队列长度超过sysctl_max_syn_backlog的四分之三(即剩余空间不足四分之一),且对端不是已被验证过的地址时,会直接drop:
net/ipv4/tcp_input.c:6840

    if (!want_cookie && !isn) {
        /* Kill the following clause, if you dislike this way. */
        if (!net->ipv4.sysctl_tcp_syncookies &&
            (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
             (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
            !tcp_peer_is_proven(req, dst)) {
            /* Without syncookies last quarter of
             * backlog is filled with destinations,
             * proven to be alive.
             * It means that we continue to communicate
             * to destinations, already remembered
             * to the moment of synflood.
             */
            pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
                    rsk_ops->family);
            goto drop_and_release;
        }

        isn = af_ops->init_seq(skb);
    }

最终只是将qlen加1:
inet_connection_sock.c:923

/* Register a pending connection request on listener @sk.
 *
 * First passes @req and @timeout to reqsk_queue_hash_req(), then calls
 * inet_csk_reqsk_queue_added(@sk).
 * NOTE(review): presumably the first call inserts @req into the hash table
 * and arms its timer, and the second only bumps the listener's request
 * counter - neither helper is visible here, confirm against their definitions.
 */
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
                   unsigned long timeout)
{
    reqsk_queue_hash_req(req, timeout);
    inet_csk_reqsk_queue_added(sk);
}

处理第三步ACK时,是从hash table中取得的sock

inet_hashtables.h:342

/*
 * Look up a socket by address/port tuple: the established-connection
 * lookup is tried first, and the listener lookup is the fallback.
 *
 * @dport is converted to host byte order once and that value is used for
 * both lookups.
 *
 * *refcounted is set to true when the established lookup produced the
 * result, and to false when the result (possibly NULL) comes from
 * __inet_lookup_listener().
 */
static inline struct sock *__inet_lookup(struct net *net,
                     struct inet_hashinfo *hashinfo,
                     struct sk_buff *skb, int doff,
                     const __be32 saddr, const __be16 sport,
                     const __be32 daddr, const __be16 dport,
                     const int dif, const int sdif,
                     bool *refcounted)
{
    const u16 hnum = ntohs(dport);
    struct sock *established;

    established = __inet_lookup_established(net, hashinfo, saddr, sport,
                                            daddr, hnum, dif, sdif);
    if (established) {
        *refcounted = true;
        return established;
    }

    /* No established match: fall back to the listening sockets. */
    *refcounted = false;
    return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
                                  sport, daddr, hnum, dif, sdif);
}

inet_hashtables.c:327
建立链接前,由net结构体(看起来和网络设备相关),目标地址,主机(目标)端口计算出哈希值。

struct sock *__inet_lookup_listener(struct net *net,
                    struct inet_hashinfo *hashinfo,
                    struct sk_buff *skb, int doff,
                    const __be32 saddr, __be16 sport,
                    const __be32 daddr, const unsigned short hnum,
                    const int dif, const int sdif)
{
    struct inet_listen_hashbucket *ilb2;
    struct sock *result = NULL;
    unsigned int hash2;

    /* Lookup redirect from BPF */
    if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
        result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
                         saddr, sport, daddr, hnum);
        if (result)
            goto done;
    }

    hash2 = ipv4_portaddr_hash(net, daddr, hnum);
    ilb2 = inet_lhash2_bucket(hashinfo, hash2);

inet_hashtables.c:390
建立链接后哈希值,由net结构体以及源,目标地址,端口计算出来。

struct sock *__inet_lookup_established(struct net *net,
                  struct inet_hashinfo *hashinfo,
                  const __be32 saddr, const __be16 sport,
                  const __be32 daddr, const u16 hnum,
                  const int dif, const int sdif)
{
    INET_ADDR_COOKIE(acookie, saddr, daddr);
    const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
    struct sock *sk;
    const struct hlist_nulls_node *node;
    /* Optimize here for direct hit, only listening connections can
     * have wildcards anyways.
     */
    unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
    unsigned int slot = hash & hashinfo->ehash_mask;
    struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

结论

  1. backlog决定icsk_accept_queue队列大小。该队列中为ESTABLISHED状态的链接。
  2. 没有物理意义上的syn_recv queue,但确实有抽象意义上的syn_recv queue。backlog也决定虚拟的 reqsk(syn_recv) queue 队列大小。实际的实现为一个全局的哈希表。在每个listen的sock上单独统计大小。这个队列大小也由tcp_max_syn_backlog控制。
  3. fastopen可以跳过上面的限制。
  4. 服务器应该尽量减少链接数量,全局的sock哈希表冲突变多时必然影响性能。
  5. reqsk queue中的sock,有超时和重发机制。见reqsk_timer_handler,里面还有一个新生代老年代优化。因为和backlog参数无直接关联,这里不做展开。
阅读 192

41 声望
3 粉丝
0 条评论
你知道吗?

41 声望
3 粉丝
宣传栏