Overview
During an interview, a candidate mentioned that the semi-connection (SYN) queue and the full-connection (accept) queue are bounded by a kernel constant and by the backlog parameter, respectively. I had only ever known of a single queue, so I went digging through the source to find out.
Source code analysis
Source version:
https://github.com/torvalds/l...
d992fe5318d8d7af9510b879439a3c7f283da442
The code is full of function pointers dispatching to different protocol families, so reading it inevitably involves some guesswork. Still, the final conclusions are trustworthy, mainly because the implementation simply makes sense.
Entry point
搜索"sys_listen", 可以看到,backlog确实是取了用户传入和sysctl_somaxconn的最小值。
int __sys_listen(int fd, int backlog)
{
    struct socket *sock;
    int err, fput_needed;
    int somaxconn;

    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (sock) {
        somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
        if ((unsigned int)backlog > somaxconn)
            backlog = somaxconn;

        err = security_socket_listen(sock, backlog);
        if (!err)
            err = sock->ops->listen(sock, backlog);

        fput_light(sock->file, fput_needed);
    }
    return err;
}
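As a quick user-space illustration (my own sketch, not from the kernel tree; the port number is arbitrary), the value passed to listen(2) below is exactly the backlog that __sys_listen clamps against net.core.somaxconn:

/* Sketch: if net.core.somaxconn is 128, asking for 4096 silently
 * yields an effective backlog of 128; listen() still returns 0. */
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = htons(8080);            /* arbitrary test port */
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        return 1;
    }
    if (listen(fd, 4096) < 0) {             /* kernel caps this at somaxconn */
        perror("listen");
        return 1;
    }
    pause();                                /* keep the listener alive */
    return 0;
}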
The listen implementation
backlog is written to sk_max_ack_backlog
sock->ops is a struct of function pointers. Searching for "sock->ops = " turns up a whole series of proto_ops; let's pick base_sock_ops and keep digging.
static const struct proto_ops base_sock_ops = {
    .family     = PF_ISDN,
    .owner      = THIS_MODULE,
    .release    = base_sock_release,
    .ioctl      = base_sock_ioctl,
    .bind       = base_sock_bind,
    .getname    = sock_no_getname,
    .sendmsg    = sock_no_sendmsg,
    .recvmsg    = sock_no_recvmsg,
    .listen     = sock_no_listen,
    .shutdown   = sock_no_shutdown,
    .connect    = sock_no_connect,
    .socketpair = sock_no_socketpair,
    .accept     = sock_no_accept,
    .mmap       = sock_no_mmap
};
It turns out most proto_ops define no real listen implementation (base_sock_ops above points it at sock_no_listen).
Searching for ".listen =" finds svc_listen, dn_listen, unix_listen, vsock_listen, x25_listen, and so on. Judging by the names and the file-header comments, inet_listen is the one actually used, i.e. for the protocol family specified at socket creation. On to inet_listen.
Here backlog is assigned to sk->sk_max_ack_backlog, and a search shows the other listen implementations do the same.
int inet_listen(struct socket *sock, int backlog)
{
    struct sock *sk = sock->sk;
    unsigned char old_state;
    int err, tcp_fastopen;

    lock_sock(sk);

    err = -EINVAL;
    if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
        goto out;

    old_state = sk->sk_state;
    if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
        goto out;

    WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
    /* Really, if the socket is already in listen state
     * we can only allow the backlog to be adjusted.
     */
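That comment implies that calling listen(2) again on a socket already in LISTEN state only rewrites the backlog. A minimal sketch of my own (port arbitrary):

#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = htons(7070);  /* arbitrary test port */
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));

    listen(fd, 8);   /* TCPF_CLOSE -> LISTEN, sk_max_ack_backlog = 8 */
    listen(fd, 64);  /* already LISTEN: only sk_max_ack_backlog is adjusted */

    close(fd);
    return 0;
}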
The meaning of sk_max_ack_backlog
Searching for sk_max_ack_backlog shows that it is precisely what caps the established (accept) queue:
include/net/sock.h:930
static inline void sk_acceptq_removed(struct sock *sk)
{
    WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
}

static inline void sk_acceptq_added(struct sock *sk)
{
    WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
}

/* Note: If you think the test should be:
 *	return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
 * Then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.")
 */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
    return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}
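Note the comparison is > rather than >= (the comment explains why), so the queue is only considered full once its length exceeds backlog; in other words, it can hold backlog + 1 entries. A toy check of the arithmetic (illustration only, not kernel code):

#include <assert.h>
#include <stdbool.h>

static bool acceptq_is_full(unsigned int len, unsigned int max)
{
    return len > max;                /* same shape as sk_acceptq_is_full() */
}

int main(void)
{
    assert(!acceptq_is_full(2, 2));  /* len == backlog: not yet full */
    assert(acceptq_is_full(3, 2));   /* backlog + 1 already queued: full */
    return 0;
}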
Other code confirms this as well, e.g.:
inet_connection_sock.c:501: right after accept, reqsk_queue_remove is called to decrement sk_ack_backlog.
tcp_check_req (SYN_RECV -> ESTABLISHED) -> inet_csk_complete_hashdance -> inet_csk_reqsk_queue_add increments sk_ack_backlog.
net/ipv4/tcp_minisocks.c:772
When the handshake-completing ACK arrives, a function pointer (syn_recv_sock) is invoked to create the full socket, and along the way the accept queue is checked for overflow:
/* OK, ACK is valid, create big socket and
 * feed this segment to it. It will repeat all
 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
 * ESTABLISHED STATE. If it will be dropped after
 * socket is created, wait for troubles.
 */
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
                                                 req, &own_req);
if (!child)
    goto listen_overflow;
As the code shows, when a handshake-completing packet arrives and the accept queue is over its limit, listen_overflow is triggered.
The listen_overflow handling records the state and notifies the client of nothing:
net/ipv4/tcp_minisocks.c:788
listen_overflow:
    if (sk != req->rsk_listener)
        __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);

    if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) {
        inet_rsk(req)->acked = 1;
        return NULL;
    }
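This silent drop is easy to provoke from user space: create a listener with a tiny backlog and never call accept(). With tcp_abort_on_overflow = 0 (the default), overflowing connections get no RST; depending on timing, either their SYN is dropped or their final ACK is ignored and the server keeps retransmitting SYN-ACKs. A hedged sketch of my own (port and backlog values arbitrary):

/* Sketch: deliberately overflow the accept queue. Run it, then open
 * several client connections (e.g. nc 127.0.0.1 9090); connections
 * beyond backlog + 1 overflow. */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = htons(9090);  /* arbitrary test port */
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));

    listen(fd, 1);                /* accept queue full at 1 + 1 entries */
    pause();                      /* never accept(): the queue fills up */
    return 0;
}

While it runs, ss -lnt shows the queue in the listening socket's Recv-Q column (with the cap in Send-Q), and netstat -s counts the drops as "times the listen queue of a socket overflowed".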
On the normal (non-overflow) path, inet_csk_complete_hashdance is called and the new socket is added to the established (accept) queue.
Is there really a syn_recv queue?
Handling the first-step SYN of the handshake merely increments qlen in the request_sock_queue struct.
Starting from the incoming SYN packet:
net/ipv4/tcp_input.c:6357
case TCP_LISTEN:
    if (th->ack)
        return 1;

    if (th->rst)
        goto discard;

    if (th->syn) {
        if (th->fin)
            goto discard;
        /* It is possible that we process SYN packets from backlog,
         * so we need to make sure to disable BH and RCU right there.
         */
        rcu_read_lock();
        local_bh_disable();
        acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
As shown below, when the established (accept) queue is full, new SYN requests are not accepted either. Interestingly, the reqsk_queue fullness check uses the very same sk_max_ack_backlog as its upper bound:
net/ipv4/tcp_input.c:6772
int tcp_conn_request(struct request_sock_ops *rsk_ops,
                     const struct tcp_request_sock_ops *af_ops,
                     struct sock *sk, struct sk_buff *skb)
{
    struct tcp_fastopen_cookie foc = { .len = -1 };
    __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
    struct tcp_options_received tmp_opt;
    struct tcp_sock *tp = tcp_sk(sk);
    struct net *net = sock_net(sk);
    struct sock *fastopen_sk = NULL;
    struct request_sock *req;
    bool want_cookie = false;
    struct dst_entry *dst;
    struct flowi fl;

    /* TW buckets are converted to open requests without
     * limitations, they conserve resources and peer is
     * evidently real one.
     */
    if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
         inet_csk_reqsk_queue_is_full(sk)) && !isn) {
        want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
        if (!want_cookie)
            goto drop;
    }

    if (sk_acceptq_is_full(sk)) {
        NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
        goto drop;
    }
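For reference, inet_csk_reqsk_queue_is_full() lives in include/net/inet_connection_sock.h, and in this tree it does compare the request-queue length against that same field (quoted from memory, so verify against the exact commit):

static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
{
    return reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
}

static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{
    return inet_csk_reqsk_queue_len(sk) >= READ_ONCE(sk->sk_max_ack_backlog);
}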
The queue length is also compared against sysctl_max_syn_backlog: with syncookies off, once fewer than a quarter of the sysctl_max_syn_backlog slots remain free (i.e. the queue is more than three-quarters full), SYNs from peers not proven alive are dropped outright:
net/ipv4/tcp_input.c:6840
if (!want_cookie && !isn) {
    /* Kill the following clause, if you dislike this way. */
    if (!net->ipv4.sysctl_tcp_syncookies &&
        (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
         (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
        !tcp_peer_is_proven(req, dst)) {
        /* Without syncookies last quarter of
         * backlog is filled with destinations,
         * proven to be alive.
         * It means that we continue to communicate
         * to destinations, already remembered
         * to the moment of synflood.
         */
        pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
                    rsk_ops->family);
        goto drop_and_release;
    }

    isn = af_ops->init_seq(skb);
}
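A quick worked example of that condition, assuming net.ipv4.tcp_max_syn_backlog = 128 (the value is illustrative only):

#include <stdio.h>

int main(void)
{
    int max_syn_backlog = 128;  /* assumed; the real value comes from sysctl */

    for (int qlen = 94; qlen <= 99; qlen++) {
        int drop = (max_syn_backlog - qlen) < (max_syn_backlog >> 2);
        printf("qlen=%d -> %s\n", qlen, drop ? "drop unproven peers" : "accept");
    }
    /* Prints "drop" from qlen = 97 on: 128 - 97 = 31 < 32. The last
     * quarter (32 slots here) is reserved for proven destinations. */
    return 0;
}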
Ultimately, all that happens is that qlen is incremented:
inet_connection_sock.c:923
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
                                   unsigned long timeout)
{
    reqsk_queue_hash_req(req, timeout);
    inet_csk_reqsk_queue_added(sk);
}
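reqsk_queue_hash_req() stores the request sock into the hash table and arms its timer, while inet_csk_reqsk_queue_added() is just the counter bump. The helpers (in include/net/inet_connection_sock.h and include/net/request_sock.h, quoted from memory) boil down to an atomic increment:

static inline void inet_csk_reqsk_queue_added(struct sock *sk)
{
    reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
}

static inline void reqsk_queue_added(struct request_sock_queue *queue)
{
    atomic_inc(&queue->qlen);
}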
When the third-step ACK of the handshake is handled, the sock is fetched from a hash table
inet_hashtables.h:342
static inline struct sock *__inet_lookup(struct net *net,
                                         struct inet_hashinfo *hashinfo,
                                         struct sk_buff *skb, int doff,
                                         const __be32 saddr, const __be16 sport,
                                         const __be32 daddr, const __be16 dport,
                                         const int dif, const int sdif,
                                         bool *refcounted)
{
    u16 hnum = ntohs(dport);
    struct sock *sk;

    sk = __inet_lookup_established(net, hashinfo, saddr, sport,
                                   daddr, hnum, dif, sdif);
    *refcounted = true;
    if (sk)
        return sk;
    *refcounted = false;
    return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
                                  sport, daddr, hnum, dif, sdif);
}
inet_hashtables.c:327
Before the connection is established, the hash is computed from the net struct (the network namespace), the local (destination) address, and the local (destination) port.
struct sock *__inet_lookup_listener(struct net *net,
                                    struct inet_hashinfo *hashinfo,
                                    struct sk_buff *skb, int doff,
                                    const __be32 saddr, __be16 sport,
                                    const __be32 daddr, const unsigned short hnum,
                                    const int dif, const int sdif)
{
    struct inet_listen_hashbucket *ilb2;
    struct sock *result = NULL;
    unsigned int hash2;

    /* Lookup redirect from BPF */
    if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
        result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
                                     saddr, sport, daddr, hnum);
        if (result)
            goto done;
    }

    hash2 = ipv4_portaddr_hash(net, daddr, hnum);
    ilb2 = inet_lhash2_bucket(hashinfo, hash2);
inet_hashtables.c:390
After the connection is established, the hash is computed from the net struct together with the source and destination addresses and ports.
struct sock *__inet_lookup_established(struct net *net,
                                       struct inet_hashinfo *hashinfo,
                                       const __be32 saddr, const __be16 sport,
                                       const __be32 daddr, const u16 hnum,
                                       const int dif, const int sdif)
{
    INET_ADDR_COOKIE(acookie, saddr, daddr);
    const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
    struct sock *sk;
    const struct hlist_nulls_node *node;
    /* Optimize here for direct hit, only listening connections can
     * have wildcards anyways.
     */
    unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
    unsigned int slot = hash & hashinfo->ehash_mask;
    struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
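The upshot of the two keys: before the handshake completes, an incoming packet can only be matched to the listener by local address and port, while after establishment the full four-tuple separates the many connections sharing that listener port. A toy stand-in for the two keys (the XORs below are placeholders for illustration, not the kernel's real ipv4_portaddr_hash()/inet_ehashfn()):

#include <stdint.h>
#include <stdio.h>

static uint32_t listener_key(uint32_t daddr, uint16_t dport)
{
    return daddr ^ dport;                 /* (local addr, local port) */
}

static uint32_t established_key(uint32_t saddr, uint16_t sport,
                                uint32_t daddr, uint16_t dport)
{
    return saddr ^ sport ^ daddr ^ dport; /* full four-tuple */
}

int main(void)
{
    /* Two clients of the same listener share one listener key... */
    printf("listener: %u\n", (unsigned)listener_key(0x0a000001, 80));
    /* ...but get two distinct established keys. */
    printf("established: %u %u\n",
           (unsigned)established_key(0x0a000002, 40001, 0x0a000001, 80),
           (unsigned)established_key(0x0a000003, 40002, 0x0a000001, 80));
    return 0;
}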
Conclusion
- backlog caps the size of the icsk_accept_queue, which holds connections in the ESTABLISHED state.
- There is no syn_recv queue in the physical sense, but there is one in the abstract: backlog also caps this virtual reqsk (syn_recv) queue. The actual storage is a global hash table, with the length tracked separately on each listening sock. This queue is additionally bounded by tcp_max_syn_backlog.
- TCP Fast Open can bypass the limits above.
- A server should keep its connection count as low as possible: once collisions in the global sock hash table pile up, performance inevitably suffers.
- Socks in the reqsk queue have timeout and retransmission handling; see reqsk_timer_handler, which also contains a young/old generation optimization. Since it has no direct bearing on the backlog parameter, I won't expand on it here.