summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2006-11-16 02:30:37 -0800
committerDavid S. Miller <davem@sunset.davemloft.net>2006-12-02 21:21:44 -0800
commit72a3effaf633bcae9034b7e176bdbd78d64a71db (patch)
treeb7a331527f1b15335a358f97809134f35587e57a /net
parent3c62f75aac7348ee262b1295cfcfeb3473f76815 (diff)
downloadlwn-72a3effaf633bcae9034b7e176bdbd78d64a71db.tar.gz
lwn-72a3effaf633bcae9034b7e176bdbd78d64a71db.zip
[NET]: Size listen hash tables using backlog hint
We currently allocate a fixed size (TCP_SYNQ_HSIZE=512) slots hash table for each LISTEN socket, regardless of various parameters (listen backlog for example) On x86_64, this means order-1 allocations (might fail), even for 'small' sockets, expecting few connections. On the contrary, a huge server wanting a backlog of 50000 is slowed down a bit because of this fixed limit. This patch makes the sizing of listen hash table a dynamic parameter, depending of : - net.core.somaxconn tunable (default is 128) - net.ipv4.tcp_max_syn_backlog tunable (default : 256, 1024 or 128) - backlog value given by user application (2nd parameter of listen()) For large allocations (bigger than PAGE_SIZE), we use vmalloc() instead of kmalloc(). We still limit memory allocation with the two existing tunables (somaxconn & tcp_max_syn_backlog). So for standard setups, this patch actually reduce RAM usage. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/core/request_sock.c35
-rw-r--r--net/dccp/ipv4.c2
-rw-r--r--net/dccp/proto.c6
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/inet_connection_sock.c2
-rw-r--r--net/ipv4/tcp_ipv4.c6
-rw-r--r--net/ipv6/tcp_ipv6.c2
7 files changed, 35 insertions, 20 deletions
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 79ebd75fbe4d..5f0818d815e6 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -15,6 +15,7 @@
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/vmalloc.h>
#include <net/request_sock.h>
@@ -29,22 +30,31 @@
* it is absolutely not enough even at 100conn/sec. 256 cures most
* of problems. This value is adjusted to 128 for very small machines
* (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
- * Further increasing requires to change hash table size.
+ * Note : Dont forget somaxconn that may limit backlog too.
*/
int sysctl_max_syn_backlog = 256;
int reqsk_queue_alloc(struct request_sock_queue *queue,
- const int nr_table_entries)
+ unsigned int nr_table_entries)
{
- const int lopt_size = sizeof(struct listen_sock) +
- nr_table_entries * sizeof(struct request_sock *);
- struct listen_sock *lopt = kzalloc(lopt_size, GFP_KERNEL);
-
+ size_t lopt_size = sizeof(struct listen_sock);
+ struct listen_sock *lopt;
+
+ nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
+ nr_table_entries = max_t(u32, nr_table_entries, 8);
+ nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
+ lopt_size += nr_table_entries * sizeof(struct request_sock *);
+ if (lopt_size > PAGE_SIZE)
+ lopt = __vmalloc(lopt_size,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+ else
+ lopt = kzalloc(lopt_size, GFP_KERNEL);
if (lopt == NULL)
return -ENOMEM;
- for (lopt->max_qlen_log = 6;
- (1 << lopt->max_qlen_log) < sysctl_max_syn_backlog;
+ for (lopt->max_qlen_log = 3;
+ (1 << lopt->max_qlen_log) < nr_table_entries;
lopt->max_qlen_log++);
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
@@ -65,9 +75,11 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
+ size_t lopt_size = sizeof(struct listen_sock) +
+ lopt->nr_table_entries * sizeof(struct request_sock *);
if (lopt->qlen != 0) {
- int i;
+ unsigned int i;
for (i = 0; i < lopt->nr_table_entries; i++) {
struct request_sock *req;
@@ -81,7 +93,10 @@ void reqsk_queue_destroy(struct request_sock_queue *queue)
}
BUG_TRAP(lopt->qlen == 0);
- kfree(lopt);
+ if (lopt_size > PAGE_SIZE)
+ vfree(lopt);
+ else
+ kfree(lopt);
}
EXPORT_SYMBOL(reqsk_queue_destroy);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index e08e7688a263..0a5d68dbb418 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1022,7 +1022,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
-static struct request_sock_ops dccp_request_sock_ops = {
+static struct request_sock_ops dccp_request_sock_ops _read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct dccp_request_sock),
.rtx_syn_ack = dccp_v4_send_response,
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 72cbdcfc2c65..047d170a363a 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -262,12 +262,12 @@ int dccp_destroy_sock(struct sock *sk)
EXPORT_SYMBOL_GPL(dccp_destroy_sock);
-static inline int dccp_listen_start(struct sock *sk)
+static inline int dccp_listen_start(struct sock *sk, int backlog)
{
struct dccp_sock *dp = dccp_sk(sk);
dp->dccps_role = DCCP_ROLE_LISTEN;
- return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+ return inet_csk_listen_start(sk, backlog);
}
int dccp_disconnect(struct sock *sk, int flags)
@@ -788,7 +788,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
* FIXME: here it probably should be sk->sk_prot->listen_start
* see tcp_listen_start
*/
- err = dccp_listen_start(sk);
+ err = dccp_listen_start(sk, backlog);
if (err)
goto out;
}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index edcf0932ac6d..4a81d54a7569 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -204,7 +204,7 @@ int inet_listen(struct socket *sock, int backlog)
* we can only allow the backlog to be adjusted.
*/
if (old_state != TCP_LISTEN) {
- err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+ err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 96bbe2a0aa1b..9d68837888d3 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -343,7 +343,7 @@ struct dst_entry* inet_csk_route_req(struct sock *sk,
EXPORT_SYMBOL_GPL(inet_csk_route_req);
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
- const u32 rnd, const u16 synq_hsize)
+ const u32 rnd, const u32 synq_hsize)
{
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 22ef8bd26620..5fbf96552cac 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -715,7 +715,7 @@ static struct ip_options *tcp_v4_save_options(struct sock *sk,
return dopt;
}
-struct request_sock_ops tcp_request_sock_ops = {
+struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
.rtx_syn_ack = tcp_v4_send_synack,
@@ -1385,7 +1385,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
if (st->state == TCP_SEQ_STATE_OPENREQ) {
struct request_sock *req = cur;
- icsk = inet_csk(st->syn_wait_sk);
+ icsk = inet_csk(st->syn_wait_sk);
req = req->dl_next;
while (1) {
while (req) {
@@ -1395,7 +1395,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
}
req = req->dl_next;
}
- if (++st->sbucket >= TCP_SYNQ_HSIZE)
+ if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break;
get_req:
req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index eb6d145ecfd7..1a3c46c139f8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -526,7 +526,7 @@ static void tcp_v6_reqsk_destructor(struct request_sock *req)
kfree_skb(inet6_rsk(req)->pktopts);
}
-static struct request_sock_ops tcp6_request_sock_ops = {
+static struct request_sock_ops tcp6_request_sock_ops _read_mostly = {
.family = AF_INET6,
.obj_size = sizeof(struct tcp6_request_sock),
.rtx_syn_ack = tcp_v6_send_synack,