summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorEric Dumazet <edumazet@google.com>2015-05-24 14:49:35 -0700
committerDavid S. Miller <davem@davemloft.net>2015-05-27 13:30:44 -0400
commit07f4c90062f8fc7c8c26f8f95324cbe8fa3145a5 (patch)
tree47f456afc79f5493af730edc0611bf166ea2c68e /net/ipv4
parent837b9955b1dd22719f0c7dcf321690f5392d99c3 (diff)
downloadlwn-07f4c90062f8fc7c8c26f8f95324cbe8fa3145a5.tar.gz
lwn-07f4c90062f8fc7c8c26f8f95324cbe8fa3145a5.zip
tcp/dccp: try to not exhaust ip_local_port_range in connect()
A long standing problem on busy servers is the tiny available TCP port range (/proc/sys/net/ipv4/ip_local_port_range) and the default sequential allocation of source ports in connect() system call. If a host is having a lot of active TCP sessions, chances are very high that all ports are in use by at least one flow, and subsequent bind(0) attempts fail, or have to scan a big portion of space to find a slot. In this patch, I changed the starting point in __inet_hash_connect() so that we try to favor even [1] ports, leaving odd ports for bind() users. We still perform a sequential search, so there is no guarantee, but if connect() targets are very different, end result is we leave more ports available to bind(), and we spread them all over the range, lowering time for both connect() and bind() to find a slot. This strategy only works well if /proc/sys/net/ipv4/ip_local_port_range is even, ie if start/end values have different parity. Therefore, default /proc/sys/net/ipv4/ip_local_port_range was changed to 32768 - 60999 (instead of 32768 - 61000) There is no change on security aspects here, only some poor hashing schemes could be eventually impacted by this change. [1] : The odd/even property depends on ip_local_port_range values parity Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c2
-rw-r--r--net/ipv4/inet_hashtables.c10
2 files changed, 9 insertions, 3 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 235d36afece3..6ad0f7a711c9 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1595,7 +1595,7 @@ static __net_init int inet_init_net(struct net *net)
*/
seqlock_init(&net->ipv4.ip_local_ports.lock);
net->ipv4.ip_local_ports.range[0] = 32768;
- net->ipv4.ip_local_ports.range[1] = 61000;
+ net->ipv4.ip_local_ports.range[1] = 60999;
seqlock_init(&net->ipv4.ping_group_range.lock);
/*
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 185efef0f125..1b336dc179f8 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -502,8 +502,14 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
+ /* By starting with offset being an even number,
+ * we tend to leave about 50% of ports for other uses,
+ * like bind(0).
+ */
+ offset &= ~1;
+
local_bh_disable();
- for (i = 1; i <= remaining; i++) {
+ for (i = 0; i < remaining; i++) {
port = low + (i + offset) % remaining;
if (inet_is_local_reserved_port(net, port))
continue;
@@ -547,7 +553,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
return -EADDRNOTAVAIL;
ok:
- hint += i;
+ hint += (i + 2) & ~1;
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port);