summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2015-08-31 12:34:00 -0700
committerDavid S. Miller <davem@davemloft.net>2015-08-31 12:34:00 -0700
commit9dc30648f0708cf063e29470d83f63f8dc8fc430 (patch)
treef12130b61955f4471ebd61474244ecf9ebdc0858
parent87583ebb9f6ea6dc7f8ef167b815656787e429fc (diff)
parentc3a8d9474684d391b0afc3970d9b249add15ec07 (diff)
downloadlwn-9dc30648f0708cf063e29470d83f63f8dc8fc430.tar.gz
lwn-9dc30648f0708cf063e29470d83f63f8dc8fc430.zip
Merge branch 'per-route-dctcp-receive-side'
Daniel Borkmann says:

====================
tcp: receive-side per route dctcp handling

Original cover letter:

Currently, the following case doesn't use DCTCP, even if it should:

  - responder has e.g. cubic as system wide default
  - 'ip route congctl dctcp $src' was set

Then, DCTCP is NOT used if a DCTCP sender attempts to connect from a host in the $src range: ECT(0) is set, but listen_sk is not dctcp, so we fail the INET_ECN_is_not_ect sanity check.

We also have to examine the dst used for the SYN/ACK reply to make this case work.

In order to minimize additional cost, store the 'ecn is must have' information in the dst_features field.

The set targets -next instead of -net since this doesn't seem to be a serious bug and to give the change more soak time until it hits Linus' tree.

v1 -> v2:
  - Addressed Dave's feedback, not exposing any bits to user space
  - Added patch 3 to reject incorrect configurations
  - Rest as is, rebased and retested
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/net/dst.h6
-rw-r--r--include/net/tcp.h2
-rw-r--r--include/uapi/linux/rtnetlink.h11
-rw-r--r--net/core/rtnetlink.c6
-rw-r--r--net/ipv4/fib_semantics.c77
-rw-r--r--net/ipv4/tcp_cong.c9
-rw-r--r--net/ipv4/tcp_input.c7
-rw-r--r--net/ipv6/route.c39
8 files changed, 101 insertions, 56 deletions
diff --git a/include/net/dst.h b/include/net/dst.h
index 4c4801645371..9261d928303d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -207,6 +207,12 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
p[metric-1] = val;
}
+/* Kernel-internal feature bits that are unallocated in user space. */
+#define DST_FEATURE_ECN_CA (1 << 31)
+
+#define DST_FEATURE_MASK (DST_FEATURE_ECN_CA)
+#define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)
+
static inline u32
dst_feature(const struct dst_entry *dst, u32 feature)
{
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4a7b03947a38..0cab28cd43a9 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
-u32 tcp_ca_get_key_by_name(const char *name);
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca);
#ifdef CONFIG_INET
char *tcp_ca_get_name_by_key(u32 key, char *buffer);
#else
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 0d3d3cc43356..702024769c74 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -418,10 +418,13 @@ enum {
#define RTAX_MAX (__RTAX_MAX - 1)
-#define RTAX_FEATURE_ECN 0x00000001
-#define RTAX_FEATURE_SACK 0x00000002
-#define RTAX_FEATURE_TIMESTAMP 0x00000004
-#define RTAX_FEATURE_ALLFRAG 0x00000008
+#define RTAX_FEATURE_ECN (1 << 0)
+#define RTAX_FEATURE_SACK (1 << 1)
+#define RTAX_FEATURE_TIMESTAMP (1 << 2)
+#define RTAX_FEATURE_ALLFRAG (1 << 3)
+
+#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
+ RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
struct rta_session {
__u8 proto;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 788ceed39463..a466821d1441 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -678,6 +678,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
continue;
if (nla_put_string(skb, i + 1, name))
goto nla_put_failure;
+ } else if (i == RTAX_FEATURES - 1) {
+ u32 user_features = metrics[i] & RTAX_FEATURE_MASK;
+
+ BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
+ if (nla_put_u32(skb, i + 1, user_features))
+ goto nla_put_failure;
} else {
if (nla_put_u32(skb, i + 1, metrics[i]))
goto nla_put_failure;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1b2d01170a4d..992a9597daf8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -876,6 +876,50 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
return true;
}
+static int
+fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
+{
+ bool ecn_ca = false;
+ struct nlattr *nla;
+ int remaining;
+
+ if (!cfg->fc_mx)
+ return 0;
+
+ nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+ int type = nla_type(nla);
+ u32 val;
+
+ if (!type)
+ continue;
+ if (type > RTAX_MAX)
+ return -EINVAL;
+
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
+
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ if (val == TCP_CA_UNSPEC)
+ return -EINVAL;
+ } else {
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_ADVMSS && val > 65535 - 40)
+ val = 65535 - 40;
+ if (type == RTAX_MTU && val > 65535 - 15)
+ val = 65535 - 15;
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ return -EINVAL;
+ fi->fib_metrics[type - 1] = val;
+ }
+
+ if (ecn_ca)
+ fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
+
+ return 0;
+}
+
struct fib_info *fib_create_info(struct fib_config *cfg)
{
int err;
@@ -948,36 +992,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
} endfor_nexthops(fi)
- if (cfg->fc_mx) {
- struct nlattr *nla;
- int remaining;
-
- nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
- int type = nla_type(nla);
-
- if (type) {
- u32 val;
-
- if (type > RTAX_MAX)
- goto err_inval;
- if (type == RTAX_CC_ALGO) {
- char tmp[TCP_CA_NAME_MAX];
-
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(tmp);
- if (val == TCP_CA_UNSPEC)
- goto err_inval;
- } else {
- val = nla_get_u32(nla);
- }
- if (type == RTAX_ADVMSS && val > 65535 - 40)
- val = 65535 - 40;
- if (type == RTAX_MTU && val > 65535 - 15)
- val = 65535 - 15;
- fi->fib_metrics[type - 1] = val;
- }
- }
- }
+ err = fib_convert_metrics(fi, cfg);
+ if (err)
+ goto failure;
if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index a2ed23c595cf..93c4dc3ab23f 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
-u32 tcp_ca_get_key_by_name(const char *name)
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
- u32 key;
+ u32 key = TCP_CA_UNSPEC;
might_sleep();
rcu_read_lock();
ca = __tcp_ca_find_autoload(name);
- key = ca ? ca->key : TCP_CA_UNSPEC;
+ if (ca) {
+ key = ca->key;
+ *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
+ }
rcu_read_unlock();
return key;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index dc08e2352665..a8f515bb19c4 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6003,14 +6003,17 @@ static void tcp_ecn_create_request(struct request_sock *req,
const struct net *net = sock_net(listen_sk);
bool th_ecn = th->ece && th->cwr;
bool ect, ecn_ok;
+ u32 ecn_ok_dst;
if (!th_ecn)
return;
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
- ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
+ ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
+ ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
- if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
+ if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+ (ecn_ok_dst & DST_FEATURE_ECN_CA))
inet_rsk(req)->ecn_ok = 1;
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 308dd5f9158f..f45cac6f8356 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1698,6 +1698,7 @@ out:
static int ip6_convert_metrics(struct mx6_config *mxc,
const struct fib6_config *cfg)
{
+ bool ecn_ca = false;
struct nlattr *nla;
int remaining;
u32 *mp;
@@ -1711,30 +1712,36 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
int type = nla_type(nla);
+ u32 val;
- if (type) {
- u32 val;
+ if (!type)
+ continue;
+ if (unlikely(type > RTAX_MAX))
+ goto err;
+
+ if (type == RTAX_CC_ALGO) {
+ char tmp[TCP_CA_NAME_MAX];
- if (unlikely(type > RTAX_MAX))
+ nla_strlcpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ if (val == TCP_CA_UNSPEC)
goto err;
- if (type == RTAX_CC_ALGO) {
- char tmp[TCP_CA_NAME_MAX];
+ } else {
+ val = nla_get_u32(nla);
+ }
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ goto err;
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(tmp);
- if (val == TCP_CA_UNSPEC)
- goto err;
- } else {
- val = nla_get_u32(nla);
- }
+ mp[type - 1] = val;
+ __set_bit(type - 1, mxc->mx_valid);
+ }
- mp[type - 1] = val;
- __set_bit(type - 1, mxc->mx_valid);
- }
+ if (ecn_ca) {
+ __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
+ mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
}
mxc->mx = mp;
-
return 0;
err:
kfree(mp);