From 087c1a601ad7f851a2d31f5fa0e5e9dfc766df55 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Apr 2015 20:14:07 -0700 Subject: net: sched: run ingress qdisc without locks TC classifiers/actions were converted to RCU by John in the series: http://thread.gmane.org/gmane.linux.network/329739/focus=329739 and many follow-on patches. This is the last patch from that series that finally drops the ingress spin_lock. Single cpu ingress+u32 performance goes from 22.9 Mpps to 24.5 Mpps. In the two cpu case, when both cores are receiving traffic on the same device and go into the same ingress+u32, the performance jumps from 4.5 + 4.5 Mpps to 23.5 + 23.5 Mpps. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Signed-off-by: Jamal Hadi Salim Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c7ba0388f1be..74a5b62f7568 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3547,10 +3547,8 @@ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) q = rcu_dereference(rxq->qdisc); if (q != &noop_qdisc) { - spin_lock(qdisc_lock(q)); if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) result = qdisc_enqueue_root(skb, q); - spin_unlock(qdisc_lock(q)); } return result; -- cgit v1.2.3 From 50fb799289501c2eab9f43fc9af513027e1e994f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:12 -0700 Subject: net: Add skb_get_hash_perturb This calls skb_flow_dissect and __flow_hash_from_keys to procure a hash for a packet. Input includes a key to initialize jhash. This function does not set skb->hash. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ net/core/flow_dissector.c | 38 ++++++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 66e374d62f64..acb83e249e3f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -927,6 +927,8 @@ static inline __u32 skb_get_hash(struct sk_buff *skb) return skb->hash; } +__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb); + static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) { return skb->hash; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2c35c02a931e..b1df995ef06c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -267,13 +267,12 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c) +static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval) { - __flow_hash_secret_init(); - return jhash_3words(a, b, c, hashrnd); + return jhash_3words(a, b, c, keyval); } -static inline u32 __flow_hash_from_keys(struct flow_keys *keys) +static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) { u32 hash; @@ -287,7 +286,8 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys) hash = __flow_hash_3words((__force u32)keys->dst, (__force u32)keys->src, - (__force u32)keys->ports); + (__force u32)keys->ports, + keyval); if (!hash) hash = 1; @@ -296,10 +296,20 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys) u32 flow_hash_from_keys(struct flow_keys *keys) { - return __flow_hash_from_keys(keys); + __flow_hash_secret_init(); + return __flow_hash_from_keys(keys, hashrnd); }
EXPORT_SYMBOL(flow_hash_from_keys); +static inline u32 ___skb_get_hash(const struct sk_buff *skb, + struct flow_keys *keys, u32 keyval) +{ + if (!skb_flow_dissect(skb, keys)) + return 0; + + return __flow_hash_from_keys(keys, keyval); +} + /* * __skb_get_hash: calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value @@ -309,8 +319,12 @@ EXPORT_SYMBOL(flow_hash_from_keys); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; + u32 hash; - if (!skb_flow_dissect(skb, &keys)) + __flow_hash_secret_init(); + + hash = ___skb_get_hash(skb, &keys, hashrnd); + if (!hash) return; if (keys.ports) @@ -318,10 +332,18 @@ void __skb_get_hash(struct sk_buff *skb) skb->sw_hash = 1; - skb->hash = __flow_hash_from_keys(&keys); + skb->hash = hash; } EXPORT_SYMBOL(__skb_get_hash); +__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) +{ + struct flow_keys keys; + + return ___skb_get_hash(skb, &keys, perturb); +} +EXPORT_SYMBOL(skb_get_hash_perturb); + /* * Returns a Tx hash based on the given packet descriptor a Tx queues' number * to be used as a distribution range. -- cgit v1.2.3 From 2f59e1ebaa7f762c8825871b5486b5f5b4fa952f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 1 May 2015 11:30:17 -0700 Subject: net: Add flow_keys digest Some users of flow keys (well, just sch_choke now) need to pass flow_keys in the skbuff cb and use them for exact comparisons of flows, for which skb->hash is not sufficient. In order to allow the size of the flow_keys structure to increase, we introduce another structure for the purpose of passing flow keys in the skbuff cb. We limit this structure to sixteen bytes, and we will technically treat this as a digest of the flow_keys struct, hence its name flow_keys_digest. In the first incarnation we just copy the flow_keys structure up to 16 bytes; this is the same information previously passed in the cb. In the future, we'll adapt this for larger flow_keys and could use something like SHA-1 over the whole flow_keys to improve the quality of the digest. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_keys.h | 16 ++++++++++++++++ net/core/flow_dissector.c | 27 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) (limited to 'net/core') diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index dc8fd81412bf..6d6ef626811a 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -42,4 +42,20 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 u32 flow_hash_from_keys(struct flow_keys *keys); unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len, __be16 protocol); + +/* struct flow_keys_digest: + * + * This structure is used to hold a digest of the full flow keys. This is a + * larger "hash" of a flow to allow definitively matching specific flows where + * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so + * that it can by used in CB of skb (see sch_choke for an example).
+ */ +#define FLOW_KEYS_DIGEST_LEN 16 +struct flow_keys_digest { + u8 data[FLOW_KEYS_DIGEST_LEN]; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow); + #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b1df995ef06c..d3acc4dff4ae 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -310,6 +310,33 @@ static inline u32 ___skb_get_hash(const struct sk_buff *skb, return __flow_hash_from_keys(keys, keyval); } +struct _flow_keys_digest_data { + __be16 n_proto; + u8 ip_proto; + u8 padding; + __be32 ports; + __be32 src; + __be32 dst; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow) +{ + struct _flow_keys_digest_data *data = + (struct _flow_keys_digest_data *)digest; + + BUILD_BUG_ON(sizeof(*data) > sizeof(*digest)); + + memset(digest, 0, sizeof(*digest)); + + data->n_proto = flow->n_proto; + data->ip_proto = flow->ip_proto; + data->ports = flow->ports; + data->src = flow->src; + data->dst = flow->dst; +} +EXPORT_SYMBOL(make_flow_keys_digest); + /* * __skb_get_hash: calculate a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value -- cgit v1.2.3 From c19ae86a510cf4332af64caab04718bc853d3184 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Fri, 1 May 2015 22:19:43 -0700 Subject: tc: remove unused redirect ttl improves ingress+u32 performance from 22.4 Mpps to 22.9 Mpps Signed-off-by: Jamal Hadi Salim Signed-off-by: Alexei Starovoitov Acked-by: Florian Westphal Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 ++ net/core/dev.c | 9 --------- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 6810ca43a80a..596ffa0c7084 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -65,11 +65,13 @@ bits 9,10,11: redirect counter - redirect TTL. 
Loop avoidance #define SET_TC_NCLS(v) ( TC_NCLS | (v & ~TC_NCLS)) #define CLR_TC_NCLS(v) ( v & ~TC_NCLS) +#ifndef __KERNEL__ #define S_TC_RTTL _TC_MAKE32(9) #define M_TC_RTTL _TC_MAKEMASK(3,S_TC_RTTL) #define G_TC_RTTL(x) _TC_GETVALUE(x,S_TC_RTTL,M_TC_RTTL) #define V_TC_RTTL(x) _TC_MAKEVALUE(x,S_TC_RTTL) #define SET_TC_RTTL(v,n) ((V_TC_RTTL(n)) | (v & ~M_TC_RTTL)) +#endif #define S_TC_AT _TC_MAKE32(12) #define M_TC_AT _TC_MAKEMASK(2,S_TC_AT) diff --git a/net/core/dev.c b/net/core/dev.c index 74a5b62f7568..862875ec8f2f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3531,18 +3531,9 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); */ static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) { - struct net_device *dev = skb->dev; - u32 ttl = G_TC_RTTL(skb->tc_verd); int result = TC_ACT_OK; struct Qdisc *q; - if (unlikely(MAX_RED_LOOP < ttl++)) { - net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", - skb->skb_iif, dev->ifindex); - return TC_ACT_SHOT; - } - - skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); q = rcu_dereference(rxq->qdisc); -- cgit v1.2.3 From 9afd85c9e4552b276e2f4cfefd622bdeeffbbf26 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Sat, 2 May 2015 14:01:07 +0200 Subject: net: Export IGMP/MLD message validation code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch, the IGMP and MLD message validation functions are moved from the bridge code to IPv4/IPv6 multicast files. Some small refactoring was done to enhance readibility and to iron out some differences in behaviour between the IGMP and MLD parsing code (e.g. the skb-cloning of MLD messages is now only done if necessary, just like the IGMP part always did). Finally, these IGMP and MLD message validation functions are exported so that not only the bridge can use it but batman-adv later, too. Signed-off-by: Linus Lüssing Signed-off-by: David S. 
Miller --- include/linux/igmp.h | 1 + include/linux/skbuff.h | 3 + include/net/addrconf.h | 1 + net/bridge/br_multicast.c | 218 +++++++--------------------------------------- net/core/skbuff.c | 87 ++++++++++++++++++ net/ipv4/igmp.c | 162 ++++++++++++++++++++++++++++++++++ net/ipv6/Makefile | 1 + net/ipv6/mcast_snoop.c | 213 ++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 498 insertions(+), 188 deletions(-) create mode 100644 net/ipv6/mcast_snoop.c (limited to 'net/core') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 2c677afeea47..193ad488d3e2 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -130,5 +130,6 @@ extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void ip_mc_dec_group(struct in_device *in_dev, __be32 addr); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed); #endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index acb83e249e3f..9c2f793573fa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3419,6 +3419,9 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb) bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); int skb_checksum_setup(struct sk_buff *skb, bool recalculate); +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 80456f72d70a..def59d3a34d5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -142,6 +142,7 @@ void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed); void addrconf_dad_failure(struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b52f4cb8aee9..2d69d5cab52f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -975,9 +975,6 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, int err = 0; __be32 group; - if (!pskb_may_pull(skb, sizeof(*ih))) - return -EINVAL; - ih = igmpv3_report_hdr(skb); num = ntohs(ih->ngrec); len = sizeof(*ih); @@ -1248,25 +1245,14 @@ static int br_ip4_multicast_query(struct net_bridge *br, max_delay = 10 * HZ; group = 0; } - } else { - if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) { - err = -EINVAL; - goto out; - } - + } else if (skb->len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs) goto out; max_delay = ih3->code ? 
IGMPV3_MRC(ih3->code) * (HZ / IGMP_TIMER_SCALE) : 1; - } - - /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer - * all-systems destination addresses (224.0.0.1) for general queries - */ - if (!group && iph->daddr != htonl(INADDR_ALLHOSTS_GROUP)) { - err = -EINVAL; + } else { goto out; } @@ -1329,12 +1315,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, (port && port->state == BR_STATE_DISABLED)) goto out; - /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ - if (!(ipv6_addr_type(&ip6h->saddr) & IPV6_ADDR_LINKLOCAL)) { - err = -EINVAL; - goto out; - } - if (skb->len == sizeof(*mld)) { if (!pskb_may_pull(skb, sizeof(*mld))) { err = -EINVAL; @@ -1358,14 +1338,6 @@ static int br_ip6_multicast_query(struct net_bridge *br, is_general_query = group && ipv6_addr_any(group); - /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer - * all-nodes destination address (ff02::1) for general queries - */ - if (is_general_query && !ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) { - err = -EINVAL; - goto out; - } - if (is_general_query) { saddr.proto = htons(ETH_P_IPV6); saddr.u.ip6 = ip6h->saddr; @@ -1557,66 +1529,22 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, struct sk_buff *skb, u16 vid) { - struct sk_buff *skb2 = skb; - const struct iphdr *iph; + struct sk_buff *skb_trimmed = NULL; struct igmphdr *ih; - unsigned int len; - unsigned int offset; int err; - /* We treat OOM as packet loss for now. */ - if (!pskb_may_pull(skb, sizeof(*iph))) - return -EINVAL; - - iph = ip_hdr(skb); - - if (iph->ihl < 5 || iph->version != 4) - return -EINVAL; - - if (!pskb_may_pull(skb, ip_hdrlen(skb))) - return -EINVAL; - - iph = ip_hdr(skb); + err = ip_mc_check_igmp(skb, &skb_trimmed); - if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) - return -EINVAL; - - if (iph->protocol != IPPROTO_IGMP) { - if (!ipv4_is_local_multicast(iph->daddr)) + if (err == -ENOMSG) { + if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; + } else if (err < 0) { + return err; } - len = ntohs(iph->tot_len); - if (skb->len < len || len < ip_hdrlen(skb)) - return -EINVAL; - - if (skb->len > len) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = pskb_trim_rcsum(skb2, len); - if (err) - goto err_out; - } - - len -= ip_hdrlen(skb2); - offset = skb_network_offset(skb2) + ip_hdrlen(skb2); - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - - err = -EINVAL; - if (!pskb_may_pull(skb2, sizeof(*ih))) - goto out; - - if (skb_checksum_simple_validate(skb2)) - goto out; - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; - ih = igmp_hdr(skb2); + ih = igmp_hdr(skb); switch (ih->type) { case IGMP_HOST_MEMBERSHIP_REPORT: @@ -1625,21 +1553,19 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, err = br_ip4_multicast_add_group(br, port, ih->group, vid); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb2, vid); + err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - err = br_ip4_multicast_query(br, port, skb2, vid); + err = br_ip4_multicast_query(br, port, skb_trimmed, vid); break; case IGMP_HOST_LEAVE_MESSAGE: br_ip4_multicast_leave_group(br, port, ih->group, vid); break; } -out: - __skb_push(skb2, offset); -err_out: - if (skb2 != skb) - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } @@ -1649,126 +1575,42 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, struct 
sk_buff *skb, u16 vid) { - struct sk_buff *skb2; - const struct ipv6hdr *ip6h; - u8 icmp6_type; - u8 nexthdr; - __be16 frag_off; - unsigned int len; - int offset; + struct sk_buff *skb_trimmed = NULL; + struct mld_msg *mld; int err; - if (!pskb_may_pull(skb, sizeof(*ip6h))) - return -EINVAL; + err = ipv6_mc_check_mld(skb, &skb_trimmed); - ip6h = ipv6_hdr(skb); - - /* - * We're interested in MLD messages only. - * - Version is 6 - * - MLD has always Router Alert hop-by-hop option - * - But we do not support jumbrograms. - */ - if (ip6h->version != 6) - return 0; - - /* Prevent flooding this packet if there is no listener present */ - if (!ipv6_addr_is_ll_all_nodes(&ip6h->daddr)) - BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - - if (ip6h->nexthdr != IPPROTO_HOPOPTS || - ip6h->payload_len == 0) - return 0; - - len = ntohs(ip6h->payload_len) + sizeof(*ip6h); - if (skb->len < len) - return -EINVAL; - - nexthdr = ip6h->nexthdr; - offset = ipv6_skip_exthdr(skb, sizeof(*ip6h), &nexthdr, &frag_off); - - if (offset < 0 || nexthdr != IPPROTO_ICMPV6) + if (err == -ENOMSG) { + if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + BR_INPUT_SKB_CB(skb)->mrouters_only = 1; return 0; - - /* Okay, we found ICMPv6 header */ - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - return -ENOMEM; - - err = -EINVAL; - if (!pskb_may_pull(skb2, offset + sizeof(struct icmp6hdr))) - goto out; - - len -= offset - skb_network_offset(skb2); - - __skb_pull(skb2, offset); - skb_reset_transport_header(skb2); - skb_postpull_rcsum(skb2, skb_network_header(skb2), - skb_network_header_len(skb2)); - - icmp6_type = icmp6_hdr(skb2)->icmp6_type; - - switch (icmp6_type) { - case ICMPV6_MGM_QUERY: - case ICMPV6_MGM_REPORT: - case ICMPV6_MGM_REDUCTION: - case ICMPV6_MLD2_REPORT: - break; - default: - err = 0; - goto out; - } - - /* Okay, we found MLD message. Check further. 
*/ - if (skb2->len > len) { - err = pskb_trim_rcsum(skb2, len); - if (err) - goto out; - err = -EINVAL; + } else if (err < 0) { + return err; } - ip6h = ipv6_hdr(skb2); - - if (skb_checksum_validate(skb2, IPPROTO_ICMPV6, ip6_compute_pseudo)) - goto out; - - err = 0; - BR_INPUT_SKB_CB(skb)->igmp = 1; + mld = (struct mld_msg *)skb_transport_header(skb); - switch (icmp6_type) { + switch (mld->mld_type) { case ICMPV6_MGM_REPORT: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); BR_INPUT_SKB_CB(skb)->mrouters_only = 1; err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid); break; - } case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb2, vid); + err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb2, vid); + err = br_ip6_multicast_query(br, port, skb_trimmed, vid); break; case ICMPV6_MGM_REDUCTION: - { - struct mld_msg *mld; - if (!pskb_may_pull(skb2, sizeof(*mld))) { - err = -EINVAL; - goto out; - } - mld = (struct mld_msg *)skb_transport_header(skb2); br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid); - } + break; } -out: - kfree_skb(skb2); + if (skb_trimmed) + kfree_skb(skb_trimmed); + return err; } #endif diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3cfff2a3d651..1e4278a4dd7e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4030,6 +4030,93 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate) } EXPORT_SYMBOL(skb_checksum_setup); +/** + * skb_checksum_maybe_trim - maybe trims the given skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * + * Checks whether the given skb has data beyond the given transport length. + * If so, returns a cloned skb trimmed to this transport length. + * Otherwise returns the provided skb. Returns NULL in error cases + * (e.g. transport_len exceeds skb length or out-of-memory). + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. + */ +static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb, + unsigned int transport_len) +{ + struct sk_buff *skb_chk; + unsigned int len = skb_transport_offset(skb) + transport_len; + int ret; + + if (skb->len < len) { + kfree_skb(skb); + return NULL; + } else if (skb->len == len) { + return skb; + } + + skb_chk = skb_clone(skb, GFP_ATOMIC); + kfree_skb(skb); + + if (!skb_chk) + return NULL; + + ret = pskb_trim_rcsum(skb_chk, len); + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} + +/** + * skb_checksum_trimmed - validate checksum of an skb + * @skb: the skb to check + * @transport_len: the data length beyond the network header + * @skb_chkf: checksum function to use + * + * Applies the given checksum function skb_chkf to the provided skb. + * Returns a checked and maybe trimmed skb. Returns NULL on error. + * + * If the skb has data beyond the given transport length, then a + * trimmed & cloned skb is checked and returned. + * + * Caller needs to set the skb transport header and release the returned skb. + * Provided skb is consumed. 
+ */ +struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, + unsigned int transport_len, + __sum16(*skb_chkf)(struct sk_buff *skb)) +{ + struct sk_buff *skb_chk; + unsigned int offset = skb_transport_offset(skb); + int ret; + + skb_chk = skb_checksum_maybe_trim(skb, transport_len); + if (!skb_chk) + return NULL; + + if (!pskb_may_pull(skb_chk, offset)) { + kfree_skb(skb_chk); + return NULL; + } + + __skb_pull(skb_chk, offset); + ret = skb_chkf(skb_chk); + __skb_push(skb_chk, offset); + + if (ret) { + kfree_skb(skb_chk); + return NULL; + } + + return skb_chk; +} +EXPORT_SYMBOL(skb_checksum_trimmed); + void __skb_warn_lro_forwarding(const struct sk_buff *skb) { net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n", diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a3a697f5ffba..651cdf648ec4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1339,6 +1339,168 @@ out: } EXPORT_SYMBOL(ip_mc_inc_group); +static int ip_mc_check_iphdr(struct sk_buff *skb) +{ + const struct iphdr *iph; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph)) + return -EINVAL; + + offset += ip_hdrlen(skb) - sizeof(*iph); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + iph = ip_hdr(skb); + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + return -EINVAL; + + len = skb_network_offset(skb) + ntohs(iph->tot_len); + if (skb->len < len || len < offset) + return -EINVAL; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ip_mc_check_igmp_reportv3(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmpv3_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ip_mc_check_igmp_query(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct igmphdr); + if (skb->len < len) + return -EINVAL; + + /* IGMPv{1,2}? */ + if (skb->len != len) { + /* or IGMPv3? 
*/ + len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer + * all-systems destination addresses (224.0.0.1) for general queries + */ + if (!igmp_hdr(skb)->group && + ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP)) + return -EINVAL; + + return 0; +} + +static int ip_mc_check_igmp_msg(struct sk_buff *skb) +{ + switch (igmp_hdr(skb)->type) { + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + /* fall through */ + return 0; + case IGMPV3_HOST_MEMBERSHIP_REPORT: + return ip_mc_check_igmp_reportv3(skb); + case IGMP_HOST_MEMBERSHIP_QUERY: + return ip_mc_check_igmp_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_simple_validate(skb); +} + +static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr); + int ret; + + transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ip_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ip_mc_check_igmp_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ip_mc_check_igmp - checks whether this is a sane IGMP packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional) + * + * Checks whether an IPv4 packet is a valid IGMP packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an IGMP packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an IGMP packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. + */ +int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret = ip_mc_check_iphdr(skb); + + if (ret < 0) + return ret; + + if (ip_hdr(skb)->protocol != IPPROTO_IGMP) + return -ENOMSG; + + return __ip_mc_check_igmp(skb, skb_trimmed); +} +EXPORT_SYMBOL(ip_mc_check_igmp); + /* * Resend IGMP JOIN report; used by netdev notifier. 
*/ diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2e8c06108ab9..0f3f1999719a 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -48,4 +48,5 @@ obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o ifneq ($(CONFIG_IPV6),) obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o +obj-y += mcast_snoop.o endif diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c new file mode 100644 index 000000000000..1a2cbc13a7d3 --- /dev/null +++ b/net/ipv6/mcast_snoop.c @@ -0,0 +1,213 @@ +/* Copyright (C) 2010: YOSHIFUJI Hideaki + * Copyright (C) 2015: Linus Lüssing + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * + * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki. + */ + +#include +#include +#include +#include +#include + +static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int len; + unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h); + + if (!pskb_may_pull(skb, offset)) + return -EINVAL; + + ip6h = ipv6_hdr(skb); + + if (ip6h->version != 6) + return -EINVAL; + + len = offset + ntohs(ip6h->payload_len); + if (skb->len < len || len <= offset) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_exthdrs(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h; + unsigned int offset; + u8 nexthdr; + __be16 frag_off; + + ip6h = ipv6_hdr(skb); + + if (ip6h->nexthdr != IPPROTO_HOPOPTS) + return -ENOMSG; + + nexthdr = ip6h->nexthdr; + offset = skb_network_offset(skb) + sizeof(*ip6h); + offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); + + if (offset < 0) + return -EINVAL; + + if (nexthdr != IPPROTO_ICMPV6) + return -ENOMSG; + + skb_set_transport_header(skb, offset); + + return 0; +} + +static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb) +{ + unsigned int len = skb_transport_offset(skb); + + len += sizeof(struct mld2_report); + + return pskb_may_pull(skb, len) ? 0 : -EINVAL; +} + +static int ipv6_mc_check_mld_query(struct sk_buff *skb) +{ + struct mld_msg *mld; + unsigned int len = skb_transport_offset(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */ + if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + len += sizeof(struct mld_msg); + if (skb->len < len) + return -EINVAL; + + /* MLDv1? */ + if (skb->len != len) { + /* or MLDv2? 
*/ + len += sizeof(struct mld2_query) - sizeof(struct mld_msg); + if (skb->len < len || !pskb_may_pull(skb, len)) + return -EINVAL; + } + + mld = (struct mld_msg *)skb_transport_header(skb); + + /* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer + * all-nodes destination address (ff02::1) for general queries + */ + if (ipv6_addr_any(&mld->mld_mca) && + !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) + return -EINVAL; + + return 0; +} + +static int ipv6_mc_check_mld_msg(struct sk_buff *skb) +{ + struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb); + + switch (mld->mld_type) { + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MGM_REPORT: + /* fall through */ + return 0; + case ICMPV6_MLD2_REPORT: + return ipv6_mc_check_mld_reportv2(skb); + case ICMPV6_MGM_QUERY: + return ipv6_mc_check_mld_query(skb); + default: + return -ENOMSG; + } +} + +static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) +{ + return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); +} + +static int __ipv6_mc_check_mld(struct sk_buff *skb, + struct sk_buff **skb_trimmed) + +{ + struct sk_buff *skb_chk = NULL; + unsigned int transport_len; + unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg); + int ret; + + transport_len = ntohs(ipv6_hdr(skb)->payload_len); + transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr); + + skb_get(skb); + skb_chk = skb_checksum_trimmed(skb, transport_len, + ipv6_mc_validate_checksum); + if (!skb_chk) + return -EINVAL; + + if (!pskb_may_pull(skb_chk, len)) { + kfree_skb(skb_chk); + return -EINVAL; + } + + ret = ipv6_mc_check_mld_msg(skb_chk); + if (ret) { + kfree_skb(skb_chk); + return ret; + } + + if (skb_trimmed) + *skb_trimmed = skb_chk; + else + kfree_skb(skb_chk); + + return 0; +} + +/** + * ipv6_mc_check_mld - checks whether this is a sane MLD packet + * @skb: the skb to validate + * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional) + * + * Checks whether an IPv6 packet is a valid MLD packet. If so sets + * skb network and transport headers accordingly and returns zero. + * + * -EINVAL: A broken packet was detected, i.e. it violates some internet + * standard + * -ENOMSG: IP header validation succeeded but it is not an MLD packet. + * -ENOMEM: A memory allocation failure happened. + * + * Optionally, an skb pointer might be provided via skb_trimmed (or set it + * to NULL): After parsing an MLD packet successfully it will point to + * an skb which has its tail aligned to the IP packet end. This might + * either be the originally provided skb or a trimmed, cloned version if + * the skb frame had data beyond the IP packet. A cloned skb allows us + * to leave the original skb and its full frame unchanged (which might be + * desirable for layer 2 frame jugglers). + * + * The caller needs to release a reference count from any returned skb_trimmed. 
+ */ +int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed) +{ + int ret; + + ret = ipv6_mc_check_ip6hdr(skb); + if (ret < 0) + return ret; + + ret = ipv6_mc_check_exthdrs(skb); + if (ret < 0) + return ret; + + return __ipv6_mc_check_mld(skb, skb_trimmed); +} +EXPORT_SYMBOL(ipv6_mc_check_mld); -- cgit v1.2.3 From fcba67c94abe83e0e69a65737000ccbb16a4fa03 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 5 May 2015 00:19:35 +0200 Subject: net: fix two sparse warnings introduced by IGMP/MLD parsing exports MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit > net/core/skbuff.c:4108:13: sparse: incorrect type in assignment (different base types) > net/ipv6/mcast_snoop.c:63 ipv6_mc_check_exthdrs() warn: unsigned 'offset' is never less than zero. Introduced by 9afd85c9e4552b276e2f4cfefd622bdeeffbbf26 ("net: Export IGMP/MLD message validation code") Reported-by: kbuild test robot Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- net/ipv6/mcast_snoop.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1e4278a4dd7e..b9eb90b39ac7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4093,7 +4093,7 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, { struct sk_buff *skb_chk; unsigned int offset = skb_transport_offset(skb); - int ret; + __sum16 ret; skb_chk = skb_checksum_maybe_trim(skb, transport_len); if (!skb_chk) diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c index 1a2cbc13a7d3..df8afe5ab31e 100644 --- a/net/ipv6/mcast_snoop.c +++ b/net/ipv6/mcast_snoop.c @@ -47,7 +47,7 @@ static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) static int ipv6_mc_check_exthdrs(struct sk_buff *skb) { const struct ipv6hdr *ip6h; - unsigned int offset; + int offset; u8 nexthdr; __be16 frag_off; -- cgit v1.2.3 From 4ae92bc77ac8e620f7c8d59b5882a4cb0d1c4ef1 Mon Sep 17 00:00:00 2001 From: Nicolas Schichan Date: Wed, 6 May 2015 16:12:27 +0200 Subject: net: filter: add a callback to allow classic post-verifier transformations This is in preparation for use by the seccomp code, the rationale is not to duplicate additional code within the seccomp layer, but instead, have it abstracted and hidden within the classic BPF API. As an interim step, this now also makes bpf_prepare_filter() visible (not as exported symbol though), so that seccomp can reuse that code path instead of reimplementing it. Joint work with Daniel Borkmann. Signed-off-by: Nicolas Schichan Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/filter.h | 6 ++++++ net/core/filter.c | 18 +++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/linux/filter.h b/include/linux/filter.h index fa11b3a367be..91996247cb55 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -384,7 +384,13 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); +typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, + unsigned int flen); + int bpf_check_classic(const struct sock_filter *filter, unsigned int flen); +struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans); + int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/net/core/filter.c b/net/core/filter.c index bf831a85c315..e670494f1d83 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -988,7 +988,8 @@ out_err: return ERR_PTR(err); } -static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) +struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) { int err; @@ -1001,6 +1002,17 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp) return ERR_PTR(err); } + /* There might be additional checks and transformations + * needed on classic filters, f.e. in case of seccomp. + */ + if (trans) { + err = trans(fp->insns, fp->len); + if (err) { + __bpf_prog_release(fp); + return ERR_PTR(err); + } + } + /* Probe if we can JIT compile the filter and if so, do * the compilation of the filter. */ @@ -1050,7 +1062,7 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - fp = bpf_prepare_filter(fp); + fp = bpf_prepare_filter(fp, NULL); if (IS_ERR(fp)) return PTR_ERR(fp); @@ -1135,7 +1147,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) /* bpf_prepare_filter() already takes care of freeing * memory in case something goes wrong. */ - prog = bpf_prepare_filter(prog); + prog = bpf_prepare_filter(prog, NULL); if (IS_ERR(prog)) return PTR_ERR(prog); -- cgit v1.2.3 From d9e12f42e58da475379b9080708b94f2095904af Mon Sep 17 00:00:00 2001 From: Nicolas Schichan Date: Wed, 6 May 2015 16:12:28 +0200 Subject: seccomp: simplify seccomp_prepare_filter and reuse bpf_prepare_filter Remove the calls to bpf_check_classic(), bpf_convert_filter() and bpf_migrate_runtime() and let bpf_prepare_filter() take care of that instead. seccomp_check_filter() is passed to bpf_prepare_filter() so that it gets called from there, after bpf_check_classic(). We can now remove exposure of two internal classic BPF functions previously used by seccomp. The export of bpf_check_classic() symbol, previously known as sk_chk_filter(), was there since pre git times, and no in-tree module was using it, therefore remove it. Joint work with Daniel Borkmann. Signed-off-by: Nicolas Schichan Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/filter.h | 4 --- kernel/seccomp.c | 68 ++++++++++++++++---------------------------------- net/core/filter.c | 8 +++--- 3 files changed, 26 insertions(+), 54 deletions(-) (limited to 'net/core') diff --git a/include/linux/filter.h b/include/linux/filter.h index 91996247cb55..0dcb44bcfc5f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -363,9 +363,6 @@ int sk_filter(struct sock *sk, struct sk_buff *skb); void bpf_prog_select_runtime(struct bpf_prog *fp); void bpf_prog_free(struct bpf_prog *fp); -int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len); - struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); @@ -387,7 +384,6 @@ int sk_detach_filter(struct sock *sk); typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, unsigned int flen); -int bpf_check_classic(const struct sock_filter *filter, unsigned int flen); struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, bpf_aux_classic_check_t trans); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4f44028943e6..93d40f7f3683 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -347,15 +347,14 @@ static inline void seccomp_sync_threads(void) static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *filter; - unsigned long fp_size; - struct sock_filter *fp; - int new_len; - long ret; + struct bpf_prog *prog; + unsigned long fsize; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); + BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); - fp_size = fprog->len * sizeof(struct sock_filter); + fsize = bpf_classic_proglen(fprog); /* * Installing a seccomp filter requires that the task has @@ -368,60 +367,37 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return ERR_PTR(-EACCES); - fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); - if (!fp) + prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!prog) return ERR_PTR(-ENOMEM); /* Copy the instructions from fprog. */ - ret = -EFAULT; - if (copy_from_user(fp, fprog->filter, fp_size)) - goto free_prog; - - /* Check and rewrite the fprog via the skb checker */ - ret = bpf_check_classic(fp, fprog->len); - if (ret) - goto free_prog; + if (copy_from_user(prog->insns, fprog->filter, fsize)) { + __bpf_prog_free(prog); + return ERR_PTR(-EFAULT); + } - /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(fp, fprog->len); - if (ret) - goto free_prog; + prog->len = fprog->len; - /* Convert 'sock_filter' insns to 'bpf_insn' insns */ - ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len); - if (ret) - goto free_prog; + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. 
+ */ + prog = bpf_prepare_filter(prog, seccomp_check_filter); + if (IS_ERR(prog)) + return ERR_CAST(prog); /* Allocate a new seccomp_filter */ - ret = -ENOMEM; filter = kzalloc(sizeof(struct seccomp_filter), GFP_KERNEL|__GFP_NOWARN); - if (!filter) - goto free_prog; - - filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN); - if (!filter->prog) - goto free_filter; - - ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); - if (ret) - goto free_filter_prog; + if (!filter) { + bpf_prog_destroy(prog); + return ERR_PTR(-ENOMEM); + } - kfree(fp); + filter->prog = prog; atomic_set(&filter->usage, 1); - filter->prog->len = new_len; - - bpf_prog_select_runtime(filter->prog); return filter; - -free_filter_prog: - __bpf_prog_free(filter->prog); -free_filter: - kfree(filter); -free_prog: - kfree(fp); - return ERR_PTR(ret); } /** diff --git a/net/core/filter.c b/net/core/filter.c index e670494f1d83..f887084740cd 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -355,8 +355,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * for socket filters: ctx == 'struct sk_buff *', for seccomp: * ctx == 'struct seccomp_data *'. */ -int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) +static int bpf_convert_filter(struct sock_filter *prog, int len, + struct bpf_insn *new_prog, int *new_len) { int new_flen = 0, pass = 0, target, i; struct bpf_insn *new_insn; @@ -751,7 +751,8 @@ static bool chk_code_allowed(u16 code_to_probe) * * Returns 0 if the rule set is legal or -EINVAL if not. */ -int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) +static int bpf_check_classic(const struct sock_filter *filter, + unsigned int flen) { bool anc_found; int pc; @@ -825,7 +826,6 @@ int bpf_check_classic(const struct sock_filter *filter, unsigned int flen) return -EINVAL; } -EXPORT_SYMBOL(bpf_check_classic); static int bpf_prog_store_orig_filter(struct bpf_prog *fp, const struct sock_fprog *fprog) -- cgit v1.2.3 From 658da9379d0e1aa1e3e40c9b795357ada818b1c8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 6 May 2015 16:12:29 +0200 Subject: net: filter: add __GFP_NOWARN flag for larger kmem allocs When seccomp BPF was added, it was discussed to add __GFP_NOWARN flag for their configuration path as f.e. up to 32K allocations are more prone to fail under stress. As we're going to reuse BPF API, add __GFP_NOWARN flags where larger kmalloc() and friends allocations could fail. It doesn't make much sense to pass around __GFP_NOWARN everywhere as an extra argument only for seccomp while we just as well could run into similar issues for socket filters, where it's not desired to have a user application throw a WARN() due to allocation failure. Signed-off-by: Daniel Borkmann Cc: Nicolas Schichan Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/core/filter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index f887084740cd..45c015d6b974 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -371,7 +371,8 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, return -EINVAL; if (new_prog) { - addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL); + addrs = kcalloc(len, sizeof(*addrs), + GFP_KERNEL | __GFP_NOWARN); if (!addrs) return -ENOMEM; } @@ -839,7 +840,9 @@ static int bpf_prog_store_orig_filter(struct bpf_prog *fp, fkprog = fp->orig_prog; fkprog->len = fprog->len; - fkprog->filter = kmemdup(fp->insns, fsize, GFP_KERNEL); + + fkprog->filter = kmemdup(fp->insns, fsize, + GFP_KERNEL | __GFP_NOWARN); if (!fkprog->filter) { kfree(fp->orig_prog); return -ENOMEM; } @@ -941,7 +944,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) * pass. At this time, the user BPF is stored in fp->insns. */ old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (!old_prog) { err = -ENOMEM; goto out_err; -- cgit v1.2.3 From ac67eb2c5347bd9976308c0e0cf1d9e7ca690342 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 6 May 2015 16:12:30 +0200 Subject: seccomp, filter: add and use bpf_prog_create_from_user from seccomp Seccomp has always been a special candidate when it comes to preparation of its filters in seccomp_prepare_filter(). Due to the extra checks and filter rewrite it partially duplicates code and has BPF internals exposed. This patch adds a generic API inside the BPF code that seccomp can use, and thus keeps its filter preparation code minimal and more maintainable. The other side-effect is that now classic JITs can add seccomp support as well by only providing a BPF_LDX | BPF_W | BPF_ABS translation. Tested with seccomp and BPF test suites. Signed-off-by: Daniel Borkmann Cc: Nicolas Schichan Cc: Alexei Starovoitov Cc: Kees Cook Acked-by: Alexei Starovoitov Signed-off-by: David S.
Miller --- include/linux/filter.h | 12 +++++------- kernel/seccomp.c | 42 ++++++++++++----------------------------- net/core/filter.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 39 deletions(-) (limited to 'net/core') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0dcb44bcfc5f..3c03a6085b82 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -374,19 +374,17 @@ static inline void bpf_prog_unlock_free(struct bpf_prog *fp) __bpf_prog_free(fp); } +typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, + unsigned int flen); + int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_detach_filter(struct sock *sk); - -typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, - unsigned int flen); - -struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, - bpf_aux_classic_check_t trans); - int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned int len); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 93d40f7f3683..245df6b32b81 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -346,15 +346,13 @@ static inline void seccomp_sync_threads(void) */ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { - struct seccomp_filter *filter; - struct bpf_prog *prog; - unsigned long fsize; + struct seccomp_filter *sfilter; + int ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); - fsize = bpf_classic_proglen(fprog); /* * Installing a seccomp filter requires that the task has @@ -367,37 +365,21 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return ERR_PTR(-EACCES); - prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); - if (!prog) - return ERR_PTR(-ENOMEM); - - /* Copy the instructions from fprog. */ - if (copy_from_user(prog->insns, fprog->filter, fsize)) { - __bpf_prog_free(prog); - return ERR_PTR(-EFAULT); - } - - prog->len = fprog->len; - - /* bpf_prepare_filter() already takes care of freeing - * memory in case something goes wrong. 
- */ - prog = bpf_prepare_filter(prog, seccomp_check_filter); - if (IS_ERR(prog)) - return ERR_CAST(prog); - /* Allocate a new seccomp_filter */ - filter = kzalloc(sizeof(struct seccomp_filter), - GFP_KERNEL|__GFP_NOWARN); - if (!filter) { - bpf_prog_destroy(prog); + sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); + if (!sfilter) return ERR_PTR(-ENOMEM); + + ret = bpf_prog_create_from_user(&sfilter->prog, fprog, + seccomp_check_filter); + if (ret < 0) { + kfree(sfilter); + return ERR_PTR(ret); } - filter->prog = prog; - atomic_set(&filter->usage, 1); + atomic_set(&sfilter->usage, 1); - return filter; + return sfilter; } /** diff --git a/net/core/filter.c b/net/core/filter.c index 45c015d6b974..a831f193e2c7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -991,8 +991,8 @@ out_err: return ERR_PTR(err); } -struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, - bpf_aux_classic_check_t trans) +static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, + bpf_aux_classic_check_t trans) { int err; @@ -1074,6 +1074,53 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) } EXPORT_SYMBOL_GPL(bpf_prog_create); +/** + * bpf_prog_create_from_user - create an unattached filter from user buffer + * @pfp: the unattached filter that is created + * @fprog: the filter program + * @trans: post-classic verifier transformation handler + * + * This function effectively does the same as bpf_prog_create(), only + * that it builds up its insns buffer from user space provided buffer. + * It also allows for passing a bpf_aux_classic_check_t handler. + */ +int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, + bpf_aux_classic_check_t trans) +{ + unsigned int fsize = bpf_classic_proglen(fprog); + struct bpf_prog *fp; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL) + return -EINVAL; + + fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); + if (!fp) + return -ENOMEM; + + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + __bpf_prog_free(fp); + return -EFAULT; + } + + fp->len = fprog->len; + /* Since unattached filters are not copied back to user + * space through sk_get_filter(), we do not need to hold + * a copy here, and can spare us the work. + */ + fp->orig_prog = NULL; + + /* bpf_prepare_filter() already takes care of freeing + * memory in case something goes wrong. + */ + fp = bpf_prepare_filter(fp, trans); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + *pfp = fp; + return 0; +} + void bpf_prog_destroy(struct bpf_prog *fp) { __bpf_prog_release(fp); -- cgit v1.2.3 From 790ba4566c1a709462550463b222d23b6f352b47 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 6 May 2015 15:52:23 +0000 Subject: tcp: set SOCK_NOSPACE under memory pressure Under tcp memory pressure, calling epoll_wait() in edge triggered mode after -EAGAIN, can result in an indefinite hang in epoll_wait(), even when there is sufficient memory available to continue making progress. The problem is that when __sk_mem_schedule() returns 0 under memory pressure, we do not set the SOCK_NOSPACE flag in the tcp write paths (tcp_sendmsg() or do_tcp_sendpages()). Then, since SOCK_NOSPACE is used to trigger wakeups when incoming acks create sufficient new space in the write queue, all outstanding packets are acked, but we never wake up with the the EPOLLOUT that we are expecting from epoll_wait(). This issue is currently limited to epoll() when used in edge trigger mode, since 'tcp_poll()', does in fact currently set SOCK_NOSPACE. 
This is sufficient for poll()/select() and epoll() in level trigger mode. However, in edge trigger mode, epoll() is relying on the write path to set SOCK_NOSPACE. EPOLL(7) says that in edge-trigger mode we can only call epoll_wait() after read/write return -EAGAIN. Thus, in the case of the socket write, we are relying on the fact that tcp_sendmsg()/network write paths are going to issue a wakeup for us at some point in the future when we get -EAGAIN. Normally, epoll() edge trigger works fine when we've exceeded the sk->sndbuf because in that case we do set SOCK_NOSPACE. However, when we return -EAGAIN from the write path b/c we are over the tcp memory limits and not b/c we are over the sndbuf, we are never going to get another wakeup. I can reproduce this issue more readily using SO_SNDBUF, since __sk_mem_schedule() will then return 0 (failure): 1) create socket and set SO_SNDBUF to N 2) add socket as edge trigger 3) write to socket and block in epoll on -EAGAIN 4) cause tcp mem pressure via: echo "" > net.ipv4.tcp_mem The fix here is simply to set SOCK_NOSPACE in sk_stream_wait_memory() when the socket is non-blocking. Note that SOCK_NOSPACE, in addition to waking up outstanding waiters, is also used to expand the size of the sk->sndbuf. However, we will not expand it by setting it in this case because tcp_should_expand_sndbuf() ensures that no expansion occurs when we are under tcp memory pressure. Note that we could still hang if sk->sk_wmem_queued is 0, when we get the -EAGAIN. In this case the SOCK_NOSPACE bit will not help, since we are waiting for an event that will never happen. I believe that this case is harder to hit (and did not hit in my testing), in that over the tcp 'soft' memory limits, we continue to guarantee a minimum write buffer size. Perhaps we could return -ENOSPC in this case, or maybe we simply issue a wakeup in this case, such that we keep retrying the write. Note that this case is not specific to epoll() ET, but rather would affect blocking sockets as well. So I view this patch as bringing epoll() edge-trigger into sync with the current poll()/select()/epoll() level trigger and blocking sockets behavior. Signed-off-by: Jason Baron Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/stream.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/stream.c b/net/core/stream.c index 301c05f26060..d70f77a0c889 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -119,6 +119,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) int err = 0; long vm_wait = 0; long current_timeo = *timeo_p; + bool noblock = (*timeo_p ? false : true); DEFINE_WAIT(wait); if (sk_stream_memory_free(sk)) @@ -131,8 +132,11 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; - if (!*timeo_p) + if (!*timeo_p) { + if (noblock) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); goto do_nonblock; + } if (signal_pending(current)) goto do_interrupted; clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); -- cgit v1.2.3 From 109582af18b9aade9385ea6609a792f80a7d70ca Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:47 +0200 Subject: netns: returns always an id in __peernet2id() All callers of this function expect a nsid, not an error. Thus, return NETNSA_NSID_NOT_ASSIGNED in case of error, so that callers don't have to convert the error to NETNSA_NSID_NOT_ASSIGNED.
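For illustration, the new contract means a caller can feed the result straight into a netlink attribute without mapping errors itself. A minimal sketch (not part of the diff below), assuming the rtnl_net_fill()-style usage shown in the patch and that NETNSA_NSID_NOT_ASSIGNED is the UAPI "no id assigned" value (-1):

	int id = __peernet2id(net, peer, false);	/* never an errno anymore */

	/* an unassigned peer simply shows up as -1 on the wire */
	if (nla_put_s32(skb, NETNSA_NSID, id))
		goto nla_put_failure;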
Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/net_namespace.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 78fc04ad36fc..294d38742e2a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -192,10 +192,12 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) if (id > 0) return id; - if (alloc) - return alloc_netid(net, peer, -1); + if (alloc) { + id = alloc_netid(net, peer, -1); + return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + } - return -ENOENT; + return NETNSA_NSID_NOT_ASSIGNED; } /* This function returns the id of a peer netns. If no id is assigned, one will @@ -204,10 +206,8 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) int peernet2id(struct net *net, struct net *peer) { bool alloc = atomic_read(&peer->count) == 0 ? false : true; - int id; - id = __peernet2id(net, peer, alloc); - return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; + return __peernet2id(net, peer, alloc); } EXPORT_SYMBOL(peernet2id); @@ -554,13 +554,10 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nsid >= 0) { + if (nsid >= 0) id = nsid; - } else { + else id = __peernet2id(net, peer, false); - if (id < 0) - id = NETNSA_NSID_NOT_ASSIGNED; - } if (nla_put_s32(skb, NETNSA_NSID, id)) goto nla_put_failure; -- cgit v1.2.3 From cab3c8ec8d57ef48ed754ee7acf2b9bdce80fa5f Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:48 +0200 Subject: netns: always provide the id to rtnl_net_fill() The goal of this commit is to prepare for the rework of the locking that protects nsids. After this patch, rtnl_net_notifyid() no longer calls __peernet2id(), i.e. no idr_* operation is performed inside this function. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S.
Miller --- net/core/net_namespace.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 294d38742e2a..37c68bb72db3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -147,8 +147,7 @@ static void ops_free_list(const struct pernet_operations *ops, } } -static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, - int id); +static void rtnl_net_notifyid(struct net *net, int cmd, int id); static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0, id; @@ -162,7 +161,7 @@ static int alloc_netid(struct net *net, struct net *peer, int reqid) id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); if (id >= 0) - rtnl_net_notifyid(net, peer, RTM_NEWNSID, id); + rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; } @@ -365,7 +364,7 @@ static void cleanup_net(struct work_struct *work) int id = __peernet2id(tmp, net, false); if (id >= 0) { - rtnl_net_notifyid(tmp, net, RTM_DELNSID, id); + rtnl_net_notifyid(tmp, RTM_DELNSID, id); idr_remove(&tmp->netns_ids, id); } } @@ -538,14 +537,10 @@ static int rtnl_net_get_size(void) } static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, - int cmd, struct net *net, struct net *peer, - int nsid) + int cmd, struct net *net, int nsid) { struct nlmsghdr *nlh; struct rtgenmsg *rth; - int id; - - ASSERT_RTNL(); nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags); if (!nlh) @@ -554,11 +549,7 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, rth = nlmsg_data(nlh); rth->rtgen_family = AF_UNSPEC; - if (nsid >= 0) - id = nsid; - else - id = __peernet2id(net, peer, false); - if (nla_put_s32(skb, NETNSA_NSID, id)) + if (nla_put_s32(skb, NETNSA_NSID, nsid)) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -575,7 +566,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[NETNSA_MAX + 1]; struct sk_buff *msg; struct net *peer; - int err; + int err, id; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, rtnl_net_policy); @@ -597,8 +588,9 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } + id = __peernet2id(net, peer, false); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, - RTM_GETNSID, net, peer, -1); + RTM_GETNSID, net, id); if (err < 0) goto err_out; @@ -630,7 +622,7 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data) ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid, net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWNSID, net_cb->net, peer, id); + RTM_NEWNSID, net_cb->net, id); if (ret < 0) return ret; @@ -658,8 +650,7 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, - int id) +static void rtnl_net_notifyid(struct net *net, int cmd, int id) { struct sk_buff *msg; int err = -ENOMEM; @@ -668,7 +659,7 @@ static void rtnl_net_notifyid(struct net *net, struct net *peer, int cmd, if (!msg) goto out; - err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, peer, id); + err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id); if (err < 0) goto err_out; -- cgit v1.2.3 From 7a0877d4b438886b72be61632eaa774d13262f70 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:49 +0200 Subject: netns: rename peernet2id() to peernet2id_alloc() In a following commit, a new 
function will be introduced to only look up a nsid (no allocation if the nsid doesn't exist). To avoid confusion, the existing function is renamed. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 +- include/net/net_namespace.h | 2 +- net/core/net_namespace.c | 4 ++-- net/core/rtnetlink.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3517ab0aa803..48341ae49012 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -336,7 +336,7 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (!net_eq(dev_net(vxlan->dev), vxlan->net) && nla_put_s32(skb, NDA_LINK_NETNSID, - peernet2id(dev_net(vxlan->dev), vxlan->net))) + peernet2id_alloc(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f733656404de..6d1e2eae32fb 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -271,7 +271,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #define __net_initconst __initconst #endif -int peernet2id(struct net *net, struct net *peer); +int peernet2id_alloc(struct net *net, struct net *peer); struct net *get_net_ns_by_id(struct net *net, int id); struct pernet_operations { diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 37c68bb72db3..9c806ac569f9 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -202,13 +202,13 @@ static int __peernet2id(struct net *net, struct net *peer, bool alloc) /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ -int peernet2id(struct net *net, struct net *peer) +int peernet2id_alloc(struct net *net, struct net *peer) { bool alloc = atomic_read(&peer->count) == 0 ? false : true; return __peernet2id(net, peer, alloc); } -EXPORT_SYMBOL(peernet2id); +EXPORT_SYMBOL(peernet2id_alloc); struct net *get_net_ns_by_id(struct net *net, int id) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 666e0928ba40..83e08323fdcd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1204,7 +1204,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { - int id = peernet2id(dev_net(dev), link_net); + int id = peernet2id_alloc(dev_net(dev), link_net); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) goto nla_put_failure; -- cgit v1.2.3 From 3138dbf881274cb20d9aa1b307861f689e820fbe Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:50 +0200 Subject: netns: notify new nsid outside __peernet2id() There is no functional change with this patch. It will ease the refactoring of the locking system that protects nsids and the support of the netlink socket option NETLINK_LISTEN_ALL_NSID. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller
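As a condensed sketch, the pattern this change converges on looks like this (names taken from the peernet2id_alloc() hunk below; the helper reports through *alloc whether a new id was actually created, and the caller issues the notification afterwards):

	bool alloc = atomic_read(&peer->count) == 0 ? false : true;
	int id = __peernet2id_alloc(net, peer, &alloc);
	if (alloc && id >= 0)
		rtnl_net_notifyid(net, RTM_NEWNSID, id);	/* notify outside the lookup */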
--- net/core/net_namespace.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9c806ac569f9..ee864241f8d6 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -147,10 +147,9 @@ static void ops_free_list(const struct pernet_operations *ops, } } -static void rtnl_net_notifyid(struct net *net, int cmd, int id); static int alloc_netid(struct net *net, struct net *peer, int reqid) { - int min = 0, max = 0, id; + int min = 0, max = 0; ASSERT_RTNL(); @@ -159,11 +158,7 @@ static int alloc_netid(struct net *net, struct net *peer, int reqid) max = reqid + 1; } - id = idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); - if (id >= 0) - rtnl_net_notifyid(net, RTM_NEWNSID, id); - - return id; + return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); } /* This function is used by idr_for_each(). If net is equal to peer, the @@ -179,34 +174,50 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } -static int __peernet2id(struct net *net, struct net *peer, bool alloc) +static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); + bool alloc_it = *alloc; ASSERT_RTNL(); + *alloc = false; + /* Magic value for id 0. */ if (id == NET_ID_ZERO) return 0; if (id > 0) return id; - if (alloc) { + if (alloc_it) { id = alloc_netid(net, peer, -1); + *alloc = true; return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED; } return NETNSA_NSID_NOT_ASSIGNED; } +static int __peernet2id(struct net *net, struct net *peer) +{ + bool no = false; + + return __peernet2id_alloc(net, peer, &no); +} + +static void rtnl_net_notifyid(struct net *net, int cmd, int id); /* This function returns the id of a peer netns. If no id is assigned, one will * be allocated and returned. */ int peernet2id_alloc(struct net *net, struct net *peer) { bool alloc = atomic_read(&peer->count) == 0 ? false : true; + int id; - return __peernet2id(net, peer, alloc); + id = __peernet2id_alloc(net, peer, &alloc); + if (alloc && id >= 0) + rtnl_net_notifyid(net, RTM_NEWNSID, id); + return id; } EXPORT_SYMBOL(peernet2id_alloc); @@ -361,7 +372,7 @@ static void cleanup_net(struct work_struct *work) list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); for_each_net(tmp) { - int id = __peernet2id(tmp, net, false); + int id = __peernet2id(tmp, net); if (id >= 0) { rtnl_net_notifyid(tmp, RTM_DELNSID, id); @@ -516,14 +527,16 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); - if (__peernet2id(net, peer, false) >= 0) { + if (__peernet2id(net, peer) >= 0) { err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); - if (err > 0) + if (err >= 0) { + rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; + } out: put_net(peer); return err; @@ -588,7 +601,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - id = __peernet2id(net, peer, false); + id = __peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, RTM_GETNSID, net, id); if (err < 0) -- cgit v1.2.3 From 95f38411df055a0ecefe3a3d119d98241087d5ca Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:51 +0200 Subject: netns: use a spin_lock to protect nsid management Before this patch, nsids were protected by the rtnl lock.
The goal of this patch is to be able to find a nsid without needing to hold the rtnl lock. The next patch will introduce a netlink socket option to listen to all netns that have a nsid assigned into the netns where the socket is opened. Thus, it's important to call rtnl_net_notifyid() outside the spinlock, to avoid a recursive lock (nsids are notified via rtnl). This was the main reason for the previous patch. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 57 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ee864241f8d6..ae5008b097de 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -28,6 +28,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; DEFINE_MUTEX(net_mutex); +static DEFINE_SPINLOCK(nsid_lock); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -147,18 +148,17 @@ static void ops_free_list(const struct pernet_operations *ops, } } +/* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { int min = 0, max = 0; - ASSERT_RTNL(); - if (reqid >= 0) { min = reqid; max = reqid + 1; } - return idr_alloc(&net->netns_ids, peer, min, max, GFP_KERNEL); + return idr_alloc(&net->netns_ids, peer, min, max, GFP_ATOMIC); } /* This function is used by idr_for_each(). If net is equal to peer, the @@ -174,13 +174,15 @@ static int net_eq_idr(int id, void *net, void *peer) return 0; } +/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc + * is set to true, thus the caller knows that the new id must be notified via + * rtnl. + */ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) { int id = idr_for_each(&net->netns_ids, net_eq_idr, peer); bool alloc_it = *alloc; - ASSERT_RTNL(); - *alloc = false; /* Magic value for id 0. */ @@ -198,6 +200,7 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc) return NETNSA_NSID_NOT_ASSIGNED; } +/* should be called with nsid_lock held */ static int __peernet2id(struct net *net, struct net *peer) { bool no = false; @@ -211,27 +214,46 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id); */ int peernet2id_alloc(struct net *net, struct net *peer) { - bool alloc = atomic_read(&peer->count) == 0 ? false : true; + unsigned long flags; + bool alloc; int id; + spin_lock_irqsave(&nsid_lock, flags); + alloc = atomic_read(&peer->count) == 0 ? false : true; id = __peernet2id_alloc(net, peer, &alloc); + spin_unlock_irqrestore(&nsid_lock, flags); if (alloc && id >= 0) rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; } EXPORT_SYMBOL(peernet2id_alloc); +/* This function returns, if assigned, the id of a peer netns.
*/ +static int peernet2id(struct net *net, struct net *peer) +{ + unsigned long flags; + int id; + + spin_lock_irqsave(&nsid_lock, flags); + id = __peernet2id(net, peer); + spin_unlock_irqrestore(&nsid_lock, flags); + return id; +} + struct net *get_net_ns_by_id(struct net *net, int id) { + unsigned long flags; struct net *peer; if (id < 0) return NULL; rcu_read_lock(); + spin_lock_irqsave(&nsid_lock, flags); peer = idr_find(&net->netns_ids, id); if (peer) get_net(peer); + spin_unlock_irqrestore(&nsid_lock, flags); rcu_read_unlock(); return peer; @@ -372,14 +394,19 @@ static void cleanup_net(struct work_struct *work) list_del_rcu(&net->list); list_add_tail(&net->exit_list, &net_exit_list); for_each_net(tmp) { - int id = __peernet2id(tmp, net); + int id; - if (id >= 0) { - rtnl_net_notifyid(tmp, RTM_DELNSID, id); + spin_lock_irq(&nsid_lock); + id = __peernet2id(tmp, net); + if (id >= 0) idr_remove(&tmp->netns_ids, id); - } + spin_unlock_irq(&nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id); } + spin_lock_irq(&nsid_lock); idr_destroy(&net->netns_ids); + spin_unlock_irq(&nsid_lock); } rtnl_unlock(); @@ -507,6 +534,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; + unsigned long flags; struct net *peer; int nsid, err; @@ -527,12 +555,14 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); + spin_lock_irqsave(&nsid_lock, flags); if (__peernet2id(net, peer) >= 0) { err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); + spin_unlock_irqrestore(&nsid_lock, flags); if (err >= 0) { + rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; @@ -601,7 +631,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - id = __peernet2id(net, peer); + id = peernet2id(net, peer); err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, RTM_GETNSID, net, id); if (err < 0) @@ -654,10 +684,11 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) { .idx = 0, .s_idx = cb->args[0], }; + unsigned long flags; - ASSERT_RTNL(); - + spin_lock_irqsave(&nsid_lock, flags); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); + spin_unlock_irqrestore(&nsid_lock, flags); cb->args[0] = net_cb.idx; return skb->len; -- cgit v1.2.3 From 59324cf35aba5336b611074028777838a963d03b Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 7 May 2015 11:02:53 +0200 Subject: netlink: allow listening to "all" netns More accurately, listen to all netns that have a nsid assigned into the netns where the netlink socket is opened. For this purpose, a netlink socket option is added: NETLINK_LISTEN_ALL_NSID. When this option is set on a netlink socket, this socket will receive netlink notifications from all netns that have a nsid assigned into the netns where the socket has been opened. The nsid is sent to userland via ancillary data. With this patch, a daemon needs only one socket to listen to many netns. This is useful when the number of netns is high. Because 0 is a valid value for a nsid, the field nsid_is_set indicates whether the field nsid is valid or not. skb->cb is initialized to 0 on skb allocation, thus we are sure that we will never send a nsid of 0 to userland by mistake. Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller
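A minimal userspace sketch of how a daemon might consume this option (illustrative only: it assumes headers that carry NETLINK_LISTEN_ALL_NSID, defines SOL_NETLINK as a fallback, and omits error handling; the setsockopt() requires CAP_NET_BROADCAST):

	#include <stdio.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>
	#include <linux/rtnetlink.h>

	#ifndef SOL_NETLINK
	#define SOL_NETLINK 270
	#endif

	int main(void)
	{
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
		int one = 1;
		char buf[8192], ctrl[CMSG_SPACE(sizeof(int))];
		struct sockaddr_nl sa = { .nl_family = AF_NETLINK,
					  .nl_groups = RTMGRP_LINK };
		struct iovec iov = { buf, sizeof(buf) };
		struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
				      .msg_control = ctrl,
				      .msg_controllen = sizeof(ctrl) };
		struct cmsghdr *cm;

		bind(fd, (struct sockaddr *)&sa, sizeof(sa));
		setsockopt(fd, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID,
			   &one, sizeof(one));
		recvmsg(fd, &msg, 0);
		/* broadcasts carry the originating netns's nsid, relative to
		 * this socket's netns, as ancillary data
		 */
		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
			if (cm->cmsg_level == SOL_NETLINK &&
			    cm->cmsg_type == NETLINK_LISTEN_ALL_NSID)
				printf("nsid %d\n", *(int *)CMSG_DATA(cm));
		return 0;
	}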
--- include/linux/netlink.h | 2 ++ include/net/net_namespace.h | 2 ++ include/uapi/linux/netlink.h | 1 + net/core/net_namespace.c | 10 ++++++++- net/netlink/af_netlink.c | 52 +++++++++++++++++++++++++++++++++++++++----- 5 files changed, 61 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 6835c1279df7..9120edb650a0 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -28,6 +28,8 @@ struct netlink_skb_parms { __u32 dst_group; __u32 flags; struct sock *sk; + bool nsid_is_set; + int nsid; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 6d1e2eae32fb..3f850acc844e 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -272,6 +272,8 @@ static inline struct net *read_pnet(const possible_net_t *pnet) #endif int peernet2id_alloc(struct net *net, struct net *peer); +int peernet2id(struct net *net, struct net *peer); +bool peernet_has_id(struct net *net, struct net *peer); struct net *get_net_ns_by_id(struct net *net, int id); struct pernet_operations { diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 1a85940f8ab7..3e34b7d702f8 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -108,6 +108,7 @@ struct nlmsgerr { #define NETLINK_NO_ENOBUFS 5 #define NETLINK_RX_RING 6 #define NETLINK_TX_RING 7 +#define NETLINK_LISTEN_ALL_NSID 8 struct nl_pktinfo { __u32 group; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index ae5008b097de..a665bf490c88 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -229,7 +229,7 @@ int peernet2id_alloc(struct net *net, struct net *peer) EXPORT_SYMBOL(peernet2id_alloc); /* This function returns, if assigned, the id of a peer netns. */ -static int peernet2id(struct net *net, struct net *peer) +int peernet2id(struct net *net, struct net *peer) { unsigned long flags; int id; @@ -240,6 +240,14 @@ static int peernet2id(struct net *net, struct net *peer) return id; } +/* This function returns true if the peer netns has an id assigned into the + * current netns.
+ */ +bool peernet_has_id(struct net *net, struct net *peer) +{ + return peernet2id(net, peer) >= 0; +} + struct net *get_net_ns_by_id(struct net *net, int id) { unsigned long flags; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index bf7f56d7a9aa..a5fff75accf8 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -83,6 +83,7 @@ struct listeners { #define NETLINK_F_RECV_PKTINFO 0x2 #define NETLINK_F_BROADCAST_SEND_ERROR 0x4 #define NETLINK_F_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_LISTEN_ALL_NSID 0x10 static inline int netlink_is_kernel(struct sock *sk) { @@ -1932,8 +1933,17 @@ static void do_one_broadcast(struct sock *sk, !test_bit(p->group - 1, nlk->groups)) return; - if (!net_eq(sock_net(sk), p->net)) - return; + if (!net_eq(sock_net(sk), p->net)) { + if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) + return; + + if (!peernet_has_id(sock_net(sk), p->net)) + return; + + if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, + CAP_NET_BROADCAST)) + return; + } if (p->failure) { netlink_overrun(sk); @@ -1959,13 +1969,22 @@ static void do_one_broadcast(struct sock *sk, p->failure = 1; if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; - } else if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + goto out; + } + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if (sk_filter(sk, p->skb2)) { + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; - } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + goto out; + } + NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); + NETLINK_CB(p->skb2).nsid_is_set = true; + val = netlink_broadcast_deliver(sk, p->skb2); + if (val < 0) { netlink_overrun(sk); if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; @@ -1974,6 +1993,7 @@ static void do_one_broadcast(struct sock *sk, p->delivered = 1; p->skb2 = NULL; } +out: sock_put(sk); } @@ -2202,6 +2222,16 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, break; } #endif /* CONFIG_NETLINK_MMAP */ + case NETLINK_LISTEN_ALL_NSID: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) + return -EPERM; + + if (val) + nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; + else + nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2268,6 +2298,16 @@ static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } +static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, + struct sk_buff *skb) +{ + if (!NETLINK_CB(skb).nsid_is_set) + return; + + put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), + &NETLINK_CB(skb).nsid); +} + static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; @@ -2421,6 +2461,8 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); + if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); -- cgit v1.2.3 From f1f00d8ff60ca056db3805406507483eeb3794d7 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 7 May 2015 16:34:51 +0200 Subject: pktgen: adjust flag NO_TIMESTAMP to be more pktgen compliant Allow flag NO_TIMESTAMP to turn timestamping on again, like 
other flags, with a negation of the flag like !NO_TIMESTAMP. Also document the option flag NO_TIMESTAMP. Fixes: afb84b626184 ("pktgen: add flag NO_TIMESTAMP to disable timestamping") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- Documentation/networking/pktgen.txt | 2 ++ net/core/pktgen.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'net/core') diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt index 0344f1d45b37..6199ee6a040b 100644 --- a/Documentation/networking/pktgen.txt +++ b/Documentation/networking/pktgen.txt @@ -145,6 +145,7 @@ Examples: UDPCSUM, IPSEC # IPsec encapsulation (needs CONFIG_XFRM) NODE_ALLOC # node specific memory allocation + NO_TIMESTAMP # disable timestamping pgset spi SPI_VALUE Set specific SA used to transform packet. @@ -287,6 +288,7 @@ flag UDPCSUM IPSEC NODE_ALLOC + NO_TIMESTAMP dst_min dst_max diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 508155b283dd..43bb21507373 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1267,6 +1267,9 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, "NO_TIMESTAMP") == 0) pkt_dev->flags |= F_NO_TIMESTAMP; + else if (strcmp(f, "!NO_TIMESTAMP") == 0) + pkt_dev->flags &= ~F_NO_TIMESTAMP; + else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", -- cgit v1.2.3 From 62f64aed622b6055b5fc447e3e421c9351563fc8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 7 May 2015 16:35:32 +0200 Subject: pktgen: introduce xmit_mode '<start_xmit|netif_receive>' Introduce xmit_mode 'netif_receive' for pktgen which generates the packets using familiar pktgen commands, but feeds them into netif_receive_skb() instead of ndo_start_xmit(). The default mode is called 'start_xmit'. It is designed to test netif_receive_skb and ingress qdisc performance only. Make sure to understand how it works before using it for other rx benchmarking. Sample script 'pktgen.sh':
#!/bin/bash
function pgset() {
  local result
  echo $1 > $PGDEV
  result=`cat $PGDEV | fgrep "Result: OK:"`
  if [ "$result" = "" ]; then
    cat $PGDEV | fgrep Result:
  fi
}
[ -z "$1" ] && echo "Usage: $0 DEV" && exit 1
ETH=$1
PGDEV=/proc/net/pktgen/kpktgend_0
pgset "rem_device_all"
pgset "add_device $ETH"
PGDEV=/proc/net/pktgen/$ETH
pgset "xmit_mode netif_receive"
pgset "pkt_size 60"
pgset "dst 198.18.0.1"
pgset "dst_mac 90:e2:ba:ff:ff:ff"
pgset "count 10000000"
pgset "burst 32"
PGDEV=/proc/net/pktgen/pgctrl
echo "Running... ctrl^C to stop"
pgset "start"
echo "Done"
cat /proc/net/pktgen/$ETH
Usage: $ sudo ./pktgen.sh eth2 ... Result: OK: 232376(c232372+d3) usec, 10000000 (60byte,0frags) 43033682pps 20656Mb/sec (20656167360bps) errors: 10000000 Raw netif_receive_skb speed should be ~43 million packets per second on 3.7GHz x86, and 'perf report' should look like:
37.69% kpktgend_0 [kernel.vmlinux] [k] __netif_receive_skb_core
25.81% kpktgend_0 [kernel.vmlinux] [k] kfree_skb
 7.22% kpktgend_0 [kernel.vmlinux] [k] ip_rcv
 5.68% kpktgend_0 [pktgen] [k] pktgen_thread_worker
If fib_table_lookup is seen on top, it means the skb was processed by the stack. To benchmark netif_receive_skb only, make sure that the 'dst_mac' of your pktgen script is different from the receiving device's mac, so the packet will be dropped by ip_rcv. Signed-off-by: Alexei Starovoitov Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- Documentation/networking/pktgen.txt | 7 ++++ net/core/pktgen.c | 82 ++++++++++++++++++++++++++++++++++--- 2 files changed, 84 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt index 6199ee6a040b..747faccc4120 100644 --- a/Documentation/networking/pktgen.txt +++ b/Documentation/networking/pktgen.txt @@ -193,6 +193,10 @@ Examples: pgset "rate 300M" set rate to 300 Mb/s pgset "ratep 1000000" set rate to 1Mpps + pgset "xmit_mode netif_receive" RX inject into stack netif_receive_skb() + Works with "burst" but not with "clone_skb". + Default xmit_mode is "start_xmit". + Sample scripts ============== @@ -310,6 +314,9 @@ flowlen rate ratep +xmit_mode + + References: ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/ ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/ diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 43bb21507373..8f2687da058e 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -210,6 +210,10 @@ #define T_REMDEVALL (1<<2) /* Remove all devs */ #define T_REMDEV (1<<3) /* Remove one dev */ +/* Xmit modes */ +#define M_START_XMIT 0 /* Default normal TX */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ + /* If lock -- protects updating of if_list */ #define if_lock(t) spin_lock(&(t->if_lock)); #define if_unlock(t) spin_unlock(&(t->if_lock)); @@ -251,13 +255,14 @@ struct pktgen_dev { * we will do a random selection from within the range. */ __u32 flags; - int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ - + int xmit_mode; int min_pkt_size; int max_pkt_size; int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; + int removal_mark; /* non-zero => the device is marked for + * removal by worker thread */ + struct page *page; u64 delay; /* nano-seconds */ @@ -620,6 +625,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev->node >= 0) seq_printf(seq, " node: %d\n", pkt_dev->node); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) + seq_puts(seq, " xmit_mode: netif_receive\n"); + seq_puts(seq, " Flags: "); if (pkt_dev->flags & F_IPV6) @@ -1081,7 +1089,8 @@ static ssize_t pktgen_if_write(struct file *file, if (len < 0) return len; if ((value > 0) && - (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) + ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || + !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; i += len; pkt_dev->clone_skb = value; @@ -1134,7 +1143,7 @@ static ssize_t pktgen_if_write(struct file *file, return len; i += len; - if ((value > 1) && + if ((value > 1) && (pkt_dev->xmit_mode == M_START_XMIT) && (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; pkt_dev->burst = value < 1 ? 
1 : value; @@ -1160,6 +1169,35 @@ static ssize_t pktgen_if_write(struct file *file, sprintf(pg_result, "ERROR: node not possible"); return count; } + if (!strcmp(name, "xmit_mode")) { + char f[32]; + + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) + return len; + + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + + if (strcmp(f, "start_xmit") == 0) { + pkt_dev->xmit_mode = M_START_XMIT; + } else if (strcmp(f, "netif_receive") == 0) { + /* clone_skb set earlier, not supported in this mode */ + if (pkt_dev->clone_skb > 0) + return -ENOTSUPP; + + pkt_dev->xmit_mode = M_NETIF_RECEIVE; + } else { + sprintf(pg_result, + "xmit_mode -:%s:- unknown\nAvailable modes: %s", + f, "start_xmit, netif_receive\n"); + return count; + } + sprintf(pg_result, "OK: xmit_mode=%s", f); + return count; + } if (!strcmp(name, "flag")) { char f[32]; memset(f, 0, 32); @@ -3320,6 +3358,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) unsigned int burst = ACCESS_ONCE(pkt_dev->burst); struct net_device *odev = pkt_dev->odev; struct netdev_queue *txq; + struct sk_buff *skb; int ret; /* If device is offline, then don't send */ @@ -3357,6 +3396,38 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) if (pkt_dev->delay && pkt_dev->last_ok) spin(pkt_dev, pkt_dev->next_tx); + if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) { + skb = pkt_dev->skb; + skb->protocol = eth_type_trans(skb, skb->dev); + atomic_add(burst, &skb->users); + local_bh_disable(); + do { + ret = netif_receive_skb(skb); + if (ret == NET_RX_DROP) + pkt_dev->errors++; + pkt_dev->sofar++; + pkt_dev->seq_num++; + if (atomic_read(&skb->users) != burst) { + /* skb was queued by rps/rfs or taps, + * so cannot reuse this skb + */ + atomic_sub(burst - 1, &skb->users); + /* get out of the loop and wait + * until skb is consumed + */ + pkt_dev->last_ok = 1; + break; + } + /* skb was 'freed' by stack, so clean few + * bits and reuse it + */ +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = 0; /* reset reclass/redir ttl */ +#endif + } while (--burst > 0); + goto out; /* Skips xmit_mode M_START_XMIT */ + } + txq = skb_get_tx_queue(odev, pkt_dev->skb); local_bh_disable(); @@ -3404,6 +3475,7 @@ xmit_more: unlock: HARD_TX_UNLOCK(odev, txq); +out: local_bh_enable(); /* If pkt_dev->count is zero, then run forever */ -- cgit v1.2.3 From 11aa9c28b4209242a9de0a661a7b3405adb568a0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:09:13 -0500 Subject: net: Pass kern from net_proto_family.create to sk_alloc In preparation for changing how struct net is refcounted on kernel sockets, pass the knowledge that we are creating a kernel socket from sock_create_kern through to sk_alloc. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller
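As a sketch of the resulting pattern (a hypothetical PF_FOO family, not part of the patch; every net_proto_family.create handler now receives kern and threads it through to sk_alloc()):

	static struct proto foo_proto;	/* assumed registered elsewhere */

	static int foo_create(struct net *net, struct socket *sock, int protocol,
			      int kern)
	{
		struct sock *sk;

		/* kern is 1 when the socket comes from sock_create_kern() */
		sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
		if (!sk)
			return -ENOMEM;
		sock_init_data(sock, sk);
		return 0;
	}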
--- crypto/af_alg.c | 4 ++-- drivers/isdn/mISDN/socket.c | 12 ++++++------ drivers/net/macvtap.c | 2 +- drivers/net/ppp/pppoe.c | 4 ++-- drivers/net/ppp/pppox.c | 2 +- drivers/net/ppp/pptp.c | 4 ++-- drivers/net/tun.c | 2 +- include/linux/if_pppox.h | 2 +- include/net/af_vsock.h | 2 +- include/net/llc_conn.h | 2 +- include/net/sock.h | 2 +- net/appletalk/ddp.c | 2 +- net/atm/common.c | 4 ++-- net/atm/common.h | 2 +- net/atm/pvc.c | 2 +- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 4 ++-- net/bluetooth/bnep/sock.c | 2 +- net/bluetooth/cmtp/sock.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/hidp/sock.c | 2 +- net/bluetooth/l2cap_sock.c | 10 +++++----- net/bluetooth/rfcomm/sock.c | 8 ++++---- net/bluetooth/sco.c | 8 ++++---- net/caif/caif_socket.c | 2 +- net/can/af_can.c | 2 +- net/core/sock.c | 3 ++- net/decnet/af_decnet.c | 8 ++++---- net/ieee802154/socket.c | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv6/af_inet6.c | 2 +- net/ipx/af_ipx.c | 2 +- net/irda/af_irda.c | 2 +- net/iucv/af_iucv.c | 10 +++++----- net/key/af_key.c | 2 +- net/l2tp/l2tp_ppp.c | 4 ++-- net/llc/af_llc.c | 2 +- net/llc/llc_conn.c | 6 +++--- net/netlink/af_netlink.c | 11 +++++------ net/netrom/af_netrom.c | 4 ++-- net/nfc/af_nfc.c | 2 +- net/nfc/llcp.h | 2 +- net/nfc/llcp_core.c | 2 +- net/nfc/llcp_sock.c | 8 ++++---- net/nfc/nfc.h | 2 +- net/nfc/rawsock.c | 4 ++-- net/packet/af_packet.c | 2 +- net/phonet/af_phonet.c | 2 +- net/phonet/pep.c | 2 +- net/rds/af_rds.c | 2 +- net/rose/af_rose.c | 4 ++-- net/rxrpc/af_rxrpc.c | 2 +- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- net/tipc/socket.c | 2 +- net/unix/af_unix.c | 8 ++++---- net/vmw_vsock/af_vsock.c | 7 ++++--- net/vmw_vsock/vmci_transport.c | 2 +- net/x25/af_x25.c | 8 ++++---- 59 files changed, 109 insertions(+), 108 deletions(-) (limited to 'net/core') diff --git a/crypto/af_alg.c b/crypto/af_alg.c index f22cc56fd1b3..5ad0d5354535 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -244,7 +244,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock) if (!type) goto unlock; - sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto); + sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, 0); err = -ENOMEM; if (!sk2) goto unlock; @@ -324,7 +324,7 @@ static int alg_create(struct net *net, struct socket *sock, int protocol, return -EPROTONOSUPPORT; err = -ENOMEM; - sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto); + sk = sk_alloc(net, PF_ALG, GFP_KERNEL, &alg_proto, kern); if (!sk) goto out; diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 8dc7290089bb..0d29b5a6356d 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -601,14 +601,14 @@ static const struct proto_ops data_sock_ops = { }; static int -data_sock_create(struct net *net, struct socket *sock, int protocol) +data_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto); + sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; @@ -756,14 +756,14 @@ static const struct proto_ops base_sock_ops = { static int -base_sock_create(struct net *net, struct socket *sock, int protocol) +base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto); + sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk)
return -ENOMEM; @@ -785,7 +785,7 @@ mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) switch (proto) { case ISDN_P_BASE: - err = base_sock_create(net, sock, proto); + err = base_sock_create(net, sock, proto, kern); break; case ISDN_P_TE_S0: case ISDN_P_NT_S0: @@ -799,7 +799,7 @@ mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: - err = data_sock_create(net, sock, proto); + err = data_sock_create(net, sock, proto, kern); break; default: return err; diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 8c350c5d54ad..0398631a3c24 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -476,7 +476,7 @@ static int macvtap_open(struct inode *inode, struct file *file) err = -ENOMEM; q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, - &macvtap_proto); + &macvtap_proto, 0); if (!q) goto out; diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index aa1dd926623a..f86c5ab334aa 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -546,11 +546,11 @@ static struct proto pppoe_sk_proto __read_mostly = { * Initialize a new struct sock. * **********************************************************************/ -static int pppoe_create(struct net *net, struct socket *sock) +static int pppoe_create(struct net *net, struct socket *sock, int kern) { struct sock *sk; - sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto); + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto, kern); if (!sk) return -ENOMEM; diff --git a/drivers/net/ppp/pppox.c b/drivers/net/ppp/pppox.c index 2940e9fe351b..0e1b30622477 100644 --- a/drivers/net/ppp/pppox.c +++ b/drivers/net/ppp/pppox.c @@ -118,7 +118,7 @@ static int pppox_create(struct net *net, struct socket *sock, int protocol, !try_module_get(pppox_protos[protocol]->owner)) goto out; - rc = pppox_protos[protocol]->create(net, sock); + rc = pppox_protos[protocol]->create(net, sock, kern); module_put(pppox_protos[protocol]->owner); out: diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c index e3bfbd4d0136..14839bc0aaf5 100644 --- a/drivers/net/ppp/pptp.c +++ b/drivers/net/ppp/pptp.c @@ -561,14 +561,14 @@ static void pptp_sock_destruct(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); } -static int pptp_create(struct net *net, struct socket *sock) +static int pptp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; struct pppox_sock *po; struct pptp_opt *opt; - sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto); + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto, kern); if (!sk) goto out; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3262f3e2b8b2..1a1c4f7b3ec5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2148,7 +2148,7 @@ static int tun_chr_open(struct inode *inode, struct file * file) DBG1(KERN_INFO, "tunX: tun_chr_open\n"); tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, - &tun_proto); + &tun_proto, 0); if (!tfile) return -ENOMEM; RCU_INIT_POINTER(tfile->tun, NULL); diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 66a7d7600f43..b49cf923becc 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -74,7 +74,7 @@ static inline struct sock *sk_pppox(struct pppox_sock *po) struct module; struct pppox_proto { - int (*create)(struct net *net, struct socket *sock); + int (*create)(struct net *net, struct socket *sock, int kern); int 
(*ioctl)(struct socket *sock, unsigned int cmd, unsigned long arg); struct module *owner; diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 172632dd9930..db639a4c5ab8 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -74,7 +74,7 @@ void vsock_pending_work(struct work_struct *work); struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, - gfp_t priority, unsigned short type); + gfp_t priority, unsigned short type, int kern); /**** TRANSPORT ****/ diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index 0134681acc4c..fe994d2e5286 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -96,7 +96,7 @@ static __inline__ char llc_backlog_type(struct sk_buff *skb) } struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot); + struct proto *prot, int kern); void llc_sk_free(struct sock *sk); void llc_sk_reset(struct sock *sk); diff --git a/include/net/sock.h b/include/net/sock.h index 3a4898ec8c67..d8dcf91732b0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1514,7 +1514,7 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow) struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot); + struct proto *prot, int kern); void sk_free(struct sock *sk); void sk_release_kernel(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 3b7ad43c7dad..d5871ac493eb 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1030,7 +1030,7 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto); + sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern); if (!sk) goto out; rc = 0; diff --git a/net/atm/common.c b/net/atm/common.c index ed0466637e13..49a872db7e42 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -141,7 +141,7 @@ static struct proto vcc_proto = { .release_cb = vcc_release_cb, }; -int vcc_create(struct net *net, struct socket *sock, int protocol, int family) +int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern) { struct sock *sk; struct atm_vcc *vcc; @@ -149,7 +149,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family) sock->sk = NULL; if (sock->type == SOCK_STREAM) return -EINVAL; - sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto); + sk = sk_alloc(net, family, GFP_KERNEL, &vcc_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); diff --git a/net/atm/common.h b/net/atm/common.h index 4d6f5b2068ac..959436b87182 100644 --- a/net/atm/common.h +++ b/net/atm/common.h @@ -10,7 +10,7 @@ #include /* for poll_table */ -int vcc_create(struct net *net, struct socket *sock, int protocol, int family); +int vcc_create(struct net *net, struct socket *sock, int protocol, int family, int kern); int vcc_release(struct socket *sock); int vcc_connect(struct socket *sock, int itf, short vpi, int vci); int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index ae0324021407..040207ec399f 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -136,7 +136,7 @@ static int pvc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &pvc_proto_ops; - return vcc_create(net, sock, protocol, PF_ATMPVC); + return 
vcc_create(net, sock, protocol, PF_ATMPVC, kern); } static const struct net_proto_family pvc_family_ops = { diff --git a/net/atm/svc.c b/net/atm/svc.c index 1ba23f5018e7..3fa0a9ee98d1 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -660,7 +660,7 @@ static int svc_create(struct net *net, struct socket *sock, int protocol, return -EAFNOSUPPORT; sock->ops = &svc_proto_ops; - error = vcc_create(net, sock, protocol, AF_ATMSVC); + error = vcc_create(net, sock, protocol, AF_ATMSVC, kern); if (error) return error; ATM_SD(sock)->local.sas_family = AF_ATMSVC; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 330c1f4a5a0b..4273533d22b1 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -855,7 +855,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto); + sk = sk_alloc(net, PF_AX25, GFP_ATOMIC, &ax25_proto, kern); if (sk == NULL) return -ENOMEM; @@ -881,7 +881,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) struct sock *sk; ax25_cb *ax25, *oax25; - sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_AX25, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index bde2bdd9e929..b5116fa9835e 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -202,7 +202,7 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index d82787d417bd..ce86a7bae844 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -205,7 +205,7 @@ static int cmtp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &cmtp_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 56f9edbf3d05..5b14dcafcd08 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1377,7 +1377,7 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &hci_sock_ops; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index cb3fdde1968a..008ba439bd62 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -235,7 +235,7 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto); + sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a7278f05eafb..244287706f91 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -43,7 +43,7 @@ static struct bt_sock_list l2cap_sk_list = { static const struct proto_ops l2cap_sock_ops; static void l2cap_sock_init(struct sock *sk, struct sock *parent); static struct sock *l2cap_sock_alloc(struct net 
*net, struct socket *sock, - int proto, gfp_t prio); + int proto, gfp_t prio, int kern); bool l2cap_is_socket(struct socket *sock) { @@ -1193,7 +1193,7 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan) } sk = l2cap_sock_alloc(sock_net(parent), NULL, BTPROTO_L2CAP, - GFP_ATOMIC); + GFP_ATOMIC, 0); if (!sk) { release_sock(parent); return NULL; @@ -1523,12 +1523,12 @@ static struct proto l2cap_proto = { }; static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, - int proto, gfp_t prio) + int proto, gfp_t prio, int kern) { struct sock *sk; struct l2cap_chan *chan; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern); if (!sk) return NULL; @@ -1574,7 +1574,7 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &l2cap_sock_ops; - sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = l2cap_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 825e8fb5114b..b2338e971b33 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -269,12 +269,12 @@ static struct proto rfcomm_proto = { .obj_size = sizeof(struct rfcomm_pinfo) }; -static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct rfcomm_dlc *d; struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern); if (!sk) return NULL; @@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, sock->ops = &rfcomm_sock_ops; - sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = rfcomm_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -969,7 +969,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * goto done; } - sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC); + sk = rfcomm_sock_alloc(sock_net(parent), NULL, BTPROTO_RFCOMM, GFP_ATOMIC, 0); if (!sk) goto done; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 4322c833e748..6b6e59dc54cf 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -460,11 +460,11 @@ static struct proto sco_proto = { .obj_size = sizeof(struct sco_pinfo) }; -static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio) +static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; - sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto); + sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern); if (!sk) return NULL; @@ -501,7 +501,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol, sock->ops = &sco_sock_ops; - sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC); + sk = sco_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; @@ -1026,7 +1026,7 @@ static void sco_conn_ready(struct sco_conn *conn) bh_lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, - BTPROTO_SCO, GFP_ATOMIC); + BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { bh_unlock_sock(parent); sco_conn_unlock(conn); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 4ec0c803aef1..78a04ebb113c 100644 --- a/net/caif/caif_socket.c +++ 
b/net/caif/caif_socket.c @@ -1047,7 +1047,7 @@ static int caif_create(struct net *net, struct socket *sock, int protocol, * is really not used at all in the net/core or socket.c but the * initialization makes sure that sock->state is not uninitialized. */ - sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot); + sk = sk_alloc(net, PF_CAIF, GFP_KERNEL, &prot, kern); if (!sk) return -ENOMEM; diff --git a/net/can/af_can.c b/net/can/af_can.c index 32d710eaf1fc..d4d404bdfc9a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -179,7 +179,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol, sock->ops = cp->ops; - sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot); + sk = sk_alloc(net, PF_CAN, GFP_KERNEL, cp->prot, kern); if (!sk) { err = -ENOMEM; goto errout; diff --git a/net/core/sock.c b/net/core/sock.c index e891bcf325ca..cbc3789b830c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1396,9 +1396,10 @@ EXPORT_SYMBOL_GPL(sock_update_netprioidx); * @family: protocol family * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) * @prot: struct proto associated with this new sock instance + * @kern: is this to be a kernel socket? */ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot) + struct proto *prot, int kern) { struct sock *sk; diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 754484b3cd0e..675cf94e04f8 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -468,10 +468,10 @@ static struct proto dn_proto = { .obj_size = sizeof(struct dn_sock), }; -static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp) +static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern) { struct dn_scp *scp; - struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto); + struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern); if (!sk) goto out; @@ -693,7 +693,7 @@ static int dn_create(struct net *net, struct socket *sock, int protocol, } - if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL)) == NULL) + if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL) return -ENOBUFS; sk->sk_protocol = protocol; @@ -1096,7 +1096,7 @@ static int dn_accept(struct socket *sock, struct socket *newsock, int flags) cb = DN_SKB_CB(skb); sk->sk_ack_backlog--; - newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation); + newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, 0); if (newsk == NULL) { release_sock(sk); kfree_skb(skb); diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index b60c65f70346..7aaaf967df58 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1014,7 +1014,7 @@ static int ieee802154_create(struct net *net, struct socket *sock, } rc = -ENOMEM; - sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto); + sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern); if (!sk) goto out; rc = 0; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 09f4d024dfe5..e2dd9cb99d61 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -317,7 +317,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); + sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 4632afa57e05..f3866c0b6cfe 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -167,7 +167,7 @@ lookup_protocol: WARN_ON(!answer_prot->slab); err = -ENOBUFS; - sk = sk_alloc(net, PF_INET6, 
GFP_KERNEL, answer_prot); + sk = sk_alloc(net, PF_INET6, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 4ea5d7497b5f..48d0dc89b58d 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1347,7 +1347,7 @@ static int ipx_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOMEM; - sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto); + sk = sk_alloc(net, PF_IPX, GFP_KERNEL, &ipx_proto, kern); if (!sk) goto out; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index ee0ea25c8e7a..fae6822cc367 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1100,7 +1100,7 @@ static int irda_create(struct net *net, struct socket *sock, int protocol, } /* Allocate networking socket */ - sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto); + sk = sk_alloc(net, PF_IRDA, GFP_KERNEL, &irda_proto, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 6daa52a18d40..918151c11348 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -535,12 +535,12 @@ static void iucv_sock_init(struct sock *sk, struct sock *parent) sk->sk_type = parent->sk_type; } -static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio) +static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; struct iucv_sock *iucv; - sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto); + sk = sk_alloc(&init_net, PF_IUCV, prio, &iucv_proto, kern); if (!sk) return NULL; iucv = iucv_sk(sk); @@ -602,7 +602,7 @@ static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL); + sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); if (!sk) return -ENOMEM; @@ -1723,7 +1723,7 @@ static int iucv_callback_connreq(struct iucv_path *path, } /* Create the new socket */ - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); if (!nsk) { err = pr_iucv->path_sever(path, user_data); iucv_path_free(path); @@ -1933,7 +1933,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb) goto out; } - nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC); + nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0); bh_lock_sock(sk); if ((sk->sk_state != IUCV_LISTEN) || sk_acceptq_is_full(sk) || diff --git a/net/key/af_key.c b/net/key/af_key.c index f0d52d721b3a..9e834ec475a9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -149,7 +149,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, return -EPROTONOSUPPORT; err = -ENOMEM; - sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto); + sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) goto out; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index e9b0dec56b8e..f56c9f69e9f2 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -542,12 +542,12 @@ static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb) /* socket() handler. Initialize a new struct sock. 
*/ -static int pppol2tp_create(struct net *net, struct socket *sock) +static int pppol2tp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; - sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto); + sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern); if (!sk) goto out; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 17a8dff06090..8fd9febaa5ba 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -168,7 +168,7 @@ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { rc = -ENOMEM; - sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto); + sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 81a61fce3afb..3e821daf9dd4 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -768,7 +768,7 @@ static struct sock *llc_create_incoming_sock(struct sock *sk, struct llc_addr *daddr) { struct sock *newsk = llc_sk_alloc(sock_net(sk), sk->sk_family, GFP_ATOMIC, - sk->sk_prot); + sk->sk_prot, 0); struct llc_sock *newllc, *llc = llc_sk(sk); if (!newsk) @@ -931,9 +931,9 @@ static void llc_sk_init(struct sock *sk) * Allocates a LLC sock and initializes it. Returns the new LLC sock * or %NULL if there's no memory available for one */ -struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot) +struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern) { - struct sock *sk = sk_alloc(net, family, priority, prot); + struct sock *sk = sk_alloc(net, family, priority, prot, kern); if (!sk) goto out; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a5fff75accf8..be6665ab7f40 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1119,14 +1119,15 @@ static struct proto netlink_proto = { }; static int __netlink_create(struct net *net, struct socket *sock, - struct mutex *cb_mutex, int protocol) + struct mutex *cb_mutex, int protocol, + int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; - sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto); + sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; @@ -1188,7 +1189,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, if (err < 0) goto out; - err = __netlink_create(net, sock, cb_mutex, protocol); + err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; @@ -2515,14 +2516,12 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; - /* * We have to just have a reference on the net from sk, but don't * get_net it. Besides, we cannot get and then put the net here. * So we create one inside init_net and the move it to net. 
*/ - - if (__netlink_create(&init_net, sock, cb_mutex, unit) < 0) + if (__netlink_create(&init_net, sock, cb_mutex, unit, 0) < 0) goto out_sock_release_nosk; sk = sock->sk; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index b987fd56c3c5..ed212ffc1d9d 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -433,7 +433,7 @@ static int nr_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto); + sk = sk_alloc(net, PF_NETROM, GFP_ATOMIC, &nr_proto, kern); if (sk == NULL) return -ENOMEM; @@ -476,7 +476,7 @@ static struct sock *nr_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot); + sk = sk_alloc(sock_net(osk), PF_NETROM, GFP_ATOMIC, osk->sk_prot, 0); if (sk == NULL) return NULL; diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 2277276f52bc..54e40fa47822 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -40,7 +40,7 @@ static int nfc_sock_create(struct net *net, struct socket *sock, int proto, read_lock(&proto_tab_lock); if (proto_tab[proto] && try_module_get(proto_tab[proto]->owner)) { - rc = proto_tab[proto]->create(net, sock, proto_tab[proto]); + rc = proto_tab[proto]->create(net, sock, proto_tab[proto], kern); module_put(proto_tab[proto]->owner); } read_unlock(&proto_tab_lock); diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index de1789e3cc82..1f68724d44d3 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -225,7 +225,7 @@ void nfc_llcp_send_to_raw_sock(struct nfc_llcp_local *local, struct sk_buff *skb, u8 direction); /* Sock API */ -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp); +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern); void nfc_llcp_sock_free(struct nfc_llcp_sock *sock); void nfc_llcp_accept_unlink(struct sock *sk); void nfc_llcp_accept_enqueue(struct sock *parent, struct sock *sk); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index b18f07ccb504..98876274a1ee 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -934,7 +934,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, sock->ssap = ssap; } - new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC); + new_sk = nfc_llcp_sock_alloc(NULL, parent->sk_type, GFP_ATOMIC, 0); if (new_sk == NULL) { reason = LLCP_DM_REJ; release_sock(&sock->sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 9578bd6a4f3e..b7de0da46acd 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -942,12 +942,12 @@ static void llcp_sock_destruct(struct sock *sk) } } -struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp) +struct sock *nfc_llcp_sock_alloc(struct socket *sock, int type, gfp_t gfp, int kern) { struct sock *sk; struct nfc_llcp_sock *llcp_sock; - sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto); + sk = sk_alloc(&init_net, PF_NFC, gfp, &llcp_sock_proto, kern); if (!sk) return NULL; @@ -993,7 +993,7 @@ void nfc_llcp_sock_free(struct nfc_llcp_sock *sock) } static int llcp_sock_create(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -1009,7 +1009,7 @@ static int llcp_sock_create(struct net *net, struct socket *sock, else sock->ops = &llcp_sock_ops; - sk = nfc_llcp_sock_alloc(sock, sock->type, GFP_ATOMIC); + sk = nfc_llcp_sock_alloc(sock, 
sock->type, GFP_ATOMIC, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index a8ce80b47720..5c93e8412a26 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -30,7 +30,7 @@ struct nfc_protocol { struct proto *proto; struct module *owner; int (*create)(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto); + const struct nfc_protocol *nfc_proto, int kern); }; struct nfc_rawsock { diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 82b4e8024778..e9a91488fe3d 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -334,7 +334,7 @@ static void rawsock_destruct(struct sock *sk) } static int rawsock_create(struct net *net, struct socket *sock, - const struct nfc_protocol *nfc_proto) + const struct nfc_protocol *nfc_proto, int kern) { struct sock *sk; @@ -348,7 +348,7 @@ static int rawsock_create(struct net *net, struct socket *sock, else sock->ops = &rawsock_ops; - sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto); + sk = sk_alloc(net, PF_NFC, GFP_ATOMIC, nfc_proto->proto, kern); if (!sk) return -ENOMEM; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5102c3cc4eec..94713276a1d9 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2832,7 +2832,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, sock->state = SS_UNCONNECTED; err = -ENOBUFS; - sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto); + sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 32ab87d34828..10d42f3220ab 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -97,7 +97,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, goto out; } - sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot); + sk = sk_alloc(net, PF_PHONET, GFP_KERNEL, pnp->prot, kern); if (sk == NULL) { err = -ENOMEM; goto out; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 6de2aeb98a1f..850a86cde0b3 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -845,7 +845,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) } /* Create a new to-be-accepted sock */ - newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) { pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL); err = -ENOBUFS; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 10443377fb9d..3d83641f2861 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -440,7 +440,7 @@ static int rds_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto); + sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 8ae603069a1a..36dbc2da3661 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -520,7 +520,7 @@ static int rose_create(struct net *net, struct socket *sock, int protocol, if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; - sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto); + sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern); if (sk == NULL) return -ENOMEM; @@ -559,7 +559,7 @@ static struct sock *rose_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) return NULL; - sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, 
&rose_proto); + sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0); if (sk == NULL) return NULL; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 0095b9a0b779..25d60ed15284 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -632,7 +632,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, sock->ops = &rxrpc_rpc_ops; sock->state = SS_UNCONNECTED; - sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto); + sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern); if (!sk) return -ENOMEM; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0e4198ee2370..e703ff7fed40 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -635,7 +635,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; - newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot); + newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0); if (!newsk) goto out; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 53b7acde9aa3..59e80356672b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -550,7 +550,7 @@ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, struct sctp_association *asoc) { struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, - sk->sk_prot); + sk->sk_prot, 0); struct inet_sock *newinet; if (!newsk) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 9074b5cede38..8f3c8e2cef8e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -342,7 +342,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, } /* Allocate socket's protocol area */ - sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto); + sk = sk_alloc(net, AF_TIPC, GFP_KERNEL, &tipc_proto, kern); if (sk == NULL) return -ENOMEM; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5266ea7b922b..941b3d26e3bf 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -620,7 +620,7 @@ static struct proto unix_proto = { */ static struct lock_class_key af_unix_sk_receive_queue_lock_key; -static struct sock *unix_create1(struct net *net, struct socket *sock) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) { struct sock *sk = NULL; struct unix_sock *u; @@ -629,7 +629,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); if (!sk) goto out; @@ -688,7 +688,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock) ? 0 : -ENOMEM; + return unix_create1(net, sock, kern) ? 
0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -1088,7 +1088,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL); + newsk = unix_create1(sock_net(sk), NULL, 0); if (newsk == NULL) goto out; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 2ec86e652a19..df5fc6b340f1 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -581,13 +581,14 @@ struct sock *__vsock_create(struct net *net, struct socket *sock, struct sock *parent, gfp_t priority, - unsigned short type) + unsigned short type, + int kern) { struct sock *sk; struct vsock_sock *psk; struct vsock_sock *vsk; - sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto); + sk = sk_alloc(net, AF_VSOCK, priority, &vsock_proto, kern); if (!sk) return NULL; @@ -1866,7 +1867,7 @@ static int vsock_create(struct net *net, struct socket *sock, sock->state = SS_UNCONNECTED; - return __vsock_create(net, sock, NULL, GFP_KERNEL, 0) ? 0 : -ENOMEM; + return __vsock_create(net, sock, NULL, GFP_KERNEL, 0, kern) ? 0 : -ENOMEM; } static const struct net_proto_family vsock_family_ops = { diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index c294da095461..1f63daff3965 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1022,7 +1022,7 @@ static int vmci_transport_recv_listen(struct sock *sk, } pending = __vsock_create(sock_net(sk), NULL, sk, GFP_KERNEL, - sk->sk_type); + sk->sk_type, 0); if (!pending) { vmci_transport_send_reset(sk, pkt); return -ENOMEM; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index c3ab230e4493..a750f330b8dd 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -515,10 +515,10 @@ static struct proto x25_proto = { .obj_size = sizeof(struct x25_sock), }; -static struct sock *x25_alloc_socket(struct net *net) +static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; - struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto); + struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; @@ -553,7 +553,7 @@ static int x25_create(struct net *net, struct socket *sock, int protocol, goto out; rc = -ENOBUFS; - if ((sk = x25_alloc_socket(net)) == NULL) + if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); @@ -602,7 +602,7 @@ static struct sock *x25_make_new(struct sock *osk) if (osk->sk_type != SOCK_SEQPACKET) goto out; - if ((sk = x25_alloc_socket(sock_net(osk))) == NULL) + if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); -- cgit v1.2.3 From 26abe14379f8e2fa3fd1bcf97c9a7ad9364886fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:10:31 -0500 Subject: net: Modify sk_alloc to not reference count the netns of kernel sockets. Now that sk_alloc knows when a kernel socket is being allocated modify it to not reference count the network namespace of kernel sockets. Keep track of if a socket needs reference counting by adding a flag to struct sock called sk_net_refcnt. Update all of the callers of sock_create_kern to stop using sk_change_net and sk_release_kernel as those hacks are no longer needed, to avoid reference counting a kernel socket. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. 
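The conversions above are purely mechanical: every protocol's create() handler now receives the kern flag from the socket layer and forwards it to sk_alloc(), while sockets the stack spawns internally (accept paths, peer sockets) pass 0 explicitly, as the iucv, rose and x25 hunks do. As a condensed illustration, a converted handler looks roughly like the sketch below; foo_create, PF_FOO and foo_proto are hypothetical names invented for this example, not code from the tree.

#include <net/sock.h>

static struct proto foo_proto = {
        .name     = "FOO",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct sock),
};

/* 'kern' is 1 when the socket comes from sock_create_kern() and 0 for
 * a user's socket(2) call; it is forwarded to sk_alloc() unchanged.
 */
static int foo_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;

        if (sock->type != SOCK_DGRAM)
                return -ESOCKTNOSUPPORT;

        sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
        if (!sk)
                return -ENOMEM;

        sock_init_data(sock, sk);
        return 0;
}

With the diff that follows, sk_alloc() uses that flag to skip get_net() entirely, so a kernel socket no longer pins the namespace it lives in.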
Miller --- include/net/inet_common.h | 2 +- include/net/sock.h | 2 ++ net/core/sock.c | 8 ++++++-- net/ipv4/af_inet.c | 4 +--- net/ipv4/udp_tunnel.c | 8 +++----- net/ipv6/ip6_udp_tunnel.c | 6 ++---- net/l2tp/l2tp_core.c | 15 ++++++--------- net/netfilter/ipvs/ip_vs_sync.c | 30 +++++++++--------------------- 8 files changed, 30 insertions(+), 45 deletions(-) (limited to 'net/core') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 4a92423eefa5..279f83591971 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -41,7 +41,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, static inline void inet_ctl_sock_destroy(struct sock *sk) { - sk_release_kernel(sk); + sock_release(sk->sk_socket); } #endif diff --git a/include/net/sock.h b/include/net/sock.h index d8dcf91732b0..9e6b2c0b4741 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -184,6 +184,7 @@ struct sock_common { unsigned char skc_reuse:4; unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; + unsigned char skc_net_refcnt:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; @@ -323,6 +324,7 @@ struct sock { #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only +#define sk_net_refcnt __sk_common.skc_net_refcnt #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot diff --git a/net/core/sock.c b/net/core/sock.c index cbc3789b830c..9e8968e9ebeb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1412,7 +1412,10 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, */ sk->sk_prot = sk->sk_prot_creator = prot; sock_lock_init(sk); - sock_net_set(sk, get_net(net)); + sk->sk_net_refcnt = kern ? 0 : 1; + if (likely(sk->sk_net_refcnt)) + get_net(net); + sock_net_set(sk, net); atomic_set(&sk->sk_wmem_alloc, 1); sock_update_classid(sk); @@ -1446,7 +1449,8 @@ static void __sk_free(struct sock *sk) if (sk->sk_peer_cred) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - put_net(sock_net(sk)); + if (likely(sk->sk_net_refcnt)) + put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index e2dd9cb99d61..235d36afece3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1430,7 +1430,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, struct net *net) { struct socket *sock; - int rc = sock_create_kern(&init_net, family, type, protocol, &sock); + int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; @@ -1440,8 +1440,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, * we do not wish this socket to see incoming packets. 
*/ (*sk)->sk_prot->unhash(*sk); - - sk_change_net(*sk, net); } return rc; } diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 4e2837476967..933ea903f7b8 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -15,12 +15,10 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket *sock = NULL; struct sockaddr_in udp_addr; - err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; @@ -47,7 +45,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; @@ -101,7 +99,7 @@ void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 478576b61214..e1a1136bda7c 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -19,12 +19,10 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, int err; struct socket *sock = NULL; - err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, 0, &sock); + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; - sk_change_net(sock->sk, net); - udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); @@ -55,7 +53,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); } *sockp = NULL; return err; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index ae513a2fe7f3..f6b090df3930 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1334,9 +1334,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work) if (sock) inet_shutdown(sock, 2); } else { - if (sock) + if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sk); + sock_release(sock); + } } l2tp_tunnel_sock_put(sk); @@ -1399,13 +1400,11 @@ static int l2tp_tunnel_sock_create(struct net *net, if (cfg->local_ip6 && cfg->peer_ip6) { struct sockaddr_l2tpip6 ip6_addr = {0}; - err = sock_create_kern(&init_net, AF_INET6, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip6_addr.l2tp_family = AF_INET6; memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); @@ -1429,13 +1428,11 @@ static int l2tp_tunnel_sock_create(struct net *net, { struct sockaddr_l2tpip ip_addr = {0}; - err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, + err = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_L2TP, &sock); if (err < 0) goto out; - sk_change_net(sock->sk, net); - ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; @@ -1462,7 +1459,7 @@ out: *sockp = sock; if ((err < 0) && sock) { kernel_sock_shutdown(sock, SHUT_RDWR); - sk_release_kernel(sock->sk); + sock_release(sock); *sockp = NULL; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 2e9a5b5d1239..b08ba9538d12 100644 --- 
a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1457,18 +1457,12 @@ static struct socket *make_send_sock(struct net *net, int id) struct socket *sock; int result; - /* First create a socket move it to right name space later */ - result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + /* First create a socket */ + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); @@ -1497,7 +1491,7 @@ static struct socket *make_send_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1518,17 +1512,11 @@ static struct socket *make_receive_sock(struct net *net, int id) int result; /* First create a socket */ - result = sock_create_kern(&init_net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = SK_CAN_REUSE; result = sysctl_sync_sock_size(ipvs); @@ -1554,7 +1542,7 @@ static struct socket *make_receive_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1692,7 +1680,7 @@ done: ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo); return 0; @@ -1729,7 +1717,7 @@ static int sync_thread_backup(void *data) } /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); @@ -1854,11 +1842,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) return 0; outsocket: - sk_release_kernel(sock->sk); + sock_release(sock); outtinfo: if (tinfo) { - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); } -- cgit v1.2.3 From affb9792f1d99e1e4d64411e147b648d65f2576e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 8 May 2015 21:12:13 -0500 Subject: net: kill sk_change_net and sk_release_kernel These functions are no longer needed and no longer used, so kill them. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S.
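Before the helpers disappear in the diff below, the replacement idiom the previous patch applied everywhere is worth one consolidated sketch. foo_make_sock and foo_release_sock are hypothetical helpers written for this example, not code from any driver; error handling and socket configuration are schematic.

#include <net/sock.h>

/* Create the kernel socket directly in the target namespace; since
 * kernel sockets no longer hold a netns reference, plain
 * sock_release() is the correct teardown and the old
 * sk_change_net()/sk_release_kernel() pair is unnecessary.
 */
static struct socket *foo_make_sock(struct net *net)
{
        struct socket *sock;
        int err;

        err = sock_create_kern(net, AF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
        if (err < 0)
                return ERR_PTR(err);

        /* ... bind and configure as the caller requires ... */
        return sock;
}

static void foo_release_sock(struct socket *sock)
{
        kernel_sock_shutdown(sock, SHUT_RDWR);
        sock_release(sock);
}

This is exactly the shape the udp_tunnel, l2tp and ipvs call sites take after the conversion above.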
Miller --- include/net/sock.h | 17 ----------------- net/core/sock.c | 19 ------------------- 2 files changed, 36 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index 9e6b2c0b4741..d882f4c8e438 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1518,7 +1518,6 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow) struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); -void sk_release_kernel(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, @@ -2194,22 +2193,6 @@ void sock_net_set(struct sock *sk, struct net *net) write_pnet(&sk->sk_net, net); } -/* - * Kernel sockets, f.e. rtnl or icmp_socket, are a part of a namespace. - * They should not hold a reference to a namespace in order to allow - * to stop it. - * Sockets after sk_change_net should be released using sk_release_kernel - */ -static inline void sk_change_net(struct sock *sk, struct net *net) -{ - struct net *current_net = sock_net(sk); - - if (!net_eq(current_net, net)) { - put_net(current_net); - sock_net_set(sk, net); - } -} - static inline struct sock *skb_steal_sock(struct sk_buff *skb) { if (skb->sk) { diff --git a/net/core/sock.c b/net/core/sock.c index 9e8968e9ebeb..c18738a795b0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1466,25 +1466,6 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); -/* - * Last sock_put should drop reference to sk->sk_net. It has already - * been dropped in sk_change_net. Taking reference to stopping namespace - * is not an option. - * Take reference to a socket to remove it from hash _alive_ and after that - * destroy it in the context of init_net. - */ -void sk_release_kernel(struct sock *sk) -{ - if (sk == NULL || sk->sk_socket == NULL) - return; - - sock_hold(sk); - sock_net_set(sk, get_net(&init_net)); - sock_release(sk->sk_socket); - sock_put(sk); -} -EXPORT_SYMBOL(sk_release_kernel); - static void sk_update_clone(const struct sock *sk, struct sock *newsk) { if (mem_cgroup_sockets_enabled && sk->sk_cgrp) -- cgit v1.2.3 From c9e99fd078ef7fdcd9ee4f5a4cfdbece319587af Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 May 2015 22:51:31 +0200 Subject: net: sched: consolidate handle_ing and ing_filter Given quite some code has been removed from ing_filter(), we can just consolidate that function into handle_ing() and get rid of a few instructions at the same time. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/dev.c | 46 ++++++++++++++++------------------------------ 1 file changed, 16 insertions(+), 30 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 862875ec8f2f..8a757464bfa2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3521,37 +3521,19 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif #ifdef CONFIG_NET_CLS_ACT -/* TODO: Maybe we should just force sch_ingress to be compiled in - * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions - * a compare and 2 stores extra right now if we dont have it on - * but have CONFIG_NET_CLS_ACT - * NOTE: This doesn't stop any functionality; if you dont have - * the ingress scheduler, you just can't add policies on ingress. 
- * - */ -static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) -{ - int result = TC_ACT_OK; - struct Qdisc *q; - - skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - - q = rcu_dereference(rxq->qdisc); - if (q != &noop_qdisc) { - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) - result = qdisc_enqueue_root(skb, q); - } - - return result; -} - static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); + struct Qdisc *q; - if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) + /* If there's at least one ingress present somewhere (so + * we get here via enabled static key), remaining devices + * that are not configured with an ingress qdisc will bail + * out w/o the rcu_dereference(). + */ + if (!rxq || (q = rcu_dereference(rxq->qdisc)) == &noop_qdisc) return skb; if (*pt_prev) { @@ -3559,11 +3541,15 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, *pt_prev = NULL; } - switch (ing_filter(skb, rxq)) { - case TC_ACT_SHOT: - case TC_ACT_STOLEN: - kfree_skb(skb); - return NULL; + skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + + if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + switch (qdisc_enqueue_root(skb, q)) { + case TC_ACT_SHOT: + case TC_ACT_STOLEN: + kfree_skb(skb); + return NULL; + } } return skb; -- cgit v1.2.3 From d2788d34885d4ce5ba17a8996fd95d28942e574e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 9 May 2015 22:51:32 +0200 Subject: net: sched: further simplify handle_ing Ingress qdisc has no other purpose than calling into tc_classify() that executes attached classifier(s) and action(s). It has a 1:1 relationship to dev->ingress_queue. After having commit 087c1a601ad7 ("net: sched: run ingress qdisc without locks") removed the central ingress lock, one major contention point is gone. The extra indirection layers however, are not necessary for calling into ingress qdisc. pktgen calling locally into netif_receive_skb() with a dummy u32, single CPU result on a Supermicro X10SLM-F, Xeon E3-1240: before ~21,1 Mpps, after patch ~22,9 Mpps. We can redirect the private classifier list to the netdev directly, without changing any classifier API bits (!) and execute on that from handle_ing() side. The __QDISC_STATE_DEACTIVATE test can be removed, ingress qdisc doesn't have a queue and thus dev_deactivate_queue() is also not applicable, ingress_cl_list provides similar behaviour. In other words, ingress qdisc acts like TCQ_F_BUILTIN qdisc. One next possible step is the removal of the dev's ingress (dummy) netdev_queue, and to only have the list member in the netdevice itself. Note, the filter chain is RCU protected and individual filter elements are being kfree'd by sched subsystem after RCU grace period. RCU read lock is being held by __netif_receive_skb_core(). Joint work with Alexei Starovoitov. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
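The dispatch contract in the rewritten handle_ing() (diff below) is compact enough to restate with the action semantics spelled out. foo_ingress_classify is an illustrative reader's summary, not the kernel function; it mirrors the hunk that follows.

#include <net/pkt_sched.h>

/* TC_ACT_OK/TC_ACT_RECLASSIFY accept the skb and record the matched
 * class; TC_ACT_SHOT drops it and bumps the per-cpu drop counter
 * (note the deliberate fall-through); TC_ACT_STOLEN/TC_ACT_QUEUED
 * mean an action took ownership, so the remaining reference is
 * dropped and NULL tells the caller to stop processing the skb.
 */
static struct sk_buff *foo_ingress_classify(struct sk_buff *skb,
                                            struct tcf_proto *cl)
{
        struct tcf_result res;

        switch (tc_classify(skb, cl, &res)) {
        case TC_ACT_OK:
        case TC_ACT_RECLASSIFY:
                skb->tc_index = TC_H_MIN(res.classid);
                break;
        case TC_ACT_SHOT:
                qdisc_qstats_drop_cpu(cl->q);
                /* fall through */
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
                kfree_skb(skb);
                return NULL;
        default:
                break;
        }

        return skb;
}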
Miller --- include/linux/netdevice.h | 4 ++++ net/core/dev.c | 30 ++++++++++++++---------- net/sched/sch_ingress.c | 58 ++++++++--------------------------------------- 3 files changed, 31 insertions(+), 61 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1899c74a7127..c4e1caf6056f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1655,7 +1655,11 @@ struct net_device { rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto __rcu *ingress_cl_list; +#endif struct netdev_queue __rcu *ingress_queue; + unsigned char broadcast[MAX_ADDR_LEN]; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rx_cpu_rmap; diff --git a/net/core/dev.c b/net/core/dev.c index 8a757464bfa2..e5f77c40bbd1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3525,31 +3525,37 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); - struct Qdisc *q; + struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); + struct tcf_result cl_res; /* If there's at least one ingress present somewhere (so * we get here via enabled static key), remaining devices * that are not configured with an ingress qdisc will bail - * out w/o the rcu_dereference(). + * out here. */ - if (!rxq || (q = rcu_dereference(rxq->qdisc)) == &noop_qdisc) + if (!cl) return skb; - if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } + qdisc_bstats_update_cpu(cl->q, skb); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - switch (qdisc_enqueue_root(skb, q)) { - case TC_ACT_SHOT: - case TC_ACT_STOLEN: - kfree_skb(skb); - return NULL; - } + switch (tc_classify(skb, cl, &cl_res)) { + case TC_ACT_OK: + case TC_ACT_RECLASSIFY: + skb->tc_index = TC_H_MIN(cl_res.classid); + break; + case TC_ACT_SHOT: + qdisc_qstats_drop_cpu(cl->q); + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + kfree_skb(skb); + return NULL; + default: + break; } return skb; diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index a89cc3278bfb..e7c648fa9dc3 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -12,16 +12,10 @@ #include <linux/list.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> + #include <net/netlink.h> #include <net/pkt_sched.h> - -struct ingress_qdisc_data { - struct tcf_proto __rcu *filter_list; -}; - -/* ------------------------- Class/flow operations ------------------------- */ - static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; @@ -49,45 +43,11 @@ static void ingress_walk(struct Qdisc *sch, struct qdisc_walker *walker) static struct tcf_proto __rcu **ingress_find_tcf(struct Qdisc *sch, unsigned long cl) { - struct ingress_qdisc_data *p = qdisc_priv(sch); - - return &p->filter_list; -} - -/* --------------------------- Qdisc operations ---------------------------- */ + struct net_device *dev = qdisc_dev(sch); -static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch) -{ - struct ingress_qdisc_data *p = qdisc_priv(sch); - struct tcf_result res; - struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result; - - result = tc_classify(skb, fl, &res); - - qdisc_bstats_update_cpu(sch, skb); switch (result) { - case TC_ACT_SHOT: - result = TC_ACT_SHOT; - qdisc_qstats_drop_cpu(sch); - break; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - result = TC_ACT_STOLEN; - break; - case TC_ACT_RECLASSIFY: -
case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - default: - result = TC_ACT_OK; - break; - } - - return result; + return &dev->ingress_cl_list; } -/* ------------------------------------------------------------- */ - static int ingress_init(struct Qdisc *sch, struct nlattr *opt) { net_inc_ingress_queue(); @@ -98,9 +58,9 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt) static void ingress_destroy(struct Qdisc *sch) { - struct ingress_qdisc_data *p = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); - tcf_destroy_chain(&p->filter_list); + tcf_destroy_chain(&dev->ingress_cl_list); net_dec_ingress_queue(); } @@ -111,6 +71,7 @@ static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; + return nla_nest_end(skb, nest); nla_put_failure: @@ -131,8 +92,6 @@ static const struct Qdisc_class_ops ingress_class_ops = { static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .cl_ops = &ingress_class_ops, .id = "ingress", - .priv_size = sizeof(struct ingress_qdisc_data), - .enqueue = ingress_enqueue, .init = ingress_init, .destroy = ingress_destroy, .dump = ingress_dump, @@ -149,6 +108,7 @@ static void __exit ingress_module_exit(void) unregister_qdisc(&ingress_qdisc_ops); } -module_init(ingress_module_init) -module_exit(ingress_module_exit) +module_init(ingress_module_init); +module_exit(ingress_module_exit); + MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 9451980a6646ed487efce04a9df28f450935683e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:11:40 -0700 Subject: net: Use cached copy of pfmemalloc to avoid accessing page While testing I found that the testing for pfmemalloc in build_skb was rather expensive. I found the issue to be two-fold. First we have to get from the virtual address to the head page and that comes at the cost of something like 11 cycles. Then there is the cost for reading pfmemalloc out of the head page which can be cache cold due to the fact that put_page_testzero is likely invalidating the cache-line on one or more CPUs as the fragments can be shared. To avoid this extra expense I have added a pfmemalloc member to the netdev_alloc_cache. I then pushed pieces of __alloc_rx_skb into __napi_alloc_skb and __netdev_alloc_skb so that I could rewrite them to make use of the cached pfmemalloc value. The result is that my perf traces show a reduction from 9.28% overhead to 3.7% for the code covered by build_skb, __alloc_rx_skb, and __napi_alloc_skb when performing a test with the packet being dropped instead of being handed to napi_gro_receive. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 141 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 79 insertions(+), 62 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b9eb90b39ac7..d6851ca32598 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -353,6 +353,7 @@ struct netdev_alloc_cache { * containing page->_count every time we allocate a fragment. 
*/ unsigned int pagecnt_bias; + bool pfmemalloc; }; static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); @@ -379,10 +380,9 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, return page; } -static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache, +static void *__alloc_page_frag(struct netdev_alloc_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc = this_cpu_ptr(cache); struct page *page = nc->frag.page; unsigned int size; int offset; @@ -402,6 +402,7 @@ refill: atomic_add(size - 1, &page->_count); /* reset page count bias and offset to start of new frag */ + nc->pfmemalloc = page->pfmemalloc; nc->pagecnt_bias = size; nc->frag.offset = size; } @@ -430,11 +431,13 @@ refill: static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { + struct netdev_alloc_cache *nc; unsigned long flags; void *data; local_irq_save(flags); - data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask); + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -454,7 +457,9 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask); + struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + + return __alloc_page_frag(nc, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -464,76 +469,64 @@ void *napi_alloc_frag(unsigned int fragsz) EXPORT_SYMBOL(napi_alloc_frag); /** - * __alloc_rx_skb - allocate an skbuff for rx + * __netdev_alloc_skb - allocate an skbuff for rx on a specific device + * @dev: network device to receive on * @length: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb - * @flags: If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for - * allocations in case we have to fallback to __alloc_skb() - * If SKB_ALLOC_NAPI is set, page fragment will be allocated - * from napi_cache instead of netdev_cache. * * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has unspecified headroom built in. Users should allocate + * buffer has NET_SKB_PAD headroom built in. Users should allocate * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * * %NULL is returned if there is no free memory. */ -static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask, - int flags) +struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, + gfp_t gfp_mask) { - struct sk_buff *skb = NULL; - unsigned int fragsz = SKB_DATA_ALIGN(length) + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + struct netdev_alloc_cache *nc; + unsigned long flags; + struct sk_buff *skb; + bool pfmemalloc; + void *data; - if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { - void *data; + len += NET_SKB_PAD; - if (sk_memalloc_socks()) - gfp_mask |= __GFP_MEMALLOC; + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_WAIT | GFP_DMA))) + return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); - data = (flags & SKB_ALLOC_NAPI) ? 
- __napi_alloc_frag(fragsz, gfp_mask) : - __netdev_alloc_frag(fragsz, gfp_mask); + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); - if (likely(data)) { - skb = build_skb(data, fragsz); - if (unlikely(!skb)) - put_page(virt_to_head_page(data)); - } - } else { - skb = __alloc_skb(length, gfp_mask, - SKB_ALLOC_RX, NUMA_NO_NODE); - } - return skb; -} + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; -/** - * __netdev_alloc_skb - allocate an skbuff for rx on a specific device - * @dev: network device to receive on - * @length: length to allocate - * @gfp_mask: get_free_pages mask, passed to alloc_skb - * - * Allocate a new &sk_buff and assign it a usage count of one. The - * buffer has NET_SKB_PAD headroom built in. Users should allocate - * the headroom they think they need without accounting for the - * built in space. The built in space is used for optimisations. - * - * %NULL is returned if there is no free memory. - */ -struct sk_buff *__netdev_alloc_skb(struct net_device *dev, - unsigned int length, gfp_t gfp_mask) -{ - struct sk_buff *skb; + local_irq_save(flags); - length += NET_SKB_PAD; - skb = __alloc_rx_skb(length, gfp_mask, 0); + nc = this_cpu_ptr(&netdev_alloc_cache); + data = __alloc_page_frag(nc, len, gfp_mask); + pfmemalloc = nc->pfmemalloc; - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD); - skb->dev = dev; + local_irq_restore(flags); + + if (unlikely(!data)) + return NULL; + + skb = __build_skb(data, len); + if (unlikely(!skb)) { + put_page(virt_to_head_page(data)); + return NULL; } + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + + skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); @@ -551,19 +544,43 @@ EXPORT_SYMBOL(__netdev_alloc_skb); * * %NULL is returned if there is no free memory. */ -struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, - unsigned int length, gfp_t gfp_mask) +struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, + gfp_t gfp_mask) { + struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; + void *data; + + len += NET_SKB_PAD + NET_IP_ALIGN; + + if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + (gfp_mask & (__GFP_WAIT | GFP_DMA))) + return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); - length += NET_SKB_PAD + NET_IP_ALIGN; - skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI); + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + data = __alloc_page_frag(nc, len, gfp_mask); + if (unlikely(!data)) + return NULL; - if (likely(skb)) { - skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - skb->dev = napi->dev; + skb = __build_skb(data, len); + if (unlikely(!skb)) { + put_page(virt_to_head_page(data)); + return NULL; } + /* use OR instead of assignment to avoid clearing of bits in mask */ + if (nc->pfmemalloc) + skb->pfmemalloc = 1; + skb->head_frag = 1; + + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + skb->dev = napi->dev; + return skb; } EXPORT_SYMBOL(__napi_alloc_skb); -- cgit v1.2.3 From 0e39250845c0f91acc64264709b25f7f9b85c2c3 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:11:51 -0700 Subject: net: Store virtual address instead of page in netdev_alloc_cache This change makes it so that we store the virtual address of the page in the netdev_alloc_cache instead of the page pointer. 
The idea behind this is to avoid multiple calls to page_address since the virtual address is required for every access, but the page pointer is only needed at allocation or reset of the page. While I was at it I also reordered the netdev_alloc_cache structure a bit so that the size is always 16 bytes by dropping size in the case where PAGE_SIZE is greater than or equal to 32KB. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 ++--- net/core/skbuff.c | 55 +++++++++++++++++++++++++++++--------------------- 2 files changed, 34 insertions(+), 26 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9c2f793573fa..8b9a2c35a9d7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2128,9 +2128,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } -#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(32768) -#define NETDEV_FRAG_PAGE_MAX_SIZE (PAGE_SIZE << NETDEV_FRAG_PAGE_MAX_ORDER) -#define NETDEV_PAGECNT_MAX_BIAS NETDEV_FRAG_PAGE_MAX_SIZE +#define NETDEV_FRAG_PAGE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) +#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(NETDEV_FRAG_PAGE_MAX_SIZE) void *netdev_alloc_frag(unsigned int fragsz); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d6851ca32598..a3062ec341c3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -348,7 +348,13 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) EXPORT_SYMBOL(build_skb); struct netdev_alloc_cache { - struct page_frag frag; + void * va; +#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) + __u16 offset; + __u16 size; +#else + __u32 offset; +#endif /* we maintain a pagecount bias, so that we dont dirty cache line * containing page->_count every time we allocate a fragment. */ @@ -361,21 +367,20 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, gfp_t gfp_mask) { - const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER; struct page *page = NULL; gfp_t gfp = gfp_mask; - if (order) { - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC; - page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); - nc->frag.size = PAGE_SIZE << (page ? order : 0); - } - +#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, + NETDEV_FRAG_PAGE_MAX_ORDER); + nc->size = page ? NETDEV_FRAG_PAGE_MAX_SIZE : PAGE_SIZE; +#endif if (unlikely(!page)) page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - nc->frag.page = page; + nc->va = page ? page_address(page) : NULL; return page; } @@ -383,19 +388,20 @@ static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, static void *__alloc_page_frag(struct netdev_alloc_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { - struct page *page = nc->frag.page; - unsigned int size; + unsigned int size = PAGE_SIZE; + struct page *page; int offset; - if (unlikely(!page)) { + if (unlikely(!nc->va)) { refill: page = __page_frag_refill(nc, gfp_mask); if (!page) return NULL; - /* if size can vary use frag.size else just use PAGE_SIZE */ - size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; - +#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. 
*/ @@ -404,17 +410,20 @@ refill: /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page->pfmemalloc; nc->pagecnt_bias = size; - nc->frag.offset = size; + nc->offset = size; } - offset = nc->frag.offset - fragsz; + offset = nc->offset - fragsz; if (unlikely(offset < 0)) { + page = virt_to_page(nc->va); + if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) goto refill; - /* if size can vary use frag.size else just use PAGE_SIZE */ - size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE; - +#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif /* OK, page count is 0, we can safely set it */ atomic_set(&page->_count, size); @@ -424,9 +433,9 @@ refill: } nc->pagecnt_bias--; - nc->frag.offset = offset; + nc->offset = offset; - return page_address(page) + offset; + return nc->va + offset; } static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) -- cgit v1.2.3 From b63ae8ca096dfdbfeef6a209c30a93a966518853 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:11:57 -0700 Subject: mm/net: Rename and move page fragment handling from net/ to mm/ This change moves the __alloc_page_frag functionality out of the networking stack and into the page allocation portion of mm. The idea is to help make this maintainable by placing it with other page allocation functions. Since we are moving it from skbuff.c to page_alloc.c I have also renamed the basic defines and structure from netdev_alloc_cache to page_frag_cache to reflect that this is now part of a different kernel subsystem. I have also added a simple __free_page_frag function which can handle freeing the frags based on the skb->head pointer. The model for this is based off of __free_pages since we don't actually need to deal with all of the cases that put_page handles. I incorporated the virt_to_head_page call and compound_order into the function as it actually allows for a significant size reduction by reducing code duplication. Signed-off-by: Alexander Duyck Signed-off-by: David S.
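Once the move in the diff below lands, the allocator becomes a generic mm facility rather than an skbuff internal. A hypothetical consumer might use it as sketched here; foo_frag_cache and the foo_* helpers are invented for illustration and are not part of the patch.

#include <linux/gfp.h>
#include <linux/mm_types.h>
#include <linux/percpu.h>

/* Per-CPU fragment cache built on the relocated API. Callers must
 * serialize access to the cache themselves (softirq context or
 * disabled IRQs), exactly as the netdev/napi users do.
 */
static DEFINE_PER_CPU(struct page_frag_cache, foo_frag_cache);

static void *foo_alloc_frag(unsigned int fragsz)
{
        struct page_frag_cache *nc = this_cpu_ptr(&foo_frag_cache);

        return __alloc_page_frag(nc, fragsz, GFP_ATOMIC);
}

static void foo_free_frag(void *data)
{
        /* Resolves the head page internally and frees it once the
         * last fragment reference is gone.
         */
        __free_page_frag(data);
}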
Miller --- include/linux/gfp.h | 5 +++ include/linux/mm_types.h | 18 +++++++++ include/linux/skbuff.h | 3 -- mm/page_alloc.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/skbuff.c | 100 +++-------------------------------------------- 5 files changed, 127 insertions(+), 97 deletions(-) (limited to 'net/core') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 97a9373e61e8..70a7fee1efb3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -366,6 +366,11 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, bool cold); extern void free_hot_cold_page_list(struct list_head *list, bool cold); +struct page_frag_cache; +extern void *__alloc_page_frag(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask); +extern void __free_page_frag(void *addr); + extern void __free_kmem_pages(struct page *page, unsigned int order); extern void free_kmem_pages(unsigned long addr, unsigned int order); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8d37e26a1007..0038ac7466fd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -226,6 +226,24 @@ struct page_frag { #endif }; +#define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) +#define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) + +struct page_frag_cache { + void * va; +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + __u16 offset; + __u16 size; +#else + __u32 offset; +#endif + /* we maintain a pagecount bias, so that we dont dirty cache line + * containing page->_count every time we allocate a fragment. + */ + unsigned int pagecnt_bias; + bool pfmemalloc; +}; + typedef unsigned long __nocast vm_flags_t; /* diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8b9a2c35a9d7..0039fcc45b3b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2128,9 +2128,6 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } -#define NETDEV_FRAG_PAGE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) -#define NETDEV_FRAG_PAGE_MAX_ORDER get_order(NETDEV_FRAG_PAGE_MAX_SIZE) - void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ebffa0e4a9c0..2fd31aebef30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2966,6 +2966,104 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/* + * Page Fragment: + * An arbitrary-length arbitrary-offset area of memory which resides + * within a 0 or higher order page. Multiple fragments within that page + * are individually refcounted, in the page's reference counter. + * + * The page_frag functions below provide a simple allocation framework for + * page fragments. This is used by the network stack and network device + * drivers to provide a backing region of memory for use as either an + * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. + */ +static struct page *__page_frag_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) +{ + struct page *page = NULL; + gfp_t gfp = gfp_mask; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, + PAGE_FRAG_CACHE_MAX_ORDER); + nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; +#endif + if (unlikely(!page)) + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + + nc->va = page ? 
page_address(page) : NULL; + + return page; +} + +void *__alloc_page_frag(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) +{ + unsigned int size = PAGE_SIZE; + struct page *page; + int offset; + + if (unlikely(!nc->va)) { +refill: + page = __page_frag_refill(nc, gfp_mask); + if (!page) + return NULL; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* Even if we own the page, we do not use atomic_set(). + * This would break get_page_unless_zero() users. + */ + atomic_add(size - 1, &page->_count); + + /* reset page count bias and offset to start of new frag */ + nc->pfmemalloc = page->pfmemalloc; + nc->pagecnt_bias = size; + nc->offset = size; + } + + offset = nc->offset - fragsz; + if (unlikely(offset < 0)) { + page = virt_to_page(nc->va); + + if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + goto refill; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* OK, page count is 0, we can safely set it */ + atomic_set(&page->_count, size); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + offset = size - fragsz; + } + + nc->pagecnt_bias--; + nc->offset = offset; + + return nc->va + offset; +} +EXPORT_SYMBOL(__alloc_page_frag); + +/* + * Frees a page fragment allocated out of either a compound or order 0 page. + */ +void __free_page_frag(void *addr) +{ + struct page *page = virt_to_head_page(addr); + + if (unlikely(put_page_testzero(page))) + __free_pages_ok(page, compound_order(page)); +} +EXPORT_SYMBOL(__free_page_frag); + /* * alloc_kmem_pages charges newly allocated pages to the kmem resource counter * of the current memory cgroup. diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a3062ec341c3..dcc0e07abf47 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -347,100 +347,12 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(build_skb); -struct netdev_alloc_cache { - void * va; -#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) - __u16 offset; - __u16 size; -#else - __u32 offset; -#endif - /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. - */ - unsigned int pagecnt_bias; - bool pfmemalloc; -}; -static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); -static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache); - -static struct page *__page_frag_refill(struct netdev_alloc_cache *nc, - gfp_t gfp_mask) -{ - struct page *page = NULL; - gfp_t gfp = gfp_mask; - -#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) - gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC; - page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, - NETDEV_FRAG_PAGE_MAX_ORDER); - nc->size = page ? NETDEV_FRAG_PAGE_MAX_SIZE : PAGE_SIZE; -#endif - if (unlikely(!page)) - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - - nc->va = page ? 
page_address(page) : NULL; - - return page; -} - -static void *__alloc_page_frag(struct netdev_alloc_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) -{ - unsigned int size = PAGE_SIZE; - struct page *page; - int offset; - - if (unlikely(!nc->va)) { -refill: - page = __page_frag_refill(nc, gfp_mask); - if (!page) - return NULL; - -#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size = nc->size; -#endif - /* Even if we own the page, we do not use atomic_set(). - * This would break get_page_unless_zero() users. - */ - atomic_add(size - 1, &page->_count); - - /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page->pfmemalloc; - nc->pagecnt_bias = size; - nc->offset = size; - } - - offset = nc->offset - fragsz; - if (unlikely(offset < 0)) { - page = virt_to_page(nc->va); - - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) - goto refill; - -#if (PAGE_SIZE < NETDEV_FRAG_PAGE_MAX_SIZE) - /* if size can vary use size else just use PAGE_SIZE */ - size = nc->size; -#endif - /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); - - /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size; - offset = size - fragsz; - } - - nc->pagecnt_bias--; - nc->offset = offset; - - return nc->va + offset; -} +static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); +static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache); static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc; + struct page_frag_cache *nc; unsigned long flags; void *data; @@ -466,7 +378,7 @@ EXPORT_SYMBOL(netdev_alloc_frag); static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); return __alloc_page_frag(nc, fragsz, gfp_mask); } @@ -493,7 +405,7 @@ EXPORT_SYMBOL(napi_alloc_frag); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc; + struct page_frag_cache *nc; unsigned long flags; struct sk_buff *skb; bool pfmemalloc; @@ -556,7 +468,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, gfp_t gfp_mask) { - struct netdev_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; void *data; -- cgit v1.2.3 From 181edb2bfa22b50817684135ab6430ed2808abf0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 6 May 2015 21:12:03 -0700 Subject: net: Add skb_free_frag to replace use of put_page in freeing skb->head This change adds a function called skb_free_frag which is meant to complement the function netdev_alloc_frag. The general idea is to enable a more lightweight version of page freeing since we don't actually need all the overhead of a put_page, and we don't quite fit the model of __free_pages. Signed-off-by: Alexander Duyck Signed-off-by: David S.
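In driver terms, the new helper pairs naturally with frag-backed build_skb(). A sketch of where it would be used; foo_rx_build_skb is a hypothetical driver function, not taken from the tree.

#include <linux/skbuff.h>

/* Where a driver previously wrote put_page(virt_to_head_page(data))
 * to dispose of a fragment, skb_free_frag(data) now does the same
 * job without the full put_page() machinery.
 */
static struct sk_buff *foo_rx_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = build_skb(data, frag_size);

        if (unlikely(!skb)) {
                skb_free_frag(data);
                return NULL;
        }

        return skb;
}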
Miller --- include/linux/skbuff.h | 5 +++++ net/core/skbuff.c | 10 ++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0039fcc45b3b..c0b574a414e7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2182,6 +2182,11 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } +static inline void skb_free_frag(void *addr) +{ + __free_page_frag(addr); +} + void *napi_alloc_frag(unsigned int fragsz); struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int length, gfp_t gfp_mask); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index dcc0e07abf47..d67e612bf0ef 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -436,7 +436,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, skb = __build_skb(data, len); if (unlikely(!skb)) { - put_page(virt_to_head_page(data)); + skb_free_frag(data); return NULL; } @@ -490,7 +490,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, skb = __build_skb(data, len); if (unlikely(!skb)) { - put_page(virt_to_head_page(data)); + skb_free_frag(data); return NULL; } @@ -549,10 +549,12 @@ static void skb_clone_fraglist(struct sk_buff *skb) static void skb_free_head(struct sk_buff *skb) { + unsigned char *head = skb->head; + if (skb->head_frag) - put_page(virt_to_head_page(skb->head)); + skb_free_frag(head); else - kfree(skb->head); + kfree(head); } static void skb_release_data(struct sk_buff *skb) -- cgit v1.2.3 From ebb9a03a590e2325f747be43c8db450e92509501 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 10 May 2015 09:47:46 -0700 Subject: switchdev: s/netdev_switch_/switchdev_/ and s/NETDEV_SWITCH_/SWITCHDEV_/ Turned out that "switchdev" sticks. So just unify all related terms to use this prefix. Signed-off-by: Jiri Pirko Signed-off-by: Scott Feldman Acked-by: Roopa Prabhu Acked-by: Andy Gospodarek Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 4 +- drivers/net/ethernet/rocker/rocker.c | 10 +-- drivers/net/team/team.c | 4 +- include/net/switchdev.h | 111 +++++++++++++++-------------- net/bridge/br.c | 22 +++--- net/bridge/br_netlink.c | 6 +- net/bridge/br_stp.c | 2 +- net/core/net-sysfs.c | 2 +- net/core/rtnetlink.c | 2 +- net/ipv4/fib_trie.c | 40 +++++------ net/switchdev/switchdev.c | 134 +++++++++++++++++------------------ 11 files changed, 166 insertions(+), 171 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2ee13be7551b..a475a0389cc7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4039,8 +4039,8 @@ static const struct net_device_ops bond_netdev_ops = { .ndo_add_slave = bond_enslave, .ndo_del_slave = bond_release, .ndo_fix_features = bond_fix_features, - .ndo_bridge_setlink = ndo_dflt_netdev_switch_port_bridge_setlink, - .ndo_bridge_dellink = ndo_dflt_netdev_switch_port_bridge_dellink, + .ndo_bridge_setlink = ndo_dflt_switchdev_port_bridge_setlink, + .ndo_bridge_dellink = ndo_dflt_switchdev_port_bridge_dellink, .ndo_features_check = passthru_features_check, }; diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 0c6f0a8b42dd..418f62b19545 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3377,17 +3377,17 @@ static void rocker_port_fdb_learn_work(struct work_struct *work) container_of(work, struct rocker_fdb_learn_work, work); bool removing = (lw->flags & ROCKER_OP_FLAG_REMOVE); bool learned = (lw->flags & ROCKER_OP_FLAG_LEARNED); - struct netdev_switch_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info; info.addr = lw->addr; info.vid = lw->vid; if (learned && removing) - call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_DEL, - lw->dev, &info.info); + call_switchdev_notifiers(SWITCHDEV_FDB_DEL, + lw->dev, &info.info); else if (learned && !removing) - call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_ADD, - lw->dev, &info.info); + call_switchdev_notifiers(SWITCHDEV_FDB_ADD, + lw->dev, &info.info); kfree(work); } diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 6928448f6b7f..cfe84965afb6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1977,8 +1977,8 @@ static const struct net_device_ops team_netdev_ops = { .ndo_del_slave = team_del_slave, .ndo_fix_features = team_fix_features, .ndo_change_carrier = team_change_carrier, - .ndo_bridge_setlink = ndo_dflt_netdev_switch_port_bridge_setlink, - .ndo_bridge_dellink = ndo_dflt_netdev_switch_port_bridge_dellink, + .ndo_bridge_setlink = ndo_dflt_switchdev_port_bridge_setlink, + .ndo_bridge_dellink = ndo_dflt_switchdev_port_bridge_dellink, .ndo_features_check = passthru_features_check, }; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d2e69ee3019a..cd921fa0d63a 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -43,124 +43,125 @@ struct swdev_ops { u8 tos, u8 type, u32 tb_id); }; -enum netdev_switch_notifier_type { - NETDEV_SWITCH_FDB_ADD = 1, - NETDEV_SWITCH_FDB_DEL, +enum switchdev_notifier_type { + SWITCHDEV_FDB_ADD = 1, + SWITCHDEV_FDB_DEL, }; -struct netdev_switch_notifier_info { +struct switchdev_notifier_info { struct net_device *dev; }; -struct netdev_switch_notifier_fdb_info { - struct netdev_switch_notifier_info info; /* must be first */ +struct switchdev_notifier_fdb_info { + struct switchdev_notifier_info info; /* must be first */ const 
unsigned char *addr; u16 vid; }; static inline struct net_device * -netdev_switch_notifier_info_to_dev(const struct netdev_switch_notifier_info *info) +switchdev_notifier_info_to_dev(const struct switchdev_notifier_info *info) { return info->dev; } #ifdef CONFIG_NET_SWITCHDEV -int netdev_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid); -int netdev_switch_port_stp_update(struct net_device *dev, u8 state); -int register_netdev_switch_notifier(struct notifier_block *nb); -int unregister_netdev_switch_notifier(struct notifier_block *nb); -int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev, - struct netdev_switch_notifier_info *info); -int netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags); -int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id); -int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id); -void netdev_switch_fib_ipv4_abort(struct fib_info *fi); +int switchdev_parent_id_get(struct net_device *dev, + struct netdev_phys_item_id *psid); +int switchdev_port_stp_update(struct net_device *dev, u8 state); +int register_switchdev_notifier(struct notifier_block *nb); +int unregister_switchdev_notifier(struct notifier_block *nb); +int call_switchdev_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info); +int switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags); +int switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags); +int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags); +int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags); +int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 nlflags, u32 tb_id); +int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id); +void switchdev_fib_ipv4_abort(struct fib_info *fi); #else -static inline int netdev_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +static inline int switchdev_parent_id_get(struct net_device *dev, + struct netdev_phys_item_id *psid) { return -EOPNOTSUPP; } -static inline int netdev_switch_port_stp_update(struct net_device *dev, - u8 state) +static inline int switchdev_port_stp_update(struct net_device *dev, + u8 state) { return -EOPNOTSUPP; } -static inline int register_netdev_switch_notifier(struct notifier_block *nb) +static inline int register_switchdev_notifier(struct notifier_block *nb) { return 0; } -static inline int unregister_netdev_switch_notifier(struct notifier_block *nb) +static inline int unregister_switchdev_notifier(struct notifier_block *nb) { return 0; } -static inline int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev, - struct netdev_switch_notifier_info *info) +static inline int call_switchdev_notifiers(unsigned long val, + struct net_device *dev, + struct switchdev_notifier_info *info) { return NOTIFY_DONE; } -static inline int 
netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) +static inline int switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, + u16 flags) { return -EOPNOTSUPP; } -static inline int netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) +static inline int switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, + u16 flags) { return -EOPNOTSUPP; } -static inline int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) +static inline int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, + u16 flags) { return 0; } -static inline int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, - u16 flags) +static inline int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, + u16 flags) { return 0; } -static inline int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, - struct fib_info *fi, - u8 tos, u8 type, - u32 nlflags, u32 tb_id) +static inline int switchdev_fib_ipv4_add(u32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, + u32 nlflags, u32 tb_id) { return 0; } -static inline int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, - struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) +static inline int switchdev_fib_ipv4_del(u32 dst, int dst_len, + struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) { return 0; } -static inline void netdev_switch_fib_ipv4_abort(struct fib_info *fi) +static inline void switchdev_fib_ipv4_abort(struct fib_info *fi) { } diff --git a/net/bridge/br.c b/net/bridge/br.c index 02c24cf63c34..a1abe4936fe1 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -121,13 +121,13 @@ static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; -static int br_netdev_switch_event(struct notifier_block *unused, - unsigned long event, void *ptr) +static int br_switchdev_event(struct notifier_block *unused, + unsigned long event, void *ptr) { - struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr); + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; - struct netdev_switch_notifier_fdb_info *fdb_info; + struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; rtnl_lock(); @@ -138,14 +138,14 @@ static int br_netdev_switch_event(struct notifier_block *unused, br = p->br; switch (event) { - case NETDEV_SWITCH_FDB_ADD: + case SWITCHDEV_FDB_ADD: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid); if (err) err = notifier_from_errno(err); break; - case NETDEV_SWITCH_FDB_DEL: + case SWITCHDEV_FDB_DEL: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid); @@ -159,8 +159,8 @@ out: return err; } -static struct notifier_block br_netdev_switch_notifier = { - .notifier_call = br_netdev_switch_event, +static struct notifier_block br_switchdev_notifier = { + .notifier_call = br_switchdev_event, }; static void __net_exit br_net_exit(struct net *net) @@ -214,7 +214,7 @@ static int __init br_init(void) if (err) goto err_out3; - err = register_netdev_switch_notifier(&br_netdev_switch_notifier); + err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; @@ -235,7 +235,7 @@ static int __init br_init(void) return 0; err_out5: - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + 
unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: @@ -253,7 +253,7 @@ static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); - unregister_netdev_switch_notifier(&br_netdev_switch_notifier); + unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4b5c236998ff..dc234533e204 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -631,8 +631,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) if (p && !(flags & BRIDGE_FLAGS_SELF)) { /* set bridge attributes in hardware if supported */ - ret_offload = netdev_switch_port_bridge_setlink(dev, nlh, - flags); + ret_offload = switchdev_port_bridge_setlink(dev, nlh, flags); if (ret_offload && ret_offload != -EOPNOTSUPP) br_warn(p->br, "error setting attrs on port %u(%s)\n", (unsigned int)p->port_no, p->dev->name); @@ -671,8 +670,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) if (p && !(flags & BRIDGE_FLAGS_SELF)) { /* del bridge attributes in hardware */ - ret_offload = netdev_switch_port_bridge_dellink(dev, nlh, - flags); + ret_offload = switchdev_port_bridge_dellink(dev, nlh, flags); if (ret_offload && ret_offload != -EOPNOTSUPP) br_warn(p->br, "error deleting attrs on port %u (%s)\n", (unsigned int)p->port_no, p->dev->name); diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index fb3ebe615513..28e3f4bc01e0 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -42,7 +42,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) int err; p->state = state; - err = netdev_switch_port_stp_update(p->dev, state); + err = switchdev_port_stp_update(p->dev, state); if (err && err != -EOPNOTSUPP) br_warn(p->br, "error setting offload STP state on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4238d6da5c60..be86a7ce9282 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -460,7 +460,7 @@ static ssize_t phys_switch_id_show(struct device *dev, if (dev_isalive(netdev)) { struct netdev_phys_item_id ppid; - ret = netdev_switch_parent_id_get(netdev, &ppid); + ret = switchdev_parent_id_get(netdev, &ppid); if (!ret) ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 83e08323fdcd..fcd41fcd7e70 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1006,7 +1006,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) int err; struct netdev_phys_item_id psid; - err = netdev_switch_parent_id_get(dev, &psid); + err = switchdev_parent_id_get(dev, &psid); if (err) { if (err == -EOPNOTSUPP) return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e13fcc602da2..03444c6ae342 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1165,13 +1165,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; - err = netdev_switch_fib_ipv4_add(key, plen, fi, - new_fa->fa_tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, + new_fa->fa_tos, + cfg->fc_type, + cfg->fc_nlflags, + tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + 
switchdev_fib_ipv4_abort(fi); kmem_cache_free(fn_alias_kmem, new_fa); goto out; } @@ -1215,12 +1215,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) new_fa->tb_id = tb->tb_id; /* (Optionally) offload fib entry to switch hardware. */ - err = netdev_switch_fib_ipv4_add(key, plen, fi, tos, - cfg->fc_type, - cfg->fc_nlflags, - tb->tb_id); + err = switchdev_fib_ipv4_add(key, plen, fi, tos, cfg->fc_type, + cfg->fc_nlflags, tb->tb_id); if (err) { - netdev_switch_fib_ipv4_abort(fi); + switchdev_fib_ipv4_abort(fi); goto out_free_new_fa; } @@ -1239,7 +1237,7 @@ succeeded: return 0; out_sw_fib_del: - netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1517,8 +1515,8 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) if (!fa_to_delete) return -ESRCH; - netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, - cfg->fc_type, tb->tb_id); + switchdev_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos, + cfg->fc_type, tb->tb_id); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1767,10 +1765,9 @@ void fib_table_flush_external(struct fib_table *tb) if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL)) continue; - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); } /* update leaf slen */ @@ -1835,10 +1832,9 @@ int fib_table_flush(struct fib_table *tb) continue; } - netdev_switch_fib_ipv4_del(n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, - fa->fa_type, tb->tb_id); + switchdev_fib_ipv4_del(n->key, KEYLENGTH - fa->fa_slen, + fi, fa->fa_tos, fa->fa_type, + tb->tb_id); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 46568b85c333..52613ed49a8c 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -19,14 +19,14 @@ #include /** - * netdev_switch_parent_id_get - Get ID of a switch + * switchdev_parent_id_get - Get ID of a switch * @dev: port device * @psid: switch ID * * Get ID of a switch this port is part of. */ -int netdev_switch_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +int switchdev_parent_id_get(struct net_device *dev, + struct netdev_phys_item_id *psid) { const struct swdev_ops *ops = dev->swdev_ops; @@ -34,17 +34,17 @@ int netdev_switch_parent_id_get(struct net_device *dev, return -EOPNOTSUPP; return ops->swdev_parent_id_get(dev, psid); } -EXPORT_SYMBOL_GPL(netdev_switch_parent_id_get); +EXPORT_SYMBOL_GPL(switchdev_parent_id_get); /** - * netdev_switch_port_stp_update - Notify switch device port of STP + * switchdev_port_stp_update - Notify switch device port of STP * state change * @dev: port device * @state: port STP state * * Notify switch device port of bridge port STP state change. 
*/ -int netdev_switch_port_stp_update(struct net_device *dev, u8 state) +int switchdev_port_stp_update(struct net_device *dev, u8 state) { const struct swdev_ops *ops = dev->swdev_ops; struct net_device *lower_dev; @@ -55,57 +55,57 @@ int netdev_switch_port_stp_update(struct net_device *dev, u8 state) return ops->swdev_port_stp_update(dev, state); netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_stp_update(lower_dev, state); + err = switchdev_port_stp_update(lower_dev, state); if (err && err != -EOPNOTSUPP) return err; } return err; } -EXPORT_SYMBOL_GPL(netdev_switch_port_stp_update); +EXPORT_SYMBOL_GPL(switchdev_port_stp_update); -static DEFINE_MUTEX(netdev_switch_mutex); -static RAW_NOTIFIER_HEAD(netdev_switch_notif_chain); +static DEFINE_MUTEX(switchdev_mutex); +static RAW_NOTIFIER_HEAD(switchdev_notif_chain); /** - * register_netdev_switch_notifier - Register notifier + * register_switchdev_notifier - Register notifier * @nb: notifier_block * * Register switch device notifier. This should be used by code * which needs to monitor events happening in particular device. * Return values are same as for atomic_notifier_chain_register(). */ -int register_netdev_switch_notifier(struct notifier_block *nb) +int register_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_register(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_chain_register(&switchdev_notif_chain, nb); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL_GPL(register_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(register_switchdev_notifier); /** - * unregister_netdev_switch_notifier - Unregister notifier + * unregister_switchdev_notifier - Unregister notifier * @nb: notifier_block * * Unregister switch device notifier. * Return values are same as for atomic_notifier_chain_unregister(). */ -int unregister_netdev_switch_notifier(struct notifier_block *nb) +int unregister_switchdev_notifier(struct notifier_block *nb) { int err; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_chain_unregister(&netdev_switch_notif_chain, nb); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier); +EXPORT_SYMBOL_GPL(unregister_switchdev_notifier); /** - * call_netdev_switch_notifiers - Call notifiers + * call_switchdev_notifiers - Call notifiers * @val: value passed unmodified to notifier function * @dev: port device * @info: notifier information data @@ -114,21 +114,21 @@ EXPORT_SYMBOL_GPL(unregister_netdev_switch_notifier); * when it needs to propagate hardware event. * Return values are same as for atomic_notifier_call_chain(). 
*/ -int call_netdev_switch_notifiers(unsigned long val, struct net_device *dev, - struct netdev_switch_notifier_info *info) +int call_switchdev_notifiers(unsigned long val, struct net_device *dev, + struct switchdev_notifier_info *info) { int err; info->dev = dev; - mutex_lock(&netdev_switch_mutex); - err = raw_notifier_call_chain(&netdev_switch_notif_chain, val, info); - mutex_unlock(&netdev_switch_mutex); + mutex_lock(&switchdev_mutex); + err = raw_notifier_call_chain(&switchdev_notif_chain, val, info); + mutex_unlock(&switchdev_mutex); return err; } -EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers); +EXPORT_SYMBOL_GPL(call_switchdev_notifiers); /** - * netdev_switch_port_bridge_setlink - Notify switch device port of bridge + * switchdev_port_bridge_setlink - Notify switch device port of bridge * port attributes * * @dev: port device @@ -137,8 +137,8 @@ EXPORT_SYMBOL_GPL(call_netdev_switch_notifiers); * * Notify switch device port of bridge port attributes */ -int netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -150,10 +150,10 @@ int netdev_switch_port_bridge_setlink(struct net_device *dev, return ops->ndo_bridge_setlink(dev, nlh, flags); } -EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink); +EXPORT_SYMBOL_GPL(switchdev_port_bridge_setlink); /** - * netdev_switch_port_bridge_dellink - Notify switch device port of bridge + * switchdev_port_bridge_dellink - Notify switch device port of bridge * port attribute delete * * @dev: port device @@ -162,8 +162,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_setlink); * * Notify switch device port of bridge port attribute delete */ -int netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { const struct net_device_ops *ops = dev->netdev_ops; @@ -175,11 +175,11 @@ int netdev_switch_port_bridge_dellink(struct net_device *dev, return ops->ndo_bridge_dellink(dev, nlh, flags); } -EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink); +EXPORT_SYMBOL_GPL(switchdev_port_bridge_dellink); /** - * ndo_dflt_netdev_switch_port_bridge_setlink - default ndo bridge setlink - * op for master devices + * ndo_dflt_switchdev_port_bridge_setlink - default ndo bridge setlink + * op for master devices * * @dev: port device * @nlh: netlink msg with bridge port attributes @@ -187,8 +187,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_port_bridge_dellink); * * Notify master device slaves of bridge port attributes */ -int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int ndo_dflt_switchdev_port_bridge_setlink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { struct net_device *lower_dev; struct list_head *iter; @@ -198,18 +198,18 @@ int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, return ret; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_setlink(lower_dev, nlh, flags); + err = switchdev_port_bridge_setlink(lower_dev, nlh, flags); if (err && err != -EOPNOTSUPP) ret = err; } return ret; } -EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink); +EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_setlink); /** - * ndo_dflt_netdev_switch_port_bridge_dellink - default ndo bridge dellink - * op for master devices + * 
ndo_dflt_switchdev_port_bridge_dellink - default ndo bridge dellink + * op for master devices * * @dev: port device * @nlh: netlink msg with bridge port attributes @@ -217,8 +217,8 @@ EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_setlink); * * Notify master device slaves of bridge port attribute deletes */ -int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, - struct nlmsghdr *nlh, u16 flags) +int ndo_dflt_switchdev_port_bridge_dellink(struct net_device *dev, + struct nlmsghdr *nlh, u16 flags) { struct net_device *lower_dev; struct list_head *iter; @@ -228,16 +228,16 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, return ret; netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = netdev_switch_port_bridge_dellink(lower_dev, nlh, flags); + err = switchdev_port_bridge_dellink(lower_dev, nlh, flags); if (err && err != -EOPNOTSUPP) ret = err; } return ret; } -EXPORT_SYMBOL_GPL(ndo_dflt_netdev_switch_port_bridge_dellink); +EXPORT_SYMBOL_GPL(ndo_dflt_switchdev_port_bridge_dellink); -static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) +static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) { const struct swdev_ops *ops = dev->swdev_ops; struct net_device *lower_dev; @@ -253,7 +253,7 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) return dev; netdev_for_each_lower_dev(dev, lower_dev, iter) { - port_dev = netdev_switch_get_lowest_dev(lower_dev); + port_dev = switchdev_get_lowest_dev(lower_dev); if (port_dev) return port_dev; } @@ -261,7 +261,7 @@ static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev) return NULL; } -static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) +static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) { struct netdev_phys_item_id psid; struct netdev_phys_item_id prev_psid; @@ -276,11 +276,11 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) if (!nh->nh_dev) return NULL; - dev = netdev_switch_get_lowest_dev(nh->nh_dev); + dev = switchdev_get_lowest_dev(nh->nh_dev); if (!dev) return NULL; - if (netdev_switch_parent_id_get(dev, &psid)) + if (switchdev_parent_id_get(dev, &psid)) return NULL; if (nhsel > 0) { @@ -297,7 +297,7 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) } /** - * netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch + * switchdev_fib_ipv4_add - Add IPv4 route entry to switch * * @dst: route's IPv4 destination address * @dst_len: destination address length (prefix length) @@ -309,8 +309,8 @@ static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi) * * Add IPv4 route entry to switch device. 
*/ -int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 nlflags, u32 tb_id) +int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 nlflags, u32 tb_id) { struct net_device *dev; const struct swdev_ops *ops; @@ -328,7 +328,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, if (fi->fib_net->ipv4.fib_offload_disabled) return 0; - dev = netdev_switch_get_dev_by_nhs(fi); + dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; ops = dev->swdev_ops; @@ -343,10 +343,10 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, return err; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_add); /** - * netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch + * switchdev_fib_ipv4_del - Delete IPv4 route entry from switch * * @dst: route's IPv4 destination address * @dst_len: destination address length (prefix length) @@ -357,8 +357,8 @@ EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_add); * * Delete IPv4 route entry from switch device. */ -int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) +int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id) { struct net_device *dev; const struct swdev_ops *ops; @@ -367,7 +367,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, if (!(fi->fib_flags & RTNH_F_EXTERNAL)) return 0; - dev = netdev_switch_get_dev_by_nhs(fi); + dev = switchdev_get_dev_by_nhs(fi); if (!dev) return 0; ops = dev->swdev_ops; @@ -381,14 +381,14 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, return err; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_del); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_del); /** - * netdev_switch_fib_ipv4_abort - Abort an IPv4 FIB operation + * switchdev_fib_ipv4_abort - Abort an IPv4 FIB operation * * @fi: route FIB info structure */ -void netdev_switch_fib_ipv4_abort(struct fib_info *fi) +void switchdev_fib_ipv4_abort(struct fib_info *fi) { /* There was a problem installing this route to the offload * device. For now, until we come up with more refined * policy handling, abruptly end the fib offload for * the entire net by flushing offload device(s) of all * routes and turning off offload for the net. @@ -401,4 +401,4 @@ void netdev_switch_fib_ipv4_abort(struct fib_info *fi) fib_flush_external(fi->fib_net); fi->fib_net->ipv4.fib_offload_disabled = true; } -EXPORT_SYMBOL_GPL(netdev_switch_fib_ipv4_abort); +EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); -- cgit v1.2.3 From f8e20a9f87d33865cc1d67f13da0db8d457fc3c9 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:47:49 -0700 Subject: switchdev: convert parent_id_get to switchdev attr get Switch ID is just a gettable port attribute. Convert switchdev op switchdev_parent_id_get to a switchdev attr. Note: for sysfs and netlink interfaces, SWITCHDEV_ATTR_PORT_PARENT_ID is called with SWITCHDEV_F_NO_RECURSE to limit switch ID user-visibility to only port netdevs. So when a port is stacked under bond/bridge, the user can only query switch id via the switch ports, but not via the upper devices. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/rocker/rocker.c | 17 +++++++++++----- include/net/switchdev.h | 18 ++++------------- net/core/net-sysfs.c | 10 +++++++--- net/core/rtnetlink.c | 9 ++++++--- net/dsa/slave.c | 16 ++++++++++----- net/switchdev/switchdev.c | 38 +++++++++++------------------------- 6 files changed, 51 insertions(+), 57 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 7473874daf1f..24282991d9fd 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4221,14 +4221,21 @@ static const struct net_device_ops rocker_port_netdev_ops = { * swdev interface ********************/ -static int rocker_port_switchdev_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +static int rocker_port_attr_get(struct net_device *dev, + struct switchdev_attr *attr) { struct rocker_port *rocker_port = netdev_priv(dev); struct rocker *rocker = rocker_port->rocker; - psid->id_len = sizeof(rocker->hw.id); - memcpy(&psid->id, &rocker->hw.id, psid->id_len); + switch (attr->id) { + case SWITCHDEV_ATTR_PORT_PARENT_ID: + attr->ppid.id_len = sizeof(rocker->hw.id); + memcpy(&attr->ppid.id, &rocker->hw.id, attr->ppid.id_len); + break; + default: + return -EOPNOTSUPP; + } + return 0; } @@ -4266,7 +4273,7 @@ static int rocker_port_switchdev_fib_ipv4_del(struct net_device *dev, } static const struct switchdev_ops rocker_port_switchdev_ops = { - .switchdev_parent_id_get = rocker_port_switchdev_parent_id_get, + .switchdev_port_attr_get = rocker_port_attr_get, .switchdev_port_stp_update = rocker_port_switchdev_port_stp_update, .switchdev_fib_ipv4_add = rocker_port_switchdev_fib_ipv4_add, .switchdev_fib_ipv4_del = rocker_port_switchdev_fib_ipv4_del, diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 282043844bcf..93316e7ab372 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -25,12 +25,16 @@ enum switchdev_trans { enum switchdev_attr_id { SWITCHDEV_ATTR_UNDEFINED, + SWITCHDEV_ATTR_PORT_PARENT_ID, }; struct switchdev_attr { enum switchdev_attr_id id; enum switchdev_trans trans; u32 flags; + union { + struct netdev_phys_item_id ppid; /* PORT_PARENT_ID */ + }; }; struct fib_info; @@ -38,10 +42,6 @@ struct fib_info; /** * struct switchdev_ops - switchdev operations * - * @switchdev_parent_id_get: Called to get an ID of the switch chip this port - * is part of. If driver implements this, it indicates that it - * represents a port of a switch chip. - * * @switchdev_port_attr_get: Get a port attribute (see switchdev_attr). * * @switchdev_port_attr_set: Set a port attribute (see switchdev_attr). @@ -54,8 +54,6 @@ struct fib_info; * @switchdev_fib_ipv4_del: Called to delete IPv4 route from switch device. 
*/ struct switchdev_ops { - int (*switchdev_parent_id_get)(struct net_device *dev, - struct netdev_phys_item_id *psid); int (*switchdev_port_attr_get)(struct net_device *dev, struct switchdev_attr *attr); int (*switchdev_port_attr_set)(struct net_device *dev, @@ -93,8 +91,6 @@ switchdev_notifier_info_to_dev(const struct switchdev_notifier_info *info) #ifdef CONFIG_NET_SWITCHDEV -int switchdev_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid); int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr); int switchdev_port_attr_set(struct net_device *dev, @@ -120,12 +116,6 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi); #else -static inline int switchdev_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) -{ - return -EOPNOTSUPP; -} - static inline int switchdev_port_attr_get(struct net_device *dev, struct switchdev_attr *attr) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index be86a7ce9282..5a9ce96f6d27 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -458,11 +458,15 @@ static ssize_t phys_switch_id_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; - ret = switchdev_parent_id_get(netdev, &ppid); + ret = switchdev_port_attr_get(netdev, &attr); if (!ret) - ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + ret = sprintf(buf, "%*phN\n", attr.ppid.id_len, + attr.ppid.id); } rtnl_unlock(); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fcd41fcd7e70..c6c6b2c34926 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1004,16 +1004,19 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; - struct netdev_phys_item_id psid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; - err = switchdev_parent_id_get(dev, &psid); + err = switchdev_port_attr_get(dev, &attr); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } - if (nla_put(skb, IFLA_PHYS_SWITCH_ID, psid.id_len, psid.id)) + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.ppid.id_len, attr.ppid.id)) return -EMSGSIZE; return 0; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1546acf6ebd3..de705b674ac9 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -382,14 +382,20 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev) return ret; } -static int dsa_slave_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) +static int dsa_slave_port_attr_get(struct net_device *dev, + struct switchdev_attr *attr) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - psid->id_len = sizeof(ds->index); - memcpy(&psid->id, &ds->index, psid->id_len); + switch (attr->id) { + case SWITCHDEV_ATTR_PORT_PARENT_ID: + attr->ppid.id_len = sizeof(ds->index); + memcpy(&attr->ppid.id, &ds->index, attr->ppid.id_len); + break; + default: + return -EOPNOTSUPP; + } return 0; } @@ -676,7 +682,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { }; static const struct switchdev_ops dsa_slave_switchdev_ops = { - .switchdev_parent_id_get = dsa_slave_parent_id_get, + .switchdev_port_attr_get = dsa_slave_port_attr_get, .switchdev_port_stp_update = dsa_slave_stp_update, }; diff --git a/net/switchdev/switchdev.c 
b/net/switchdev/switchdev.c index 8f47187dc185..117fd0797abd 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -18,24 +18,6 @@ #include #include -/** - * switchdev_parent_id_get - Get ID of a switch - * @dev: port device - * @psid: switch ID - * - * Get ID of a switch this port is part of. - */ -int switchdev_parent_id_get(struct net_device *dev, - struct netdev_phys_item_id *psid) -{ - const struct switchdev_ops *ops = dev->switchdev_ops; - - if (!ops || !ops->switchdev_parent_id_get) - return -EOPNOTSUPP; - return ops->switchdev_parent_id_get(dev, psid); -} -EXPORT_SYMBOL_GPL(switchdev_parent_id_get); - /** * switchdev_port_attr_get - Get port attribute * @@ -414,11 +396,10 @@ static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) struct list_head *iter; /* Recusively search down until we find a sw port dev. - * (A sw port dev supports switchdev_parent_id_get). + * (A sw port dev supports switchdev_port_attr_get). */ - if (dev->features & NETIF_F_HW_SWITCH_OFFLOAD && - ops && ops->switchdev_parent_id_get) + if (ops && ops->switchdev_port_attr_get) return dev; netdev_for_each_lower_dev(dev, lower_dev, iter) { @@ -432,8 +413,10 @@ static struct net_device *switchdev_get_lowest_dev(struct net_device *dev) static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) { - struct netdev_phys_item_id psid; - struct netdev_phys_item_id prev_psid; + struct switchdev_attr attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + }; + struct switchdev_attr prev_attr; struct net_device *dev = NULL; int nhsel; @@ -449,17 +432,18 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) if (!dev) return NULL; - if (switchdev_parent_id_get(dev, &psid)) + if (switchdev_port_attr_get(dev, &attr)) return NULL; if (nhsel > 0) { - if (prev_psid.id_len != psid.id_len) + if (prev_attr.ppid.id_len != attr.ppid.id_len) return NULL; - if (memcmp(prev_psid.id, psid.id, psid.id_len)) + if (memcmp(prev_attr.ppid.id, attr.ppid.id, + attr.ppid.id_len)) return NULL; } - prev_psid = psid; + prev_attr = attr; } return dev; -- cgit v1.2.3 From 7889cbee8357aaed85898d028829dfb4f75bae2c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sun, 10 May 2015 09:48:07 -0700 Subject: switchdev: remove NETIF_F_HW_SWITCH_OFFLOAD feature flag Roopa said remove the feature flag for this series and she'll work on bringing it back if needed at a later date. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 5 +---- drivers/net/ethernet/rocker/rocker.c | 3 +-- drivers/net/team/team.c | 2 +- include/linux/netdev_features.h | 5 +---- net/core/ethtool.c | 1 - 5 files changed, 4 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 147642dae3d0..a2e25de98bde 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1015,10 +1015,7 @@ static netdev_features_t bond_fix_features(struct net_device *dev, netdev_features_t mask; struct slave *slave; - /* If any slave has the offload feature flag set, - * set the offload flag on the bond. 
- */ - mask = features | NETIF_F_HW_SWITCH_OFFLOAD; + mask = features; features &= ~NETIF_F_ONE_FOR_ALL; features |= NETIF_F_ALL_FOR_ALL; diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 1c906504cfe2..9715e3322706 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4906,8 +4906,7 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number) rocker_carrier_init(rocker_port); dev->features |= NETIF_F_NETNS_LOCAL | - NETIF_F_HW_VLAN_CTAG_FILTER | - NETIF_F_HW_SWITCH_OFFLOAD; + NETIF_F_HW_VLAN_CTAG_FILTER; err = register_netdev(dev); if (err) { diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index bda32be596ef..1ec035a53c3d 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1924,7 +1924,7 @@ static netdev_features_t team_fix_features(struct net_device *dev, struct team *team = netdev_priv(dev); netdev_features_t mask; - mask = features | NETIF_F_HW_SWITCH_OFFLOAD; + mask = features; features &= ~NETIF_F_ONE_FOR_ALL; features |= NETIF_F_ALL_FOR_ALL; diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 7d59dc6ab789..9672781c593d 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -66,7 +66,6 @@ enum { NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */ NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ NETIF_F_BUSY_POLL_BIT, /* Busy poll */ - NETIF_F_HW_SWITCH_OFFLOAD_BIT, /* HW switch offload */ /* * Add your fresh new feature above and remember to update @@ -125,7 +124,6 @@ enum { #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) #define NETIF_F_BUSY_POLL __NETIF_F(BUSY_POLL) -#define NETIF_F_HW_SWITCH_OFFLOAD __NETIF_F(HW_SWITCH_OFFLOAD) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ @@ -161,8 +159,7 @@ enum { */ #define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ NETIF_F_SG | NETIF_F_HIGHDMA | \ - NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED | \ - NETIF_F_HW_SWITCH_OFFLOAD) + NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED) /* * If one device doesn't support one of these features, then disable it diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 1d00b8922902..eb0c3ace7458 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -98,7 +98,6 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_BUSY_POLL_BIT] = "busy-poll", - [NETIF_F_HW_SWITCH_OFFLOAD_BIT] = "hw-switch-offload", }; static const char -- cgit v1.2.3 From a2029240e5836e73ebcc1a8ddb8c22d636f89c9a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 11 May 2015 21:17:53 +0200 Subject: net: deinline netif_tx_stop_all_queues(), remove WARN_ON in netif_tx_stop_queue() These functions compile to 60 bytes of machine code each. With this .config: http://busybox.net/~vda/kernel_config there are 617 calls of netif_tx_stop_queue() and 49 calls of netif_tx_stop_all_queues() in vmlinux. To fix this, remove WARN_ON in netif_tx_stop_queue() as suggested by davem, and deinline netif_tx_stop_all_queues(). 
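To see why the per-call-site cost matters, consider the pattern almost every multiqueue driver uses in its transmit path. The sketch below is illustrative only: the foo_* driver, its foo_tx_ring_space() helper and the watermark are invented for the example, while netif_tx_stop_queue(), netdev_get_tx_queue() and skb_get_queue_mapping() are the real kernel APIs.

#include <linux/netdevice.h>

/* Hypothetical private state; a real driver tracks ring occupancy
 * in its own way.
 */
struct foo_priv {
	unsigned int tx_free;	/* free TX descriptors */
};

static unsigned int foo_tx_ring_space(const struct foo_priv *priv)
{
	return priv->tx_free;
}

/* Hypothetical xmit path. Before this patch, every call site like this
 * inlined the ~60-byte netif_tx_stop_queue() body, WARN_ON() included;
 * with the WARN_ON() gone it reduces to a single locked set_bit().
 */
static netdev_tx_t foo_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct foo_priv *priv = netdev_priv(dev);
	struct netdev_queue *txq;

	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
	if (foo_tx_ring_space(priv) < MAX_SKB_FRAGS + 1) {
		/* ring nearly full: stop this queue until TX completion */
		netif_tx_stop_queue(txq);
		return NETDEV_TX_BUSY;
	}
	/* ... post descriptors and ring the doorbell ... */
	return NETDEV_TX_OK;
}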
Change in code size is about 20k: text data bss dec hex filename 82426986 22255416 20627456 125309858 77813a2 vmlinux.before 82406248 22255416 20627456 125289120 777c2a0 vmlinux gcc-4.7.2 still creates a deinlined version of netif_tx_stop_queue sometimes: $ nm --size-sort vmlinux | grep netif_tx_stop_queue | wc -l 190 ffffffff81b558a8 <netif_tx_stop_queue>: ffffffff81b558a8: 55 push %rbp ffffffff81b558a9: 48 89 e5 mov %rsp,%rbp ffffffff81b558ac: f0 80 8f e0 01 00 00 lock orb $0x1,0x1e0(%rdi) ffffffff81b558b3: 01 ffffffff81b558b4: 5d pop %rbp ffffffff81b558b5: c3 retq This needs additional fixing. Signed-off-by: Denys Vlasenko CC: Alexei Starovoitov CC: Alexander Duyck CC: Joe Perches CC: David S. Miller CC: Jiri Pirko CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: netfilter-devel@vger.kernel.org Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 +------------- net/core/dev.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2b39235b9f13..fa57915f440c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2559,10 +2559,6 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev) static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { - if (WARN_ON(!dev_queue)) { - pr_info("netif_stop_queue() cannot be called before register_netdev()\n"); - return; - } set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } @@ -2578,15 +2574,7 @@ static inline void netif_stop_queue(struct net_device *dev) netif_tx_stop_queue(netdev_get_tx_queue(dev, 0)); } -static inline void netif_tx_stop_all_queues(struct net_device *dev) -{ - unsigned int i; - - for (i = 0; i < dev->num_tx_queues; i++) { - struct netdev_queue *txq = netdev_get_tx_queue(dev, i); - netif_tx_stop_queue(txq); - } -} +void netif_tx_stop_all_queues(struct net_device *dev); static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue) { diff --git a/net/core/dev.c b/net/core/dev.c index e5f77c40bbd1..fd012bbe0486 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6301,6 +6301,17 @@ static int netif_alloc_netdev_queues(struct net_device *dev) return 0; } +void netif_tx_stop_all_queues(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + netif_tx_stop_queue(txq); + } +} +EXPORT_SYMBOL(netif_tx_stop_all_queues); + /** * register_netdevice - register a network device * @dev: device to register -- cgit v1.2.3 From 9eea92226407e7a117ef1ceef45380ebd000a0e2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 11 May 2015 15:19:48 -0700 Subject: pktgen: fix packet generation pkt_dev->last_ok was not set properly, so after the first burst pktgen, instead of allocating a new packet, would reuse the old one and advance eth_type_trans further, which meant the stack would see very short bogus packets. Fixes: 62f64aed622b ("pktgen: introduce xmit_mode '<start_xmit|netif_receive>'") Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- net/core/pktgen.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8f2687da058e..62f979984a23 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1189,6 +1189,16 @@ static ssize_t pktgen_if_write(struct file *file, return -ENOTSUPP; pkt_dev->xmit_mode = M_NETIF_RECEIVE; + + /* make sure new packet is allocated every time + * pktgen_xmit() is called + */ + pkt_dev->last_ok = 1; + + /* override clone_skb if user passed default value + * at module loading time + */ + pkt_dev->clone_skb = 0; } else { sprintf(pg_result, "xmit_mode -:%s:- unknown\nAvailable modes: %s", @@ -3415,7 +3425,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) /* get out of the loop and wait * until skb is consumed */ - pkt_dev->last_ok = 1; break; } /* skb was 'freed' by stack, so clean few -- cgit v1.2.3 From 42275bd8fcb351f951781d8882f359d25976824b Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Wed, 13 May 2015 11:16:50 -0700 Subject: switchdev: don't use anonymous union on switchdev attr/obj structs Older gcc versions (e.g. gcc version 4.4.6) don't like anonymous unions, which was causing build issues on the newly added switchdev attr/obj structs. Fix this by using a named union in the structs. Signed-off-by: Scott Feldman Reported-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 18 ++++++++--------- include/net/switchdev.h | 4 ++-- net/bridge/br_stp.c | 2 +- net/core/net-sysfs.c | 4 ++-- net/core/rtnetlink.c | 3 ++- net/dsa/slave.c | 6 +++--- net/switchdev/switchdev.c | 39 ++++++++++++++++++------------------ 7 files changed, 39 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index ca533936658b..f0a9cb44be6b 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4339,11 +4339,11 @@ static int rocker_port_attr_get(struct net_device *dev, switch (attr->id) { case SWITCHDEV_ATTR_PORT_PARENT_ID: - attr->ppid.id_len = sizeof(rocker->hw.id); - memcpy(&attr->ppid.id, &rocker->hw.id, attr->ppid.id_len); + attr->u.ppid.id_len = sizeof(rocker->hw.id); + memcpy(&attr->u.ppid.id, &rocker->hw.id, attr->u.ppid.id_len); break; case SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS: - attr->brport_flags = rocker_port->brport_flags; + attr->u.brport_flags = rocker_port->brport_flags; break; default: return -EOPNOTSUPP; @@ -4400,11 +4400,11 @@ static int rocker_port_attr_set(struct net_device *dev, switch (attr->id) { case SWITCHDEV_ATTR_PORT_STP_STATE: err = rocker_port_stp_update(rocker_port, attr->trans, - attr->stp_state); + attr->u.stp_state); break; case SWITCHDEV_ATTR_PORT_BRIDGE_FLAGS: err = rocker_port_brport_flags_set(rocker_port, attr->trans, - attr->brport_flags); + attr->u.brport_flags); break; default: err = -EOPNOTSUPP; @@ -4466,10 +4466,10 @@ static int rocker_port_obj_add(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_PORT_VLAN: err = rocker_port_vlans_add(rocker_port, obj->trans, - &obj->vlan); + &obj->u.vlan); break; case SWITCHDEV_OBJ_IPV4_FIB: - fib4 = &obj->ipv4_fib; + fib4 = &obj->u.ipv4_fib; err = rocker_port_fib_ipv4(rocker_port, obj->trans, htonl(fib4->dst), fib4->dst_len, fib4->fi, fib4->tb_id, 0); @@ -4520,10 +4520,10 @@ static int rocker_port_obj_del(struct net_device *dev, switch (obj->id) { case SWITCHDEV_OBJ_PORT_VLAN: - err = 
rocker_port_vlans_del(rocker_port, &obj->u.vlan); break; case SWITCHDEV_OBJ_IPV4_FIB: - fib4 = &obj->ipv4_fib; + fib4 = &obj->u.ipv4_fib; err = rocker_port_fib_ipv4(rocker_port, SWITCHDEV_TRANS_NONE, htonl(fib4->dst), fib4->dst_len, fib4->fi, fib4->tb_id, diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 9f9a7cc573c3..ea5b1c230d3d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -38,7 +38,7 @@ struct switchdev_attr { struct netdev_phys_item_id ppid; /* PORT_PARENT_ID */ u8 stp_state; /* PORT_STP_STATE */ unsigned long brport_flags; /* PORT_BRIDGE_FLAGS */ - }; + } u; }; struct fib_info; @@ -67,7 +67,7 @@ struct switchdev_obj { u32 nlflags; u32 tb_id; } ipv4_fib; - }; + } u; }; /** diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index b9300da31565..45f1ff113af9 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -41,7 +41,7 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) { struct switchdev_attr attr = { .id = SWITCHDEV_ATTR_PORT_STP_STATE, - .stp_state = state, + .u.stp_state = state, }; int err; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5a9ce96f6d27..18b34d771ed4 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -465,8 +465,8 @@ static ssize_t phys_switch_id_show(struct device *dev, ret = switchdev_port_attr_get(netdev, &attr); if (!ret) - ret = sprintf(buf, "%*phN\n", attr.ppid.id_len, - attr.ppid.id); + ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len, + attr.u.ppid.id); } rtnl_unlock(); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c6c6b2c34926..141ccc357e2e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1016,7 +1016,8 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) return err; } - if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.ppid.id_len, attr.ppid.id)) + if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len, + attr.u.ppid.id)) return -EMSGSIZE; return 0; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3fb5210e318c..04ffad311704 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -353,7 +353,7 @@ static int dsa_slave_port_attr_set(struct net_device *dev, switch (attr->id) { case SWITCHDEV_ATTR_PORT_STP_STATE: if (attr->trans == SWITCHDEV_TRANS_COMMIT) - ret = dsa_slave_stp_update(dev, attr->stp_state); + ret = dsa_slave_stp_update(dev, attr->u.stp_state); break; default: ret = -EOPNOTSUPP; @@ -408,8 +408,8 @@ static int dsa_slave_port_attr_get(struct net_device *dev, switch (attr->id) { case SWITCHDEV_ATTR_PORT_PARENT_ID: - attr->ppid.id_len = sizeof(ds->index); - memcpy(&attr->ppid.id, &ds->index, attr->ppid.id_len); + attr->u.ppid.id_len = sizeof(ds->index); + memcpy(&attr->u.ppid.id, &ds->index, attr->u.ppid.id_len); break; default: return -EOPNOTSUPP; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 77f1b6e3f78e..0409f9b5bdbc 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -383,7 +383,7 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return err; return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, - attr.brport_flags, mask, nlflags); + attr.u.brport_flags, mask, nlflags); } EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink); @@ -402,9 +402,9 @@ static int switchdev_port_br_setflag(struct net_device *dev, return err; if (flag) - attr.brport_flags |= brport_flag; + attr.u.brport_flags |= brport_flag; else - attr.brport_flags &= ~brport_flag; + attr.u.brport_flags &= ~brport_flag; return switchdev_port_attr_set(dev, 
&attr); } @@ -466,6 +466,7 @@ static int switchdev_port_br_afspec(struct net_device *dev, struct switchdev_obj obj = { .id = SWITCHDEV_OBJ_PORT_VLAN, }; + struct switchdev_obj_vlan *vlan = &obj.u.vlan; int rem; int err; @@ -475,30 +476,30 @@ static int switchdev_port_br_afspec(struct net_device *dev, if (nla_len(attr) != sizeof(struct bridge_vlan_info)) return -EINVAL; vinfo = nla_data(attr); - obj.vlan.flags = vinfo->flags; + vlan->flags = vinfo->flags; if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) { - if (obj.vlan.vid_start) + if (vlan->vid_start) return -EINVAL; - obj.vlan.vid_start = vinfo->vid; + vlan->vid_start = vinfo->vid; } else if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_END) { - if (!obj.vlan.vid_start) + if (!vlan->vid_start) return -EINVAL; - obj.vlan.vid_end = vinfo->vid; - if (obj.vlan.vid_end <= obj.vlan.vid_start) + vlan->vid_end = vinfo->vid; + if (vlan->vid_end <= vlan->vid_start) return -EINVAL; err = f(dev, &obj); if (err) return err; - memset(&obj.vlan, 0, sizeof(obj.vlan)); + memset(vlan, 0, sizeof(*vlan)); } else { - if (obj.vlan.vid_start) + if (vlan->vid_start) return -EINVAL; - obj.vlan.vid_start = vinfo->vid; - obj.vlan.vid_end = vinfo->vid; + vlan->vid_start = vinfo->vid; + vlan->vid_end = vinfo->vid; err = f(dev, &obj); if (err) return err; - memset(&obj.vlan, 0, sizeof(obj.vlan)); + memset(vlan, 0, sizeof(*vlan)); } } @@ -613,10 +614,10 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) return NULL; if (nhsel > 0) { - if (prev_attr.ppid.id_len != attr.ppid.id_len) + if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len) return NULL; - if (memcmp(prev_attr.ppid.id, attr.ppid.id, - attr.ppid.id_len)) + if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id, + attr.u.ppid.id_len)) return NULL; } @@ -644,7 +645,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi, { struct switchdev_obj fib_obj = { .id = SWITCHDEV_OBJ_IPV4_FIB, - .ipv4_fib = { + .u.ipv4_fib = { .dst = dst, .dst_len = dst_len, .fi = fi, @@ -698,7 +699,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi, { struct switchdev_obj fib_obj = { .id = SWITCHDEV_OBJ_IPV4_FIB, - .ipv4_fib = { + .u.ipv4_fib = { .dst = dst, .dst_len = dst_len, .fi = fi, -- cgit v1.2.3 From 1bd758eb1cab2fa5b71a23f9e5d3c8076f4ed650 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:07 +0200 Subject: net: change name of flow_dissector header to match the .c file name add couple of empty lines on the way. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/cisco/enic/enic_clsf.c | 2 +- include/linux/skbuff.h | 2 +- include/net/flow_dissector.h | 68 +++++++++++++++++++++++++++++ include/net/flow_keys.h | 61 -------------------------- include/net/ip.h | 2 +- include/net/ipv6.h | 2 +- net/core/flow_dissector.c | 2 +- net/sched/cls_flow.c | 2 +- net/sched/sch_choke.c | 2 +- 10 files changed, 76 insertions(+), 69 deletions(-) create mode 100644 include/net/flow_dissector.h delete mode 100644 include/net/flow_keys.h (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index a2e25de98bde..fe7c38721775 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -76,7 +76,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index 0be6850be8a2..380f04903a36 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include "enic_res.h" #include "enic_clsf.h" diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c0b574a414e7..e35c2b13d434 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include /* A. Checksumming of received packets by device. * diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h new file mode 100644 index 000000000000..5e99a7b3963c --- /dev/null +++ b/include/net/flow_dissector.h @@ -0,0 +1,68 @@ +#ifndef _NET_FLOW_DISSECTOR_H +#define _NET_FLOW_DISSECTOR_H + +/* struct flow_keys: + * @src: source ip address in case of IPv4 + * For IPv6 it contains 32bit hash of src address + * @dst: destination ip address in case of IPv4 + * For IPv6 it contains 32bit hash of dst address + * @ports: port numbers of Transport header + * port16[0]: src port number + * port16[1]: dst port number + * @thoff: Transport header offset + * @n_proto: Network header protocol (eg. IPv4/IPv6) + * @ip_proto: Transport header protocol (eg. TCP/UDP) + * All the members, except thoff, are in network byte order. + */ +struct flow_keys { + /* (src,dst) must be grouped, in the same way than in IP header */ + __be32 src; + __be32 dst; + union { + __be32 ports; + __be16 port16[2]; + }; + u16 thoff; + __be16 n_proto; + u8 ip_proto; +}; + +bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + void *data, __be16 proto, int nhoff, int hlen); + +static inline bool skb_flow_dissect(const struct sk_buff *skb, + struct flow_keys *flow) +{ + return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0); +} + +__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + void *data, int hlen_proto); + +static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, + int thoff, u8 ip_proto) +{ + return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); +} + +u32 flow_hash_from_keys(struct flow_keys *keys); + +unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len, + __be16 protocol); + +/* struct flow_keys_digest: + * + * This structure is used to hold a digest of the full flow keys. This is a + * larger "hash" of a flow to allow definitively matching specific flows where + * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so + * that it can by used in CB of skb (see sch_choke for an example). 
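To make the digest comment above concrete, here is a sketch of the pattern it describes, roughly what sch_choke does; the foo_ cb layout is hypothetical and error handling is omitted:

	struct foo_skb_cb {
		struct flow_keys_digest keys;	/* 16 bytes, fits in skb->cb */
	};

	static bool foo_same_flow(struct sk_buff *skb1, struct sk_buff *skb2)
	{
		struct foo_skb_cb *cb1 = (struct foo_skb_cb *)skb1->cb;
		struct foo_skb_cb *cb2 = (struct foo_skb_cb *)skb2->cb;
		struct flow_keys temp;

		skb_flow_dissect(skb1, &temp);
		make_flow_keys_digest(&cb1->keys, &temp);
		skb_flow_dissect(skb2, &temp);
		make_flow_keys_digest(&cb2->keys, &temp);

		/* digests compare exactly, unlike the lossy 32-bit skb->hash */
		return !memcmp(&cb1->keys, &cb2->keys, sizeof(cb1->keys));
	}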
+ */ +#define FLOW_KEYS_DIGEST_LEN 16 +struct flow_keys_digest { + u8 data[FLOW_KEYS_DIGEST_LEN]; +}; + +void make_flow_keys_digest(struct flow_keys_digest *digest, + const struct flow_keys *flow); + +#endif diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h deleted file mode 100644 index 6d6ef626811a..000000000000 --- a/include/net/flow_keys.h +++ /dev/null @@ -1,61 +0,0 @@ -#ifndef _NET_FLOW_KEYS_H -#define _NET_FLOW_KEYS_H - -/* struct flow_keys: - * @src: source ip address in case of IPv4 - * For IPv6 it contains 32bit hash of src address - * @dst: destination ip address in case of IPv4 - * For IPv6 it contains 32bit hash of dst address - * @ports: port numbers of Transport header - * port16[0]: src port number - * port16[1]: dst port number - * @thoff: Transport header offset - * @n_proto: Network header protocol (eg. IPv4/IPv6) - * @ip_proto: Transport header protocol (eg. TCP/UDP) - * All the members, except thoff, are in network byte order. - */ -struct flow_keys { - /* (src,dst) must be grouped, in the same way than in IP header */ - __be32 src; - __be32 dst; - union { - __be32 ports; - __be16 port16[2]; - }; - u16 thoff; - __be16 n_proto; - u8 ip_proto; -}; - -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, - void *data, __be16 proto, int nhoff, int hlen); -static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow) -{ - return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0); -} -__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen_proto); -static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto) -{ - return __skb_flow_get_ports(skb, thoff, ip_proto, NULL, 0); -} -u32 flow_hash_from_keys(struct flow_keys *keys); -unsigned int flow_get_hlen(const unsigned char *data, unsigned int max_len, - __be16 protocol); - -/* struct flow_keys_digest: - * - * This structure is used to hold a digest of the full flow keys. This is a - * larger "hash" of a flow to allow definitively matching specific flows where - * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so - * that it can by used in CB of skb (see sch_choke for an example). 
- */ -#define FLOW_KEYS_DIGEST_LEN 16 -struct flow_keys_digest { - u8 data[FLOW_KEYS_DIGEST_LEN]; -}; - -void make_flow_keys_digest(struct flow_keys_digest *digest, - const struct flow_keys *flow); - -#endif diff --git a/include/net/ip.h b/include/net/ip.h index d14af7edd197..562eb653aebc 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -31,7 +31,7 @@ #include #include #include -#include +#include struct sock; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 53d25ef1699a..9932b86394a9 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #define SIN6_LEN_RFC2133 24 diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d3acc4dff4ae..eaa86b539221 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include /* copy saddr & daddr, possibly using 64bit load/store diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index a620c4e288a5..4c5a92298906 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index dfe3da75594c..2624e991a2d1 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* CHOKe stateless AQM for fair bandwidth allocation -- cgit v1.2.3 From 10b89ee43e849544eddfe34e535341fc077464ec Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:09 +0200 Subject: net: move *skb_get_poff declarations into correct header Since these functions are defined in flow_dissector.c, move header declarations from skbuff.h into flow_dissector.h Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 4 ---- include/net/flow_dissector.h | 3 +++ net/core/filter.c | 1 + net/ethernet/eth.c | 1 + 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e35c2b13d434..17607ab9e7a2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3424,10 +3424,6 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb, unsigned int transport_len, __sum16(*skb_chkf)(struct sk_buff *skb)); -u32 skb_get_poff(const struct sk_buff *skb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, - const struct flow_keys *keys, int hlen); - /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 118ae6926a72..4570ccafe59d 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -46,6 +46,9 @@ static inline __be32 skb_flow_get_ports(const struct sk_buff *skb, } u32 flow_hash_from_keys(struct flow_keys *keys); +u32 skb_get_poff(const struct sk_buff *skb); +u32 __skb_get_poff(const struct sk_buff *skb, void *data, + const struct flow_keys *keys, int hlen); /* struct flow_keys_digest: * diff --git a/net/core/filter.c b/net/core/filter.c index a831f193e2c7..6805717be614 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9045e2a1108f..9332a0ab0698 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -58,6 +58,7 @@ #include #include #include +#include #include __setup("ether=", netdev_boot_setup); -- cgit v1.2.3 From d4fd32757176d1b03533770ef75aa2b781fa5667 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:10 +0200 Subject: flow_dissector: fix doc for __skb_get_hash and remove couple of empty lines Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index eaa86b539221..0d9bc3a586ba 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -337,8 +337,11 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, } EXPORT_SYMBOL(make_flow_keys_digest); -/* - * __skb_get_hash: calculate a flow hash based on src/dst addresses +/** + * __skb_get_hash: calculate a flow hash + * @skb: sk_buff to calculate flow hash from + * + * This function calculates a flow hash based on src/dst addresses * and src/dst port numbers. Sets hash in skb to non-zero hash value * on success, zero indicates no valid hash. Also, sets l4_hash in skb * if hash is a canonical 4-tuple hash over transport ports. @@ -353,12 +356,9 @@ void __skb_get_hash(struct sk_buff *skb) hash = ___skb_get_hash(skb, &keys, hashrnd); if (!hash) return; - if (keys.ports) skb->l4_hash = 1; - skb->sw_hash = 1; - skb->hash = hash; } EXPORT_SYMBOL(__skb_get_hash); -- cgit v1.2.3 From 5605c76240aadc823e3d46ac9afde2f26fbcf019 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:12 +0200 Subject: net: move __skb_tx_hash to dev.c __skb_tx_hash function has no relation to flow_dissect so just move it to dev.c Signed-off-by: Jiri Pirko Signed-off-by: David S. 
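Returning briefly to the skb_get_poff declarations moved above: a short sketch of their typical use, truncating a clone so that only headers are kept. Here copy is a hypothetical clone of skb and error handling is omitted:

	u32 poff = skb_get_poff(skb);

	/* poff == 0 means dissection failed; keep the packet intact then */
	if (poff && poff < copy->len)
		pskb_trim(copy, poff);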
Miller --- include/linux/netdevice.h | 3 +++ include/linux/skbuff.h | 3 --- net/core/dev.c | 28 ++++++++++++++++++++++++++++ net/core/flow_dissector.c | 28 ---------------------------- 4 files changed, 31 insertions(+), 31 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cd0951c1893d..d3ed01c18247 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2832,6 +2832,9 @@ static inline int netif_set_xps_queue(struct net_device *dev, } #endif +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, + unsigned int num_tx_queues); + /* * Returns a Tx hash for the given packet when dev->real_num_tx_queues is used * as a distribution range limit for the returned value. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ae2d1b7769d8..b01c7fba7c17 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3299,9 +3299,6 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) return skb->queue_mapping != 0; } -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, - unsigned int num_tx_queues); - static inline struct sec_path *skb_sec_path(struct sk_buff *skb) { #ifdef CONFIG_XFRM diff --git a/net/core/dev.c b/net/core/dev.c index 90a568a150b4..d044d2f8532b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2350,6 +2350,34 @@ void netif_device_attach(struct net_device *dev) } EXPORT_SYMBOL(netif_device_attach); +/* + * Returns a Tx hash based on the given packet descriptor a Tx queues' number + * to be used as a distribution range. + */ +u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, + unsigned int num_tx_queues) +{ + u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; + + if (skb_rx_queue_recorded(skb)) { + hash = skb_get_rx_queue(skb); + while (unlikely(hash >= num_tx_queues)) + hash -= num_tx_queues; + return hash; + } + + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + + return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; +} +EXPORT_SYMBOL(__skb_tx_hash); + static void skb_warn_bad_offload(const struct sk_buff *skb) { static const netdev_features_t null_features = 0; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0d9bc3a586ba..07ca11d06339 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -371,34 +371,6 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); -/* - * Returns a Tx hash based on the given packet descriptor a Tx queues' number - * to be used as a distribution range. 
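Since __skb_tx_hash is now a proper dev.c export, a driver that wants to spread flows over only a subset of its rings can call it directly. A sketch with hypothetical names, assuming the four-argument ndo_select_queue signature of this era:

	static u16 foo_select_queue(struct net_device *dev, struct sk_buff *skb,
				    void *accel_priv,
				    select_queue_fallback_t fallback)
	{
		/* reserve the upper half of the rings; hash into the lower half */
		return __skb_tx_hash(dev, skb, dev->real_num_tx_queues / 2);
	}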
- */ -u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, - unsigned int num_tx_queues) -{ - u32 hash; - u16 qoffset = 0; - u16 qcount = num_tx_queues; - - if (skb_rx_queue_recorded(skb)) { - hash = skb_get_rx_queue(skb); - while (unlikely(hash >= num_tx_queues)) - hash -= num_tx_queues; - return hash; - } - - if (dev->num_tc) { - u8 tc = netdev_get_prio_tc_map(dev, skb->priority); - qoffset = dev->tc_to_txq[tc].offset; - qcount = dev->tc_to_txq[tc].count; - } - - return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; -} -EXPORT_SYMBOL(__skb_tx_hash); - u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { -- cgit v1.2.3 From 638b2a699fd3ec926d6dda2d2bd96e8f1c49e463 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:13 +0200 Subject: net: move netdev_pick_tx and dependencies to net/core/dev.c next to its user. No relation to flow_dissector so it makes no sense to have it in flow_dissector.c Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/dev.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/flow_dissector.c | 78 ----------------------------------------------- 2 files changed, 78 insertions(+), 78 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d044d2f8532b..af549062ae8e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2936,6 +2936,84 @@ int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dev_loopback_xmit); +static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) +{ +#ifdef CONFIG_XPS + struct xps_dev_maps *dev_maps; + struct xps_map *map; + int queue_index = -1; + + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_maps); + if (dev_maps) { + map = rcu_dereference( + dev_maps->cpu_map[skb->sender_cpu - 1]); + if (map) { + if (map->len == 1) + queue_index = map->queues[0]; + else + queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), + map->len)]; + if (unlikely(queue_index >= dev->real_num_tx_queues)) + queue_index = -1; + } + } + rcu_read_unlock(); + + return queue_index; +#else + return -1; +#endif +} + +static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + int queue_index = sk_tx_queue_get(sk); + + if (queue_index < 0 || skb->ooo_okay || + queue_index >= dev->real_num_tx_queues) { + int new_index = get_xps_queue(dev, skb); + if (new_index < 0) + new_index = skb_tx_hash(dev, skb); + + if (queue_index != new_index && sk && + rcu_access_pointer(sk->sk_dst_cache)) + sk_tx_queue_set(sk, new_index); + + queue_index = new_index; + } + + return queue_index; +} + +struct netdev_queue *netdev_pick_tx(struct net_device *dev, + struct sk_buff *skb, + void *accel_priv) +{ + int queue_index = 0; + +#ifdef CONFIG_XPS + if (skb->sender_cpu == 0) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + + if (dev->real_num_tx_queues != 1) { + const struct net_device_ops *ops = dev->netdev_ops; + if (ops->ndo_select_queue) + queue_index = ops->ndo_select_queue(dev, skb, accel_priv, + __netdev_pick_tx); + else + queue_index = __netdev_pick_tx(dev, skb); + + if (!accel_priv) + queue_index = netdev_cap_txqueue(dev, queue_index); + } + + skb_set_queue_mapping(skb, queue_index); + return netdev_get_tx_queue(dev, queue_index); +} + /** * __dev_queue_xmit - transmit a buffer * @skb: buffer to transmit diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 07ca11d06339..04f87237bd64 100644 --- 
a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -431,81 +431,3 @@ u32 skb_get_poff(const struct sk_buff *skb) return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } - -static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) -{ -#ifdef CONFIG_XPS - struct xps_dev_maps *dev_maps; - struct xps_map *map; - int queue_index = -1; - - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps); - if (dev_maps) { - map = rcu_dereference( - dev_maps->cpu_map[skb->sender_cpu - 1]); - if (map) { - if (map->len == 1) - queue_index = map->queues[0]; - else - queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), - map->len)]; - if (unlikely(queue_index >= dev->real_num_tx_queues)) - queue_index = -1; - } - } - rcu_read_unlock(); - - return queue_index; -#else - return -1; -#endif -} - -static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) -{ - struct sock *sk = skb->sk; - int queue_index = sk_tx_queue_get(sk); - - if (queue_index < 0 || skb->ooo_okay || - queue_index >= dev->real_num_tx_queues) { - int new_index = get_xps_queue(dev, skb); - if (new_index < 0) - new_index = skb_tx_hash(dev, skb); - - if (queue_index != new_index && sk && - rcu_access_pointer(sk->sk_dst_cache)) - sk_tx_queue_set(sk, new_index); - - queue_index = new_index; - } - - return queue_index; -} - -struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb, - void *accel_priv) -{ - int queue_index = 0; - -#ifdef CONFIG_XPS - if (skb->sender_cpu == 0) - skb->sender_cpu = raw_smp_processor_id() + 1; -#endif - - if (dev->real_num_tx_queues != 1) { - const struct net_device_ops *ops = dev->netdev_ops; - if (ops->ndo_select_queue) - queue_index = ops->ndo_select_queue(dev, skb, accel_priv, - __netdev_pick_tx); - else - queue_index = __netdev_pick_tx(dev, skb); - - if (!accel_priv) - queue_index = netdev_cap_txqueue(dev, queue_index); - } - - skb_set_queue_mapping(skb, queue_index); - return netdev_get_tx_queue(dev, queue_index); -} -- cgit v1.2.3 From 0db89b8b3243829704225e756ca7417e06812bba Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:14 +0200 Subject: flow_dissector: fix doc for skb_get_poff Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 04f87237bd64..d885e282908e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -417,8 +417,12 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data, return poff; } -/* skb_get_poff() returns the offset to the payload as far as it could - * be dissected. The main user is currently BPF, so that we can dynamically +/** + * skb_get_poff - get the offset to the payload + * @skb: sk_buff to get the payload offset from + * + * The function will get the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically * truncate packets without needing to push actual payload to the user * space and can analyze headers only, instead. */ -- cgit v1.2.3 From fbff949e3bc7f3f7d9e8b3ef4855ec7138276a25 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:15 +0200 Subject: flow_dissector: introduce programable flow_dissector Introduce dissector infrastructure which allows user to specify which parts of skb he wants to dissect. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
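Before the diff, a sketch of what this infrastructure lets a user do: declare which keys are of interest and where each one lands inside a private container struct. The foo_ names are hypothetical; note that the diff below makes the BASIC key mandatory (enforced with BUG_ON at init time):

	struct foo_keys {
		struct flow_dissector_key_basic basic;	/* mandatory */
		struct flow_dissector_key_ports ports;
	};

	static const struct flow_dissector_key foo_keys_list[] = {
		{
			.key_id = FLOW_DISSECTOR_KEY_BASIC,
			.offset = offsetof(struct foo_keys, basic),
		},
		{
			.key_id = FLOW_DISSECTOR_KEY_PORTS,
			.offset = offsetof(struct foo_keys, ports),
		},
	};

	static struct flow_dissector foo_dissector;

	skb_flow_dissector_init(&foo_dissector, foo_keys_list,
				ARRAY_SIZE(foo_keys_list));

Once the follow-on patch wires __skb_flow_dissect up to this, dissecting an skb through foo_dissector fills only the two requested keys in a zeroed struct foo_keys.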
Miller --- include/net/flow_dissector.h | 61 ++++++++++++++++++++++++++++++++++++++++++++ net/core/flow_dissector.c | 48 ++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index e4ee761ca8c0..20239e807f77 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -1,6 +1,64 @@ #ifndef _NET_FLOW_DISSECTOR_H #define _NET_FLOW_DISSECTOR_H +/** + * struct flow_dissector_key_basic: + * @thoff: Transport header offset + * @n_proto: Network header protocol (eg. IPv4/IPv6) + * @ip_proto: Transport header protocol (eg. TCP/UDP) + */ +struct flow_dissector_key_basic { + u16 thoff; + __be16 n_proto; + u8 ip_proto; +}; + +/** + * struct flow_dissector_key_addrs: + * @src: source ip address in case of IPv4 + * For IPv6 it contains 32bit hash of src address + * @dst: destination ip address in case of IPv4 + * For IPv6 it contains 32bit hash of dst address + */ +struct flow_dissector_key_addrs { + /* (src,dst) must be grouped, in the same way than in IP header */ + __be32 src; + __be32 dst; +}; + +/** + * flow_dissector_key_tp_ports: + * @ports: port numbers of Transport header + * port16[0]: src port number + * port16[1]: dst port number + */ +struct flow_dissector_key_ports { + union { + __be32 ports; + __be16 port16[2]; + }; +}; + +enum flow_dissector_key_id { + FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ + FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */ + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */ + FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ + + FLOW_DISSECTOR_KEY_MAX, +}; + +struct flow_dissector_key { + enum flow_dissector_key_id key_id; + size_t offset; /* offset of struct flow_dissector_key_* + in target the struct */ +}; + +struct flow_dissector { + unsigned int used_keys; /* each bit repesents presence of one key id */ + unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; +}; + /* struct flow_keys: * @src: source ip address in case of IPv4 * For IPv6 it contains 32bit hash of src address @@ -27,6 +85,9 @@ struct flow_keys { u8 ip_proto; }; +void skb_flow_dissector_init(struct flow_dissector *flow_dissector, + const struct flow_dissector_key *key, + unsigned int key_count); bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, void *data, __be16 proto, int nhoff, int hlen); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d885e282908e..6883e22b8fc4 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -15,6 +16,53 @@ #include #include +static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + return flow_dissector->used_keys & (1 << key_id); +} + +static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) +{ + flow_dissector->used_keys |= (1 << key_id); +} + +static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id, + void *target_container) +{ + return ((char *) target_container) + flow_dissector->offset[key_id]; +} + +void skb_flow_dissector_init(struct flow_dissector *flow_dissector, + const struct flow_dissector_key *key, + unsigned int key_count) +{ + unsigned int i; + + memset(flow_dissector, 0, sizeof(*flow_dissector)); + + for (i = 0; i < key_count; i++, key++) { + /* User 
should make sure that every key target offset is withing + * boundaries of unsigned short. + */ + BUG_ON(key->offset > USHRT_MAX); + BUG_ON(skb_flow_dissector_uses_key(flow_dissector, + key->key_id)); + + skb_flow_dissector_set_key(flow_dissector, key->key_id); + flow_dissector->offset[key->key_id] = key->offset; + } + + /* Ensure that the dissector always includes basic key. That way + * we are able to avoid handling lack of it in fast path. + */ + BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); +} +EXPORT_SYMBOL(skb_flow_dissector_init); + /* copy saddr & daddr, possibly using 64bit load/store * Equivalent to : flow->src = iph->saddr; * flow->dst = iph->daddr; -- cgit v1.2.3 From 06635a35d13d42b95422bba6633f175245cc644e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:16 +0200 Subject: flow_dissect: use programable dissector in skb_flow_dissect and friends Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 18 +-- drivers/net/ethernet/cisco/enic/enic_clsf.c | 27 ++-- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 10 +- drivers/net/hyperv/netvsc_drv.c | 8 +- include/linux/skbuff.h | 4 +- include/net/flow_dissector.h | 63 ++++---- include/net/ip.h | 8 +- include/net/ipv6.h | 8 +- net/core/flow_dissector.c | 199 +++++++++++++++++-------- net/ethernet/eth.c | 6 +- net/sched/cls_flow.c | 20 +-- net/sched/sch_choke.c | 4 +- 12 files changed, 229 insertions(+), 146 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index fe7c38721775..ef26e0147050 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3051,16 +3051,16 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, int noff, proto = -1; if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) - return skb_flow_dissect(skb, fk); + return skb_flow_dissect_flow_keys(skb, fk); - fk->ports = 0; + fk->ports.ports = 0; noff = skb_network_offset(skb); if (skb->protocol == htons(ETH_P_IP)) { if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) return false; iph = ip_hdr(skb); - fk->src = iph->saddr; - fk->dst = iph->daddr; + fk->addrs.src = iph->saddr; + fk->addrs.dst = iph->daddr; noff += iph->ihl << 2; if (!ip_is_fragment(iph)) proto = iph->protocol; @@ -3068,15 +3068,15 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph6)))) return false; iph6 = ipv6_hdr(skb); - fk->src = (__force __be32)ipv6_addr_hash(&iph6->saddr); - fk->dst = (__force __be32)ipv6_addr_hash(&iph6->daddr); + fk->addrs.src = (__force __be32)ipv6_addr_hash(&iph6->saddr); + fk->addrs.dst = (__force __be32)ipv6_addr_hash(&iph6->daddr); noff += sizeof(*iph6); proto = iph6->nexthdr; } else { return false; } if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) - fk->ports = skb_flow_get_ports(skb, noff, proto); + fk->ports.ports = skb_flow_get_ports(skb, noff, proto); return true; } @@ -3102,8 +3102,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) hash = bond_eth_hash(skb); else - hash = (__force u32)flow.ports; - hash ^= (__force u32)flow.dst ^ (__force u32)flow.src; + hash = (__force u32)flow.ports.ports; + hash ^= (__force u32)flow.addrs.dst ^ (__force u32)flow.addrs.src; hash ^= (hash >> 16); hash ^= (hash >> 8); diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c 
b/drivers/net/ethernet/cisco/enic/enic_clsf.c index 380f04903a36..d3d25c7e5b96 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -22,7 +22,7 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq) int res; struct filter data; - switch (keys->ip_proto) { + switch (keys->basic.ip_proto) { case IPPROTO_TCP: data.u.ipv4.protocol = PROTO_TCP; break; @@ -33,10 +33,10 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq) return -EPROTONOSUPPORT; }; data.type = FILTER_IPV4_5TUPLE; - data.u.ipv4.src_addr = ntohl(keys->src); - data.u.ipv4.dst_addr = ntohl(keys->dst); - data.u.ipv4.src_port = ntohs(keys->port16[0]); - data.u.ipv4.dst_port = ntohs(keys->port16[1]); + data.u.ipv4.src_addr = ntohl(keys->addrs.src); + data.u.ipv4.dst_addr = ntohl(keys->addrs.dst); + data.u.ipv4.src_port = ntohs(keys->ports.port16[0]); + data.u.ipv4.dst_port = ntohs(keys->ports.port16[1]); data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE; spin_lock_bh(&enic->devcmd_lock); @@ -158,11 +158,11 @@ static struct enic_rfs_fltr_node *htbl_key_search(struct hlist_head *h, struct enic_rfs_fltr_node *tpos; hlist_for_each_entry(tpos, h, node) - if (tpos->keys.src == k->src && - tpos->keys.dst == k->dst && - tpos->keys.ports == k->ports && - tpos->keys.ip_proto == k->ip_proto && - tpos->keys.n_proto == k->n_proto) + if (tpos->keys.addrs.src == k->addrs.src && + tpos->keys.addrs.dst == k->addrs.dst && + tpos->keys.ports.ports == k->ports.ports && + tpos->keys.basic.ip_proto == k->basic.ip_proto && + tpos->keys.basic.n_proto == k->basic.n_proto) return tpos; return NULL; } @@ -177,9 +177,10 @@ int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, int res, i; enic = netdev_priv(dev); - res = skb_flow_dissect(skb, &keys); - if (!res || keys.n_proto != htons(ETH_P_IP) || - (keys.ip_proto != IPPROTO_TCP && keys.ip_proto != IPPROTO_UDP)) + res = skb_flow_dissect_flow_keys(skb, &keys); + if (!res || keys.basic.n_proto != htons(ETH_P_IP) || + (keys.basic.ip_proto != IPPROTO_TCP && + keys.basic.ip_proto != IPPROTO_UDP)) return -EPROTONOSUPPORT; tbl_idx = skb_get_hash_raw(skb) & ENIC_RFS_FLW_MASK; diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 28d9ca675a27..7588f8dcc890 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -334,7 +334,7 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd) n = htbl_fltr_search(enic, (u16)fsp->location); if (!n) return -EINVAL; - switch (n->keys.ip_proto) { + switch (n->keys.basic.ip_proto) { case IPPROTO_TCP: fsp->flow_type = TCP_V4_FLOW; break; @@ -346,16 +346,16 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd) break; } - fsp->h_u.tcp_ip4_spec.ip4src = n->keys.src; + fsp->h_u.tcp_ip4_spec.ip4src = n->keys.addrs.src; fsp->m_u.tcp_ip4_spec.ip4src = (__u32)~0; - fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.dst; + fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst; fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0; - fsp->h_u.tcp_ip4_spec.psrc = n->keys.port16[0]; + fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.port16[0]; fsp->m_u.tcp_ip4_spec.psrc = (__u16)~0; - fsp->h_u.tcp_ip4_spec.pdst = n->keys.port16[1]; + fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.port16[1]; fsp->m_u.tcp_ip4_spec.pdst = (__u16)~0; fsp->ring_cookie = n->rq_id; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 5993c7e2d723..8e5fe888a0ec 
100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -196,12 +196,12 @@ static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb) struct flow_keys flow; int data_len; - if (!skb_flow_dissect(skb, &flow) || - !(flow.n_proto == htons(ETH_P_IP) || - flow.n_proto == htons(ETH_P_IPV6))) + if (!skb_flow_dissect_flow_keys(skb, &flow) || + !(flow.basic.n_proto == htons(ETH_P_IP) || + flow.basic.n_proto == htons(ETH_P_IPV6))) return false; - if (flow.ip_proto == IPPROTO_TCP) + if (flow.basic.ip_proto == IPPROTO_TCP) data_len = 12; else data_len = 8; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b01c7fba7c17..f83aa6568cbf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1935,8 +1935,8 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; - else if (skb_flow_dissect(skb, &keys)) - skb_set_transport_header(skb, keys.thoff); + else if (skb_flow_dissect_flow_keys(skb, &keys)) + skb_set_transport_header(skb, keys.basic.thoff); else skb_set_transport_header(skb, offset_hint); } diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 20239e807f77..0c8d406fb730 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -59,42 +59,47 @@ struct flow_dissector { unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; -/* struct flow_keys: - * @src: source ip address in case of IPv4 - * For IPv6 it contains 32bit hash of src address - * @dst: destination ip address in case of IPv4 - * For IPv6 it contains 32bit hash of dst address - * @ports: port numbers of Transport header - * port16[0]: src port number - * port16[1]: dst port number - * @thoff: Transport header offset - * @n_proto: Network header protocol (eg. IPv4/IPv6) - * @ip_proto: Transport header protocol (eg. TCP/UDP) - * All the members, except thoff, are in network byte order. 
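The grouped layout that replaces the flat struct below changes every consumer the same way: members are reached through the basic/addrs/ports sub-structs. A hypothetical debug snippet using the field names at this point in the series (ports.port16 is renamed to src/dst a few patches later):

	struct flow_keys keys;

	if (!skb_flow_dissect_flow_keys(skb, &keys))
		return -EPROTONOSUPPORT;

	if (keys.basic.n_proto == htons(ETH_P_IP) &&
	    keys.basic.ip_proto == IPPROTO_TCP)
		pr_debug("tcp %pI4:%u -> %pI4:%u\n",
			 &keys.addrs.src, ntohs(keys.ports.port16[0]),
			 &keys.addrs.dst, ntohs(keys.ports.port16[1]));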
- */ -struct flow_keys { - /* (src,dst) must be grouped, in the same way than in IP header */ - __be32 src; - __be32 dst; - union { - __be32 ports; - __be16 port16[2]; - }; - u16 thoff; - __be16 n_proto; - u8 ip_proto; -}; - void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, + +bool __skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, __be16 proto, int nhoff, int hlen); static inline bool skb_flow_dissect(const struct sk_buff *skb, - struct flow_keys *flow) + struct flow_dissector *flow_dissector, + void *target_container) +{ + return __skb_flow_dissect(skb, flow_dissector, target_container, + NULL, 0, 0, 0); +} + +struct flow_keys { + struct flow_dissector_key_addrs addrs; + struct flow_dissector_key_ports ports; + struct flow_dissector_key_basic basic; +}; + +extern struct flow_dissector flow_keys_dissector; +extern struct flow_dissector flow_keys_buf_dissector; + +static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, + struct flow_keys *flow) +{ + memset(flow, 0, sizeof(*flow)); + return __skb_flow_dissect(skb, &flow_keys_dissector, flow, + NULL, 0, 0, 0); +} + +static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow, + void *data, __be16 proto, + int nhoff, int hlen) { - return __skb_flow_dissect(skb, flow, NULL, 0, 0, 0); + memset(flow, 0, sizeof(*flow)); + return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow, + data, proto, nhoff, hlen); } __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, diff --git a/include/net/ip.h b/include/net/ip.h index 562eb653aebc..b0443d4fe13f 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -360,10 +360,10 @@ static inline void inet_set_txhash(struct sock *sk) struct inet_sock *inet = inet_sk(sk); struct flow_keys keys; - keys.src = inet->inet_saddr; - keys.dst = inet->inet_daddr; - keys.port16[0] = inet->inet_sport; - keys.port16[1] = inet->inet_dport; + keys.addrs.src = inet->inet_saddr; + keys.addrs.dst = inet->inet_daddr; + keys.ports.port16[0] = inet->inet_sport; + keys.ports.port16[1] = inet->inet_dport; sk->sk_txhash = flow_hash_from_keys(&keys); } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9932b86394a9..9eed9761dfce 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -698,10 +698,10 @@ static inline void ip6_set_txhash(struct sock *sk) struct ipv6_pinfo *np = inet6_sk(sk); struct flow_keys keys; - keys.src = (__force __be32)ipv6_addr_hash(&np->saddr); - keys.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); - keys.port16[0] = inet->inet_sport; - keys.port16[1] = inet->inet_dport; + keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr); + keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); + keys.ports.port16[0] = inet->inet_sport; + keys.ports.port16[1] = inet->inet_dport; sk->sk_txhash = flow_hash_from_keys(&keys); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6883e22b8fc4..6a49acaa6651 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -63,17 +64,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -/* copy saddr & daddr, possibly using 64bit load/store - * Equivalent to : flow->src = iph->saddr; - * flow->dst 
= iph->daddr; - */ -static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph) -{ - BUILD_BUG_ON(offsetof(typeof(*flow), dst) != - offsetof(typeof(*flow), src) + sizeof(flow->src)); - memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst)); -} - /** * __skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from @@ -111,17 +101,27 @@ EXPORT_SYMBOL(__skb_flow_get_ports); /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified + * @flow_dissector: list of keys to dissect + * @target_container: target structure to put dissected values into * @data: raw buffer pointer to the packet, if NULL use skb->data * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb) * @hlen: packet header length, if @data is NULL use skb_headlen(skb) * - * The function will try to retrieve the struct flow_keys from either the skbuff - * or a raw buffer specified by the rest parameters + * The function will try to retrieve individual keys into target specified + * by flow_dissector from either the skbuff or a raw buffer specified by the + * rest parameters. + * + * Caller must take care of zeroing target container memory. */ -bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, +bool __skb_flow_dissect(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, __be16 proto, int nhoff, int hlen) { + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; u8 ip_proto; if (!data) { @@ -131,7 +131,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, hlen = skb_headlen(skb); } - memset(flow, 0, sizeof(*flow)); + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. + */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); again: switch (proto) { @@ -148,14 +153,13 @@ ip: if (ip_is_fragment(iph)) ip_proto = 0; - /* skip the address processing if skb is NULL. The assumption - * here is that if there is no skb we are not looking for flow - * info but lengths and protocols. - */ - if (!skb) + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; - - iph_to_flow_copy_addrs(flow, iph); + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + memcpy(key_addrs, &iph->saddr, sizeof(*key_addrs)); break; } case htons(ETH_P_IPV6): { @@ -171,12 +175,15 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - /* see comment above in IPv4 section */ - if (!skb) + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) break; + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + target_container); - flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); + key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); flow_label = ip6_flowlabel(iph); if (flow_label) { @@ -184,10 +191,18 @@ ipv6: * use that to represent the ports without any * further dissection. 
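The inet_set_txhash and ip6_set_txhash hunks above also show the other direction: filling a struct flow_keys by hand and hashing it. In sketch form, with saddr/daddr/sport/dport standing in for the socket fields:

	struct flow_keys keys;

	memset(&keys, 0, sizeof(keys));	/* only dst/src/ports feed the hash */
	keys.addrs.src = saddr;		/* __be32 */
	keys.addrs.dst = daddr;
	keys.ports.port16[0] = sport;	/* __be16 */
	keys.ports.port16[1] = dport;

	/* same direction-consistent jhash that backs skb->hash */
	sk->sk_txhash = flow_hash_from_keys(&keys);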
*/ - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->ports = flow_label; - flow->thoff = (u16)nhoff; + + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_basic->thoff = (u16)nhoff; + + if (!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) + break; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = flow_label; return true; } @@ -234,14 +249,22 @@ ipv6: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return false; - flow->src = hdr->srcnode; - flow->dst = 0; - flow->n_proto = proto; - flow->thoff = (u16)nhoff; + key_basic->n_proto = proto; + key_basic->thoff = (u16)nhoff; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + target_container); + key_addrs->src = hdr->srcnode; + key_addrs->dst = 0; + } return true; } case htons(ETH_P_FCOE): - flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: return false; @@ -296,14 +319,24 @@ ipv6: break; } - flow->n_proto = proto; - flow->ip_proto = ip_proto; - flow->thoff = (u16) nhoff; - - /* unless skb is set we don't need to record port info */ - if (skb) - flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + /* It is ensured by skb_flow_dissector_init() that basic key will + * be always present. + */ + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_basic->thoff = (u16) nhoff; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, + data, hlen); + } return true; } @@ -325,16 +358,16 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) u32 hash; /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys->dst < (__force u32)keys->src) || - (((__force u32)keys->dst == (__force u32)keys->src) && - ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) { - swap(keys->dst, keys->src); - swap(keys->port16[0], keys->port16[1]); + if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) || + (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) && + ((__force u16)keys->ports.port16[1] < (__force u16)keys->ports.port16[0]))) { + swap(keys->addrs.dst, keys->addrs.src); + swap(keys->ports.port16[0], keys->ports.port16[1]); } - hash = __flow_hash_3words((__force u32)keys->dst, - (__force u32)keys->src, - (__force u32)keys->ports, + hash = __flow_hash_3words((__force u32)keys->addrs.dst, - wait
data->ports = flow->ports; - data->src = flow->src; - data->dst = flow->dst; + data->n_proto = flow->basic.n_proto; + data->ip_proto = flow->basic.ip_proto; + data->ports = flow->ports.ports; + data->src = flow->addrs.src; + data->dst = flow->addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); @@ -404,7 +437,7 @@ void __skb_get_hash(struct sk_buff *skb) hash = ___skb_get_hash(skb, &keys, hashrnd); if (!hash) return; - if (keys.ports) + if (keys.ports.ports) skb->l4_hash = 1; skb->sw_hash = 1; skb->hash = hash; @@ -422,9 +455,9 @@ EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { - u32 poff = keys->thoff; + u32 poff = keys->basic.thoff; - switch (keys->ip_proto) { + switch (keys->basic.ip_proto) { case IPPROTO_TCP: { /* access doff as u8 to avoid unaligned access */ const u8 *doff; @@ -478,8 +511,52 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect(skb, &keys)) + if (!skb_flow_dissect_flow_keys(skb, &keys)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } + +static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, + .offset = offsetof(struct flow_keys, addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + .offset = offsetof(struct flow_keys, addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_PORTS, + .offset = offsetof(struct flow_keys, ports), + }, +}; + +static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_BASIC, + .offset = offsetof(struct flow_keys, basic), + }, +}; + +struct flow_dissector flow_keys_dissector __read_mostly; +EXPORT_SYMBOL(flow_keys_dissector); + +struct flow_dissector flow_keys_buf_dissector __read_mostly; + +static int __init init_default_flow_dissectors(void) +{ + skb_flow_dissector_init(&flow_keys_dissector, + flow_keys_dissector_keys, + ARRAY_SIZE(flow_keys_dissector_keys)); + skb_flow_dissector_init(&flow_keys_buf_dissector, + flow_keys_buf_dissector_keys, + ARRAY_SIZE(flow_keys_buf_dissector_keys)); + return 0; +} + +late_initcall_sync(init_default_flow_dissectors); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9332a0ab0698..c3325bd2f3fb 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -131,9 +131,9 @@ u32 eth_get_headlen(void *data, unsigned int len) return len; /* parse any remaining L2/L3 headers, check for L4 */ - if (!__skb_flow_dissect(NULL, &keys, data, - eth->h_proto, sizeof(*eth), len)) - return max_t(u32, keys.thoff, sizeof(*eth)); + if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, + sizeof(*eth), len)) + return max_t(u32, keys.basic.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 4c5a92298906..4b3e3e30bf4d 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -68,35 +68,35 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->src) - return ntohl(flow->src); + if (flow->addrs.src) + return ntohl(flow->addrs.src); return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->dst) - return ntohl(flow->dst); + if (flow->addrs.dst) + return ntohl(flow->addrs.dst); 
return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flow) { - return flow->ip_proto; + return flow->basic.ip_proto; } static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[0]); + if (flow->ports.ports) + return ntohs(flow->ports.port16[0]); return addr_fold(skb->sk); } static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->ports) - return ntohs(flow->port16[1]); + if (flow->ports.ports) + return ntohs(flow->ports.port16[1]); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } @@ -295,7 +295,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) - skb_flow_dissect(skb, &flow_keys); + skb_flow_dissect_flow_keys(skb, &flow_keys); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 2624e991a2d1..93d5742dc7e0 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect(skb1, &temp); + skb_flow_dissect_flow_keys(skb1, &temp); make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect(skb2, &temp); + skb_flow_dissect_flow_keys(skb2, &temp); make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } -- cgit v1.2.3 From b924933cbbfbdcaa2831a39780c116ec6e48c397 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:18 +0200 Subject: flow_dissector: introduce support for ipv6 addressses So far, only hashes made out of ipv6 addresses could be dissected. This patch introduces support for dissection of full ipv6 addresses. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 13 +++++++++++++ net/core/flow_dissector.c | 29 +++++++++++++++++++++-------- 2 files changed, 34 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 1add8918a155..586b12306a52 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -3,6 +3,7 @@ #include #include +#include /** * struct flow_dissector_key_basic: @@ -42,11 +43,23 @@ struct flow_dissector_key_ports { }; }; +/** + * struct flow_dissector_key_ipv6_addrs: + * @src: source ip address + * @dst: destination ip address + */ +struct flow_dissector_key_ipv6_addrs { + /* (src,dst) must be grouped, in the same way than in IP header */ + struct in6_addr src; + struct in6_addr dst; +}; + enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ + FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6a49acaa6651..1b95d5ccc9d6 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -175,16 +175,29 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - if (!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) - break; - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, - target_container); + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + target_container); - key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); + key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + goto flow_label; + } + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; + key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + + memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + goto flow_label; + } + break; +flow_label: flow_label = ip6_flowlabel(iph); if (flow_label) { /* Awesome, IPv6 packet has a flow label so we can -- cgit v1.2.3 From 67a900cc0436d74e7ff89042371760def087680d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:19 +0200 Subject: flow_dissector: introduce support for Ethernet addresses Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 13 +++++++++++++ net/core/flow_dissector.c | 12 ++++++++++++ 2 files changed, 25 insertions(+) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 586b12306a52..5eac9870689c 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -4,6 +4,7 @@ #include #include #include +#include /** * struct flow_dissector_key_basic: @@ -54,12 +55,24 @@ struct flow_dissector_key_ipv6_addrs { struct in6_addr dst; }; +/** + * struct flow_dissector_key_eth_addrs: + * @src: source Ethernet address + * @dst: destination Ethernet address + */ +struct flow_dissector_key_eth_addrs { + /* (dst,src) must be grouped, in the same way than in ETH header */ + unsigned char dst[ETH_ALEN]; + unsigned char src[ETH_ALEN]; +}; + enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ + FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1b95d5ccc9d6..7a0b391114a5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -138,6 +139,17 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + struct ethhdr *eth = eth_hdr(skb); + struct flow_dissector_key_eth_addrs *key_eth_addrs; + + key_eth_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS, + target_container); + memcpy(key_eth_addrs, ð->h_dest, sizeof(*key_eth_addrs)); + } + again: switch (proto) { case htons(ETH_P_IP): { -- cgit v1.2.3 From 59346afe7a5548ab3e9730aeff33993faa76abbe Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 12 May 2015 14:56:20 +0200 Subject: flow_dissector: change port array into src, dst tuple Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cisco/enic/enic_clsf.c | 4 ++-- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 4 ++-- include/net/flow_dissector.h | 9 ++++++--- include/net/ip.h | 4 ++-- include/net/ipv6.h | 4 ++-- net/core/flow_dissector.c | 4 ++-- net/sched/cls_flow.c | 4 ++-- 7 files changed, 18 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index d3d25c7e5b96..6739ebc08c47 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -35,8 +35,8 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq) data.type = FILTER_IPV4_5TUPLE; data.u.ipv4.src_addr = ntohl(keys->addrs.src); data.u.ipv4.dst_addr = ntohl(keys->addrs.dst); - data.u.ipv4.src_port = ntohs(keys->ports.port16[0]); - data.u.ipv4.dst_port = ntohs(keys->ports.port16[1]); + data.u.ipv4.src_port = ntohs(keys->ports.src); + data.u.ipv4.dst_port = ntohs(keys->ports.dst); data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE; spin_lock_bh(&enic->devcmd_lock); diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 7588f8dcc890..117c0968dd0b 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -352,10 +352,10 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd) fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst; fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0; - fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.port16[0]; + fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.src; fsp->m_u.tcp_ip4_spec.psrc = (__u16)~0; - fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.port16[1]; + fsp->h_u.tcp_ip4_spec.pdst = n->keys.ports.dst; fsp->m_u.tcp_ip4_spec.pdst = (__u16)~0; fsp->ring_cookie = n->rq_id; diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 5eac9870689c..bac9c1421f58 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -34,13 +34,16 @@ struct flow_dissector_key_addrs { /** * flow_dissector_key_tp_ports: * @ports: port numbers of Transport header - * port16[0]: src port number - * port16[1]: dst port number + * src: source port number + * dst: destination port number */ struct flow_dissector_key_ports { union { __be32 ports; - __be16 port16[2]; + struct { + __be16 src; + __be16 dst; + }; }; }; diff --git a/include/net/ip.h b/include/net/ip.h index b0443d4fe13f..0ed6d768e606 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -362,8 +362,8 @@ static inline void inet_set_txhash(struct sock *sk) keys.addrs.src = inet->inet_saddr; keys.addrs.dst = inet->inet_daddr; - keys.ports.port16[0] = inet->inet_sport; - keys.ports.port16[1] = inet->inet_dport; + keys.ports.src = inet->inet_sport; + keys.ports.dst = inet->inet_dport; sk->sk_txhash = flow_hash_from_keys(&keys); } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9eed9761dfce..aab8190d16e8 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -700,8 +700,8 @@ static inline void ip6_set_txhash(struct sock *sk) keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr); keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); - keys.ports.port16[0] = inet->inet_sport; - keys.ports.port16[1] = inet->inet_dport; + keys.ports.src = inet->inet_sport; + keys.ports.dst = inet->inet_dport; sk->sk_txhash = flow_hash_from_keys(&keys); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 
7a0b391114a5..204d09c42510 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -385,9 +385,9 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) /* get a consistent hash (same value on both flow directions) */ if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) || (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) && - ((__force u16)keys->ports.port16[1] < (__force u16)keys->ports.port16[0]))) { + ((__force u16)keys->ports.dst < (__force u16)keys->ports.src))) { swap(keys->addrs.dst, keys->addrs.src); - swap(keys->ports.port16[0], keys->ports.port16[1]); + swap(keys->ports.src, keys->ports.dst); } hash = __flow_hash_3words((__force u32)keys->addrs.dst, diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 4b3e3e30bf4d..b4359924846c 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -88,7 +88,7 @@ static u32 flow_get_proto(const struct sk_buff *skb, const struct flow_keys *flo static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) - return ntohs(flow->ports.port16[0]); + return ntohs(flow->ports.src); return addr_fold(skb->sk); } @@ -96,7 +96,7 @@ static u32 flow_get_proto_src(const struct sk_buff *skb, const struct flow_keys static u32 flow_get_proto_dst(const struct sk_buff *skb, const struct flow_keys *flow) { if (flow->ports.ports) - return ntohs(flow->ports.port16[1]); + return ntohs(flow->ports.dst); return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } -- cgit v1.2.3 From a080e7bd0a8e56519d451eaea4ab05212d90e010 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 13 May 2015 13:34:13 -0700 Subject: net: Reserve skb headroom and set skb->dev even if using __alloc_skb When I inlined __alloc_rx_skb into __netdev_alloc_skb and __napi_alloc_skb, I overlooked the fact that there was a return in __alloc_rx_skb. As a result we weren't reserving headroom or setting skb->dev in certain cases. This change corrects that by adding a pair of jump labels that are taken depending on whether __alloc_skb succeeds or fails. Fixes: 9451980a6646 ("net: Use cached copy of pfmemalloc to avoid accessing page") Reported-by: Felipe Balbi Signed-off-by: Alexander Duyck Tested-by: Kevin Hilman Signed-off-by: David S.
Miller --- net/core/skbuff.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index d67e612bf0ef..f3fe9bd9e672 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -414,8 +414,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) - return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -445,9 +449,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, skb->pfmemalloc = 1; skb->head_frag = 1; +skb_success: skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; +skb_fail: return skb; } EXPORT_SYMBOL(__netdev_alloc_skb); @@ -475,8 +481,12 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, len += NET_SKB_PAD + NET_IP_ALIGN; if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || - (gfp_mask & (__GFP_WAIT | GFP_DMA))) - return __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + (gfp_mask & (__GFP_WAIT | GFP_DMA))) { + skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); + if (!skb) + goto skb_fail; + goto skb_success; + } len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); len = SKB_DATA_ALIGN(len); @@ -499,9 +509,11 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, skb->pfmemalloc = 1; skb->head_frag = 1; +skb_success: skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); skb->dev = napi->dev; +skb_fail: return skb; } EXPORT_SYMBOL(__napi_alloc_skb); -- cgit v1.2.3 From 1cf51900f8545b358b5deaacfda348d990f671db Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:37 +0200 Subject: net: add CONFIG_NET_INGRESS to enable ingress filtering This new config switch enables the ingress filtering infrastructure that is controlled through the ingress_needed static key. This prepares the introduction of the Netfilter ingress hook that resides under this unique static key. Note that CONFIG_SCH_INGRESS automatically selects this, which should be no problem since it also depends on CONFIG_NET_CLS_ACT. Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 +- net/Kconfig | 3 +++ net/core/dev.c | 7 ++++--- net/sched/Kconfig | 1 + 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index bd29ab4b0941..a2324fb45cf4 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -79,7 +79,7 @@ static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif diff --git a/net/Kconfig b/net/Kconfig index 44dd5786ee91..57a7c5af3175 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -45,6 +45,9 @@ config COMPAT_NETLINK_MESSAGES Newly written code should NEVER need this option but do compat-independent messages instead!
+config NET_INGRESS + bool + menu "Networking options" source "net/packet/Kconfig" diff --git a/net/core/dev.c b/net/core/dev.c index af549062ae8e..a5ef90016ce7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1630,7 +1630,7 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS static struct static_key ingress_needed __read_mostly; void net_inc_ingress_queue(void) @@ -3798,13 +3798,14 @@ another_round: } skip_taps: -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_INGRESS if (static_key_false(&ingress_needed)) { skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto unlock; } - +#endif +#ifdef CONFIG_NET_CLS_ACT skb->tc_verd = 0; ncls: #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 5fd1c2f487d2..daa33432b716 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -312,6 +312,7 @@ config NET_SCH_PIE config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT + select NET_INGRESS ---help--- Say Y here if you want to use classifiers for incoming packets. If unsure, say Y. -- cgit v1.2.3 From e687ad60af09010936bbd0b2a3b5d90a8ee8353c Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Wed, 13 May 2015 18:19:38 +0200 Subject: netfilter: add netfilter ingress hook after handle_ing() under unique static key This patch adds the Netfilter ingress hook just after the existing tc ingress hook, that seems to be the consensus solution for this. Note that the Netfilter hook resides under the global static key that enables ingress filtering. Nonetheless, Netfilter still also has its own static key for minimal impact on the existing handle_ing(). * Without this patch: Result: OK: 6216490(c6216338+d152) usec, 100000000 (60byte,0frags) 16086246pps 7721Mb/sec (7721398080bps) errors: 100000000 42.46% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 25.92% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.81% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.62% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.70% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.34% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.44% kpktgend_0 [kernel.kallsyms] [k] __build_skb * With this patch: Result: OK: 6214833(c6214731+d101) usec, 100000000 (60byte,0frags) 16090536pps 7723Mb/sec (7723457280bps) errors: 100000000 41.23% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 26.57% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 7.72% kpktgend_0 [pktgen] [k] pktgen_thread_worker 5.55% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.78% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 2.06% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 1.43% kpktgend_0 [kernel.kallsyms] [k] __build_skb * Without this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 9269001(c9268821+d179) usec, 100000000 (60byte,0frags) 10788648pps 5178Mb/sec (5178551040bps) errors: 100000000 40.99% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.50% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.77% kpktgend_0 [cls_u32] [k] u32_classify 5.62% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.18% kpktgend_0 [pktgen] [k] pktgen_thread_worker 3.23% kpktgend_0 [kernel.kallsyms] [k] tc_classify 2.97% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 1.83% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.50% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk 0.99% kpktgend_0 [kernel.kallsyms] [k] 
__build_skb * With this patch + tc ingress: tc filter add dev eth4 parent ffff: protocol ip prio 1 \ u32 match ip dst 4.3.2.1/32 Result: OK: 9308218(c9308091+d126) usec, 100000000 (60byte,0frags) 10743194pps 5156Mb/sec (5156733120bps) errors: 100000000 42.01% kpktgend_0 [kernel.kallsyms] [k] __netif_receive_skb_core 17.78% kpktgend_0 [kernel.kallsyms] [k] kfree_skb 11.70% kpktgend_0 [cls_u32] [k] u32_classify 5.46% kpktgend_0 [kernel.kallsyms] [k] tc_classify_compat 5.16% kpktgend_0 [pktgen] [k] pktgen_thread_worker 2.98% kpktgend_0 [kernel.kallsyms] [k] ip_rcv 2.84% kpktgend_0 [kernel.kallsyms] [k] tc_classify 1.96% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_internal 1.57% kpktgend_0 [kernel.kallsyms] [k] netif_receive_skb_sk Note that the results are very similar before and after. I can see gcc gets the code under the ingress static key out of the hot path. Then, on that cold branch, it generates the code to accommodate the netfilter ingress static key. My explanation for this is that this reduces the pressure on the instruction cache for non-users as the new code is out of the hot path, and it comes with minimal impact for tc ingress users. Using gcc version 4.8.4 on: Architecture: x86_64 CPU op-mode(s): 32-bit, 64-bit Byte Order: Little Endian CPU(s): 8 [...] L1d cache: 16K L1i cache: 64K L2 cache: 2048K L3 cache: 8192K Signed-off-by: Pablo Neira Ayuso Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/linux/netfilter.h | 1 + include/linux/netfilter_ingress.h | 41 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/netfilter.h | 6 ++++++ net/core/dev.c | 36 ++++++++++++++++++++++++++++++++++ net/netfilter/Kconfig | 7 +++++++ net/netfilter/core.c | 31 ++++++++++++++++++++++++++++- 7 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 include/linux/netfilter_ingress.h (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d3ed01c18247..51f8d2f5dc3f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1656,6 +1656,9 @@ struct net_device { struct tcf_proto __rcu *ingress_cl_list; #endif struct netdev_queue __rcu *ingress_queue; +#ifdef CONFIG_NETFILTER_INGRESS + struct list_head nf_hooks_ingress; +#endif unsigned char broadcast[MAX_ADDR_LEN]; #ifdef CONFIG_RFS_ACCEL diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 49d00638d1fa..f5ff5d156da8 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -86,6 +86,7 @@ struct nf_hook_ops { /* User fills in from here down.
*/ nf_hookfn *hook; + struct net_device *dev; struct module *owner; void *priv; u_int8_t pf; diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h new file mode 100644 index 000000000000..cb0727fe2b3d --- /dev/null +++ b/include/linux/netfilter_ingress.h @@ -0,0 +1,41 @@ +#ifndef _NETFILTER_INGRESS_H_ +#define _NETFILTER_INGRESS_H_ + +#include +#include + +#ifdef CONFIG_NETFILTER_INGRESS +static inline int nf_hook_ingress_active(struct sk_buff *skb) +{ + return nf_hook_list_active(&skb->dev->nf_hooks_ingress, + NFPROTO_NETDEV, NF_NETDEV_INGRESS); +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + struct nf_hook_state state; + + nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress, + NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, NULL, + skb->dev, NULL, NULL); + return nf_hook_slow(skb, &state); +} + +static inline void nf_hook_ingress_init(struct net_device *dev) +{ + INIT_LIST_HEAD(&dev->nf_hooks_ingress); +} +#else /* CONFIG_NETFILTER_INGRESS */ +static inline int nf_hook_ingress_active(struct sk_buff *skb) +{ + return 0; +} + +static inline int nf_hook_ingress(struct sk_buff *skb) +{ + return 0; +} + +static inline void nf_hook_ingress_init(struct net_device *dev) {} +#endif /* CONFIG_NETFILTER_INGRESS */ +#endif /* _NETFILTER_INGRESS_H_ */ diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index ef1b1f88ca18..177027cce6b3 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -51,11 +51,17 @@ enum nf_inet_hooks { NF_INET_NUMHOOKS }; +enum nf_dev_hooks { + NF_NETDEV_INGRESS, + NF_NETDEV_NUMHOOKS +}; + enum { NFPROTO_UNSPEC = 0, NFPROTO_INET = 1, NFPROTO_IPV4 = 2, NFPROTO_ARP = 3, + NFPROTO_NETDEV = 5, NFPROTO_BRIDGE = 7, NFPROTO_IPV6 = 10, NFPROTO_DECNET = 12, diff --git a/net/core/dev.c b/net/core/dev.c index a5ef90016ce7..29f0d6e6542c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -135,6 +135,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -3666,6 +3667,13 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, return skb; } +#else +static inline struct sk_buff *handle_ing(struct sk_buff *skb, + struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + return skb; +} #endif /** @@ -3739,6 +3747,28 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) } } +#ifdef CONFIG_NETFILTER_INGRESS +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + if (nf_hook_ingress_active(skb)) { + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + return nf_hook_ingress(skb); + } + return 0; +} +#else +static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, + int *ret, struct net_device *orig_dev) +{ + return 0; +} +#endif + static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { struct packet_type *ptype, *pt_prev; @@ -3803,6 +3833,9 @@ skip_taps: skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) goto unlock; + + if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) + goto unlock; } #endif #ifdef CONFIG_NET_CLS_ACT @@ -6968,6 +7001,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->group = INIT_NETDEV_GROUP; if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; + + nf_hook_ingress_init(dev); + return dev; free_all: diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f70e34a68f70..db1c674397ad 100644 --- a/net/netfilter/Kconfig +++ 
b/net/netfilter/Kconfig @@ -1,6 +1,13 @@ menu "Core Netfilter Configuration" depends on NET && INET && NETFILTER +config NETFILTER_INGRESS + bool "Netfilter ingress support" + select NET_INGRESS + help + This allows you to classify packets from ingress using the Netfilter + infrastructure. + config NETFILTER_NETLINK tristate diff --git a/net/netfilter/core.c b/net/netfilter/core.c index e418cfd603c0..653e32eac08c 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -64,10 +64,27 @@ static DEFINE_MUTEX(nf_hook_mutex); int nf_register_hook(struct nf_hook_ops *reg) { + struct list_head *nf_hook_list; struct nf_hook_ops *elem; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, &nf_hooks[reg->pf][reg->hooknum], list) { + switch (reg->pf) { + case NFPROTO_NETDEV: +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) { + BUG_ON(reg->dev == NULL); + nf_hook_list = &reg->dev->nf_hooks_ingress; + net_inc_ingress_queue(); + break; + } +#endif + /* Fall through. */ + default: + nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; + break; + } + + list_for_each_entry(elem, nf_hook_list, list) { if (reg->priority < elem->priority) break; } @@ -85,6 +102,18 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(&reg->list); mutex_unlock(&nf_hook_mutex); + switch (reg->pf) { + case NFPROTO_NETDEV: +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) { + net_dec_ingress_queue(); + break; + } + break; +#endif + default: + break; + } #ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif -- cgit v1.2.3 From 0c58a2db91747c841d042b1d56615fb1eaf138c7 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 13 May 2015 13:43:09 +0200 Subject: netns: fix unbalanced spin_lock on error Unlock was missing on error path. Fixes: 95f38411df05 ("netns: use a spin_lock to protect nsid management") Reported-by: Dan Carpenter Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/core/net_namespace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cbee75f2fc28..d2f42da9479b 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -565,6 +565,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) spin_lock_irqsave(&nsid_lock, flags); if (__peernet2id(net, peer) >= 0) { + spin_unlock_irqrestore(&nsid_lock, flags); err = -EEXIST; goto out; } -- cgit v1.2.3 From 3365495c1883561e4a8811f46a3800c512a3c00a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 14 May 2015 00:36:28 +0200 Subject: net: core: set qdisc pkt len before tc_classify commit d2788d34885d4ce5ba ("net: sched: further simplify handle_ing") removed the call to qdisc_enqueue_root(). However, after this removal we no longer set qdisc pkt length. This breaks traffic policing on ingress. This is the minimum fix: set qdisc pkt length before tc_classify. Only setting the length does remove support for 'stab' on ingress, but as Alexei pointed out: "Though it was allowed to add qdisc_size_table to ingress, it's useless. Nothing takes advantage of recomputed qdisc_pkt_len". Jamal suggested using qdisc_pkt_len_init(), but as Eric mentioned, that would result in qdisc_pkt_len_init no longer being inlined due to the additional 2nd call site.
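(To make concrete what the missing pkt_len breaks, here is a minimal sketch of the kind of consumer that depends on it. It is a toy token-bucket policer written for illustration only -- toy_policer and toy_police are hypothetical names, not the actual act_police code -- but, like the real policer, it charges each packet against a byte budget read from qdisc_skb_cb(skb)->pkt_len. With that field never set, every packet is accounted as zero bytes and the policer never drops anything:

#include <net/sch_generic.h>	/* qdisc_skb_cb() */
#include <net/pkt_cls.h>	/* TC_ACT_OK, TC_ACT_SHOT */

struct toy_policer {
	u64 tokens;	/* bytes still allowed in this interval */
};

static int toy_police(struct toy_policer *p, const struct sk_buff *skb)
{
	unsigned int len = qdisc_skb_cb(skb)->pkt_len;

	if (len > p->tokens)
		return TC_ACT_SHOT;	/* over budget: drop the packet */

	p->tokens -= len;		/* charge the packet against the budget */
	return TC_ACT_OK;
}

Setting pkt_len in handle_ing() before tc_classify() runs, as the diff below does, restores the accounting that qdisc_enqueue_root() used to perform.)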
Ingress policing is rare and GRO doesn't really work that well with police on ingress, as we see packets > mtu and drop skbs that -- without aggregation -- would still have fitted the policer budget. Thus, to have reliable/smooth ingress policing, GRO has to be turned off. Cc: Alexei Starovoitov Cc: Eric Dumazet Cc: Jamal Hadi Salim Fixes: d2788d34885d ("net: sched: further simplify handle_ing") Acked-by: Daniel Borkmann Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 29f0d6e6542c..0e7afefb5072 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3647,8 +3647,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, *pt_prev = NULL; } - qdisc_bstats_update_cpu(cl->q, skb); + qdisc_skb_cb(skb)->pkt_len = skb->len; skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); + qdisc_bstats_update_cpu(cl->q, skb); switch (tc_classify(skb, cl, &cl_res)) { case TC_ACT_OK: -- cgit v1.2.3 From 1a24e04e4b50939daa3041682b38b82c896ca438 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:25 -0700 Subject: net: fix sk_mem_reclaim_partial() The goal of sk_mem_reclaim_partial() is to ensure each socket has one SK_MEM_QUANTUM forward allocation. This is needed both for performance and better handling of memory pressure situations in follow-up patches. SK_MEM_QUANTUM is currently a page, but might be reduced to 4096 bytes as some arches have 64KB pages. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 6 +++--- net/core/sock.c | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index d882f4c8e438..4581a60636f8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1368,7 +1368,7 @@ static inline struct inode *SOCK_INODE(struct socket *socket) * Functions for memory accounting */ int __sk_mem_schedule(struct sock *sk, int size, int kind); -void __sk_mem_reclaim(struct sock *sk); +void __sk_mem_reclaim(struct sock *sk, int amount); #define SK_MEM_QUANTUM ((int)PAGE_SIZE) #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) @@ -1409,7 +1409,7 @@ static inline void sk_mem_reclaim(struct sock *sk) if (!sk_has_account(sk)) return; if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) - __sk_mem_reclaim(sk); + __sk_mem_reclaim(sk, sk->sk_forward_alloc); } static inline void sk_mem_reclaim_partial(struct sock *sk) @@ -1417,7 +1417,7 @@ static inline void sk_mem_reclaim_partial(struct sock *sk) if (!sk_has_account(sk)) return; if (sk->sk_forward_alloc > SK_MEM_QUANTUM) - __sk_mem_reclaim(sk); + __sk_mem_reclaim(sk, sk->sk_forward_alloc - 1); } static inline void sk_mem_charge(struct sock *sk, int size) diff --git a/net/core/sock.c b/net/core/sock.c index c18738a795b0..29124fcdc42a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2069,12 +2069,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_reclaim - reclaim memory_allocated * @sk: socket + * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ -void __sk_mem_reclaim(struct sock *sk) +void __sk_mem_reclaim(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); - sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + amount >>= SK_MEM_QUANTUM_SHIFT; + sk_memory_allocated_sub(sk, amount); + sk->sk_forward_alloc -= amount <<
SK_MEM_QUANTUM_SHIFT; if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) -- cgit v1.2.3 From 74b80e841b4da2f83d8f548f32cfe421fde85aca Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 15 May 2015 13:27:32 +0200 Subject: flow_dissector: remove bogus return in tipc section Fixes: 06635a35d13d42b9 ("flow_dissect: use programable dissector in skb_flow_dissect and friends") Reported-by: Dan Carpenter Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 204d09c42510..703d05916a80 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -279,7 +279,6 @@ flow_label: if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { - return true; key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, target_container); -- cgit v1.2.3 From de133464c9e70808d3e5a861294bc55940988178 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Fri, 15 May 2015 14:47:32 -0700 Subject: netns: make nsid_lock per net The spinlock is used to protect netns_ids which is per net, so there is no need to use a global spinlock. Cc: Nicolas Dichtel Signed-off-by: Cong Wang Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/net_namespace.h | 1 + net/core/net_namespace.c | 32 ++++++++++++++++---------------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 3f850acc844e..72eb23723294 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -58,6 +58,7 @@ struct net { struct list_head exit_list; /* Use only net_mutex */ struct user_namespace *user_ns; /* Owning user namespace */ + spinlock_t nsid_lock; struct idr netns_ids; struct ns_common ns; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index d2f42da9479b..2c2eb1b629b1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -28,7 +28,6 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; DEFINE_MUTEX(net_mutex); -static DEFINE_SPINLOCK(nsid_lock); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -218,10 +217,10 @@ int peernet2id_alloc(struct net *net, struct net *peer) bool alloc; int id; - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); alloc = atomic_read(&peer->count) == 0 ? 
false : true; id = __peernet2id_alloc(net, peer, &alloc); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); if (alloc && id >= 0) rtnl_net_notifyid(net, RTM_NEWNSID, id); return id; @@ -234,9 +233,9 @@ int peernet2id(struct net *net, struct net *peer) unsigned long flags; int id; - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); id = __peernet2id(net, peer); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); return id; } @@ -257,11 +256,11 @@ struct net *get_net_ns_by_id(struct net *net, int id) return NULL; rcu_read_lock(); - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); peer = idr_find(&net->netns_ids, id); if (peer) get_net(peer); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); rcu_read_unlock(); return peer; @@ -282,6 +281,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) net->dev_base_seq = 1; net->user_ns = user_ns; idr_init(&net->netns_ids); + spin_lock_init(&net->nsid_lock); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -404,17 +404,17 @@ static void cleanup_net(struct work_struct *work) for_each_net(tmp) { int id; - spin_lock_irq(&nsid_lock); + spin_lock_irq(&tmp->nsid_lock); id = __peernet2id(tmp, net); if (id >= 0) idr_remove(&tmp->netns_ids, id); - spin_unlock_irq(&nsid_lock); + spin_unlock_irq(&tmp->nsid_lock); if (id >= 0) rtnl_net_notifyid(tmp, RTM_DELNSID, id); } - spin_lock_irq(&nsid_lock); + spin_lock_irq(&net->nsid_lock); idr_destroy(&net->netns_ids); - spin_unlock_irq(&nsid_lock); + spin_unlock_irq(&net->nsid_lock); } rtnl_unlock(); @@ -563,15 +563,15 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) if (IS_ERR(peer)) return PTR_ERR(peer); - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); if (__peernet2id(net, peer) >= 0) { - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); err = -EEXIST; goto out; } err = alloc_netid(net, peer, nsid); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); if (err >= 0) { rtnl_net_notifyid(net, RTM_NEWNSID, err); err = 0; @@ -695,9 +695,9 @@ static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb) }; unsigned long flags; - spin_lock_irqsave(&nsid_lock, flags); + spin_lock_irqsave(&net->nsid_lock, flags); idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb); - spin_unlock_irqrestore(&nsid_lock, flags); + spin_unlock_irqrestore(&net->nsid_lock, flags); cb->args[0] = net_cb.idx; return skb->len; -- cgit v1.2.3 From 765c9c639fbb132af0cafc6e1da22fe6cea26bb8 Mon Sep 17 00:00:00 2001 From: Erik Kline Date: Mon, 18 May 2015 19:44:41 +0900 Subject: neigh: Better handling of transition to NUD_PROBE state [1] When entering NUD_PROBE state via neigh_update(), perhaps received from userspace, correctly (re)initialize the probes count to zero. This is useful for forcing revalidation of a neighbor (for example if the host is attempting to do DNA [IPv4 4436, IPv6 6059]). [2] Notify listeners when a neighbor goes into NUD_PROBE state. By sending notifications on entry to NUD_PROBE state listeners get more timely warnings of imminent connectivity issues. 
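(As a concrete illustration of [1]: a userspace connectivity monitor can force revalidation of a neighbour by writing it straight into NUD_PROBE over netlink. With iproute2 the invocation would look something like the line below -- the address, lladdr and device are hypothetical:

  ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 dev eth0 nud probe

Before this patch such a transition could inherit a non-zero probe count and give up before the full number of probes had been sent; after it, the probe counter starts from zero, and thanks to [2] the transition itself is announced to netlink listeners.)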
The current notifications on entry to NUD_STALE have somewhat limited usefulness: NUD_STALE is a perfectly normal state, as is NUD_DELAY, whereas notifications on entry to NUD_FAILURE come after a neighbor reachability problem has been confirmed (typically after three probes). Signed-off-by: Erik Kline Acked-By: Lorenzo Colitti Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/core/neighbour.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3de654256028..3a74df750af4 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -913,6 +913,7 @@ static void neigh_timer_handler(unsigned long arg) neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); + notify = 1; next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); } } else { @@ -1144,6 +1145,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (new != old) { neigh_del_timer(neigh); + if (new & NUD_PROBE) + atomic_set(&neigh->probes, 0); if (new & NUD_IN_TIMER) neigh_add_timer(neigh, (jiffies + ((new & NUD_REACHABLE) ? -- cgit v1.2.3 From e7582bab5d28ea72e07cf2c74632eaf46a6c1a50 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2015 22:33:25 +0200 Subject: net: dev: reduce both ingress hook ifdefs Reduce ifdef pollution slightly, no functional change. We can simply remove the extra alternative definition of handle_ing() and nf_ingress(). Signed-off-by: Daniel Borkmann Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/core/dev.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 0e7afefb5072..594163d0c6eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3627,11 +3627,11 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -#ifdef CONFIG_NET_CLS_ACT static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { +#ifdef CONFIG_NET_CLS_ACT struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); struct tcf_result cl_res; @@ -3665,17 +3665,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, default: break; } - - return skb; -} -#else -static inline struct sk_buff *handle_ing(struct sk_buff *skb, - struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) -{ +#endif /* CONFIG_NET_CLS_ACT */ return skb; } -#endif /** * netdev_rx_handler_register - register receive handler @@ -3748,10 +3740,10 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb) } } -#ifdef CONFIG_NETFILTER_INGRESS static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { +#ifdef CONFIG_NETFILTER_INGRESS if (nf_hook_ingress_active(skb)) { if (*pt_prev) { *ret = deliver_skb(skb, *pt_prev, orig_dev); @@ -3760,15 +3752,9 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, return nf_hook_ingress(skb); } +#endif /* CONFIG_NETFILTER_INGRESS */ return 0; } -#else -static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, - int *ret, struct net_device *orig_dev) -{ - return 0; -} -#endif static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) { -- cgit v1.2.3 From 04fd61ab36ec065e194ab5e74ae34a5240d992bb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 19 May 2015 16:59:03 -0700 Subject: bpf: 
allow bpf programs to tail-call other bpf programs introduce bpf_tail_call(ctx, &jmp_table, index) helper function which can be used from BPF programs like: int bpf_prog(struct pt_regs *ctx) { ... bpf_tail_call(ctx, &jmp_table, index); ... } that is roughly equivalent to: int bpf_prog(struct pt_regs *ctx) { ... if (jmp_table[index]) return (*jmp_table[index])(ctx); ... } The important detail is that it's not a normal call, but a tail call. The kernel stack is precious, so this helper reuses the current stack frame and jumps into another BPF program without adding an extra call frame. It's trivially done in the interpreter and a bit trickier in JITs. In case of x64 JIT the bigger part of generated assembler prologue is common for all programs, so it is simply skipped while jumping. Other JITs can do similar prologue-skipping optimization or do stack unwind before jumping into the next program. bpf_tail_call() arguments: ctx - context pointer jmp_table - one of BPF_MAP_TYPE_PROG_ARRAY maps used as the jump table index - index in the jump table Since all BPF programs are identified by file descriptors, user space needs to populate the jmp_table with FDs of other BPF programs. If jmp_table[index] is empty the bpf_tail_call() doesn't jump anywhere and program execution continues as normal. New BPF_MAP_TYPE_PROG_ARRAY map type is introduced so that user space can populate this jmp_table array with FDs of other bpf programs. Programs can share the same jmp_table array or use multiple jmp_tables. The chain of tail calls can form unpredictable dynamic loops, therefore tail_call_cnt is used to limit the number of calls and currently is set to 32. Use cases: Acked-by: Daniel Borkmann ========== - simplify complex programs by splitting them into a sequence of small programs - dispatch routine For tracing and future seccomp the program may be triggered on all system calls, but processing of syscall arguments will be different. It's more efficient to implement them as: int syscall_entry(struct seccomp_data *ctx) { ... bpf_tail_call(ctx, &syscall_jmp_table, ctx->nr /* syscall number */); ... default: process unknown syscall ... } int sys_write_event(struct seccomp_data *ctx) {...} int sys_read_event(struct seccomp_data *ctx) {...} syscall_jmp_table[__NR_write] = sys_write_event; syscall_jmp_table[__NR_read] = sys_read_event; For networking the program may call into different parsers depending on packet format, like: int packet_parser(struct __sk_buff *skb) { ... parse L2, L3 here ... __u8 ipproto = load_byte(skb, ... offsetof(struct iphdr, protocol)); bpf_tail_call(skb, &ipproto_jmp_table, ipproto); ... default: process unknown protocol ... } int parse_tcp(struct __sk_buff *skb) {...} int parse_udp(struct __sk_buff *skb) {...} ipproto_jmp_table[IPPROTO_TCP] = parse_tcp; ipproto_jmp_table[IPPROTO_UDP] = parse_udp; - for TC use case, bpf_tail_call() allows implementing reclassify-like logic - bpf_map_update_elem/delete calls into BPF_MAP_TYPE_PROG_ARRAY jump table are atomic, so user space can build chains of BPF programs on the fly Implementation details: ======================= - high performance of bpf_tail_call() is the goal. It could have been implemented without JIT changes as a wrapper on top of BPF_PROG_RUN() macro, but with two downsides: . all programs would have to pay performance penalty for this feature and tail call itself would be slower, since mandatory stack unwind, return, stack allocate would be done for every tailcall. .
tailcall would be limited to programs running preempt_disabled, since generic 'void *ctx' doesn't have room for 'tail_call_cnt' and it would need to be either global per_cpu variable accessed by helper and by wrapper or global variable protected by locks. In this implementation x64 JIT bypasses stack unwind and jumps into the callee program after prologue. - bpf_prog_array_compatible() ensures that prog_type of callee and caller are the same and JITed/non-JITed flag is the same, since calling JITed program from non-JITed is invalid, since stack frames are different. Similarly calling kprobe type program from socket type program is invalid. - jump table is implemented as BPF_MAP_TYPE_PROG_ARRAY to reuse 'map' abstraction, its user space API and all of verifier logic. It's in the existing arraymap.c file, since several functions are shared with regular array map. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 22 +++++++++ include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 10 +++++ kernel/bpf/arraymap.c | 113 ++++++++++++++++++++++++++++++++++++++++++++--- kernel/bpf/core.c | 73 +++++++++++++++++++++++++++++- kernel/bpf/syscall.c | 23 +++++++++- kernel/bpf/verifier.c | 17 +++++++ kernel/trace/bpf_trace.c | 2 + net/core/filter.c | 2 + 9 files changed, 255 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d5cda067115a..8821b9a8689e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -126,6 +126,27 @@ struct bpf_prog_aux { struct work_struct work; }; +struct bpf_array { + struct bpf_map map; + u32 elem_size; + /* 'ownership' of prog_array is claimed by the first program that + * is going to use this map or by the first program which FD is stored + * in the map to make sure that all callers and callees have the same + * prog_type and JITed flag + */ + enum bpf_prog_type owner_prog_type; + bool owner_jited; + union { + char value[0] __aligned(8); + struct bpf_prog *prog[0] __aligned(8); + }; +}; +#define MAX_TAIL_CALL_CNT 32 + +u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); +void bpf_prog_array_map_clear(struct bpf_map *map); +bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); + #ifdef CONFIG_BPF_SYSCALL void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl); @@ -160,5 +181,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; +extern const struct bpf_func_proto bpf_tail_call_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 200be4a74a33..17724f6ea983 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -378,7 +378,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) int sk_filter(struct sock *sk, struct sk_buff *skb); -void bpf_prog_select_runtime(struct bpf_prog *fp); +int bpf_prog_select_runtime(struct bpf_prog *fp); void bpf_prog_free(struct bpf_prog *fp); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9ebdf5701e8..f0a9af8b4dae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -113,6 +113,7 @@ enum bpf_map_type { BPF_MAP_TYPE_UNSPEC, BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, }; enum bpf_prog_type { @@ 
-210,6 +211,15 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_l4_csum_replace, + + /** + * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program + * @ctx: context pointer passed to next program + * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY + * @index: index inside array that selects specific program to run + * Return: 0 on success + */ + BPF_FUNC_tail_call, __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 8a6616583f38..614bcd4c1d74 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -14,12 +14,7 @@ #include #include #include - -struct bpf_array { - struct bpf_map map; - u32 elem_size; - char value[0] __aligned(8); -}; +#include /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) @@ -154,3 +149,109 @@ static int __init register_array_map(void) return 0; } late_initcall(register_array_map); + +static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) +{ + /* only bpf_prog file descriptors can be stored in prog_array map */ + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + return array_map_alloc(attr); +} + +static void prog_array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + synchronize_rcu(); + + /* make sure it's empty */ + for (i = 0; i < array->map.max_entries; i++) + BUG_ON(array->prog[i] != NULL); + kvfree(array); +} + +static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* only called from syscall */ +static int prog_array_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog, *old_prog; + u32 index = *(u32 *)key, ufd; + + if (map_flags != BPF_ANY) + return -EINVAL; + + if (index >= array->map.max_entries) + return -E2BIG; + + ufd = *(u32 *)value; + prog = bpf_prog_get(ufd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (!bpf_prog_array_compatible(array, prog)) { + bpf_prog_put(prog); + return -EINVAL; + } + + old_prog = xchg(array->prog + index, prog); + if (old_prog) + bpf_prog_put(old_prog); + + return 0; +} + +static int prog_array_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *old_prog; + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return -E2BIG; + + old_prog = xchg(array->prog + index, NULL); + if (old_prog) { + bpf_prog_put(old_prog); + return 0; + } else { + return -ENOENT; + } +} + +/* decrement refcnt of all bpf_progs that are stored in this map */ +void bpf_prog_array_map_clear(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + int i; + + for (i = 0; i < array->map.max_entries; i++) + prog_array_map_delete_elem(map, &i); +} + +static const struct bpf_map_ops prog_array_ops = { + .map_alloc = prog_array_map_alloc, + .map_free = prog_array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = prog_array_map_lookup_elem, + .map_update_elem = prog_array_map_update_elem, + .map_delete_elem = prog_array_map_delete_elem, +}; + +static struct bpf_map_type_list prog_array_type __read_mostly = { + .ops = &prog_array_ops, + .type = BPF_MAP_TYPE_PROG_ARRAY, +}; + +static int __init register_prog_array_map(void) +{ + bpf_register_map_type(&prog_array_type); + return 0; +} 
+late_initcall(register_prog_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 54f0e7fcd0e2..d44b25cbe460 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -176,6 +176,15 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return 0; } +const struct bpf_func_proto bpf_tail_call_proto = { + .func = NULL, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -244,6 +253,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, + [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, @@ -286,6 +296,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; + u32 tail_call_cnt = 0; void *ptr; int off; @@ -431,6 +442,30 @@ select_insn: BPF_R4, BPF_R5); CONT; + JMP_TAIL_CALL: { + struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; + struct bpf_array *array = container_of(map, struct bpf_array, map); + struct bpf_prog *prog; + u64 index = BPF_R3; + + if (unlikely(index >= array->map.max_entries)) + goto out; + + if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + goto out; + + tail_call_cnt++; + + prog = READ_ONCE(array->prog[index]); + if (unlikely(!prog)) + goto out; + + ARG1 = BPF_R1; + insn = prog->insnsi; + goto select_insn; +out: + CONT; + } /* JMP */ JMP_JA: insn += insn->off; @@ -619,6 +654,40 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog) { } +bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) +{ + if (array->owner_prog_type) { + if (array->owner_prog_type != fp->type) + return false; + if (array->owner_jited != fp->jited) + return false; + } else { + array->owner_prog_type = fp->type; + array->owner_jited = fp->jited; + } + return true; +} + +static int check_tail_call(const struct bpf_prog *fp) +{ + struct bpf_prog_aux *aux = fp->aux; + int i; + + for (i = 0; i < aux->used_map_cnt; i++) { + struct bpf_array *array; + struct bpf_map *map; + + map = aux->used_maps[i]; + if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + continue; + array = container_of(map, struct bpf_array, map); + if (!bpf_prog_array_compatible(array, fp)) + return -EINVAL; + } + + return 0; +} + /** * bpf_prog_select_runtime - select execution runtime for BPF program * @fp: bpf_prog populated with internal BPF program @@ -626,7 +695,7 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog) * try to JIT internal BPF program, if JIT is not available select interpreter * BPF program will be executed via BPF_PROG_RUN() macro */ -void bpf_prog_select_runtime(struct bpf_prog *fp) +int bpf_prog_select_runtime(struct bpf_prog *fp) { fp->bpf_func = (void *) __bpf_prog_run; @@ -634,6 +703,8 @@ void bpf_prog_select_runtime(struct bpf_prog *fp) bpf_int_jit_compile(fp); /* Lock whole bpf_prog as read-only */ bpf_prog_lock_ro(fp); + + return check_tail_call(fp); } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3bae6c591914..98a69bd83069 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -68,6 +68,12 @@ static int bpf_map_release(struct inode *inode, struct file *filp) { 
struct bpf_map *map = filp->private_data; + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + /* prog_array stores refcnt-ed bpf_prog pointers + * release them all when user space closes prog_array_fd + */ + bpf_prog_array_map_clear(map); + bpf_map_put(map); return 0; } @@ -392,6 +398,19 @@ static void fixup_bpf_calls(struct bpf_prog *prog) */ BUG_ON(!prog->aux->ops->get_func_proto); + if (insn->imm == BPF_FUNC_tail_call) { + /* mark bpf_tail_call as different opcode + * to avoid conditional branch in + * interpeter for every normal call + * and to prevent accidental JITing by + * JIT compiler that doesn't support + * bpf_tail_call yet + */ + insn->imm = 0; + insn->code |= BPF_X; + continue; + } + fn = prog->aux->ops->get_func_proto(insn->imm); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions @@ -532,7 +551,9 @@ static int bpf_prog_load(union bpf_attr *attr) fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ - bpf_prog_select_runtime(prog); + err = bpf_prog_select_runtime(prog); + if (err < 0) + goto free_used_maps; err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); if (err < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 47dcd3aa6e23..cfd9a40b9a5a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -907,6 +907,23 @@ static int check_call(struct verifier_env *env, int func_id) fn->ret_type, func_id); return -EINVAL; } + + if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && + func_id != BPF_FUNC_tail_call) + /* prog_array map type needs extra care: + * only allow to pass it into bpf_tail_call() for now. + * bpf_map_delete_elem() can be allowed in the future, + * while bpf_map_update_elem() must only be done via syscall + */ + return -EINVAL; + + if (func_id == BPF_FUNC_tail_call && + map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + /* don't allow any other map type to be passed into + * bpf_tail_call() + */ + return -EINVAL; + return 0; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 2d56ce501632..646445e41bd4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -172,6 +172,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; case BPF_FUNC_trace_printk: /* diff --git a/net/core/filter.c b/net/core/filter.c index 6805717be614..3adcca6f17a4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1421,6 +1421,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; default: return NULL; } -- cgit v1.2.3 From 12c227ec89a70c14e3efcf102c5babece381e172 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 22 May 2015 11:05:58 +0200 Subject: flow_dissector: do not break if ports are not needed in flowlabel This restores the previous behaviour. If the caller does not want ports to be filled, we should not break. Fixes: 06635a35d13d ("flow_dissect: use programable dissector in skb_flow_dissect and friends") Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- net/core/flow_dissector.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 703d05916a80..1f2d89300b1a 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -221,13 +221,13 @@ flow_label: key_basic->ip_proto = ip_proto; key_basic->thoff = (u16)nhoff; - if (!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) - break; - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS, - target_container); - key_ports->ports = flow_label; + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->ports = flow_label; + } return true; } -- cgit v1.2.3 From d079abd181950a44cdf31daafd1662388a6c4d2e Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 May 2015 12:16:11 +0200 Subject: pktgen: adjust spacing in proc file interface output Too many spaces were introduced in commit 63adc6fb8ac0 ("pktgen: cleanup checkpatch warnings"), thus misaligning "src_min:" with the other columns. Fixes: 63adc6fb8ac0 ("pktgen: cleanup checkpatch warnings") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 62f979984a23..167e59243fea 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -572,7 +572,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) " dst_min: %s dst_max: %s\n", pkt_dev->dst_min, pkt_dev->dst_max); seq_printf(seq, - " src_min: %s src_max: %s\n", + " src_min: %s src_max: %s\n", pkt_dev->src_min, pkt_dev->src_max); } -- cgit v1.2.3 From 4020726479ff799318a5aa188b81d79df86a0ea3 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 21 May 2015 12:16:56 +0200 Subject: pktgen: make /proc/net/pktgen/pgctrl report fail on invalid input Giving /proc/net/pktgen/pgctrl an invalid command just returns shell success and prints a warning in dmesg. This is not very useful for shell scripting, as it can only detect the error by parsing dmesg. Instead return -EINVAL when the command is unknown, as this provides userspace shell scripts a way of detecting this. Also bump the version tag to 2.75, because (1) reading /proc/net/pktgen/pgctrl outputs this version number, which allows detecting this small semantic change, and (2) the pktgen version tag has not been updated since 2010. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S.
Miller --- net/core/pktgen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 167e59243fea..676550f34560 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -177,7 +177,7 @@ #include #include /* do_div */ -#define VERSION "2.74" +#define VERSION "2.75" #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) @@ -512,7 +512,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, pktgen_reset_all_threads(pn); else - pr_warn("Unknown command: %s\n", data); + return -EINVAL; return count; } -- cgit v1.2.3 From be12a1fe298e8be04d5215364f94654dff81b0bc Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 16:59:58 +0200 Subject: net: skbuff: add skb_append_pagefrags and use it Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ net/core/skbuff.c | 18 ++++++++++++++++++ net/ipv4/ip_output.c | 15 ++++----------- 3 files changed, 25 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b617095adb88..f708936cdd23 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -861,6 +861,9 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, int len, int odd, struct sk_buff *skb), void *from, int length); +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size); + struct skb_seq_state { __u32 lower_offset; __u32 upper_offset; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f3fe9bd9e672..4f2babeaf18d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2915,6 +2915,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, } EXPORT_SYMBOL(skb_append_datato_frags); +int skb_append_pagefrags(struct sk_buff *skb, struct page *page, + int offset, size_t size) +{ + int i = skb_shinfo(skb)->nr_frags; + + if (skb_can_coalesce(skb, i, page, offset)) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size); + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, size); + } else { + return -EMSGSIZE; + } + + return 0; +} +EXPORT_SYMBOL_GPL(skb_append_pagefrags); + /** * skb_pull_rcsum - pull skb and update receive checksum * @skb: buffer to update diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d91b922fcfe..451b009dae75 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1233,11 +1233,9 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, } while (size > 0) { - int i; - - if (skb_is_gso(skb)) + if (skb_is_gso(skb)) { len = size; - else { + } else { /* Check if the remaining data fits into current packet. 
*/ len = mtu - skb->len; @@ -1289,15 +1287,10 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page, continue; } - i = skb_shinfo(skb)->nr_frags; if (len > size) len = size; - if (skb_can_coalesce(skb, i, page, offset)) { - skb_frag_size_add(&skb_shinfo(skb)->frags[i-1], len); - } else if (i < MAX_SKB_FRAGS) { - get_page(page); - skb_fill_page_desc(skb, i, page, offset, len); - } else { + + if (skb_append_pagefrags(skb, page, offset, len)) { err = -EMSGSIZE; goto error; } -- cgit v1.2.3 From a60e3cc7c92973a31fad0fd04dc5cf4355d3d1ef Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 17:00:00 +0200 Subject: net: make skb_splice_bits more configureable Prepare skb_splice_bits to be able to deal with AF_UNIX sockets. AF_UNIX sockets don't use lock_sock/release_sock and thus we have to use a callback to make the locking and unlocking configurable. Signed-off-by: Hannes Frederic Sowa Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 +++++++++-- net/core/skbuff.c | 45 ++++++++++++++++++++++++++++----------------- net/ipv4/tcp.c | 5 +++-- 3 files changed, 40 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f708936cdd23..6b41c15efa27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -35,6 +35,7 @@ #include #include #include +#include /* A. Checksumming of received packets by device. * @@ -2699,9 +2700,15 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len); int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len); __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, __wsum csum); -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +ssize_t skb_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd); +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int len, - unsigned int flags); + unsigned int flags, + ssize_t (*splice_cb)(struct sock *, + struct pipe_inode_info *, + struct splice_pipe_desc *)); void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); unsigned int skb_zerocopy_headlen(const struct sk_buff *from); int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4f2babeaf18d..02769fa4f5c8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1870,15 +1870,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, return false; } +ssize_t skb_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + + /* Drop the socket lock, otherwise we have reverse + * locking dependencies between sk_lock and i_mutex + * here as compared to sendfile(). We enter here + * with the socket lock held, and splice_to_pipe() will + * grab the pipe inode lock. For sendfile() emulation, + * we call into ->sendpage() with the i_mutex lock held + * and networking will grab the socket lock. + */ + release_sock(sk); + ret = splice_to_pipe(pipe, spd); + lock_sock(sk); + + return ret; +} + /* * Map data from the skb to a pipe. Should handle both the linear part, * the fragments, and the frag list. It does NOT handle frag lists within * the frag list, if such a thing exists. We'd probably need to recurse to * handle that cleanly.
*/ -int skb_splice_bits(struct sk_buff *skb, unsigned int offset, +int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, - unsigned int flags) + unsigned int flags, + ssize_t (*splice_cb)(struct sock *, + struct pipe_inode_info *, + struct splice_pipe_desc *)) { struct partial_page partial[MAX_SKB_FRAGS]; struct page *pages[MAX_SKB_FRAGS]; @@ -1891,7 +1915,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, .spd_release = sock_spd_release, }; struct sk_buff *frag_iter; - struct sock *sk = skb->sk; int ret = 0; /* @@ -1914,20 +1937,8 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, } done: - if (spd.nr_pages) { - /* - * Drop the socket lock, otherwise we have reverse - * locking dependencies between sk_lock and i_mutex - * here as compared to sendfile(). We enter here - * with the socket lock held, and splice_to_pipe() will - * grab the pipe inode lock. For sendfile() emulation, - * we call into ->sendpage() with the i_mutex lock held - * and networking will grab the socket lock. - */ - release_sock(sk); - ret = splice_to_pipe(pipe, &spd); - lock_sock(sk); - } + if (spd.nr_pages) + ret = splice_cb(sk, pipe, &spd); return ret; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90afcec3f427..65f791f74845 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -695,8 +695,9 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), - tss->flags); + ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, + min(rd_desc->count, len), tss->flags, + skb_socket_splice); if (ret > 0) rd_desc->count -= ret; return ret; -- cgit v1.2.3 From 2b514574f7e88c8498027ee366fd6e7aae5aa4b5 Mon Sep 17 00:00:00 2001 From: Hannes Frederic Sowa Date: Thu, 21 May 2015 17:00:01 +0200 Subject: net: af_unix: implement splice for stream af_unix sockets unix_stream_recvmsg is refactored to unix_stream_read_generic in this patch and enhanced to deal with pipe splicing. The refactoring is negligible; we mostly have to deal with a non-existing struct msghdr argument. Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S.
Miller --- fs/splice.c | 1 + net/core/skbuff.c | 1 + net/unix/af_unix.c | 140 +++++++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 121 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/fs/splice.c b/fs/splice.c index bfe62ae40f40..4f355a1c1a9e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -261,6 +261,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, return ret; } +EXPORT_SYMBOL_GPL(splice_to_pipe); void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 02769fa4f5c8..9bac0e6f8dfa 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1942,6 +1942,7 @@ done: return ret; } +EXPORT_SYMBOL_GPL(skb_splice_bits); /** * skb_store_bits - store bits from kernel buffer to skb diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7762c0b46721..b8c44076c776 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -520,6 +520,9 @@ static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int); static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset, size_t size, int flags); +static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, + struct pipe_inode_info *, size_t size, + unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_dgram_connect(struct socket *, struct sockaddr *, @@ -561,6 +564,7 @@ static const struct proto_ops unix_stream_ops = { .recvmsg = unix_stream_recvmsg, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, + .splice_read = unix_stream_splice_read, .set_peek_off = unix_set_peek_off, }; @@ -1957,8 +1961,9 @@ out: * Sleep until more data has arrived. But check for races.. 
*/ static long unix_stream_data_wait(struct sock *sk, long timeo, - struct sk_buff *last) + struct sk_buff *last, unsigned int last_len) { + struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); @@ -1966,7 +1971,9 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - if (skb_peek_tail(&sk->sk_receive_queue) != last || + tail = skb_peek_tail(&sk->sk_receive_queue); + if (tail != last || + (tail && tail->len != last_len) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current) || @@ -1990,38 +1997,50 @@ static unsigned int unix_skb_len(const struct sk_buff *skb) return skb->len - UNIXCB(skb).consumed; } -static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, - size_t size, int flags) +struct unix_stream_read_state { + int (*recv_actor)(struct sk_buff *, int, int, + struct unix_stream_read_state *); + struct socket *socket; + struct msghdr *msg; + struct pipe_inode_info *pipe; + size_t size; + int flags; + unsigned int splice_flags; +}; + +static int unix_stream_read_generic(struct unix_stream_read_state *state) { struct scm_cookie scm; + struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); int copied = 0; + int flags = state->flags; int noblock = flags & MSG_DONTWAIT; - int check_creds = 0; + bool check_creds = false; int target; int err = 0; long timeo; int skip; + size_t size = state->size; + unsigned int last_len; err = -EINVAL; if (sk->sk_state != TCP_ESTABLISHED) goto out; err = -EOPNOTSUPP; - if (flags&MSG_OOB) + if (flags & MSG_OOB) goto out; - target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); timeo = sock_rcvtimeo(sk, noblock); + memset(&scm, 0, sizeof(scm)); + /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ - - memset(&scm, 0, sizeof(scm)); - err = mutex_lock_interruptible(&u->readlock); if (unlikely(err)) { /* recvmsg() in non blocking mode is supposed to return -EAGAIN @@ -2037,6 +2056,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, unix_state_lock(sk); last = skb = skb_peek(&sk->sk_receive_queue); + last_len = last ? 
last->len : 0; again: if (skb == NULL) { unix_sk(sk)->recursion_level = 0; @@ -2059,16 +2079,17 @@ again: break; mutex_unlock(&u->readlock); - timeo = unix_stream_data_wait(sk, timeo, last); + timeo = unix_stream_data_wait(sk, timeo, last, + last_len); - if (signal_pending(current) - || mutex_lock_interruptible(&u->readlock)) { + if (signal_pending(current) || + mutex_lock_interruptible(&u->readlock)) { err = sock_intr_errno(timeo); goto out; } continue; - unlock: +unlock: unix_state_unlock(sk); break; } @@ -2077,6 +2098,7 @@ again: while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); last = skb; + last_len = skb->len; skb = skb_peek_next(skb, &sk->sk_receive_queue); if (!skb) goto again; @@ -2093,18 +2115,20 @@ again: } else if (test_bit(SOCK_PASSCRED, &sock->flags)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - check_creds = 1; + check_creds = true; } /* Copy address just once */ - if (sunaddr) { - unix_copy_addr(msg, skb->sk); + if (state->msg && state->msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, + state->msg->msg_name); + unix_copy_addr(state->msg, skb->sk); sunaddr = NULL; } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); - if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, - msg, chunk)) { + chunk = state->recv_actor(skb, skip, chunk, state); + if (chunk < 0) { if (copied == 0) copied = -EFAULT; break; @@ -2142,11 +2166,85 @@ again: } while (size); mutex_unlock(&u->readlock); - scm_recv(sock, msg, &scm, flags); + if (state->msg) + scm_recv(sock, state->msg, &scm, flags); + else + scm_destroy(&scm); out: return copied ? : err; } +static int unix_stream_read_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + int ret; + + ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, + state->msg, chunk); + return ret ?: chunk; +} + +static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sock, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state); +} + +static ssize_t skb_unix_socket_splice(struct sock *sk, + struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) +{ + int ret; + struct unix_sock *u = unix_sk(sk); + + mutex_unlock(&u->readlock); + ret = splice_to_pipe(pipe, spd); + mutex_lock(&u->readlock); + + return ret; +} + +static int unix_stream_splice_actor(struct sk_buff *skb, + int skip, int chunk, + struct unix_stream_read_state *state) +{ + return skb_splice_bits(skb, state->socket->sk, + UNIXCB(skb).consumed + skip, + state->pipe, chunk, state->splice_flags, + skb_unix_socket_splice); +} + +static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t size, unsigned int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_splice_actor, + .socket = sock, + .pipe = pipe, + .size = size, + .splice_flags = flags, + }; + + if (unlikely(*ppos)) + return -ESPIPE; + + if (sock->file->f_flags & O_NONBLOCK || + flags & SPLICE_F_NONBLOCK) + state.flags = MSG_DONTWAIT; + + return unix_stream_read_generic(&state); +} + static int unix_shutdown(struct socket *sock, int mode) { struct sock *sk = sock->sk; -- cgit v1.2.3 From d4969581452be430135ba5040c5b6c9023c1d461 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2015 16:06:37 -0700 Subject: pktgen: remove one sparse 
error net/core/pktgen.c:2672:43: warning: incorrect type in assignment (different base types) net/core/pktgen.c:2672:43: expected unsigned short [unsigned] [short] [usertype] net/core/pktgen.c:2672:43: got restricted __be16 [usertype] protocol Let's use proper struct ethhdr instead of hard coding everything. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/pktgen.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 676550f34560..d93cbc5715f4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2645,9 +2645,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; if (x) { - int ret; - __u8 *eth; + struct ethhdr *eth; struct iphdr *iph; + int ret; nhead = x->props.header_len - skb_headroom(skb); if (nhead > 0) { @@ -2667,9 +2667,9 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, goto err; } /* restore ll */ - eth = (__u8 *) skb_push(skb, ETH_HLEN); - memcpy(eth, pkt_dev->hh, 12); - *(u16 *) ð[12] = protocol; + eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); + memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN); + eth->h_proto = protocol; /* Update IPv4 header len as well as checksum value */ iph = ip_hdr(skb); -- cgit v1.2.3 From 68319052d10d18462fbfd5571a5062cb7ec50788 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2015 18:55:48 -0700 Subject: net: remove a sparse error in secure_dccpv6_sequence_number() make C=2 CF=-D__CHECK_ENDIAN__ net/core/secure_seq.o net/core/secure_seq.c:157:50: warning: restricted __be32 degrades to integer Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/secure_seq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index 51dd3193a33e..fd3ce461fbe6 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -154,7 +154,7 @@ u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, net_secret_init(); memcpy(hash, saddr, 16); for (i = 0; i < 4; i++) - secret[i] = net_secret[i] + daddr[i]; + secret[i] = net_secret[i] + (__force u32)daddr[i]; secret[4] = net_secret[4] + (((__force u16)sport << 16) + (__force u16)dport); for (i = 5; i < MD5_MESSAGE_BYTES / 4; i++) -- cgit v1.2.3 From 05c985436dfe81ab784251a54dd69be5b2dea98a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 May 2015 18:50:01 -0700 Subject: net: fix inet_proto_csum_replace4() sparse errors make C=2 CF=-D__CHECK_ENDIAN__ net/core/utils.o ... 
net/core/utils.c:307:72: warning: incorrect type in argument 2 (different base types) net/core/utils.c:307:72: expected restricted __wsum [usertype] addend net/core/utils.c:307:72: got restricted __be32 [usertype] from net/core/utils.c:308:34: warning: incorrect type in argument 2 (different base types) net/core/utils.c:308:34: expected restricted __wsum [usertype] addend net/core/utils.c:308:34: got restricted __be32 [usertype] to net/core/utils.c:310:70: warning: incorrect type in argument 2 (different base types) net/core/utils.c:310:70: expected restricted __wsum [usertype] addend net/core/utils.c:310:70: got restricted __be32 [usertype] from net/core/utils.c:310:77: warning: incorrect type in argument 2 (different base types) net/core/utils.c:310:77: expected restricted __wsum [usertype] addend net/core/utils.c:310:77: got restricted __be32 [usertype] to net/core/utils.c:312:72: warning: incorrect type in argument 2 (different base types) net/core/utils.c:312:72: expected restricted __wsum [usertype] addend net/core/utils.c:312:72: got restricted __be32 [usertype] from net/core/utils.c:313:35: warning: incorrect type in argument 2 (different base types) net/core/utils.c:313:35: expected restricted __wsum [usertype] addend net/core/utils.c:313:35: got restricted __be32 [usertype] to Note we can use the csum_replace4() helper Fixes: 58e3cac5613aa ("net: optimise inet_proto_csum_replace4()") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/utils.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/utils.c b/net/core/utils.c index 7b803884c162..a7732a068043 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -304,13 +304,15 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, int pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { - *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), from), - to)); + csum_replace4(sum, from, to); if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) - skb->csum = ~csum_add(csum_sub(~(skb->csum), from), to); + skb->csum = ~csum_add(csum_sub(~(skb->csum), + (__force __wsum)from), + (__force __wsum)to); } else if (pseudohdr) - *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), from), - to)); + *sum = ~csum_fold(csum_add(csum_sub(csum_unfold(*sum), + (__force __wsum)from), + (__force __wsum)to)); } EXPORT_SYMBOL(inet_proto_csum_replace4); -- cgit v1.2.3 From d6a4e26afb80c049e7f94e1b7b506dcda61eee88 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 May 2015 08:55:28 -0700 Subject: tcp: tcp_tso_autosize() minimum is one packet By making sure the sk->sk_gso_max_segs minimum value is one, and the sysctl_tcp_min_tso_segs minimum value is one as well, tcp_tso_autosize() will return a non-zero value. We can then revert 843925f33fcc293d80acf2c5c8a78adf3344d49b ("tcp: Do not apply TSO segment limit to non-TSO packets") and save a few cpu cycles in the fast path. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Herbert Xu Acked-by: Neal Cardwell Acked-by: Herbert Xu Signed-off-by: David S.
Miller --- net/core/sock.c | 5 ++++- net/ipv4/sysctl_net_ipv4.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 29124fcdc42a..e72633c346b1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1581,6 +1581,8 @@ EXPORT_SYMBOL_GPL(sk_clone_lock); void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + u32 max_segs = 1; + __sk_dst_set(sk, dst); sk->sk_route_caps = dst->dev->features; if (sk->sk_route_caps & NETIF_F_GSO) @@ -1592,9 +1594,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = dst->dev->gso_max_size; - sk->sk_gso_max_segs = dst->dev->gso_max_segs; + max_segs = max_t(u32, dst->dev->gso_max_segs, 1); } } + sk->sk_gso_max_segs = max_segs; } EXPORT_SYMBOL_GPL(sk_setup_caps); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 841de32f1fee..e64892769607 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -702,7 +702,7 @@ static struct ctl_table ipv4_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, + .extra1 = &one, .extra2 = &gso_max_segs, }, { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 534e5fdb04c1..190538a2a88c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2088,7 +2088,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; - if (tso_segs == 1 || !max_segs) { + if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? nonagle : TCP_NAGLE_PUSH)))) @@ -2101,7 +2101,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, } limit = mss_now; - if (tso_segs > 1 && max_segs && !tcp_urg_mode(tp)) + if (tso_segs > 1 && !tcp_urg_mode(tp)) limit = tcp_mss_split_point(sk, skb, mss_now, min_t(unsigned int, cwnd_quota, -- cgit v1.2.3 From 37e82c2f974b72c9ab49c787ef7b5bb1aec12768 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 27 May 2015 15:30:39 -0700 Subject: bpf: allow BPF programs to access skb->skb_iif and skb->dev->ifindex fields Classic BPF already exposes skb->dev->ifindex via the SKF_AD_IFINDEX extension. Allow eBPF programs to access it as well. Note that classic BPF aborts execution of the program if 'skb->dev == NULL' (which is inconvenient for program writers), whereas eBPF returns zero in such a case. Also expose the 'skb_iif' field, since programs triggered by a redirected packet need to know the original interface index. Summary: __skb->ifindex -> skb->dev->ifindex __skb->ingress_ifindex -> skb->skb_iif Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S.
Miller --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f0a9af8b4dae..72f3080afa1e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -236,6 +236,8 @@ struct __sk_buff { __u32 vlan_tci; __u32 vlan_proto; __u32 priority; + __u32 ingress_ifindex; + __u32 ifindex; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/filter.c b/net/core/filter.c index 3adcca6f17a4..2c30d6632d66 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1499,6 +1499,24 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, offsetof(struct sk_buff, priority)); break; + case offsetof(struct __sk_buff, ingress_ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, skb_iif)); + break; + + case offsetof(struct __sk_buff, ifindex): + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), + dst_reg, src_reg, + offsetof(struct sk_buff, dev)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, + offsetof(struct net_device, ifindex)); + break; + case offsetof(struct __sk_buff, mark): return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); -- cgit v1.2.3 From 282c320d33541d61f4ab91248448040c4ebab1fc Mon Sep 17 00:00:00 2001 From: Wang Long Date: Fri, 29 May 2015 01:02:08 +0000 Subject: netevent: remove automatic variable in register_netevent_notifier() Remove automatic variable 'err' in register_netevent_notifier() and return the result of atomic_notifier_chain_register() directly. Signed-off-by: Wang Long Signed-off-by: David S. Miller --- net/core/netevent.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/netevent.c b/net/core/netevent.c index f17ccd291d39..8b3bc4fac613 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -31,10 +31,7 @@ static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain); */ int register_netevent_notifier(struct notifier_block *nb) { - int err; - - err = atomic_notifier_chain_register(&netevent_notif_chain, nb); - return err; + return atomic_notifier_chain_register(&netevent_notif_chain, nb); } EXPORT_SYMBOL_GPL(register_netevent_notifier); -- cgit v1.2.3 From 17ca8cbf49be3aa94bb1c2b7ee6545fd70094eb4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 29 May 2015 23:23:06 +0200 Subject: ebpf: allow bpf_ktime_get_ns_proto also for networking As this is already exported from the tracing side via commit d9847d310ab4 ("tracing: Allow BPF programs to call bpf_ktime_get_ns()"), we might as well move it to the core so that networking users can make use of it too, e.g. to measure time deltas for certain flows from ingress to egress. Signed-off-by: Daniel Borkmann Cc: Alexei Starovoitov Cc: Ingo Molnar Signed-off-by: David S.
Miller --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 12 ------------ net/core/filter.c | 2 ++ 5 files changed, 17 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5f520f5f087e..ca854e5bb2f7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -186,5 +186,6 @@ extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; +extern const struct bpf_func_proto bpf_ktime_get_ns_proto; #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d44b25cbe460..4548422d5f6c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -734,6 +734,7 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bd7f5988ed9c..b3aaabdf9a50 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -13,6 +13,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -111,3 +112,15 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 646445e41bd4..50c4015a8ad3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -79,18 +79,6 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; -static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) -{ - /* NMI safe access to clock monotonic */ - return ktime_get_mono_fast_ns(); -} - -static const struct bpf_func_proto bpf_ktime_get_ns_proto = { - .func = bpf_ktime_get_ns, - .gpl_only = true, - .ret_type = RET_INTEGER, -}; - /* * limited trace_printk() * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed diff --git a/net/core/filter.c b/net/core/filter.c index 2c30d6632d66..b78a010a957f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1423,6 +1423,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; default: return NULL; } -- cgit v1.2.3 From bdef7de4b8d9be4cf7bf5aea977f827310ab3ff0 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 1 Jun 2015 14:56:09 -0700 Subject: net: Add priority to packet_offload objects. When we scan a packet for GRO processing, we want to see the most common packet types in the front of the offload_base list. So add a priority field so we can handle this properly. 
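For illustration, a hedged sketch (not part of this commit; the module, the ETH_P_ARP ethertype and the priority value 20 are invented for the example) of how a protocol module could register its offload under the new scheme:

#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>

/* Stub receive handler: hold nothing for GRO, flush nothing. */
static struct sk_buff **example_gro_receive(struct sk_buff **head,
					    struct sk_buff *skb)
{
	return NULL;
}

static struct packet_offload example_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_ARP),	/* arbitrary ethertype for the example */
	.priority = 20,			/* lower values are scanned first */
	.callbacks = {
		.gro_receive = example_gro_receive,
	},
};

static int __init example_init(void)
{
	dev_add_offload(&example_offload);
	return 0;
}

static void __exit example_exit(void)
{
	dev_remove_offload(&example_offload);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

dev_add_offload() walks offload_base and inserts the new entry in front of the first entry whose priority value is strictly greater, so the lowest values stay at the head of the list and entries sharing a priority keep their registration order.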
IPv4/IPv6 get the highest priority with the implicit zero priority field. Next comes ethernet with a priority of 10, and then we have the MPLS types with a priority of 15. Suggested-by: Eric Dumazet Suggested-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 8 ++++++-- net/ethernet/eth.c | 1 + net/mpls/mpls_gso.c | 2 ++ 4 files changed, 10 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 51f8d2f5dc3f..6f5f71ff5169 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1997,6 +1997,7 @@ struct offload_callbacks { struct packet_offload { __be16 type; /* This is really htons(ether_type). */ + u16 priority; struct offload_callbacks callbacks; struct list_head list; }; diff --git a/net/core/dev.c b/net/core/dev.c index 594163d0c6eb..0602e917a305 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -469,10 +469,14 @@ EXPORT_SYMBOL(dev_remove_pack); */ void dev_add_offload(struct packet_offload *po) { - struct list_head *head = &offload_base; + struct packet_offload *elem; spin_lock(&offload_lock); - list_add_rcu(&po->list, head); + list_for_each_entry(elem, &offload_base, list) { + if (po->priority < elem->priority) + break; + } + list_add_rcu(&po->list, elem->list.prev); spin_unlock(&offload_lock); } EXPORT_SYMBOL(dev_add_offload); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index c3325bd2f3fb..7d0e239a6755 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -470,6 +470,7 @@ EXPORT_SYMBOL(eth_gro_complete); static struct packet_offload eth_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_TEB), + .priority = 10, .callbacks = { .gro_receive = eth_gro_receive, .gro_complete = eth_gro_complete, diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 809df534a720..0183b32da942 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -62,6 +62,7 @@ out: static struct packet_offload mpls_mc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_MC), + .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, @@ -69,6 +70,7 @@ static struct packet_offload mpls_mc_offload __read_mostly = { static struct packet_offload mpls_uc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), + .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, -- cgit v1.2.3 From 3896d655f4d491c67d669a15f275a39f713410f8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Jun 2015 16:03:14 -0700 Subject: bpf: introduce bpf_clone_redirect() helper Allow eBPF programs attached to classifier/actions to call bpf_clone_redirect(skb, ifindex, flags) helper which will mirror or redirect the packet by dynamic ifindex selection from within the program to a target device either at ingress or at egress. Can be used for various scenarios, for example, to load balance skbs into veths, split parts of the traffic to local taps, etc. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/uapi/linux/bpf.h | 10 ++++++++++ net/core/filter.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 72f3080afa1e..42aa19abab86 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,16 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_tail_call, + + /** + * bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev + * @skb: pointer to skb + * @ifindex: ifindex of the net device + * @flags: bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * Return: 0 on success + */ + BPF_FUNC_clone_redirect, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index b78a010a957f..64c121c09655 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -46,6 +46,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -1407,6 +1408,43 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = { .arg5_type = ARG_ANYTHING, }; +#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1) + +static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; + struct net_device *dev; + + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); + if (unlikely(!dev)) + return -EINVAL; + + if (unlikely(!(dev->flags & IFF_UP))) + return -EINVAL; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!skb2)) + return -ENOMEM; + + if (G_TC_AT(skb2->tc_verd) & AT_INGRESS) + skb_push(skb2, skb2->mac_len); + + if (BPF_IS_REDIRECT_INGRESS(flags)) + return dev_forward_skb(dev, skb2); + + skb2->dev = dev; + return dev_queue_xmit(skb2); +} + +const struct bpf_func_proto bpf_clone_redirect_proto = { + .func = bpf_clone_redirect, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1440,6 +1478,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: return &bpf_l4_csum_replace_proto; + case BPF_FUNC_clone_redirect: + return &bpf_clone_redirect_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 94db13fe5ff144ea5afcfa3a3daf38a239090acf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 08:33:48 -0700 Subject: bpf: fix build due to missing tc_verd fix build error: net/core/filter.c: In function 'bpf_clone_redirect': net/core/filter.c:1429:18: error: 'struct sk_buff' has no member named 'tc_verd' if (G_TC_AT(skb2->tc_verd) & AT_INGRESS) Fixes: 3896d655f4d4 ("bpf: introduce bpf_clone_redirect() helper") Reported-by: Or Gerlitz Reported-by: Fengguang Wu Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- net/core/filter.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 64c121c09655..09b2062eb5b8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -46,7 +46,6 @@ #include #include #include -#include /** * sk_filter - run a packet through a socket filter @@ -1426,8 +1425,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!skb2)) return -ENOMEM; - if (G_TC_AT(skb2->tc_verd) & AT_INGRESS) - skb_push(skb2, skb2->mac_len); + skb_push(skb2, skb2->data - skb_mac_header(skb2)); if (BPF_IS_REDIRECT_INGRESS(flags)) return dev_forward_skb(dev, skb2); -- cgit v1.2.3 From ce3b5355477ce99bffa60a6a215f2e11db4b649c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:36 -0700 Subject: net: Simplify GRE case in flow_dissector Break out when we see the routing flag or a non-zero version number in the GRE header. Acked-by: Jiri Pirko Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1f2d89300b1a..7f699169dc92 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -308,30 +308,30 @@ flow_label: * Only look inside GRE if version zero and no * routing */ - if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) { - proto = hdr->proto; + if (hdr->flags & (GRE_VERSION | GRE_ROUTING)) + break; + + proto = hdr->proto; + nhoff += 4; + if (hdr->flags & GRE_CSUM) nhoff += 4; - if (hdr->flags & GRE_CSUM) - nhoff += 4; - if (hdr->flags & GRE_KEY) - nhoff += 4; - if (hdr->flags & GRE_SEQ) - nhoff += 4; - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) - return false; - proto = eth->h_proto; - nhoff += sizeof(*eth); - } - goto again; + if (hdr->flags & GRE_KEY) + nhoff += 4; + if (hdr->flags & GRE_SEQ) + nhoff += 4; + if (proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, nhoff, + sizeof(_eth), + data, hlen, &_eth); + if (!eth) + return false; + proto = eth->h_proto; + nhoff += sizeof(*eth); } - break; + goto again; } case IPPROTO_IPIP: proto = htons(ETH_P_IP); -- cgit v1.2.3 From c468efe2c7d478bad8855f7d170cf245ee0f1b3f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:38 -0700 Subject: net: Remove superfluous setting of key_basic key_basic is set twice in __skb_flow_dissect, which seems unnecessary. Remove the second one. Acked-by: Jiri Pirko Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7f699169dc92..0763795bea8f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -343,12 +343,6 @@ flow_label: break; } - /* It is ensured by skb_flow_dissector_init() that basic key will - * be always present.
- */ key_basic = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_BASIC, - target_container); key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; key_basic->thoff = (u16) nhoff; -- cgit v1.2.3 From 42aecaa9bb2bd57eb8d61b4565cee5d3640863fb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:39 -0700 Subject: net: Get skb hash over flow_keys structure This patch changes flow hashing to use jhash2 over the flow_keys structure instead of just doing jhash_3words over src, dst, and ports. This method will allow us to take more input into the hashing function so that we can include full IPv6 addresses, VLAN, flow labels etc. without needing to resort to xor'ing, which makes for a poor hash. Acked-by: Jiri Pirko Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- include/net/flow_dissector.h | 21 ++++++++++++++--- include/net/ip.h | 2 ++ include/net/ipv6.h | 2 ++ net/core/flow_dissector.c | 54 +++++++++++++++++++++++++++++++++----------- net/sched/cls_flower.c | 2 ++ 6 files changed, 66 insertions(+), 17 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6b41c15efa27..cc612fc0a894 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1943,7 +1943,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; else if (skb_flow_dissect_flow_keys(skb, &keys)) - skb_set_transport_header(skb, keys.basic.thoff); + skb_set_transport_header(skb, keys.control.thoff); else skb_set_transport_header(skb, offset_hint); } diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index bac9c1421f58..cba6a10b214a 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -6,6 +6,15 @@ #include #include +/** + * struct flow_dissector_key_control: + * @thoff: Transport header offset + */ +struct flow_dissector_key_control { + u16 thoff; + u16 padding; +}; + /** * struct flow_dissector_key_basic: * @thoff: Transport header offset @@ -13,9 +22,9 @@ * @ip_proto: Transport header protocol (eg.
TCP/UDP) */ struct flow_dissector_key_basic { - u16 thoff; __be16 n_proto; u8 ip_proto; + u8 padding; }; /** @@ -70,6 +79,7 @@ struct flow_dissector_key_eth_addrs { }; enum flow_dissector_key_id { + FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */ @@ -109,11 +119,16 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb, } struct flow_keys { - struct flow_dissector_key_addrs addrs; - struct flow_dissector_key_ports ports; + struct flow_dissector_key_control control; +#define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic; + struct flow_dissector_key_ports ports; + struct flow_dissector_key_addrs addrs; }; +#define FLOW_KEYS_HASH_OFFSET \ + offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD) + extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_buf_dissector; diff --git a/include/net/ip.h b/include/net/ip.h index 9b976cf99122..16cfc87fed6c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -360,6 +360,8 @@ static inline void inet_set_txhash(struct sock *sk) struct inet_sock *inet = inet_sk(sk); struct flow_keys keys; + memset(&keys, 0, sizeof(keys)); + keys.addrs.src = inet->inet_saddr; keys.addrs.dst = inet->inet_daddr; keys.ports.src = inet->inet_sport; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 35d485c78080..474ca466a091 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -699,6 +699,8 @@ static inline void ip6_set_txhash(struct sock *sk) struct ipv6_pinfo *np = inet6_sk(sk); struct flow_keys keys; + memset(&keys, 0, sizeof(keys)); + keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr); keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); keys.ports.src = inet->inet_sport; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0763795bea8f..55b5f2962afa 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -57,9 +57,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, flow_dissector->offset[key->key_id] = key->offset; } - /* Ensure that the dissector always includes basic key. That way - * we are able to avoid handling lack of it in fast path. + /* Ensure that the dissector always includes control and basic key. + * That way we are able to avoid handling lack of these in fast path. */ + BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_BASIC)); } @@ -120,6 +122,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, void *target_container, void *data, __be16 proto, int nhoff, int hlen) { + struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; @@ -132,6 +135,13 @@ bool __skb_flow_dissect(const struct sk_buff *skb, hlen = skb_headlen(skb); } + /* It is ensured by skb_flow_dissector_init() that control key will + * be always present. + */ + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + /* It is ensured by skb_flow_dissector_init() that basic key will * be always present. 
*/ @@ -219,7 +229,7 @@ flow_label: key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; - key_basic->thoff = (u16)nhoff; + key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { @@ -275,7 +285,7 @@ flow_label: if (!hdr) return false; key_basic->n_proto = proto; - key_basic->thoff = (u16)nhoff; + key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { @@ -288,7 +298,7 @@ flow_label: return true; } case htons(ETH_P_FCOE): - key_basic->thoff = (u16)(nhoff + FCOE_HEADER_LEN); + key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: return false; @@ -345,7 +355,7 @@ flow_label: key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; - key_basic->thoff = (u16) nhoff; + key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { @@ -366,9 +376,21 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval) +static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval) +{ + return jhash2(words, length, keyval); +} + +static inline void *flow_keys_hash_start(struct flow_keys *flow) { - return jhash_3words(a, b, c, keyval); + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); + return (void *)flow + FLOW_KEYS_HASH_OFFSET; +} + +static inline size_t flow_keys_hash_length(struct flow_keys *flow) +{ + BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); + return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32); } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) @@ -383,10 +405,8 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) swap(keys->ports.src, keys->ports.dst); } - hash = __flow_hash_3words((__force u32)keys->addrs.dst, - (__force u32)keys->addrs.src, - (__force u32)keys->ports.ports, - keyval); + hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), + flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -473,7 +493,7 @@ EXPORT_SYMBOL(skb_get_hash_perturb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { - u32 poff = keys->basic.thoff; + u32 poff = keys->control.thoff; switch (keys->basic.ip_proto) { case IPPROTO_TCP: { @@ -536,6 +556,10 @@ u32 skb_get_poff(const struct sk_buff *skb) } static const struct flow_dissector_key flow_keys_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), @@ -555,6 +579,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { }; static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { + { + .key_id = FLOW_DISSECTOR_KEY_CONTROL, + .offset = offsetof(struct flow_keys, control), + }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 8c8f34ef6980..5a7d66c59684 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -25,6 +25,7 @@ struct fl_flow_key { int indev_ifindex; + struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; union { @@ -347,6 +348,7 @@ static void fl_init_dissector(struct cls_fl_head *head, struct 
flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; + FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt, FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); -- cgit v1.2.3 From c3f8324188fa80178f20c8209b492ca6191177e8 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:40 -0700 Subject: net: Add full IPv6 addresses to flow_keys This patch adds full IPv6 addresses into flow_keys and uses them as input to the flow hash function. The implementation supports either IPv4 or IPv6 addresses in a union, and a selector is used to determine how many words to input to jhash2. We also add flow_get_u32_dst and flow_get_u32_src functions which are used to get a u32 representation of the source and destination addresses. For IPv6, ipv6_addr_hash is called. These functions retain the legacy behavior of getting the src and dst values from flow_keys. With this patch, Ethertype and IP protocol are now included in the flow hash input. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 9 +- drivers/net/ethernet/cisco/enic/enic_clsf.c | 8 +- drivers/net/ethernet/cisco/enic/enic_ethtool.c | 4 +- include/net/flow_dissector.h | 52 +++++++---- include/net/ip.h | 19 +++- include/net/ipv6.h | 21 ++++- net/core/flow_dissector.c | 116 +++++++++++++++++++++---- net/ethernet/eth.c | 2 +- net/sched/cls_flow.c | 14 ++- net/sched/cls_flower.c | 11 +-- 10 files changed, 193 insertions(+), 63 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2268438f3f63..19eb990d398c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3059,8 +3059,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) return false; iph = ip_hdr(skb); - fk->addrs.src = iph->saddr; - fk->addrs.dst = iph->daddr; + iph_to_flow_copy_v4addrs(fk, iph); noff += iph->ihl << 2; if (!ip_is_fragment(iph)) proto = iph->protocol; @@ -3068,8 +3067,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph6)))) return false; iph6 = ipv6_hdr(skb); - fk->addrs.src = (__force __be32)ipv6_addr_hash(&iph6->saddr); - fk->addrs.dst = (__force __be32)ipv6_addr_hash(&iph6->daddr); + iph_to_flow_copy_v6addrs(fk, iph6); noff += sizeof(*iph6); proto = iph6->nexthdr; } else { @@ -3103,7 +3101,8 @@ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) hash = bond_eth_hash(skb); else hash = (__force u32)flow.ports.ports; - hash ^= (__force u32)flow.addrs.dst ^ (__force u32)flow.addrs.src; + hash ^= (__force u32)flow_get_u32_dst(&flow) ^ + (__force u32)flow_get_u32_src(&flow); hash ^= (hash >> 16); hash ^= (hash >> 8); diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index a31b57adcb37..d106186f4f4a 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -33,8 +33,8 @@ int enic_addfltr_5t(struct enic *enic, struct flow_keys *keys, u16 rq) return -EPROTONOSUPPORT; }; data.type = FILTER_IPV4_5TUPLE; - data.u.ipv4.src_addr = ntohl(keys->addrs.src); - data.u.ipv4.dst_addr = ntohl(keys->addrs.dst); + data.u.ipv4.src_addr = ntohl(keys->addrs.v4addrs.src); + data.u.ipv4.dst_addr = ntohl(keys->addrs.v4addrs.dst); data.u.ipv4.src_port = ntohs(keys->ports.src); data.u.ipv4.dst_port =
ntohs(keys->ports.dst); data.u.ipv4.flags = FILTER_FIELDS_IPV4_5TUPLE; @@ -158,8 +158,8 @@ static struct enic_rfs_fltr_node *htbl_key_search(struct hlist_head *h, struct enic_rfs_fltr_node *tpos; hlist_for_each_entry(tpos, h, node) - if (tpos->keys.addrs.src == k->addrs.src && - tpos->keys.addrs.dst == k->addrs.dst && + if (tpos->keys.addrs.v4addrs.src == k->addrs.v4addrs.src && + tpos->keys.addrs.v4addrs.dst == k->addrs.v4addrs.dst && tpos->keys.ports.ports == k->ports.ports && tpos->keys.basic.ip_proto == k->basic.ip_proto && tpos->keys.basic.n_proto == k->basic.n_proto) diff --git a/drivers/net/ethernet/cisco/enic/enic_ethtool.c b/drivers/net/ethernet/cisco/enic/enic_ethtool.c index 117c0968dd0b..73874b2575bf 100644 --- a/drivers/net/ethernet/cisco/enic/enic_ethtool.c +++ b/drivers/net/ethernet/cisco/enic/enic_ethtool.c @@ -346,10 +346,10 @@ static int enic_grxclsrule(struct enic *enic, struct ethtool_rxnfc *cmd) break; } - fsp->h_u.tcp_ip4_spec.ip4src = n->keys.addrs.src; + fsp->h_u.tcp_ip4_spec.ip4src = flow_get_u32_src(&n->keys); fsp->m_u.tcp_ip4_spec.ip4src = (__u32)~0; - fsp->h_u.tcp_ip4_spec.ip4dst = n->keys.addrs.dst; + fsp->h_u.tcp_ip4_spec.ip4dst = flow_get_u32_dst(&n->keys); fsp->m_u.tcp_ip4_spec.ip4dst = (__u32)~0; fsp->h_u.tcp_ip4_spec.psrc = n->keys.ports.src; diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index cba6a10b214a..306d4613abbb 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -12,7 +12,7 @@ */ struct flow_dissector_key_control { u16 thoff; - u16 padding; + u16 addr_type; }; /** @@ -28,18 +28,39 @@ struct flow_dissector_key_basic { }; /** - * struct flow_dissector_key_addrs: - * @src: source ip address in case of IPv4 - * For IPv6 it contains 32bit hash of src address - * @dst: destination ip address in case of IPv4 - * For IPv6 it contains 32bit hash of dst address + * struct flow_dissector_key_ipv4_addrs: + * @src: source ip address + * @dst: destination ip address */ -struct flow_dissector_key_addrs { +struct flow_dissector_key_ipv4_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ __be32 src; __be32 dst; }; +/** + * struct flow_dissector_key_ipv6_addrs: + * @src: source ip address + * @dst: destination ip address + */ +struct flow_dissector_key_ipv6_addrs { + /* (src,dst) must be grouped, in the same way than in IP header */ + struct in6_addr src; + struct in6_addr dst; +}; + +/** + * struct flow_dissector_key_addrs: + * @v4addrs: IPv4 addresses + * @v6addrs: IPv6 addresses + */ +struct flow_dissector_key_addrs { + union { + struct flow_dissector_key_ipv4_addrs v4addrs; + struct flow_dissector_key_ipv6_addrs v6addrs; + }; +}; + /** * flow_dissector_key_tp_ports: * @ports: port numbers of Transport header @@ -56,16 +77,6 @@ struct flow_dissector_key_ports { }; }; -/** - * struct flow_dissector_key_ipv6_addrs: - * @src: source ip address - * @dst: destination ip address - */ -struct flow_dissector_key_ipv6_addrs { - /* (src,dst) must be grouped, in the same way than in IP header */ - struct in6_addr src; - struct in6_addr dst; -}; /** * struct flow_dissector_key_eth_addrs: @@ -81,10 +92,10 @@ struct flow_dissector_key_eth_addrs { enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ - FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_addrs */ + FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ + FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct 
flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ - FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_MAX, @@ -129,6 +140,9 @@ struct flow_keys { #define FLOW_KEYS_HASH_OFFSET \ offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD) +__be32 flow_get_u32_src(const struct flow_keys *flow); +__be32 flow_get_u32_dst(const struct flow_keys *flow); + extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_buf_dissector; diff --git a/include/net/ip.h b/include/net/ip.h index 16cfc87fed6c..0750a186ea63 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -355,6 +355,20 @@ static inline __wsum inet_compute_pseudo(struct sk_buff *skb, int proto) skb->len, proto, 0); } +/* copy IPv4 saddr & daddr to flow_keys, possibly using 64bit load/store + * Equivalent to : flow->v4addrs.src = iph->saddr; + * flow->v4addrs.dst = iph->daddr; + */ +static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow, + const struct iphdr *iph) +{ + BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) != + offsetof(typeof(flow->addrs), v4addrs.src) + + sizeof(flow->addrs.v4addrs.src)); + memcpy(&flow->addrs.v4addrs, &iph->saddr, sizeof(flow->addrs.v4addrs)); + flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; +} + static inline void inet_set_txhash(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); @@ -362,8 +376,9 @@ static inline void inet_set_txhash(struct sock *sk) memset(&keys, 0, sizeof(keys)); - keys.addrs.src = inet->inet_saddr; - keys.addrs.dst = inet->inet_daddr; + keys.addrs.v4addrs.src = inet->inet_saddr; + keys.addrs.v4addrs.dst = inet->inet_daddr; + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; keys.ports.src = inet->inet_sport; keys.ports.dst = inet->inet_dport; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 474ca466a091..82dbdb092a5d 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -692,6 +692,20 @@ static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, return hlimit; } +/* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store + * Equivalent to : flow->v6addrs.src = iph->saddr; + * flow->v6addrs.dst = iph->daddr; + */ +static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, + const struct ipv6hdr *iph) +{ + BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != + offsetof(typeof(flow->addrs), v6addrs.src) + + sizeof(flow->addrs.v6addrs.src)); + memcpy(&flow->addrs.v6addrs, &iph->saddr, sizeof(flow->addrs.v6addrs)); + flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; +} + #if IS_ENABLED(CONFIG_IPV6) static inline void ip6_set_txhash(struct sock *sk) { @@ -701,8 +715,11 @@ static inline void ip6_set_txhash(struct sock *sk) memset(&keys, 0, sizeof(keys)); - keys.addrs.src = (__force __be32)ipv6_addr_hash(&np->saddr); - keys.addrs.dst = (__force __be32)ipv6_addr_hash(&sk->sk_v6_daddr); + memcpy(&keys.addrs.v6addrs.src, &np->saddr, + sizeof(keys.addrs.v6addrs.src)); + memcpy(&keys.addrs.v6addrs.dst, &sk->sk_v6_daddr, + sizeof(keys.addrs.v6addrs.dst)); + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; keys.ports.src = inet->inet_sport; keys.ports.dst = inet->inet_dport; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 55b5f2962afa..ca9d22488cfb 100644 --- a/net/core/flow_dissector.c +++ 
b/net/core/flow_dissector.c @@ -178,10 +178,12 @@ ip: if (!skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; + key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS, - target_container); - memcpy(key_addrs, &iph->saddr, sizeof(*key_addrs)); + FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); + memcpy(&key_addrs->v4addrs, &iph->saddr, + sizeof(key_addrs->v4addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; break; } case htons(ETH_P_IPV6): { @@ -203,8 +205,11 @@ ipv6: FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, target_container); - key_addrs->src = (__force __be32)ipv6_addr_hash(&iph->saddr); - key_addrs->dst = (__force __be32)ipv6_addr_hash(&iph->daddr); + key_addrs->v4addrs.src = + (__force __be32)ipv6_addr_hash(&iph->saddr); + key_addrs->v4addrs.dst = + (__force __be32)ipv6_addr_hash(&iph->daddr); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; goto flow_label; } if (skb_flow_dissector_uses_key(flow_dissector, @@ -216,6 +221,7 @@ ipv6: target_container); memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; goto flow_label; } break; @@ -292,8 +298,9 @@ flow_label: key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, target_container); - key_addrs->src = hdr->srcnode; - key_addrs->dst = 0; + key_addrs->v4addrs.src = hdr->srcnode; + key_addrs->v4addrs.dst = 0; + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } return true; } @@ -389,21 +396,88 @@ static inline void *flow_keys_hash_start(struct flow_keys *flow) static inline size_t flow_keys_hash_length(struct flow_keys *flow) { + size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); - return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32); + BUILD_BUG_ON(offsetof(typeof(*flow), addrs) != + sizeof(*flow) - sizeof(flow->addrs)); + + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + diff -= sizeof(flow->addrs.v4addrs); + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + diff -= sizeof(flow->addrs.v6addrs); + break; + } + return (sizeof(*flow) - diff) / sizeof(u32); +} + +__be32 flow_get_u32_src(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.src; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.src); + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_src); + +__be32 flow_get_u32_dst(const struct flow_keys *flow) +{ + switch (flow->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + return flow->addrs.v4addrs.dst; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + return (__force __be32)ipv6_addr_hash( + &flow->addrs.v6addrs.dst); + default: + return 0; + } +} +EXPORT_SYMBOL(flow_get_u32_dst); + +static inline void __flow_hash_consistentify(struct flow_keys *keys) +{ + int addr_diff, i; + + switch (keys->control.addr_type) { + case FLOW_DISSECTOR_KEY_IPV4_ADDRS: + addr_diff = (__force u32)keys->addrs.v4addrs.dst - + (__force u32)keys->addrs.v4addrs.src; + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); + swap(keys->ports.src, keys->ports.dst); + } + break; + case FLOW_DISSECTOR_KEY_IPV6_ADDRS: + addr_diff = memcmp(&keys->addrs.v6addrs.dst, + &keys->addrs.v6addrs.src, + 
sizeof(keys->addrs.v6addrs.dst)); + if ((addr_diff < 0) || + (addr_diff == 0 && + ((__force u16)keys->ports.dst < + (__force u16)keys->ports.src))) { + for (i = 0; i < 4; i++) + swap(keys->addrs.v6addrs.src.s6_addr32[i], + keys->addrs.v6addrs.dst.s6_addr32[i]); + swap(keys->ports.src, keys->ports.dst); + } + break; + } } static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) { u32 hash; - /* get a consistent hash (same value on both flow directions) */ - if (((__force u32)keys->addrs.dst < (__force u32)keys->addrs.src) || - (((__force u32)keys->addrs.dst == (__force u32)keys->addrs.src) && - ((__force u16)keys->ports.dst < (__force u16)keys->ports.src))) { - swap(keys->addrs.dst, keys->addrs.src); - swap(keys->ports.src, keys->ports.dst); - } + __flow_hash_consistentify(keys); hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); @@ -451,8 +525,8 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, data->n_proto = flow->basic.n_proto; data->ip_proto = flow->basic.ip_proto; data->ports = flow->ports.ports; - data->src = flow->addrs.src; - data->dst = flow->addrs.dst; + data->src = flow->addrs.v4addrs.src; + data->dst = flow->addrs.v4addrs.dst; } EXPORT_SYMBOL(make_flow_keys_digest); @@ -566,11 +640,15 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, - .offset = offsetof(struct flow_keys, addrs), + .offset = offsetof(struct flow_keys, addrs.v4addrs), + }, + { + .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, + .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, - .offset = offsetof(struct flow_keys, addrs), + .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 7d0e239a6755..77e0f0e7a88e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -133,7 +133,7 @@ u32 eth_get_headlen(void *data, unsigned int len) /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, sizeof(*eth), len)) - return max_t(u32, keys.basic.thoff, sizeof(*eth)); + return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index b4359924846c..76bc3a20ffdb 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -68,15 +68,21 @@ static inline u32 addr_fold(void *addr) static u32 flow_get_src(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->addrs.src) - return ntohl(flow->addrs.src); + __be32 src = flow_get_u32_src(flow); + + if (src) + return ntohl(src); + return addr_fold(skb->sk); } static u32 flow_get_dst(const struct sk_buff *skb, const struct flow_keys *flow) { - if (flow->addrs.dst) - return ntohl(flow->addrs.dst); + __be32 dst = flow_get_u32_dst(flow); + + if (dst) + return ntohl(dst); + return addr_fold(skb_dst(skb)) ^ (__force u16) tc_skb_protocol(skb); } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 5a7d66c59684..b92d3f49c23e 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -28,8 +28,9 @@ struct fl_flow_key { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; + struct flow_dissector_key_addrs ipaddrs; union { - struct flow_dissector_key_addrs ipv4; + struct 
flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; @@ -260,14 +261,14 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); } - if (key->basic.n_proto == htons(ETH_P_IP)) { + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, sizeof(key->ipv4.src)); fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, sizeof(key->ipv4.dst)); - } else if (key->basic.n_proto == htons(ETH_P_IPV6)) { + } else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, sizeof(key->ipv6.src)); @@ -610,7 +611,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->basic.ip_proto))) goto nla_put_failure; - if (key->basic.n_proto == htons(ETH_P_IP) && + if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, sizeof(key->ipv4.src)) || @@ -618,7 +619,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, sizeof(key->ipv4.dst)))) goto nla_put_failure; - else if (key->basic.n_proto == htons(ETH_P_IPV6) && + else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, sizeof(key->ipv6.src)) || -- cgit v1.2.3 From 9f24908901c5f4b1e3b07548106b1790af933476 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:41 -0700 Subject: net: Add keys for TIPC address Add a new flow key for TIPC addresses. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 10 ++++++++++ net/core/flow_dissector.c | 18 +++++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 306d4613abbb..3ee606a54775 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -49,6 +49,14 @@ struct flow_dissector_key_ipv6_addrs { struct in6_addr dst; }; +/** + * struct flow_dissector_key_tipc_addrs: + * @srcnode: source node address + */ +struct flow_dissector_key_tipc_addrs { + __be32 srcnode; +}; + /** * struct flow_dissector_key_addrs: * @v4addrs: IPv4 addresses @@ -58,6 +66,7 @@ struct flow_dissector_key_addrs { union { struct flow_dissector_key_ipv4_addrs v4addrs; struct flow_dissector_key_ipv6_addrs v6addrs; + struct flow_dissector_key_tipc_addrs tipcaddrs; }; }; @@ -97,6 +106,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ + FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ca9d22488cfb..91861c3d45a7 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -294,13 +294,12 @@ flow_label: key_control->thoff = (u16)nhoff; if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { + FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, + FLOW_DISSECTOR_KEY_TIPC_ADDRS, target_container); - key_addrs->v4addrs.src = hdr->srcnode; - key_addrs->v4addrs.dst = 0; - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + key_addrs->tipcaddrs.srcnode = hdr->srcnode; + key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; } return true; } @@ -408,6 +407,9 @@ static inline size_t flow_keys_hash_length(struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: diff -= sizeof(flow->addrs.v6addrs); break; + case FLOW_DISSECTOR_KEY_TIPC_ADDRS: + diff -= sizeof(flow->addrs.tipcaddrs); + break; } return (sizeof(*flow) - diff) / sizeof(u32); } @@ -420,6 +422,8 @@ __be32 flow_get_u32_src(const struct flow_keys *flow) case FLOW_DISSECTOR_KEY_IPV6_ADDRS: return (__force __be32)ipv6_addr_hash( &flow->addrs.v6addrs.src); + case FLOW_DISSECTOR_KEY_TIPC_ADDRS: + return flow->addrs.tipcaddrs.srcnode; default: return 0; } @@ -650,6 +654,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, + { + .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, + .offset = offsetof(struct flow_keys, addrs.tipcaddrs), + }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), -- cgit v1.2.3 From 45b47fd00ca14df869979dbbe14324625ec93552 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:42 -0700 Subject: net: Get rid of IPv6 hash addresses flow keys We don't need to return the IPv6 address hash as part of flow keys. In general, using the IPv6 address hash is risky in a hash value since the underlying use of xor provides no entropy. If someone really needs the hash value they can get it from the full IPv6 addresses in flow keys (e.g. from flow_get_u32_src). Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 1 - net/core/flow_dissector.c | 17 ----------------- 2 files changed, 18 deletions(-) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 3ee606a54775..59f00f90fc6b 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -103,7 +103,6 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, /* struct flow_dissector_key_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 91861c3d45a7..5348a468de78 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -199,19 +199,6 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS)) { - key_addrs = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, - target_container); - - key_addrs->v4addrs.src = - (__force __be32)ipv6_addr_hash(&iph->saddr); - key_addrs->v4addrs.dst = - (__force __be32)ipv6_addr_hash(&iph->daddr); - key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - goto flow_label; - } if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; @@ -650,10 +637,6 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, - { - .key_id = FLOW_DISSECTOR_KEY_IPV6_HASH_ADDRS, - .offset = offsetof(struct flow_keys, addrs.v4addrs), - }, { .key_id = FLOW_DISSECTOR_KEY_TIPC_ADDRS, .offset = offsetof(struct flow_keys, addrs.tipcaddrs), -- cgit v1.2.3 From d34af823ff401c312541aa613c49ea4b872bde9e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:43 -0700 Subject: net: Add VLAN ID to flow_keys In flow_dissector set vlan_id in flow_keys when VLAN is found. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 6 ++++++ net/core/flow_dissector.c | 14 ++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 59f00f90fc6b..08480fbb9035 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -27,6 +27,10 @@ struct flow_dissector_key_basic { u8 padding; }; +struct flow_dissector_key_tags { + u32 vlan_id:12; +}; + /** * struct flow_dissector_key_ipv4_addrs: * @src: source ip address @@ -106,6 +110,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ + FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */ FLOW_DISSECTOR_KEY_MAX, }; @@ -142,6 +147,7 @@ struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic; + struct flow_dissector_key_tags tags; struct flow_dissector_key_ports ports; struct flow_dissector_key_addrs addrs; }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5348a468de78..5c66cb2e66df 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -126,6 +126,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_tags *key_tags; u8 ip_proto; if (!data) { @@ -246,6 +247,15 @@ flow_label: if (!vlan) return false; + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID, + target_container); + + key_tags->vlan_id = skb_vlan_tag_get_id(skb); + } + proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); goto again; @@ -645,6 +655,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, + { + .key_id = FLOW_DISSECTOR_KEY_VLANID, + .offset = offsetof(struct flow_keys, tags), + }, }; static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { -- cgit v1.2.3 From 87ee9e52ffeb168803a76cc07734425227cc2268 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:44 -0700 Subject: net: Add IPv6 flow label to flow_keys In flow_dissector set the flow label in flow_keys for IPv6. This also removes the shortcircuiting of flow dissection when a non-zero label is present, the flow label can be considered to provide additional entropy for a hash. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 4 +++- net/core/flow_dissector.c | 29 ++++++++++------------------- 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 08480fbb9035..14d8483d836e 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -28,7 +28,8 @@ struct flow_dissector_key_basic { }; struct flow_dissector_key_tags { - u32 vlan_id:12; + u32 vlan_id:12, + flow_label:20; }; /** @@ -111,6 +112,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */ + FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5c66cb2e66df..70f0567b6be9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -210,30 +210,17 @@ ipv6: memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - goto flow_label; } - break; -flow_label: + flow_label = ip6_flowlabel(iph); if (flow_label) { - /* Awesome, IPv6 packet has a flow label so we can - * use that to represent the ports without any - * further dissection. - */ - - key_basic->n_proto = proto; - key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS, - target_container); - key_ports->ports = flow_label; + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + key_tags = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL, + target_container); + key_tags->flow_label = ntohl(flow_label); } - - return true; } break; @@ -659,6 +646,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_VLANID, .offset = offsetof(struct flow_keys, tags), }, + { + .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, + .offset = offsetof(struct flow_keys, tags), + }, }; static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { -- cgit v1.2.3 From 1fdd512c92003cf2d671ba22753d13302bf8cd1d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:45 -0700 Subject: net: Add GRE keyid in flow_keys In flow dissector if a GRE header contains a keyid this is saved in the new keyid field of flow_keys. The GRE keyid is then represented in the flow hash function input. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 6 ++++++ net/core/flow_dissector.c | 24 +++++++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 14d8483d836e..6c5e8d262607 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -32,6 +32,10 @@ struct flow_dissector_key_tags { flow_label:20; }; +struct flow_dissector_key_keyid { + __be32 keyid; +}; + /** * struct flow_dissector_key_ipv4_addrs: * @src: source ip address @@ -113,6 +117,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs */ FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */ + FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_MAX, }; @@ -150,6 +155,7 @@ struct flow_keys { #define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic; struct flow_dissector_key_tags tags; + struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; struct flow_dissector_key_addrs addrs; }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 70f0567b6be9..7ddc0a7b3da5 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -127,6 +127,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_addrs *key_addrs; struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_tags *key_tags; + struct flow_dissector_key_keyid *key_keyid; u8 ip_proto; if (!data) { @@ -315,8 +316,25 @@ ipv6: nhoff += 4; if (hdr->flags & GRE_CSUM) nhoff += 4; - if (hdr->flags & GRE_KEY) + if (hdr->flags & GRE_KEY) { + const __be32 *keyid; + __be32 _keyid; + + keyid = __skb_header_pointer(skb, nhoff, sizeof(_keyid), + data, hlen, &_keyid); + + if (!keyid) + return false; + + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID, + target_container); + key_keyid->keyid = *keyid; + } nhoff += 4; + } if (hdr->flags & GRE_SEQ) nhoff += 4; if (proto == htons(ETH_P_TEB)) { @@ -650,6 +668,10 @@ static const struct flow_dissector_key flow_keys_dissector_keys[] = { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, + { + .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, + .offset = offsetof(struct flow_keys, keyid), + }, }; static const struct flow_dissector_key flow_keys_buf_dissector_keys[] = { -- cgit v1.2.3 From b3baa0fbd02a1a9d493d8cb92ae4a4491b9e9d13 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 4 Jun 2015 09:16:46 -0700 Subject: mpls: Add MPLS entropy label in flow_keys In flow dissector if an MPLS header contains an entropy label this is saved in the new keyid field of flow_keys. The entropy label is then represented in the flow hash function input. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 1 + net/core/flow_dissector.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 6c5e8d262607..1a8c22419936 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -118,6 +118,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */ FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */ + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7ddc0a7b3da5..77e22e4fc898 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -288,6 +289,37 @@ ipv6: } return true; } + + case htons(ETH_P_MPLS_UC): + case htons(ETH_P_MPLS_MC): { + struct mpls_label *hdr, _hdr[2]; +mpls: + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) + return false; + + if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) == + MPLS_LABEL_ENTROPY) { + if (skb_flow_dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, + target_container); + key_keyid->keyid = hdr[1].entry & + htonl(MPLS_LS_LABEL_MASK); + } + + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_control->thoff = (u16)nhoff; + + return true; + } + + return true; + } + case htons(ETH_P_FCOE): key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ @@ -357,6 +389,9 @@ ipv6: case IPPROTO_IPV6: proto = htons(ETH_P_IPV6); goto ipv6; + case IPPROTO_MPLS: + proto = htons(ETH_P_MPLS_UC); + goto mpls; default: break; } -- cgit v1.2.3 From 3431205e03977aaf32bce6d4b16fb8244b510056 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 10:11:53 -0700 Subject: bpf: make programs see skb->data == L2 for ingress and egress eBPF programs attached to ingress and egress qdiscs see inconsistent skb->data. For ingress L2 header is already pulled, whereas for egress it's present. This is known to program writers which are currently forced to use BPF_LL_OFF workaround. Since programs don't change skb internal pointers it is safe to do pull/push right around invocation of the program and earlier taps and later pt->func() will not be affected. Multiple taps via packet_rcv(), tpacket_rcv() are doing the same trick around run_filter/BPF_PROG_RUN even if skb_shared. This fix finally allows programs to use optimized LD_ABS/IND instructions without BPF_LL_OFF for higher performance. tc ingress + cls_bpf + samples/bpf/tcbpf1_kern.o w/o JIT w/JIT before 20.5 23.6 Mpps after 21.8 26.6 Mpps Old programs with BPF_LL_OFF will still work as-is. We can now undo most of the earlier workaround commit: a166151cbe33 ("bpf: fix bpf helpers to use skb->mac_header relative offsets") Signed-off-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- net/core/filter.c | 26 +++----------------------- net/sched/act_bpf.c | 9 ++++++++- net/sched/cls_bpf.c | 16 +++++++++++++++- samples/bpf/tcbpf1_kern.c | 8 ++++---- 4 files changed, 30 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 09b2062eb5b8..36a69e33d76b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1238,21 +1238,6 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } -/** - * bpf_skb_clone_not_writable - is the header of a clone not writable - * @skb: buffer to check - * @len: length up to which to write, can be negative - * - * Returns true if modifying the header part of the cloned buffer - * does require the data to be copied. I.e. this version works with - * negative lengths needed for eBPF case! - */ -static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len) -{ - return skb_header_cloned(skb) || - (int) skb_headroom(skb) + len > skb->hdr_len; -} - #define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) @@ -1275,9 +1260,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + len))) + !skb_clone_writable(skb, offset + len))) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, buf); @@ -1321,9 +1305,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely((u32) offset > 0xffff)) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) + !skb_clone_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1369,9 +1352,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) if (unlikely((u32) offset > 0xffff)) return -EFAULT; - offset -= skb->data - skb_mac_header(skb); if (unlikely(skb_cloned(skb) && - bpf_skb_clone_unwritable(skb, offset + sizeof(sum)))) + !skb_clone_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1425,8 +1407,6 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) if (unlikely(!skb2)) return -ENOMEM; - skb_push(skb2, skb2->data - skb_mac_header(skb2)); - if (BPF_IS_REDIRECT_INGRESS(flags)) return dev_forward_skb(dev, skb2); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index dc6a2d324bd8..1d56903fd4c7 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -37,6 +37,7 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, { struct tcf_bpf *prog = act->priv; int action, filter_res; + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; if (unlikely(!skb_mac_header_was_set(skb))) return TC_ACT_UNSPEC; @@ -48,7 +49,13 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, /* Needed here for accessing maps. */ rcu_read_lock(); - filter_res = BPF_PROG_RUN(prog->filter, skb); + if (at_ingress) { + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } rcu_read_unlock(); /* A BPF program may overwrite the default action opcode. 
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 91bd9c19471d..c79ecfd36e0f 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -64,6 +64,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); struct cls_bpf_prog *prog; +#ifdef CONFIG_NET_CLS_ACT + bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; +#else + bool at_ingress = false; +#endif int ret = -1; if (unlikely(!skb_mac_header_was_set(skb))) @@ -72,7 +77,16 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* Needed here for accessing maps. */ rcu_read_lock(); list_for_each_entry_rcu(prog, &head->plist, link) { - int filter_res = BPF_PROG_RUN(prog->filter, skb); + int filter_res; + + if (at_ingress) { + /* It is safe to push/pull even if skb_shared() */ + __skb_push(skb, skb->mac_len); + filter_res = BPF_PROG_RUN(prog->filter, skb); + __skb_pull(skb, skb->mac_len); + } else { + filter_res = BPF_PROG_RUN(prog->filter, skb); + } if (filter_res == 0) continue; diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c index 7c27710f8296..9bfb2eb34563 100644 --- a/samples/bpf/tcbpf1_kern.c +++ b/samples/bpf/tcbpf1_kern.c @@ -21,7 +21,7 @@ static inline void set_dst_mac(struct __sk_buff *skb, char *mac) static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) { - __u8 old_tos = load_byte(skb, BPF_LL_OFF + TOS_OFF); + __u8 old_tos = load_byte(skb, TOS_OFF); bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2); bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0); @@ -34,7 +34,7 @@ static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos) static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) { - __u32 old_ip = _htonl(load_word(skb, BPF_LL_OFF + IP_SRC_OFF)); + __u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip)); bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip)); @@ -44,7 +44,7 @@ static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip) #define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest)) static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) { - __u16 old_port = htons(load_half(skb, BPF_LL_OFF + TCP_DPORT_OFF)); + __u16 old_port = htons(load_half(skb, TCP_DPORT_OFF)); bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port)); bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0); @@ -53,7 +53,7 @@ static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port) SEC("classifier") int bpf_prog1(struct __sk_buff *skb) { - __u8 proto = load_byte(skb, BPF_LL_OFF + ETH_HLEN + offsetof(struct iphdr, protocol)); + __u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); long *value; if (proto == IPPROTO_TCP) { -- cgit v1.2.3 From d691f9e8d4405c334aa10d556e73c8bf44cb0e01 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 4 Jun 2015 10:11:54 -0700 Subject: bpf: allow programs to write to certain skb fields allow programs read/write skb->mark, tc_index fields and ((struct qdisc_skb_cb *)cb)->data. mark and tc_index are generically useful in TC. cb[0]-cb[4] are primarily used to pass arguments from one program to another called via bpf_tail_call() which can be seen in sockex3_kern.c example. All fields of 'struct __sk_buff' are readable to socket and tc_cls_act progs. 
mark, tc_index are writeable from tc_cls_act only. cb[0]-cb[4] are writeable by both sockets and tc_cls_act. Add verifier tests and improve sample code. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +- include/uapi/linux/bpf.h | 2 + kernel/bpf/verifier.c | 37 +++++++++++++----- net/core/filter.c | 94 +++++++++++++++++++++++++++++++++++++++------ samples/bpf/sockex3_kern.c | 35 ++++++----------- samples/bpf/test_verifier.c | 84 +++++++++++++++++++++++++++++++++++++++- 6 files changed, 207 insertions(+), 48 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ca854e5bb2f7..2235aee8096a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -105,7 +105,8 @@ struct bpf_verifier_ops { */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type); - u32 (*convert_ctx_access)(int dst_reg, int src_reg, int ctx_off, + u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, struct bpf_insn *insn); }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42aa19abab86..602f05b7a275 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -248,6 +248,8 @@ struct __sk_buff { __u32 priority; __u32 ingress_ifindex; __u32 ifindex; + __u32 tc_index; + __u32 cb[5]; }; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cfd9a40b9a5a..039d866fd36a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1692,6 +1692,8 @@ static int do_check(struct verifier_env *env) } } else if (class == BPF_STX) { + enum bpf_reg_type dst_reg_type; + if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); if (err) @@ -1700,11 +1702,6 @@ static int do_check(struct verifier_env *env) continue; } - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_STX uses reserved fields\n"); - return -EINVAL; - } /* check src1 operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1714,6 +1711,8 @@ static int do_check(struct verifier_env *env) if (err) return err; + dst_reg_type = regs[insn->dst_reg].type; + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, @@ -1721,6 +1720,15 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (insn->imm == 0) { + insn->imm = dst_reg_type; + } else if (dst_reg_type != insn->imm && + (dst_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { @@ -1839,12 +1847,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && - (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0)) { + (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { verbose("BPF_LDX uses reserved fields\n"); return -EINVAL; } + if (BPF_CLASS(insn->code) == BPF_STX && + ((BPF_MODE(insn->code) != BPF_MEM && + BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { + verbose("BPF_STX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1967,12 +1981,17 @@ static int convert_ctx_accesses(struct verifier_env *env) struct bpf_prog *new_prog; u32 cnt; int i; + enum bpf_access_type type; if 
(!env->prog->aux->ops->convert_ctx_access) return 0; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) + type = BPF_READ; + else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) + type = BPF_WRITE; + else continue; if (insn->imm != PTR_TO_CTX) { @@ -1982,7 +2001,7 @@ static int convert_ctx_accesses(struct verifier_env *env) } cnt = env->prog->aux->ops-> - convert_ctx_access(insn->dst_reg, insn->src_reg, + convert_ctx_access(type, insn->dst_reg, insn->src_reg, insn->off, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { verbose("bpf verifier is misconfigured\n"); diff --git a/net/core/filter.c b/net/core/filter.c index 36a69e33d76b..d271c06bf01f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -46,6 +46,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -1463,13 +1464,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } } -static bool sk_filter_is_valid_access(int off, int size, - enum bpf_access_type type) +static bool __is_valid_access(int off, int size, enum bpf_access_type type) { - /* only read is allowed */ - if (type != BPF_READ) - return false; - /* check bounds */ if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -1485,8 +1481,42 @@ static bool sk_filter_is_valid_access(int off, int size, return true; } -static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, - struct bpf_insn *insn_buf) +static bool sk_filter_is_valid_access(int off, int size, + enum bpf_access_type type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + + return __is_valid_access(off, size, type); +} + +static bool tc_cls_act_is_valid_access(int off, int size, + enum bpf_access_type type) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct __sk_buff, mark): + case offsetof(struct __sk_buff, tc_index): + case offsetof(struct __sk_buff, cb[0]) ... + offsetof(struct __sk_buff, cb[4]): + break; + default: + return false; + } + } + return __is_valid_access(off, size, type); +} + +static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, + int src_reg, int ctx_off, + struct bpf_insn *insn_buf) { struct bpf_insn *insn = insn_buf; @@ -1538,7 +1568,15 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, break; case offsetof(struct __sk_buff, mark): - return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; case offsetof(struct __sk_buff, pkt_type): return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); @@ -1553,6 +1591,38 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, case offsetof(struct __sk_buff, vlan_tci): return convert_skb_access(SKF_AD_VLAN_TAG, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, cb[0]) ... 
+ offsetof(struct __sk_buff, cb[4]): + BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); + + ctx_off -= offsetof(struct __sk_buff, cb[0]); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct qdisc_skb_cb, data); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + else + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); + break; + + case offsetof(struct __sk_buff, tc_index): +#ifdef CONFIG_NET_SCHED + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, tc_index)); + else + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, tc_index)); + break; +#else + if (type == BPF_WRITE) + *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); + else + *insn++ = BPF_MOV64_IMM(dst_reg, 0); + break; +#endif } return insn - insn_buf; @@ -1561,13 +1631,13 @@ static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { .get_func_proto = tc_cls_act_func_proto, - .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = sk_filter_convert_ctx_access, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c index 2625b987944f..41ae2fd21b13 100644 --- a/samples/bpf/sockex3_kern.c +++ b/samples/bpf/sockex3_kern.c @@ -89,7 +89,6 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off) struct globals { struct flow_keys flow; - __u32 nhoff; }; struct bpf_map_def SEC("maps") percpu_map = { @@ -139,7 +138,7 @@ static void update_stats(struct __sk_buff *skb, struct globals *g) static __always_inline void parse_ip_proto(struct __sk_buff *skb, struct globals *g, __u32 ip_proto) { - __u32 nhoff = g->nhoff; + __u32 nhoff = skb->cb[0]; int poff; switch (ip_proto) { @@ -165,7 +164,7 @@ static __always_inline void parse_ip_proto(struct __sk_buff *skb, if (gre_flags & GRE_SEQ) nhoff += 4; - g->nhoff = nhoff; + skb->cb[0] = nhoff; parse_eth_proto(skb, gre_proto); break; } @@ -195,7 +194,7 @@ PROG(PARSE_IP)(struct __sk_buff *skb) if (!g) return 0; - nhoff = g->nhoff; + nhoff = skb->cb[0]; if (unlikely(ip_is_fragment(skb, nhoff))) return 0; @@ -210,7 +209,7 @@ PROG(PARSE_IP)(struct __sk_buff *skb) verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/); nhoff += (verlen & 0xF) << 2; - g->nhoff = nhoff; + skb->cb[0] = nhoff; parse_ip_proto(skb, g, ip_proto); return 0; } @@ -223,7 +222,7 @@ PROG(PARSE_IPV6)(struct __sk_buff *skb) if (!g) return 0; - nhoff = g->nhoff; + nhoff = skb->cb[0]; ip_proto = load_byte(skb, nhoff + offsetof(struct ipv6hdr, nexthdr)); @@ -233,25 +232,21 @@ PROG(PARSE_IPV6)(struct __sk_buff *skb) nhoff + offsetof(struct ipv6hdr, daddr)); nhoff += sizeof(struct ipv6hdr); - g->nhoff = nhoff; + skb->cb[0] = nhoff; parse_ip_proto(skb, g, ip_proto); return 0; } PROG(PARSE_VLAN)(struct __sk_buff *skb) { - struct globals *g = this_cpu_globals(); __u32 nhoff, proto; - if (!g) - return 0; - - nhoff = g->nhoff; + nhoff = skb->cb[0]; proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, 
h_vlan_encapsulated_proto)); nhoff += sizeof(struct vlan_hdr); - g->nhoff = nhoff; + skb->cb[0] = nhoff; parse_eth_proto(skb, proto); @@ -260,17 +255,13 @@ PROG(PARSE_VLAN)(struct __sk_buff *skb) PROG(PARSE_MPLS)(struct __sk_buff *skb) { - struct globals *g = this_cpu_globals(); __u32 nhoff, label; - if (!g) - return 0; - - nhoff = g->nhoff; + nhoff = skb->cb[0]; label = load_word(skb, nhoff); nhoff += sizeof(struct mpls_label); - g->nhoff = nhoff; + skb->cb[0] = nhoff; if (label & MPLS_LS_S_MASK) { __u8 verlen = load_byte(skb, nhoff); @@ -288,14 +279,10 @@ PROG(PARSE_MPLS)(struct __sk_buff *skb) SEC("socket/0") int main_prog(struct __sk_buff *skb) { - struct globals *g = this_cpu_globals(); __u32 nhoff = ETH_HLEN; __u32 proto = load_half(skb, 12); - if (!g) - return 0; - - g->nhoff = nhoff; + skb->cb[0] = nhoff; parse_eth_proto(skb, proto); return 0; } diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index 12f3780af73f..693605997abc 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -29,6 +29,7 @@ struct bpf_test { ACCEPT, REJECT } result; + enum bpf_prog_type prog_type; }; static struct bpf_test tests[] = { @@ -743,6 +744,84 @@ static struct bpf_test tests[] = { .errstr = "different pointers", .result = REJECT, }, + { + "check skb->mark is not writeable by sockets", + .insns = { + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check skb->tc_index is not writeable by sockets", + .insns = { + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, tc_index)), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check non-u32 access to cb", + .insns = { + BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + }, + { + "check out of range skb->cb access", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[60])), + BPF_EXIT_INSN(), + }, + .errstr = "invalid bpf_context access", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_SCHED_ACT, + }, + { + "write skb fields from socket prog", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[4])), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, mark)), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, tc_index)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, + offsetof(struct __sk_buff, cb[2])), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + }, + { + "write skb fields from tc_cls_act prog", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, cb[0])), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, mark)), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, tc_index)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, tc_index)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, cb[3])), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, }; static int probe_filter_length(struct bpf_insn *fp) @@ -775,6 +854,7 @@ static int test(void) for (i = 0; i < ARRAY_SIZE(tests); i++) { 
struct bpf_insn *prog = tests[i].insns; + int prog_type = tests[i].prog_type; int prog_len = probe_filter_length(prog); int *fixup = tests[i].fixup; int map_fd = -1; @@ -789,8 +869,8 @@ static int test(void) } printf("#%d %s ", i, tests[i].descr); - prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, - prog_len * sizeof(struct bpf_insn), + prog_fd = bpf_prog_load(prog_type ?: BPF_PROG_TYPE_SOCKET_FILTER, + prog, prog_len * sizeof(struct bpf_insn), "GPL", 0); if (tests[i].result == ACCEPT) { -- cgit v1.2.3 From a4244b0cf58d56c171874e85228ba5deffeb017a Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Thu, 11 Jun 2015 10:28:16 +0300 Subject: net/ethtool: Add current supported tunable options Add strings array of the current supported tunable options. Signed-off-by: Hadar Hen Zion Reviewed-by: Amir Vadai Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 ++++++ net/core/ethtool.c | 12 ++++++++++++ 2 files changed, 18 insertions(+) (limited to 'net/core') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 0594933cdf55..cd67aec187d9 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -215,6 +215,11 @@ enum tunable_id { ETHTOOL_ID_UNSPEC, ETHTOOL_RX_COPYBREAK, ETHTOOL_TX_COPYBREAK, + /* + * Add your fresh new tubale attribute above and remember to update + * tunable_strings[] in net/core/ethtool.c + */ + __ETHTOOL_TUNABLE_COUNT, }; enum tunable_type_id { @@ -545,6 +550,7 @@ enum ethtool_stringset { ETH_SS_NTUPLE_FILTERS, ETH_SS_FEATURES, ETH_SS_RSS_HASH_FUNCS, + ETH_SS_TUNABLES, }; /** diff --git a/net/core/ethtool.c b/net/core/ethtool.c index eb0c3ace7458..b495ab1797fa 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -106,6 +106,13 @@ rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { [ETH_RSS_HASH_XOR_BIT] = "xor", }; +static const char +tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = { + [ETHTOOL_ID_UNSPEC] = "Unspec", + [ETHTOOL_RX_COPYBREAK] = "rx-copybreak", + [ETHTOOL_TX_COPYBREAK] = "tx-copybreak", +}; + static int ethtool_get_features(struct net_device *dev, void __user *useraddr) { struct ethtool_gfeatures cmd = { @@ -194,6 +201,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset) if (sset == ETH_SS_RSS_HASH_FUNCS) return ARRAY_SIZE(rss_hash_func_strings); + if (sset == ETH_SS_TUNABLES) + return ARRAY_SIZE(tunable_strings); + if (ops->get_sset_count && ops->get_strings) return ops->get_sset_count(dev, sset); else @@ -211,6 +221,8 @@ static void __ethtool_get_strings(struct net_device *dev, else if (stringset == ETH_SS_RSS_HASH_FUNCS) memcpy(data, rss_hash_func_strings, sizeof(rss_hash_func_strings)); + else if (stringset == ETH_SS_TUNABLES) + memcpy(data, tunable_strings, sizeof(tunable_strings)); else /* ops->get_strings is valid because checked earlier */ ops->get_strings(dev, stringset, data); -- cgit v1.2.3 From 611d23c559a328ca3f84ac120c02d5a9f88c08f5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 12 Jun 2015 09:01:05 -0700 Subject: flow_dissector: Fix MPLS entropy label handling in flow dissector Need to shift after masking to get label value for comparison. Fixes: b3baa0fbd02a1a9d493d8 ("mpls: Add MPLS entropy label in flow_keys") Reported-by: Dan Carpenter Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- net/core/flow_dissector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 77e22e4fc898..1818cdcaa9b9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -299,8 +299,8 @@ mpls: if (!hdr) return false; - if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) == - MPLS_LABEL_ENTROPY) { + if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> + MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { key_keyid = skb_flow_dissector_target(flow_dissector, -- cgit v1.2.3 From 6a74fcf426f51aaa569f0b973d10ac20468df238 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 12 Jun 2015 09:01:06 -0700 Subject: flow_dissector: add support for dst, hop-by-hop and routing ext hdrs If dst, hop-by-hop or routing extension headers are present determine length of the options and skip over them in flow dissection. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 1818cdcaa9b9..22e4dffa0c8b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -327,6 +327,7 @@ mpls: return false; } +ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: { struct gre_hdr { @@ -383,6 +384,22 @@ mpls: } goto again; } + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: { + u8 _opthdr[2], *opthdr; + + if (proto != htons(ETH_P_IPV6)) + break; + + opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), + data, hlen, &_opthdr); + + ip_proto = _opthdr[0]; + nhoff += (_opthdr[1] + 1) << 3; + + goto ip_proto_again; + } case IPPROTO_IPIP: proto = htons(ETH_P_IP); goto ip; -- cgit v1.2.3 From 1e98a0f08abddde87f0f93237f10629ecb4880ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jun 2015 19:31:32 -0700 Subject: flow_dissector: fix ipv6 dst, hop-by-hop and routing ext hdrs __skb_header_pointer() returns a pointer that must be checked. Fixes infinite loop reported by Alexei, and add __must_check to catch these errors earlier. Fixes: 6a74fcf426f5 ("flow_dissector: add support for dst, hop-by-hop and routing ext hdrs") Reported-by: Alexei Starovoitov Tested-by: Alexei Starovoitov Signed-off-by: Eric Dumazet Acked-by: Tom Herbert Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 9 +++++---- net/core/flow_dissector.c | 6 ++++-- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cc612fc0a894..a7acc92aa668 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2743,8 +2743,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum); -static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset, - int len, void *data, int hlen, void *buffer) +static inline void * __must_check +__skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *data, int hlen, void *buffer) { if (hlen - offset >= len) return data + offset; @@ -2756,8 +2757,8 @@ static inline void *__skb_header_pointer(const struct sk_buff *skb, int offset, return buffer; } -static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, - int len, void *buffer) +static inline void * __must_check +skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) { return __skb_header_pointer(skb, offset, len, skb->data, skb_headlen(skb), buffer); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 22e4dffa0c8b..476e5dda59e1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -394,9 +394,11 @@ ip_proto_again: opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); + if (!opthdr) + return false; - ip_proto = _opthdr[0]; - nhoff += (_opthdr[1] + 1) << 3; + ip_proto = opthdr[0]; + nhoff += (opthdr[1] + 1) << 3; goto ip_proto_again; } -- cgit v1.2.3 From ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jun 2015 19:39:12 -0700 Subject: bpf: introduce current->pid, tgid, uid, gid, comm accessors eBPF programs attached to kprobes need to filter based on current->pid, uid and other fields, so introduce helper functions: u64 bpf_get_current_pid_tgid(void) Return: current->tgid << 32 | current->pid u64 bpf_get_current_uid_gid(void) Return: current_gid << 32 | current_uid bpf_get_current_comm(char *buf, int size_of_buf) stores current->comm into buf They can be used from the programs attached to TC as well to classify packets based on current task fields. Update tracex2 example to print histogram of write syscalls for each process instead of aggregated for all. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 19 +++++++++++++ kernel/bpf/core.c | 3 +++ kernel/bpf/helpers.c | 58 +++++++++++++++++++++++++++++++++++++++ kernel/trace/bpf_trace.c | 6 +++++ net/core/filter.c | 6 +++++ samples/bpf/bpf_helpers.h | 6 +++++ samples/bpf/tracex2_kern.c | 24 +++++++++++++---- samples/bpf/tracex2_user.c | 67 +++++++++++++++++++++++++++++++++++++++------- 9 files changed, 178 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2235aee8096a..1b9a3f5b27f6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -188,5 +188,8 @@ extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; +extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; +extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; +extern const struct bpf_func_proto bpf_get_current_comm_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 602f05b7a275..29ef6f99e43d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,6 +230,25 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_clone_redirect, + + /** + * u64 bpf_get_current_pid_tgid(void) + * Return: current->tgid << 32 | current->pid + */ + BPF_FUNC_get_current_pid_tgid, + + /** + * u64 bpf_get_current_uid_gid(void) + * Return: current_gid << 32 | current_uid + */ + BPF_FUNC_get_current_uid_gid, + + /** + * bpf_get_current_comm(char *buf, int size_of_buf) + * stores current->comm into buf + * Return: 0 on success + */ + BPF_FUNC_get_current_comm, __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1e00aa3316dc..1fc45cc83076 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -730,6 +730,9 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; +const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; +const struct bpf_func_proto bpf_get_current_comm_proto __weak; /* Always built-in helper functions. 
*/ const struct bpf_func_proto bpf_tail_call_proto = { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7ad5d8842d5b..1447ec09421e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -124,3 +126,59 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = { .gpl_only = true, .ret_type = RET_INTEGER, }; + +static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct task_struct *task = current; + + if (!task) + return -EINVAL; + + return (u64) task->tgid << 32 | task->pid; +} + +const struct bpf_func_proto bpf_get_current_pid_tgid_proto = { + .func = bpf_get_current_pid_tgid, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct task_struct *task = current; + kuid_t uid; + kgid_t gid; + + if (!task) + return -EINVAL; + + current_uid_gid(&uid, &gid); + return (u64) from_kgid(&init_user_ns, gid) << 32 | + from_kuid(&init_user_ns, uid); +} + +const struct bpf_func_proto bpf_get_current_uid_gid_proto = { + .func = bpf_get_current_uid_gid, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +{ + struct task_struct *task = current; + char *buf = (char *) (long) r1; + + if (!task) + return -EINVAL; + + memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + return 0; +} + +const struct bpf_func_proto bpf_get_current_comm_proto = { + .func = bpf_get_current_comm, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 50c4015a8ad3..3a17638cdf46 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -162,6 +162,12 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_ktime_get_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; case BPF_FUNC_trace_printk: /* diff --git a/net/core/filter.c b/net/core/filter.c index d271c06bf01f..20aa51ccbf9d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1459,6 +1459,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_comm: + return &bpf_get_current_comm_proto; default: return sk_filter_func_proto(func_id); } diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index f531a0b3282d..bdf1c1607b80 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -25,6 +25,12 @@ static void (*bpf_tail_call)(void *ctx, void *map, int index) = (void *) BPF_FUNC_tail_call; static unsigned long long (*bpf_get_smp_processor_id)(void) = (void *) BPF_FUNC_get_smp_processor_id; +static unsigned long long (*bpf_get_current_pid_tgid)(void) = + (void *) BPF_FUNC_get_current_pid_tgid; +static unsigned long 
long (*bpf_get_current_uid_gid)(void) = + (void *) BPF_FUNC_get_current_uid_gid; +static int (*bpf_get_current_comm)(void *buf, int buf_size) = + (void *) BPF_FUNC_get_current_comm; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c index 19ec1cfc45db..dc50f4f2943f 100644 --- a/samples/bpf/tracex2_kern.c +++ b/samples/bpf/tracex2_kern.c @@ -62,11 +62,18 @@ static unsigned int log2l(unsigned long v) return log2(v); } +struct hist_key { + char comm[16]; + u64 pid_tgid; + u64 uid_gid; + u32 index; +}; + struct bpf_map_def SEC("maps") my_hist_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(u32), + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(struct hist_key), .value_size = sizeof(long), - .max_entries = 64, + .max_entries = 1024, }; SEC("kprobe/sys_write") @@ -75,11 +82,18 @@ int bpf_prog3(struct pt_regs *ctx) long write_size = ctx->dx; /* arg3 */ long init_val = 1; long *value; - u32 index = log2l(write_size); + struct hist_key key = {}; + + key.index = log2l(write_size); + key.pid_tgid = bpf_get_current_pid_tgid(); + key.uid_gid = bpf_get_current_uid_gid(); + bpf_get_current_comm(&key.comm, sizeof(key.comm)); - value = bpf_map_lookup_elem(&my_hist_map, &index); + value = bpf_map_lookup_elem(&my_hist_map, &key); if (value) __sync_fetch_and_add(value, 1); + else + bpf_map_update_elem(&my_hist_map, &key, &init_val, BPF_ANY); return 0; } char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index 91b8d0896fbb..cd0241c1447a 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "libbpf.h" #include "bpf_load.h" @@ -20,23 +21,42 @@ static void stars(char *str, long val, long max, int width) str[i] = '\0'; } -static void print_hist(int fd) +struct task { + char comm[16]; + __u64 pid_tgid; + __u64 uid_gid; +}; + +struct hist_key { + struct task t; + __u32 index; +}; + +#define SIZE sizeof(struct task) + +static void print_hist_for_pid(int fd, void *task) { - int key; + struct hist_key key = {}, next_key; + char starstr[MAX_STARS]; long value; long data[MAX_INDEX] = {}; - char starstr[MAX_STARS]; - int i; int max_ind = -1; long max_value = 0; + int i, ind; - for (key = 0; key < MAX_INDEX; key++) { - bpf_lookup_elem(fd, &key, &value); - data[key] = value; - if (value && key > max_ind) - max_ind = key; + while (bpf_get_next_key(fd, &key, &next_key) == 0) { + if (memcmp(&next_key, task, SIZE)) { + key = next_key; + continue; + } + bpf_lookup_elem(fd, &next_key, &value); + ind = next_key.index; + data[ind] = value; + if (value && ind > max_ind) + max_ind = ind; if (value > max_value) max_value = value; + key = next_key; } printf(" syscall write() stats\n"); @@ -48,6 +68,35 @@ static void print_hist(int fd) MAX_STARS, starstr); } } + +static void print_hist(int fd) +{ + struct hist_key key = {}, next_key; + static struct task tasks[1024]; + int task_cnt = 0; + int i; + + while (bpf_get_next_key(fd, &key, &next_key) == 0) { + int found = 0; + + for (i = 0; i < task_cnt; i++) + if (memcmp(&tasks[i], &next_key, SIZE) == 0) + found = 1; + if (!found) + memcpy(&tasks[task_cnt++], &next_key, SIZE); + key = next_key; + } + + for (i = 0; i < task_cnt; i++) { + printf("\npid %d cmd %s uid %d\n", + (__u32) tasks[i].pid_tgid, + tasks[i].comm, + (__u32) tasks[i].uid_gid); + print_hist_for_pid(fd, &tasks[i]); + } + +} + static void int_exit(int sig) { 
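/* print the final per-task histograms before the sample exits */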
print_hist(map_fd[1]); -- cgit v1.2.3 From 0756ea3e85139d23a8148ebaa95411c2f0aa4f11 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 12 Jun 2015 19:39:13 -0700 Subject: bpf: allow networking programs to use bpf_trace_printk() for debugging bpf_trace_printk() is a helper function used to debug eBPF programs. Let socket and TC programs use it as well. Note, it's a DEBUG ONLY helper. If it's used in the program, the kernel will print a warning banner to make sure users don't use it in production. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + kernel/bpf/core.c | 4 ++++ kernel/trace/bpf_trace.c | 20 ++++++++++++-------- net/core/filter.c | 2 ++ 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1b9a3f5b27f6..4383476a0d48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -150,6 +150,7 @@ struct bpf_array { u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); void bpf_prog_array_map_clear(struct bpf_map *map); bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +const struct bpf_func_proto *bpf_get_trace_printk_proto(void); #ifdef CONFIG_BPF_SYSCALL void bpf_register_prog_type(struct bpf_prog_type_list *tl); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1fc45cc83076..c5bedc82bc1c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -733,6 +733,10 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; +const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) +{ + return NULL; +} /* Always built-in helper functions.
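bpf_get_trace_printk_proto() above is likewise __weak and returns NULL by default, so the call sites stay safe even when the tracing subsystem that provides the real implementation is not built in.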
*/ const struct bpf_func_proto bpf_tail_call_proto = { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3a17638cdf46..4f9b5d41869b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -147,6 +147,17 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .arg2_type = ARG_CONST_STACK_SIZE, }; +const struct bpf_func_proto *bpf_get_trace_printk_proto(void) +{ + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; +} + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -168,15 +179,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; - case BPF_FUNC_trace_printk: - /* - * this program might be calling bpf_trace_printk, - * so allocate per-cpu printk buffers - */ - trace_printk_init_buffers(); - - return &bpf_trace_printk_proto; + return bpf_get_trace_printk_proto(); default: return NULL; } diff --git a/net/core/filter.c b/net/core/filter.c index 20aa51ccbf9d..65ff107d3d29 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1442,6 +1442,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); default: return NULL; } -- cgit v1.2.3 From 3b766cd832328fcb87db3507e7b98cf42f21689d Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Mon, 15 Jun 2015 17:59:07 +0300 Subject: net/core: Add reading VF statistics through the PF netdevice Add ndo_get_vf_stats where the PF retrieves and fills the VFs traffic statistics. We encode the VF stats in a nested manner to allow for future extensions. Signed-off-by: Eran Ben Elisha Signed-off-by: Hadar Hen Zion Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- include/linux/if_link.h | 9 ++++++++ include/linux/netdevice.h | 4 ++++ include/uapi/linux/if_link.h | 13 +++++++++++ net/core/rtnetlink.c | 51 ++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 75 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index da4929927f69..ae5d0d22955d 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -5,6 +5,15 @@ /* We don't want this structure exposed to user space */ +struct ifla_vf_stats { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 broadcast; + __u64 multicast; +}; + struct ifla_vf_info { __u32 vf; __u8 mac[32]; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6f5f71ff5169..e20979dfd6a9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1100,6 +1100,10 @@ struct net_device_ops { struct ifla_vf_info *ivf); int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); + int (*ndo_get_vf_stats)(struct net_device *dev, + int vf, + struct ifla_vf_stats + *vf_stats); int (*ndo_set_vf_port)(struct net_device *dev, int vf, struct nlattr *port[]); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 510efb360580..2c7e8e3d3981 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -484,6 +484,7 @@ enum { IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query * on/off switch */ + IFLA_VF_STATS, /* network device statistics */ __IFLA_VF_MAX, }; @@ -533,6 +534,18 @@ struct ifla_vf_rss_query_en { __u32 setting; }; +enum { + IFLA_VF_STATS_RX_PACKETS, + IFLA_VF_STATS_TX_PACKETS, + IFLA_VF_STATS_RX_BYTES, + IFLA_VF_STATS_TX_BYTES, + IFLA_VF_STATS_BROADCAST, + IFLA_VF_STATS_MULTICAST, + __IFLA_VF_STATS_MAX, +}; + +#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) + /* VF ports management section * * Nested layout of set/get msg is: diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 077b6d280371..2d102ce1474f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -819,7 +819,19 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + - nla_total_size(sizeof(struct ifla_vf_rss_query_en))); + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + + /* IFLA_VF_STATS_RX_PACKETS */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_PACKETS */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_BYTES */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_BYTES */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_BROADCAST */ + nla_total_size(sizeof(__u64)) + + /* IFLA_VF_STATS_MULTICAST */ + nla_total_size(sizeof(__u64))); return size; } else return 0; @@ -1123,7 +1135,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, && (ext_filter_mask & RTEXT_FILTER_VF)) { int i; - struct nlattr *vfinfo, *vf; + struct nlattr *vfinfo, *vf, *vfstats; int num_vfs = dev_num_vf(dev->dev.parent); vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST); @@ -1138,6 +1150,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_link_state vf_linkstate; struct ifla_vf_rss_query_en vf_rss_query_en; + struct ifla_vf_stats vf_stats; /* * Not all SR-IOV capable drivers support the @@ -1190,6 +1203,30 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, 
struct net_device *dev, sizeof(vf_rss_query_en), &vf_rss_query_en)) goto nla_put_failure; + memset(&vf_stats, 0, sizeof(vf_stats)); + if (dev->netdev_ops->ndo_get_vf_stats) + dev->netdev_ops->ndo_get_vf_stats(dev, i, + &vf_stats); + vfstats = nla_nest_start(skb, IFLA_VF_STATS); + if (!vfstats) { + nla_nest_cancel(skb, vf); + nla_nest_cancel(skb, vfinfo); + goto nla_put_failure; + } + if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets) || + nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes) || + nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast) || + nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast)) + goto nla_put_failure; + nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); } nla_nest_end(skb, vfinfo); @@ -1303,6 +1340,16 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = { [IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) }, [IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) }, [IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) }, + [IFLA_VF_STATS] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = { + [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 }, + [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 }, + [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 }, + [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 }, + [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 }, + [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 }, }; static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = { -- cgit v1.2.3 From eb4cb008529ca08e0d8c0fa54e8f739520197a65 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Mon, 15 Jun 2015 11:26:18 -0400 Subject: sock_diag: define destruction multicast groups These groups will contain socket-destruction events for AF_INET/AF_INET6, IPPROTO_TCP/IPPROTO_UDP. Near the end of socket destruction, a check for listeners is performed. In the presence of a listener, rather than completely cleaning up the socket, a unit of work will be added to a private work queue which will first broadcast information about the socket and then finish the cleanup operation. Signed-off-by: Craig Gallek Acked-by: Eric Dumazet Signed-off-by: David S.
Miller --- include/linux/sock_diag.h | 42 +++++++++++++++++++++ include/net/sock.h | 1 + include/uapi/linux/sock_diag.h | 10 +++++ net/core/sock.c | 11 +++++- net/core/sock_diag.c | 85 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 148 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 083ac388098e..fddebc617469 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -1,7 +1,10 @@ #ifndef __SOCK_DIAG_H__ #define __SOCK_DIAG_H__ +#include #include +#include +#include #include struct sk_buff; @@ -11,6 +14,7 @@ struct sock; struct sock_diag_handler { __u8 family; int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); + int (*get_info)(struct sk_buff *skb, struct sock *sk); }; int sock_diag_register(const struct sock_diag_handler *h); @@ -26,4 +30,42 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr); int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, struct sk_buff *skb, int attrtype); +static inline +enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk) +{ + switch (sk->sk_family) { + case AF_INET: + switch (sk->sk_protocol) { + case IPPROTO_TCP: + return SKNLGRP_INET_TCP_DESTROY; + case IPPROTO_UDP: + return SKNLGRP_INET_UDP_DESTROY; + default: + return SKNLGRP_NONE; + } + case AF_INET6: + switch (sk->sk_protocol) { + case IPPROTO_TCP: + return SKNLGRP_INET6_TCP_DESTROY; + case IPPROTO_UDP: + return SKNLGRP_INET6_UDP_DESTROY; + default: + return SKNLGRP_NONE; + } + default: + return SKNLGRP_NONE; + } +} + +static inline +bool sock_diag_has_destroy_listeners(const struct sock *sk) +{ + const struct net *n = sock_net(sk); + const enum sknetlink_groups group = sock_diag_destroy_group(sk); + + return group != SKNLGRP_NONE && n->diag_nlsk && + netlink_has_listeners(n->diag_nlsk, group); +} +void sock_diag_broadcast_destroy(struct sock *sk); + #endif diff --git a/include/net/sock.h b/include/net/sock.h index 26c1c3171e00..3e8258699270 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1518,6 +1518,7 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow) struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); +void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, diff --git a/include/uapi/linux/sock_diag.h b/include/uapi/linux/sock_diag.h index b00e29efb161..49230d36f9ce 100644 --- a/include/uapi/linux/sock_diag.h +++ b/include/uapi/linux/sock_diag.h @@ -23,4 +23,14 @@ enum { SK_MEMINFO_VARS, }; +enum sknetlink_groups { + SKNLGRP_NONE, + SKNLGRP_INET_TCP_DESTROY, + SKNLGRP_INET_UDP_DESTROY, + SKNLGRP_INET6_TCP_DESTROY, + SKNLGRP_INET6_UDP_DESTROY, + __SKNLGRP_MAX, +}; +#define SKNLGRP_MAX (__SKNLGRP_MAX - 1) + #endif /* _UAPI__SOCK_DIAG_H__ */ diff --git a/net/core/sock.c b/net/core/sock.c index 7063c329c1b6..1e1fe9a68d83 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -131,6 +131,7 @@ #include #include #include +#include #include @@ -1423,7 +1424,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } EXPORT_SYMBOL(sk_alloc); -static void __sk_free(struct sock *sk) +void sk_destruct(struct sock *sk) { struct sk_filter *filter; @@ -1451,6 +1452,14 @@ static void __sk_free(struct sock *sk) sk_prot_free(sk->sk_prot_creator, sk); } +static void __sk_free(struct sock *sk) +{ + if 
(unlikely(sock_diag_has_destroy_listeners(sk))) + sock_diag_broadcast_destroy(sk); + else + sk_destruct(sk); +} + void sk_free(struct sock *sk) { /* diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 74dddf84adcd..d79866c5f8bc 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include #include #include @@ -12,6 +15,7 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); +static struct workqueue_struct *broadcast_wq; static u64 sock_gen_cookie(struct sock *sk) { @@ -101,6 +105,62 @@ out: } EXPORT_SYMBOL(sock_diag_put_filterinfo); +struct broadcast_sk { + struct sock *sk; + struct work_struct work; +}; + +static size_t sock_diag_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ + + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ +} + +static void sock_diag_broadcast_destroy_work(struct work_struct *work) +{ + struct broadcast_sk *bsk = + container_of(work, struct broadcast_sk, work); + struct sock *sk = bsk->sk; + const struct sock_diag_handler *hndl; + struct sk_buff *skb; + const enum sknetlink_groups group = sock_diag_destroy_group(sk); + int err = -1; + + WARN_ON(group == SKNLGRP_NONE); + + skb = nlmsg_new(sock_diag_nlmsg_size(), GFP_KERNEL); + if (!skb) + goto out; + + mutex_lock(&sock_diag_table_mutex); + hndl = sock_diag_handlers[sk->sk_family]; + if (hndl && hndl->get_info) + err = hndl->get_info(skb, sk); + mutex_unlock(&sock_diag_table_mutex); + + if (!err) + nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, + GFP_KERNEL); + else + kfree_skb(skb); +out: + sk_destruct(sk); + kfree(bsk); +} + +void sock_diag_broadcast_destroy(struct sock *sk) +{ + /* Note, this function is often called from an interrupt context. 
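The allocation below therefore uses GFP_ATOMIC, and the actual netlink broadcast is deferred to process context via broadcast_wq.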
*/ + struct broadcast_sk *bsk = + kmalloc(sizeof(struct broadcast_sk), GFP_ATOMIC); + if (!bsk) + return sk_destruct(sk); + bsk->sk = sk; + INIT_WORK(&bsk->work, sock_diag_broadcast_destroy_work); + queue_work(broadcast_wq, &bsk->work); +} + void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) { mutex_lock(&sock_diag_table_mutex); @@ -211,10 +271,32 @@ static void sock_diag_rcv(struct sk_buff *skb) mutex_unlock(&sock_diag_mutex); } +static int sock_diag_bind(struct net *net, int group) +{ + switch (group) { + case SKNLGRP_INET_TCP_DESTROY: + case SKNLGRP_INET_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET]) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET); + break; + case SKNLGRP_INET6_TCP_DESTROY: + case SKNLGRP_INET6_UDP_DESTROY: + if (!sock_diag_handlers[AF_INET6]) + request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, + NETLINK_SOCK_DIAG, AF_INET6); + break; + } + return 0; +} + static int __net_init diag_net_init(struct net *net) { struct netlink_kernel_cfg cfg = { + .groups = SKNLGRP_MAX, .input = sock_diag_rcv, + .bind = sock_diag_bind, + .flags = NL_CFG_F_NONROOT_RECV, }; net->diag_nlsk = netlink_kernel_create(net, NETLINK_SOCK_DIAG, &cfg); @@ -234,12 +316,15 @@ static struct pernet_operations diag_net_ops = { static int __init sock_diag_init(void) { + broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); + BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } static void __exit sock_diag_exit(void) { unregister_pernet_subsys(&diag_net_ops); + destroy_workqueue(broadcast_wq); } module_init(sock_diag_init); -- cgit v1.2.3 From 89d256bb69f2596c3a31ac51466eac9e1791c388 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 15 Jun 2015 20:49:24 -0700 Subject: bpf: disallow bpf tc programs access current->pid,uid Accessing current->pid/uid from cls_bpf may lead to misleading results, so these helpers should not be used where TC classifiers need accurate information about pid/uid. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 65ff107d3d29..be3098fb65e4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1461,12 +1461,6 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 2c51a97f76d20ebf1f50fef908b986cb051fdff9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 16 Jun 2015 22:56:39 +0300 Subject: neigh: do not modify unlinked entries The lockless lookups can return an entry that is unlinked. Sometimes they get a reference before the last neigh_cleanup_and_release, sometimes they do not need a reference. Later, any modification attempts may result in the following problems: 1. the entry is not destroyed immediately because neigh_update can start the timer for a dead entry, e.g. on a change to NUD_REACHABLE state. As a result, the entry lives for some time but is invisible and out of control. 2.
__neigh_event_send can run in parallel with neigh_destroy while refcnt=0, but if the timer is started and expires, refcnt can reach 0 for a second time, leading to a second neigh_destroy and a possible crash. Thanks to Eric Dumazet and Ying Xue for their work and analysis on the __neigh_event_send change. Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour") Fixes: a263b3093641 ("ipv4: Make neigh lookups directly in output packet path.") Fixes: 6fd6ce2056de ("ipv6: Do not depend on rt->n in ip6_finish_output2().") Cc: Eric Dumazet Cc: Ying Xue Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3de654256028..2237c1b3cdd2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -957,6 +957,8 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) rc = 0; if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; + if (neigh->dead) + goto out_dead; if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh->parms, MCAST_PROBES) + @@ -1013,6 +1015,13 @@ out_unlock_bh: write_unlock(&neigh->lock); local_bh_enable(); return rc; + +out_dead: + if (neigh->nud_state & NUD_STALE) + goto out_unlock_bh; + write_unlock_bh(&neigh->lock); + kfree_skb(skb); + return 1; } EXPORT_SYMBOL(__neigh_event_send); @@ -1076,6 +1085,8 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, if (!(flags & NEIGH_UPDATE_F_ADMIN) && (old & (NUD_NOARP | NUD_PERMANENT))) goto out; + if (neigh->dead) + goto out; if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1225,6 +1236,8 @@ EXPORT_SYMBOL(neigh_update); */ void __neigh_set_probe_once(struct neighbour *neigh) { + if (neigh->dead) + return; neigh->updated = jiffies; if (!(neigh->nud_state & NUD_FAILED)) return; -- cgit v1.2.3 From 7d4f8d871ab15bd50a5771382ca2c9355b38d73c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Mon, 22 Jun 2015 00:27:17 -0700 Subject: switchdev: add VLAN support for port's bridge_getlink One more missing piece of the puzzle. Add vlan dump support to switchdev port's bridge_getlink. iproute2 "bridge vlan show" cmd already knows how to show the vlans installed on the bridge and the device, but (until now) no one implemented the port vlan part of the netlink PF_BRIDGE:RTM_GETLINK msg. Before this patch, "bridge vlan show":

$ bridge -c vlan show
port    vlan ids
sw1p1   30-34                   << bridge side vlans
        57
sw1p1                           << device side vlans (missing)
sw1p2   57
sw1p2
sw1p3
sw1p4
br0     None

(When the port is bridged, the output repeats the vlan list for the vlans on the bridge side of the port and the vlans on the device side of the port. The listing above shows no vlans for the device side even though they are installed). After this patch:

$ bridge -c vlan show
port    vlan ids
sw1p1   30-34                   << bridge side vlan
        57
sw1p1   30-34                   << device side vlans
        57
        3840 PVID
sw1p2   57
sw1p2   57
        3840 PVID
sw1p3   3842 PVID
sw1p4   3843 PVID
br0     None

I re-used ndo_dflt_bridge_getlink to add a vlan fill call-back func. switchdev support adds an obj dump for VLAN objects, using the same call-back scheme as FDB dump. Support is included for both compressed and un-compressed vlan dumps. Signed-off-by: Scott Feldman Signed-off-by: David S.
Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 4 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +- drivers/net/ethernet/rocker/rocker.c | 25 ++++++ include/linux/rtnetlink.h | 6 +- net/core/rtnetlink.c | 18 +++- net/switchdev/switchdev.c | 123 +++++++++++++++++++++++++- 7 files changed, 172 insertions(+), 9 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index c0f34845cf59..6f642426308c 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5096,7 +5096,7 @@ static int be_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return ndo_dflt_bridge_getlink(skb, pid, seq, dev, hsw_mode == PORT_FWD_TYPE_VEPA ? BRIDGE_MODE_VEPA : BRIDGE_MODE_VEB, - 0, 0, nlflags); + 0, 0, nlflags, filter_mask, NULL); } #ifdef CONFIG_BE2NET_VXLAN diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 52d7d8b8f1f9..48a52b35b614 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -8069,7 +8069,7 @@ static int i40e_ndo_bridge_setlink(struct net_device *dev, #ifdef HAVE_BRIDGE_FILTER static int i40e_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, - u32 __always_unused filter_mask, int nlflags) + u32 filter_mask, int nlflags) #else static int i40e_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, int nlflags) @@ -8095,7 +8095,7 @@ static int i40e_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return 0; return ndo_dflt_bridge_getlink(skb, pid, seq, dev, veb->bridge_mode, - nlflags); + 0, 0, nlflags, filter_mask, NULL); } #endif /* HAVE_BRIDGE_ATTRIBS */ diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 3bf2f3cfd9f6..9aa6104e34ea 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -8095,7 +8095,8 @@ static int ixgbe_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return 0; return ndo_dflt_bridge_getlink(skb, pid, seq, dev, - adapter->bridge_mode, 0, 0, nlflags); + adapter->bridge_mode, 0, 0, nlflags, + filter_mask, NULL); } static void *ixgbe_fwd_add(struct net_device *pdev, struct net_device *vdev) diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 7d5d92a10284..83e913b60d9c 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4456,6 +4456,28 @@ static int rocker_port_fdb_dump(const struct rocker_port *rocker_port, return err; } +static int rocker_port_vlan_dump(const struct rocker_port *rocker_port, + struct switchdev_obj *obj) +{ + struct switchdev_obj_vlan *vlan = &obj->u.vlan; + u16 vid; + int err = 0; + + for (vid = 1; vid < VLAN_N_VID; vid++) { + if (!test_bit(vid, rocker_port->vlan_bitmap)) + continue; + vlan->flags = 0; + if (rocker_vlan_id_is_internal(htons(vid))) + vlan->flags |= BRIDGE_VLAN_INFO_PVID; + vlan->vid_begin = vlan->vid_end = vid; + err = obj->cb(rocker_port->dev, obj); + if (err) + break; + } + + return err; +} + static int rocker_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj) { @@ -4466,6 +4488,9 @@ static int rocker_port_obj_dump(struct net_device *dev, case SWITCHDEV_OBJ_PORT_FDB: err = rocker_port_fdb_dump(rocker_port, obj); break; + case
SWITCHDEV_OBJ_PORT_VLAN: + err = rocker_port_vlan_dump(rocker_port, obj); + break; default: err = -EOPNOTSUPP; break; diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index a2324fb45cf4..39adaa9529eb 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -114,5 +114,9 @@ extern int ndo_dflt_fdb_del(struct ndmsg *ndm, extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, - u32 flags, u32 mask, int nlflags); + u32 flags, u32 mask, int nlflags, + u32 filter_mask, + int (*vlan_fill)(struct sk_buff *skb, + struct net_device *dev, + u32 filter_mask)); #endif /* __LINUX_RTNETLINK_H */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2d102ce1474f..01ced4a889e0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2908,7 +2908,11 @@ static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, - u32 flags, u32 mask, int nlflags) + u32 flags, u32 mask, int nlflags, + u32 filter_mask, + int (*vlan_fill)(struct sk_buff *skb, + struct net_device *dev, + u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; @@ -2916,6 +2920,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct nlattr *protinfo; u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); + int err = 0; nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) @@ -2956,6 +2961,13 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, goto nla_put_failure; } } + if (vlan_fill) { + err = vlan_fill(skb, dev, filter_mask); + if (err) { + nla_nest_cancel(skb, br_afspec); + goto nla_put_failure; + } + } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED); @@ -2989,9 +3001,9 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return 0; nla_put_failure: nlmsg_cancel(skb, nlh); - return -EMSGSIZE; + return err ? 
err : -EMSGSIZE; } -EXPORT_SYMBOL(ndo_dflt_bridge_getlink); +EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 448d9199cea2..3e4b35a0c8cb 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -391,6 +391,126 @@ int call_switchdev_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_switchdev_notifiers); +struct switchdev_vlan_dump { + struct switchdev_obj obj; + struct sk_buff *skb; + u32 filter_mask; + u16 flags; + u16 begin; + u16 end; +}; + +static int switchdev_port_vlan_dump_put(struct net_device *dev, + struct switchdev_vlan_dump *dump) +{ + struct bridge_vlan_info vinfo; + + vinfo.flags = dump->flags; + + if (dump->begin == 0 && dump->end == 0) { + return 0; + } else if (dump->begin == dump->end) { + vinfo.vid = dump->begin; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + } else { + vinfo.vid = dump->begin; + vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + vinfo.vid = dump->end; + vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN; + vinfo.flags |= BRIDGE_VLAN_INFO_RANGE_END; + if (nla_put(dump->skb, IFLA_BRIDGE_VLAN_INFO, + sizeof(vinfo), &vinfo)) + return -EMSGSIZE; + } + + return 0; +} + +static int switchdev_port_vlan_dump_cb(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_vlan_dump *dump = + container_of(obj, struct switchdev_vlan_dump, obj); + struct switchdev_obj_vlan *vlan = &dump->obj.u.vlan; + int err = 0; + + if (vlan->vid_begin > vlan->vid_end) + return -EINVAL; + + if (dump->filter_mask & RTEXT_FILTER_BRVLAN) { + dump->flags = vlan->flags; + for (dump->begin = dump->end = vlan->vid_begin; + dump->begin <= vlan->vid_end; + dump->begin++, dump->end++) { + err = switchdev_port_vlan_dump_put(dev, dump); + if (err) + return err; + } + } else if (dump->filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) { + if (dump->begin > vlan->vid_begin && + dump->begin >= vlan->vid_end) { + if ((dump->begin - 1) == vlan->vid_end && + dump->flags == vlan->flags) { + /* prepend */ + dump->begin = vlan->vid_begin; + } else { + err = switchdev_port_vlan_dump_put(dev, dump); + dump->flags = vlan->flags; + dump->begin = vlan->vid_begin; + dump->end = vlan->vid_end; + } + } else if (dump->end <= vlan->vid_begin && + dump->end < vlan->vid_end) { + if ((dump->end + 1) == vlan->vid_begin && + dump->flags == vlan->flags) { + /* append */ + dump->end = vlan->vid_end; + } else { + err = switchdev_port_vlan_dump_put(dev, dump); + dump->flags = vlan->flags; + dump->begin = vlan->vid_begin; + dump->end = vlan->vid_end; + } + } else { + err = -EINVAL; + } + } + + return err; +} + +static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev, + u32 filter_mask) +{ + struct switchdev_vlan_dump dump = { + .obj = { + .id = SWITCHDEV_OBJ_PORT_VLAN, + .cb = switchdev_port_vlan_dump_cb, + }, + .skb = skb, + .filter_mask = filter_mask, + }; + int err = 0; + + if ((filter_mask & RTEXT_FILTER_BRVLAN) || + (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) { + err = switchdev_port_obj_dump(dev, &dump.obj); + if (err) + goto err_out; + if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) + /* last one */ + err = switchdev_port_vlan_dump_put(dev, &dump); + } + +err_out: + return err == -EOPNOTSUPP ? 
0 : err; +} + /** * switchdev_port_bridge_getlink - Get bridge port attributes * @@ -415,7 +535,8 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, return err; return ndo_dflt_bridge_getlink(skb, pid, seq, dev, mode, - attr.u.brport_flags, mask, nlflags); + attr.u.brport_flags, mask, nlflags, + filter_mask, switchdev_port_vlan_fill); } EXPORT_SYMBOL_GPL(switchdev_port_bridge_getlink); -- cgit v1.2.3
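For a driver that has nothing VLAN-related to report, the two new arguments of ndo_dflt_bridge_getlink() can simply be passed through, as be2net and ixgbe do above; the declaration lives in include/linux/rtnetlink.h per this patch. A minimal sketch of such an ndo_bridge_getlink implementation follows (the foo_ prefix and the fixed BRIDGE_MODE_VEB mode are illustrative assumptions, not taken from the patch):

static int foo_ndo_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
				  struct net_device *dev, u32 filter_mask,
				  int nlflags)
{
	/* No bridge port flags, no flag mask, and a NULL vlan_fill
	 * callback: the default helper emits the usual IFLA_AF_SPEC
	 * attributes and skips the per-port VLAN dump entirely.
	 */
	return ndo_dflt_bridge_getlink(skb, pid, seq, dev, BRIDGE_MODE_VEB,
				       0, 0, nlflags, filter_mask, NULL);
}

A driver that does have VLANs to report instead passes a fill callback, as switchdev_port_bridge_getlink() does with switchdev_port_vlan_fill() above.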