From 470acf55a021713869b9bcc967268ac90c8a0fac Mon Sep 17 00:00:00 2001
From: Gao Feng <fgao@ikuai8.com>
Date: Fri, 14 Apr 2017 10:00:08 +0800
Subject: netfilter: xt_CT: fix refcnt leak on error path

There are two cases which causes refcnt leak.

1. When nf_ct_timeout_ext_add failed in xt_ct_set_timeout, it should
free the timeout refcnt.
Now goto the err_put_timeout error handler instead of going ahead.

2. When the time policy is not found, we should call module_put.
Otherwise, the related cthelper module cannot be removed anymore.
It is easy to reproduce by typing the following command:
  # iptables -t raw -A OUTPUT -p tcp -j CT --helper ftp --timeout xxx

Signed-off-by: Gao Feng <fgao@ikuai8.com>
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_CT.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index b008db0184b8..81fdcdca7457 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -167,8 +167,10 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
 		goto err_put_timeout;
 	}
 	timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC);
-	if (timeout_ext == NULL)
+	if (!timeout_ext) {
 		ret = -ENOMEM;
+		goto err_put_timeout;
+	}
 
 	rcu_read_unlock();
 	return ret;
@@ -200,6 +202,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 			  struct xt_ct_target_info_v1 *info)
 {
 	struct nf_conntrack_zone zone;
+	struct nf_conn_help *help;
 	struct nf_conn *ct;
 	int ret = -EOPNOTSUPP;
 
@@ -248,7 +251,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 	if (info->timeout[0]) {
 		ret = xt_ct_set_timeout(ct, par, info->timeout);
 		if (ret < 0)
-			goto err3;
+			goto err4;
 	}
 	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
 	nf_conntrack_get(&ct->ct_general);
@@ -256,6 +259,10 @@ out:
 	info->ct = ct;
 	return 0;
 
+err4:
+	help = nfct_help(ct);
+	if (help)
+		module_put(help->helper->me);
 err3:
 	nf_ct_tmpl_free(ct);
 err2:
-- 
cgit v1.2.3


From cf5d70918877c6a6655dc1e92e2ebb661ce904fd Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Fri, 14 Apr 2017 14:26:38 -0700
Subject: openvswitch: Delete conntrack entry clashing with an expectation.

Conntrack helpers do not check for a potentially clashing conntrack
entry when creating a new expectation.  Also, nf_conntrack_in() will
check expectations (via init_conntrack()) only if a conntrack entry
can not be found.  The expectation for a packet which also matches an
existing conntrack entry will not be removed by conntrack, and is
currently handled inconsistently by OVS, as OVS expects the
expectation to be removed when the connection tracking entry matching
that expectation is confirmed.

It should be noted that normally an IP stack would not allow reuse of
a 5-tuple of an old (possibly lingering) connection for a new data
connection, so this is somewhat unlikely corner case.  However, it is
possible that a misbehaving source could cause conntrack entries be
created that could then interfere with new related connections.

Fix this in the OVS module by deleting the clashing conntrack entry
after an expectation has been matched.  This causes the following
nf_conntrack_in() call also find the expectation and remove it when
creating the new conntrack entry, as well as the forthcoming reply
direction packets to match the new related connection instead of the
old clashing conntrack entry.

Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action")
Reported-by: Yang Song <yangsong@vmware.com>
Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/openvswitch/conntrack.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 7b2c2fce408a..c92a5795dcda 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -514,10 +514,38 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
 		   u16 proto, const struct sk_buff *skb)
 {
 	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_expect *exp;
 
 	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
 		return NULL;
-	return __nf_ct_expect_find(net, zone, &tuple);
+
+	exp = __nf_ct_expect_find(net, zone, &tuple);
+	if (exp) {
+		struct nf_conntrack_tuple_hash *h;
+
+		/* Delete existing conntrack entry, if it clashes with the
+		 * expectation.  This can happen since conntrack ALGs do not
+		 * check for clashes between (new) expectations and existing
+		 * conntrack entries.  nf_conntrack_in() will check the
+		 * expectations only if a conntrack entry can not be found,
+		 * which can lead to OVS finding the expectation (here) in the
+		 * init direction, but which will not be removed by the
+		 * nf_conntrack_in() call, if a matching conntrack entry is
+		 * found instead.  In this case all init direction packets
+		 * would be reported as new related packets, while reply
+		 * direction packets would be reported as un-related
+		 * established packets.
+		 */
+		h = nf_conntrack_find_get(net, zone, &tuple);
+		if (h) {
+			struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+			nf_ct_delete(ct, 0, 0);
+			nf_conntrack_put(&ct->ct_general);
+		}
+	}
+
+	return exp;
 }
 
 /* This replicates logic from nf_conntrack_core.c that is not exported. */
-- 
cgit v1.2.3


From 66e5a6b18bd09d0431e97cd3c162e76c5c2aebba Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Sat, 15 Apr 2017 19:27:42 +0800
Subject: netfilter: nf_ct_helper: permit cthelpers with different names via
 nfnetlink

cthelpers added via nfnetlink may have the same tuple, i.e. except for
the l3proto and l4proto, other fields are all zero. So even with the
different names, we will also fail to add them:
  # nfct helper add ssdp inet udp
  # nfct helper add tftp inet udp
  nfct v1.4.3: netlink error: File exists

So in order to avoid unpredictable behaviour, we should:
1. cthelpers can be selected by nft ct helper obj or xt_CT target, so
report error if duplicated { name, l3proto, l4proto } tuple exist.
2. cthelpers can be selected by nf_ct_tuple_src_mask_cmp when
nf_ct_auto_assign_helper is enabled, so also report error if duplicated
{ l3proto, l4proto, src-port } tuple exist.

Also note, if the cthelper is added from userspace, then the src-port will
always be zero, it's invalid for nf_ct_auto_assign_helper, so there's no
need to check the second point listed above.

Fixes: 893e093c786c ("netfilter: nf_ct_helper: bail out on duplicated helpers")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_helper.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 4eeb3418366a..99bcd44aac70 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -386,17 +386,33 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
 	struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) };
 	unsigned int h = helper_hash(&me->tuple);
 	struct nf_conntrack_helper *cur;
-	int ret = 0;
+	int ret = 0, i;
 
 	BUG_ON(me->expect_policy == NULL);
 	BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES);
 	BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1);
 
 	mutex_lock(&nf_ct_helper_mutex);
-	hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
-		if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) {
-			ret = -EEXIST;
-			goto out;
+	for (i = 0; i < nf_ct_helper_hsize; i++) {
+		hlist_for_each_entry(cur, &nf_ct_helper_hash[i], hnode) {
+			if (!strcmp(cur->name, me->name) &&
+			    (cur->tuple.src.l3num == NFPROTO_UNSPEC ||
+			     cur->tuple.src.l3num == me->tuple.src.l3num) &&
+			    cur->tuple.dst.protonum == me->tuple.dst.protonum) {
+				ret = -EEXIST;
+				goto out;
+			}
+		}
+	}
+
+	/* avoid unpredictable behaviour for auto_assign_helper */
+	if (!(me->flags & NF_CT_HELPER_F_USERSPACE)) {
+		hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) {
+			if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple,
+						     &mask)) {
+				ret = -EEXIST;
+				goto out;
+			}
 		}
 	}
 	hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]);
-- 
cgit v1.2.3


From 54a5f9d9abab639039eb7288bdc26c9c67f4e79b Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Sat, 15 Apr 2017 23:37:36 +0800
Subject: netfilter: nft_set_bitmap: free dummy elements when destroy the set

We forget to free dummy elements when deleting the set. So when I was
running nft-test.py, I saw many kmemleak warnings:
  kmemleak: 1344 new suspected memory leaks ...
  # cat /sys/kernel/debug/kmemleak
  unreferenced object 0xffff8800631345c8 (size 32):
  comm "nft", pid 9075, jiffies 4295743309 (age 1354.815s)
  hex dump (first 32 bytes):
    f8 63 13 63 00 88 ff ff 88 79 13 63 00 88 ff ff  .c.c.....y.c....
    04 0c 00 00 00 00 00 00 00 00 00 00 08 03 00 00  ................
  backtrace:
    [<ffffffff819059da>] kmemleak_alloc+0x4a/0xa0
    [<ffffffff81288174>] __kmalloc+0x164/0x310
    [<ffffffffa061269d>] nft_set_elem_init+0x3d/0x1b0 [nf_tables]
    [<ffffffffa06130da>] nft_add_set_elem+0x45a/0x8c0 [nf_tables]
    [<ffffffffa0613645>] nf_tables_newsetelem+0x105/0x1d0 [nf_tables]
    [<ffffffffa05fe6d4>] nfnetlink_rcv+0x414/0x770 [nfnetlink]
    [<ffffffff817f0ca6>] netlink_unicast+0x1f6/0x310
    [<ffffffff817f10c6>] netlink_sendmsg+0x306/0x3b0
  ...

Fixes: e920dde516088 ("netfilter: nft_set_bitmap: keep a list of dummy elements")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_set_bitmap.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 8ebbc2940f4c..b988162b5b15 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -257,6 +257,11 @@ static int nft_bitmap_init(const struct nft_set *set,
 
 static void nft_bitmap_destroy(const struct nft_set *set)
 {
+	struct nft_bitmap *priv = nft_set_priv(set);
+	struct nft_bitmap_elem *be, *n;
+
+	list_for_each_entry_safe(be, n, &priv->list, head)
+		nft_set_elem_destroy(set, be, true);
 }
 
 static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
-- 
cgit v1.2.3


From 14e567615679a9999ce6bf4f23d6c9e00f03e00e Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Mon, 17 Apr 2017 21:18:55 +0800
Subject: netfilter: ctnetlink: drop the incorrect cthelper module request

First, when creating a new ct, we will invoke request_module to try to
load the related inkernel cthelper. So there's no need to call the
request_module again when updating the ct helpinfo.

Second, ctnetlink_change_helper may be called with rcu_read_lock held,
i.e. rcu_read_lock -> nfqnl_recv_verdict -> nfqnl_ct_parse ->
ctnetlink_glue_parse -> ctnetlink_glue_parse_ct ->
ctnetlink_change_helper. But the request_module invocation may sleep,
so we can't call it with the rcu_read_lock held.

Remove it now.

Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index dc7dfd68fafe..48c184552de0 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1512,23 +1512,8 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
 
 	helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
 					    nf_ct_protonum(ct));
-	if (helper == NULL) {
-#ifdef CONFIG_MODULES
-		spin_unlock_bh(&nf_conntrack_expect_lock);
-
-		if (request_module("nfct-helper-%s", helpname) < 0) {
-			spin_lock_bh(&nf_conntrack_expect_lock);
-			return -EOPNOTSUPP;
-		}
-
-		spin_lock_bh(&nf_conntrack_expect_lock);
-		helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
-						    nf_ct_protonum(ct));
-		if (helper)
-			return -EAGAIN;
-#endif
+	if (helper == NULL)
 		return -EOPNOTSUPP;
-	}
 
 	if (help) {
 		if (help->helper == helper) {
-- 
cgit v1.2.3


From 88be4c09d9008f9ff337cbf48c5d0f06c8f872e7 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Mon, 17 Apr 2017 21:18:56 +0800
Subject: netfilter: ctnetlink: fix deadlock due to acquire _expect_lock twice

Currently, ctnetlink_change_conntrack is always protected by _expect_lock,
but this will cause a deadlock when deleting the helper from a conntrack,
as the _expect_lock will be acquired again by nf_ct_remove_expectations:

         CPU0
        ----
  lock(nf_conntrack_expect_lock);
  lock(nf_conntrack_expect_lock);

  *** DEADLOCK ***
  May be due to missing lock nesting notation

  2 locks held by lt-conntrack_gr/12853:
  #0:  (&table[i].mutex){+.+.+.}, at: [<ffffffffa05e2009>]
       nfnetlink_rcv_msg+0x399/0x6a9 [nfnetlink]
  #1:  (nf_conntrack_expect_lock){+.....}, at: [<ffffffffa05f2c1f>]
       ctnetlink_new_conntrack+0x17f/0x408 [nf_conntrack_netlink]

  Call Trace:
   dump_stack+0x85/0xc2
   __lock_acquire+0x1608/0x1680
   ? ctnetlink_parse_tuple_proto+0x10f/0x1c0 [nf_conntrack_netlink]
   lock_acquire+0x100/0x1f0
   ? nf_ct_remove_expectations+0x32/0x90 [nf_conntrack]
   _raw_spin_lock_bh+0x3f/0x50
   ? nf_ct_remove_expectations+0x32/0x90 [nf_conntrack]
   nf_ct_remove_expectations+0x32/0x90 [nf_conntrack]
   ctnetlink_change_helper+0xc6/0x190 [nf_conntrack_netlink]
   ctnetlink_new_conntrack+0x1b2/0x408 [nf_conntrack_netlink]
   nfnetlink_rcv_msg+0x60a/0x6a9 [nfnetlink]
   ? nfnetlink_rcv_msg+0x1b9/0x6a9 [nfnetlink]
   ? nfnetlink_bind+0x1a0/0x1a0 [nfnetlink]
   netlink_rcv_skb+0xa4/0xc0
   nfnetlink_rcv+0x87/0x770 [nfnetlink]

Since the operations are unrelated to nf_ct_expect, so we can drop the
_expect_lock. Also note, after removing the _expect_lock protection,
another CPU may invoke nf_conntrack_helper_unregister, so we should
use rcu_read_lock to protect __nf_conntrack_helper_find invoked by
ctnetlink_change_helper.

Fixes: ca7433df3a67 ("netfilter: conntrack: seperate expect locking from nf_conntrack_lock")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 48c184552de0..e5f97777b1f4 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1510,23 +1510,29 @@ static int ctnetlink_change_helper(struct nf_conn *ct,
 		return 0;
 	}
 
+	rcu_read_lock();
 	helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct),
 					    nf_ct_protonum(ct));
-	if (helper == NULL)
+	if (helper == NULL) {
+		rcu_read_unlock();
 		return -EOPNOTSUPP;
+	}
 
 	if (help) {
 		if (help->helper == helper) {
 			/* update private helper data if allowed. */
 			if (helper->from_nlattr)
 				helper->from_nlattr(helpinfo, ct);
-			return 0;
+			err = 0;
 		} else
-			return -EBUSY;
+			err = -EBUSY;
+	} else {
+		/* we cannot set a helper for an existing conntrack */
+		err = -EOPNOTSUPP;
 	}
 
-	/* we cannot set a helper for an existing conntrack */
-	return -EOPNOTSUPP;
+	rcu_read_unlock();
+	return err;
 }
 
 static int ctnetlink_change_timeout(struct nf_conn *ct,
@@ -1945,9 +1951,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
 	err = -EEXIST;
 	ct = nf_ct_tuplehash_to_ctrack(h);
 	if (!(nlh->nlmsg_flags & NLM_F_EXCL)) {
-		spin_lock_bh(&nf_conntrack_expect_lock);
 		err = ctnetlink_change_conntrack(ct, cda);
-		spin_unlock_bh(&nf_conntrack_expect_lock);
 		if (err == 0) {
 			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
 						      (1 << IPCT_ASSURED) |
@@ -2342,11 +2346,7 @@ ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct)
 	if (ret < 0)
 		return ret;
 
-	spin_lock_bh(&nf_conntrack_expect_lock);
-	ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct);
-	spin_unlock_bh(&nf_conntrack_expect_lock);
-
-	return ret;
+	return ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct);
 }
 
 static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda,
-- 
cgit v1.2.3


From 53b56da83d7899de375a9de153fd7f5397de85e6 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Mon, 17 Apr 2017 21:18:57 +0800
Subject: netfilter: ctnetlink: make it safer when updating ct->status

After converting to use rcu for conntrack hash, one CPU may update
the ct->status via ctnetlink, while another CPU may process the
packets and update the ct->status.

So the non-atomic operation "ct->status |= status;" via ctnetlink
becomes unsafe, and this may clear the IPS_DYING_BIT bit set by
another CPU unexpectedly. For example:
         CPU0                            CPU1
  ctnetlink_change_status        __nf_conntrack_find_get
      old = ct->status              nf_ct_gc_expired
          -                         nf_ct_kill
          -                      test_and_set_bit(IPS_DYING_BIT
      new = old | status;                 -
  ct->status = new; <-- oops, _DYING_ is cleared!

Now using a series of atomic bit operation to solve the above issue.

Also note, user shouldn't set IPS_TEMPLATE, IPS_SEQ_ADJUST directly,
so make these two bits be unchangable too.

If we set the IPS_TEMPLATE_BIT, ct will be freed by nf_ct_tmpl_free,
but actually it is alloced by nf_conntrack_alloc.
If we set the IPS_SEQ_ADJUST_BIT, this may cause the NULL pointer
deference, as the nfct_seqadj(ct) maybe NULL.

Last, add some comments to describe the logic change due to the
commit a963d710f367 ("netfilter: ctnetlink: Fix regression in CTA_STATUS
processing"), which makes me feel a little confusing.

Fixes: 76507f69c44e ("[NETFILTER]: nf_conntrack: use RCU for conntrack hash")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 13 ++++++---
 net/netfilter/nf_conntrack_netlink.c               | 33 ++++++++++++++++------
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 6a8e33dd4ecb..38fc383139f0 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -82,10 +82,6 @@ enum ip_conntrack_status {
 	IPS_DYING_BIT = 9,
 	IPS_DYING = (1 << IPS_DYING_BIT),
 
-	/* Bits that cannot be altered from userland. */
-	IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
-				 IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING),
-
 	/* Connection has fixed timeout. */
 	IPS_FIXED_TIMEOUT_BIT = 10,
 	IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT),
@@ -101,6 +97,15 @@ enum ip_conntrack_status {
 	/* Conntrack got a helper explicitly attached via CT target. */
 	IPS_HELPER_BIT = 13,
 	IPS_HELPER = (1 << IPS_HELPER_BIT),
+
+	/* Be careful here, modifying these bits can make things messy,
+	 * so don't let users modify them directly.
+	 */
+	IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
+				 IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
+				 IPS_SEQ_ADJUST | IPS_TEMPLATE),
+
+	__IPS_MAX_BIT = 14,
 };
 
 /* Connection tracking event types */
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index e5f97777b1f4..86deed6a8db4 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1419,6 +1419,24 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
 }
 #endif
 
+static void
+__ctnetlink_change_status(struct nf_conn *ct, unsigned long on,
+			  unsigned long off)
+{
+	unsigned int bit;
+
+	/* Ignore these unchangable bits */
+	on &= ~IPS_UNCHANGEABLE_MASK;
+	off &= ~IPS_UNCHANGEABLE_MASK;
+
+	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
+		if (on & (1 << bit))
+			set_bit(bit, &ct->status);
+		else if (off & (1 << bit))
+			clear_bit(bit, &ct->status);
+	}
+}
+
 static int
 ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
 {
@@ -1438,10 +1456,7 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
 		/* ASSURED bit can only be set */
 		return -EBUSY;
 
-	/* Be careful here, modifying NAT bits can screw up things,
-	 * so don't let users modify them directly if they don't pass
-	 * nf_nat_range. */
-	ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK);
+	__ctnetlink_change_status(ct, status, 0);
 	return 0;
 }
 
@@ -1628,7 +1643,7 @@ ctnetlink_change_seq_adj(struct nf_conn *ct,
 		if (ret < 0)
 			return ret;
 
-		ct->status |= IPS_SEQ_ADJUST;
+		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
 	}
 
 	if (cda[CTA_SEQ_ADJ_REPLY]) {
@@ -1637,7 +1652,7 @@ ctnetlink_change_seq_adj(struct nf_conn *ct,
 		if (ret < 0)
 			return ret;
 
-		ct->status |= IPS_SEQ_ADJUST;
+		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
 	}
 
 	return 0;
@@ -2289,10 +2304,10 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[])
 	/* This check is less strict than ctnetlink_change_status()
 	 * because callers often flip IPS_EXPECTED bits when sending
 	 * an NFQA_CT attribute to the kernel.  So ignore the
-	 * unchangeable bits but do not error out.
+	 * unchangeable bits but do not error out. Also user programs
+	 * are allowed to clear the bits that they are allowed to change.
 	 */
-	ct->status = (status & ~IPS_UNCHANGEABLE_MASK) |
-		     (ct->status & IPS_UNCHANGEABLE_MASK);
+	__ctnetlink_change_status(ct, status, ~status);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 64f3967c7aacbfa1f0614cdb1a23e3f7e76eb61b Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Mon, 17 Apr 2017 21:18:58 +0800
Subject: netfilter: ctnetlink: acquire ct->lock before operating nf_ct_seqadj

We should acquire the ct->lock before accessing or modifying the
nf_ct_seqadj, as another CPU may modify the nf_ct_seqadj at the same
time during its packet proccessing.

Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_conntrack_netlink.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 86deed6a8db4..78f8c9adbd3c 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -417,8 +417,7 @@ nla_put_failure:
 	return -1;
 }
 
-static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb,
-				     const struct nf_conn *ct)
+static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct)
 {
 	struct nf_conn_seqadj *seqadj = nfct_seqadj(ct);
 	struct nf_ct_seqadj *seq;
@@ -426,15 +425,20 @@ static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb,
 	if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj)
 		return 0;
 
+	spin_lock_bh(&ct->lock);
 	seq = &seqadj->seq[IP_CT_DIR_ORIGINAL];
 	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1)
-		return -1;
+		goto err;
 
 	seq = &seqadj->seq[IP_CT_DIR_REPLY];
 	if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1)
-		return -1;
+		goto err;
 
+	spin_unlock_bh(&ct->lock);
 	return 0;
+err:
+	spin_unlock_bh(&ct->lock);
+	return -1;
 }
 
 static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
@@ -1637,11 +1641,12 @@ ctnetlink_change_seq_adj(struct nf_conn *ct,
 	if (!seqadj)
 		return 0;
 
+	spin_lock_bh(&ct->lock);
 	if (cda[CTA_SEQ_ADJ_ORIG]) {
 		ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL],
 				     cda[CTA_SEQ_ADJ_ORIG]);
 		if (ret < 0)
-			return ret;
+			goto err;
 
 		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
 	}
@@ -1650,12 +1655,16 @@ ctnetlink_change_seq_adj(struct nf_conn *ct,
 		ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY],
 				     cda[CTA_SEQ_ADJ_REPLY]);
 		if (ret < 0)
-			return ret;
+			goto err;
 
 		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
 	}
 
+	spin_unlock_bh(&ct->lock);
 	return 0;
+err:
+	spin_unlock_bh(&ct->lock);
+	return ret;
 }
 
 static int
-- 
cgit v1.2.3


From 6bd3d1929246bb0cf4f3bd6e25a0ba442b0de845 Mon Sep 17 00:00:00 2001
From: Peter Tirsek <peter@tirsek.com>
Date: Tue, 18 Apr 2017 12:39:58 -0500
Subject: netfilter: xt_socket: Fix broken IPv6 handling

Commit 834184b1f3a4 ("netfilter: defrag: only register defrag
functionality if needed") used the outdated XT_SOCKET_HAVE_IPV6 macro
which was removed earlier in commit 8db4c5be88f6 ("netfilter: move
socket lookup infrastructure to nf_socket_ipv{4,6}.c"). With that macro
never being defined, the xt_socket match emits an "Unknown family 10"
warning when used with IPv6:

WARNING: CPU: 0 PID: 1377 at net/netfilter/xt_socket.c:160 socket_mt_enable_defrag+0x47/0x50 [xt_socket]
Unknown family 10
Modules linked in: xt_socket nf_socket_ipv4 nf_socket_ipv6 nf_defrag_ipv4 [...]
CPU: 0 PID: 1377 Comm: ip6tables-resto Not tainted 4.10.10 #1
Hardware name: [...]
Call Trace:
? __warn+0xe7/0x100
? socket_mt_enable_defrag+0x47/0x50 [xt_socket]
? socket_mt_enable_defrag+0x47/0x50 [xt_socket]
? warn_slowpath_fmt+0x39/0x40
? socket_mt_enable_defrag+0x47/0x50 [xt_socket]
? socket_mt_v2_check+0x12/0x40 [xt_socket]
? xt_check_match+0x6b/0x1a0 [x_tables]
? xt_find_match+0x93/0xd0 [x_tables]
? xt_request_find_match+0x20/0x80 [x_tables]
? translate_table+0x48e/0x870 [ip6_tables]
? translate_table+0x577/0x870 [ip6_tables]
? walk_component+0x3a/0x200
? kmalloc_order+0x1d/0x50
? do_ip6t_set_ctl+0x181/0x490 [ip6_tables]
? filename_lookup+0xa5/0x120
? nf_setsockopt+0x3a/0x60
? ipv6_setsockopt+0xb0/0xc0
? sock_common_setsockopt+0x23/0x30
? SyS_socketcall+0x41d/0x630
? vfs_read+0xfa/0x120
? do_fast_syscall_32+0x7a/0x110
? entry_SYSENTER_32+0x47/0x71

This patch brings the conditional back in line with how the rest of the
file handles IPv6.

Fixes: 834184b1f3a4 ("netfilter: defrag: only register defrag functionality if needed")
Signed-off-by: Peter Tirsek <peter@tirsek.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 770bbec878f1..e75ef39669c5 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -152,7 +152,7 @@ static int socket_mt_enable_defrag(struct net *net, int family)
 	switch (family) {
 	case NFPROTO_IPV4:
 		return nf_defrag_ipv4_enable(net);
-#ifdef XT_SOCKET_HAVE_IPV6
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
 	case NFPROTO_IPV6:
 		return nf_defrag_ipv6_enable(net);
 #endif
-- 
cgit v1.2.3


From cf3cb246e277d98987aa8d62ef2730dbee2f5fa7 Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Wed, 19 Apr 2017 21:47:33 +0200
Subject: bridge: ebtables: fix reception of frames DNAT-ed to bridge
 device/port
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When trying to redirect bridged frames to the bridge device itself or
a bridge port (brouting) via the dnat target then this currently fails:

The ethernet destination of the frame is dnat'ed to the MAC address of
the bridge device or port just fine. However, the IP code drops it in
the beginning of ip_input.c/ip_rcv() as the dnat target left
the skb->pkt_type as PACKET_OTHERHOST.

Fixing this by resetting skb->pkt_type to an appropriate type after
dnat'ing.

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebt_dnat.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index 4e0b0c359325..e0bb624c3845 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -9,6 +9,7 @@
  */
 #include <linux/module.h>
 #include <net/sock.h>
+#include "../br_private.h"
 #include <linux/netfilter.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_bridge/ebtables.h>
@@ -18,11 +19,30 @@ static unsigned int
 ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct ebt_nat_info *info = par->targinfo;
+	struct net_device *dev;
 
 	if (!skb_make_writable(skb, 0))
 		return EBT_DROP;
 
 	ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);
+
+	if (is_multicast_ether_addr(info->mac)) {
+		if (is_broadcast_ether_addr(info->mac))
+			skb->pkt_type = PACKET_BROADCAST;
+		else
+			skb->pkt_type = PACKET_MULTICAST;
+	} else {
+		if (xt_hooknum(par) != NF_BR_BROUTING)
+			dev = br_port_get_rcu(xt_in(par))->br->dev;
+		else
+			dev = xt_in(par);
+
+		if (ether_addr_equal(info->mac, dev->dev_addr))
+			skb->pkt_type = PACKET_HOST;
+		else
+			skb->pkt_type = PACKET_OTHERHOST;
+	}
+
 	return info->target;
 }
 
-- 
cgit v1.2.3


From 277a292835c196894ef895d5e1fd6170bb916f55 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Sun, 23 Apr 2017 18:29:30 +0800
Subject: netfilter: nft_dynset: continue to next expr if _OP_ADD succeeded

Currently, after adding the following nft rules:
  # nft add set x target1 { type ipv4_addr \; flags timeout \;}
  # nft add rule x y set add ip daddr timeout 1d @target1 counter

the counters will always be zero despite of the elements are added
to the dynamic set "target1" or not, as we will break the nft expr
traversal unconditionally:
  # nft list ruleset
  ...
  set target1 {
      ...
      elements = { 8.8.8.8 expires 23h59m53s}
  }
  chain output {
      ...
      set add ip daddr timeout 1d @target1 counter packets 0 bytes 0
                                                           ^       ^
      ...
  }

Since we add the elements to the set successfully, we should continue
to the next expression.

Additionally, if elements are added to "flow table" successfully, we
will _always_ continue to the next expr, even if the operation is
_OP_ADD. So it's better to keep them to be consistent.

Fixes: 22fe54d5fefc ("netfilter: nf_tables: add support for dynamic set updates")
Reported-by: Robert White <rwhite@pobox.com>
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_dynset.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 049ad2d9ee66..fafbeea3ed04 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -82,8 +82,7 @@ static void nft_dynset_eval(const struct nft_expr *expr,
 		    nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
 			timeout = priv->timeout ? : set->timeout;
 			*nft_set_ext_expiration(ext) = jiffies + timeout;
-		} else if (sexpr == NULL)
-			goto out;
+		}
 
 		if (sexpr != NULL)
 			sexpr->ops->eval(sexpr, regs, pkt);
@@ -92,7 +91,7 @@ static void nft_dynset_eval(const struct nft_expr *expr,
 			regs->verdict.code = NFT_BREAK;
 		return;
 	}
-out:
+
 	if (!priv->invert)
 		regs->verdict.code = NFT_BREAK;
 }
-- 
cgit v1.2.3


From 9dd2ab609eef736d5639e0de1bcc2e71e714b28e Mon Sep 17 00:00:00 2001
From: Dave Johnson <dave-kernel@centerclick.org>
Date: Mon, 24 Apr 2017 09:11:24 -0400
Subject: netfilter: Wrong icmp6 checksum for ICMPV6_TIME_EXCEED in reverse
 SNATv6 path

When recalculating the outer ICMPv6 checksum for a reverse path NATv6
such as ICMPV6_TIME_EXCEED nf_nat_icmpv6_reply_translation() was
accessing data beyond the headlen of the skb for non-linear skb.  This
resulted in incorrect ICMPv6 checksum as garbage data was used.

Patch replaces csum_partial() with skb_checksum() which supports
non-linear skbs similar to nf_nat_icmp_reply_translation() from ipv4.

Signed-off-by: Dave Johnson <dave-kernel@centerclick.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index e0be97e636a4..69937b637ee5 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -235,7 +235,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
 		inside->icmp6.icmp6_cksum =
 			csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
 					skb->len - hdrlen, IPPROTO_ICMPV6,
-					csum_partial(&inside->icmp6,
+					skb_checksum(skb, hdrlen,
 						     skb->len - hdrlen, 0));
 	}
 
-- 
cgit v1.2.3


From 1442f6f7c1b77de1c508318164a527e240c24a4d Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Thu, 20 Apr 2017 11:44:16 +0200
Subject: ipvs: explicitly forbid ipv6 service/dest creation if ipv6 mod is
 disabled

When creating a new ipvs service, ipv6 addresses are always accepted
if CONFIG_IP_VS_IPV6 is enabled. On dest creation the address family
is not explicitly checked.

This allows the user-space to configure ipvs services even if the
system is booted with ipv6.disable=1. On specific configuration, ipvs
can try to call ipv6 routing code at setup time, causing the kernel to
oops due to fib6_rules_ops being NULL.

This change addresses the issue adding a check for the ipv6
module being enabled while validating ipv6 service operations and
adding the same validation for dest operations.

According to git history, this issue is apparently present since
the introduction of ipv6 support, and the oops can be triggered
since commit 09571c7ae30865ad ("IPVS: Add function to determine
if IPv6 address is local")

Fixes: 09571c7ae30865ad ("IPVS: Add function to determine if IPv6 address is local")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ctl.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5aeb0dde6ccc..4d753beaac32 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3078,6 +3078,17 @@ nla_put_failure:
 	return skb->len;
 }
 
+static bool ip_vs_is_af_valid(int af)
+{
+	if (af == AF_INET)
+		return true;
+#ifdef CONFIG_IP_VS_IPV6
+	if (af == AF_INET6 && ipv6_mod_enabled())
+		return true;
+#endif
+	return false;
+}
+
 static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
 				    struct ip_vs_service_user_kern *usvc,
 				    struct nlattr *nla, int full_entry,
@@ -3104,11 +3115,7 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
 	memset(usvc, 0, sizeof(*usvc));
 
 	usvc->af = nla_get_u16(nla_af);
-#ifdef CONFIG_IP_VS_IPV6
-	if (usvc->af != AF_INET && usvc->af != AF_INET6)
-#else
-	if (usvc->af != AF_INET)
-#endif
+	if (!ip_vs_is_af_valid(usvc->af))
 		return -EAFNOSUPPORT;
 
 	if (nla_fwmark) {
@@ -3610,6 +3617,11 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
 		if (udest.af == 0)
 			udest.af = svc->af;
 
+		if (!ip_vs_is_af_valid(udest.af)) {
+			ret = -EAFNOSUPPORT;
+			goto out;
+		}
+
 		if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
 			/* The synchronization protocol is incompatible
 			 * with mixed family services
-- 
cgit v1.2.3


From 7dde07e9c53617549d67dd3e1d791496d0d3868e Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 28 Apr 2017 15:57:56 +0300
Subject: netfilter: x_tables: unlock on error in xt_find_table_lock()

According to my static checker we should unlock here before the return.
That seems reasonable to me as well.

Fixes" b9e69e127397 ("netfilter: xtables: don't hook tables by default")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/x_tables.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 14857afc9937..f134d384852f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1051,8 +1051,10 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
 	list_for_each_entry(t, &init_net.xt.tables[af], list) {
 		if (strcmp(t->name, name))
 			continue;
-		if (!try_module_get(t->me))
+		if (!try_module_get(t->me)) {
+			mutex_unlock(&xt[af].mutex);
 			return NULL;
+		}
 
 		mutex_unlock(&xt[af].mutex);
 		if (t->table_init(net) != 0) {
-- 
cgit v1.2.3


From 1519fccb34371594f6a629bfad69605bc6f9dde3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 28 Apr 2017 16:54:37 +0200
Subject: netfilter: update MAINTAINERS file

Several updates on the MAINTAINERS section for Netfilter:

1) Add Florian Westphal, he's been part of the coreteam since October 2012.
   He's been dedicating tireless efforts to improve the Netfilter codebase,
   fix bugs and push ongoing new developments ever since.

2) Add http://www.nftables.org/ URL, currently pointing to
   http://www.netfilter.org.

3) Update project status from Supported to Maintained.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Florian Westphal <fw@strlen.de>
---
 MAINTAINERS | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 38d3e4ed7208..fc95cb06fb29 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8701,14 +8701,16 @@ F:	drivers/net/ethernet/neterion/
 NETFILTER
 M:	Pablo Neira Ayuso <pablo@netfilter.org>
 M:	Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+M:	Florian Westphal <fw@strlen.de>
 L:	netfilter-devel@vger.kernel.org
 L:	coreteam@netfilter.org
 W:	http://www.netfilter.org/
 W:	http://www.iptables.org/
+W:	http://www.nftables.org/
 Q:	http://patchwork.ozlabs.org/project/netfilter-devel/list/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git
-S:	Supported
+S:	Maintained
 F:	include/linux/netfilter*
 F:	include/linux/netfilter/
 F:	include/net/netfilter/
-- 
cgit v1.2.3


From 9744a6fcefcb4d56501d69adb04c24559d353cad Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 1 May 2017 12:58:50 +0200
Subject: netfilter: nf_tables: check if same extensions are set when adding
 elements

If no NLM_F_EXCL is set and the element already exists in the set, make
sure that both elements have the same extensions.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 434c739dfeca..11a96e8dd3cd 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3749,6 +3749,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	err = set->ops->insert(ctx->net, set, &elem, &ext2);
 	if (err) {
 		if (err == -EEXIST) {
+			if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
+			    nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
+			    nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
+			    nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF))
+				return -EBUSY;
 			if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
 			     nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
 			     memcmp(nft_set_ext_data(ext),
-- 
cgit v1.2.3


From 55de170382a92d6da6fc9f23efc21eb2f3d25126 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 2 May 2017 14:31:50 +0200
Subject: samples/bpf: adjust rlimit RLIMIT_MEMLOCK for traceex2, tracex3 and
 tracex4

Needed to adjust max locked memory RLIMIT_MEMLOCK for testing these bpf samples
as these are using more and larger maps than can fit in distro default 64Kbytes limit.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/tracex2_user.c | 7 +++++++
 samples/bpf/tracex3_user.c | 7 +++++++
 samples/bpf/tracex4_user.c | 8 ++++++++
 3 files changed, 22 insertions(+)

diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index ded9804c5034..7fee0f1ba9a3 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -4,6 +4,7 @@
 #include <signal.h>
 #include <linux/bpf.h>
 #include <string.h>
+#include <sys/resource.h>
 
 #include "libbpf.h"
 #include "bpf_load.h"
@@ -112,6 +113,7 @@ static void int_exit(int sig)
 
 int main(int ac, char **argv)
 {
+	struct rlimit r = {1024*1024, RLIM_INFINITY};
 	char filename[256];
 	long key, next_key, value;
 	FILE *f;
@@ -119,6 +121,11 @@ int main(int ac, char **argv)
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
 	signal(SIGINT, int_exit);
 
 	/* start 'ping' in the background to have some kfree_skb events */
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
index 8f7d199d5945..fe372239d505 100644
--- a/samples/bpf/tracex3_user.c
+++ b/samples/bpf/tracex3_user.c
@@ -11,6 +11,7 @@
 #include <stdbool.h>
 #include <string.h>
 #include <linux/bpf.h>
+#include <sys/resource.h>
 
 #include "libbpf.h"
 #include "bpf_load.h"
@@ -112,11 +113,17 @@ static void print_hist(int fd)
 
 int main(int ac, char **argv)
 {
+	struct rlimit r = {1024*1024, RLIM_INFINITY};
 	char filename[256];
 	int i;
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
 	if (load_bpf_file(filename)) {
 		printf("%s", bpf_log_buf);
 		return 1;
diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c
index 03449f773cb1..22c644f1f4c3 100644
--- a/samples/bpf/tracex4_user.c
+++ b/samples/bpf/tracex4_user.c
@@ -12,6 +12,8 @@
 #include <string.h>
 #include <time.h>
 #include <linux/bpf.h>
+#include <sys/resource.h>
+
 #include "libbpf.h"
 #include "bpf_load.h"
 
@@ -50,11 +52,17 @@ static void print_old_objects(int fd)
 
 int main(int ac, char **argv)
 {
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
 	char filename[256];
 	int i;
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
+		return 1;
+	}
+
 	if (load_bpf_file(filename)) {
 		printf("%s", bpf_log_buf);
 		return 1;
-- 
cgit v1.2.3


From 156450d9d964447adfb44a231c634d2f5609d110 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 2 May 2017 14:31:56 +0200
Subject: samples/bpf: make bpf_load.c code compatible with ELF maps section
 changes

This patch does proper parsing of the ELF "maps" section, in-order to
be both backwards and forwards compatible with changes to the map
definition struct bpf_map_def, which gets compiled into the ELF file.

The assumption is that new features with value zero, means that they
are not in-use.  For backward compatibility where loading an ELF file
with a smaller struct bpf_map_def, only copy objects ELF size, leaving
rest of loaders struct zero.  For forward compatibility where ELF file
have a larger struct bpf_map_def, only copy loaders own struct size
and verify that rest of the larger struct is zero, assuming this means
the newer feature was not activated, thus it should be safe for this
older loader to load this newer ELF file.

Fixes: fb30d4b71214 ("bpf: Add tests for map-in-map")
Fixes: 409526bea3c3 ("samples/bpf: bpf_load.c detect and abort if ELF maps section size is wrong")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/bpf_load.c | 224 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 155 insertions(+), 69 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 4221dc359453..fedec29c7817 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -39,6 +39,16 @@ int event_fd[MAX_PROGS];
 int prog_cnt;
 int prog_array_fd = -1;
 
+/* Keeping relevant info on maps */
+struct bpf_map_data {
+	int fd;
+	char *name;
+	size_t elf_offset;
+	struct bpf_map_def def;
+};
+struct bpf_map_data map_data[MAX_MAPS];
+int map_data_count = 0;
+
 static int populate_prog_array(const char *event, int prog_fd)
 {
 	int ind = atoi(event), err;
@@ -186,42 +196,39 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	return 0;
 }
 
-static int load_maps(struct bpf_map_def *maps, int nr_maps,
-		     const char **map_names, fixup_map_cb fixup_map)
+static int load_maps(struct bpf_map_data *maps, int nr_maps,
+		     fixup_map_cb fixup_map)
 {
 	int i;
-	/*
-	 * Warning: Using "maps" pointing to ELF data_maps->d_buf as
-	 * an array of struct bpf_map_def is a wrong assumption about
-	 * the ELF maps section format.
-	 */
+
 	for (i = 0; i < nr_maps; i++) {
 		if (fixup_map)
-			fixup_map(&maps[i], map_names[i], i);
+			fixup_map(&maps[i].def, maps[i].name, i);
 
-		if (maps[i].type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
-		    maps[i].type == BPF_MAP_TYPE_HASH_OF_MAPS) {
-			int inner_map_fd = map_fd[maps[i].inner_map_idx];
+		if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
+		    maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
+			int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
 
-			map_fd[i] = bpf_create_map_in_map(maps[i].type,
-							  maps[i].key_size,
-							  inner_map_fd,
-							  maps[i].max_entries,
-							  maps[i].map_flags);
+			map_fd[i] = bpf_create_map_in_map(maps[i].def.type,
+							maps[i].def.key_size,
+							inner_map_fd,
+							maps[i].def.max_entries,
+							maps[i].def.map_flags);
 		} else {
-			map_fd[i] = bpf_create_map(maps[i].type,
-						   maps[i].key_size,
-						   maps[i].value_size,
-						   maps[i].max_entries,
-						   maps[i].map_flags);
+			map_fd[i] = bpf_create_map(maps[i].def.type,
+						   maps[i].def.key_size,
+						   maps[i].def.value_size,
+						   maps[i].def.max_entries,
+						   maps[i].def.map_flags);
 		}
 		if (map_fd[i] < 0) {
 			printf("failed to create a map: %d %s\n",
 			       errno, strerror(errno));
 			return 1;
 		}
+		maps[i].fd = map_fd[i];
 
-		if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY)
+		if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY)
 			prog_array_fd = map_fd[i];
 	}
 	return 0;
@@ -251,7 +258,8 @@ static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname,
 }
 
 static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
-				GElf_Shdr *shdr, struct bpf_insn *insn)
+				GElf_Shdr *shdr, struct bpf_insn *insn,
+				struct bpf_map_data *maps, int nr_maps)
 {
 	int i, nrels;
 
@@ -261,6 +269,8 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
 		GElf_Sym sym;
 		GElf_Rel rel;
 		unsigned int insn_idx;
+		bool match = false;
+		int j, map_idx;
 
 		gelf_getrel(data, i, &rel);
 
@@ -274,11 +284,21 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols,
 			return 1;
 		}
 		insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD;
-		/*
-		 * Warning: Using sizeof(struct bpf_map_def) here is a
-		 * wrong assumption about ELF maps section format
-		 */
-		insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)];
+
+		/* Match FD relocation against recorded map_data[] offset */
+		for (map_idx = 0; map_idx < nr_maps; map_idx++) {
+			if (maps[map_idx].elf_offset == sym.st_value) {
+				match = true;
+				break;
+			}
+		}
+		if (match) {
+			insn[insn_idx].imm = maps[map_idx].fd;
+		} else {
+			printf("invalid relo for insn[%d] no map_data match\n",
+			       insn_idx);
+			return 1;
+		}
 	}
 
 	return 0;
@@ -297,40 +317,112 @@ static int cmp_symbols(const void *l, const void *r)
 		return 0;
 }
 
-static int get_sorted_map_names(Elf *elf, Elf_Data *symbols, int maps_shndx,
-				int strtabidx, char **map_names)
+static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx,
+				 Elf *elf, Elf_Data *symbols, int strtabidx)
 {
-	GElf_Sym map_symbols[MAX_MAPS];
-	int i, nr_maps = 0;
+	int map_sz_elf, map_sz_copy;
+	bool validate_zero = false;
+	Elf_Data *data_maps;
+	int i, nr_maps;
+	GElf_Sym *sym;
+	Elf_Scn *scn;
+	int copy_sz;
+
+	if (maps_shndx < 0)
+		return -EINVAL;
+	if (!symbols)
+		return -EINVAL;
+
+	/* Get data for maps section via elf index */
+	scn = elf_getscn(elf, maps_shndx);
+	if (scn)
+		data_maps = elf_getdata(scn, NULL);
+	if (!scn || !data_maps) {
+		printf("Failed to get Elf_Data from maps section %d\n",
+		       maps_shndx);
+		return -EINVAL;
+	}
 
-	for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
-		assert(nr_maps < MAX_MAPS);
-		if (!gelf_getsym(symbols, i, &map_symbols[nr_maps]))
+	/* For each map get corrosponding symbol table entry */
+	sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym));
+	for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) {
+		assert(nr_maps < MAX_MAPS+1);
+		if (!gelf_getsym(symbols, i, &sym[nr_maps]))
 			continue;
-		if (map_symbols[nr_maps].st_shndx != maps_shndx)
+		if (sym[nr_maps].st_shndx != maps_shndx)
 			continue;
+		/* Only increment iif maps section */
 		nr_maps++;
 	}
 
-	qsort(map_symbols, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+	/* Align to map_fd[] order, via sort on offset in sym.st_value */
+	qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols);
+
+	/* Keeping compatible with ELF maps section changes
+	 * ------------------------------------------------
+	 * The program size of struct bpf_map_def is known by loader
+	 * code, but struct stored in ELF file can be different.
+	 *
+	 * Unfortunately sym[i].st_size is zero.  To calculate the
+	 * struct size stored in the ELF file, assume all struct have
+	 * the same size, and simply divide with number of map
+	 * symbols.
+	 */
+	map_sz_elf = data_maps->d_size / nr_maps;
+	map_sz_copy = sizeof(struct bpf_map_def);
+	if (map_sz_elf < map_sz_copy) {
+		/*
+		 * Backward compat, loading older ELF file with
+		 * smaller struct, keeping remaining bytes zero.
+		 */
+		map_sz_copy = map_sz_elf;
+	} else if (map_sz_elf > map_sz_copy) {
+		/*
+		 * Forward compat, loading newer ELF file with larger
+		 * struct with unknown features. Assume zero means
+		 * feature not used.  Thus, validate rest of struct
+		 * data is zero.
+		 */
+		validate_zero = true;
+	}
 
+	/* Memcpy relevant part of ELF maps data to loader maps */
 	for (i = 0; i < nr_maps; i++) {
-		char *map_name;
-
-		map_name = elf_strptr(elf, strtabidx, map_symbols[i].st_name);
-		if (!map_name) {
-			printf("cannot get map symbol\n");
-			return -1;
-		}
-
-		map_names[i] = strdup(map_name);
-		if (!map_names[i]) {
+		unsigned char *addr, *end;
+		struct bpf_map_def *def;
+		const char *map_name;
+		size_t offset;
+
+		map_name = elf_strptr(elf, strtabidx, sym[i].st_name);
+		maps[i].name = strdup(map_name);
+		if (!maps[i].name) {
 			printf("strdup(%s): %s(%d)\n", map_name,
 			       strerror(errno), errno);
-			return -1;
+			free(sym);
+			return -errno;
+		}
+
+		/* Symbol value is offset into ELF maps section data area */
+		offset = sym[i].st_value;
+		def = (struct bpf_map_def *)(data_maps->d_buf + offset);
+		maps[i].elf_offset = offset;
+		memset(&maps[i].def, 0, sizeof(struct bpf_map_def));
+		memcpy(&maps[i].def, def, map_sz_copy);
+
+		/* Verify no newer features were requested */
+		if (validate_zero) {
+			addr = (unsigned char*) def + map_sz_copy;
+			end  = (unsigned char*) def + map_sz_elf;
+			for (; addr < end; addr++) {
+				if (*addr != 0) {
+					free(sym);
+					return -EFBIG;
+				}
+			}
 		}
 	}
 
+	free(sym);
 	return nr_maps;
 }
 
@@ -341,7 +433,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 	GElf_Ehdr ehdr;
 	GElf_Shdr shdr, shdr_prog;
 	Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL;
-	char *shname, *shname_prog, *map_names[MAX_MAPS] = { NULL };
+	char *shname, *shname_prog;
+	int nr_maps = 0;
 
 	/* reset global variables */
 	kern_version = 0;
@@ -389,8 +482,12 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 			}
 			memcpy(&kern_version, data->d_buf, sizeof(int));
 		} else if (strcmp(shname, "maps") == 0) {
+			int j;
+
 			maps_shndx = i;
 			data_maps = data;
+			for (j = 0; j < MAX_MAPS; j++)
+				map_data[j].fd = -1;
 		} else if (shdr.sh_type == SHT_SYMTAB) {
 			strtabidx = shdr.sh_link;
 			symbols = data;
@@ -405,27 +502,17 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 	}
 
 	if (data_maps) {
-		int nr_maps;
-		int prog_elf_map_sz;
-
-		nr_maps = get_sorted_map_names(elf, symbols, maps_shndx,
-					       strtabidx, map_names);
-		if (nr_maps < 0)
-			goto done;
-
-		/* Deduce map struct size stored in ELF maps section */
-		prog_elf_map_sz = data_maps->d_size / nr_maps;
-		if (prog_elf_map_sz != sizeof(struct bpf_map_def)) {
-			printf("Error: ELF maps sec wrong size (%d/%lu),"
-			       " old kern.o file?\n",
-			       prog_elf_map_sz, sizeof(struct bpf_map_def));
+		nr_maps = load_elf_maps_section(map_data, maps_shndx,
+						elf, symbols, strtabidx);
+		if (nr_maps < 0) {
+			printf("Error: Failed loading ELF maps (errno:%d):%s\n",
+			       nr_maps, strerror(-nr_maps));
 			ret = 1;
 			goto done;
 		}
-
-		if (load_maps(data_maps->d_buf, nr_maps,
-			      (const char **)map_names, fixup_map))
+		if (load_maps(map_data, nr_maps, fixup_map))
 			goto done;
+		map_data_count = nr_maps;
 
 		processed_sec[maps_shndx] = true;
 	}
@@ -453,7 +540,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 			processed_sec[shdr.sh_info] = true;
 			processed_sec[i] = true;
 
-			if (parse_relo_and_apply(data, symbols, &shdr, insns))
+			if (parse_relo_and_apply(data, symbols, &shdr, insns,
+						 map_data, nr_maps))
 				continue;
 
 			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
@@ -488,8 +576,6 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 
 	ret = 0;
 done:
-	for (i = 0; i < MAX_MAPS; i++)
-		free(map_names[i]);
 	close(fd);
 	return ret;
 }
-- 
cgit v1.2.3


From 6979bcc731f9680824a85a9efc43f36d01cec1b2 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 2 May 2017 14:32:01 +0200
Subject: samples/bpf: load_bpf.c make callback fixup more flexible

Do this change before others start to use this callback.
Change map_perf_test_user.c which seems to be the only user.

This patch extends capabilities of commit 9fd63d05f3e8 ("bpf:
Allow bpf sample programs (*_user.c) to change bpf_map_def").

Give fixup callback access to struct bpf_map_data, instead of
only stuct bpf_map_def.  This add flexibility to allow userspace
to reassign the map file descriptor.  This is very useful when
wanting to share maps between several bpf programs.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/bpf_load.c           | 17 ++++++++---------
 samples/bpf/bpf_load.h           | 10 ++++++++--
 samples/bpf/map_perf_test_user.c | 14 +++++++-------
 3 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index fedec29c7817..74456b3eb89a 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -39,13 +39,6 @@ int event_fd[MAX_PROGS];
 int prog_cnt;
 int prog_array_fd = -1;
 
-/* Keeping relevant info on maps */
-struct bpf_map_data {
-	int fd;
-	char *name;
-	size_t elf_offset;
-	struct bpf_map_def def;
-};
 struct bpf_map_data map_data[MAX_MAPS];
 int map_data_count = 0;
 
@@ -202,8 +195,14 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps,
 	int i;
 
 	for (i = 0; i < nr_maps; i++) {
-		if (fixup_map)
-			fixup_map(&maps[i].def, maps[i].name, i);
+		if (fixup_map) {
+			fixup_map(&maps[i], i);
+			/* Allow userspace to assign map FD prior to creation */
+			if (maps[i].fd != -1) {
+				map_fd[i] = maps[i].fd;
+				continue;
+			}
+		}
 
 		if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
 		    maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) {
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 05822f83173a..4d4fd4678a64 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -15,8 +15,14 @@ struct bpf_map_def {
 	unsigned int inner_map_idx;
 };
 
-typedef void (*fixup_map_cb)(struct bpf_map_def *map, const char *map_name,
-			     int idx);
+struct bpf_map_data {
+	int fd;
+	char *name;
+	size_t elf_offset;
+	struct bpf_map_def def;
+};
+
+typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
 
 extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index 6ac778153315..1a8894b5ac51 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -320,21 +320,21 @@ static void fill_lpm_trie(void)
 	assert(!r);
 }
 
-static void fixup_map(struct bpf_map_def *map, const char *name, int idx)
+static void fixup_map(struct bpf_map_data *map, int idx)
 {
 	int i;
 
-	if (!strcmp("inner_lru_hash_map", name)) {
+	if (!strcmp("inner_lru_hash_map", map->name)) {
 		inner_lru_hash_idx = idx;
-		inner_lru_hash_size = map->max_entries;
+		inner_lru_hash_size = map->def.max_entries;
 	}
 
-	if (!strcmp("array_of_lru_hashs", name)) {
+	if (!strcmp("array_of_lru_hashs", map->name)) {
 		if (inner_lru_hash_idx == -1) {
 			printf("inner_lru_hash_map must be defined before array_of_lru_hashs\n");
 			exit(1);
 		}
-		map->inner_map_idx = inner_lru_hash_idx;
+		map->def.inner_map_idx = inner_lru_hash_idx;
 		array_of_lru_hashs_idx = idx;
 	}
 
@@ -345,9 +345,9 @@ static void fixup_map(struct bpf_map_def *map, const char *name, int idx)
 
 	/* Only change the max_entries for the enabled test(s) */
 	for (i = 0; i < NR_TESTS; i++) {
-		if (!strcmp(test_map_names[i], name) &&
+		if (!strcmp(test_map_names[i], map->name) &&
 		    (check_test_flags(i))) {
-			map->max_entries = num_map_entries;
+			map->def.max_entries = num_map_entries;
 		}
 	}
 }
-- 
cgit v1.2.3


From 9178b4c17d1b8b21738771d53d999c990e69f538 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 2 May 2017 14:32:06 +0200
Subject: samples/bpf: export map_data[] for more info on maps

Giving *_user.c side tools access to map_data[] provides easier
access to information on the maps being loaded.  Still provide
the guarantee that the order maps are being defined in inside the
_kern.c file corresponds with the order in the array.  Now user
tools are not blind, but can inspect and verify the maps that got
loaded from the ELF binary.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 samples/bpf/bpf_load.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 4d4fd4678a64..ca0563d04744 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -24,12 +24,18 @@ struct bpf_map_data {
 
 typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
 
-extern int map_fd[MAX_MAPS];
 extern int prog_fd[MAX_PROGS];
 extern int event_fd[MAX_PROGS];
 extern char bpf_log_buf[BPF_LOG_BUF_SIZE];
 extern int prog_cnt;
 
+/* There is a one-to-one mapping between map_fd[] and map_data[].
+ * The map_data[] just contains more rich info on the given map.
+ */
+extern int map_fd[MAX_MAPS];
+extern struct bpf_map_data map_data[MAX_MAPS];
+extern int map_data_count;
+
 /* parses elf file compiled by llvm .c->.o
  * . parses 'maps' section and creates maps via BPF syscall
  * . parses 'license' section and passes it to syscall
-- 
cgit v1.2.3


From f6fec61eb555e47e87234e8915ad726ba6c2d3f8 Mon Sep 17 00:00:00 2001
From: David Cai <david.cai@microchip.com>
Date: Tue, 2 May 2017 20:59:14 +0000
Subject: smsc911x: Adding support for Micochip LAN9250 Ethernet controller

Adding support for Microchip LAN9250 Ethernet controller.

Signed-off-by: David Cai <david.cai@microchip.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/smsc/smsc911x.c | 49 ++++++++++++++++++++++--------------
 drivers/net/ethernet/smsc/smsc911x.h | 19 ++++++++++++++
 2 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index fa5ca0992be6..ea1bbc355b4d 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -25,7 +25,7 @@
  *   LAN9215, LAN9216, LAN9217, LAN9218
  *   LAN9210, LAN9211
  *   LAN9220, LAN9221
- *   LAN89218
+ *   LAN89218,LAN9250
  *
  */
 
@@ -1450,6 +1450,8 @@ static int smsc911x_soft_reset(struct smsc911x_data *pdata)
 	unsigned int timeout;
 	unsigned int temp;
 	int ret;
+	unsigned int reset_offset = HW_CFG;
+	unsigned int reset_mask = HW_CFG_SRST_;
 
 	/*
 	 * Make sure to power-up the PHY chip before doing a reset, otherwise
@@ -1476,15 +1478,23 @@ static int smsc911x_soft_reset(struct smsc911x_data *pdata)
 		}
 	}
 
+	if ((pdata->idrev & 0xFFFF0000) == LAN9250) {
+		/* special reset for  LAN9250 */
+		reset_offset = RESET_CTL;
+		reset_mask = RESET_CTL_DIGITAL_RST_;
+	}
+
 	/* Reset the LAN911x */
-	smsc911x_reg_write(pdata, HW_CFG, HW_CFG_SRST_);
+	smsc911x_reg_write(pdata, reset_offset, reset_mask);
+
+	/* verify reset bit is cleared */
 	timeout = 10;
 	do {
 		udelay(10);
-		temp = smsc911x_reg_read(pdata, HW_CFG);
-	} while ((--timeout) && (temp & HW_CFG_SRST_));
+		temp = smsc911x_reg_read(pdata, reset_offset);
+	} while ((--timeout) && (temp & reset_mask));
 
-	if (unlikely(temp & HW_CFG_SRST_)) {
+	if (unlikely(temp & reset_mask)) {
 		SMSC_WARN(pdata, drv, "Failed to complete reset");
 		return -EIO;
 	}
@@ -2253,28 +2263,29 @@ static int smsc911x_init(struct net_device *dev)
 
 	pdata->idrev = smsc911x_reg_read(pdata, ID_REV);
 	switch (pdata->idrev & 0xFFFF0000) {
-	case 0x01180000:
-	case 0x01170000:
-	case 0x01160000:
-	case 0x01150000:
-	case 0x218A0000:
+	case LAN9118:
+	case LAN9117:
+	case LAN9116:
+	case LAN9115:
+	case LAN89218:
 		/* LAN911[5678] family */
 		pdata->generation = pdata->idrev & 0x0000FFFF;
 		break;
 
-	case 0x118A0000:
-	case 0x117A0000:
-	case 0x116A0000:
-	case 0x115A0000:
+	case LAN9218:
+	case LAN9217:
+	case LAN9216:
+	case LAN9215:
 		/* LAN921[5678] family */
 		pdata->generation = 3;
 		break;
 
-	case 0x92100000:
-	case 0x92110000:
-	case 0x92200000:
-	case 0x92210000:
-		/* LAN9210/LAN9211/LAN9220/LAN9221 */
+	case LAN9210:
+	case LAN9211:
+	case LAN9220:
+	case LAN9221:
+	case LAN9250:
+		/* LAN9210/LAN9211/LAN9220/LAN9221/LAN9250 */
 		pdata->generation = 4;
 		break;
 
diff --git a/drivers/net/ethernet/smsc/smsc911x.h b/drivers/net/ethernet/smsc/smsc911x.h
index 54d648920a1b..8d75508acd2b 100644
--- a/drivers/net/ethernet/smsc/smsc911x.h
+++ b/drivers/net/ethernet/smsc/smsc911x.h
@@ -20,6 +20,22 @@
 #ifndef __SMSC911X_H__
 #define __SMSC911X_H__
 
+/*Chip ID*/
+#define LAN9115	0x01150000
+#define LAN9116	0x01160000
+#define LAN9117	0x01170000
+#define LAN9118	0x01180000
+#define LAN9215	0x115A0000
+#define LAN9216	0x116A0000
+#define LAN9217	0x117A0000
+#define LAN9218	0x118A0000
+#define LAN9210	0x92100000
+#define LAN9211	0x92110000
+#define LAN9220	0x92200000
+#define LAN9221	0x92210000
+#define LAN9250	0x92500000
+#define LAN89218	0x218A0000
+
 #define TX_FIFO_LOW_THRESHOLD	((u32)1600)
 #define SMSC911X_EEPROM_SIZE	((u32)128)
 #define USE_DEBUG		0
@@ -303,6 +319,9 @@
 #define E2P_DATA_EEPROM_DATA_		0x000000FF
 #define LAN_REGISTER_EXTENT		0x00000100
 
+#define RESET_CTL			0x1F8
+#define RESET_CTL_DIGITAL_RST_		0x00000001
+
 /*
  * MAC Control and Status Register (Indirect Address)
  * Offset (through the MAC_CSR CMD and DATA port)
-- 
cgit v1.2.3


From 6d717134a1a6e1b34a7d0d70e953037bc2642046 Mon Sep 17 00:00:00 2001
From: David Ahern <dsahern@gmail.com>
Date: Tue, 2 May 2017 14:43:44 -0700
Subject: net: ipv6: Do not duplicate DAD on link up

Andrey reported a warning triggered by the rcu code:

------------[ cut here ]------------
WARNING: CPU: 1 PID: 5911 at lib/debugobjects.c:289
debug_print_object+0x175/0x210
ODEBUG: activate active (active state 1) object type: rcu_head hint:
        (null)
Modules linked in:
CPU: 1 PID: 5911 Comm: a.out Not tainted 4.11.0-rc8+ #271
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16
 dump_stack+0x192/0x22d lib/dump_stack.c:52
 __warn+0x19f/0x1e0 kernel/panic.c:549
 warn_slowpath_fmt+0xe0/0x120 kernel/panic.c:564
 debug_print_object+0x175/0x210 lib/debugobjects.c:286
 debug_object_activate+0x574/0x7e0 lib/debugobjects.c:442
 debug_rcu_head_queue kernel/rcu/rcu.h:75
 __call_rcu.constprop.76+0xff/0x9c0 kernel/rcu/tree.c:3229
 call_rcu_sched+0x12/0x20 kernel/rcu/tree.c:3288
 rt6_rcu_free net/ipv6/ip6_fib.c:158
 rt6_release+0x1ea/0x290 net/ipv6/ip6_fib.c:188
 fib6_del_route net/ipv6/ip6_fib.c:1461
 fib6_del+0xa42/0xdc0 net/ipv6/ip6_fib.c:1500
 __ip6_del_rt+0x100/0x160 net/ipv6/route.c:2174
 ip6_del_rt+0x140/0x1b0 net/ipv6/route.c:2187
 __ipv6_ifa_notify+0x269/0x780 net/ipv6/addrconf.c:5520
 addrconf_ifdown+0xe60/0x1a20 net/ipv6/addrconf.c:3672
...

Andrey's reproducer program runs in a very tight loop, calling
'unshare -n' and then spawning 2 sets of 14 threads running random ioctl
calls. The relevant networking sequence:

1. New network namespace created via unshare -n
- ip6tnl0 device is created in down state

2. address added to ip6tnl0
- equivalent to ip -6 addr add dev ip6tnl0 fd00::bb/1
- DAD is started on the address and when it completes the host
  route is inserted into the FIB

3. ip6tnl0 is brought up
- the new fixup_permanent_addr function restarts DAD on the address

4. exit namespace
- teardown / cleanup sequence starts
- once in a blue moon, lo teardown appears to happen BEFORE teardown
  of ip6tunl0
  + down on 'lo' removes the host route from the FIB since the dst->dev
    for the route is loobback
  + host route added to rcu callback list
    * rcu callback has not run yet, so rt is NOT on the gc list so it has
      NOT been marked obsolete

5. in parallel to 4. worker_thread runs addrconf_dad_completed
- DAD on the address on ip6tnl0 completes
- calls ipv6_ifa_notify which inserts the host route

All of that happens very quickly. The result is that a host route that
has been deleted from the IPv6 FIB and added to the RCU list is re-inserted
into the FIB.

The exit namespace eventually gets to cleaning up ip6tnl0 which removes the
host route from the FIB again, calls the rcu function for cleanup -- and
triggers the double rcu trace.

The root cause is duplicate DAD on the address -- steps 2 and 3. Arguably,
DAD should not be started in step 2. The interface is in the down state,
so it can not really send out requests for the address which makes starting
DAD pointless.

Since the second DAD was introduced by a recent change, seems appropriate
to use it for the Fixes tag and have the fixup function only start DAD for
addresses in the PREDAD state which occurs in addrconf_ifdown if the
address is retained.

Big thanks to Andrey for isolating a reliable reproducer for this problem.
Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: David Ahern <dsahern@gmail.com>
Tested-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/addrconf.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b09ac38d8dc4..a2a370b71249 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3328,7 +3328,8 @@ static int fixup_permanent_addr(struct inet6_dev *idev,
 				      idev->dev, 0, 0);
 	}
 
-	addrconf_dad_start(ifp);
+	if (ifp->state == INET6_IFADDR_STATE_PREDAD)
+		addrconf_dad_start(ifp);
 
 	return 0;
 }
@@ -3683,7 +3684,7 @@ restart:
 		if (keep) {
 			/* set state to skip the notifier below */
 			state = INET6_IFADDR_STATE_DEAD;
-			ifa->state = 0;
+			ifa->state = INET6_IFADDR_STATE_PREDAD;
 			if (!(ifa->flags & IFA_F_NODAD))
 				ifa->flags |= IFA_F_TENTATIVE;
 
-- 
cgit v1.2.3


From 4d463c4dbc5c1c5d73e488d52faeec05570443a0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 3 May 2017 00:39:17 +0200
Subject: xdp: use common helper for netlink extended ack reporting

Small follow-up to d74a32acd59a ("xdp: use netlink extended ACK reporting")
in order to let drivers all use the same NL_SET_ERR_MSG_MOD() helper macro
for reporting. This also ensures that we consistently add the driver's
prefix for dumping the report in user space to indicate that the error
message is driver specific and not coming from core code. Furthermore,
NL_SET_ERR_MSG_MOD() now reuses NL_SET_ERR_MSG() and thus makes all macros
check the pointer as suggested.

References: https://www.spinics.net/lists/netdev/msg433267.html
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c |  4 ++--
 drivers/net/virtio_net.c                            |  8 ++++----
 include/linux/netlink.h                             | 19 ++++++++-----------
 3 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index db20376260f5..82bd6b0935f1 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2532,11 +2532,11 @@ nfp_net_check_config(struct nfp_net *nn, struct nfp_net_dp *dp,
 	if (!dp->xdp_prog)
 		return 0;
 	if (dp->fl_bufsz > PAGE_SIZE) {
-		NL_MOD_TRY_SET_ERR_MSG(extack, "MTU too large w/ XDP enabled");
+		NL_SET_ERR_MSG_MOD(extack, "MTU too large w/ XDP enabled");
 		return -EINVAL;
 	}
 	if (dp->num_tx_rings > nn->max_tx_rings) {
-		NL_MOD_TRY_SET_ERR_MSG(extack, "Insufficient number of TX rings w/ XDP enabled");
+		NL_SET_ERR_MSG_MOD(extack, "Insufficient number of TX rings w/ XDP enabled");
 		return -EINVAL;
 	}
 
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 3d0bc484b3d7..1c6d3923c224 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1891,17 +1891,17 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) ||
 	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) ||
 	    virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) {
-		NL_SET_ERR_MSG(extack, "can't set XDP while host is implementing LRO, disable LRO first");
+		NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first");
 		return -EOPNOTSUPP;
 	}
 
 	if (vi->mergeable_rx_bufs && !vi->any_header_sg) {
-		NL_SET_ERR_MSG(extack, "XDP expects header/data in single page, any_header_sg required");
+		NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required");
 		return -EINVAL;
 	}
 
 	if (dev->mtu > max_sz) {
-		NL_SET_ERR_MSG(extack, "MTU too large to enable XDP");
+		NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP");
 		netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz);
 		return -EINVAL;
 	}
@@ -1912,7 +1912,7 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 
 	/* XDP requires extra queues for XDP_TX */
 	if (curr_qp + xdp_qp > vi->max_queue_pairs) {
-		NL_SET_ERR_MSG(extack, "Too few free TX rings available");
+		NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available");
 		netdev_warn(dev, "request %i queues but max is %i\n",
 			    curr_qp + xdp_qp, vi->max_queue_pairs);
 		return -ENOMEM;
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index c20395edf2de..5fff5ba5964e 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -86,19 +86,16 @@ struct netlink_ext_ack {
  * Currently string formatting is not supported (due
  * to the lack of an output buffer.)
  */
-#define NL_SET_ERR_MSG(extack, msg) do {	\
-	static const char _msg[] = (msg);	\
-						\
-	(extack)->_msg = _msg;			\
+#define NL_SET_ERR_MSG(extack, msg) do {		\
+	static const char __msg[] = (msg);		\
+	struct netlink_ext_ack *__extack = (extack);	\
+							\
+	if (__extack)					\
+		__extack->_msg = __msg;			\
 } while (0)
 
-#define NL_MOD_TRY_SET_ERR_MSG(extack, msg) do {		\
-	static const char _msg[] = KBUILD_MODNAME ": " msg;	\
-	struct netlink_ext_ack *_extack = (extack);		\
-								\
-	if (_extack)						\
-		_extack->_msg = _msg;				\
-} while (0)
+#define NL_SET_ERR_MSG_MOD(extack, msg)			\
+	NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg)
 
 extern void netlink_kernel_release(struct sock *sk);
 extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups);
-- 
cgit v1.2.3


From 6ead18fb18c8182a81d70f24942f9547e56a35d2 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 2 May 2017 19:58:14 -0700
Subject: selftests/bpf: add a test case to check verifier pointer arithmetic

With clang/llvm 4.0+, the test case is able to generate
the following pattern:
....
440: (b7) r1 = 15
441: (05) goto pc+73
515: (79) r6 = *(u64 *)(r10 -152)
516: (bf) r7 = r10
517: (07) r7 += -112
518: (bf) r2 = r7
519: (0f) r2 += r1
520: (71) r1 = *(u8 *)(r8 +0)
521: (73) *(u8 *)(r2 +45) = r1
....

commit 332270fdc8b6 ("bpf: enhance verifier to understand stack
pointer arithmetic") improved verifier to handle such a pattern.
This patch adds a C test case to actually generate such a pattern.
A dummy tracepoint interface is used to load the program
into the kernel.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/bpf/Makefile          |   2 +-
 tools/testing/selftests/bpf/test_progs.c      |  16 ++
 tools/testing/selftests/bpf/test_tcp_estats.c | 258 ++++++++++++++++++++++++++
 3 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_tcp_estats.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index d8d94b9bd76c..f4341e160de6 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -13,7 +13,7 @@ LDLIBS += -lcap -lelf
 
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs
 
-TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o
+TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o
 
 TEST_PROGS := test_kmod.sh
 
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 4ed049a0b14b..b59f5ed4ae40 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -268,6 +268,21 @@ out:
 	bpf_object__close(obj);
 }
 
+static void test_tcp_estats(void)
+{
+	const char *file = "./test_tcp_estats.o";
+	int err, prog_fd;
+	struct bpf_object *obj;
+	__u32 duration = 0;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	CHECK(err, "", "err %d errno %d\n", err, errno);
+	if (err)
+		return;
+
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
 	struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY };
@@ -277,6 +292,7 @@ int main(void)
 	test_pkt_access();
 	test_xdp();
 	test_l4lb();
+	test_tcp_estats();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return 0;
diff --git a/tools/testing/selftests/bpf/test_tcp_estats.c b/tools/testing/selftests/bpf/test_tcp_estats.c
new file mode 100644
index 000000000000..bee3bbecc0c4
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcp_estats.c
@@ -0,0 +1,258 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+/* This program shows clang/llvm is able to generate code pattern
+ * like:
+ *   _tcp_send_active_reset:
+ *      0:       bf 16 00 00 00 00 00 00         r6 = r1
+ *    ......
+ *    335:       b7 01 00 00 0f 00 00 00         r1 = 15
+ *    336:       05 00 48 00 00 00 00 00         goto 72
+ *
+ *   LBB0_3:
+ *    337:       b7 01 00 00 01 00 00 00         r1 = 1
+ *    338:       63 1a d0 ff 00 00 00 00         *(u32 *)(r10 - 48) = r1
+ *    408:       b7 01 00 00 03 00 00 00         r1 = 3
+ *
+ *   LBB0_4:
+ *    409:       71 a2 fe ff 00 00 00 00         r2 = *(u8 *)(r10 - 2)
+ *    410:       bf a7 00 00 00 00 00 00         r7 = r10
+ *    411:       07 07 00 00 b8 ff ff ff         r7 += -72
+ *    412:       bf 73 00 00 00 00 00 00         r3 = r7
+ *    413:       0f 13 00 00 00 00 00 00         r3 += r1
+ *    414:       73 23 2d 00 00 00 00 00         *(u8 *)(r3 + 45) = r2
+ *
+ * From the above code snippet, the code generated by the compiler
+ * is reasonable. The "r1" is assigned to different values in basic
+ * blocks "_tcp_send_active_reset" and "LBB0_3", and used in "LBB0_4".
+ * The verifier should be able to handle such code patterns.
+ */
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/ipv6.h>
+#include <linux/version.h>
+#include <sys/socket.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+#define TCP_ESTATS_MAGIC 0xBAADBEEF
+
+/* This test case needs "sock" and "pt_regs" data structure.
+ * Recursively, "sock" needs "sock_common" and "inet_sock".
+ * However, this is a unit test case only for
+ * verifier purpose without bpf program execution.
+ * We can safely mock much simpler data structures, basically
+ * only taking the necessary fields from kernel headers.
+ */
+typedef __u32 __bitwise __portpair;
+typedef __u64 __bitwise __addrpair;
+
+struct sock_common {
+	unsigned short		skc_family;
+	union {
+		__addrpair	skc_addrpair;
+		struct {
+			__be32	skc_daddr;
+			__be32	skc_rcv_saddr;
+		};
+	};
+	union {
+		__portpair	skc_portpair;
+		struct {
+			__be16	skc_dport;
+			__u16	skc_num;
+		};
+	};
+	struct in6_addr		skc_v6_daddr;
+	struct in6_addr		skc_v6_rcv_saddr;
+};
+
+struct sock {
+	struct sock_common	__sk_common;
+#define sk_family		__sk_common.skc_family
+#define sk_v6_daddr		__sk_common.skc_v6_daddr
+#define sk_v6_rcv_saddr		__sk_common.skc_v6_rcv_saddr
+};
+
+struct inet_sock {
+	struct sock		sk;
+#define inet_daddr		sk.__sk_common.skc_daddr
+#define inet_dport		sk.__sk_common.skc_dport
+	__be32			inet_saddr;
+	__be16			inet_sport;
+};
+
+struct pt_regs {
+	long di;
+};
+
+static inline struct inet_sock *inet_sk(const struct sock *sk)
+{
+	return (struct inet_sock *)sk;
+}
+
+/* Define various data structures for state recording.
+ * Some fields are not used due to test simplification.
+ */
+enum tcp_estats_addrtype {
+	TCP_ESTATS_ADDRTYPE_IPV4 = 1,
+	TCP_ESTATS_ADDRTYPE_IPV6 = 2
+};
+
+enum tcp_estats_event_type {
+	TCP_ESTATS_ESTABLISH,
+	TCP_ESTATS_PERIODIC,
+	TCP_ESTATS_TIMEOUT,
+	TCP_ESTATS_RETRANSMIT_TIMEOUT,
+	TCP_ESTATS_RETRANSMIT_OTHER,
+	TCP_ESTATS_SYN_RETRANSMIT,
+	TCP_ESTATS_SYNACK_RETRANSMIT,
+	TCP_ESTATS_TERM,
+	TCP_ESTATS_TX_RESET,
+	TCP_ESTATS_RX_RESET,
+	TCP_ESTATS_WRITE_TIMEOUT,
+	TCP_ESTATS_CONN_TIMEOUT,
+	TCP_ESTATS_ACK_LATENCY,
+	TCP_ESTATS_NEVENTS,
+};
+
+struct tcp_estats_event {
+	int pid;
+	int cpu;
+	unsigned long ts;
+	unsigned int magic;
+	enum tcp_estats_event_type event_type;
+};
+
+/* The below data structure is packed in order for
+ * llvm compiler to generate expected code.
+ */
+struct tcp_estats_conn_id {
+	unsigned int localaddressType;
+	struct {
+		unsigned char data[16];
+	} localaddress;
+	struct {
+		unsigned char data[16];
+	} remaddress;
+	unsigned short    localport;
+	unsigned short    remport;
+} __attribute__((__packed__));
+
+struct tcp_estats_basic_event {
+	struct tcp_estats_event event;
+	struct tcp_estats_conn_id conn_id;
+};
+
+struct bpf_map_def SEC("maps") ev_record_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct tcp_estats_basic_event),
+	.max_entries = 1024,
+};
+
+struct dummy_tracepoint_args {
+	unsigned long long pad;
+	struct sock *sock;
+};
+
+static __always_inline void tcp_estats_ev_init(struct tcp_estats_event *event,
+					       enum tcp_estats_event_type type)
+{
+	event->magic = TCP_ESTATS_MAGIC;
+	event->ts = bpf_ktime_get_ns();
+	event->event_type = type;
+}
+
+static __always_inline void unaligned_u32_set(unsigned char *to, __u8 *from)
+{
+	to[0] = _(from[0]);
+	to[1] = _(from[1]);
+	to[2] = _(from[2]);
+	to[3] = _(from[3]);
+}
+
+static __always_inline void conn_id_ipv4_init(struct tcp_estats_conn_id *conn_id,
+					      __be32 *saddr, __be32 *daddr)
+{
+	conn_id->localaddressType = TCP_ESTATS_ADDRTYPE_IPV4;
+
+	unaligned_u32_set(conn_id->localaddress.data, (__u8 *)saddr);
+	unaligned_u32_set(conn_id->remaddress.data, (__u8 *)daddr);
+}
+
+static __always_inline void conn_id_ipv6_init(struct tcp_estats_conn_id *conn_id,
+					      __be32 *saddr, __be32 *daddr)
+{
+	conn_id->localaddressType = TCP_ESTATS_ADDRTYPE_IPV6;
+
+	unaligned_u32_set(conn_id->localaddress.data, (__u8 *)saddr);
+	unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32),
+			  (__u8 *)(saddr + 1));
+	unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32) * 2,
+			  (__u8 *)(saddr + 2));
+	unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32) * 3,
+			  (__u8 *)(saddr + 3));
+
+	unaligned_u32_set(conn_id->remaddress.data,
+			  (__u8 *)(daddr));
+	unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32),
+			  (__u8 *)(daddr + 1));
+	unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32) * 2,
+			  (__u8 *)(daddr + 2));
+	unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32) * 3,
+			  (__u8 *)(daddr + 3));
+}
+
+static __always_inline void tcp_estats_conn_id_init(struct tcp_estats_conn_id *conn_id,
+						    struct sock *sk)
+{
+	conn_id->localport = _(inet_sk(sk)->inet_sport);
+	conn_id->remport = _(inet_sk(sk)->inet_dport);
+
+	if (_(sk->sk_family) == AF_INET6)
+		conn_id_ipv6_init(conn_id,
+				  sk->sk_v6_rcv_saddr.s6_addr32,
+				  sk->sk_v6_daddr.s6_addr32);
+	else
+		conn_id_ipv4_init(conn_id,
+				  &inet_sk(sk)->inet_saddr,
+				  &inet_sk(sk)->inet_daddr);
+}
+
+static __always_inline void tcp_estats_init(struct sock *sk,
+					    struct tcp_estats_event *event,
+					    struct tcp_estats_conn_id *conn_id,
+					    enum tcp_estats_event_type type)
+{
+	tcp_estats_ev_init(event, type);
+	tcp_estats_conn_id_init(conn_id, sk);
+}
+
+static __always_inline void send_basic_event(struct sock *sk,
+					     enum tcp_estats_event_type type)
+{
+	struct tcp_estats_basic_event ev;
+	__u32 key = bpf_get_prandom_u32();
+
+	memset(&ev, 0, sizeof(ev));
+	tcp_estats_init(sk, &ev.event, &ev.conn_id, type);
+	bpf_map_update_elem(&ev_record_map, &key, &ev, BPF_ANY);
+}
+
+SEC("dummy_tracepoint")
+int _dummy_tracepoint(struct dummy_tracepoint_args *arg)
+{
+	if (!arg->sock)
+		return 0;
+
+	send_basic_event(arg->sock, TCP_ESTATS_TX_RESET);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
-- 
cgit v1.2.3


From 1c2dd16add7ebd7fc6a47f21a42f502cfed5ec2f Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 2 May 2017 21:14:43 -0700
Subject: selftests/bpf: get rid of -D__x86_64__

-D__x86_64__ workaround was used to make /usr/include/features.h
to follow expected path through the system include headers.
This is not portable.
Instead define dummy stubs.h which is used by 'clang -target bpf'

Fixes: 6882804c916b ("selftests/bpf: add a test for overlapping packet range checks")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 tools/testing/selftests/bpf/Makefile    | 4 ++--
 tools/testing/selftests/bpf/gnu/stubs.h | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/gnu/stubs.h

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index f4341e160de6..91edd0566237 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -34,6 +34,6 @@ $(BPFOBJ): force
 CLANG ?= clang
 
 %.o: %.c
-	$(CLANG) -I../../../include/uapi -I../../../../samples/bpf/ \
-		-D__x86_64__ -Wno-compare-distinct-pointer-types \
+	$(CLANG) -I. -I../../../include/uapi -I../../../../samples/bpf/ \
+		-Wno-compare-distinct-pointer-types \
 		-O2 -target bpf -c $< -o $@
diff --git a/tools/testing/selftests/bpf/gnu/stubs.h b/tools/testing/selftests/bpf/gnu/stubs.h
new file mode 100644
index 000000000000..719225b16626
--- /dev/null
+++ b/tools/testing/selftests/bpf/gnu/stubs.h
@@ -0,0 +1 @@
+/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */
-- 
cgit v1.2.3


From 37a7fdf2892be124545ddaf99b85de576121476c Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Wed, 3 May 2017 15:51:32 +0800
Subject: tg3: don't clear stats while tg3_close

Now tg3 NIC's stats will be cleared after ifdown/ifup. bond_get_stats traverse
its salves to get statistics,cumulative the increment.If a tg3 NIC is added to
bonding as a slave,ifdown/ifup will cause bonding's stats become tremendous value
(ex.1638.3 PiB) because of negative increment.

Fixes: 92feeabf3f67 ("tg3: Save stats across chip resets")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/tg3.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index f395b951f5e7..537d571ee601 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -11729,10 +11729,6 @@ static int tg3_close(struct net_device *dev)
 
 	tg3_stop(tp);
 
-	/* Clear stats across close / open calls */
-	memset(&tp->net_stats_prev, 0, sizeof(tp->net_stats_prev));
-	memset(&tp->estats_prev, 0, sizeof(tp->estats_prev));
-
 	if (pci_device_is_present(tp->pdev)) {
 		tg3_power_down_prepare(tp);
 
-- 
cgit v1.2.3


From 4c54dc0277d0d55a9248c43aebd31858f926a056 Mon Sep 17 00:00:00 2001
From: Daniele Palmas <dnlplm@gmail.com>
Date: Wed, 3 May 2017 10:30:11 +0200
Subject: net: usb: qmi_wwan: add Telit ME910 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds support for Telit ME910 PID 0x1100.

Signed-off-by: Daniele Palmas <dnlplm@gmail.com>
Acked-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/qmi_wwan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index a3ed8115747c..d7165767ca9d 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -1201,6 +1201,7 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x2357, 0x0201, 4)},	/* TP-LINK HSUPA Modem MA180 */
 	{QMI_FIXED_INTF(0x2357, 0x9000, 4)},	/* TP-LINK MA260 */
 	{QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)},	/* Telit LE922A */
+	{QMI_FIXED_INTF(0x1bc7, 0x1100, 3)},	/* Telit ME910 */
 	{QMI_FIXED_INTF(0x1bc7, 0x1200, 5)},	/* Telit LE920 */
 	{QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)},	/* Telit LE920, LE920A4 */
 	{QMI_FIXED_INTF(0x1c9e, 0x9b01, 3)},	/* XS Stick W100-2 from 4G Systems */
-- 
cgit v1.2.3


From 86f8e247b960d6cf4cad8b65162e73301c5a58ec Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Wed, 3 May 2017 13:31:04 +0200
Subject: test_bpf: Use ULL suffix for 64-bit constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On 32-bit:

    lib/test_bpf.c:4772: warning: integer constant is too large for ‘unsigned long’ type
    lib/test_bpf.c:4772: warning: integer constant is too large for ‘unsigned long’ type
    lib/test_bpf.c:4773: warning: integer constant is too large for ‘unsigned long’ type
    lib/test_bpf.c:4773: warning: integer constant is too large for ‘unsigned long’ type
    lib/test_bpf.c:4787: warning: integer constant is too large for ‘unsigned long’ type
    lib/test_bpf.c:4787: warning: integer constant is too large for ‘unsigned long’ type
    lib/test_bpf.c:4801: warning: integer constant is too large for ‘unsigned long’ type
    lib/test_bpf.c:4801: warning: integer constant is too large for ‘unsigned long’ type
    lib/test_bpf.c:4802: warning: integer constant is too large for ‘unsigned long’ type
    lib/test_bpf.c:4802: warning: integer constant is too large for ‘unsigned long’ type

On 32-bit systems, "long" is only 32-bit.
Replace the "UL" suffix by "ULL" to fix this.

Fixes: 85f68fe898320575 ("bpf, arm64: implement jiting of BPF_XADD")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 lib/test_bpf.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index a0f66280ea50..889bc31785be 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -4769,8 +4769,8 @@ static struct bpf_test tests[] = {
 			BPF_LD_IMM64(R1, 3),
 			BPF_LD_IMM64(R2, 2),
 			BPF_JMP_REG(BPF_JGE, R1, R2, 2),
-			BPF_LD_IMM64(R0, 0xffffffffffffffffUL),
-			BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeUL),
+			BPF_LD_IMM64(R0, 0xffffffffffffffffULL),
+			BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL),
 			BPF_EXIT_INSN(),
 		},
 		INTERNAL,
@@ -4784,7 +4784,7 @@ static struct bpf_test tests[] = {
 			BPF_LD_IMM64(R1, 3),
 			BPF_LD_IMM64(R2, 2),
 			BPF_JMP_REG(BPF_JGE, R1, R2, 0),
-			BPF_LD_IMM64(R0, 0xffffffffffffffffUL),
+			BPF_LD_IMM64(R0, 0xffffffffffffffffULL),
 			BPF_EXIT_INSN(),
 		},
 		INTERNAL,
@@ -4798,8 +4798,8 @@ static struct bpf_test tests[] = {
 			BPF_LD_IMM64(R1, 3),
 			BPF_LD_IMM64(R2, 2),
 			BPF_JMP_REG(BPF_JGE, R1, R2, 4),
-			BPF_LD_IMM64(R0, 0xffffffffffffffffUL),
-			BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeUL),
+			BPF_LD_IMM64(R0, 0xffffffffffffffffULL),
+			BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL),
 			BPF_EXIT_INSN(),
 		},
 		INTERNAL,
-- 
cgit v1.2.3


From ab71632c457269a6f3ec5a233c7a83682ae27019 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Wed, 3 May 2017 14:18:43 +0200
Subject: netfilter: conntrack: Force inlining of build check to prevent build
 failure

If gcc (e.g. 4.1.2) decides not to inline total_extension_size(), the
build will fail with:

    net/built-in.o: In function `nf_conntrack_init_start':
    (.text+0x9baf6): undefined reference to `__compiletime_assert_1893'

or

    ERROR: "__compiletime_assert_1893" [net/netfilter/nf_conntrack.ko] undefined!

Fix this by forcing inlining of total_extension_size().

Fixes: b3a5db109e0670d6 ("netfilter: conntrack: use u8 for extension sizes again")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netfilter/nf_conntrack_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f9245dbfe435..3c8f1ed2f555 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1853,7 +1853,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
 		  &nf_conntrack_htable_size, 0600);
 
-static unsigned int total_extension_size(void)
+static __always_inline unsigned int total_extension_size(void)
 {
 	/* remember to add new extensions below */
 	BUILD_BUG_ON(NF_CT_EXT_NUM > 9);
-- 
cgit v1.2.3


From bfc32f297337e86bbf16512ea27dd50dadd5959e Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:04:26 -0400
Subject: ibmvnic: Move resource initialization to its own routine

Move all of the calls to initialize resources for the driver to
a separate routine.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 71 +++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4fcd2f0378ba..c67f1d693c76 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -624,22 +624,10 @@ static int set_real_num_queues(struct net_device *netdev)
 	return rc;
 }
 
-static int ibmvnic_open(struct net_device *netdev)
+static int init_resources(struct ibmvnic_adapter *adapter)
 {
-	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
-	struct device *dev = &adapter->vdev->dev;
-	int rc = 0;
-	int i;
-
-	if (adapter->is_closed) {
-		rc = ibmvnic_init(adapter);
-		if (rc)
-			return rc;
-	}
-
-	rc = ibmvnic_login(netdev);
-	if (rc)
-		return rc;
+	struct net_device *netdev = adapter->netdev;
+	int i, rc;
 
 	rc = set_real_num_queues(netdev);
 	if (rc)
@@ -647,7 +635,7 @@ static int ibmvnic_open(struct net_device *netdev)
 
 	rc = init_sub_crq_irqs(adapter);
 	if (rc) {
-		dev_err(dev, "failed to initialize sub crq irqs\n");
+		netdev_err(netdev, "failed to initialize sub crq irqs\n");
 		return -1;
 	}
 
@@ -659,25 +647,47 @@ static int ibmvnic_open(struct net_device *netdev)
 	adapter->napi = kcalloc(adapter->req_rx_queues,
 				sizeof(struct napi_struct), GFP_KERNEL);
 	if (!adapter->napi)
-		goto ibmvnic_open_fail;
+		return -ENOMEM;
+
 	for (i = 0; i < adapter->req_rx_queues; i++) {
 		netif_napi_add(netdev, &adapter->napi[i], ibmvnic_poll,
 			       NAPI_POLL_WEIGHT);
-		napi_enable(&adapter->napi[i]);
 	}
 
 	send_map_query(adapter);
 
 	rc = init_rx_pools(netdev);
 	if (rc)
-		goto ibmvnic_open_fail;
+		return rc;
 
 	rc = init_tx_pools(netdev);
+	return rc;
+}
+
+static int ibmvnic_open(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int i, rc;
+
+	if (adapter->is_closed) {
+		rc = ibmvnic_init(adapter);
+		if (rc)
+			return rc;
+	}
+
+	rc = ibmvnic_login(netdev);
 	if (rc)
-		goto ibmvnic_open_fail;
+		return rc;
+
+	rc = init_resources(adapter);
+	if (rc)
+		return rc;
 
 	replenish_pools(adapter);
 
+	for (i = 0; i < adapter->req_rx_queues; i++)
+		napi_enable(&adapter->napi[i]);
+
 	/* We're ready to receive frames, enable the sub-crq interrupts and
 	 * set the logical link state to up
 	 */
@@ -688,19 +698,16 @@ static int ibmvnic_open(struct net_device *netdev)
 		enable_scrq_irq(adapter, adapter->tx_scrq[i]);
 
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP);
-	if (rc)
-		goto ibmvnic_open_fail;
-
-	netif_tx_start_all_queues(netdev);
-	adapter->is_closed = false;
-
-	return 0;
+	if (rc) {
+		for (i = 0; i < adapter->req_rx_queues; i++)
+			napi_disable(&adapter->napi[i]);
+		release_resources(adapter);
+	} else {
+		netif_tx_start_all_queues(netdev);
+		adapter->is_closed = false;
+	}
 
-ibmvnic_open_fail:
-	for (i = 0; i < adapter->req_rx_queues; i++)
-		napi_disable(&adapter->napi[i]);
-	release_resources(adapter);
-	return -ENOMEM;
+	return rc;
 }
 
 static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
-- 
cgit v1.2.3


From 90c8014c2be5e5805d972341b40221da4309d564 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:04:32 -0400
Subject: ibmvnic: Replace is_closed with state field

Replace the is_closed flag in the ibmvnic adapter strcut with a
more comprehensive state field that tracks the current state of
the driver.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 20 +++++++++++++-------
 drivers/net/ethernet/ibm/ibmvnic.h | 12 ++++++++++--
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index c67f1d693c76..40a8ba0eeb7b 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -669,7 +669,9 @@ static int ibmvnic_open(struct net_device *netdev)
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int i, rc;
 
-	if (adapter->is_closed) {
+	adapter->state = VNIC_OPENING;
+
+	if (adapter->state == VNIC_CLOSED) {
 		rc = ibmvnic_init(adapter);
 		if (rc)
 			return rc;
@@ -704,7 +706,7 @@ static int ibmvnic_open(struct net_device *netdev)
 		release_resources(adapter);
 	} else {
 		netif_tx_start_all_queues(netdev);
-		adapter->is_closed = false;
+		adapter->state = VNIC_OPEN;
 	}
 
 	return rc;
@@ -733,7 +735,7 @@ static int ibmvnic_close(struct net_device *netdev)
 	int rc = 0;
 	int i;
 
-	adapter->closing = true;
+	adapter->state = VNIC_CLOSING;
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {
@@ -748,8 +750,7 @@ static int ibmvnic_close(struct net_device *netdev)
 
 	release_resources(adapter);
 
-	adapter->is_closed = true;
-	adapter->closing = false;
+	adapter->state = VNIC_CLOSED;
 	return rc;
 }
 
@@ -1860,7 +1861,8 @@ static int pending_scrq(struct ibmvnic_adapter *adapter,
 {
 	union sub_crq *entry = &scrq->msgs[scrq->cur];
 
-	if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP || adapter->closing)
+	if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP ||
+	    adapter->state == VNIC_CLOSING)
 		return 1;
 	else
 		return 0;
@@ -3353,6 +3355,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 		return -ENOMEM;
 
 	adapter = netdev_priv(netdev);
+	adapter->state = VNIC_PROBING;
 	dev_set_drvdata(&dev->dev, netdev);
 	adapter->vdev = dev;
 	adapter->netdev = netdev;
@@ -3380,7 +3383,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	}
 
 	netdev->mtu = adapter->req_mtu - ETH_HLEN;
-	adapter->is_closed = false;
 
 	rc = register_netdev(netdev);
 	if (rc) {
@@ -3390,6 +3392,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	}
 	dev_info(&dev->dev, "ibmvnic registered\n");
 
+	adapter->state = VNIC_PROBED;
 	return 0;
 }
 
@@ -3398,12 +3401,15 @@ static int ibmvnic_remove(struct vio_dev *dev)
 	struct net_device *netdev = dev_get_drvdata(&dev->dev);
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 
+	adapter->state = VNIC_REMOVING;
 	unregister_netdev(netdev);
 
 	release_resources(adapter);
 	release_sub_crqs(adapter);
 	release_crq_queue(adapter);
 
+	adapter->state = VNIC_REMOVED;
+
 	free_netdev(netdev);
 	dev_set_drvdata(&dev->dev, NULL);
 
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index a69979f6f19d..03a866f542c4 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -913,6 +913,15 @@ struct ibmvnic_error_buff {
 	__be32 error_id;
 };
 
+enum vnic_state {VNIC_PROBING = 1,
+		 VNIC_PROBED,
+		 VNIC_OPENING,
+		 VNIC_OPEN,
+		 VNIC_CLOSING,
+		 VNIC_CLOSED,
+		 VNIC_REMOVING,
+		 VNIC_REMOVED};
+
 struct ibmvnic_adapter {
 	struct vio_dev *vdev;
 	struct net_device *netdev;
@@ -962,7 +971,6 @@ struct ibmvnic_adapter {
 	u64 promisc;
 
 	struct ibmvnic_tx_pool *tx_pool;
-	bool closing;
 	struct completion init_done;
 	int init_done_rc;
 
@@ -1011,5 +1019,5 @@ struct ibmvnic_adapter {
 	struct work_struct ibmvnic_xport;
 	struct tasklet_struct tasklet;
 	bool failover;
-	bool is_closed;
+	enum vnic_state state;
 };
-- 
cgit v1.2.3


From ed651a10875f13135a5f59c1bae4d51b377b3925 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:04:38 -0400
Subject: ibmvnic: Updated reset handling

The ibmvnic driver has multiple handlers for resetting the driver
depending on the reason the reset is needed (failover, lpm,
fatal erors,...). All of the reset handlers do essentially the same
thing, this patch moves this work to a common reset handler.

By doing this we also allow the driver to better handle situations
where we can get a reset while handling a reset.

The updated reset handling works by adding a reset work item to the
list of resets and then scheduling work to perform the reset. This
step is necessary because we can receive a reset in interrupt context
and we want to handle the reset out of interrupt context.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 413 +++++++++++++++++++++++--------------
 drivers/net/ethernet/ibm/ibmvnic.h |  19 +-
 2 files changed, 275 insertions(+), 157 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 40a8ba0eeb7b..a7c7a94c9d63 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -194,7 +194,8 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter,
 	if (!ltb->buff)
 		return;
 
-	if (!adapter->failover)
+	if (adapter->reset_reason != VNIC_RESET_FAILOVER &&
+	    adapter->reset_reason != VNIC_RESET_MOBILITY)
 		send_request_unmap(adapter, ltb->map_id);
 	dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
 }
@@ -292,9 +293,6 @@ static void replenish_pools(struct ibmvnic_adapter *adapter)
 {
 	int i;
 
-	if (adapter->migrated)
-		return;
-
 	adapter->replenish_task_cycles++;
 	for (i = 0; i < be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs);
 	     i++) {
@@ -569,11 +567,6 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)
 	bool resend;
 	int rc;
 
-	if (adapter->logical_link_state == link_state) {
-		netdev_dbg(netdev, "Link state already %d\n", link_state);
-		return 0;
-	}
-
 	netdev_err(netdev, "setting link state %d\n", link_state);
 	memset(&crq, 0, sizeof(crq));
 	crq.logical_link_state.first = IBMVNIC_CRQ_CMD;
@@ -664,27 +657,13 @@ static int init_resources(struct ibmvnic_adapter *adapter)
 	return rc;
 }
 
-static int ibmvnic_open(struct net_device *netdev)
+static int __ibmvnic_open(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	enum vnic_state prev_state = adapter->state;
 	int i, rc;
 
 	adapter->state = VNIC_OPENING;
-
-	if (adapter->state == VNIC_CLOSED) {
-		rc = ibmvnic_init(adapter);
-		if (rc)
-			return rc;
-	}
-
-	rc = ibmvnic_login(netdev);
-	if (rc)
-		return rc;
-
-	rc = init_resources(adapter);
-	if (rc)
-		return rc;
-
 	replenish_pools(adapter);
 
 	for (i = 0; i < adapter->req_rx_queues; i++)
@@ -693,22 +672,65 @@ static int ibmvnic_open(struct net_device *netdev)
 	/* We're ready to receive frames, enable the sub-crq interrupts and
 	 * set the logical link state to up
 	 */
-	for (i = 0; i < adapter->req_rx_queues; i++)
-		enable_scrq_irq(adapter, adapter->rx_scrq[i]);
+	for (i = 0; i < adapter->req_rx_queues; i++) {
+		if (prev_state == VNIC_CLOSED)
+			enable_irq(adapter->rx_scrq[i]->irq);
+		else
+			enable_scrq_irq(adapter, adapter->rx_scrq[i]);
+	}
 
-	for (i = 0; i < adapter->req_tx_queues; i++)
-		enable_scrq_irq(adapter, adapter->tx_scrq[i]);
+	for (i = 0; i < adapter->req_tx_queues; i++) {
+		if (prev_state == VNIC_CLOSED)
+			enable_irq(adapter->tx_scrq[i]->irq);
+		else
+			enable_scrq_irq(adapter, adapter->tx_scrq[i]);
+	}
 
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP);
 	if (rc) {
 		for (i = 0; i < adapter->req_rx_queues; i++)
 			napi_disable(&adapter->napi[i]);
 		release_resources(adapter);
-	} else {
-		netif_tx_start_all_queues(netdev);
-		adapter->state = VNIC_OPEN;
+		return rc;
 	}
 
+	netif_tx_start_all_queues(netdev);
+
+	if (prev_state == VNIC_CLOSED) {
+		for (i = 0; i < adapter->req_rx_queues; i++)
+			napi_schedule(&adapter->napi[i]);
+	}
+
+	adapter->state = VNIC_OPEN;
+	return rc;
+}
+
+static int ibmvnic_open(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int rc;
+
+	mutex_lock(&adapter->reset_lock);
+
+	if (adapter->state != VNIC_CLOSED) {
+		rc = ibmvnic_login(netdev);
+		if (rc) {
+			mutex_unlock(&adapter->reset_lock);
+			return rc;
+		}
+
+		rc = init_resources(adapter);
+		if (rc) {
+			netdev_err(netdev, "failed to initialize resources\n");
+			release_resources(adapter);
+			mutex_unlock(&adapter->reset_lock);
+			return rc;
+		}
+	}
+
+	rc = __ibmvnic_open(netdev);
+	mutex_unlock(&adapter->reset_lock);
+
 	return rc;
 }
 
@@ -729,13 +751,14 @@ static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
 	}
 }
 
-static int ibmvnic_close(struct net_device *netdev)
+static int __ibmvnic_close(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
 	int rc = 0;
 	int i;
 
 	adapter->state = VNIC_CLOSING;
+	netif_tx_stop_all_queues(netdev);
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {
@@ -743,17 +766,24 @@ static int ibmvnic_close(struct net_device *netdev)
 			napi_disable(&adapter->napi[i]);
 	}
 
-	if (!adapter->failover)
-		netif_tx_stop_all_queues(netdev);
-
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
 
-	release_resources(adapter);
-
 	adapter->state = VNIC_CLOSED;
 	return rc;
 }
 
+static int ibmvnic_close(struct net_device *netdev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+	int rc;
+
+	mutex_lock(&adapter->reset_lock);
+	rc = __ibmvnic_close(netdev);
+	mutex_unlock(&adapter->reset_lock);
+
+	return rc;
+}
+
 /**
  * build_hdr_data - creates L2/L3/L4 header data buffer
  * @hdr_field - bitfield determining needed headers
@@ -915,7 +945,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
 				   be32_to_cpu(adapter->login_rsp_buf->
 					       off_txsubm_subcrqs));
-	if (adapter->migrated) {
+	if (adapter->resetting) {
 		if (!netif_subqueue_stopped(netdev, skb))
 			netif_stop_subqueue(netdev, queue_num);
 		dev_kfree_skb_any(skb);
@@ -1107,18 +1137,185 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p)
 	return 0;
 }
 
-static void ibmvnic_tx_timeout(struct net_device *dev)
+/**
+ * do_reset returns zero if we are able to keep processing reset events, or
+ * non-zero if we hit a fatal error and must halt.
+ */
+static int do_reset(struct ibmvnic_adapter *adapter,
+		    struct ibmvnic_rwi *rwi, u32 reset_state)
 {
-	struct ibmvnic_adapter *adapter = netdev_priv(dev);
-	int rc;
+	struct net_device *netdev = adapter->netdev;
+	int i, rc;
+
+	netif_carrier_off(netdev);
+	adapter->reset_reason = rwi->reset_reason;
+
+	if (rwi->reset_reason == VNIC_RESET_MOBILITY) {
+		rc = ibmvnic_reenable_crq_queue(adapter);
+		if (rc)
+			return 0;
+	}
+
+	rc = __ibmvnic_close(netdev);
+	if (rc)
+		return rc;
+
+	/* remove the closed state so when we call open it appears
+	 * we are coming from the probed state.
+	 */
+	adapter->state = VNIC_PROBED;
 
-	/* Adapter timed out, resetting it */
+	release_resources(adapter);
 	release_sub_crqs(adapter);
-	rc = ibmvnic_reset_crq(adapter);
+	release_crq_queue(adapter);
+
+	rc = ibmvnic_init(adapter);
 	if (rc)
-		dev_err(&adapter->vdev->dev, "Adapter timeout, reset failed\n");
-	else
-		ibmvnic_send_crq_init(adapter);
+		return 0;
+
+	/* If the adapter was in PROBE state prior to the reset, exit here. */
+	if (reset_state == VNIC_PROBED)
+		return 0;
+
+	rc = ibmvnic_login(netdev);
+	if (rc) {
+		adapter->state = VNIC_PROBED;
+		return 0;
+	}
+
+	rtnl_lock();
+	rc = init_resources(adapter);
+	rtnl_unlock();
+	if (rc)
+		return rc;
+
+	if (reset_state == VNIC_CLOSED)
+		return 0;
+
+	rc = __ibmvnic_open(netdev);
+	if (rc) {
+		if (list_empty(&adapter->rwi_list))
+			adapter->state = VNIC_CLOSED;
+		else
+			adapter->state = reset_state;
+
+		return 0;
+	}
+
+	netif_carrier_on(netdev);
+
+	/* kick napi */
+	for (i = 0; i < adapter->req_rx_queues; i++)
+		napi_schedule(&adapter->napi[i]);
+
+	return 0;
+}
+
+static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_rwi *rwi;
+
+	mutex_lock(&adapter->rwi_lock);
+
+	if (!list_empty(&adapter->rwi_list)) {
+		rwi = list_first_entry(&adapter->rwi_list, struct ibmvnic_rwi,
+				       list);
+		list_del(&rwi->list);
+	} else {
+		rwi = NULL;
+	}
+
+	mutex_unlock(&adapter->rwi_lock);
+	return rwi;
+}
+
+static void free_all_rwi(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_rwi *rwi;
+
+	rwi = get_next_rwi(adapter);
+	while (rwi) {
+		kfree(rwi);
+		rwi = get_next_rwi(adapter);
+	}
+}
+
+static void __ibmvnic_reset(struct work_struct *work)
+{
+	struct ibmvnic_rwi *rwi;
+	struct ibmvnic_adapter *adapter;
+	struct net_device *netdev;
+	u32 reset_state;
+	int rc;
+
+	adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
+	netdev = adapter->netdev;
+
+	mutex_lock(&adapter->reset_lock);
+	adapter->resetting = true;
+	reset_state = adapter->state;
+
+	rwi = get_next_rwi(adapter);
+	while (rwi) {
+		rc = do_reset(adapter, rwi, reset_state);
+		kfree(rwi);
+		if (rc)
+			break;
+
+		rwi = get_next_rwi(adapter);
+	}
+
+	if (rc) {
+		free_all_rwi(adapter);
+		return;
+	}
+
+	adapter->resetting = false;
+	mutex_unlock(&adapter->reset_lock);
+}
+
+static void ibmvnic_reset(struct ibmvnic_adapter *adapter,
+			  enum ibmvnic_reset_reason reason)
+{
+	struct ibmvnic_rwi *rwi, *tmp;
+	struct net_device *netdev = adapter->netdev;
+	struct list_head *entry;
+
+	if (adapter->state == VNIC_REMOVING ||
+	    adapter->state == VNIC_REMOVED) {
+		netdev_dbg(netdev, "Adapter removing, skipping reset\n");
+		return;
+	}
+
+	mutex_lock(&adapter->rwi_lock);
+
+	list_for_each(entry, &adapter->rwi_list) {
+		tmp = list_entry(entry, struct ibmvnic_rwi, list);
+		if (tmp->reset_reason == reason) {
+			netdev_err(netdev, "Matching reset found, skipping\n");
+			mutex_unlock(&adapter->rwi_lock);
+			return;
+		}
+	}
+
+	rwi = kzalloc(sizeof(*rwi), GFP_KERNEL);
+	if (!rwi) {
+		mutex_unlock(&adapter->rwi_lock);
+		ibmvnic_close(netdev);
+		return;
+	}
+
+	rwi->reset_reason = reason;
+	list_add_tail(&rwi->list, &adapter->rwi_list);
+	mutex_unlock(&adapter->rwi_lock);
+	schedule_work(&adapter->ibmvnic_reset);
+}
+
+static void ibmvnic_tx_timeout(struct net_device *dev)
+{
+	struct ibmvnic_adapter *adapter = netdev_priv(dev);
+
+	ibmvnic_reset(adapter, VNIC_RESET_TIMEOUT);
 }
 
 static void remove_buff_from_pool(struct ibmvnic_adapter *adapter,
@@ -2000,18 +2197,6 @@ static int ibmvnic_send_crq_init(struct ibmvnic_adapter *adapter)
 	return ibmvnic_send_crq(adapter, &crq);
 }
 
-static int ibmvnic_send_crq_init_complete(struct ibmvnic_adapter *adapter)
-{
-	union ibmvnic_crq crq;
-
-	memset(&crq, 0, sizeof(crq));
-	crq.generic.first = IBMVNIC_CRQ_INIT_CMD;
-	crq.generic.cmd = IBMVNIC_CRQ_INIT_COMPLETE;
-	netdev_dbg(adapter->netdev, "Sending CRQ init complete\n");
-
-	return ibmvnic_send_crq(adapter, &crq);
-}
-
 static int send_version_xchg(struct ibmvnic_adapter *adapter)
 {
 	union ibmvnic_crq crq;
@@ -2509,6 +2694,9 @@ static void handle_error_indication(union ibmvnic_crq *crq,
 
 	if (be32_to_cpu(crq->error_indication.error_id))
 		request_error_information(adapter, crq);
+
+	if (crq->error_indication.flags & IBMVNIC_FATAL_ERROR)
+		ibmvnic_reset(adapter, VNIC_RESET_FATAL);
 }
 
 static void handle_change_mac_rsp(union ibmvnic_crq *crq,
@@ -2897,26 +3085,6 @@ out:
 	}
 }
 
-static void ibmvnic_xport_event(struct work_struct *work)
-{
-	struct ibmvnic_adapter *adapter = container_of(work,
-						       struct ibmvnic_adapter,
-						       ibmvnic_xport);
-	struct device *dev = &adapter->vdev->dev;
-	long rc;
-
-	release_sub_crqs(adapter);
-	if (adapter->migrated) {
-		rc = ibmvnic_reenable_crq_queue(adapter);
-		if (rc)
-			dev_err(dev, "Error after enable rc=%ld\n", rc);
-		adapter->migrated = false;
-		rc = ibmvnic_send_crq_init(adapter);
-		if (rc)
-			dev_err(dev, "Error sending init rc=%ld\n", rc);
-	}
-}
-
 static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 			       struct ibmvnic_adapter *adapter)
 {
@@ -2934,12 +3102,6 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		switch (gen_crq->cmd) {
 		case IBMVNIC_CRQ_INIT:
 			dev_info(dev, "Partner initialized\n");
-			/* Send back a response */
-			rc = ibmvnic_send_crq_init_complete(adapter);
-			if (!rc)
-				schedule_work(&adapter->vnic_crq_init);
-			else
-				dev_err(dev, "Can't send initrsp rc=%ld\n", rc);
 			break;
 		case IBMVNIC_CRQ_INIT_COMPLETE:
 			dev_info(dev, "Partner initialization complete\n");
@@ -2950,19 +3112,18 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
 		}
 		return;
 	case IBMVNIC_CRQ_XPORT_EVENT:
+		netif_carrier_off(netdev);
 		if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) {
-			dev_info(dev, "Re-enabling adapter\n");
-			adapter->migrated = true;
-			schedule_work(&adapter->ibmvnic_xport);
+			dev_info(dev, "Migrated, re-enabling adapter\n");
+			ibmvnic_reset(adapter, VNIC_RESET_MOBILITY);
 		} else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) {
 			dev_info(dev, "Backing device failover detected\n");
-			netif_carrier_off(netdev);
-			adapter->failover = true;
+			ibmvnic_reset(adapter, VNIC_RESET_FAILOVER);
 		} else {
 			/* The adapter lost the connection */
 			dev_err(dev, "Virtual Adapter failed (rc=%d)\n",
 				gen_crq->cmd);
-			schedule_work(&adapter->ibmvnic_xport);
+			ibmvnic_reset(adapter, VNIC_RESET_FATAL);
 		}
 		return;
 	case IBMVNIC_CRQ_CMD_RSP:
@@ -3243,64 +3404,6 @@ map_failed:
 	return retrc;
 }
 
-static void handle_crq_init_rsp(struct work_struct *work)
-{
-	struct ibmvnic_adapter *adapter = container_of(work,
-						       struct ibmvnic_adapter,
-						       vnic_crq_init);
-	struct device *dev = &adapter->vdev->dev;
-	struct net_device *netdev = adapter->netdev;
-	unsigned long timeout = msecs_to_jiffies(30000);
-	bool restart = false;
-	int rc;
-
-	if (adapter->failover) {
-		release_sub_crqs(adapter);
-		if (netif_running(netdev)) {
-			netif_tx_disable(netdev);
-			ibmvnic_close(netdev);
-			restart = true;
-		}
-	}
-
-	reinit_completion(&adapter->init_done);
-	send_version_xchg(adapter);
-	if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
-		dev_err(dev, "Passive init timeout\n");
-		goto task_failed;
-	}
-
-	netdev->mtu = adapter->req_mtu - ETH_HLEN;
-
-	if (adapter->failover) {
-		adapter->failover = false;
-		if (restart) {
-			rc = ibmvnic_open(netdev);
-			if (rc)
-				goto restart_failed;
-		}
-		netif_carrier_on(netdev);
-		return;
-	}
-
-	rc = register_netdev(netdev);
-	if (rc) {
-		dev_err(dev,
-			"failed to register netdev rc=%d\n", rc);
-		goto register_failed;
-	}
-	dev_info(dev, "ibmvnic registered\n");
-
-	return;
-
-restart_failed:
-	dev_err(dev, "Failed to restart ibmvnic, rc=%d\n", rc);
-register_failed:
-	release_sub_crqs(adapter);
-task_failed:
-	dev_err(dev, "Passive initialization was not successful\n");
-}
-
 static int ibmvnic_init(struct ibmvnic_adapter *adapter)
 {
 	struct device *dev = &adapter->vdev->dev;
@@ -3359,7 +3462,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	dev_set_drvdata(&dev->dev, netdev);
 	adapter->vdev = dev;
 	adapter->netdev = netdev;
-	adapter->failover = false;
 
 	ether_addr_copy(adapter->mac_addr, mac_addr_p);
 	ether_addr_copy(netdev->dev_addr, adapter->mac_addr);
@@ -3368,14 +3470,17 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	netdev->ethtool_ops = &ibmvnic_ethtool_ops;
 	SET_NETDEV_DEV(netdev, &dev->dev);
 
-	INIT_WORK(&adapter->vnic_crq_init, handle_crq_init_rsp);
-	INIT_WORK(&adapter->ibmvnic_xport, ibmvnic_xport_event);
-
 	spin_lock_init(&adapter->stats_lock);
 
 	INIT_LIST_HEAD(&adapter->errors);
 	spin_lock_init(&adapter->error_list_lock);
 
+	INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset);
+	INIT_LIST_HEAD(&adapter->rwi_list);
+	mutex_init(&adapter->reset_lock);
+	mutex_init(&adapter->rwi_lock);
+	adapter->resetting = false;
+
 	rc = ibmvnic_init(adapter);
 	if (rc) {
 		free_netdev(netdev);
@@ -3403,6 +3508,7 @@ static int ibmvnic_remove(struct vio_dev *dev)
 
 	adapter->state = VNIC_REMOVING;
 	unregister_netdev(netdev);
+	mutex_lock(&adapter->reset_lock);
 
 	release_resources(adapter);
 	release_sub_crqs(adapter);
@@ -3410,6 +3516,7 @@ static int ibmvnic_remove(struct vio_dev *dev)
 
 	adapter->state = VNIC_REMOVED;
 
+	mutex_unlock(&adapter->reset_lock);
 	free_netdev(netdev);
 	dev_set_drvdata(&dev->dev, NULL);
 
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h
index 03a866f542c4..4702b48cfa44 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -922,6 +922,16 @@ enum vnic_state {VNIC_PROBING = 1,
 		 VNIC_REMOVING,
 		 VNIC_REMOVED};
 
+enum ibmvnic_reset_reason {VNIC_RESET_FAILOVER = 1,
+			   VNIC_RESET_MOBILITY,
+			   VNIC_RESET_FATAL,
+			   VNIC_RESET_TIMEOUT};
+
+struct ibmvnic_rwi {
+	enum ibmvnic_reset_reason reset_reason;
+	struct list_head list;
+};
+
 struct ibmvnic_adapter {
 	struct vio_dev *vdev;
 	struct net_device *netdev;
@@ -931,7 +941,6 @@ struct ibmvnic_adapter {
 	dma_addr_t ip_offload_tok;
 	struct ibmvnic_control_ip_offload_buffer ip_offload_ctrl;
 	dma_addr_t ip_offload_ctrl_tok;
-	bool migrated;
 	u32 msg_enable;
 
 	/* Statistics */
@@ -1015,9 +1024,11 @@ struct ibmvnic_adapter {
 	__be64 tx_rx_desc_req;
 	u8 map_id;
 
-	struct work_struct vnic_crq_init;
-	struct work_struct ibmvnic_xport;
 	struct tasklet_struct tasklet;
-	bool failover;
 	enum vnic_state state;
+	enum ibmvnic_reset_reason reset_reason;
+	struct mutex reset_lock, rwi_lock;
+	struct list_head rwi_list;
+	struct work_struct ibmvnic_reset;
+	bool resetting;
 };
-- 
cgit v1.2.3


From c7bac00b4056ab951c377aff99bb91b81890c727 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:04:44 -0400
Subject: ibmvnic: Delete napi's when releasing driver resources

The napi structs allocated at drivier initializatio need to be
free'ed when releasing the drivers resources.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index a7c7a94c9d63..d52d98cf9317 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -552,11 +552,20 @@ static int ibmvnic_login(struct net_device *netdev)
 
 static void release_resources(struct ibmvnic_adapter *adapter)
 {
+	int i;
+
 	release_tx_pools(adapter);
 	release_rx_pools(adapter);
 
 	release_stats_token(adapter);
 	release_error_buffers(adapter);
+
+	if (adapter->napi) {
+		for (i = 0; i < adapter->req_rx_queues; i++) {
+			if (&adapter->napi[i])
+				netif_napi_del(&adapter->napi[i]);
+		}
+	}
 }
 
 static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state)
-- 
cgit v1.2.3


From e0ebe942f42952fb23bb0596f8775e50bed0b341 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:04:50 -0400
Subject: ibmvnic: Whitespace correction in release_rx_pools

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index d52d98cf9317..bbbd57e272c4 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -348,7 +348,7 @@ static void release_rx_pools(struct ibmvnic_adapter *adapter)
 		free_long_term_buff(adapter, &rx_pool->long_term_buff);
 
 		if (!rx_pool->rx_buff)
-		continue;
+			continue;
 
 		for (j = 0; j < rx_pool->size; j++) {
 			if (rx_pool->rx_buff[j].skb) {
-- 
cgit v1.2.3


From b41b83e9a784576b2bcc33bce447f7ce78fb265d Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:04:56 -0400
Subject: ibmvnic: Clean up tx pools when closing

When closing the ibmvnic driver, most notably during the reset
path, the tx pools need to be cleaned to ensure there are no
hanging skbs that need to be free'ed.

The need for this was found during debugging a loss of network
traffic after handling a driver reset. The underlying cause was
some skbs in the tx pool that were never free'ed. As a
result the upper network layers never tried a re-send since it
believed the driver still had the skb.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index bbbd57e272c4..2297cf2390e1 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -760,6 +760,34 @@ static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
 	}
 }
 
+static void clean_tx_pools(struct ibmvnic_adapter *adapter)
+{
+	struct ibmvnic_tx_pool *tx_pool;
+	u64 tx_entries;
+	int tx_scrqs;
+	int i, j;
+
+	if (!adapter->tx_pool)
+		return;
+
+	tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs);
+	tx_entries = adapter->req_tx_entries_per_subcrq;
+
+	/* Free any remaining skbs in the tx buffer pools */
+	for (i = 0; i < tx_scrqs; i++) {
+		tx_pool = &adapter->tx_pool[i];
+		if (!tx_pool)
+			continue;
+
+		for (j = 0; j < tx_entries; j++) {
+			if (tx_pool->tx_buff[j].skb) {
+				dev_kfree_skb_any(tx_pool->tx_buff[j].skb);
+				tx_pool->tx_buff[j].skb = NULL;
+			}
+		}
+	}
+}
+
 static int __ibmvnic_close(struct net_device *netdev)
 {
 	struct ibmvnic_adapter *adapter = netdev_priv(netdev);
@@ -768,6 +796,8 @@ static int __ibmvnic_close(struct net_device *netdev)
 
 	adapter->state = VNIC_CLOSING;
 	netif_tx_stop_all_queues(netdev);
+
+	clean_tx_pools(adapter);
 	disable_sub_crqs(adapter);
 
 	if (adapter->napi) {
-- 
cgit v1.2.3


From 46293b940fede04f90aab18d4bfecc5bd942cf3a Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:05:02 -0400
Subject: ibmvnic: Wait for any pending scrqs entries at driver close

When closing the ibmvnic driver we need to wait for any pending
sub crq entries to ensure they are handled.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 47 ++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 2297cf2390e1..a312bc1fee6d 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -743,23 +743,6 @@ static int ibmvnic_open(struct net_device *netdev)
 	return rc;
 }
 
-static void disable_sub_crqs(struct ibmvnic_adapter *adapter)
-{
-	int i;
-
-	if (adapter->tx_scrq) {
-		for (i = 0; i < adapter->req_tx_queues; i++)
-			if (adapter->tx_scrq[i])
-				disable_irq(adapter->tx_scrq[i]->irq);
-	}
-
-	if (adapter->rx_scrq) {
-		for (i = 0; i < adapter->req_rx_queues; i++)
-			if (adapter->rx_scrq[i])
-				disable_irq(adapter->rx_scrq[i]->irq);
-	}
-}
-
 static void clean_tx_pools(struct ibmvnic_adapter *adapter)
 {
 	struct ibmvnic_tx_pool *tx_pool;
@@ -797,15 +780,39 @@ static int __ibmvnic_close(struct net_device *netdev)
 	adapter->state = VNIC_CLOSING;
 	netif_tx_stop_all_queues(netdev);
 
-	clean_tx_pools(adapter);
-	disable_sub_crqs(adapter);
-
 	if (adapter->napi) {
 		for (i = 0; i < adapter->req_rx_queues; i++)
 			napi_disable(&adapter->napi[i]);
 	}
 
+	clean_tx_pools(adapter);
+
+	if (adapter->tx_scrq) {
+		for (i = 0; i < adapter->req_tx_queues; i++)
+			if (adapter->tx_scrq[i]->irq)
+				disable_irq(adapter->tx_scrq[i]->irq);
+	}
+
 	rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN);
+	if (rc)
+		return rc;
+
+	if (adapter->rx_scrq) {
+		for (i = 0; i < adapter->req_rx_queues; i++) {
+			int retries = 10;
+
+			while (pending_scrq(adapter, adapter->rx_scrq[i])) {
+				retries--;
+				mdelay(100);
+
+				if (retries == 0)
+					break;
+			}
+
+			if (adapter->rx_scrq[i]->irq)
+				disable_irq(adapter->rx_scrq[i]->irq);
+		}
+	}
 
 	adapter->state = VNIC_CLOSED;
 	return rc;
-- 
cgit v1.2.3


From 161b8a813877fb7d3deba6b3bd2731d4f4a75c5e Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:05:08 -0400
Subject: ibmvnic: Check for driver reset first in ibmvnic_xmit

Move the check for the driver resetting to the first thing
in ibmvnic_xmit().

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index a312bc1fee6d..f1ee37716ad7 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -985,12 +985,6 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 	int index = 0;
 	int ret = 0;
 
-	tx_pool = &adapter->tx_pool[queue_num];
-	tx_scrq = adapter->tx_scrq[queue_num];
-	txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
-	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
-				   be32_to_cpu(adapter->login_rsp_buf->
-					       off_txsubm_subcrqs));
 	if (adapter->resetting) {
 		if (!netif_subqueue_stopped(netdev, skb))
 			netif_stop_subqueue(netdev, queue_num);
@@ -1002,6 +996,12 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 		goto out;
 	}
 
+	tx_pool = &adapter->tx_pool[queue_num];
+	tx_scrq = adapter->tx_scrq[queue_num];
+	txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb));
+	handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) +
+		be32_to_cpu(adapter->login_rsp_buf->off_txsubm_subcrqs));
+
 	index = tx_pool->free_map[tx_pool->consumer_index];
 	offset = index * adapter->req_mtu;
 	dst = tx_pool->long_term_buff.buff + offset;
-- 
cgit v1.2.3


From ca05e31674b20904ff86464e475604f1de445365 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:05:14 -0400
Subject: ibmvnic: Continue skb processing after skb completion error

There is not a need to stop processing skbs if we encounter a
skb that has a receive completion error.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index f1ee37716ad7..00c9d5a305eb 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1404,7 +1404,7 @@ restart_poll:
 			/* free the entry */
 			next->rx_comp.first = 0;
 			remove_buff_from_pool(adapter, rx_buff);
-			break;
+			continue;
 		}
 
 		length = be32_to_cpu(next->rx_comp.len);
-- 
cgit v1.2.3


From 94ca305fd85cd052050b19b1762eb14eaa8cdd6a Mon Sep 17 00:00:00 2001
From: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:05:20 -0400
Subject: ibmvnic: Record SKB RX queue during poll

Map each RX SKB to the RX queue associated with the driver's RX SCRQ.
This should improve the RX CPU load balancing issues seen by the
performance team.

Signed-off-by: Thomas Falcon <tlfalcon@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 00c9d5a305eb..1b6268ce7b0d 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1428,6 +1428,7 @@ restart_poll:
 
 		skb_put(skb, length);
 		skb->protocol = eth_type_trans(skb, netdev);
+		skb_record_rx_queue(skb, scrq_num);
 
 		if (flags & IBMVNIC_IP_CHKSUM_GOOD &&
 		    flags & IBMVNIC_TCP_UDP_CHKSUM_GOOD) {
-- 
cgit v1.2.3


From 7c3e7de3f3a94fa34731f302e2f6606c9adc0f38 Mon Sep 17 00:00:00 2001
From: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Date: Wed, 3 May 2017 14:05:25 -0400
Subject: ibmvnic: Move queue restarting in ibmvnic_tx_complete

Restart of the subqueue should occur outside of the loop processing
any tx buffers instead of doing this in the middle of the loop.

Signed-off-by: Nathan Fontenot <nfont@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/ibm/ibmvnic.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 1b6268ce7b0d..4f2d329dba99 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1809,19 +1809,8 @@ restart_loop:
 			}
 
 			if (txbuff->last_frag) {
-				if (atomic_sub_return(next->tx_comp.num_comps,
-						      &scrq->used) <=
-				    (adapter->req_tx_entries_per_subcrq / 2) &&
-				    netif_subqueue_stopped(adapter->netdev,
-							   txbuff->skb)) {
-					netif_wake_subqueue(adapter->netdev,
-							    scrq->pool_index);
-					netdev_dbg(adapter->netdev,
-						   "Started queue %d\n",
-						   scrq->pool_index);
-				}
-
 				dev_kfree_skb_any(txbuff->skb);
+				txbuff->skb = NULL;
 			}
 
 			adapter->tx_pool[pool].free_map[adapter->tx_pool[pool].
@@ -1832,6 +1821,15 @@ restart_loop:
 		}
 		/* remove tx_comp scrq*/
 		next->tx_comp.first = 0;
+
+		if (atomic_sub_return(next->tx_comp.num_comps, &scrq->used) <=
+		    (adapter->req_tx_entries_per_subcrq / 2) &&
+		    __netif_subqueue_stopped(adapter->netdev,
+					     scrq->pool_index)) {
+			netif_wake_subqueue(adapter->netdev, scrq->pool_index);
+			netdev_info(adapter->netdev, "Started queue %d\n",
+				    scrq->pool_index);
+		}
 	}
 
 	enable_scrq_irq(adapter, scrq);
-- 
cgit v1.2.3


From 5d826b7b98a66c890920d690d101bf54d220ea92 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun <yanjun.zhu@oracle.com>
Date: Wed, 3 May 2017 00:43:42 -0400
Subject: forcedeth: remove unnecessary carrier status check

Since netif_carrier_on() will do nothing if device's
carrier is already on, so it's unnecessary to do
carrier status check.

It's the same for netif_carrier_off().

Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/nvidia/forcedeth.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 978d32944c80..aa912f43e15f 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -4248,11 +4248,9 @@ static int nv_get_link_ksettings(struct net_device *dev,
 		/* We do not track link speed / duplex setting if the
 		 * interface is disabled. Force a link check */
 		if (nv_update_linkspeed(dev)) {
-			if (!netif_carrier_ok(dev))
-				netif_carrier_on(dev);
+			netif_carrier_on(dev);
 		} else {
-			if (netif_carrier_ok(dev))
-				netif_carrier_off(dev);
+			netif_carrier_off(dev);
 		}
 	}
 
-- 
cgit v1.2.3


From 8b485ce69876c65db12ed390e7f9c0d2a64eff2c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Wed, 3 May 2017 06:39:31 -0700
Subject: tcp: do not inherit fastopen_req from parent

Under fuzzer stress, it is possible that a child gets a non NULL
fastopen_req pointer from its parent at accept() time, when/if parent
morphs from listener to active session.

We need to make sure this can not happen, by clearing the field after
socket cloning.

BUG: Double free or freeing an invalid pointer
Unexpected shadow byte: 0xFB
CPU: 3 PID: 20933 Comm: syz-executor3 Not tainted 4.11.0+ #306
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x292/0x395 lib/dump_stack.c:52
 kasan_object_err+0x1c/0x70 mm/kasan/report.c:164
 kasan_report_double_free+0x5c/0x70 mm/kasan/report.c:185
 kasan_slab_free+0x9d/0xc0 mm/kasan/kasan.c:580
 slab_free_hook mm/slub.c:1357 [inline]
 slab_free_freelist_hook mm/slub.c:1379 [inline]
 slab_free mm/slub.c:2961 [inline]
 kfree+0xe8/0x2b0 mm/slub.c:3882
 tcp_free_fastopen_req net/ipv4/tcp.c:1077 [inline]
 tcp_disconnect+0xc15/0x13e0 net/ipv4/tcp.c:2328
 inet_child_forget+0xb8/0x600 net/ipv4/inet_connection_sock.c:898
 inet_csk_reqsk_queue_add+0x1e7/0x250
net/ipv4/inet_connection_sock.c:928
 tcp_get_cookie_sock+0x21a/0x510 net/ipv4/syncookies.c:217
 cookie_v4_check+0x1a19/0x28b0 net/ipv4/syncookies.c:384
 tcp_v4_cookie_check net/ipv4/tcp_ipv4.c:1384 [inline]
 tcp_v4_do_rcv+0x731/0x940 net/ipv4/tcp_ipv4.c:1421
 tcp_v4_rcv+0x2dc0/0x31c0 net/ipv4/tcp_ipv4.c:1715
 ip_local_deliver_finish+0x4cc/0xc20 net/ipv4/ip_input.c:216
 NF_HOOK include/linux/netfilter.h:257 [inline]
 ip_local_deliver+0x1ce/0x700 net/ipv4/ip_input.c:257
 dst_input include/net/dst.h:492 [inline]
 ip_rcv_finish+0xb1d/0x20b0 net/ipv4/ip_input.c:396
 NF_HOOK include/linux/netfilter.h:257 [inline]
 ip_rcv+0xd8c/0x19c0 net/ipv4/ip_input.c:487
 __netif_receive_skb_core+0x1ad1/0x3400 net/core/dev.c:4210
 __netif_receive_skb+0x2a/0x1a0 net/core/dev.c:4248
 process_backlog+0xe5/0x6c0 net/core/dev.c:4868
 napi_poll net/core/dev.c:5270 [inline]
 net_rx_action+0xe70/0x18e0 net/core/dev.c:5335
 __do_softirq+0x2fb/0xb99 kernel/softirq.c:284
 do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:899
 </IRQ>
 do_softirq.part.17+0x1e8/0x230 kernel/softirq.c:328
 do_softirq kernel/softirq.c:176 [inline]
 __local_bh_enable_ip+0x1cf/0x1e0 kernel/softirq.c:181
 local_bh_enable include/linux/bottom_half.h:31 [inline]
 rcu_read_unlock_bh include/linux/rcupdate.h:931 [inline]
 ip_finish_output2+0x9ab/0x15e0 net/ipv4/ip_output.c:230
 ip_finish_output+0xa35/0xdf0 net/ipv4/ip_output.c:316
 NF_HOOK_COND include/linux/netfilter.h:246 [inline]
 ip_output+0x1f6/0x7b0 net/ipv4/ip_output.c:404
 dst_output include/net/dst.h:486 [inline]
 ip_local_out+0x95/0x160 net/ipv4/ip_output.c:124
 ip_queue_xmit+0x9a8/0x1a10 net/ipv4/ip_output.c:503
 tcp_transmit_skb+0x1ade/0x3470 net/ipv4/tcp_output.c:1057
 tcp_write_xmit+0x79e/0x55b0 net/ipv4/tcp_output.c:2265
 __tcp_push_pending_frames+0xfa/0x3a0 net/ipv4/tcp_output.c:2450
 tcp_push+0x4ee/0x780 net/ipv4/tcp.c:683
 tcp_sendmsg+0x128d/0x39b0 net/ipv4/tcp.c:1342
 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:762
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 SYSC_sendto+0x660/0x810 net/socket.c:1696
 SyS_sendto+0x40/0x50 net/socket.c:1664
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x446059
RSP: 002b:00007faa6761fb58 EFLAGS: 00000282 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 0000000000000017 RCX: 0000000000446059
RDX: 0000000000000001 RSI: 0000000020ba3fcd RDI: 0000000000000017
RBP: 00000000006e40a0 R08: 0000000020ba4ff0 R09: 0000000000000010
R10: 0000000020000000 R11: 0000000000000282 R12: 0000000000708150
R13: 0000000000000000 R14: 00007faa676209c0 R15: 00007faa67620700
Object at ffff88003b5bbcb8, in cache kmalloc-64 size: 64
Allocated:
PID = 20909
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:513
 set_track mm/kasan/kasan.c:525 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:616
 kmem_cache_alloc_trace+0x82/0x270 mm/slub.c:2745
 kmalloc include/linux/slab.h:490 [inline]
 kzalloc include/linux/slab.h:663 [inline]
 tcp_sendmsg_fastopen net/ipv4/tcp.c:1094 [inline]
 tcp_sendmsg+0x221a/0x39b0 net/ipv4/tcp.c:1139
 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:762
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 SYSC_sendto+0x660/0x810 net/socket.c:1696
 SyS_sendto+0x40/0x50 net/socket.c:1664
 entry_SYSCALL_64_fastpath+0x1f/0xbe
Freed:
PID = 20909
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:513
 set_track mm/kasan/kasan.c:525 [inline]
 kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:589
 slab_free_hook mm/slub.c:1357 [inline]
 slab_free_freelist_hook mm/slub.c:1379 [inline]
 slab_free mm/slub.c:2961 [inline]
 kfree+0xe8/0x2b0 mm/slub.c:3882
 tcp_free_fastopen_req net/ipv4/tcp.c:1077 [inline]
 tcp_disconnect+0xc15/0x13e0 net/ipv4/tcp.c:2328
 __inet_stream_connect+0x20c/0xf90 net/ipv4/af_inet.c:593
 tcp_sendmsg_fastopen net/ipv4/tcp.c:1111 [inline]
 tcp_sendmsg+0x23a8/0x39b0 net/ipv4/tcp.c:1139
 inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:762
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 SYSC_sendto+0x660/0x810 net/socket.c:1696
 SyS_sendto+0x40/0x50 net/socket.c:1664
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets")
Fixes: 7db92362d2fe ("tcp: fix potential double free issue for fastopen_req")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Acked-by: Wei Wang <weiwan@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_minisocks.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 8f6373b0cd77..717be4de5324 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -523,6 +523,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 		newtp->rx_opt.mss_clamp = req->mss;
 		tcp_ecn_openreq_child(newtp, req);
+		newtp->fastopen_req = NULL;
 		newtp->fastopen_rsk = NULL;
 		newtp->syn_data_acked = 0;
 		newtp->rack.mstamp.v64 = 0;
-- 
cgit v1.2.3


From 985538eee06f169bb13c7ecb1a7fcdeb700b05c7 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 3 May 2017 14:50:40 +0100
Subject: net/sched: remove redundant null check on head

head is previously null checked and so the 2nd null check on head
is redundant and therefore can be removed.

Detected by CoverityScan, CID#1399505 ("Logically dead code")

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_matchall.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index 2efb36c08f2a..dee469fed967 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -203,8 +203,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb,
 
 	*arg = (unsigned long) head;
 	rcu_assign_pointer(tp->root, new);
-	if (head)
-		call_rcu(&head->rcu, mall_destroy_rcu);
+	call_rcu(&head->rcu, mall_destroy_rcu);
 	return 0;
 
 err_replace_hw_filter:
-- 
cgit v1.2.3


From 86f4c90a1c5c1493f07f2d12c1079f5bf01936f2 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 3 May 2017 17:06:58 +0200
Subject: ipv4, ipv6: ensure raw socket message is big enough to hold an IP
 header

raw_send_hdrinc() and rawv6_send_hdrinc() expect that the buffer copied
from the userspace contains the IPv4/IPv6 header, so if too few bytes are
copied, parts of the header may remain uninitialized.

This bug has been detected with KMSAN.

For the record, the KMSAN report:

==================================================================
BUG: KMSAN: use of unitialized memory in nf_ct_frag6_gather+0xf5a/0x44a0
inter: 0
CPU: 0 PID: 1036 Comm: probe Not tainted 4.11.0-rc5+ #2455
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16
 dump_stack+0x143/0x1b0 lib/dump_stack.c:52
 kmsan_report+0x16b/0x1e0 mm/kmsan/kmsan.c:1078
 __kmsan_warning_32+0x5c/0xa0 mm/kmsan/kmsan_instr.c:510
 nf_ct_frag6_gather+0xf5a/0x44a0 net/ipv6/netfilter/nf_conntrack_reasm.c:577
 ipv6_defrag+0x1d9/0x280 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68
 nf_hook_entry_hookfn ./include/linux/netfilter.h:102
 nf_hook_slow+0x13f/0x3c0 net/netfilter/core.c:310
 nf_hook ./include/linux/netfilter.h:212
 NF_HOOK ./include/linux/netfilter.h:255
 rawv6_send_hdrinc net/ipv6/raw.c:673
 rawv6_sendmsg+0x2fcb/0x41a0 net/ipv6/raw.c:919
 inet_sendmsg+0x3f8/0x6d0 net/ipv4/af_inet.c:762
 sock_sendmsg_nosec net/socket.c:633
 sock_sendmsg net/socket.c:643
 SYSC_sendto+0x6a5/0x7c0 net/socket.c:1696
 SyS_sendto+0xbc/0xe0 net/socket.c:1664
 do_syscall_64+0x72/0xa0 arch/x86/entry/common.c:285
 entry_SYSCALL64_slow_path+0x25/0x25 arch/x86/entry/entry_64.S:246
RIP: 0033:0x436e03
RSP: 002b:00007ffce48baf38 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 00000000004002b0 RCX: 0000000000436e03
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003
RBP: 00007ffce48baf90 R08: 00007ffce48baf50 R09: 000000000000001c
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000401790 R14: 0000000000401820 R15: 0000000000000000
origin: 00000000d9400053
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:362
 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:257
 kmsan_poison_shadow+0x6d/0xc0 mm/kmsan/kmsan.c:270
 slab_alloc_node mm/slub.c:2735
 __kmalloc_node_track_caller+0x1f4/0x390 mm/slub.c:4341
 __kmalloc_reserve net/core/skbuff.c:138
 __alloc_skb+0x2cd/0x740 net/core/skbuff.c:231
 alloc_skb ./include/linux/skbuff.h:933
 alloc_skb_with_frags+0x209/0xbc0 net/core/skbuff.c:4678
 sock_alloc_send_pskb+0x9ff/0xe00 net/core/sock.c:1903
 sock_alloc_send_skb+0xe4/0x100 net/core/sock.c:1920
 rawv6_send_hdrinc net/ipv6/raw.c:638
 rawv6_sendmsg+0x2918/0x41a0 net/ipv6/raw.c:919
 inet_sendmsg+0x3f8/0x6d0 net/ipv4/af_inet.c:762
 sock_sendmsg_nosec net/socket.c:633
 sock_sendmsg net/socket.c:643
 SYSC_sendto+0x6a5/0x7c0 net/socket.c:1696
 SyS_sendto+0xbc/0xe0 net/socket.c:1664
 do_syscall_64+0x72/0xa0 arch/x86/entry/common.c:285
 return_from_SYSCALL_64+0x0/0x6a arch/x86/entry/entry_64.S:246
==================================================================

, triggered by the following syscalls:
  socket(PF_INET6, SOCK_RAW, IPPROTO_RAW) = 3
  sendto(3, NULL, 0, 0, {sa_family=AF_INET6, sin6_port=htons(0), inet_pton(AF_INET6, "ff00::", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = -1 EPERM

A similar report is triggered in net/ipv4/raw.c if we use a PF_INET socket
instead of a PF_INET6 one.

Signed-off-by: Alexander Potapenko <glider@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/raw.c | 3 +++
 net/ipv6/raw.c | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 9d943974de2b..bdffad875691 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -358,6 +358,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
 			       rt->dst.dev->mtu);
 		return -EMSGSIZE;
 	}
+	if (length < sizeof(struct iphdr))
+		return -EINVAL;
+
 	if (flags&MSG_PROBE)
 		goto out;
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 0da6a12b5472..1f992d9e261d 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -632,6 +632,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 		ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu);
 		return -EMSGSIZE;
 	}
+	if (length < sizeof(struct ipv6hdr))
+		return -EINVAL;
 	if (flags&MSG_PROBE)
 		goto out;
 
-- 
cgit v1.2.3


From 5900eca1ac3bcb03cd0da1c9a5504f1fbfe9b5bc Mon Sep 17 00:00:00 2001
From: Pavel Belous <pavel.belous@aquantia.com>
Date: Wed, 3 May 2017 21:17:44 +0300
Subject: aquantia: Fix driver name reported by ethtool

V2: using "aquantia" subsystem tag.

The command "ethtool -i ethX" should display driver name (driver: atlantic)
instead vendor name (driver: aquantia).

Signed-off-by: Pavel Belous <pavel.belous@aquantia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/aquantia/atlantic/aq_cfg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
index 5f99237a9d52..214986436ece 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h
@@ -68,7 +68,7 @@
 
 #define AQ_CFG_DRV_AUTHOR      "aQuantia"
 #define AQ_CFG_DRV_DESC        "aQuantia Corporation(R) Network Driver"
-#define AQ_CFG_DRV_NAME        "aquantia"
+#define AQ_CFG_DRV_NAME        "atlantic"
 #define AQ_CFG_DRV_VERSION	__stringify(NIC_MAJOR_DRIVER_VERSION)"."\
 				__stringify(NIC_MINOR_DRIVER_VERSION)"."\
 				__stringify(NIC_BUILD_DRIVER_VERSION)"."\
-- 
cgit v1.2.3


From 2be0f2644598ee4b8ea0ed6a32913e1ce0212c9c Mon Sep 17 00:00:00 2001
From: stephen hemminger <stephen@networkplumber.org>
Date: Wed, 3 May 2017 16:59:21 -0700
Subject: netvsc: make sure napi enabled before vmbus_open

This fixes a race where vmbus callback for new packet arriving
could occur before NAPI is initialized.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/hyperv/netvsc.c       | 8 +++++---
 drivers/net/hyperv/rndis_filter.c | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 15749d359e60..652453d9fb08 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -1322,6 +1322,10 @@ int netvsc_device_add(struct hv_device *device,
 		nvchan->channel = device->channel;
 	}
 
+	/* Enable NAPI handler before init callbacks */
+	netif_napi_add(ndev, &net_device->chan_table[0].napi,
+		       netvsc_poll, NAPI_POLL_WEIGHT);
+
 	/* Open the channel */
 	ret = vmbus_open(device->channel, ring_size * PAGE_SIZE,
 			 ring_size * PAGE_SIZE, NULL, 0,
@@ -1329,6 +1333,7 @@ int netvsc_device_add(struct hv_device *device,
 			 net_device->chan_table);
 
 	if (ret != 0) {
+		netif_napi_del(&net_device->chan_table[0].napi);
 		netdev_err(ndev, "unable to open channel: %d\n", ret);
 		goto cleanup;
 	}
@@ -1336,9 +1341,6 @@ int netvsc_device_add(struct hv_device *device,
 	/* Channel is opened */
 	netdev_dbg(ndev, "hv_netvsc channel opened successfully\n");
 
-	/* Enable NAPI handler for init callbacks */
-	netif_napi_add(ndev, &net_device->chan_table[0].napi,
-		       netvsc_poll, NAPI_POLL_WEIGHT);
 	napi_enable(&net_device->chan_table[0].napi);
 
 	/* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is
diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index ab92c3c95951..f9d5b0b8209a 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1018,7 +1018,7 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc)
 	if (ret == 0)
 		napi_enable(&nvchan->napi);
 	else
-		netdev_err(ndev, "sub channel open failed (%d)\n", ret);
+		netif_napi_del(&nvchan->napi);
 
 	if (refcount_dec_and_test(&nvscdev->sc_offered))
 		complete(&nvscdev->channel_init_wait);
-- 
cgit v1.2.3


From 77ef033b687c3e030017c94a29bf6ea3aaaef678 Mon Sep 17 00:00:00 2001
From: Michal Schmidt <mschmidt@redhat.com>
Date: Thu, 4 May 2017 16:48:58 +0200
Subject: rtnetlink: NUL-terminate IFLA_PHYS_PORT_NAME string

IFLA_PHYS_PORT_NAME is a string attribute, so terminate it with \0.
Otherwise libnl3 fails to validate netlink messages with this attribute.
"ip -detail a" assumes too that the attribute is NUL-terminated when
printing it. It often was, due to padding.

I noticed this as libvirtd failing to start on a system with sfc driver
after upgrading it to Linux 4.11, i.e. when sfc added support for
phys_port_name.

Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/rtnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 6e67315ec368..bcb0f610ee42 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1054,7 +1054,7 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
 		return err;
 	}
 
-	if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name))
+	if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name))
 		return -EMSGSIZE;
 
 	return 0;
-- 
cgit v1.2.3


From f870a3c6727db5fcfeaa42d099f75872e4b17553 Mon Sep 17 00:00:00 2001
From: "sudarsana.kalluru@cavium.com" <sudarsana.kalluru@cavium.com>
Date: Thu, 4 May 2017 08:15:03 -0700
Subject: qed*: Fix possible overflow for status block id field.

Value for status block id could be more than 256 in 100G mode, need to
update its data type from u8 to u16.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c       | 8 ++++----
 drivers/net/ethernet/qlogic/qed/qed_dev_api.h   | 4 ++--
 drivers/net/ethernet/qlogic/qed/qed_main.c      | 2 +-
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 5 ++---
 include/linux/qed/qed_if.h                      | 2 +-
 5 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 5f31140d0b77..5c6874af4b65 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -3586,7 +3586,7 @@ static int qed_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 }
 
 int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
-			 u16 coalesce, u8 qid, u16 sb_id)
+			 u16 coalesce, u16 qid, u16 sb_id)
 {
 	struct ustorm_eth_queue_zone eth_qzone;
 	u8 timeset, timer_res;
@@ -3607,7 +3607,7 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 	}
 	timeset = (u8)(coalesce >> timer_res);
 
-	rc = qed_fw_l2_queue(p_hwfn, (u16)qid, &fw_qid);
+	rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid);
 	if (rc)
 		return rc;
 
@@ -3628,7 +3628,7 @@ out:
 }
 
 int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
-			 u16 coalesce, u8 qid, u16 sb_id)
+			 u16 coalesce, u16 qid, u16 sb_id)
 {
 	struct xstorm_eth_queue_zone eth_qzone;
 	u8 timeset, timer_res;
@@ -3649,7 +3649,7 @@ int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
 	}
 	timeset = (u8)(coalesce >> timer_res);
 
-	rc = qed_fw_l2_queue(p_hwfn, (u16)qid, &fw_qid);
+	rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid);
 	if (rc)
 		return rc;
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
index cefe3ee9064a..12d16c096e36 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h
@@ -454,7 +454,7 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn,
  * @return int
  */
 int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
-			 u16 coalesce, u8 qid, u16 sb_id);
+			 u16 coalesce, u16 qid, u16 sb_id);
 
 /**
  * @brief qed_set_txq_coalesce - Configure coalesce parameters for a Tx queue
@@ -471,7 +471,7 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
  * @return int
  */
 int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
-			 u16 coalesce, u8 qid, u16 sb_id);
+			 u16 coalesce, u16 qid, u16 sb_id);
 
 const char *qed_hw_get_resc_name(enum qed_resources res_id);
 #endif
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index 59992cf20d42..a5eef1abc5a1 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1521,7 +1521,7 @@ static void qed_get_coalesce(struct qed_dev *cdev, u16 *rx_coal, u16 *tx_coal)
 }
 
 static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal,
-			    u8 qid, u16 sb_id)
+			    u16 qid, u16 sb_id)
 {
 	struct qed_hwfn *hwfn;
 	struct qed_ptt *ptt;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index 4dcfe9614731..b22753c5fd44 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -706,8 +706,7 @@ static int qede_set_coalesce(struct net_device *dev,
 {
 	struct qede_dev *edev = netdev_priv(dev);
 	int i, rc = 0;
-	u16 rxc, txc;
-	u8 sb_id;
+	u16 rxc, txc, sb_id;
 
 	if (!netif_running(dev)) {
 		DP_INFO(edev, "Interface is down\n");
@@ -729,7 +728,7 @@ static int qede_set_coalesce(struct net_device *dev,
 	for_each_queue(i) {
 		sb_id = edev->fp_array[i].sb_info->igu_sb_id;
 		rc = edev->ops->common->set_coalesce(edev->cdev, rxc, txc,
-						     (u8)i, sb_id);
+						     (u16)i, sb_id);
 		if (rc) {
 			DP_INFO(edev, "Set coalesce error, rc = %d\n", rc);
 			return rc;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 5544d7b2f2bb..c70ac13a97e6 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -635,7 +635,7 @@ struct qed_common_ops {
  * @return 0 on success, error otherwise.
  */
 	int (*set_coalesce)(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal,
-			    u8 qid, u16 sb_id);
+			    u16 qid, u16 sb_id);
 
 /**
  * @brief set_led - Configure LED mode
-- 
cgit v1.2.3


From 34f9199ce7b7e5c641b96e928bd60e086bf7f278 Mon Sep 17 00:00:00 2001
From: "sudarsana.kalluru@cavium.com" <sudarsana.kalluru@cavium.com>
Date: Thu, 4 May 2017 08:15:04 -0700
Subject: qed: Fix overriding of supported autoneg value.

Driver currently uses advertised-autoneg value to populate the
supported-autoneg field. When advertised field is updated, user gets
the same value for supported field. Supported-autoneg value need to be
populated from the link capabilities value returned by the MFW.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qed/qed_dev.c  | 3 +++
 drivers/net/ethernet/qlogic/qed/qed_main.c | 6 +++++-
 drivers/net/ethernet/qlogic/qed/qed_mcp.h  | 1 +
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index 5c6874af4b65..bb70522ad362 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -2536,6 +2536,9 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 		DP_NOTICE(p_hwfn, "Unknown Speed in 0x%08x\n", link_temp);
 	}
 
+	p_hwfn->mcp_info->link_capabilities.default_speed_autoneg =
+		link->speed.autoneg;
+
 	link_temp &= NVM_CFG1_PORT_DRV_FLOW_CONTROL_MASK;
 	link_temp >>= NVM_CFG1_PORT_DRV_FLOW_CONTROL_OFFSET;
 	link->pause.autoneg = !!(link_temp &
diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index a5eef1abc5a1..b7ad36b91e12 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -1372,7 +1372,7 @@ static void qed_fill_link(struct qed_hwfn *hwfn,
 
 	/* TODO - at the moment assume supported and advertised speed equal */
 	if_link->supported_caps = QED_LM_FIBRE_BIT;
-	if (params.speed.autoneg)
+	if (link_caps.default_speed_autoneg)
 		if_link->supported_caps |= QED_LM_Autoneg_BIT;
 	if (params.pause.autoneg ||
 	    (params.pause.forced_rx && params.pause.forced_tx))
@@ -1382,6 +1382,10 @@ static void qed_fill_link(struct qed_hwfn *hwfn,
 		if_link->supported_caps |= QED_LM_Pause_BIT;
 
 	if_link->advertised_caps = if_link->supported_caps;
+	if (params.speed.autoneg)
+		if_link->advertised_caps |= QED_LM_Autoneg_BIT;
+	else
+		if_link->advertised_caps &= ~QED_LM_Autoneg_BIT;
 	if (params.speed.advertised_speeds &
 	    NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G)
 		if_link->advertised_caps |= QED_LM_1000baseT_Half_BIT |
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index 5ae35d6cc7d1..2b09b8545236 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -61,6 +61,7 @@ struct qed_mcp_link_params {
 
 struct qed_mcp_link_capabilities {
 	u32 speed_capabilities;
+	bool default_speed_autoneg;
 };
 
 struct qed_mcp_link_state {
-- 
cgit v1.2.3


From 161adb046b9119d480f2547db20fc3e9ce7cbf59 Mon Sep 17 00:00:00 2001
From: "sudarsana.kalluru@cavium.com" <sudarsana.kalluru@cavium.com>
Date: Thu, 4 May 2017 08:15:05 -0700
Subject: qede: Fix possible misconfiguration of advertised autoneg value.

Fail the configuration of advertised speed-autoneg value if the config
update is not supported.

Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>
Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/qlogic/qede/qede_ethtool.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
index b22753c5fd44..172b292241a5 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c
@@ -493,6 +493,11 @@ static int qede_set_link_ksettings(struct net_device *dev,
 	params.override_flags |= QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS;
 	params.override_flags |= QED_LINK_OVERRIDE_SPEED_AUTONEG;
 	if (base->autoneg == AUTONEG_ENABLE) {
+		if (!(current_link.supported_caps & QED_LM_Autoneg_BIT)) {
+			DP_INFO(edev, "Auto negotiation is not supported\n");
+			return -EOPNOTSUPP;
+		}
+
 		params.autoneg = true;
 		params.forced_speed = 0;
 		QEDE_ETHTOOL_TO_DRV_CAPS(params.adv_speeds, cmd, advertising)
-- 
cgit v1.2.3


From 2f460933f58eee3393aba64f0f6d14acb08d1724 Mon Sep 17 00:00:00 2001
From: WANG Cong <xiyou.wangcong@gmail.com>
Date: Wed, 3 May 2017 22:07:31 -0700
Subject: ipv6: initialize route null entry in addrconf_init()

Andrey reported a crash on init_net.ipv6.ip6_null_entry->rt6i_idev
since it is always NULL.

This is clearly wrong, we have code to initialize it to loopback_dev,
unfortunately the order is still not correct.

loopback_dev is registered very early during boot, we lose a chance
to re-initialize it in notifier. addrconf_init() is called after
ip6_route_init(), which means we have no chance to correct it.

Fix it by moving this initialization explicitly after
ipv6_add_dev(init_net.loopback_dev) in addrconf_init().

Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Tested-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_route.h |  1 +
 net/ipv6/addrconf.c     |  2 ++
 net/ipv6/route.c        | 26 +++++++++++++++-----------
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 9dc2c182a263..f5e625f53367 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -84,6 +84,7 @@ struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 			       int ifindex, struct flowi6 *fl6, int flags);
 
+void ip6_route_init_special_entries(void);
 int ip6_route_init(void);
 void ip6_route_cleanup(void);
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a2a370b71249..77a4bd526d6e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6573,6 +6573,8 @@ int __init addrconf_init(void)
 		goto errlo;
 	}
 
+	ip6_route_init_special_entries();
+
 	for (i = 0; i < IN6_ADDR_HSIZE; i++)
 		INIT_HLIST_HEAD(&inet6_addr_lst[i]);
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a1bf426c959b..2f1136627dcb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4027,6 +4027,21 @@ static struct notifier_block ip6_route_dev_notifier = {
 	.priority = 0,
 };
 
+void __init ip6_route_init_special_entries(void)
+{
+	/* Registering of the loopback is done before this portion of code,
+	 * the loopback reference in rt6_info will not be taken, do it
+	 * manually for init_net */
+	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
+	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
+	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
+	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
+  #endif
+}
+
 int __init ip6_route_init(void)
 {
 	int ret;
@@ -4053,17 +4068,6 @@ int __init ip6_route_init(void)
 
 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
 
-	/* Registering of the loopback is done before this portion of code,
-	 * the loopback reference in rt6_info will not be taken, do it
-	 * manually for init_net */
-	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
-	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
-  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
-	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
-	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
-	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
-  #endif
 	ret = fib6_init();
 	if (ret)
 		goto out_register_subsys;
-- 
cgit v1.2.3


From 842be75c77cb72ee546a2b19da9c285fb3ded660 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 May 2017 08:42:30 +0200
Subject: cfg80211: make RATE_INFO_BW_20 the default

Due to the way I did the RX bitrate conversions in mac80211 with
spatch, going setting flags to setting the value, many drivers now
don't set the bandwidth value for 20 MHz, since with the flags it
wasn't necessary to (there was no 20 MHz flag, only the others.)

Rather than go through and try to fix up all the drivers, instead
renumber the enum so that 20 MHz, which is the typical bandwidth,
actually has the value 0, making those drivers all work again.

If VHT was hit used with a driver not reporting it, e.g. iwlmvm,
this manifested in hitting the bandwidth warning in
cfg80211_calculate_bitrate_vht().

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Tested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/cfg80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 6e90f1a4950f..15d6599b8bc6 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1013,9 +1013,9 @@ enum rate_info_flags {
  * @RATE_INFO_BW_160: 160 MHz bandwidth
  */
 enum rate_info_bw {
+	RATE_INFO_BW_20 = 0,
 	RATE_INFO_BW_5,
 	RATE_INFO_BW_10,
-	RATE_INFO_BW_20,
 	RATE_INFO_BW_40,
 	RATE_INFO_BW_80,
 	RATE_INFO_BW_160,
-- 
cgit v1.2.3