From 75cee397ae6f1020fbb75db90aa22a51bc3318ac Mon Sep 17 00:00:00 2001 From: Hoang Huu Le Date: Fri, 16 Oct 2020 09:31:18 +0700 Subject: tipc: re-configure queue limit for broadcast link The queue limit of the broadcast link is calculated based on the initial MTU. However, when the MTU value changes (e.g. manually changing the MTU on the NIC device, MTU negotiation, etc.) we do not re-calculate the queue limit, so throughput does not reflect the change. Fix it by calling the function that re-calculates the queue limit of the broadcast link. Acked-by: Jon Maloy Signed-off-by: Hoang Huu Le Signed-off-by: Jakub Kicinski --- net/tipc/bcast.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 940d176e0e87..c77fd13e2777 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -108,6 +108,7 @@ static void tipc_bcbase_select_primary(struct net *net) { struct tipc_bc_base *bb = tipc_bc_base(net); int all_dests = tipc_link_bc_peers(bb->link); + int max_win = tipc_link_max_win(bb->link); int i, mtu, prim; bb->primary_bearer = INVALID_BEARER_ID; @@ -121,8 +122,11 @@ static void tipc_bcbase_select_primary(struct net *net) continue; mtu = tipc_bearer_mtu(net, i); - if (mtu < tipc_link_mtu(bb->link)) + if (mtu < tipc_link_mtu(bb->link)) { tipc_link_set_mtu(bb->link, mtu); + tipc_link_set_queue_limits(bb->link, max_win, + max_win); + } bb->bcast_support &= tipc_bearer_bcast_support(net, i); if (bb->dests[i] < all_dests) continue; -- cgit v1.2.3 From ec78e31852c9bb7d96b6557468fecb6f6f3b28f3 Mon Sep 17 00:00:00 2001 From: Hoang Huu Le Date: Fri, 16 Oct 2020 09:31:19 +0700 Subject: tipc: fix incorrect setting window for bcast link In commit 16ad3f4022bb ("tipc: introduce variable window congestion control"), we applied an algorithm that selects the window size for unicast links between the minimum window and the configured maximum window, and chose to keep the window size for the broadcast link unchanged and fixed (i.e. a fixed window of 50). However, when the maximum window variable is set via command, the window variable was re-initialized to an unexpected value (i.e. 32). We fix this by keeping the window for the broadcast link fixed, as stated above.
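The gist of the second fix, condensed from the diff that follows: derive the minimum window from the link itself rather than from the BCLINK_WIN_MIN constant (presumably the 32 mentioned above), so that reconfiguring the maximum window can no longer reset the broadcast link's fixed window:

	/* before: a max-window update silently reset the window toward
	 * BCLINK_WIN_MIN */
	tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win);

	/* after: the link's own minimum (the fixed window, 50) is kept */
	tipc_link_set_queue_limits(l, tipc_link_min_win(l), max_win);
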
Fixes: 16ad3f4022bb ("tipc: introduce variable window congestion control") Acked-by: Jon Maloy Signed-off-by: Hoang Huu Le Signed-off-by: Jakub Kicinski --- net/tipc/bcast.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index c77fd13e2777..d4beca895992 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -109,6 +109,7 @@ static void tipc_bcbase_select_primary(struct net *net) struct tipc_bc_base *bb = tipc_bc_base(net); int all_dests = tipc_link_bc_peers(bb->link); int max_win = tipc_link_max_win(bb->link); + int min_win = tipc_link_min_win(bb->link); int i, mtu, prim; bb->primary_bearer = INVALID_BEARER_ID; @@ -124,7 +125,8 @@ static void tipc_bcbase_select_primary(struct net *net) mtu = tipc_bearer_mtu(net, i); if (mtu < tipc_link_mtu(bb->link)) { tipc_link_set_mtu(bb->link, mtu); - tipc_link_set_queue_limits(bb->link, max_win, + tipc_link_set_queue_limits(bb->link, + min_win, max_win); } bb->bcast_support &= tipc_bearer_bcast_support(net, i); @@ -589,7 +591,7 @@ static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); - tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win); + tipc_link_set_queue_limits(l, tipc_link_min_win(l), max_win); tipc_bcast_unlock(net); return 0; } -- cgit v1.2.3 From b38e7819cae946e2edf869e604af1e65a5d241c5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Oct 2020 11:42:00 -0700 Subject: icmp: randomize the global rate limiter Keyu Man reported that the ICMP rate limiter could be used by attackers to get useful signal. Details will be provided in an upcoming academic publication. Our solution is to add some noise, so that the attackers no longer can get help from the predictable token bucket limiter. Fixes: 4cdf507d5452 ("icmp: add a global rate limitation") Signed-off-by: Eric Dumazet Reported-by: Keyu Man Signed-off-by: Jakub Kicinski --- Documentation/networking/ip-sysctl.rst | 4 +++- net/ipv4/icmp.c | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 837d51f9e1fa..25e6673a085a 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1142,13 +1142,15 @@ icmp_ratelimit - INTEGER icmp_msgs_per_sec - INTEGER Limit maximal number of ICMP packets sent per second from this host. Only messages whose type matches icmp_ratemask (see below) are - controlled by this limit. + controlled by this limit. For security reasons, the precise count + of messages per second is randomized. Default: 1000 icmp_msgs_burst - INTEGER icmp_msgs_per_sec controls number of ICMP packets sent per second, while icmp_msgs_burst controls the burst size of these packets. + For security reasons, the precise burst size is randomized. Default: 50 diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 07f67ced962a..005faea415a4 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -239,7 +239,7 @@ static struct { /** * icmp_global_allow - Are we allowed to send one more ICMP message ? * - * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. + * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. 
* Note: called with BH disabled */ @@ -267,7 +267,10 @@ bool icmp_global_allow(void) } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { - credit--; + /* We want to use a credit of one in average, but need to randomize + * it for security reasons. + */ + credit = max_t(int, credit - prandom_u32_max(3), 0); rc = true; } WRITE_ONCE(icmp_global.credit, credit); -- cgit v1.2.3 From f981fc3d515a588c389242b7e3a71487b40571a5 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Sat, 17 Oct 2020 20:24:51 +0200 Subject: net: openvswitch: fix to make sure flow_lookup() is not preempted The flow_lookup() function uses per CPU variables, which must be called with BH disabled. However, this is fine in the general NAPI use case where the local BH is disabled. But, it's also called from the netlink context. The below patch makes sure that even in the netlink path, the BH is disabled. In addition, u64_stats_update_begin() requires a lock to ensure one writer which is not ensured here. Making it per-CPU and disabling NAPI (softirq) ensures that there is always only one writer. Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage") Reported-by: Juri Lelli Signed-off-by: Eelco Chaudron Link: https://lore.kernel.org/r/160295903253.7789.826736662555102345.stgit@ebuild Signed-off-by: Jakub Kicinski --- net/openvswitch/flow_table.c | 58 ++++++++++++++++++++++++++------------------ net/openvswitch/flow_table.h | 8 ++++-- 2 files changed, 41 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 87c286ad660e..f3486a37361a 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -175,7 +175,7 @@ static struct table_instance *table_instance_alloc(int new_size) static void __mask_array_destroy(struct mask_array *ma) { - free_percpu(ma->masks_usage_cntr); + free_percpu(ma->masks_usage_stats); kfree(ma); } @@ -199,15 +199,15 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma) ma->masks_usage_zero_cntr[i] = 0; for_each_possible_cpu(cpu) { - u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, - cpu); + struct mask_array_stats *stats; unsigned int start; u64 counter; + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { - start = u64_stats_fetch_begin_irq(&ma->syncp); - counter = usage_counters[i]; - } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + start = u64_stats_fetch_begin_irq(&stats->syncp); + counter = stats->usage_cntrs[i]; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); ma->masks_usage_zero_cntr[i] += counter; } @@ -230,9 +230,10 @@ static struct mask_array *tbl_mask_array_alloc(int size) sizeof(struct sw_flow_mask *) * size); - new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size, - __alignof__(u64)); - if (!new->masks_usage_cntr) { + new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) + + sizeof(u64) * size, + __alignof__(u64)); + if (!new->masks_usage_stats) { kfree(new); return NULL; } @@ -722,6 +723,8 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti, /* Flow lookup does full lookup on flow table. It starts with * mask from index passed in *index. + * This function MUST be called with BH disabled due to the use + * of CPU specific variables. 
*/ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, @@ -731,7 +734,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, u32 *n_cache_hit, u32 *index) { - u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr); + struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats); struct sw_flow *flow; struct sw_flow_mask *mask; int i; @@ -741,9 +744,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, if (mask) { flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { - u64_stats_update_begin(&ma->syncp); - usage_counters[*index]++; - u64_stats_update_end(&ma->syncp); + u64_stats_update_begin(&stats->syncp); + stats->usage_cntrs[*index]++; + u64_stats_update_end(&stats->syncp); (*n_cache_hit)++; return flow; } @@ -762,9 +765,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl, flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { /* Found */ *index = i; - u64_stats_update_begin(&ma->syncp); - usage_counters[*index]++; - u64_stats_update_end(&ma->syncp); + u64_stats_update_begin(&stats->syncp); + stats->usage_cntrs[*index]++; + u64_stats_update_end(&stats->syncp); return flow; } } @@ -850,9 +853,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; u32 __always_unused n_cache_hit; + struct sw_flow *flow; u32 index = 0; - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); + /* This function gets called trough the netlink interface and therefore + * is preemptible. However, flow_lookup() function needs to be called + * with BH disabled due to CPU specific variables. + */ + local_bh_disable(); + flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); + local_bh_enable(); + return flow; } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, @@ -1109,7 +1120,6 @@ void ovs_flow_masks_rebalance(struct flow_table *table) for (i = 0; i < ma->max; i++) { struct sw_flow_mask *mask; - unsigned int start; int cpu; mask = rcu_dereference_ovsl(ma->masks[i]); @@ -1120,14 +1130,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table) masks_and_count[i].counter = 0; for_each_possible_cpu(cpu) { - u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr, - cpu); + struct mask_array_stats *stats; + unsigned int start; u64 counter; + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { - start = u64_stats_fetch_begin_irq(&ma->syncp); - counter = usage_counters[i]; - } while (u64_stats_fetch_retry_irq(&ma->syncp, start)); + start = u64_stats_fetch_begin_irq(&stats->syncp); + counter = stats->usage_cntrs[i]; + } while (u64_stats_fetch_retry_irq(&stats->syncp, + start)); masks_and_count[i].counter += counter; } diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index d8fb7a3a3dfd..9e659db78c05 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -38,12 +38,16 @@ struct mask_count { u64 counter; }; +struct mask_array_stats { + struct u64_stats_sync syncp; + u64 usage_cntrs[]; +}; + struct mask_array { struct rcu_head rcu; int count, max; - u64 __percpu *masks_usage_cntr; + struct mask_array_stats __percpu *masks_usage_stats; u64 *masks_usage_zero_cntr; - struct u64_stats_sync syncp; struct sw_flow_mask __rcu *masks[]; }; -- cgit v1.2.3 From 0e8b8d6a2d85344d80dda5beadd98f5f86e8d3d3 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 15 Oct 2020 16:26:06 +0000 Subject: net: core: use list_del_init() instead of 
list_del() in netdev_run_todo() dev->unlink_list is reused unless dev is deleted. So, list_del() should not be used. Due to using list_del(), dev->unlink_list can't be reused so that dev->nested_level update logic doesn't work. In order to fix this bug, list_del_init() should be used instead of list_del(). Test commands: ip link add bond0 type bond ip link add bond1 type bond ip link set bond0 master bond1 ip link set bond0 nomaster ip link set bond1 master bond0 ip link set bond1 nomaster Splat looks like: [ 255.750458][ T1030] ============================================ [ 255.751967][ T1030] WARNING: possible recursive locking detected [ 255.753435][ T1030] 5.9.0-rc8+ #772 Not tainted [ 255.754553][ T1030] -------------------------------------------- [ 255.756047][ T1030] ip/1030 is trying to acquire lock: [ 255.757304][ T1030] ffff88811782a280 (&dev_addr_list_lock_key/1){+...}-{2:2}, at: dev_mc_sync_multiple+0xc2/0x150 [ 255.760056][ T1030] [ 255.760056][ T1030] but task is already holding lock: [ 255.761862][ T1030] ffff88811130a280 (&dev_addr_list_lock_key/1){+...}-{2:2}, at: bond_enslave+0x3d4d/0x43e0 [bonding] [ 255.764581][ T1030] [ 255.764581][ T1030] other info that might help us debug this: [ 255.766645][ T1030] Possible unsafe locking scenario: [ 255.766645][ T1030] [ 255.768566][ T1030] CPU0 [ 255.769415][ T1030] ---- [ 255.770259][ T1030] lock(&dev_addr_list_lock_key/1); [ 255.771629][ T1030] lock(&dev_addr_list_lock_key/1); [ 255.772994][ T1030] [ 255.772994][ T1030] *** DEADLOCK *** [ 255.772994][ T1030] [ 255.775091][ T1030] May be due to missing lock nesting notation [ 255.775091][ T1030] [ 255.777182][ T1030] 2 locks held by ip/1030: [ 255.778299][ T1030] #0: ffffffffb1f63250 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x2e4/0x8b0 [ 255.780600][ T1030] #1: ffff88811130a280 (&dev_addr_list_lock_key/1){+...}-{2:2}, at: bond_enslave+0x3d4d/0x43e0 [bonding] [ 255.783411][ T1030] [ 255.783411][ T1030] stack backtrace: [ 255.784874][ T1030] CPU: 7 PID: 1030 Comm: ip Not tainted 5.9.0-rc8+ #772 [ 255.786595][ T1030] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 [ 255.789030][ T1030] Call Trace: [ 255.789850][ T1030] dump_stack+0x99/0xd0 [ 255.790882][ T1030] __lock_acquire.cold.71+0x166/0x3cc [ 255.792285][ T1030] ? register_lock_class+0x1a30/0x1a30 [ 255.793619][ T1030] ? rcu_read_lock_sched_held+0x91/0xc0 [ 255.794963][ T1030] ? rcu_read_lock_bh_held+0xa0/0xa0 [ 255.796246][ T1030] lock_acquire+0x1b8/0x850 [ 255.797332][ T1030] ? dev_mc_sync_multiple+0xc2/0x150 [ 255.798624][ T1030] ? bond_enslave+0x3d4d/0x43e0 [bonding] [ 255.800039][ T1030] ? check_flags+0x50/0x50 [ 255.801143][ T1030] ? lock_contended+0xd80/0xd80 [ 255.802341][ T1030] _raw_spin_lock_nested+0x2e/0x70 [ 255.803592][ T1030] ? dev_mc_sync_multiple+0xc2/0x150 [ 255.804897][ T1030] dev_mc_sync_multiple+0xc2/0x150 [ 255.806168][ T1030] bond_enslave+0x3d58/0x43e0 [bonding] [ 255.807542][ T1030] ? __lock_acquire+0xe53/0x51b0 [ 255.808824][ T1030] ? bond_update_slave_arr+0xdc0/0xdc0 [bonding] [ 255.810451][ T1030] ? check_chain_key+0x236/0x5e0 [ 255.811742][ T1030] ? mutex_is_locked+0x13/0x50 [ 255.812910][ T1030] ? rtnl_is_locked+0x11/0x20 [ 255.814061][ T1030] ? netdev_master_upper_dev_get+0xf/0x120 [ 255.815553][ T1030] do_setlink+0x94c/0x3040 [ ... 
] Reported-by: syzbot+4a0f7bc34e3997a6c7df@syzkaller.appspotmail.com Fixes: 1fc70edb7d7b ("net: core: add nested_level variable in net_device") Signed-off-by: Taehee Yoo Link: https://lore.kernel.org/r/20201015162606.9377-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 751e5264fd49..9499a414d67e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10213,7 +10213,7 @@ void netdev_run_todo(void) struct net_device *dev = list_first_entry(&unlink_list, struct net_device, unlink_list); - list_del(&dev->unlink_list); + list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif -- cgit v1.2.3 From bc7e343dbd4c1a86c490c1d4f08fca9ecdedbeaa Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Fri, 16 Oct 2020 19:16:03 +0200 Subject: net: dsa: tag_ksz: KSZ8795 and KSZ9477 also use tail tags The Marvell 88E6060 uses tag_trailer.c and the KSZ8795, KSZ9477 and KSZ9893 switches also use tail tags. Fixes: 7a6ffe764be3 ("net: dsa: point out the tail taggers") Signed-off-by: Christian Eggers Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20201016171603.10587-1-ceggers@arri.de Signed-off-by: Jakub Kicinski --- net/dsa/tag_ksz.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 945a9bd5ba35..0a5aa982c60d 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -123,6 +123,7 @@ static const struct dsa_device_ops ksz8795_netdev_ops = { .xmit = ksz8795_xmit, .rcv = ksz8795_rcv, .overhead = KSZ_INGRESS_TAG_LEN, + .tail_tag = true, }; DSA_TAG_DRIVER(ksz8795_netdev_ops); @@ -199,6 +200,7 @@ static const struct dsa_device_ops ksz9477_netdev_ops = { .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, .overhead = KSZ9477_INGRESS_TAG_LEN, + .tail_tag = true, }; DSA_TAG_DRIVER(ksz9477_netdev_ops); -- cgit v1.2.3 From df6afe2f7c19349de2ee560dc62ea4d9ad3ff889 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 16 Oct 2020 20:29:14 +0300 Subject: nexthop: Fix performance regression in nexthop deletion While insertion of 16k nexthops all using the same netdev ('dummy10') takes less than a second, deletion takes about 130 seconds: # time -p ip -b nexthop.batch real 0.29 user 0.01 sys 0.15 # time -p ip link set dev dummy10 down real 131.03 user 0.06 sys 0.52 This is because of repeated calls to synchronize_rcu() whenever a nexthop is removed from a nexthop group: # /usr/share/bcc/tools/offcputime -p `pgrep -nx ip` -K ... b'finish_task_switch' b'schedule' b'schedule_timeout' b'wait_for_completion' b'__wait_rcu_gp' b'synchronize_rcu.part.0' b'synchronize_rcu' b'__remove_nexthop' b'remove_nexthop' b'nexthop_flush_dev' b'nh_netdev_event' b'raw_notifier_call_chain' b'call_netdevice_notifiers_info' b'__dev_notify_flags' b'dev_change_flags' b'do_setlink' b'__rtnl_newlink' b'rtnl_newlink' b'rtnetlink_rcv_msg' b'netlink_rcv_skb' b'rtnetlink_rcv' b'netlink_unicast' b'netlink_sendmsg' b'____sys_sendmsg' b'___sys_sendmsg' b'__sys_sendmsg' b'__x64_sys_sendmsg' b'do_syscall_64' b'entry_SYSCALL_64_after_hwframe' - ip (277) 126554955 Since nexthops are always deleted under RTNL, synchronize_net() can be used instead. It will call synchronize_rcu_expedited() which only blocks for several microseconds as opposed to multiple milliseconds like synchronize_rcu(). 
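For reference, synchronize_net() is a thin wrapper (net/core/dev.c, quoted from memory and trimmed) that picks the expedited grace period exactly when RTNL is held, which is what makes it a drop-in replacement here:

	void synchronize_net(void)
	{
		might_sleep();
		if (rtnl_is_locked())
			synchronize_rcu_expedited();
		else
			synchronize_rcu();
	}
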
With this patch, deletion of 16k nexthops takes less than a second: # time -p ip link set dev dummy10 down real 0.12 user 0.00 sys 0.04 Tested with fib_nexthops.sh which includes torture tests that prompted the initial change: # ./fib_nexthops.sh ... Tests passed: 134 Tests failed: 0 Fixes: 90f33bffa382 ("nexthops: don't modify published nexthop groups") Signed-off-by: Ido Schimmel Reviewed-by: Jesse Brandeburg Reviewed-by: David Ahern Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20201016172914.643282-1-idosch@idosch.org Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 8c0f17c6863c..0dc43ad28eb9 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -845,7 +845,7 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ - synchronize_rcu(); + synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) -- cgit v1.2.3 From 79dce09ab02729a90cf6ce49c9aaaf17aa0d21db Mon Sep 17 00:00:00 2001 From: "longguang.yue" Date: Mon, 28 Sep 2020 10:49:38 +0800 Subject: ipvs: adjust the debug info in function set_tcp_state Output client, virtual and destination address info when the TCP state changes, which makes connection debugging clearer. Signed-off-by: longguang.yue Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto_tcp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index dc2e7da2742a..7da51390cea6 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -539,8 +539,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; - IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" - "%s:%d state: %s->%s conn->refcnt:%d\n", + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " + "d:%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), @@ -548,10 +548,12 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', - IP_VS_DBG_ADDR(cp->daf, &cp->daddr), - ntohs(cp->dport), IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->daf, &cp->daddr), + ntohs(cp->dport), tcp_state_name(cp->state), tcp_state_name(new_state), refcount_read(&cp->refcnt)); -- cgit v1.2.3 From 4f25434bccc28cf8a07876ef5142a2869a674353 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Wed, 7 Oct 2020 12:32:52 -0700 Subject: netfilter: conntrack: connection timeout after re-register If the first packet conntrack sees after a re-register is an outgoing keepalive packet with no data (SEG.SEQ = SND.NXT-1), td_end is set to SND.NXT-1. When the peer correctly acknowledges SND.NXT, tcp_in_window fails check III (Upper bound for valid (s)ack: sack <= receiver.td_end) and returns false, which cascades into nf_conntrack_in setting skb->_nfct = 0 and into later conntrack iptables rules not matching. In cases where iptables are dropping packets that do not match conntrack rules, this can result in idle TCP connections timing out.
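To make check III concrete: the empty keepalive carries SEG.SEQ = SND.NXT-1, so conntrack records td_end = SND.NXT-1, and the peer's ACK of SND.NXT is exactly td_end+1. A toy calculation (illustrative numbers only):

	u32 snd_nxt = 1000;
	u32 td_end  = snd_nxt - 1;	/* recorded from the empty keepalive */
	u32 sack    = snd_nxt;		/* peer correctly acks SND.NXT */
	/* check III: sack <= td_end  ->  1000 <= 999  ->  false, so the
	 * reply is declared invalid; the fix below treats
	 * sack == td_end + 1 as a likely keepalive reply and advances
	 * td_end instead */
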
v2: adjust td_end when getting the reply rather than when sending out the keepalive packet. Fixes: f94e63801ab2 ("netfilter: conntrack: reset tcp maxwin on re-register") Signed-off-by: Francesco Ruggeri Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index e8c86ee4c1c4..c8fb2187ad4b 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -541,13 +541,20 @@ static bool tcp_in_window(const struct nf_conn *ct, swin = win << sender->td_scale; sender->td_maxwin = (swin == 0 ? 1 : swin); sender->td_maxend = end + sender->td_maxwin; - /* - * We haven't seen traffic in the other direction yet - * but we have to tweak window tracking to pass III - * and IV until that happens. - */ - if (receiver->td_maxwin == 0) + if (receiver->td_maxwin == 0) { + /* We haven't seen traffic in the other + * direction yet but we have to tweak window + * tracking to pass III and IV until that + * happens. + */ receiver->td_end = receiver->td_maxend = sack; + } else if (sack == receiver->td_end + 1) { + /* Likely a reply to a keepalive. + * Needed for III. + */ + receiver->td_end++; + } + } } else if (((state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL) -- cgit v1.2.3 From 68f9f9c2c3b6a7259f6a92bc26cdc7bd22e7a982 Mon Sep 17 00:00:00 2001 From: Georg Kohmann Date: Tue, 13 Oct 2020 14:23:12 +0200 Subject: netfilter: Drop fragmented ndisc packets assembled in netfilter Fragmented ndisc packets assembled in netfilter are not dropped as specified in RFC 6980, section 5. This behaviour breaks TAHI IPv6 Core Conformance Tests v6LC.2.1.22/23, V6LC.2.2.26/27 and V6LC.2.3.18. Set the IP6SKB_FRAGMENTED flag during reassembly. References: commit b800c3b966bc ("ipv6: drop fragmented ndisc packets by default (RFC 6980)") Signed-off-by: Georg Kohmann Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index fed9666a2f7d..054d287eb13d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -355,6 +355,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, ipv6_hdr(skb)->payload_len = htons(payload_len); ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; /* Yes, and fold redundant checksum back. 8) */ if (skb->ip_summed == CHECKSUM_COMPLETE) -- cgit v1.2.3 From 63137bc5882a1882c553d389fdeeeace86ee1741 Mon Sep 17 00:00:00 2001 From: Timothée COCAULT Date: Wed, 14 Oct 2020 12:36:15 +0000 Subject: netfilter: ebtables: Fixes dropping of small packets in bridge nat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes an error causing small packets to get dropped. skb_ensure_writable expects the second parameter to be a length in the ethernet payload. If we want to write the ethernet header (src, dst), we should pass 0. Otherwise, packets with small payloads (< ETH_ALEN) will get dropped.
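The contract being relied on, roughly: skb_ensure_writable(skb, n) unshares the skb and guarantees that the first n bytes from skb->data are present and writable; in the bridge path skb->data points at the payload, while the Ethernet header sits in front of it and becomes writable as soon as the skb is unshared. Condensed from the ebt_dnat hunk below:

	/* before: demanded ETH_ALEN payload bytes that tiny frames lack */
	if (skb_ensure_writable(skb, ETH_ALEN))
		return EBT_DROP;

	/* after: only the MAC header is rewritten, no payload is needed */
	if (skb_ensure_writable(skb, 0))
		return EBT_DROP;
	ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);
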
Fixes: c1a831167901 ("netfilter: bridge: convert skb_make_writable to skb_ensure_writable") Signed-off-by: Timothée COCAULT Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_dnat.c | 2 +- net/bridge/netfilter/ebt_redirect.c | 2 +- net/bridge/netfilter/ebt_snat.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index 12a4f4d93681..3fda71a8579d 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -21,7 +21,7 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - if (skb_ensure_writable(skb, ETH_ALEN)) + if (skb_ensure_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c index 0cad62a4052b..307790562b49 100644 --- a/net/bridge/netfilter/ebt_redirect.c +++ b/net/bridge/netfilter/ebt_redirect.c @@ -21,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_redirect_info *info = par->targinfo; - if (skb_ensure_writable(skb, ETH_ALEN)) + if (skb_ensure_writable(skb, 0)) return EBT_DROP; if (xt_hooknum(par) != NF_BR_BROUTING) diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c index 27443bf229a3..7dfbcdfc30e5 100644 --- a/net/bridge/netfilter/ebt_snat.c +++ b/net/bridge/netfilter/ebt_snat.c @@ -22,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; - if (skb_ensure_writable(skb, ETH_ALEN * 2)) + if (skb_ensure_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_source, info->mac); -- cgit v1.2.3 From 31cc578ae2de19c748af06d859019dced68e325d Mon Sep 17 00:00:00 2001 From: Saeed Mirzamohammadi Date: Tue, 20 Oct 2020 13:41:36 +0200 Subject: netfilter: nftables_offload: KASAN slab-out-of-bounds Read in nft_flow_rule_create This patch fixes the following issue: BUG: KASAN: slab-out-of-bounds in nft_flow_rule_create+0x622/0x6a2 net/netfilter/nf_tables_offload.c:40 Read of size 8 at addr ffff888103910b58 by task syz-executor227/16244 The error happens when expr->ops is accessed before the boundary check is performed, after nft_expr_next() has moved expr out of bounds. This patch checks the boundary condition before accessing expr->ops, which fixes the slab-out-of-bounds read. Add nft_expr_more() and use it to fix this problem.
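The crux is evaluation order: the bounds test must run before expr->ops is dereferenced, which the offload path got backwards. Condensed from the patch below:

	/* buggy loop in nf_tables_offload.c: ops dereferenced first */
	while (expr->ops && expr != nft_expr_last(rule)) { /* ... */ }

	/* fix: nft_expr_more() checks the bound before touching ops */
	static inline bool nft_expr_more(const struct nft_rule *rule,
					 const struct nft_expr *expr)
	{
		return expr != nft_expr_last(rule) && expr->ops;
	}
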
Signed-off-by: Saeed Mirzamohammadi Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 ++++++ net/netfilter/nf_tables_api.c | 6 +++--- net/netfilter/nf_tables_offload.c | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3f7e56b1171e..55b4cadf290a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -891,6 +891,12 @@ static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule) return (struct nft_expr *)&rule->data[rule->dlen]; } +static inline bool nft_expr_more(const struct nft_rule *rule, + const struct nft_expr *expr) +{ + return expr != nft_expr_last(rule) && expr->ops; +} + static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule) { return (void *)&rule->data[rule->dlen]; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9957e0ed8658..65cb8e3c13d9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -302,7 +302,7 @@ static void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { if (expr->ops->activate) expr->ops->activate(ctx, expr); @@ -317,7 +317,7 @@ static void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { if (expr->ops->deactivate) expr->ops->deactivate(ctx, expr, phase); @@ -3080,7 +3080,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, * is called on error from nf_tables_newrule(). */ expr = nft_expr_first(rule); - while (expr != nft_expr_last(rule) && expr->ops) { + while (nft_expr_more(rule, expr)) { next = nft_expr_next(expr); nf_tables_expr_destroy(ctx, expr); expr = next; diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 7c7e06624dc3..9f625724a20f 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -37,7 +37,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, struct nft_expr *expr; expr = nft_expr_first(rule); - while (expr->ops && expr != nft_expr_last(rule)) { + while (nft_expr_more(rule, expr)) { if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) num_actions++; @@ -61,7 +61,7 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; - while (expr->ops && expr != nft_expr_last(rule)) { + while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; -- cgit v1.2.3 From 47b5d2a107396ab05e83a4dfbd30b563ecbc83af Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Mon, 19 Oct 2020 12:02:44 +0300 Subject: net/sched: act_ct: Fix adding udp port mangle operation Need to use the udp header type and not tcp. 
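Why the constant matters even though the port offsets coincide for TCP and UDP: offload drivers dispatch on the mangle header type to decide which header to rewrite, so advertising a TCP type for a UDP flow points the hardware at the wrong header. Schematically (field names as in struct flow_action_entry, include/net/flow_offload.h; simplified from the helper the patch touches):

	entry->id            = FLOW_ACTION_MANGLE;
	entry->mangle.htype  = FLOW_ACT_MANGLE_HDR_TYPE_UDP;	/* was ..._TCP */
	entry->mangle.offset = offsetof(struct udphdr, source);
	entry->mangle.mask   = 0xFFFF;
	entry->mangle.val    = be16_to_cpu(target_src);
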
Fixes: 9c26ba9b1f45 ("net/sched: act_ct: Instantiate flow table entry actions") Signed-off-by: Roi Dayan Reviewed-by: Paul Blakey Link: https://lore.kernel.org/r/20201019090244.3015186-1-roid@nvidia.com Signed-off-by: Jakub Kicinski --- net/sched/act_ct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 9c79fb92c2da..aba3cd85f284 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -156,11 +156,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple, __be16 target_dst = target.dst.u.udp.port; if (target_src != tuple->src.u.udp.port) - tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP, offsetof(struct udphdr, source), 0xFFFF, be16_to_cpu(target_src)); if (target_dst != tuple->dst.u.udp.port) - tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, + tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP, offsetof(struct udphdr, dest), 0xFFFF, be16_to_cpu(target_dst)); } -- cgit v1.2.3 From fe2d9b1a0e7805384770ec0ddd34c9f1e9fe6fa8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 19 Oct 2020 18:23:15 +0800 Subject: mptcp: initialize mptcp_options_received's ahmac Initialize mptcp_options_received's ahmac to zero, otherwise it will be a random number when receiving ADD_ADDR suboption with echo-flag=1. Fixes: 3df523ab582c5 ("mptcp: Add ADD_ADDR handling") Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 092a2d48bfd3..1ff3469938b6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -297,6 +297,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_capable = 0; mp_opt->mp_join = 0; mp_opt->add_addr = 0; + mp_opt->ahmac = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; -- cgit v1.2.3 From 65b8c8a620a390a901522f19beed1476e2274feb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 19 Oct 2020 18:23:16 +0800 Subject: mptcp: move mptcp_options_received's port initialization Move mptcp_options_received's port initialization from mptcp_parse_option to mptcp_get_options, put it together with the other fields initializations of mptcp_options_received. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1ff3469938b6..a044dd43411d 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -241,7 +241,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, } mp_opt->add_addr = 1; - mp_opt->port = 0; mp_opt->addr_id = *ptr++; pr_debug("ADD_ADDR: id=%d, echo=%d", mp_opt->addr_id, mp_opt->echo); if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { @@ -298,6 +297,7 @@ void mptcp_get_options(const struct sk_buff *skb, mp_opt->mp_join = 0; mp_opt->add_addr = 0; mp_opt->ahmac = 0; + mp_opt->port = 0; mp_opt->rm_addr = 0; mp_opt->dss = 0; -- cgit v1.2.3 From b142083b585c2c03af24cca4d274f797796a4064 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 19 Oct 2020 13:32:40 +0200 Subject: mptcp: MPTCP_KUNIT_TESTS should depend on MPTCP instead of selecting it MPTCP_KUNIT_TESTS selects MPTCP, thus enabling an optional feature the user may not want to enable. Fix this by making the test depend on MPTCP instead. 
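The Kconfig rule at play: "select" forces a symbol on regardless of its own dependencies, while a dependency merely hides the option until the prerequisite is enabled. The net effect of the patch below is equivalent to this illustrative fragment (the actual diff instead moves the enclosing "if MPTCP ... endif" block to cover the tests):

	config MPTCP_KUNIT_TESTS
		tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS
		depends on MPTCP && KUNIT
		default KUNIT_ALL_TESTS
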
Fixes: a00a582203dbc43e ("mptcp: move crypto test to KUNIT") Signed-off-by: Geert Uytterhoeven Reviewed-by: Matthieu Baerts Link: https://lore.kernel.org/r/20201019113240.11516-1-geert@linux-m68k.org Signed-off-by: Jakub Kicinski --- net/mptcp/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index 698bc3525160..abb0a992d4a0 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -22,11 +22,8 @@ config MPTCP_IPV6 select IPV6 default y -endif - config MPTCP_KUNIT_TESTS tristate "This builds the MPTCP KUnit tests" if !KUNIT_ALL_TESTS - select MPTCP depends on KUNIT default KUNIT_ALL_TESTS help @@ -39,3 +36,4 @@ config MPTCP_KUNIT_TESTS If unsure, say N. +endif -- cgit v1.2.3 From 280e3ebdafb863b3cb50d5842f056267e15bf40c Mon Sep 17 00:00:00 2001 From: Defang Bo Date: Mon, 19 Oct 2020 19:38:58 +0800 Subject: nfc: Ensure presence of NFC_ATTR_FIRMWARE_NAME attribute in nfc_genl_fw_download() Check that the NFC_ATTR_FIRMWARE_NAME attributes are provided by the netlink client prior to accessing them. This prevents potential unhandled NULL pointer dereference exceptions which can be triggered by malicious user-mode programs, if they omit one or both of these attributes. Similar to commit a0323b979f81 ("nfc: Ensure presence of required attributes in the activate_target handler"). Fixes: 9674da8759df ("NFC: Add firmware upload netlink command") Signed-off-by: Defang Bo Link: https://lore.kernel.org/r/1603107538-4744-1-git-send-email-bodefang@126.com Signed-off-by: Jakub Kicinski --- net/nfc/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index e894254c17d4..8709f3d4e7c4 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1217,7 +1217,7 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) u32 idx; char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; - if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) + if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); -- cgit v1.2.3 From 010b430d5df556d5d232e3751ac691ba9e88c041 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 20 Oct 2020 09:38:39 +0200 Subject: mptcp: MPTCP_IPV6 should depend on IPV6 instead of selecting it MPTCP_IPV6 selects IPV6, thus enabling an optional feature the user may not want to enable. Fix this by making MPTCP_IPV6 depend on IPV6, as is done for all other IPv6 features. Fixes: f870fa0b5768842c ("mptcp: Add MPTCP socket stubs") Signed-off-by: Geert Uytterhoeven Reviewed-by: Matthieu Baerts Link: https://lore.kernel.org/r/20201020073839.29226-1-geert@linux-m68k.org Signed-off-by: Jakub Kicinski --- net/mptcp/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index abb0a992d4a0..8936604b3bf9 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -19,7 +19,7 @@ config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" - select IPV6 + depends on IPV6 default y config MPTCP_KUNIT_TESTS -- cgit v1.2.3 From b130762161374b1ef31549bef8ebd4abeb998d94 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 20 Oct 2020 17:34:31 +0200 Subject: net/sched: act_gate: Unlock ->tcfa_lock in tc_setup_flow_action() We need to jump to the "err_out_locked" label when tcf_gate_get_entries() fails.
Otherwise, tc_setup_flow_action() exits with ->tcfa_lock still held. Fixes: d29bdd69ecdd ("net: schedule: add action gate offloading") Signed-off-by: Guillaume Nault Acked-by: Cong Wang Link: https://lore.kernel.org/r/12f60e385584c52c22863701c0185e40ab08a7a7.1603207948.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/cls_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 41a55c6cbeb8..faeabff283a2 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -3712,7 +3712,7 @@ int tc_setup_flow_action(struct flow_action *flow_action, entry->gate.num_entries = tcf_gate_num_entries(act); err = tcf_gate_get_entries(entry, act); if (err) - goto err_out; + goto err_out_locked; } else { err = -EOPNOTSUPP; goto err_out_locked; -- cgit v1.2.3 From a7a12b5a0f950bc6b9f7153390634ea798738db9 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 21 Oct 2020 00:02:40 +0200 Subject: net/sched: act_tunnel_key: fix OOB write in case of IPv6 ERSPAN tunnels the following command # tc action add action tunnel_key \ > set src_ip 2001:db8::1 dst_ip 2001:db8::2 id 10 erspan_opts 1:6789:0:0 generates the following splat: BUG: KASAN: slab-out-of-bounds in tunnel_key_copy_opts+0xcc9/0x1010 [act_tunnel_key] Write of size 4 at addr ffff88813f5f1cc8 by task tc/873 CPU: 2 PID: 873 Comm: tc Not tainted 5.9.0+ #282 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 Call Trace: dump_stack+0x99/0xcb print_address_description.constprop.7+0x1e/0x230 kasan_report.cold.13+0x37/0x7c tunnel_key_copy_opts+0xcc9/0x1010 [act_tunnel_key] tunnel_key_init+0x160c/0x1f40 [act_tunnel_key] tcf_action_init_1+0x5b5/0x850 tcf_action_init+0x15d/0x370 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x29b/0x3a0 rtnetlink_rcv_msg+0x341/0x8d0 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x719/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5ba/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f872a96b338 Code: 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 43 2c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 41 89 d4 55 RSP: 002b:00007ffffe367518 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 000000005f8f5aed RCX: 00007f872a96b338 RDX: 0000000000000000 RSI: 00007ffffe367580 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000000001c R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000686760 R14: 0000000000000601 R15: 0000000000000000 Allocated by task 873: kasan_save_stack+0x19/0x40 __kasan_kmalloc.constprop.7+0xc1/0xd0 __kmalloc+0x151/0x310 metadata_dst_alloc+0x20/0x40 tunnel_key_init+0xfff/0x1f40 [act_tunnel_key] tcf_action_init_1+0x5b5/0x850 tcf_action_init+0x15d/0x370 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x29b/0x3a0 rtnetlink_rcv_msg+0x341/0x8d0 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x719/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5ba/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff88813f5f1c00 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 200 bytes inside of 256-byte region [ffff88813f5f1c00, ffff88813f5f1d00) The buggy address belongs to the page: 
page:0000000011b48a19 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x13f5f0 head:0000000011b48a19 order:1 compound_mapcount:0 flags: 0x17ffffc0010200(slab|head) raw: 0017ffffc0010200 0000000000000000 0000000d00000001 ffff888107c43400 raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88813f5f1b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88813f5f1c00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >ffff88813f5f1c80: 00 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc ^ ffff88813f5f1d00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88813f5f1d80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc When using IPv6 tunnels, act_tunnel_key allocates a fixed amount of memory for the tunnel metadata, but then expects additional bytes to store tunnel-specific metadata with tunnel_key_copy_opts(). Fix the arguments of __ipv6_tun_set_dst(), so that 'md_size' contains the size previously computed by tunnel_key_get_opts_len(), as is done for IPv4 tunnels. Fixes: 0ed5269f9e41 ("net/sched: add tunnel option support to act_tunnel_key") Reported-by: Shuang Li Signed-off-by: Davide Caratti Acked-by: Cong Wang Link: https://lore.kernel.org/r/36ebe969f6d13ff59912d6464a4356fe6f103766.1603231100.git.dcaratti@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/act_tunnel_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index a229751ee8c4..85c0d0d5b9da 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -459,7 +459,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port, 0, flags, - key_id, 0); + key_id, opts_len); } else { NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst"); ret = -EINVAL; -- cgit v1.2.3 From b7c24497baeaf21172b447f7cca36b0e99bd11e3 Mon Sep 17 00:00:00 2001 From: Alexander Ovechkin Date: Tue, 20 Oct 2020 14:43:33 +0300 Subject: mpls: load mpls_gso after mpls_iptunnel mpls_iptunnel is used only for mpls encapsulation, and if the encapsulated packet is larger than the MTU we need mpls_gso for segmentation. Signed-off-by: Alexander Ovechkin Acked-by: Dmitry Yakunin Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20201020114333.26866-1-ovov@yandex-team.ru Signed-off-by: Jakub Kicinski --- net/mpls/mpls_iptunnel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 2def85718d94..ef59e25dc482 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -300,5 +300,6 @@ static void __exit mpls_iptunnel_exit(void) module_exit(mpls_iptunnel_exit); MODULE_ALIAS_RTNL_LWT(MPLS); +MODULE_SOFTDEP("post: mpls_gso"); MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 0ed37ac586c01fd5bf3f7559de79f1d621ccf192 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 21 Oct 2020 12:51:53 +0200 Subject: mptcp: depends on IPV6 but not as a module Like TCP, MPTCP cannot be compiled as a module. Obviously, MPTCP's IPv6 support also depends on CONFIG_IPV6. But not all functions from the IPv6 code are exported. To simplify the code and reduce modifications outside MPTCP, it was decided from the beginning to support MPTCP with IPv6 only if CONFIG_IPV6 was built-in. That's also why CONFIG_MPTCP_IPV6 was created.
More modifications are needed to support CONFIG_IPV6=m. Even if it was not explicit, until recently, we were forcing CONFIG_IPV6 to be built-in because we had "select IPV6" in Kconfig. Now that we have "depends on IPV6", we have to explicitly set "IPV6=y" to force CONFIG_IPV6 not to be built as a module. In other words, we can now only have CONFIG_MPTCP_IPV6=y if CONFIG_IPV6=y. Note that the new dependency might hide the fact IPv6 is not supported in MPTCP even if we have CONFIG_IPV6=m. But selecting IPV6 like we did before was forcing it to be built-in while it was maybe not what the user wants. Reported-by: Geert Uytterhoeven Fixes: 010b430d5df5 ("mptcp: MPTCP_IPV6 should depend on IPV6 instead of selecting it") Signed-off-by: Matthieu Baerts Link: https://lore.kernel.org/r/20201021105154.628257-1-matthieu.baerts@tessares.net Signed-off-by: Jakub Kicinski --- net/mptcp/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig index 8936604b3bf9..a014149aa323 100644 --- a/net/mptcp/Kconfig +++ b/net/mptcp/Kconfig @@ -19,7 +19,7 @@ config INET_MPTCP_DIAG config MPTCP_IPV6 bool "MPTCP: IPv6 support for Multipath TCP" - depends on IPV6 + depends on IPV6=y default y config MPTCP_KUNIT_TESTS -- cgit v1.2.3 From ba452c9e996d8a4c347b32805f91abb70de5de7e Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 20 Oct 2020 23:25:56 +0200 Subject: bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on the discussion in [0], update the bpf_redirect_neigh() helper to accept an optional parameter specifying the nexthop information. This makes it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without incurring a duplicate FIB lookup - since the FIB lookup helper will return the nexthop information even if no neighbour is present, this can simply be passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns BPF_FIB_LKUP_RET_NO_NEIGH. Thus fix & extend it before helper API is frozen. [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/ Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/160322915615.32199.1187570224032024535.stgit@toke.dk --- include/linux/filter.h | 9 +++ include/uapi/linux/bpf.h | 22 ++++-- net/core/filter.c | 158 ++++++++++++++++++++++++++--------------- scripts/bpf_helpers_doc.py | 1 + tools/include/uapi/linux/bpf.h | 22 ++++-- 5 files changed, 145 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/include/linux/filter.h b/include/linux/filter.h index 20fc24c9779a..72d62cbc1578 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -607,12 +607,21 @@ struct bpf_skb_data_end { void *data_end; }; +struct bpf_nh_params { + u32 nh_family; + union { + u32 ipv4_nh; + struct in6_addr ipv6_nh; + }; +}; + struct bpf_redirect_info { u32 flags; u32 tgt_index; void *tgt_value; struct bpf_map *map; u32 kern_flags; + struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf5a99d803e4..e6ceac3f7d62 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. 
* - * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) * Description * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it * populates L2 addresses as well, meaning, internally, the helper - * performs a FIB lookup based on the skb's networking header to - * get the address of the next hop and then relies on the neighbor - * lookup for the L2 address of the nexthop. + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. * * The *flags* argument is reserved and must be 0. The helper is * currently only supported for tc BPF program types, and enabled @@ -4906,6 +4910,16 @@ struct bpf_fib_lookup { __u8 dmac[6]; /* ETH_ALEN */ }; +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ diff --git a/net/core/filter.c b/net/core/filter.c index c5e2a1c5fd8d..6d0fa65a4a46 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2165,12 +2165,12 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, } #if IS_ENABLED(CONFIG_IPV6) -static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *nexthop; + struct dst_entry *dst = NULL; struct neighbour *neigh; if (dev_xmit_recursion()) { @@ -2196,8 +2196,13 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), - &ipv6_hdr(skb)->daddr); + if (!nh) { + dst = skb_dst(skb); + nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), + &ipv6_hdr(skb)->daddr); + } else { + nexthop = &nh->ipv6_nh; + } neigh = ip_neigh_gw6(dev, nexthop); if (likely(!IS_ERR(neigh))) { int ret; @@ -2210,36 +2215,43 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) return ret; } rcu_read_unlock_bh(); - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + if (dst) + IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; } -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct dst_entry *dst; - struct flowi6 fl6 = { - .flowi6_flags = FLOWI_FLAG_ANYSRC, - .flowi6_mark = skb->mark, - .flowlabel = ip6_flowinfo(ip6h), - .flowi6_oif = dev->ifindex, - .flowi6_proto = ip6h->nexthdr, - .daddr = ip6h->daddr, - .saddr = 
ip6h->saddr, - }; - dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); - if (IS_ERR(dst)) - goto out_drop; + if (!nh) { + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowi6_mark = skb->mark, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_oif = dev->ifindex, + .flowi6_proto = ip6h->nexthdr, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + }; + + dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); + if (IS_ERR(dst)) + goto out_drop; - skb_dst_set(skb, dst); + skb_dst_set(skb, dst); + } else if (nh->nh_family != AF_INET6) { + goto out_drop; + } - err = bpf_out_neigh_v6(net, skb); + err = bpf_out_neigh_v6(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2252,7 +2264,8 @@ out_xmit: return ret; } #else -static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; @@ -2260,11 +2273,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) #endif /* CONFIG_IPV6 */ #if IS_ENABLED(CONFIG_INET) -static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) +static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, + struct net_device *dev, struct bpf_nh_params *nh) { - struct dst_entry *dst = skb_dst(skb); - struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; u32 hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; @@ -2292,7 +2303,21 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) } rcu_read_lock_bh(); - neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + if (!nh) { + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = container_of(dst, struct rtable, dst); + + neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); + } else if (nh->nh_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &nh->ipv6_nh); + is_v6gw = true; + } else if (nh->nh_family == AF_INET) { + neigh = ip_neigh_gw4(dev, nh->ipv4_nh); + } else { + rcu_read_unlock_bh(); + goto out_drop; + } + if (likely(!IS_ERR(neigh))) { int ret; @@ -2309,33 +2334,37 @@ out_drop: return -ENETDOWN; } -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { const struct iphdr *ip4h = ip_hdr(skb); struct net *net = dev_net(dev); int err, ret = NET_XMIT_DROP; - struct rtable *rt; - struct flowi4 fl4 = { - .flowi4_flags = FLOWI_FLAG_ANYSRC, - .flowi4_mark = skb->mark, - .flowi4_tos = RT_TOS(ip4h->tos), - .flowi4_oif = dev->ifindex, - .flowi4_proto = ip4h->protocol, - .daddr = ip4h->daddr, - .saddr = ip4h->saddr, - }; - rt = ip_route_output_flow(net, &fl4, NULL); - if (IS_ERR(rt)) - goto out_drop; - if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { - ip_rt_put(rt); - goto out_drop; - } + if (!nh) { + struct flowi4 fl4 = { + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_oif = dev->ifindex, + .flowi4_proto = ip4h->protocol, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + struct rtable *rt; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto out_drop; + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto out_drop; + } - skb_dst_set(skb, &rt->dst); + skb_dst_set(skb, &rt->dst); + } - err = bpf_out_neigh_v4(net, 
skb); + err = bpf_out_neigh_v4(net, skb, dev, nh); if (unlikely(net_xmit_eval(err))) dev->stats.tx_errors++; else @@ -2348,14 +2377,16 @@ out_xmit: return ret; } #else -static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { kfree_skb(skb); return NET_XMIT_DROP; } #endif /* CONFIG_INET */ -static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) +static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev, + struct bpf_nh_params *nh) { struct ethhdr *ethh = eth_hdr(skb); @@ -2370,9 +2401,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) skb_reset_network_header(skb); if (skb->protocol == htons(ETH_P_IP)) - return __bpf_redirect_neigh_v4(skb, dev); + return __bpf_redirect_neigh_v4(skb, dev, nh); else if (skb->protocol == htons(ETH_P_IPV6)) - return __bpf_redirect_neigh_v6(skb, dev); + return __bpf_redirect_neigh_v6(skb, dev, nh); out: kfree_skb(skb); return -ENOTSUPP; @@ -2382,7 +2413,8 @@ out: enum { BPF_F_NEIGH = (1ULL << 1), BPF_F_PEER = (1ULL << 2), -#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) + BPF_F_NEXTHOP = (1ULL << 3), +#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP) }; BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) @@ -2455,7 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb) return -EAGAIN; } return flags & BPF_F_NEIGH ? - __bpf_redirect_neigh(skb, dev) : + __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ? + &ri->nh : NULL) : __bpf_redirect(skb, dev, flags); out_drop: kfree_skb(skb); @@ -2504,16 +2537,21 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) +BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params, + int, plen, u64, flags) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - if (unlikely(flags)) + if (unlikely((plen && plen < sizeof(*params)) || flags)) return TC_ACT_SHOT; - ri->flags = BPF_F_NEIGH; + ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0); ri->tgt_index = ifindex; + BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params)); + if (plen) + memcpy(&ri->nh, params, sizeof(ri->nh)); + return TC_ACT_REDIRECT; } @@ -2522,7 +2560,9 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 7d86fdd190be..6769caae142f 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -453,6 +453,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', 'struct bpf_sk_lookup', 'struct bpf_sock', 'struct bpf_sock_addr', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bf5a99d803e4..e6ceac3f7d62 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3677,15 +3677,19 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. 
 *
- * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
 *	Description
 *		Redirect the packet to another net device of index *ifindex*
 *		and fill in L2 addresses from neighboring subsystem. This helper
 *		is somewhat similar to **bpf_redirect**\ (), except that it
 *		populates L2 addresses as well, meaning, internally, the helper
- *		performs a FIB lookup based on the skb's networking header to
- *		get the address of the next hop and then relies on the neighbor
- *		lookup for the L2 address of the nexthop.
+ *		relies on the neighbor lookup for the L2 address of the nexthop.
+ *
+ *		The helper will perform a FIB lookup based on the skb's
+ *		networking header to get the address of the next hop, unless
+ *		this is supplied by the caller in the *params* argument. The
+ *		*plen* argument indicates the len of *params* and should be set
+ *		to 0 if *params* is NULL.
 *
 *		The *flags* argument is reserved and must be 0. The helper is
 *		currently only supported for tc BPF program types, and enabled
@@ -4906,6 +4910,16 @@ struct bpf_fib_lookup {
 	__u8	dmac[6];	/* ETH_ALEN */
 };
 
+struct bpf_redir_neigh {
+	/* network family for lookup (AF_INET, AF_INET6) */
+	__u32 nh_family;
+	/* network address of nexthop; skips fib lookup to find gateway */
+	union {
+		__be32		ipv4_nh;
+		__u32		ipv6_nh[4];  /* in6_addr; network order */
+	};
+};
+
 enum bpf_task_fd_type {
	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
--
cgit v1.2.3
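A minimal tc BPF usage sketch of the extended helper, for context: this
is illustrative only and assumes a libbpf-style build against the
updated uapi header above; the egress ifindex (2) and the gateway
address are made-up example values.

/* Redirect every packet out of ifindex 2 via an explicit IPv4 gateway,
 * skipping the per-packet FIB lookup in the kernel.
 */
#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#ifndef AF_INET
#define AF_INET 2
#endif

SEC("classifier")
int redirect_via_gw(struct __sk_buff *skb)
{
	struct bpf_redir_neigh nh = {
		.nh_family = AF_INET,
		.ipv4_nh = bpf_htonl(0xc0a80101), /* 192.168.1.1, example */
	};

	/* A non-zero plen selects the supplied nexthop (BPF_F_NEXTHOP
	 * internally); passing (NULL, 0) instead keeps the old behavior
	 * of resolving the nexthop via a FIB lookup on the skb headers.
	 */
	return bpf_redirect_neigh(2, &nh, sizeof(nh), 0);
}

char _license[] SEC("license") = "GPL";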
From ebfe3c5183733f784264450a41646a482f964e5e Mon Sep 17 00:00:00 2001
From: Di Zhu
Date: Wed, 21 Oct 2020 10:00:53 +0800
Subject: rtnetlink: fix data overflow in rtnl_calcit()

The "ip addr show" command fails when a physical network card has a
large number of VFs.

The return value of if_nlmsg_size() in rtnl_calcit() exceeds the range
of the u16 data type when a network card has a large number of VFs,
because rtnl_vfinfo_size() significantly increases the needed dump size
as num_vfs grows. Due to the overflow we end up with a wrong value of
min_ifinfo_dump_size, which determines the memory size needed by the
netlink dump, and netlink_dump() returns -EMSGSIZE because not enough
memory was allocated.

So fix it by promoting the min_dump_alloc data type to u32 to avoid
overflowing the whole netlink message size. This also aligns it with
the data type of struct netlink_callback{}.min_dump_alloc, which is
assigned from the return value of rtnl_calcit().

Signed-off-by: Di Zhu
Link: https://lore.kernel.org/r/20201021020053.1401-1-zhudi21@huawei.com
Signed-off-by: Jakub Kicinski
---
 include/linux/netlink.h |  2 +-
 net/core/rtnetlink.c    | 13 ++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'net')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 666cd0390699..9f118771e248 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -240,7 +240,7 @@ struct netlink_dump_control {
 	int (*done)(struct netlink_callback *);
 	void *data;
 	struct module *module;
-	u16 min_dump_alloc;
+	u32 min_dump_alloc;
 };
 
 int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 68e0682450c6..7d7223691783 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3709,13 +3709,13 @@ static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
 }
 
-static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	struct net *net = sock_net(skb->sk);
-	struct net_device *dev;
+	size_t min_ifinfo_dump_size = 0;
 	struct nlattr *tb[IFLA_MAX+1];
 	u32 ext_filter_mask = 0;
-	u16 min_ifinfo_dump_size = 0;
+	struct net_device *dev;
 	int hdrlen;
 
 	/* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
@@ -3735,9 +3735,8 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
 	 */
 	rcu_read_lock();
 	for_each_netdev_rcu(net, dev) {
-		min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
-					     if_nlmsg_size(dev,
-							   ext_filter_mask));
+		min_ifinfo_dump_size = max(min_ifinfo_dump_size,
+					   if_nlmsg_size(dev, ext_filter_mask));
 	}
 	rcu_read_unlock();
 
@@ -5494,7 +5493,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
 		struct sock *rtnl;
 		rtnl_dumpit_func dumpit;
-		u16 min_dump_alloc = 0;
+		u32 min_dump_alloc = 0;
 
 		link = rtnl_get_link(family, type);
 		if (!link || !link->dumpit) {
--
cgit v1.2.3
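The truncation this change removes is easy to demonstrate in isolation
with a stand-alone user-space sketch; the dump size below is a
hypothetical if_nlmsg_size() result for a NIC with many VFs.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	size_t needed = 70000;		/* example per-device dump size */

	uint16_t old_alloc = needed;	/* old u16 field: silently wraps */
	uint32_t new_alloc = needed;	/* new u32 field: value preserved */

	printf("u16 min_dump_alloc: %u\n", old_alloc);	/* 4464 */
	printf("u32 min_dump_alloc: %u\n", new_alloc);	/* 70000 */
	return 0;
}

With the u16 field, netlink_dump() sizes its buffer from the wrapped
value (70000 mod 65536 = 4464), far too small to hold even a single
interface's dump, hence the -EMSGSIZE.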
From c77761c8a59405cb7aa44188b30fffe13fbdd02d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso
Date: Wed, 21 Oct 2020 12:55:52 +0200
Subject: netfilter: nf_fwd_netdev: clear timestamp in forwarding path

Similar to 7980d2eabde8 ("ipvs: clear skb->tstamp in forwarding path"):
the fq qdisc requires skb->tstamp to be cleared in the forwarding path.

Fixes: 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths")
Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC")
Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.")
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_dup_netdev.c  | 1 +
 net/netfilter/nft_fwd_netdev.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 2b01a151eaa8..a579e59ee5c5 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -19,6 +19,7 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
 	skb_push(skb, skb->mac_len);
 
 	skb->dev = dev;
+	skb->tstamp = 0;
 	dev_queue_xmit(skb);
 }
 
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 3087e23297db..b77985986b24 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -138,6 +138,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
 		return;
 
 	skb->dev = dev;
+	skb->tstamp = 0;
 	neigh_xmit(neigh_table, dev, addr, skb);
 out:
 	regs->verdict.code = verdict;
--
cgit v1.2.3

From 700465fd338fe5df08a1b2e27fa16981f562547f Mon Sep 17 00:00:00 2001
From: Ke Li
Date: Thu, 22 Oct 2020 02:41:46 -0400
Subject: net: Properly typecast int values to set sk_max_pacing_rate

In setsockopt(SO_MAX_PACING_RATE) on 64-bit systems, sk_max_pacing_rate,
after being extended from 'u32' to 'unsigned long', takes an
unintentionally inflated value whenever it is assigned from an 'int'
value with MSB=1, due to binary sign extension when promoting s32 to
u64; e.g. 0x80000000 becomes 0xFFFFFFFF80000000.

The inflated sk_max_pacing_rate causes a subsequent getsockopt to
return ~0U unexpectedly. It may also result in an increased pacing
rate.

Fix by explicitly casting the 'int' value to 'unsigned int' before
assigning it to sk_max_pacing_rate, so that zero extension happens
instead.

Fixes: 76a9ebe811fb ("net: extend sk_pacing_rate to unsigned long")
Signed-off-by: Ji Li
Signed-off-by: Ke Li
Reviewed-by: Eric Dumazet
Link: https://lore.kernel.org/r/20201022064146.79873-1-keli@akamai.com
Signed-off-by: Jakub Kicinski
---
 net/core/filter.c | 3 ++-
 net/core/sock.c   | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index 6d0fa65a4a46..2ca5eecebacf 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4733,7 +4733,8 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
 			cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE,
 				SK_PACING_NEEDED);
-			sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
+			sk->sk_max_pacing_rate = (val == ~0U) ?
+						 ~0UL : (unsigned int)val;
 			sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 						 sk->sk_max_pacing_rate);
 			break;
diff --git a/net/core/sock.c b/net/core/sock.c
index 4e8729357122..727ea1cc633c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1163,7 +1163,7 @@ set_sndbuf:
 
 	case SO_MAX_PACING_RATE:
 		{
-		unsigned long ulval = (val == ~0U) ? ~0UL : val;
+		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
 
 		if (sizeof(ulval) != sizeof(val) &&
 		    optlen >= sizeof(ulval) &&
--
cgit v1.2.3
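The promotion behavior being fixed can be reproduced with a few lines
of stand-alone C on a 64-bit system (illustrative only, mirroring the
casts in the patch):

#include <stdio.h>

int main(void)
{
	int val = (int)0x80000000;	/* MSB set, as given to setsockopt() */

	unsigned long sign_ext = val;			/* old: s32 -> u64 */
	unsigned long zero_ext = (unsigned int)val;	/* new: u32 -> u64 */

	printf("without cast: %#lx\n", sign_ext);	/* 0xffffffff80000000 */
	printf("with cast:    %#lx\n", zero_ext);	/* 0x80000000 */
	return 0;
}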
From 18ded910b589839e38a51623a179837ab4cc3789 Mon Sep 17 00:00:00 2001
From: Neal Cardwell
Date: Thu, 22 Oct 2020 10:33:31 -0400
Subject: tcp: fix to update snd_wl1 in bulk receiver fast path

In the header prediction fast path for a bulk data receiver, if no
data is newly acknowledged then we do not call tcp_ack() and do not
call tcp_ack_update_window().

This means that a bulk receiver that receives large amounts of data
can have the incoming sequence numbers wrap, so that the check in
tcp_may_update_window fails:

   after(ack_seq, tp->snd_wl1)

If the incoming receive windows are zero in this state, and then the
connection that was a bulk data receiver later wants to send data,
that connection can find itself persistently rejecting the window
updates in incoming ACKs. This means the connection can persistently
fail to discover that the receive window has opened, which in turn
means that the connection is unable to send anything, and the
connection's sending process can get permanently "stuck".

The fix is to update snd_wl1 in the header prediction fast path for a
bulk data receiver, so that it keeps up and does not see wrapping
problems.

This fix is based on a very nice and thorough analysis and diagnosis
by Apollon Oikonomopoulos (see link below).

This is a stable candidate but there is no Fixes tag here since the
bug predates current git history. Just for fun: looks like the bug
dates back to when header prediction was added in Linux v2.1.8 in Nov
1996. In that version tcp_rcv_established() was added, and the code
only updates snd_wl1 in tcp_ack(), and in the new "Bulk data transfer:
receiver" code path it does not call tcp_ack(). This fix seems to
apply cleanly at least as far back as v3.2.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Neal Cardwell
Reported-by: Apollon Oikonomopoulos
Tested-by: Apollon Oikonomopoulos
Link: https://www.spinics.net/lists/netdev/msg692430.html
Acked-by: Soheil Hassas Yeganeh
Acked-by: Yuchung Cheng
Signed-off-by: Eric Dumazet
Link: https://lore.kernel.org/r/20201022143331.1887495-1-ncardwell.kernel@gmail.com
Signed-off-by: Jakub Kicinski
---
 net/ipv4/tcp_input.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 67f10d3ec240..fc445833b5e5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5827,6 +5827,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 			tcp_data_snd_check(sk);
 			if (!inet_csk_ack_scheduled(sk))
 				goto no_ack;
+		} else {
+			tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
 		}
 
 		__tcp_ack_snd_check(sk, 0);
--
cgit v1.2.3
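The wrapped comparison at the heart of this bug can be modeled in user
space. A short sketch with made-up sequence numbers, using the same
serial-number arithmetic as the kernel's after() macro:

#include <stdio.h>
#include <stdint.h>

/* Equivalent of the kernel's after(): wrap-aware sequence compare */
static int after(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) > 0;
}

int main(void)
{
	uint32_t snd_wl1 = 0x10000000;			/* last window update */
	uint32_t ack_seq = snd_wl1 + 0x90000000u;	/* > 2^31 bytes later */

	/* Once more than 2^31 sequence numbers pass without snd_wl1
	 * being refreshed, the comparison points the "wrong way" and
	 * tcp_may_update_window() starts rejecting valid updates.
	 */
	printf("after(ack_seq, snd_wl1) = %d\n", after(ack_seq, snd_wl1)); /* 0 */
	return 0;
}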