From 268bb03856ed6c8511c31d08de0148782f50822f Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Wed, 17 Nov 2021 10:26:30 -0300 Subject: sunrpc: fix header include guard in trace header rpcgss.h include protection was protecting against the define for rpcrdma.h. Signed-off-by: Thiago Rafael Becker Signed-off-by: Trond Myklebust --- include/trace/events/rpcgss.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index 3ba63319af3c..c9048f3e471b 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -8,7 +8,7 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM rpcgss -#if !defined(_TRACE_RPCRDMA_H) || defined(TRACE_HEADER_MULTI_READ) +#if !defined(_TRACE_RPCGSS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RPCGSS_H #include -- cgit v1.2.3 From 28c916ade1bd4205958f74bb817fd3a05dbb7afc Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 18 Nov 2021 16:30:14 +0100 Subject: ASoC: soc-acpi: Set mach->id field on comp_ids matches Commit dac7cbd55dca ("ASoC: Intel: soc-acpi-byt: shrink tables using compatible IDs") and commit 959ae8215a9e ("ASoC: Intel: soc-acpi-cht: shrink tables using compatible IDs") simplified the match tables in soc-acpi-intel-byt-match.c and soc-acpi-intel-cht-match.c by merging identical entries using the new .comp_ids snd_soc_acpi_mach field to point a single entry to multiple ACPI HIDs and clearing the previously unique per entry .id field. But various machine drivers from sound/soc/intel/boards rely on mach->id in one or more ways, e.g. some drivers contain the following snippets: adev = acpi_dev_get_first_match_dev(mach->id, NULL, -1); pkg_found = snd_soc_acpi_find_package_from_hid(mach->id, ... if (!strncmp(snd_soc_cards[i].codec_id, mach->id, 8)) { ... All of which are broken by the match table shrinking. Make the snd_soc_acpi_mach.id field non const (the storage for the tables already is non const) and on a comps_ids match copy the matching HID to the id field to fix this. Fixes: dac7cbd55dca ("ASoC: Intel: soc-acpi-byt: shrink tables using compatible IDs") Fixes: 959ae8215a9e ("ASoC: Intel: soc-acpi-cht: shrink tables using compatible IDs") Suggested-by: Pierre-Louis Bossart Cc: Pierre-Louis Bossart Cc: Brent Lu Signed-off-by: Hans de Goede Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20211118153014.349222-1-hdegoede@redhat.com Signed-off-by: Mark Brown --- include/sound/soc-acpi.h | 2 +- sound/soc/soc-acpi.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h index 31f4c4f9aeea..ac0893df9c76 100644 --- a/include/sound/soc-acpi.h +++ b/include/sound/soc-acpi.h @@ -147,7 +147,7 @@ struct snd_soc_acpi_link_adr { */ /* Descriptor for SST ASoC machine driver */ struct snd_soc_acpi_mach { - const u8 id[ACPI_ID_LEN]; + u8 id[ACPI_ID_LEN]; const struct snd_soc_acpi_codecs *comp_ids; const u32 link_mask; const struct snd_soc_acpi_link_adr *links; diff --git a/sound/soc/soc-acpi.c b/sound/soc/soc-acpi.c index 2ae99b49d3f5..cbd7ea48837b 100644 --- a/sound/soc/soc-acpi.c +++ b/sound/soc/soc-acpi.c @@ -20,8 +20,10 @@ static bool snd_soc_acpi_id_present(struct snd_soc_acpi_mach *machine) if (comp_ids) { for (i = 0; i < comp_ids->num_codecs; i++) { - if (acpi_dev_present(comp_ids->codecs[i], NULL, -1)) + if (acpi_dev_present(comp_ids->codecs[i], NULL, -1)) { + strscpy(machine->id, comp_ids->codecs[i], ACPI_ID_LEN); return true; + } } } -- cgit v1.2.3 From f124034faa911ed534bf8c4881ad98dbbde2a966 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Nov 2021 18:44:17 -0500 Subject: Revert "virtio_ring: validate used buffer length" This reverts commit 939779f5152d161b34f612af29e7dc1ac4472fcf. Attempts to validate length in the core did not work out: there turn out to exist multiple broken devices, and in particular legacy devices are known to be broken in this respect. We have ideas for handling this better in the next version but for now let's revert to a known good state to make sure drivers work for people. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 60 -------------------------------------------- include/linux/virtio.h | 2 -- 2 files changed, 62 deletions(-) (limited to 'include') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 00f64f2f8b72..6d2614e34470 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -14,9 +14,6 @@ #include #include -static bool force_used_validation = false; -module_param(force_used_validation, bool, 0444); - #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...) \ @@ -185,9 +182,6 @@ struct vring_virtqueue { } packed; }; - /* Per-descriptor in buffer length */ - u32 *buflen; - /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); @@ -496,7 +490,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, unsigned int i, n, avail, descs_used, prev, err_idx; int head; bool indirect; - u32 buflen = 0; START_USE(vq); @@ -578,7 +571,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE, indirect); - buflen += sg->length; } } /* Last one doesn't continue. */ @@ -618,10 +610,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, else vq->split.desc_state[head].indir_desc = ctx; - /* Store in buffer length if necessary */ - if (vq->buflen) - vq->buflen[head] = buflen; - /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); @@ -796,11 +784,6 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, BAD_RING(vq, "id %u is not a head!\n", i); return NULL; } - if (vq->buflen && unlikely(*len > vq->buflen[i])) { - BAD_RING(vq, "used len %d is larger than in buflen %u\n", - *len, vq->buflen[i]); - return NULL; - } /* detach_buf_split clears data, so grab it now. */ ret = vq->split.desc_state[i].data; @@ -1079,7 +1062,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, unsigned int i, n, err_idx; u16 head, id; dma_addr_t addr; - u32 buflen = 0; head = vq->packed.next_avail_idx; desc = alloc_indirect_packed(total_sg, gfp); @@ -1109,8 +1091,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(sg->length); i++; - if (n >= out_sgs) - buflen += sg->length; } } @@ -1164,10 +1144,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, vq->packed.desc_state[id].indir_desc = desc; vq->packed.desc_state[id].last = id; - /* Store in buffer length if necessary */ - if (vq->buflen) - vq->buflen[id] = buflen; - vq->num_added += 1; pr_debug("Added buffer head %i to %p\n", head, vq); @@ -1203,7 +1179,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, __le16 head_flags, flags; u16 head, id, prev, curr, avail_used_flags; int err; - u32 buflen = 0; START_USE(vq); @@ -1283,8 +1258,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, 1 << VRING_PACKED_DESC_F_AVAIL | 1 << VRING_PACKED_DESC_F_USED; } - if (n >= out_sgs) - buflen += sg->length; } } @@ -1304,10 +1277,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, vq->packed.desc_state[id].indir_desc = ctx; vq->packed.desc_state[id].last = prev; - /* Store in buffer length if necessary */ - if (vq->buflen) - vq->buflen[id] = buflen; - /* * A driver MUST NOT make the first descriptor in the list * available before all subsequent descriptors comprising @@ -1494,11 +1463,6 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, BAD_RING(vq, "id %u is not a head!\n", id); return NULL; } - if (vq->buflen && unlikely(*len > vq->buflen[id])) { - BAD_RING(vq, "used len %d is larger than in buflen %u\n", - *len, vq->buflen[id]); - return NULL; - } /* detach_buf_packed clears data, so grab it now. */ ret = vq->packed.desc_state[id].data; @@ -1704,7 +1668,6 @@ static struct virtqueue *vring_create_virtqueue_packed( struct vring_virtqueue *vq; struct vring_packed_desc *ring; struct vring_packed_desc_event *driver, *device; - struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; size_t ring_size_in_bytes, event_size_in_bytes; @@ -1794,15 +1757,6 @@ static struct virtqueue *vring_create_virtqueue_packed( if (!vq->packed.desc_extra) goto err_desc_extra; - if (!drv->suppress_used_validation || force_used_validation) { - vq->buflen = kmalloc_array(num, sizeof(*vq->buflen), - GFP_KERNEL); - if (!vq->buflen) - goto err_buflen; - } else { - vq->buflen = NULL; - } - /* No callback? Tell other side not to bother us. */ if (!callback) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; @@ -1815,8 +1769,6 @@ static struct virtqueue *vring_create_virtqueue_packed( spin_unlock(&vdev->vqs_list_lock); return &vq->vq; -err_buflen: - kfree(vq->packed.desc_extra); err_desc_extra: kfree(vq->packed.desc_state); err_desc_state: @@ -2224,7 +2176,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, void (*callback)(struct virtqueue *), const char *name) { - struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); struct vring_virtqueue *vq; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) @@ -2284,15 +2235,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, if (!vq->split.desc_extra) goto err_extra; - if (!drv->suppress_used_validation || force_used_validation) { - vq->buflen = kmalloc_array(vring.num, sizeof(*vq->buflen), - GFP_KERNEL); - if (!vq->buflen) - goto err_buflen; - } else { - vq->buflen = NULL; - } - /* Put everything in free lists. */ vq->free_head = 0; memset(vq->split.desc_state, 0, vring.num * @@ -2303,8 +2245,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, spin_unlock(&vdev->vqs_list_lock); return &vq->vq; -err_buflen: - kfree(vq->split.desc_extra); err_extra: kfree(vq->split.desc_state); err_state: diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 44d0e09da2d9..41edbc01ffa4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -152,7 +152,6 @@ size_t virtio_max_dma_size(struct virtio_device *vdev); * @feature_table_size: number of entries in the feature table array. * @feature_table_legacy: same as feature_table but when working in legacy mode. * @feature_table_size_legacy: number of entries in feature table legacy array. - * @suppress_used_validation: set to not have core validate used length * @probe: the function to call when a device is found. Returns 0 or -errno. * @scan: optional function to call after successful probe; intended * for virtio-scsi to invoke a scan. @@ -169,7 +168,6 @@ struct virtio_driver { unsigned int feature_table_size; const unsigned int *feature_table_legacy; unsigned int feature_table_size_legacy; - bool suppress_used_validation; int (*validate)(struct virtio_device *dev); int (*probe)(struct virtio_device *dev); void (*scan)(struct virtio_device *dev); -- cgit v1.2.3 From dacb5d8875cc6cd3a553363b4d6f06760fcbe70c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Nov 2021 19:34:21 +0100 Subject: tcp: fix page frag corruption on page fault Steffen reported a TCP stream corruption for HTTP requests served by the apache web-server using a cifs mount-point and memory mapping the relevant file. The root cause is quite similar to the one addressed by commit 20eb4f29b602 ("net: fix sk_page_frag() recursion from memory reclaim"). Here the nested access to the task page frag is caused by a page fault on the (mmapped) user-space memory buffer coming from the cifs file. The page fault handler performs an smb transaction on a different socket, inside the same process context. Since sk->sk_allaction for such socket does not prevent the usage for the task_frag, the nested allocation modify "under the hood" the page frag in use by the outer sendmsg call, corrupting the stream. The overall relevant stack trace looks like the following: httpd 78268 [001] 3461630.850950: probe:tcp_sendmsg_locked: ffffffff91461d91 tcp_sendmsg_locked+0x1 ffffffff91462b57 tcp_sendmsg+0x27 ffffffff9139814e sock_sendmsg+0x3e ffffffffc06dfe1d smb_send_kvec+0x28 [...] ffffffffc06cfaf8 cifs_readpages+0x213 ffffffff90e83c4b read_pages+0x6b ffffffff90e83f31 __do_page_cache_readahead+0x1c1 ffffffff90e79e98 filemap_fault+0x788 ffffffff90eb0458 __do_fault+0x38 ffffffff90eb5280 do_fault+0x1a0 ffffffff90eb7c84 __handle_mm_fault+0x4d4 ffffffff90eb8093 handle_mm_fault+0xc3 ffffffff90c74f6d __do_page_fault+0x1ed ffffffff90c75277 do_page_fault+0x37 ffffffff9160111e page_fault+0x1e ffffffff9109e7b5 copyin+0x25 ffffffff9109eb40 _copy_from_iter_full+0xe0 ffffffff91462370 tcp_sendmsg_locked+0x5e0 ffffffff91462370 tcp_sendmsg_locked+0x5e0 ffffffff91462b57 tcp_sendmsg+0x27 ffffffff9139815c sock_sendmsg+0x4c ffffffff913981f7 sock_write_iter+0x97 ffffffff90f2cc56 do_iter_readv_writev+0x156 ffffffff90f2dff0 do_iter_write+0x80 ffffffff90f2e1c3 vfs_writev+0xa3 ffffffff90f2e27c do_writev+0x5c ffffffff90c042bb do_syscall_64+0x5b ffffffff916000ad entry_SYSCALL_64_after_hwframe+0x65 The cifs filesystem rightfully sets sk_allocations to GFP_NOFS, we can avoid the nesting using the sk page frag for allocation lacking the __GFP_FS flag. Do not define an additional mm-helper for that, as this is strictly tied to the sk page frag usage. v1 -> v2: - use a stricted sk_page_frag() check instead of reordering the code (Eric) Reported-by: Steffen Froemer Fixes: 5640f7685831 ("net: use a per task frag allocator") Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index b32906e1ab55..715cdb4b2b79 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2430,19 +2430,22 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) * @sk: socket * * Use the per task page_frag instead of the per socket one for - * optimization when we know that we're in the normal context and owns + * optimization when we know that we're in process context and own * everything that's associated with %current. * - * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest - * inside other socket operations and end up recursing into sk_page_frag() - * while it's already in use. + * Both direct reclaim and page faults can nest inside other + * socket operations and end up recursing into sk_page_frag() + * while it's already in use: explicitly avoid task page_frag + * usage if the caller is potentially doing any of them. + * This assumes that page fault handlers use the GFP_NOFS flags. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if (gfpflags_normal_context(sk->sk_allocation)) + if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) == + (__GFP_DIRECT_RECLAIM | __GFP_FS)) return ¤t->task_frag; return &sk->sk_frag; -- cgit v1.2.3 From cdef485217d30382f3bf6448c54b4401648fe3f1 Mon Sep 17 00:00:00 2001 From: msizanoen1 Date: Tue, 23 Nov 2021 13:48:32 +0100 Subject: ipv6: fix memory leak in fib6_rule_suppress The kernel leaks memory when a `fib` rule is present in IPv6 nftables firewall rules and a suppress_prefix rule is present in the IPv6 routing rules (used by certain tools such as wg-quick). In such scenarios, every incoming packet will leak an allocation in `ip6_dst_cache` slab cache. After some hours of `bpftrace`-ing and source code reading, I tracked down the issue to ca7a03c41753 ("ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule"). The problem with that change is that the generic `args->flags` always have `FIB_LOOKUP_NOREF` set[1][2] but the IPv6-specific flag `RT6_LOOKUP_F_DST_NOREF` might not be, leading to `fib6_rule_suppress` not decreasing the refcount when needed. How to reproduce: - Add the following nftables rule to a prerouting chain: meta nfproto ipv6 fib saddr . mark . iif oif missing drop This can be done with: sudo nft create table inet test sudo nft create chain inet test test_chain '{ type filter hook prerouting priority filter + 10; policy accept; }' sudo nft add rule inet test test_chain meta nfproto ipv6 fib saddr . mark . iif oif missing drop - Run: sudo ip -6 rule add table main suppress_prefixlength 0 - Watch `sudo slabtop -o | grep ip6_dst_cache` to see memory usage increase with every incoming ipv6 packet. This patch exposes the protocol-specific flags to the protocol specific `suppress` function, and check the protocol-specific `flags` argument for RT6_LOOKUP_F_DST_NOREF instead of the generic FIB_LOOKUP_NOREF when decreasing the refcount, like this. [1]: https://github.com/torvalds/linux/blob/ca7a03c4175366a92cee0ccc4fec0038c3266e26/net/ipv6/fib6_rules.c#L71 [2]: https://github.com/torvalds/linux/blob/ca7a03c4175366a92cee0ccc4fec0038c3266e26/net/ipv6/fib6_rules.c#L99 Link: https://bugzilla.kernel.org/show_bug.cgi?id=215105 Fixes: ca7a03c41753 ("ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule") Cc: stable@vger.kernel.org Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- include/net/fib_rules.h | 4 +++- net/core/fib_rules.c | 2 +- net/ipv4/fib_rules.c | 1 + net/ipv6/fib6_rules.c | 4 ++-- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 4b10676c69d1..bd07484ab9dd 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -69,7 +69,7 @@ struct fib_rules_ops { int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); - bool (*suppress)(struct fib_rule *, + bool (*suppress)(struct fib_rule *, int, struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); @@ -218,7 +218,9 @@ INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg)); #endif diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 79df7cd9dbc1..1bb567a3b329 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -323,7 +323,7 @@ jumped: if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, - rule, arg)) + rule, flags, arg)) continue; if (err != -EAGAIN) { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index ce54a30c2ef1..364ad3446b2f 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -141,6 +141,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule, } INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg) { struct fib_result *result = (struct fib_result *) arg->result; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 40f3e4f9f33a..dcedfe29d9d9 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -267,6 +267,7 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule, } INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; @@ -294,8 +295,7 @@ INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, return false; suppress_route: - if (!(arg->flags & FIB_LOOKUP_NOREF)) - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); return true; } -- cgit v1.2.3 From 20ae1d6aa159eb91a9bf09ff92ccaa94dbea92c2 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Nov 2021 10:39:25 -0500 Subject: wireguard: device: reset peer src endpoint when netns exits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each peer's endpoint contains a dst_cache entry that takes a reference to another netdev. When the containing namespace exits, we take down the socket and prevent future sockets from being created (by setting creating_net to NULL), which removes that potential reference on the netns. However, it doesn't release references to the netns that a netdev cached in dst_cache might be taking, so the netns still might fail to exit. Since the socket is gimped anyway, we can simply clear all the dst_caches (by way of clearing the endpoint src), which will release all references. However, the current dst_cache_reset function only releases those references lazily. But it turns out that all of our usages of wg_socket_clear_peer_endpoint_src are called from contexts that are not exactly high-speed or bottle-necked. For example, when there's connection difficulty, or when userspace is reconfiguring the interface. And in particular for this patch, when the netns is exiting. So for those cases, it makes more sense to call dst_release immediately. For that, we add a small helper function to dst_cache. This patch also adds a test to netns.sh from Hangbin Liu to ensure this doesn't regress. Tested-by: Hangbin Liu Reported-by: Xiumei Mu Cc: Toke Høiland-Jørgensen Cc: Paolo Abeni Fixes: 900575aa33a3 ("wireguard: device: avoid circular netns references") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 3 +++ drivers/net/wireguard/socket.c | 2 +- include/net/dst_cache.h | 11 +++++++++++ net/core/dst_cache.c | 19 +++++++++++++++++++ tools/testing/selftests/wireguard/netns.sh | 24 +++++++++++++++++++++++- 5 files changed, 57 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 551ddaaaf540..77e64ea6be67 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -398,6 +398,7 @@ static struct rtnl_link_ops link_ops __read_mostly = { static void wg_netns_pre_exit(struct net *net) { struct wg_device *wg; + struct wg_peer *peer; rtnl_lock(); list_for_each_entry(wg, &device_list, device_list) { @@ -407,6 +408,8 @@ static void wg_netns_pre_exit(struct net *net) mutex_lock(&wg->device_update_lock); rcu_assign_pointer(wg->creating_net, NULL); wg_socket_reinit(wg, NULL, NULL); + list_for_each_entry(peer, &wg->peer_list, peer_list) + wg_socket_clear_peer_endpoint_src(peer); mutex_unlock(&wg->device_update_lock); } } diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 8c496b747108..6f07b949cb81 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -308,7 +308,7 @@ void wg_socket_clear_peer_endpoint_src(struct wg_peer *peer) { write_lock_bh(&peer->endpoint_lock); memset(&peer->endpoint.src6, 0, sizeof(peer->endpoint.src6)); - dst_cache_reset(&peer->endpoint_cache); + dst_cache_reset_now(&peer->endpoint_cache); write_unlock_bh(&peer->endpoint_lock); } diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h index 67634675e919..df6622a5fe98 100644 --- a/include/net/dst_cache.h +++ b/include/net/dst_cache.h @@ -79,6 +79,17 @@ static inline void dst_cache_reset(struct dst_cache *dst_cache) dst_cache->reset_ts = jiffies; } +/** + * dst_cache_reset_now - invalidate the cache contents immediately + * @dst_cache: the cache + * + * The caller must be sure there are no concurrent users, as this frees + * all dst_cache users immediately, rather than waiting for the next + * per-cpu usage like dst_cache_reset does. Most callers should use the + * higher speed lazily-freed dst_cache_reset function instead. + */ +void dst_cache_reset_now(struct dst_cache *dst_cache); + /** * dst_cache_init - initialize the cache, allocating the required storage * @dst_cache: the cache diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index be74ab4551c2..0ccfd5fa5cb9 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -162,3 +162,22 @@ void dst_cache_destroy(struct dst_cache *dst_cache) free_percpu(dst_cache->cache); } EXPORT_SYMBOL_GPL(dst_cache_destroy); + +void dst_cache_reset_now(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + dst_cache->reset_ts = jiffies; + for_each_possible_cpu(i) { + struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); + struct dst_entry *dst = idst->dst; + + idst->cookie = 0; + idst->dst = NULL; + dst_release(dst); + } +} +EXPORT_SYMBOL_GPL(dst_cache_reset_now); diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 2e5c1630885e..8a9461aa0878 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -613,6 +613,28 @@ ip0 link set wg0 up kill $ncat_pid ip0 link del wg0 +# Ensure that dst_cache references don't outlive netns lifetime +ip1 link add dev wg0 type wireguard +ip2 link add dev wg0 type wireguard +configure_peers +ip1 link add veth1 type veth peer name veth2 +ip1 link set veth2 netns $netns2 +ip1 addr add fd00:aa::1/64 dev veth1 +ip2 addr add fd00:aa::2/64 dev veth2 +ip1 link set veth1 up +ip2 link set veth2 up +waitiface $netns1 veth1 +waitiface $netns2 veth2 +ip1 -6 route add default dev veth1 via fd00:aa::2 +ip2 -6 route add default dev veth2 via fd00:aa::1 +n1 wg set wg0 peer "$pub2" endpoint [fd00:aa::2]:2 +n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::1]:1 +n1 ping6 -c 1 fd00::2 +pp ip netns delete $netns1 +pp ip netns delete $netns2 +pp ip netns add $netns1 +pp ip netns add $netns2 + # Ensure there aren't circular reference loops ip1 link add wg1 type wireguard ip2 link add wg2 type wireguard @@ -631,7 +653,7 @@ while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do done < /dev/kmsg alldeleted=1 for object in "${!objects[@]}"; do - if [[ ${objects["$object"]} != *createddestroyed ]]; then + if [[ ${objects["$object"]} != *createddestroyed && ${objects["$object"]} != *createdcreateddestroyeddestroyed ]]; then echo "Error: $object: merely ${objects["$object"]}" >&3 alldeleted=0 fi -- cgit v1.2.3 From f7e5b9bfa6c8820407b64eabc1f29c9a87e8993d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 29 Nov 2021 10:39:29 -0500 Subject: siphash: use _unaligned version by default On ARM v6 and later, we define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS because the ordinary load/store instructions (ldr, ldrh, ldrb) can tolerate any misalignment of the memory address. However, load/store double and load/store multiple instructions (ldrd, ldm) may still only be used on memory addresses that are 32-bit aligned, and so we have to use the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS macro with care, or we may end up with a severe performance hit due to alignment traps that require fixups by the kernel. Testing shows that this currently happens with clang-13 but not gcc-11. In theory, any compiler version can produce this bug or other problems, as we are dealing with undefined behavior in C99 even on architectures that support this in hardware, see also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100363. Fortunately, the get_unaligned() accessors do the right thing: when building for ARMv6 or later, the compiler will emit unaligned accesses using the ordinary load/store instructions (but avoid the ones that require 32-bit alignment). When building for older ARM, those accessors will emit the appropriate sequence of ldrb/mov/orr instructions. And on architectures that can truly tolerate any kind of misalignment, the get_unaligned() accessors resolve to the leXX_to_cpup accessors that operate on aligned addresses. Since the compiler will in fact emit ldrd or ldm instructions when building this code for ARM v6 or later, the solution is to use the unaligned accessors unconditionally on architectures where this is known to be fast. The _aligned version of the hash function is however still needed to get the best performance on architectures that cannot do any unaligned access in hardware. This new version avoids the undefined behavior and should produce the fastest hash on all architectures we support. Link: https://lore.kernel.org/linux-arm-kernel/20181008211554.5355-4-ard.biesheuvel@linaro.org/ Link: https://lore.kernel.org/linux-crypto/CAK8P3a2KfmmGDbVHULWevB0hv71P2oi2ZCHEAqT=8dQfa0=cqQ@mail.gmail.com/ Reported-by: Ard Biesheuvel Fixes: 2c956a60778c ("siphash: add cryptographically secure PRF") Signed-off-by: Arnd Bergmann Reviewed-by: Jason A. Donenfeld Acked-by: Ard Biesheuvel Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- include/linux/siphash.h | 14 ++++---------- lib/siphash.c | 12 ++++++------ 2 files changed, 10 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/siphash.h b/include/linux/siphash.h index bf21591a9e5e..0cda61855d90 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -27,9 +27,7 @@ static inline bool siphash_key_is_zero(const siphash_key_t *key) } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); -#endif u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); @@ -82,10 +80,9 @@ static inline u64 ___siphash_aligned(const __le64 *data, size_t len, static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); -#endif return ___siphash_aligned(data, len, key); } @@ -96,10 +93,8 @@ typedef struct { u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); -#endif u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); @@ -135,10 +130,9 @@ static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + !IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); -#endif return ___hsiphash_aligned(data, len, key); } diff --git a/lib/siphash.c b/lib/siphash.c index a90112ee72a1..72b9068ab57b 100644 --- a/lib/siphash.c +++ b/lib/siphash.c @@ -49,6 +49,7 @@ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); @@ -80,8 +81,8 @@ u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); +#endif -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); @@ -113,7 +114,6 @@ u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); -#endif /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 @@ -250,6 +250,7 @@ EXPORT_SYMBOL(siphash_3u32); HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); @@ -280,8 +281,8 @@ u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); +#endif -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { @@ -313,7 +314,6 @@ u32 __hsiphash_unaligned(const void *data, size_t len, HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); -#endif /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 @@ -418,6 +418,7 @@ EXPORT_SYMBOL(hsiphash_4u32); HSIPROUND; \ return v1 ^ v3; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); @@ -438,8 +439,8 @@ u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); +#endif -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { @@ -461,7 +462,6 @@ u32 __hsiphash_unaligned(const void *data, size_t len, HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); -#endif /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 -- cgit v1.2.3 From 502e82b91361955c66c8453b5b7a905b0b5bd5a1 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 7 Nov 2021 17:21:45 +0200 Subject: net/mlx5: Fix access to a non-supported register Validate MRTC register is supported before triggering a delayed work which accesses it. Fixes: 5a1023deeed0 ("net/mlx5: Add periodic update of host time to firmware") Signed-off-by: Aya Levin Reviewed-by: Gal Pressman Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 8 +++----- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 380f50d5462d..3ca998874c50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -836,7 +836,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms); add_timer(&health->timer); - if (mlx5_core_is_pf(dev)) + if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc)) queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e127c0530b3a..7df9c7f8d9c8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1071,18 +1071,16 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot) mlx5_set_driver_version(dev); - mlx5_start_health_poll(dev); - err = mlx5_query_hca_caps(dev); if (err) { mlx5_core_err(dev, "query hca failed\n"); - goto stop_health; + goto reclaim_boot_pages; } + mlx5_start_health_poll(dev); + return 0; -stop_health: - mlx5_stop_health_poll(dev, boot); reclaim_boot_pages: mlx5_reclaim_startup_pages(dev); err_disable_hca: diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3636df90899a..fbaab440a484 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9698,7 +9698,10 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 regs_84_to_68[0x11]; u8 tracer_registers[0x4]; - u8 regs_63_to_32[0x20]; + u8 regs_63_to_46[0x12]; + u8 mrtc[0x1]; + u8 regs_44_to_32[0xd]; + u8 regs_31_to_0[0x20]; }; -- cgit v1.2.3 From 6bbfa44116689469267f1a6e3d233b52114139d2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 1 Dec 2021 23:45:50 +0900 Subject: kprobes: Limit max data_size of the kretprobe instances The 'kprobe::data_size' is unsigned, thus it can not be negative. But if user sets it enough big number (e.g. (size_t)-8), the result of 'data_size + sizeof(struct kretprobe_instance)' becomes smaller than sizeof(struct kretprobe_instance) or zero. In result, the kretprobe_instance are allocated without enough memory, and kretprobe accesses outside of allocated memory. To avoid this issue, introduce a max limitation of the kretprobe::data_size. 4KB per instance should be OK. Link: https://lkml.kernel.org/r/163836995040.432120.10322772773821182925.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: f47cd9b553aa ("kprobes: kretprobe user entry-handler") Reported-by: zhangyue Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e974caf39d3e..8c8f7a4d93af 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -153,6 +153,8 @@ struct kretprobe { struct kretprobe_holder *rph; }; +#define KRETPROBE_MAX_DATA_SIZE 4096 + struct kretprobe_instance { union { struct freelist_node freelist; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e9db0c810554..21eccc961bba 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2086,6 +2086,9 @@ int register_kretprobe(struct kretprobe *rp) } } + if (rp->data_size > KRETPROBE_MAX_DATA_SIZE) + return -E2BIG; + rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; -- cgit v1.2.3 From 7a10d8c810cfad3e79372d7d1c77899d86cd6662 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Nov 2021 09:01:55 -0800 Subject: net: annotate data-races on txq->xmit_lock_owner syzbot found that __dev_queue_xmit() is reading txq->xmit_lock_owner without annotations. No serious issue there, let's document what is happening there. BUG: KCSAN: data-race in __dev_queue_xmit / __dev_queue_xmit write to 0xffff888139d09484 of 4 bytes by interrupt on cpu 0: __netif_tx_unlock include/linux/netdevice.h:4437 [inline] __dev_queue_xmit+0x948/0xf70 net/core/dev.c:4229 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_hh_output include/net/neighbour.h:511 [inline] neigh_output include/net/neighbour.h:525 [inline] ip6_finish_output2+0x995/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x3e/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 read to 0xffff888139d09484 of 4 bytes by interrupt on cpu 1: __dev_queue_xmit+0x5e3/0xf70 net/core/dev.c:4213 dev_queue_xmit_accel+0x19/0x20 net/core/dev.c:4265 macvlan_queue_xmit drivers/net/macvlan.c:543 [inline] macvlan_start_xmit+0x2b3/0x3d0 drivers/net/macvlan.c:567 __netdev_start_xmit include/linux/netdevice.h:4987 [inline] netdev_start_xmit include/linux/netdevice.h:5001 [inline] xmit_one+0x105/0x2f0 net/core/dev.c:3590 dev_hard_start_xmit+0x72/0x120 net/core/dev.c:3606 sch_direct_xmit+0x1b2/0x7c0 net/sched/sch_generic.c:342 __dev_xmit_skb+0x83d/0x1370 net/core/dev.c:3817 __dev_queue_xmit+0x590/0xf70 net/core/dev.c:4194 dev_queue_xmit+0x13/0x20 net/core/dev.c:4259 neigh_resolve_output+0x3db/0x410 net/core/neighbour.c:1523 neigh_output include/net/neighbour.h:527 [inline] ip6_finish_output2+0x9be/0xbb0 net/ipv6/ip6_output.c:126 __ip6_finish_output net/ipv6/ip6_output.c:191 [inline] ip6_finish_output+0x444/0x4c0 net/ipv6/ip6_output.c:201 NF_HOOK_COND include/linux/netfilter.h:296 [inline] ip6_output+0x10e/0x210 net/ipv6/ip6_output.c:224 dst_output include/net/dst.h:450 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ndisc_send_skb+0x486/0x610 net/ipv6/ndisc.c:508 ndisc_send_rs+0x3b0/0x3e0 net/ipv6/ndisc.c:702 addrconf_rs_timer+0x370/0x540 net/ipv6/addrconf.c:3898 call_timer_fn+0x2e/0x240 kernel/time/timer.c:1421 expire_timers+0x116/0x240 kernel/time/timer.c:1466 __run_timers+0x368/0x410 kernel/time/timer.c:1734 run_timer_softirq+0x2e/0x60 kernel/time/timer.c:1747 __do_softirq+0x158/0x2de kernel/softirq.c:558 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x37/0x70 kernel/softirq.c:648 sysvec_apic_timer_interrupt+0x8d/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 kcsan_setup_watchpoint+0x94/0x420 kernel/kcsan/core.c:443 folio_test_anon include/linux/page-flags.h:581 [inline] PageAnon include/linux/page-flags.h:586 [inline] zap_pte_range+0x5ac/0x10e0 mm/memory.c:1347 zap_pmd_range mm/memory.c:1467 [inline] zap_pud_range mm/memory.c:1496 [inline] zap_p4d_range mm/memory.c:1517 [inline] unmap_page_range+0x2dc/0x3d0 mm/memory.c:1538 unmap_single_vma+0x157/0x210 mm/memory.c:1583 unmap_vmas+0xd0/0x180 mm/memory.c:1615 exit_mmap+0x23d/0x470 mm/mmap.c:3170 __mmput+0x27/0x1b0 kernel/fork.c:1113 mmput+0x3d/0x50 kernel/fork.c:1134 exit_mm+0xdb/0x170 kernel/exit.c:507 do_exit+0x608/0x17a0 kernel/exit.c:819 do_group_exit+0xce/0x180 kernel/exit.c:929 get_signal+0xfc3/0x1550 kernel/signal.c:2852 arch_do_signal_or_restart+0x8c/0x2e0 arch/x86/kernel/signal.c:868 handle_signal_work kernel/entry/common.c:148 [inline] exit_to_user_mode_loop kernel/entry/common.c:172 [inline] exit_to_user_mode_prepare+0x113/0x190 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000000 -> 0xffffffff Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 28712 Comm: syz-executor.0 Tainted: G W 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20211130170155.2331929-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 +++++++++++++------ net/core/dev.c | 5 ++++- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec42495a43a..be5cb3360b94 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4404,7 +4404,8 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) { spin_lock(&txq->_xmit_lock); - txq->xmit_lock_owner = cpu; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, cpu); } static inline bool __netif_tx_acquire(struct netdev_queue *txq) @@ -4421,26 +4422,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq) static inline void __netif_tx_lock_bh(struct netdev_queue *txq) { spin_lock_bh(&txq->_xmit_lock); - txq->xmit_lock_owner = smp_processor_id(); + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); } static inline bool __netif_tx_trylock(struct netdev_queue *txq) { bool ok = spin_trylock(&txq->_xmit_lock); - if (likely(ok)) - txq->xmit_lock_owner = smp_processor_id(); + + if (likely(ok)) { + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id()); + } return ok; } static inline void __netif_tx_unlock(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock(&txq->_xmit_lock); } static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) { - txq->xmit_lock_owner = -1; + /* Pairs with READ_ONCE() in __dev_queue_xmit() */ + WRITE_ONCE(txq->xmit_lock_owner, -1); spin_unlock_bh(&txq->_xmit_lock); } diff --git a/net/core/dev.c b/net/core/dev.c index 15ac064b5562..2a352e668d10 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4210,7 +4210,10 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) if (dev->flags & IFF_UP) { int cpu = smp_processor_id(); /* ok because BHs are off */ - if (txq->xmit_lock_owner != cpu) { + /* Other cpus might concurrently change txq->xmit_lock_owner + * to -1 or to their cpu id, but not to our id. + */ + if (READ_ONCE(txq->xmit_lock_owner) != cpu) { if (dev_xmit_recursion()) goto recursion_alert; -- cgit v1.2.3 From a37a0ee4d25c152a087c5e913b76f207eaebdb54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Nov 2021 10:29:39 -0800 Subject: net: avoid uninit-value from tcp_conn_request A recent change triggers a KMSAN warning, because request sockets do not initialize @sk_rx_queue_mapping field. Add sk_rx_queue_update() helper to make our intent clear. BUG: KMSAN: uninit-value in sk_rx_queue_set include/net/sock.h:1922 [inline] BUG: KMSAN: uninit-value in tcp_conn_request+0x3bcc/0x4dc0 net/ipv4/tcp_input.c:6922 sk_rx_queue_set include/net/sock.h:1922 [inline] tcp_conn_request+0x3bcc/0x4dc0 net/ipv4/tcp_input.c:6922 tcp_v4_conn_request+0x218/0x2a0 net/ipv4/tcp_ipv4.c:1528 tcp_rcv_state_process+0x2c5/0x3290 net/ipv4/tcp_input.c:6406 tcp_v4_do_rcv+0xb4e/0x1330 net/ipv4/tcp_ipv4.c:1738 tcp_v4_rcv+0x468d/0x4ed0 net/ipv4/tcp_ipv4.c:2100 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:460 [inline] ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline] ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline] ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline] __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696 gro_normal_list net/core/dev.c:5850 [inline] napi_complete_done+0x579/0xdd0 net/core/dev.c:6587 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020 napi_poll net/core/dev.c:7087 [inline] net_rx_action+0x824/0x1880 net/core/dev.c:7174 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558 invoke_softirq+0xa4/0x130 kernel/softirq.c:432 __irq_exit_rcu kernel/softirq.c:636 [inline] irq_exit_rcu+0x76/0x130 kernel/softirq.c:648 common_interrupt+0xb6/0xd0 arch/x86/kernel/irq.c:240 asm_common_interrupt+0x1e/0x40 smap_restore arch/x86/include/asm/smap.h:67 [inline] get_shadow_origin_ptr mm/kmsan/instrumentation.c:31 [inline] __msan_metadata_ptr_for_load_1+0x28/0x30 mm/kmsan/instrumentation.c:63 tomoyo_check_acl+0x1b0/0x630 security/tomoyo/domain.c:173 tomoyo_path_permission security/tomoyo/file.c:586 [inline] tomoyo_check_open_permission+0x61f/0xe10 security/tomoyo/file.c:777 tomoyo_file_open+0x24f/0x2d0 security/tomoyo/tomoyo.c:311 security_file_open+0xb1/0x1f0 security/security.c:1635 do_dentry_open+0x4e4/0x1bf0 fs/open.c:809 vfs_open+0xaf/0xe0 fs/open.c:957 do_open fs/namei.c:3426 [inline] path_openat+0x52f1/0x5dd0 fs/namei.c:3559 do_filp_open+0x306/0x760 fs/namei.c:3586 do_sys_openat2+0x263/0x8f0 fs/open.c:1212 do_sys_open fs/open.c:1228 [inline] __do_sys_open fs/open.c:1236 [inline] __se_sys_open fs/open.c:1232 [inline] __x64_sys_open+0x314/0x380 fs/open.c:1232 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x54/0xd0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x44/0xae Uninit was created at: __alloc_pages+0xbc7/0x10a0 mm/page_alloc.c:5409 alloc_pages+0x8a5/0xb80 alloc_slab_page mm/slub.c:1810 [inline] allocate_slab+0x287/0x1c20 mm/slub.c:1947 new_slab mm/slub.c:2010 [inline] ___slab_alloc+0xbdf/0x1e90 mm/slub.c:3039 __slab_alloc mm/slub.c:3126 [inline] slab_alloc_node mm/slub.c:3217 [inline] slab_alloc mm/slub.c:3259 [inline] kmem_cache_alloc+0xbb3/0x11c0 mm/slub.c:3264 reqsk_alloc include/net/request_sock.h:91 [inline] inet_reqsk_alloc+0xaf/0x8b0 net/ipv4/tcp_input.c:6712 tcp_conn_request+0x910/0x4dc0 net/ipv4/tcp_input.c:6852 tcp_v4_conn_request+0x218/0x2a0 net/ipv4/tcp_ipv4.c:1528 tcp_rcv_state_process+0x2c5/0x3290 net/ipv4/tcp_input.c:6406 tcp_v4_do_rcv+0xb4e/0x1330 net/ipv4/tcp_ipv4.c:1738 tcp_v4_rcv+0x468d/0x4ed0 net/ipv4/tcp_ipv4.c:2100 ip_protocol_deliver_rcu+0x760/0x10b0 net/ipv4/ip_input.c:204 ip_local_deliver_finish net/ipv4/ip_input.c:231 [inline] NF_HOOK include/linux/netfilter.h:307 [inline] ip_local_deliver+0x584/0x8c0 net/ipv4/ip_input.c:252 dst_input include/net/dst.h:460 [inline] ip_sublist_rcv_finish net/ipv4/ip_input.c:551 [inline] ip_list_rcv_finish net/ipv4/ip_input.c:601 [inline] ip_sublist_rcv+0x11fd/0x1520 net/ipv4/ip_input.c:609 ip_list_rcv+0x95f/0x9a0 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5505 [inline] __netif_receive_skb_list_core+0xe34/0x1240 net/core/dev.c:5553 __netif_receive_skb_list+0x7fc/0x960 net/core/dev.c:5605 netif_receive_skb_list_internal+0x868/0xde0 net/core/dev.c:5696 gro_normal_list net/core/dev.c:5850 [inline] napi_complete_done+0x579/0xdd0 net/core/dev.c:6587 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0x17b6/0x2350 drivers/net/virtio_net.c:1557 __napi_poll+0x14e/0xbc0 net/core/dev.c:7020 napi_poll net/core/dev.c:7087 [inline] net_rx_action+0x824/0x1880 net/core/dev.c:7174 __do_softirq+0x1fe/0x7eb kernel/softirq.c:558 Fixes: 342159ee394d ("net: avoid dirtying sk->sk_rx_queue_mapping") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20211130182939.2584764-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/busy_poll.h | 2 +- include/net/sock.h | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 4202c609bb0b..7994455ec714 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -133,7 +133,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id)) WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif - sk_rx_queue_set(sk, skb); + sk_rx_queue_update(sk, skb); } static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) diff --git a/include/net/sock.h b/include/net/sock.h index 715cdb4b2b79..bea21ff70e74 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1913,18 +1913,31 @@ static inline int sk_tx_queue_get(const struct sock *sk) return -1; } -static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) +static inline void __sk_rx_queue_set(struct sock *sk, + const struct sk_buff *skb, + bool force_set) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING if (skb_rx_queue_recorded(skb)) { u16 rx_queue = skb_get_rx_queue(skb); - if (unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue)) + if (force_set || + unlikely(READ_ONCE(sk->sk_rx_queue_mapping) != rx_queue)) WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue); } #endif } +static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) +{ + __sk_rx_queue_set(sk, skb, true); +} + +static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb) +{ + __sk_rx_queue_set(sk, skb, false); +} + static inline void sk_rx_queue_clear(struct sock *sk) { #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING -- cgit v1.2.3 From 213f5f8f31f10aa1e83187ae20fb7fa4e626b724 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Dec 2021 18:26:35 -0800 Subject: ipv4: convert fib_num_tclassid_users to atomic_t Before commit faa041a40b9f ("ipv4: Create cleanup helper for fib_nh") changes to net->ipv4.fib_num_tclassid_users were protected by RTNL. After the change, this is no longer the case, as free_fib_info_rcu() runs after rcu grace period, without rtnl being held. Fixes: faa041a40b9f ("ipv4: Create cleanup helper for fib_nh") Signed-off-by: Eric Dumazet Cc: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- include/net/netns/ipv4.h | 2 +- net/ipv4/fib_frontend.c | 2 +- net/ipv4/fib_rules.c | 4 ++-- net/ipv4/fib_semantics.c | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index ab5348e57db1..3417ba2d27ad 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -438,7 +438,7 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { - return net->ipv4.fib_num_tclassid_users; + return atomic_read(&net->ipv4.fib_num_tclassid_users); } #else static inline int fib_num_tclassid_users(struct net *net) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2f65701a43c9..6c5b2efc4f17 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -65,7 +65,7 @@ struct netns_ipv4 { bool fib_has_custom_local_routes; bool fib_offload_disabled; #ifdef CONFIG_IP_ROUTE_CLASSID - int fib_num_tclassid_users; + atomic_t fib_num_tclassid_users; #endif struct hlist_head *fib_table_hash; struct sock *fibnl; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 9fe13e4f5d08..4d61ddd8a0ec 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1582,7 +1582,7 @@ static int __net_init fib_net_init(struct net *net) int error; #ifdef CONFIG_IP_ROUTE_CLASSID - net->ipv4.fib_num_tclassid_users = 0; + atomic_set(&net->ipv4.fib_num_tclassid_users, 0); #endif error = ip_fib_net_init(net); if (error < 0) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 364ad3446b2f..d279cb8ac158 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -264,7 +264,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (tb[FRA_FLOW]) { rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); if (rule4->tclassid) - net->ipv4.fib_num_tclassid_users++; + atomic_inc(&net->ipv4.fib_num_tclassid_users); } #endif @@ -296,7 +296,7 @@ static int fib4_rule_delete(struct fib_rule *rule) #ifdef CONFIG_IP_ROUTE_CLASSID if (((struct fib4_rule *)rule)->tclassid) - net->ipv4.fib_num_tclassid_users--; + atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif net->ipv4.fib_has_custom_rules = true; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3364cb9c67e0..fde7797b5806 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -220,7 +220,7 @@ void fib_nh_release(struct net *net, struct fib_nh *fib_nh) { #ifdef CONFIG_IP_ROUTE_CLASSID if (fib_nh->nh_tclassid) - net->ipv4.fib_num_tclassid_users--; + atomic_dec(&net->ipv4.fib_num_tclassid_users); #endif fib_nh_common_release(&fib_nh->nh_common); } @@ -632,7 +632,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh, #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; if (nh->nh_tclassid) - net->ipv4.fib_num_tclassid_users++; + atomic_inc(&net->ipv4.fib_num_tclassid_users); #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->fib_nh_weight = nh_weight; -- cgit v1.2.3 From 72f6a45202f20f0e1a46b0acb7803369cc53d0b8 Mon Sep 17 00:00:00 2001 From: Xiayu Zhang Date: Wed, 1 Dec 2021 10:57:13 +0800 Subject: Fix Comment of ETH_P_802_3_MIN The description of ETH_P_802_3_MIN is misleading. The value of EthernetType in Ethernet II frame is more than 0x0600, the value of Length in 802.3 frame is less than 0x0600. Signed-off-by: Xiayu Zhang Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 5da4ee234e0b..c0c2f3ed5729 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -117,7 +117,7 @@ #define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ -#define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is less than this value +#define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is more than this value * then the frame is Ethernet II. Else it is 802.3 */ /* -- cgit v1.2.3