From b2403a61308533c576c9dd783fcb73a9186e0b37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 14 Feb 2022 09:15:34 -0500 Subject: fs, net: Move read_descriptor_t to net.h fs.h has no more need for this typedef; networking is now the sole user of the read_descriptor_t. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Al Viro Acked-by: Al Viro --- include/linux/net.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux/net.h') diff --git a/include/linux/net.h b/include/linux/net.h index ba736b457a06..12093f4db50c 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -125,6 +125,25 @@ struct socket { struct socket_wq wq; }; +/* + * "descriptor" for what we're up to with a read. + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ +typedef struct { + size_t written; + size_t count; + union { + char __user *buf; + void *data; + } arg; + int error; +} read_descriptor_t; + struct vm_area_struct; struct page; struct sockaddr; -- cgit v1.2.3 From 965b57b469a589d64d81b1688b38dcb537011bb0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Jun 2022 09:20:12 -0700 Subject: net: Introduce a new proto_ops ->read_skb() Currently both splice() and sockmap use ->read_sock() to read skb from receive queue, but for sockmap we only read one entire skb at a time, so ->read_sock() is too conservative to use. Introduce a new proto_ops ->read_skb() which supports this semantic, with this we can finally pass the ownership of skb to recv actors. For non-TCP protocols, all ->read_sock() can be simply converted to ->read_skb(). 
Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20220615162014.89193-3-xiyou.wangcong@gmail.com --- include/linux/net.h | 4 ++++ include/net/tcp.h | 3 +-- include/net/udp.h | 3 +-- net/core/skmsg.c | 20 +++++--------------- net/ipv4/af_inet.c | 3 ++- net/ipv4/tcp.c | 9 +++------ net/ipv4/udp.c | 10 ++++------ net/ipv6/af_inet6.c | 3 ++- net/unix/af_unix.c | 23 +++++++++-------------- 9 files changed, 31 insertions(+), 47 deletions(-) (limited to 'include/linux/net.h') diff --git a/include/linux/net.h b/include/linux/net.h index 12093f4db50c..a03485e8cbb2 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -152,6 +152,8 @@ struct module; struct sk_buff; typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, unsigned int, size_t); +typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *); + struct proto_ops { int family; @@ -214,6 +216,8 @@ struct proto_ops { */ int (*read_sock)(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); + /* This is different from read_sock(), it reads an entire skb at a time. 
*/ + int (*read_skb)(struct sock *sk, skb_read_actor_t recv_actor); int (*sendpage_locked)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, diff --git a/include/net/tcp.h b/include/net/tcp.h index 7547d90fbb57..8e48dc56837b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -672,8 +672,7 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); void tcp_initialize_rcv_mss(struct sock *sk); diff --git a/include/net/udp.h b/include/net/udp.h index b60eea2e3fae..987f7fc7c0aa 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -306,8 +306,7 @@ struct sock *__udp6_lib_lookup(struct net *net, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); -int udp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 7e03f96e441b..f7f63b7d990c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -1160,21 +1160,17 @@ static void sk_psock_done_strp(struct sk_psock *psock) } #endif /* CONFIG_BPF_STREAM_PARSER */ -static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, - unsigned int offset, size_t orig_len) +static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) { - struct sock *sk = (struct sock *)desc->arg.data; struct sk_psock *psock; struct bpf_prog *prog; int ret = __SK_DROP; - int len = orig_len; + int len = skb->len; /* clone here 
so sk_eat_skb() in tcp_read_sock does not drop our data */ skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) { - desc->error = -ENOMEM; + if (!skb) return 0; - } rcu_read_lock(); psock = sk_psock(sk); @@ -1204,16 +1200,10 @@ out: static void sk_psock_verdict_data_ready(struct sock *sk) { struct socket *sock = sk->sk_socket; - read_descriptor_t desc; - if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + if (unlikely(!sock || !sock->ops || !sock->ops->read_skb)) return; - - desc.arg.data = sk; - desc.error = 0; - desc.count = 1; - - sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); + sock->ops->read_skb(sk, sk_psock_verdict_recv); } void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index da81f56fdd1c..7abd652a558f 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1040,6 +1040,7 @@ const struct proto_ops inet_stream_ops = { .sendpage = inet_sendpage, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, .sendpage_locked = tcp_sendpage_locked, .peek_len = tcp_peek_len, @@ -1067,7 +1068,7 @@ const struct proto_ops inet_dgram_ops = { .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, - .read_sock = udp_read_sock, + .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 124f384f8695..9d2fd3ced21b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1734,8 +1734,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, } EXPORT_SYMBOL(tcp_read_sock); -int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; @@ -1750,7 +1749,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, int used; 
__skb_unlink(skb, &sk->sk_receive_queue); - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -1765,9 +1764,7 @@ int tcp_read_skb(struct sock *sk, read_descriptor_t *desc, break; } consume_skb(skb); - if (!desc->count) - break; - WRITE_ONCE(tp->copied_seq, seq); + break; } WRITE_ONCE(tp->copied_seq, seq); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6172b4750a88..c660b0bc4d14 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1797,8 +1797,7 @@ busy_check: } EXPORT_SYMBOL(__skb_recv_udp); -int udp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { int copied = 0; @@ -1820,7 +1819,7 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, continue; } - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -1831,13 +1830,12 @@ int udp_read_sock(struct sock *sk, read_descriptor_t *desc, } kfree_skb(skb); - if (!desc->count) - break; + break; } return copied; } -EXPORT_SYMBOL(udp_read_sock); +EXPORT_SYMBOL(udp_read_skb); /* * This should be easy, if there is something there we diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 658823e91eca..0ee0770e79aa 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -702,6 +702,7 @@ const struct proto_ops inet6_stream_ops = { .sendpage_locked = tcp_sendpage_locked, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, + .read_skb = tcp_read_skb, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, @@ -727,7 +728,7 @@ const struct proto_ops inet6_dgram_ops = { .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet6_sendmsg, /* retpoline's sake */ .recvmsg = inet6_recvmsg, /* retpoline's sake */ - .read_sock = udp_read_sock, + .read_skb = udp_read_skb, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, .set_peek_off = 
sk_set_peek_off, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3453e0053f76..1bed3739768c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -741,10 +741,8 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); -static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); -static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); +static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); @@ -798,7 +796,7 @@ static const struct proto_ops unix_stream_ops = { .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, - .read_sock = unix_stream_read_sock, + .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, @@ -823,7 +821,7 @@ static const struct proto_ops unix_dgram_ops = { .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, - .read_sock = unix_read_sock, + .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, @@ -2487,8 +2485,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si return __unix_dgram_recvmsg(sk, msg, size, flags); } -static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { int copied = 0; @@ -2503,7 +2500,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, 
if (!skb) return err; - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -2514,8 +2511,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, } kfree_skb(skb); - if (!desc->count) - break; + break; } return copied; @@ -2650,13 +2646,12 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } #endif -static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { if (unlikely(sk->sk_state != TCP_ESTABLISHED)) return -ENOTCONN; - return unix_read_sock(sk, desc, recv_actor); + return unix_read_skb(sk, recv_actor); } static int unix_stream_read_generic(struct unix_stream_read_state *state, -- cgit v1.2.3 From 62df90b53e6f332bb69b73621998826c49a17323 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sun, 19 Jun 2022 15:46:41 +0800 Subject: net, lib/once: remove {net_}get_random_once_wait macro DO_ONCE(func, ...) will call func while holding a spinlock, which is acquired by spin_lock_irqsave in __do_once_start. But get_random_once_wait will sleep in get_random_bytes_wait -> wait_for_random_bytes. Fortunately, there is no user of {net_}get_random_once_wait, so we can simply remove them. Link: https://lkml.kernel.org/r/20220619074641.40916-1-wuchi.zero@gmail.com Signed-off-by: wuchi Acked-by: Jakub Kicinski Cc: David S. Miller Cc: Eric Dumazet Cc: Paolo Abeni Signed-off-by: Andrew Morton --- include/linux/net.h | 2 -- include/linux/once.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/linux/net.h') diff --git a/include/linux/net.h b/include/linux/net.h index 12093f4db50c..8613772a1f58 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -303,8 +303,6 @@ do { \ #define net_get_random_once(buf, nbytes) \ get_random_once((buf), (nbytes)) -#define net_get_random_once_wait(buf, nbytes) \ - get_random_once_wait((buf), (nbytes)) /* * E.g. 
XFS meta- & log-data is in slab pages, or bcache meta diff --git a/include/linux/once.h b/include/linux/once.h index f54523052bbc..b14d8b309d52 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -54,7 +54,5 @@ void __do_once_done(bool *done, struct static_key_true *once_key, #define get_random_once(buf, nbytes) \ DO_ONCE(get_random_bytes, (buf), (nbytes)) -#define get_random_once_wait(buf, nbytes) \ - DO_ONCE(get_random_bytes_wait, (buf), (nbytes)) \ #endif /* _LINUX_ONCE_H */ -- cgit v1.2.3 From e993ffe3da4bcddea0536b03be1031bf35cd8d85 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 21 Oct 2022 11:16:39 +0100 Subject: net: flag sockets supporting msghdr originated zerocopy We need an efficient way in io_uring to check whether a socket supports zerocopy with msghdr provided ubuf_info. Add a new flag into the struct socket flags fields. Cc: # 6.0 Signed-off-by: Pavel Begunkov Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/3dafafab822b1c66308bb58a0ac738b1e3f53f74.1666346426.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/net.h | 1 + net/ipv4/tcp.c | 1 + net/ipv4/udp.c | 1 + 3 files changed, 3 insertions(+) (limited to 'include/linux/net.h') diff --git a/include/linux/net.h b/include/linux/net.h index 711c3593c3b8..18d942bbdf6e 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,7 @@ struct net; #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define SOCK_SUPPORT_ZC 5 #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f8232811a5be..ef14efa1fb70 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); diff --git a/net/ipv4/udp.c 
b/net/ipv4/udp.c index 662d717d5123..1c646797cc79 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1624,6 +1624,7 @@ int udp_init_sock(struct sock *sk) { skb_queue_head_init(&udp_sk(sk)->reader_queue); sk->sk_destruct = udp_destruct_sock; + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return 0; } -- cgit v1.2.3