diff options
Diffstat (limited to 'include/net')
220 files changed, 11675 insertions, 4993 deletions
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index 60cad0d200a4..fd7a034b8278 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -24,6 +24,8 @@ * @P9_DEBUG_PKT: packet marshalling/unmarshalling * @P9_DEBUG_FSC: FS-cache tracing * @P9_DEBUG_VPKT: Verbose packet debugging (full packet dump) + * @P9_DEBUG_CACHE: cache operations tracing + * @P9_DEBUG_MMAP: memory-mapped I/O tracing * * These flags are passed at mount time to turn on various levels of * verbosity and tracing which will be output to the system logs. @@ -68,13 +70,39 @@ void _p9_debug(enum p9_debug_flags level, const char *func, * @P9_RSYMLINK: make symlink response * @P9_TMKNOD: create a special file object request * @P9_RMKNOD: create a special file object response + * @P9_TLOPEN: open a file for I/O (9P2000.L) + * @P9_RLOPEN: response with qid and iounit (9P2000.L) * @P9_TLCREATE: prepare a handle for I/O on an new file for 9P2000.L * @P9_RLCREATE: response with file access information for 9P2000.L * @P9_TRENAME: rename request * @P9_RRENAME: rename response - * @P9_TMKDIR: create a directory request - * @P9_RMKDIR: create a directory response - * @P9_TVERSION: version handshake request + * @P9_TREADLINK: read symbolic link target (9P2000.L) + * @P9_RREADLINK: response with symbolic link target (9P2000.L) + * @P9_TGETATTR: get file attributes request (9P2000.L) + * @P9_RGETATTR: get file attributes response (9P2000.L) + * @P9_TSETATTR: set file attributes request (9P2000.L) + * @P9_RSETATTR: set file attributes response (9P2000.L) + * @P9_TXATTRWALK: prepare to read/list extended attributes (9P2000.L) + * @P9_RXATTRWALK: response with extended attribute size (9P2000.L) + * @P9_TXATTRCREATE: prepare to set extended attribute (9P2000.L) + * @P9_RXATTRCREATE: set extended attribute response (9P2000.L) + * @P9_TREADDIR: read directory entries request (9P2000.L) + * @P9_RREADDIR: read directory entries response (9P2000.L) + * @P9_TFSYNC: flush cached file data to storage request (9P2000.L) + * @P9_RFSYNC: flush cached file data to storage response (9P2000.L) + * @P9_TLOCK: acquire or release a POSIX record lock (9P2000.L) + * @P9_RLOCK: POSIX record lock response (9P2000.L) + * @P9_TGETLOCK: test for existence of POSIX record lock (9P2000.L) + * @P9_RGETLOCK: POSIX record lock test response (9P2000.L) + * @P9_TLINK: create a hard link (9P2000.L) + * @P9_RLINK: hard link response (9P2000.L) + * @P9_TRENAMEAT: safely rename across directories (9P2000.L) + * @P9_RRENAMEAT: rename response (9P2000.L) + * @P9_TUNLINKAT: unlink a file or directory (9P2000.L) + * @P9_RUNLINKAT: unlink response (9P2000.L) + * @P9_TMKDIR: create a directory request (9P2000.L) + * @P9_RMKDIR: create a directory response (9P2000.L) + * @P9_TVERSION: negotiate protocol version and message size * @P9_RVERSION: version handshake response * @P9_TAUTH: request to establish authentication channel * @P9_RAUTH: response with authentication information @@ -194,6 +222,10 @@ enum p9_msg_t { * @P9_ORCLOSE: remove the file when the file is closed * @P9_OAPPEND: open the file and seek to the end * @P9_OEXCL: only create a file, do not open it + * @P9L_MODE_MASK: mask for protocol mode bits (client-side only) + * @P9L_DIRECT: disable client-side caching for this file + * @P9L_NOWRITECACHE: disable write caching for this file + * @P9L_LOOSE: enable loose cache consistency * * 9P open modes differ slightly from Posix standard modes. * In particular, there are extra modes which specify different diff --git a/include/net/9p/client.h b/include/net/9p/client.h index 4f785098c67a..838a94218b59 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -16,6 +16,12 @@ /* Number of requests per row */ #define P9_ROW_MAXTAG 255 +/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ + + * room for write (16 extra) or read (11 extra) operands. + */ + +#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ) + /** enum p9_proto_versions - 9P protocol versions * @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u * @p9_proto_2000u: 9P2000.u extension @@ -127,6 +133,96 @@ struct p9_client { }; /** + * struct p9_fd_opts - holds client options during parsing + * @msize: maximum data size negotiated by protocol + * @prot-Oversion: 9P protocol version to use + * @trans_mod: module API instantiated with this client + * + * These parsed options get transferred into client in + * apply_client_options() + */ +struct p9_client_opts { + unsigned int msize; + unsigned char proto_version; + struct p9_trans_module *trans_mod; +}; + +/** + * struct p9_fd_opts - per-transport options for fd transport + * @rfd: file descriptor for reading (trans=fd) + * @wfd: file descriptor for writing (trans=fd) + * @port: port to connect to (trans=tcp) + * @privport: port is privileged + */ +struct p9_fd_opts { + int rfd; + int wfd; + u16 port; + bool privport; +}; + +/** + * struct p9_rdma_opts - Collection of mount options for rdma transport + * @port: port of connection + * @privport: Whether a privileged port may be used + * @sq_depth: The requested depth of the SQ. This really doesn't need + * to be any deeper than the number of threads used in the client + * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth + * @timeout: Time to wait in msecs for CM events + */ +struct p9_rdma_opts { + short port; + bool privport; + int sq_depth; + int rq_depth; + long timeout; +}; + +/** + * struct p9_session_opts - holds parsed options for v9fs_session_info + * @flags: session options of type &p9_session_flags + * @nodev: set to 1 to disable device mapping + * @debug: debug level + * @afid: authentication handle + * @cache: cache mode of type &p9_cache_bits + * @cachetag: the tag of the cache associated with this session + * @uname: string user name to mount hierarchy as + * @aname: mount specifier for remote hierarchy + * @dfltuid: default numeric userid to mount hierarchy as + * @dfltgid: default numeric groupid to mount hierarchy as + * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy + * @session_lock_timeout: retry interval for blocking locks + * + * This strucure holds options which are parsed and will be transferred + * to the v9fs_session_info structure when mounted, and therefore largely + * duplicates struct v9fs_session_info. + */ +struct p9_session_opts { + unsigned int flags; + unsigned char nodev; + unsigned short debug; + unsigned int afid; + unsigned int cache; +#ifdef CONFIG_9P_FSCACHE + char *cachetag; +#endif + char *uname; + char *aname; + kuid_t dfltuid; + kgid_t dfltgid; + kuid_t uid; + long session_lock_timeout; +}; + +/* Used by mount API to store parsed mount options */ +struct v9fs_context { + struct p9_client_opts client_opts; + struct p9_fd_opts fd_opts; + struct p9_rdma_opts rdma_opts; + struct p9_session_opts session_opts; +}; + +/** * struct p9_fid - file system entity handle * @clnt: back pointer to instantiating &p9_client * @fid: numeric identifier for this handle @@ -183,7 +279,7 @@ int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid, const char *name); int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, struct p9_fid *newdirfid, const char *new_name); -struct p9_client *p9_client_create(const char *dev_name, char *options); +struct p9_client *p9_client_create(struct fs_context *fc); void p9_client_destroy(struct p9_client *clnt); void p9_client_disconnect(struct p9_client *clnt); void p9_client_begin_disconnect(struct p9_client *clnt); diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index 766ec07c9599..a912bbaa862f 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -14,6 +14,13 @@ #define P9_DEF_MIN_RESVPORT (665U) #define P9_DEF_MAX_RESVPORT (1023U) +#define P9_FD_PORT 564 + +#define P9_RDMA_PORT 5640 +#define P9_RDMA_SQ_DEPTH 32 +#define P9_RDMA_RQ_DEPTH 32 +#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ + /** * struct p9_trans_module - transport module interface * @list: used to maintain a list of currently available transports @@ -24,6 +31,9 @@ * we're less flexible when choosing the response message * size in this case * @def: set if this transport should be considered the default + * @supports_vmalloc: set if this transport can work with vmalloc'd buffers + * (non-physically contiguous memory). Transports requiring + * DMA should leave this as false. * @create: member function to create a new connection on this transport * @close: member function to discard a connection on this transport * @request: member function to issue a request to the transport @@ -43,10 +53,11 @@ struct p9_trans_module { char *name; /* name of transport */ int maxsize; /* max message size of transport */ bool pooled_rbuffers; - int def; /* this transport should be default */ + bool def; /* this transport should be default */ + bool supports_vmalloc; /* can work with vmalloc'd buffers */ struct module *owner; int (*create)(struct p9_client *client, - const char *devname, char *args); + struct fs_context *fc); void (*close)(struct p9_client *client); int (*request)(struct p9_client *client, struct p9_req_t *req); int (*cancel)(struct p9_client *client, struct p9_req_t *req); diff --git a/include/net/Space.h b/include/net/Space.h index ef42629f4258..6a0b6674d930 100644 --- a/include/net/Space.h +++ b/include/net/Space.h @@ -3,10 +3,5 @@ * ethernet adaptor have the name "eth[0123...]". */ -struct net_device *ultra_probe(int unit); -struct net_device *wd_probe(int unit); struct net_device *ne_probe(int unit); -struct net_device *smc_init(int unit); struct net_device *cs89x0_probe(int unit); -struct net_device *tc515_probe(int unit); -struct net_device *lance_probe(int unit); diff --git a/include/net/act_api.h b/include/net/act_api.h index 404df8557f6a..fd2967ee08f7 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -33,7 +33,10 @@ struct tc_action { struct tcf_t tcfa_tm; struct gnet_stats_basic_sync tcfa_bstats; struct gnet_stats_basic_sync tcfa_bstats_hw; - struct gnet_stats_queue tcfa_qstats; + + atomic_t tcfa_drops; + atomic_t tcfa_overlimits; + struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; struct gnet_stats_basic_sync __percpu *cpu_bstats; @@ -42,6 +45,7 @@ struct tc_action { struct tc_cookie __rcu *user_cookie; struct tcf_chain __rcu *goto_chain; u32 tcfa_flags; + struct rcu_head tcfa_rcu; u8 hw_stats; u8 used_hw_stats; bool used_hw_stats_valid; @@ -53,7 +57,6 @@ struct tc_action { #define tcf_action common.tcfa_action #define tcf_tm common.tcfa_tm #define tcf_bstats common.tcfa_bstats -#define tcf_qstats common.tcfa_qstats #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock @@ -68,6 +71,7 @@ struct tc_action { #define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2)) #define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3)) #define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4)) +#define TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT (1U << (TCA_ACT_FLAGS_USER_BITS + 5)) /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. @@ -76,19 +80,24 @@ static inline void tcf_lastuse_update(struct tcf_t *tm) { unsigned long now = jiffies; - if (tm->lastuse != now) - tm->lastuse = now; - if (unlikely(!tm->firstuse)) - tm->firstuse = now; + if (READ_ONCE(tm->lastuse) != now) + WRITE_ONCE(tm->lastuse, now); + if (unlikely(!READ_ONCE(tm->firstuse))) + WRITE_ONCE(tm->firstuse, now); } static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) { - dtm->install = jiffies_to_clock_t(jiffies - stm->install); - dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse); - dtm->firstuse = stm->firstuse ? - jiffies_to_clock_t(jiffies - stm->firstuse) : 0; - dtm->expires = jiffies_to_clock_t(stm->expires); + unsigned long firstuse, now = jiffies; + + dtm->install = jiffies_to_clock_t(now - READ_ONCE(stm->install)); + dtm->lastuse = jiffies_to_clock_t(now - READ_ONCE(stm->lastuse)); + + firstuse = READ_ONCE(stm->firstuse); + dtm->firstuse = firstuse ? + jiffies_to_clock_t(now - firstuse) : 0; + + dtm->expires = jiffies_to_clock_t(READ_ONCE(stm->expires)); } static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) @@ -152,7 +161,7 @@ int tc_action_net_init(struct net *net, struct tc_action_net *tn, { int err = 0; - tn->idrinfo = kmalloc(sizeof(*tn->idrinfo), GFP_KERNEL); + tn->idrinfo = kmalloc_obj(*tn->idrinfo); if (!tn->idrinfo) return -ENOMEM; tn->ops = ops; @@ -170,14 +179,12 @@ static inline void tc_action_net_exit(struct list_head *net_list, { struct net *net; - rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { struct tc_action_net *tn = net_generic(net, id); tcf_idrinfo_destroy(tn->ops, tn->idrinfo); kfree(tn->idrinfo); } - rtnl_unlock(); } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, @@ -238,9 +245,7 @@ static inline void tcf_action_inc_drop_qstats(struct tc_action *a) qstats_drop_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_drop_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_drops); } static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) @@ -249,9 +254,7 @@ static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a) qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats)); return; } - spin_lock(&a->tcfa_lock); - qstats_overlimit_inc(&a->tcfa_qstats); - spin_unlock(&a->tcfa_lock); + atomic_inc(&a->tcfa_overlimits); } void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 9e5e95988b9e..9e96776945e5 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -8,7 +8,8 @@ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ -#define TEMP_VALID_LIFETIME (7*86400) /* 1 week */ +/* TEMP_VALID_LIFETIME default value as specified in RFC 8981 3.8 */ +#define TEMP_VALID_LIFETIME (2*86400) /* 2 days */ #define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */ #define REGEN_MIN_ADVANCE (2) /* 2 seconds */ #define REGEN_MAX_RETRY (3) @@ -347,6 +348,11 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +static inline struct inet6_dev *in6_dev_rcu(const struct net_device *dev) +{ + return rcu_dereference(dev->ip6_ptr); +} + static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev) { return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr); diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 0754c463224a..0fb4c41c9bbf 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -16,6 +16,7 @@ struct sock; struct socket; struct rxrpc_call; struct rxrpc_peer; +struct krb5_buffer; enum rxrpc_abort_reason; enum rxrpc_interruptibility { @@ -24,23 +25,33 @@ enum rxrpc_interruptibility { RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */ }; +enum rxrpc_oob_type { + RXRPC_OOB_CHALLENGE, /* Security challenge for a connection */ +}; + /* * Debug ID counter for tracing. */ extern atomic_t rxrpc_debug_id; +/* + * Operations table for rxrpc to call out to a kernel application (e.g. kAFS). + */ +struct rxrpc_kernel_ops { + void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call, + unsigned long user_call_ID); + void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID); + void (*notify_oob)(struct sock *sk, struct sk_buff *oob); +}; + typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *, unsigned long); typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *, - unsigned long); -typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long); -typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long); -void rxrpc_kernel_new_call_notification(struct socket *, - rxrpc_notify_new_call_t, - rxrpc_discard_new_call_t); +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops); struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, struct key *key, @@ -69,17 +80,36 @@ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer); struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call); const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer); const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); +unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data); +unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer); unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); -int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, - rxrpc_user_attach_call_t, unsigned long, gfp_t, - unsigned int); +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, + unsigned long user_call_ID, gfp_t gfp, + unsigned int debug_id); void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); -u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, - unsigned long); int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); int rxrpc_sock_set_security_keyring(struct sock *, struct key *); +int rxrpc_sock_set_manage_response(struct sock *sk, bool set); + +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata); +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type); +void rxrpc_kernel_free_oob(struct sk_buff *oob); +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index); +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why); +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge); +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge); +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata); +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype); #endif /* _NET_RXRPC_H */ diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 63129c79b8cb..34f53dde65ce 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -2,11 +2,15 @@ #ifndef __LINUX_NET_AFUNIX_H #define __LINUX_NET_AFUNIX_H -#include <linux/socket.h> -#include <linux/un.h> +#include <linux/atomic.h> #include <linux/mutex.h> +#include <linux/net.h> +#include <linux/path.h> #include <linux/refcount.h> +#include <linux/spinlock.h> +#include <linux/wait.h> #include <net/sock.h> +#include <uapi/linux/un.h> #if IS_ENABLED(CONFIG_UNIX) struct unix_sock *unix_get_socket(struct file *filp); @@ -17,61 +21,17 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) } #endif -extern unsigned int unix_tot_inflight; -void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); -void unix_del_edges(struct scm_fp_list *fpl); -void unix_update_edges(struct unix_sock *receiver); -int unix_prepare_fpl(struct scm_fp_list *fpl); -void unix_destroy_fpl(struct scm_fp_list *fpl); -void unix_gc(void); -void wait_for_unix_gc(struct scm_fp_list *fpl); - -struct unix_vertex { - struct list_head edges; - struct list_head entry; - struct list_head scc_entry; - unsigned long out_degree; - unsigned long index; - unsigned long scc_index; -}; - -struct unix_edge { - struct unix_sock *predecessor; - struct unix_sock *successor; - struct list_head vertex_entry; - struct list_head stack_entry; -}; - -struct sock *unix_peer_get(struct sock *sk); - -#define UNIX_HASH_MOD (256 - 1) -#define UNIX_HASH_SIZE (256 * 2) -#define UNIX_HASH_BITS 8 - struct unix_address { refcount_t refcnt; int len; struct sockaddr_un name[]; }; -struct unix_skb_parms { - struct pid *pid; /* Skb credentials */ - kuid_t uid; - kgid_t gid; - struct scm_fp_list *fp; /* Passed files */ -#ifdef CONFIG_SECURITY_NETWORK - u32 secid; /* Security ID */ -#endif - u32 consumed; -} __randomize_layout; - struct scm_stat { atomic_t nr_fds; unsigned long nr_unix_fds; }; -#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) - /* The AF_UNIX socket */ struct unix_sock { /* WARNING: sk has to be the first member */ @@ -84,8 +44,11 @@ struct unix_sock { struct unix_vertex *vertex; spinlock_t lock; struct socket_wq peer_wq; +#define peer_wait peer_wq.wait wait_queue_entry_t peer_wake; struct scm_stat scm_stat; + int inq_len; + bool recvmsg_inq; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) struct sk_buff *oob_skb; #endif @@ -97,32 +60,4 @@ struct unix_sock { #define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) #define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) -#define peer_wait peer_wq.wait - -long unix_inq_len(struct sock *sk); -long unix_outq_len(struct sock *sk); - -int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, - int flags); -int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, - int flags); -#ifdef CONFIG_SYSCTL -int unix_sysctl_register(struct net *net); -void unix_sysctl_unregister(struct net *net); -#else -static inline int unix_sysctl_register(struct net *net) { return 0; } -static inline void unix_sysctl_unregister(struct net *net) {} -#endif - -#ifdef CONFIG_BPF_SYSCALL -extern struct proto unix_dgram_proto; -extern struct proto unix_stream_proto; - -int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); -int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); -void __init unix_bpf_build_proto(void); -#else -static inline void __init unix_bpf_build_proto(void) -{} -#endif #endif diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 9e85424c8343..4e40063adab4 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/workqueue.h> +#include <net/netns/vsock.h> #include <net/sock.h> #include <uapi/linux/vm_sockets.h> @@ -124,7 +125,7 @@ struct vsock_transport { size_t len, int flags); int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *, struct msghdr *, size_t len); - bool (*dgram_allow)(u32 cid, u32 port); + bool (*dgram_allow)(struct vsock_sock *vsk, u32 cid, u32 port); /* STREAM. */ /* TODO: stream_bind() */ @@ -136,14 +137,14 @@ struct vsock_transport { s64 (*stream_has_space)(struct vsock_sock *); u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); - bool (*stream_allow)(u32 cid, u32 port); + bool (*stream_allow)(struct vsock_sock *vsk, u32 cid, u32 port); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, int flags); int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg, size_t len); - bool (*seqpacket_allow)(u32 remote_cid); + bool (*seqpacket_allow)(struct vsock_sock *vsk, u32 remote_cid); u32 (*seqpacket_has_data)(struct vsock_sock *vsk); /* Notification. */ @@ -178,6 +179,15 @@ struct vsock_transport { /* Addressing. */ u32 (*get_local_cid)(void); + /* Check if this transport serves a specific remote CID. + * For H2G transports: return true if the CID belongs to a registered + * guest. If not implemented, all CIDs > VMADDR_CID_HOST go to H2G. + * For G2H transports: return true if the transport can reach arbitrary + * CIDs via the hypervisor (i.e. supports the fallback overlay). VMCI + * does not implement this as it only serves CIDs 0 and 2. + */ + bool (*has_remote_cid)(struct vsock_sock *vsk, u32 remote_cid); + /* Read a single skb */ int (*read_skb)(struct vsock_sock *, skb_read_actor_t); @@ -216,11 +226,17 @@ void vsock_remove_connected(struct vsock_sock *vsk); struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr); struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, struct sockaddr_vm *dst); +struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr, + struct net *net); +struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src, + struct sockaddr_vm *dst, + struct net *net); void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(struct vsock_transport *transport, void (*fn)(struct sock *sk)); int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk); bool vsock_find_cid(unsigned int cid); +void vsock_linger(struct sock *sk); /**** TAP ****/ @@ -242,8 +258,8 @@ int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); -#ifdef CONFIG_BPF_SYSCALL extern struct proto vsock_proto; +#ifdef CONFIG_BPF_SYSCALL int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void __init vsock_bpf_build_proto(void); #else @@ -255,4 +271,62 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t) { return t->msgzerocopy_allow && t->msgzerocopy_allow(); } + +static inline enum vsock_net_mode vsock_net_mode(struct net *net) +{ + if (!net) + return VSOCK_NET_MODE_GLOBAL; + + return READ_ONCE(net->vsock.mode); +} + +static inline bool vsock_net_mode_global(struct vsock_sock *vsk) +{ + return vsock_net_mode(sock_net(sk_vsock(vsk))) == VSOCK_NET_MODE_GLOBAL; +} + +static inline bool vsock_net_set_child_mode(struct net *net, + enum vsock_net_mode mode) +{ + int new_locked = mode + 1; + int old_locked = 0; /* unlocked */ + + if (try_cmpxchg(&net->vsock.child_ns_mode_locked, + &old_locked, new_locked)) { + WRITE_ONCE(net->vsock.child_ns_mode, mode); + return true; + } + + return old_locked == new_locked; +} + +static inline enum vsock_net_mode vsock_net_child_mode(struct net *net) +{ + return READ_ONCE(net->vsock.child_ns_mode); +} + +/* Return true if two namespaces pass the mode rules. Otherwise, return false. + * + * A NULL namespace is treated as VSOCK_NET_MODE_GLOBAL. + * + * Read more about modes in the comment header of net/vmw_vsock/af_vsock.c. + */ +static inline bool vsock_net_check_mode(struct net *ns0, struct net *ns1) +{ + enum vsock_net_mode mode0, mode1; + + /* Any vsocks within the same network namespace are always reachable, + * regardless of the mode. + */ + if (net_eq(ns0, ns1)) + return true; + + mode0 = vsock_net_mode(ns0); + mode1 = vsock_net_mode(ns1); + + /* Different namespaces are only reachable if they are both + * global mode. + */ + return mode0 == VSOCK_NET_MODE_GLOBAL && mode0 == mode1; +} #endif /* __AF_VSOCK_H__ */ diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h new file mode 100644 index 000000000000..e1a1c8aedc79 --- /dev/null +++ b/include/net/aligned_data.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_ALIGNED_DATA_H +#define _NET_ALIGNED_DATA_H + +#include <linux/atomic.h> +#include <linux/types.h> + +/* Structure holding cacheline aligned fields on SMP builds. + * Each field or group should have an ____cacheline_aligned_in_smp + * attribute to ensure no accidental false sharing can happen. + */ +struct net_aligned_data { + atomic64_t net_cookie ____cacheline_aligned_in_smp; +#if defined(CONFIG_INET) + atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; + atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; +#endif +}; + +extern struct net_aligned_data net_aligned_data; + +#endif /* _NET_ALIGNED_DATA_H */ diff --git a/include/net/atmclip.h b/include/net/atmclip.h deleted file mode 100644 index 70e350e0db3d..000000000000 --- a/include/net/atmclip.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* net/atm/atmarp.h - RFC1577 ATM ARP */ - -/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ - - -#ifndef _ATMCLIP_H -#define _ATMCLIP_H - -#include <linux/netdevice.h> -#include <linux/atm.h> -#include <linux/atmdev.h> -#include <linux/atmarp.h> -#include <linux/spinlock.h> -#include <net/neighbour.h> - - -#define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back)) - -struct sk_buff; - -struct clip_vcc { - struct atm_vcc *vcc; /* VCC descriptor */ - struct atmarp_entry *entry; /* ATMARP table entry, NULL if IP addr. - isn't known yet */ - int xoff; /* 1 if send buffer is full */ - unsigned char encap; /* 0: NULL, 1: LLC/SNAP */ - unsigned long last_use; /* last send or receive operation */ - unsigned long idle_timeout; /* keep open idle for so many jiffies*/ - void (*old_push)(struct atm_vcc *vcc,struct sk_buff *skb); - /* keep old push fn for chaining */ - void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); - /* keep old pop fn for chaining */ - struct clip_vcc *next; /* next VCC */ -}; - - -struct atmarp_entry { - struct clip_vcc *vccs; /* active VCCs; NULL if resolution is - pending */ - unsigned long expires; /* entry expiration time */ - struct neighbour *neigh; /* neighbour back-pointer */ -}; - -#define PRIV(dev) ((struct clip_priv *) netdev_priv(dev)) - -struct clip_priv { - int number; /* for convenience ... */ - spinlock_t xoff_lock; /* ensures that pop is atomic (SMP) */ - struct net_device *next; /* next CLIP interface */ -}; - -#endif diff --git a/include/net/ax25.h b/include/net/ax25.h index 4ee141aae0a2..6b2f518facdb 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -1,487 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * Declarations of AX.25 type objects. - * - * Alan Cox (GW4PTS) 10/11/93 - */ #ifndef _AX25_H -#define _AX25_H +#define _AX25_H #include <linux/ax25.h> -#include <linux/spinlock.h> -#include <linux/timer.h> -#include <linux/list.h> -#include <linux/slab.h> -#include <linux/refcount.h> -#include <net/neighbour.h> -#include <net/sock.h> -#include <linux/seq_file.h> -#define AX25_T1CLAMPLO 1 -#define AX25_T1CLAMPHI (30 * HZ) - -#define AX25_BPQ_HEADER_LEN 16 -#define AX25_KISS_HEADER_LEN 1 - -#define AX25_HEADER_LEN 17 -#define AX25_ADDR_LEN 7 -#define AX25_DIGI_HEADER_LEN (AX25_MAX_DIGIS * AX25_ADDR_LEN) -#define AX25_MAX_HEADER_LEN (AX25_HEADER_LEN + AX25_DIGI_HEADER_LEN) - -/* AX.25 Protocol IDs */ -#define AX25_P_ROSE 0x01 -#define AX25_P_VJCOMP 0x06 /* Compressed TCP/IP packet */ - /* Van Jacobsen (RFC 1144) */ -#define AX25_P_VJUNCOMP 0x07 /* Uncompressed TCP/IP packet */ - /* Van Jacobsen (RFC 1144) */ -#define AX25_P_SEGMENT 0x08 /* Segmentation fragment */ -#define AX25_P_TEXNET 0xc3 /* TEXTNET datagram protocol */ -#define AX25_P_LQ 0xc4 /* Link Quality Protocol */ -#define AX25_P_ATALK 0xca /* Appletalk */ -#define AX25_P_ATALK_ARP 0xcb /* Appletalk ARP */ -#define AX25_P_IP 0xcc /* ARPA Internet Protocol */ -#define AX25_P_ARP 0xcd /* ARPA Address Resolution */ -#define AX25_P_FLEXNET 0xce /* FlexNet */ -#define AX25_P_NETROM 0xcf /* NET/ROM */ -#define AX25_P_TEXT 0xF0 /* No layer 3 protocol impl. */ - -/* AX.25 Segment control values */ -#define AX25_SEG_REM 0x7F -#define AX25_SEG_FIRST 0x80 - -#define AX25_CBIT 0x80 /* Command/Response bit */ -#define AX25_EBIT 0x01 /* HDLC Address Extension bit */ -#define AX25_HBIT 0x80 /* Has been repeated bit */ - -#define AX25_SSSID_SPARE 0x60 /* Unused bits in SSID for standard AX.25 */ -#define AX25_ESSID_SPARE 0x20 /* Unused bits in SSID for extended AX.25 */ -#define AX25_DAMA_FLAG 0x20 /* Well, it is *NOT* unused! (dl1bke 951121 */ - -#define AX25_COND_ACK_PENDING 0x01 -#define AX25_COND_REJECT 0x02 -#define AX25_COND_PEER_RX_BUSY 0x04 -#define AX25_COND_OWN_RX_BUSY 0x08 -#define AX25_COND_DAMA_MODE 0x10 - -#ifndef _LINUX_NETDEVICE_H -#include <linux/netdevice.h> -#endif - -/* Upper sub-layer (LAPB) definitions */ - -/* Control field templates */ -#define AX25_I 0x00 /* Information frames */ -#define AX25_S 0x01 /* Supervisory frames */ -#define AX25_RR 0x01 /* Receiver ready */ -#define AX25_RNR 0x05 /* Receiver not ready */ -#define AX25_REJ 0x09 /* Reject */ -#define AX25_U 0x03 /* Unnumbered frames */ -#define AX25_SABM 0x2f /* Set Asynchronous Balanced Mode */ -#define AX25_SABME 0x6f /* Set Asynchronous Balanced Mode Extended */ -#define AX25_DISC 0x43 /* Disconnect */ -#define AX25_DM 0x0f /* Disconnected mode */ -#define AX25_UA 0x63 /* Unnumbered acknowledge */ -#define AX25_FRMR 0x87 /* Frame reject */ -#define AX25_UI 0x03 /* Unnumbered information */ -#define AX25_XID 0xaf /* Exchange information */ -#define AX25_TEST 0xe3 /* Test */ - -#define AX25_PF 0x10 /* Poll/final bit for standard AX.25 */ -#define AX25_EPF 0x01 /* Poll/final bit for extended AX.25 */ - -#define AX25_ILLEGAL 0x100 /* Impossible to be a real frame type */ - -#define AX25_POLLOFF 0 -#define AX25_POLLON 1 - -/* AX25 L2 C-bit */ -#define AX25_COMMAND 1 -#define AX25_RESPONSE 2 - -/* Define Link State constants. */ - -enum { - AX25_STATE_0, /* Listening */ - AX25_STATE_1, /* SABM sent */ - AX25_STATE_2, /* DISC sent */ - AX25_STATE_3, /* Established */ - AX25_STATE_4 /* Recovery */ -}; - -#define AX25_MODULUS 8 /* Standard AX.25 modulus */ -#define AX25_EMODULUS 128 /* Extended AX.25 modulus */ - -enum { - AX25_PROTO_STD_SIMPLEX, - AX25_PROTO_STD_DUPLEX, -#ifdef CONFIG_AX25_DAMA_SLAVE - AX25_PROTO_DAMA_SLAVE, -#ifdef CONFIG_AX25_DAMA_MASTER - AX25_PROTO_DAMA_MASTER, -#define AX25_PROTO_MAX AX25_PROTO_DAMA_MASTER -#endif -#endif - __AX25_PROTO_MAX, - AX25_PROTO_MAX = __AX25_PROTO_MAX -1 -}; - -enum { - AX25_VALUES_IPDEFMODE, /* 0=DG 1=VC */ - AX25_VALUES_AXDEFMODE, /* 0=Normal 1=Extended Seq Nos */ - AX25_VALUES_BACKOFF, /* 0=None 1=Linear 2=Exponential */ - AX25_VALUES_CONMODE, /* Allow connected modes - 0=No 1=no "PID text" 2=all PIDs */ - AX25_VALUES_WINDOW, /* Default window size for standard AX.25 */ - AX25_VALUES_EWINDOW, /* Default window size for extended AX.25 */ - AX25_VALUES_T1, /* Default T1 timeout value */ - AX25_VALUES_T2, /* Default T2 timeout value */ - AX25_VALUES_T3, /* Default T3 timeout value */ - AX25_VALUES_IDLE, /* Connected mode idle timer */ - AX25_VALUES_N2, /* Default N2 value */ - AX25_VALUES_PACLEN, /* AX.25 MTU */ - AX25_VALUES_PROTOCOL, /* Std AX.25, DAMA Slave, DAMA Master */ -#ifdef CONFIG_AX25_DAMA_SLAVE - AX25_VALUES_DS_TIMEOUT, /* DAMA Slave timeout */ -#endif - AX25_MAX_VALUES /* THIS MUST REMAIN THE LAST ENTRY OF THIS LIST */ -}; - -#define AX25_DEF_IPDEFMODE 0 /* Datagram */ -#define AX25_DEF_AXDEFMODE 0 /* Normal */ -#define AX25_DEF_BACKOFF 1 /* Linear backoff */ -#define AX25_DEF_CONMODE 2 /* Connected mode allowed */ -#define AX25_DEF_WINDOW 2 /* Window=2 */ -#define AX25_DEF_EWINDOW 32 /* Module-128 Window=32 */ -#define AX25_DEF_T1 10000 /* T1=10s */ -#define AX25_DEF_T2 3000 /* T2=3s */ -#define AX25_DEF_T3 300000 /* T3=300s */ -#define AX25_DEF_N2 10 /* N2=10 */ -#define AX25_DEF_IDLE 0 /* Idle=None */ -#define AX25_DEF_PACLEN 256 /* Paclen=256 */ -#define AX25_DEF_PROTOCOL AX25_PROTO_STD_SIMPLEX /* Standard AX.25 */ -#define AX25_DEF_DS_TIMEOUT 180000 /* DAMA timeout 3 minutes */ - -typedef struct ax25_uid_assoc { - struct hlist_node uid_node; - refcount_t refcount; - kuid_t uid; - ax25_address call; -} ax25_uid_assoc; - -#define ax25_uid_for_each(__ax25, list) \ - hlist_for_each_entry(__ax25, list, uid_node) - -#define ax25_uid_hold(ax25) \ - refcount_inc(&((ax25)->refcount)) - -static inline void ax25_uid_put(ax25_uid_assoc *assoc) -{ - if (refcount_dec_and_test(&assoc->refcount)) { - kfree(assoc); - } -} - -typedef struct { - ax25_address calls[AX25_MAX_DIGIS]; - unsigned char repeated[AX25_MAX_DIGIS]; - unsigned char ndigi; - signed char lastrepeat; -} ax25_digi; - -typedef struct ax25_route { - struct ax25_route *next; - ax25_address callsign; - struct net_device *dev; - ax25_digi *digipeat; - char ip_mode; -} ax25_route; - -void __ax25_put_route(ax25_route *ax25_rt); - -extern rwlock_t ax25_route_lock; - -static inline void ax25_route_lock_use(void) -{ - read_lock(&ax25_route_lock); -} - -static inline void ax25_route_lock_unuse(void) -{ - read_unlock(&ax25_route_lock); -} - -typedef struct { - char slave; /* slave_mode? */ - struct timer_list slave_timer; /* timeout timer */ - unsigned short slave_timeout; /* when? */ -} ax25_dama_info; - -struct ctl_table; - -typedef struct ax25_dev { - struct list_head list; - - struct net_device *dev; - netdevice_tracker dev_tracker; - - struct net_device *forward; - struct ctl_table_header *sysheader; - int values[AX25_MAX_VALUES]; -#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) - ax25_dama_info dama; -#endif - refcount_t refcount; - bool device_up; - struct rcu_head rcu; -} ax25_dev; - -typedef struct ax25_cb { - struct hlist_node ax25_node; - ax25_address source_addr, dest_addr; - ax25_digi *digipeat; - ax25_dev *ax25_dev; - netdevice_tracker dev_tracker; - unsigned char iamdigi; - unsigned char state, modulus, pidincl; - unsigned short vs, vr, va; - unsigned char condition, backoff; - unsigned char n2, n2count; - struct timer_list t1timer, t2timer, t3timer, idletimer; - unsigned long t1, t2, t3, idle, rtt; - unsigned short paclen, fragno, fraglen; - struct sk_buff_head write_queue; - struct sk_buff_head reseq_queue; - struct sk_buff_head ack_queue; - struct sk_buff_head frag_queue; - unsigned char window; - struct timer_list timer, dtimer; - struct sock *sk; /* Backlink to socket */ - refcount_t refcount; -} ax25_cb; - -struct ax25_sock { - struct sock sk; - struct ax25_cb *cb; -}; - -#define ax25_sk(ptr) container_of_const(ptr, struct ax25_sock, sk) - -static inline struct ax25_cb *sk_to_ax25(const struct sock *sk) -{ - return ax25_sk(sk)->cb; -} - -#define ax25_for_each(__ax25, list) \ - hlist_for_each_entry(__ax25, list, ax25_node) - -#define ax25_cb_hold(__ax25) \ - refcount_inc(&((__ax25)->refcount)) - -static __inline__ void ax25_cb_put(ax25_cb *ax25) -{ - if (refcount_dec_and_test(&ax25->refcount)) { - kfree(ax25->digipeat); - kfree(ax25); - } -} - -static inline void ax25_dev_hold(ax25_dev *ax25_dev) -{ - refcount_inc(&ax25_dev->refcount); -} - -static inline void ax25_dev_put(ax25_dev *ax25_dev) -{ - if (refcount_dec_and_test(&ax25_dev->refcount)) - kfree_rcu(ax25_dev, rcu); -} -static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) -{ - skb->dev = dev; - skb_reset_mac_header(skb); - skb->pkt_type = PACKET_HOST; - return htons(ETH_P_AX25); -} - -/* af_ax25.c */ -extern struct hlist_head ax25_list; -extern spinlock_t ax25_list_lock; -void ax25_cb_add(ax25_cb *); -struct sock *ax25_find_listener(ax25_address *, int, struct net_device *, int); -struct sock *ax25_get_socket(ax25_address *, ax25_address *, int); -ax25_cb *ax25_find_cb(const ax25_address *, ax25_address *, ax25_digi *, - struct net_device *); -void ax25_send_to_raw(ax25_address *, struct sk_buff *, int); -void ax25_destroy_socket(ax25_cb *); -ax25_cb * __must_check ax25_create_cb(void); -void ax25_fillin_cb(ax25_cb *, ax25_dev *); -struct sock *ax25_make_new(struct sock *, struct ax25_dev *); - -/* ax25_addr.c */ -extern const ax25_address ax25_bcast; -extern const ax25_address ax25_defaddr; -extern const ax25_address null_ax25_address; -char *ax2asc(char *buf, const ax25_address *); -void asc2ax(ax25_address *addr, const char *callsign); -int ax25cmp(const ax25_address *, const ax25_address *); -int ax25digicmp(const ax25_digi *, const ax25_digi *); -const unsigned char *ax25_addr_parse(const unsigned char *, int, - ax25_address *, ax25_address *, ax25_digi *, int *, int *); -int ax25_addr_build(unsigned char *, const ax25_address *, - const ax25_address *, const ax25_digi *, int, int); -int ax25_addr_size(const ax25_digi *); -void ax25_digi_invert(const ax25_digi *, ax25_digi *); - -/* ax25_dev.c */ -extern spinlock_t ax25_dev_lock; - -#if IS_ENABLED(CONFIG_AX25) -static inline ax25_dev *ax25_dev_ax25dev(const struct net_device *dev) -{ - return rcu_dereference_rtnl(dev->ax25_ptr); -} -#endif - -ax25_dev *ax25_addr_ax25dev(ax25_address *); -void ax25_dev_device_up(struct net_device *); -void ax25_dev_device_down(struct net_device *); -int ax25_fwd_ioctl(unsigned int, struct ax25_fwd_struct *); -struct net_device *ax25_fwd_dev(struct net_device *); -void ax25_dev_free(void); - -/* ax25_ds_in.c */ -int ax25_ds_frame_in(ax25_cb *, struct sk_buff *, int); - -/* ax25_ds_subr.c */ -void ax25_ds_nr_error_recovery(ax25_cb *); -void ax25_ds_enquiry_response(ax25_cb *); -void ax25_ds_establish_data_link(ax25_cb *); -void ax25_dev_dama_off(ax25_dev *); -void ax25_dama_on(ax25_cb *); -void ax25_dama_off(ax25_cb *); - -/* ax25_ds_timer.c */ -void ax25_ds_setup_timer(ax25_dev *); -void ax25_ds_set_timer(ax25_dev *); -void ax25_ds_del_timer(ax25_dev *); -void ax25_ds_timer(ax25_cb *); -void ax25_ds_t1_timeout(ax25_cb *); -void ax25_ds_heartbeat_expiry(ax25_cb *); -void ax25_ds_t3timer_expiry(ax25_cb *); -void ax25_ds_idletimer_expiry(ax25_cb *); - -/* ax25_iface.c */ - -struct ax25_protocol { - struct ax25_protocol *next; - unsigned int pid; - int (*func)(struct sk_buff *, ax25_cb *); -}; - -void ax25_register_pid(struct ax25_protocol *ap); -void ax25_protocol_release(unsigned int); - -struct ax25_linkfail { - struct hlist_node lf_node; - void (*func)(ax25_cb *, int); -}; - -void ax25_linkfail_register(struct ax25_linkfail *lf); -void ax25_linkfail_release(struct ax25_linkfail *lf); -int __must_check ax25_listen_register(const ax25_address *, - struct net_device *); -void ax25_listen_release(const ax25_address *, struct net_device *); -int(*ax25_protocol_function(unsigned int))(struct sk_buff *, ax25_cb *); -int ax25_listen_mine(const ax25_address *, struct net_device *); -void ax25_link_failed(ax25_cb *, int); -int ax25_protocol_is_registered(unsigned int); - -/* ax25_in.c */ -int ax25_rx_iframe(ax25_cb *, struct sk_buff *); -int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *, - struct net_device *); - -/* ax25_ip.c */ -netdev_tx_t ax25_ip_xmit(struct sk_buff *skb); -extern const struct header_ops ax25_header_ops; - -/* ax25_out.c */ -ax25_cb *ax25_send_frame(struct sk_buff *, int, const ax25_address *, - ax25_address *, ax25_digi *, struct net_device *); -void ax25_output(ax25_cb *, int, struct sk_buff *); -void ax25_kick(ax25_cb *); -void ax25_transmit_buffer(ax25_cb *, struct sk_buff *, int); -void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev); -int ax25_check_iframes_acked(ax25_cb *, unsigned short); - -/* ax25_route.c */ -void ax25_rt_device_down(struct net_device *); -int ax25_rt_ioctl(unsigned int, void __user *); -extern const struct seq_operations ax25_rt_seqops; -ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev); -int ax25_rt_autobind(ax25_cb *, ax25_address *); -struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *, - ax25_address *, ax25_digi *); -void ax25_rt_free(void); - -/* ax25_std_in.c */ -int ax25_std_frame_in(ax25_cb *, struct sk_buff *, int); - -/* ax25_std_subr.c */ -void ax25_std_nr_error_recovery(ax25_cb *); -void ax25_std_establish_data_link(ax25_cb *); -void ax25_std_transmit_enquiry(ax25_cb *); -void ax25_std_enquiry_response(ax25_cb *); -void ax25_std_timeout_response(ax25_cb *); - -/* ax25_std_timer.c */ -void ax25_std_heartbeat_expiry(ax25_cb *); -void ax25_std_t1timer_expiry(ax25_cb *); -void ax25_std_t2timer_expiry(ax25_cb *); -void ax25_std_t3timer_expiry(ax25_cb *); -void ax25_std_idletimer_expiry(ax25_cb *); - -/* ax25_subr.c */ -void ax25_clear_queues(ax25_cb *); -void ax25_frames_acked(ax25_cb *, unsigned short); -void ax25_requeue_frames(ax25_cb *); -int ax25_validate_nr(ax25_cb *, unsigned short); -int ax25_decode(ax25_cb *, struct sk_buff *, int *, int *, int *); -void ax25_send_control(ax25_cb *, int, int, int); -void ax25_return_dm(struct net_device *, ax25_address *, ax25_address *, - ax25_digi *); -void ax25_calculate_t1(ax25_cb *); -void ax25_calculate_rtt(ax25_cb *); -void ax25_disconnect(ax25_cb *, int); - -/* ax25_timer.c */ -void ax25_setup_timers(ax25_cb *); -void ax25_start_heartbeat(ax25_cb *); -void ax25_start_t1timer(ax25_cb *); -void ax25_start_t2timer(ax25_cb *); -void ax25_start_t3timer(ax25_cb *); -void ax25_start_idletimer(ax25_cb *); -void ax25_stop_heartbeat(ax25_cb *); -void ax25_stop_t1timer(ax25_cb *); -void ax25_stop_t2timer(ax25_cb *); -void ax25_stop_t3timer(ax25_cb *); -void ax25_stop_idletimer(ax25_cb *); -int ax25_t1timer_running(ax25_cb *); -unsigned long ax25_display_timer(struct timer_list *); - -/* ax25_uid.c */ -extern int ax25_uid_policy; -ax25_uid_assoc *ax25_findbyuid(kuid_t); -int __must_check ax25_uid_ioctl(int, struct sockaddr_ax25 *); -extern const struct seq_operations ax25_uid_seqops; -void ax25_uid_free(void); - -/* sysctl_net_ax25.c */ -#ifdef CONFIG_SYSCTL -int ax25_register_dev_sysctl(ax25_dev *ax25_dev); -void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev); -#else -static inline int ax25_register_dev_sysctl(ax25_dev *ax25_dev) { return 0; } -static inline void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev) {} -#endif /* CONFIG_SYSCTL */ +#define AX25_ADDR_LEN 7 +#define AX25_P_IP 0xCC #endif diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 435250c72d56..3faea66b1979 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -29,6 +29,7 @@ #include <linux/poll.h> #include <net/sock.h> #include <linux/seq_file.h> +#include <linux/ethtool.h> #define BT_SUBSYS_VERSION 2 #define BT_SUBSYS_REVISION 22 @@ -129,21 +130,30 @@ struct bt_voice { #define BT_RCVMTU 13 #define BT_PHY 14 -#define BT_PHY_BR_1M_1SLOT 0x00000001 -#define BT_PHY_BR_1M_3SLOT 0x00000002 -#define BT_PHY_BR_1M_5SLOT 0x00000004 -#define BT_PHY_EDR_2M_1SLOT 0x00000008 -#define BT_PHY_EDR_2M_3SLOT 0x00000010 -#define BT_PHY_EDR_2M_5SLOT 0x00000020 -#define BT_PHY_EDR_3M_1SLOT 0x00000040 -#define BT_PHY_EDR_3M_3SLOT 0x00000080 -#define BT_PHY_EDR_3M_5SLOT 0x00000100 -#define BT_PHY_LE_1M_TX 0x00000200 -#define BT_PHY_LE_1M_RX 0x00000400 -#define BT_PHY_LE_2M_TX 0x00000800 -#define BT_PHY_LE_2M_RX 0x00001000 -#define BT_PHY_LE_CODED_TX 0x00002000 -#define BT_PHY_LE_CODED_RX 0x00004000 +#define BT_PHY_BR_1M_1SLOT BIT(0) +#define BT_PHY_BR_1M_3SLOT BIT(1) +#define BT_PHY_BR_1M_5SLOT BIT(2) +#define BT_PHY_EDR_2M_1SLOT BIT(3) +#define BT_PHY_EDR_2M_3SLOT BIT(4) +#define BT_PHY_EDR_2M_5SLOT BIT(5) +#define BT_PHY_EDR_3M_1SLOT BIT(6) +#define BT_PHY_EDR_3M_3SLOT BIT(7) +#define BT_PHY_EDR_3M_5SLOT BIT(8) +#define BT_PHY_LE_1M_TX BIT(9) +#define BT_PHY_LE_1M_RX BIT(10) +#define BT_PHY_LE_2M_TX BIT(11) +#define BT_PHY_LE_2M_RX BIT(12) +#define BT_PHY_LE_CODED_TX BIT(13) +#define BT_PHY_LE_CODED_RX BIT(14) + +#define BT_PHY_BREDR_MASK (BT_PHY_BR_1M_1SLOT | BT_PHY_BR_1M_3SLOT | \ + BT_PHY_BR_1M_5SLOT | BT_PHY_EDR_2M_1SLOT | \ + BT_PHY_EDR_2M_3SLOT | BT_PHY_EDR_2M_5SLOT | \ + BT_PHY_EDR_3M_1SLOT | BT_PHY_EDR_3M_3SLOT | \ + BT_PHY_EDR_3M_5SLOT) +#define BT_PHY_LE_MASK (BT_PHY_LE_1M_TX | BT_PHY_LE_1M_RX | \ + BT_PHY_LE_2M_TX | BT_PHY_LE_2M_RX | \ + BT_PHY_LE_CODED_TX | BT_PHY_LE_CODED_RX) #define BT_MODE 15 @@ -156,6 +166,7 @@ struct bt_voice { #define BT_PKT_STATUS 16 #define BT_SCM_PKT_STATUS 0x03 +#define BT_SCM_ERROR 0x04 #define BT_ISO_QOS 17 @@ -171,7 +182,7 @@ struct bt_iso_io_qos { __u32 interval; __u16 latency; __u16 sdu; - __u8 phy; + __u8 phys; __u8 rtn; }; @@ -210,9 +221,9 @@ struct bt_iso_qos { }; }; -#define BT_ISO_PHY_1M 0x01 -#define BT_ISO_PHY_2M 0x02 -#define BT_ISO_PHY_CODED 0x04 +#define BT_ISO_PHY_1M BIT(0) +#define BT_ISO_PHY_2M BIT(1) +#define BT_ISO_PHY_CODED BIT(2) #define BT_ISO_PHY_ANY (BT_ISO_PHY_1M | BT_ISO_PHY_2M | \ BT_ISO_PHY_CODED) @@ -242,6 +253,12 @@ struct bt_codecs { #define BT_ISO_BASE 20 +/* Socket option value 21 reserved */ + +#define BT_PKT_SEQNUM 22 + +#define BT_SCM_PKT_SEQNUM 0x05 + __printf(1, 2) void bt_info(const char *fmt, ...); __printf(1, 2) @@ -264,7 +281,8 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) #if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) -#define BT_DBG(fmt, ...) bt_dbg(fmt "\n", ##__VA_ARGS__) +#define BT_DBG(fmt, ...) \ + bt_dbg("%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__) #else #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif @@ -380,6 +398,7 @@ void baswap(bdaddr_t *dst, const bdaddr_t *src); struct bt_sock { struct sock sk; struct list_head accept_q; + spinlock_t accept_q_lock; /* protects accept_q */ struct sock *parent; unsigned long flags; void (*skb_msg_name)(struct sk_buff *, void *, int *); @@ -389,7 +408,8 @@ struct bt_sock { enum { BT_SK_DEFER_SETUP, BT_SK_SUSPEND, - BT_SK_PKT_STATUS + BT_SK_PKT_STATUS, + BT_SK_PKT_SEQNUM, }; struct bt_sock_list { @@ -447,6 +467,9 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb); +int hci_ethtool_ts_info(unsigned int index, int sk_proto, + struct kernel_ethtool_ts_info *ts_info); + #define HCI_REQ_START BIT(0) #define HCI_REQ_SKB BIT(1) @@ -470,6 +493,7 @@ struct bt_skb_cb { u8 pkt_type; u8 force_active; u16 expect; + u16 pkt_seqnum; u8 incoming:1; u8 pkt_status:2; union { @@ -483,6 +507,7 @@ struct bt_skb_cb { #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type #define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status +#define hci_skb_pkt_seqnum(skb) bt_cb((skb))->pkt_seqnum #define hci_skb_expect(skb) bt_cb((skb))->expect #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode #define hci_skb_event(skb) bt_cb((skb))->hci.req_event @@ -633,7 +658,7 @@ static inline void sco_exit(void) #if IS_ENABLED(CONFIG_BT_LE) int iso_init(void); int iso_exit(void); -bool iso_enabled(void); +bool iso_inited(void); #else static inline int iso_init(void) { @@ -645,7 +670,7 @@ static inline int iso_exit(void) return 0; } -static inline bool iso_enabled(void) +static inline bool iso_inited(void) { return false; } diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 3ec915738112..572b1c620c5d 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -208,6 +208,13 @@ enum { */ HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, + /* When this quirk is set consider Sync Flow Control as supported by + * the driver. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, + /* When this quirk is set, the LE states reported through the * HCI_LE_READ_SUPPORTED_STATES are invalid/broken. * @@ -354,6 +361,24 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, + + /* When this quirk is set, the HCI_OP_READ_VOICE_SETTING command is + * skipped. This is required for a subset of the CSR controller clones + * which erroneously claim to support it. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_BROKEN_READ_VOICE_SETTING, + + /* When this quirk is set, the HCI_OP_READ_PAGE_SCAN_TYPE command is + * skipped. This is required for a subset of the CSR controller clones + * which erroneously claim to support it. + * + * This quirk must be set before hci_register_dev is called. + */ + HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, + + __HCI_NUM_QUIRKS, }; /* HCI device flags */ @@ -409,6 +434,7 @@ enum { HCI_USER_CHANNEL, HCI_EXT_CONFIGURED, HCI_LE_ADV, + HCI_LE_ADV_0, HCI_LE_PER_ADV, HCI_LE_SCAN, HCI_SSP_ENABLED, @@ -432,6 +458,7 @@ enum { HCI_WIDEBAND_SPEECH_ENABLED, HCI_EVENT_FILTER_CONFIGURED, HCI_PA_SYNC, + HCI_SCO_FLOWCTL, HCI_DUT_MODE, HCI_VENDOR_DIAG, @@ -462,6 +489,7 @@ enum { #define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ #define HCI_ACL_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ #define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */ +#define HCI_ISO_TX_TIMEOUT usecs_to_jiffies(0x7fffff) /* 8388607 usecs */ /* HCI data types */ #define HCI_COMMAND_PKT 0x01 @@ -470,6 +498,7 @@ enum { #define HCI_EVENT_PKT 0x04 #define HCI_ISODATA_PKT 0x05 #define HCI_DIAG_PKT 0xf0 +#define HCI_DRV_PKT 0xf1 #define HCI_VENDOR_PKT 0xff /* HCI packet types */ @@ -533,7 +562,9 @@ enum { #define ESCO_LINK 0x02 /* Low Energy links do not have defined link type. Use invented one */ #define LE_LINK 0x80 -#define ISO_LINK 0x82 +#define CIS_LINK 0x82 +#define BIS_LINK 0x83 +#define PA_LINK 0x84 #define INVALID_LINK 0xff /* LMP features */ @@ -616,10 +647,15 @@ enum { #define HCI_LE_EXT_ADV 0x10 #define HCI_LE_PERIODIC_ADV 0x20 #define HCI_LE_CHAN_SEL_ALG2 0x40 +#define HCI_LE_PAST_SENDER 0x01 +#define HCI_LE_PAST_RECEIVER 0x02 #define HCI_LE_CIS_CENTRAL 0x10 #define HCI_LE_CIS_PERIPHERAL 0x20 #define HCI_LE_ISO_BROADCASTER 0x40 #define HCI_LE_ISO_SYNC_RECEIVER 0x80 +#define HCI_LE_LL_EXT_FEATURE 0x80 +#define HCI_LE_CS 0x40 +#define HCI_LE_CS_HOST 0x80 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 @@ -855,6 +891,11 @@ struct hci_cp_remote_name_req_cancel { bdaddr_t bdaddr; } __packed; +struct hci_rp_remote_name_req_cancel { + __u8 status; + bdaddr_t bdaddr; +} __packed; + #define HCI_OP_READ_REMOTE_FEATURES 0x041b struct hci_cp_read_remote_features { __le16 handle; @@ -1427,8 +1468,12 @@ struct hci_rp_read_data_block_size { } __packed; #define HCI_OP_READ_LOCAL_CODECS 0x100b -struct hci_std_codecs { +struct hci_std_codecs_hdr { __u8 num; +} __packed; + +struct hci_std_codecs { + struct hci_std_codecs_hdr; __u8 codec[]; } __packed; @@ -1446,7 +1491,7 @@ struct hci_vnd_codecs { struct hci_rp_read_local_supported_codecs { __u8 status; - struct hci_std_codecs std_codecs; + struct hci_std_codecs_hdr std_codecs; struct hci_vnd_codecs vnd_codecs; } __packed; @@ -1463,8 +1508,12 @@ struct hci_std_codec_v2 { __u8 transport; } __packed; -struct hci_std_codecs_v2 { +struct hci_std_codecs_v2_hdr { __u8 num; +} __packed; + +struct hci_std_codecs_v2 { + struct hci_std_codecs_v2_hdr; struct hci_std_codec_v2 codec[]; } __packed; @@ -1481,7 +1530,7 @@ struct hci_vnd_codecs_v2 { struct hci_rp_read_local_supported_codecs_v2 { __u8 status; - struct hci_std_codecs_v2 std_codecs; + struct hci_std_codecs_v2_hdr std_codecs; struct hci_vnd_codecs_v2 vendor_codecs; } __packed; @@ -1528,6 +1577,11 @@ struct hci_rp_read_tx_power { __s8 tx_power; } __packed; +#define HCI_OP_WRITE_SYNC_FLOWCTL 0x0c2f +struct hci_cp_write_sync_flowctl { + __u8 enable; +} __packed; + #define HCI_OP_READ_PAGE_SCAN_TYPE 0x0c46 struct hci_rp_read_page_scan_type { __u8 status; @@ -1839,6 +1893,15 @@ struct hci_cp_le_set_default_phy { #define HCI_LE_SET_PHY_2M 0x02 #define HCI_LE_SET_PHY_CODED 0x04 +#define HCI_OP_LE_SET_PHY 0x2032 +struct hci_cp_le_set_phy { + __le16 handle; + __u8 all_phys; + __u8 tx_phys; + __u8 rx_phys; + __le16 phy_opts; +} __packed; + #define HCI_OP_LE_SET_EXT_SCAN_PARAMS 0x2041 struct hci_cp_le_set_ext_scan_params { __u8 own_addr_type; @@ -1897,6 +1960,8 @@ struct hci_cp_le_pa_create_sync { __u8 sync_cte_type; } __packed; +#define HCI_OP_LE_PA_CREATE_SYNC_CANCEL 0x2045 + #define HCI_OP_LE_PA_TERM_SYNC 0x2046 struct hci_cp_le_pa_term_sync { __le16 handle; @@ -2025,6 +2090,44 @@ struct hci_cp_le_set_privacy_mode { __u8 mode; } __packed; +#define HCI_OP_LE_PAST 0x205a +struct hci_cp_le_past { + __le16 handle; + __le16 service_data; + __le16 sync_handle; +} __packed; + +struct hci_rp_le_past { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_PAST_SET_INFO 0x205b +struct hci_cp_le_past_set_info { + __le16 handle; + __le16 service_data; + __u8 adv_handle; +} __packed; + +struct hci_rp_le_past_set_info { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_PAST_PARAMS 0x205c +struct hci_cp_le_past_params { + __le16 handle; + __u8 mode; + __le16 skip; + __le16 sync_timeout; + __u8 cte_type; +} __packed; + +struct hci_rp_le_past_params { + __u8 status; + __le16 handle; +} __packed; + #define HCI_OP_LE_READ_BUFFER_SIZE_V2 0x2060 struct hci_rp_le_read_buffer_size_v2 { __u8 status; @@ -2052,8 +2155,8 @@ struct hci_cis_params { __u8 cis_id; __le16 c_sdu; __le16 p_sdu; - __u8 c_phy; - __u8 p_phy; + __u8 c_phys; + __u8 p_phys; __u8 c_rtn; __u8 p_rtn; } __packed; @@ -2172,6 +2275,217 @@ struct hci_cp_le_set_host_feature { __u8 bit_value; } __packed; +#define HCI_OP_LE_READ_ALL_LOCAL_FEATURES 0x2087 +struct hci_rp_le_read_all_local_features { + __u8 status; + __u8 page; + __u8 features[248]; +} __packed; + +#define HCI_OP_LE_READ_ALL_REMOTE_FEATURES 0x2088 +struct hci_cp_le_read_all_remote_features { + __le16 handle; + __u8 pages; +} __packed; + +/* Channel Sounding Commands */ +#define HCI_OP_LE_CS_RD_LOCAL_SUPP_CAP 0x2089 +struct hci_rp_le_cs_rd_local_supp_cap { + __u8 status; + __u8 num_config_supported; + __le16 max_consecutive_procedures_supported; + __u8 num_antennas_supported; + __u8 max_antenna_paths_supported; + __u8 roles_supported; + __u8 modes_supported; + __u8 rtt_capability; + __u8 rtt_aa_only_n; + __u8 rtt_sounding_n; + __u8 rtt_random_payload_n; + __le16 nadm_sounding_capability; + __le16 nadm_random_capability; + __u8 cs_sync_phys_supported; + __le16 subfeatures_supported; + __le16 t_ip1_times_supported; + __le16 t_ip2_times_supported; + __le16 t_fcs_times_supported; + __le16 t_pm_times_supported; + __u8 t_sw_time_supported; + __u8 tx_snr_capability; +} __packed; + +#define HCI_OP_LE_CS_RD_RMT_SUPP_CAP 0x208A +struct hci_cp_le_cs_rd_local_supp_cap { + __le16 handle; +} __packed; + +#define HCI_OP_LE_CS_WR_CACHED_RMT_SUPP_CAP 0x208B +struct hci_cp_le_cs_wr_cached_rmt_supp_cap { + __le16 handle; + __u8 num_config_supported; + __le16 max_consecutive_procedures_supported; + __u8 num_antennas_supported; + __u8 max_antenna_paths_supported; + __u8 roles_supported; + __u8 modes_supported; + __u8 rtt_capability; + __u8 rtt_aa_only_n; + __u8 rtt_sounding_n; + __u8 rtt_random_payload_n; + __le16 nadm_sounding_capability; + __le16 nadm_random_capability; + __u8 cs_sync_phys_supported; + __le16 subfeatures_supported; + __le16 t_ip1_times_supported; + __le16 t_ip2_times_supported; + __le16 t_fcs_times_supported; + __le16 t_pm_times_supported; + __u8 t_sw_time_supported; + __u8 tx_snr_capability; +} __packed; + +struct hci_rp_le_cs_wr_cached_rmt_supp_cap { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_CS_SEC_ENABLE 0x208C +struct hci_cp_le_cs_sec_enable { + __le16 handle; +} __packed; + +#define HCI_OP_LE_CS_SET_DEFAULT_SETTINGS 0x208D +struct hci_cp_le_cs_set_default_settings { + __le16 handle; + __u8 role_enable; + __u8 cs_sync_ant_sel; + __s8 max_tx_power; +} __packed; + +struct hci_rp_le_cs_set_default_settings { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_CS_RD_RMT_FAE_TABLE 0x208E +struct hci_cp_le_cs_rd_rmt_fae_table { + __le16 handle; +} __packed; + +#define HCI_OP_LE_CS_WR_CACHED_RMT_FAE_TABLE 0x208F +struct hci_cp_le_cs_wr_rmt_cached_fae_table { + __le16 handle; + __u8 remote_fae_table[72]; +} __packed; + +struct hci_rp_le_cs_wr_rmt_cached_fae_table { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_CS_CREATE_CONFIG 0x2090 +struct hci_cp_le_cs_create_config { + __le16 handle; + __u8 config_id; + __u8 create_context; + __u8 main_mode_type; + __u8 sub_mode_type; + __u8 min_main_mode_steps; + __u8 max_main_mode_steps; + __u8 main_mode_repetition; + __u8 mode_0_steps; + __u8 role; + __u8 rtt_type; + __u8 cs_sync_phy; + __u8 channel_map[10]; + __u8 channel_map_repetition; + __u8 channel_selection_type; + __u8 ch3c_shape; + __u8 ch3c_jump; + __u8 reserved; +} __packed; + +#define HCI_OP_LE_CS_REMOVE_CONFIG 0x2091 +struct hci_cp_le_cs_remove_config { + __le16 handle; + __u8 config_id; +} __packed; + +#define HCI_OP_LE_CS_SET_CH_CLASSIFICATION 0x2092 +struct hci_cp_le_cs_set_ch_classification { + __u8 ch_classification[10]; +} __packed; + +struct hci_rp_le_cs_set_ch_classification { + __u8 status; +} __packed; + +#define HCI_OP_LE_CS_SET_PROC_PARAM 0x2093 +struct hci_cp_le_cs_set_proc_param { + __le16 handle; + __u8 config_id; + __le16 max_procedure_len; + __le16 min_procedure_interval; + __le16 max_procedure_interval; + __le16 max_procedure_count; + __u8 min_subevent_len[3]; + __u8 max_subevent_len[3]; + __u8 tone_antenna_config_selection; + __u8 phy; + __u8 tx_power_delta; + __u8 preferred_peer_antenna; + __u8 snr_control_initiator; + __u8 snr_control_reflector; +} __packed; + +struct hci_rp_le_cs_set_proc_param { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_CS_SET_PROC_ENABLE 0x2094 +struct hci_cp_le_cs_set_proc_enable { + __le16 handle; + __u8 config_id; + __u8 enable; +} __packed; + +#define HCI_OP_LE_CS_TEST 0x2095 +struct hci_cp_le_cs_test { + __u8 main_mode_type; + __u8 sub_mode_type; + __u8 main_mode_repetition; + __u8 mode_0_steps; + __u8 role; + __u8 rtt_type; + __u8 cs_sync_phy; + __u8 cs_sync_antenna_selection; + __u8 subevent_len[3]; + __le16 subevent_interval; + __u8 max_num_subevents; + __u8 transmit_power_level; + __u8 t_ip1_time; + __u8 t_ip2_time; + __u8 t_fcs_time; + __u8 t_pm_time; + __u8 t_sw_time; + __u8 tone_antenna_config_selection; + __u8 reserved; + __u8 snr_control_initiator; + __u8 snr_control_reflector; + __le16 drbg_nonce; + __u8 channel_map_repetition; + __le16 override_config; + __u8 override_parameters_length; + __u8 override_parameters_data[]; +} __packed; + +struct hci_rp_le_cs_test { + __u8 status; +} __packed; + +#define HCI_OP_LE_CS_TEST_END 0x2096 + /* ---- HCI Events ---- */ struct hci_ev_status { __u8 status; @@ -2594,6 +2908,7 @@ struct hci_ev_le_conn_complete { #define LE_EXT_ADV_DIRECT_IND 0x0004 #define LE_EXT_ADV_SCAN_RSP 0x0008 #define LE_EXT_ADV_LEGACY_PDU 0x0010 +#define LE_EXT_ADV_DATA_STATUS_MASK 0x0060 #define LE_EXT_ADV_EVT_TYPE_MASK 0x007f #define ADDR_LE_DEV_PUBLIC 0x00 @@ -2739,6 +3054,11 @@ struct hci_ev_le_per_adv_report { __u8 data[]; } __packed; +#define HCI_EV_LE_PA_SYNC_LOST 0x10 +struct hci_ev_le_pa_sync_lost { + __le16 handle; +} __packed; + #define LE_PA_DATA_COMPLETE 0x00 #define LE_PA_DATA_MORE_TO_COME 0x01 #define LE_PA_DATA_TRUNCATED 0x02 @@ -2751,6 +3071,20 @@ struct hci_evt_le_ext_adv_set_term { __u8 num_evts; } __packed; +#define HCI_EV_LE_PAST_RECEIVED 0x18 +struct hci_ev_le_past_received { + __u8 status; + __le16 handle; + __le16 service_data; + __le16 sync_handle; + __u8 sid; + __u8 bdaddr_type; + bdaddr_t bdaddr; + __u8 phy; + __le16 interval; + __u8 clock_accuracy; +} __packed; + #define HCI_EVT_LE_CIS_ESTABLISHED 0x19 struct hci_evt_le_cis_established { __u8 status; @@ -2796,8 +3130,8 @@ struct hci_evt_le_create_big_complete { __le16 bis_handle[]; } __packed; -#define HCI_EVT_LE_BIG_SYNC_ESTABILISHED 0x1d -struct hci_evt_le_big_sync_estabilished { +#define HCI_EVT_LE_BIG_SYNC_ESTABLISHED 0x1d +struct hci_evt_le_big_sync_established { __u8 status; __u8 handle; __u8 latency[3]; @@ -2811,6 +3145,12 @@ struct hci_evt_le_big_sync_estabilished { __le16 bis[]; } __packed; +#define HCI_EVT_LE_BIG_SYNC_LOST 0x1e +struct hci_evt_le_big_sync_lost { + __u8 handle; + __u8 reason; +} __packed; + #define HCI_EVT_LE_BIG_INFO_ADV_REPORT 0x22 struct hci_evt_le_big_info_adv_report { __le16 sync_handle; @@ -2828,6 +3168,138 @@ struct hci_evt_le_big_info_adv_report { __u8 encryption; } __packed; +#define HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE 0x2b +struct hci_evt_le_read_all_remote_features_complete { + __u8 status; + __le16 handle; + __u8 max_pages; + __u8 valid_pages; + __u8 features[248]; +} __packed; + +/* Channel Sounding Events */ +#define HCI_EVT_LE_CS_READ_RMT_SUPP_CAP_COMPLETE 0x2C +struct hci_evt_le_cs_read_rmt_supp_cap_complete { + __u8 status; + __le16 handle; + __u8 num_configs_supp; + __le16 max_consec_proc_supp; + __u8 num_ant_supp; + __u8 max_ant_path_supp; + __u8 roles_supp; + __u8 modes_supp; + __u8 rtt_cap; + __u8 rtt_aa_only_n; + __u8 rtt_sounding_n; + __u8 rtt_rand_payload_n; + __le16 nadm_sounding_cap; + __le16 nadm_rand_cap; + __u8 cs_sync_phys_supp; + __le16 sub_feat_supp; + __le16 t_ip1_times_supp; + __le16 t_ip2_times_supp; + __le16 t_fcs_times_supp; + __le16 t_pm_times_supp; + __u8 t_sw_times_supp; + __u8 tx_snr_cap; +} __packed; + +#define HCI_EVT_LE_CS_READ_RMT_FAE_TABLE_COMPLETE 0x2D +struct hci_evt_le_cs_read_rmt_fae_table_complete { + __u8 status; + __le16 handle; + __u8 remote_fae_table[72]; +} __packed; + +#define HCI_EVT_LE_CS_SECURITY_ENABLE_COMPLETE 0x2E +struct hci_evt_le_cs_security_enable_complete { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_EVT_LE_CS_CONFIG_COMPLETE 0x2F +struct hci_evt_le_cs_config_complete { + __u8 status; + __le16 handle; + __u8 config_id; + __u8 action; + __u8 main_mode_type; + __u8 sub_mode_type; + __u8 min_main_mode_steps; + __u8 max_main_mode_steps; + __u8 main_mode_rep; + __u8 mode_0_steps; + __u8 role; + __u8 rtt_type; + __u8 cs_sync_phy; + __u8 channel_map[10]; + __u8 channel_map_rep; + __u8 channel_sel_type; + __u8 ch3c_shape; + __u8 ch3c_jump; + __u8 reserved; + __u8 t_ip1_time; + __u8 t_ip2_time; + __u8 t_fcs_time; + __u8 t_pm_time; +} __packed; + +#define HCI_EVT_LE_CS_PROCEDURE_ENABLE_COMPLETE 0x30 +struct hci_evt_le_cs_procedure_enable_complete { + __u8 status; + __le16 handle; + __u8 config_id; + __u8 state; + __u8 tone_ant_config_sel; + __s8 sel_tx_pwr; + __u8 sub_evt_len[3]; + __u8 sub_evts_per_evt; + __le16 sub_evt_intrvl; + __le16 evt_intrvl; + __le16 proc_intrvl; + __le16 proc_counter; + __le16 max_proc_len; +} __packed; + +#define HCI_EVT_LE_CS_SUBEVENT_RESULT 0x31 +struct hci_evt_le_cs_subevent_result { + __le16 handle; + __u8 config_id; + __le16 start_acl_conn_evt_counter; + __le16 proc_counter; + __le16 freq_comp; + __u8 ref_pwr_lvl; + __u8 proc_done_status; + __u8 subevt_done_status; + __u8 abort_reason; + __u8 num_ant_paths; + __u8 num_steps_reported; + __u8 step_mode[0]; /* depends on num_steps_reported */ + __u8 step_channel[0]; /* depends on num_steps_reported */ + __u8 step_data_length[0]; /* depends on num_steps_reported */ + __u8 step_data[0]; /* depends on num_steps_reported */ +} __packed; + +#define HCI_EVT_LE_CS_SUBEVENT_RESULT_CONTINUE 0x32 +struct hci_evt_le_cs_subevent_result_continue { + __le16 handle; + __u8 config_id; + __u8 proc_done_status; + __u8 subevt_done_status; + __u8 abort_reason; + __u8 num_ant_paths; + __u8 num_steps_reported; + __u8 step_mode[0]; /* depends on num_steps_reported */ + __u8 step_channel[0]; /* depends on num_steps_reported */ + __u8 step_data_length[0]; /* depends on num_steps_reported */ + __u8 step_data[0]; /* depends on num_steps_reported */ +} __packed; + +#define HCI_EVT_LE_CS_TEST_END_COMPLETE 0x33 +struct hci_evt_le_cs_test_end_complete { + __u8 status; +} __packed; + #define HCI_EV_VENDOR 0xff /* Internal events generated by Bluetooth stack */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6281063cbd8e..aa600fbf9a53 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -29,8 +29,11 @@ #include <linux/idr.h> #include <linux/leds.h> #include <linux/rculist.h> +#include <linux/spinlock.h> +#include <linux/srcu.h> #include <net/bluetooth/hci.h> +#include <net/bluetooth/hci_drv.h> #include <net/bluetooth/hci_sync.h> #include <net/bluetooth/hci_sock.h> #include <net/bluetooth/coredump.h> @@ -92,6 +95,7 @@ struct discovery_state { u16 uuid_count; u8 (*uuids)[16]; unsigned long name_resolve_timeout; + spinlock_t lock; }; #define SUSPEND_NOTIFIER_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ @@ -125,7 +129,9 @@ struct hci_conn_hash { struct list_head list; unsigned int acl_num; unsigned int sco_num; - unsigned int iso_num; + unsigned int cis_num; + unsigned int bis_num; + unsigned int pa_num; unsigned int le_num; unsigned int le_num_peripheral; }; @@ -160,6 +166,7 @@ enum hci_conn_flags { HCI_CONN_FLAG_REMOTE_WAKEUP = BIT(0), HCI_CONN_FLAG_DEVICE_PRIVACY = BIT(1), HCI_CONN_FLAG_ADDRESS_RESOLUTION = BIT(2), + HCI_CONN_FLAG_PAST = BIT(3), }; typedef u8 hci_conn_flags_t; @@ -238,9 +245,11 @@ struct adv_info { bool enabled; bool pending; bool periodic; + bool periodic_enabled; __u8 mesh; __u8 instance; __u8 handle; + __u8 sid; __u32 flags; __u16 timeout; __u16 remaining_time; @@ -261,6 +270,12 @@ struct adv_info { struct delayed_work rpa_expired_cb; }; +struct tx_queue { + struct sk_buff_head queue; + unsigned int extra; + unsigned int tracked; +}; + #define HCI_MAX_ADV_INSTANCES 5 #define HCI_DEFAULT_ADV_DURATION 2 @@ -339,6 +354,7 @@ struct adv_monitor { struct hci_dev { struct list_head list; + struct srcu_struct srcu; struct mutex lock; struct ida unset_handle_ida; @@ -362,7 +378,7 @@ struct hci_dev { __u8 minor_class; __u8 max_page; __u8 features[HCI_MAX_PAGES][8]; - __u8 le_features[8]; + __u8 le_features[248]; __u8 le_accept_list_size; __u8 le_resolv_list_size; __u8 le_num_of_adv_sets; @@ -454,7 +470,7 @@ struct hci_dev { unsigned int auto_accept_delay; - unsigned long quirks; + DECLARE_BITMAP(quirk_flags, __HCI_NUM_QUIRKS); atomic_t cmd_cnt; unsigned int acl_cnt; @@ -473,6 +489,7 @@ struct hci_dev { unsigned long acl_last_tx; unsigned long le_last_tx; + unsigned long iso_last_tx; __u8 le_tx_def_phys; __u8 le_rx_def_phys; @@ -539,6 +556,7 @@ struct hci_dev { struct hci_conn_hash conn_hash; struct list_head mesh_pending; + struct mutex mgmt_pending_lock; struct list_head mgmt_pending; struct list_head reject_list; struct list_head accept_list; @@ -607,6 +625,8 @@ struct hci_dev { struct list_head monitored_devices; bool advmon_pend_notify; + struct hci_drv *hci_drv; + #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; #endif @@ -643,6 +663,10 @@ struct hci_dev { u8 (*classify_pkt_type)(struct hci_dev *hdev, struct sk_buff *skb); }; +#define hci_set_quirk(hdev, nr) set_bit((nr), (hdev)->quirk_flags) +#define hci_clear_quirk(hdev, nr) clear_bit((nr), (hdev)->quirk_flags) +#define hci_test_quirk(hdev, nr) test_bit((nr), (hdev)->quirk_flags) + #define HCI_PHY_HANDLE(handle) (handle & 0xff) enum conn_reasons { @@ -678,6 +702,7 @@ struct hci_conn { __u8 attempt; __u8 dev_class[3]; __u8 features[HCI_MAX_PAGES][8]; + __u8 le_features[248]; __u16 pkt_type; __u16 link_policy; __u8 key_type; @@ -705,6 +730,8 @@ struct hci_conn { __u16 le_per_adv_data_offset; __u8 le_adv_phy; __u8 le_adv_sec_phy; + __u8 le_tx_def_phys; + __u8 le_rx_def_phys; __u8 le_tx_phy; __u8 le_rx_phy; __s8 rssi; @@ -726,13 +753,14 @@ struct hci_conn { __u8 remote_cap; __u8 remote_auth; - __u8 remote_id; unsigned int sent; struct sk_buff_head data_q; struct list_head chan_list; + struct tx_queue tx_q; + struct delayed_work disc_work; struct delayed_work auto_accept_work; struct delayed_work idle_work; @@ -814,29 +842,30 @@ extern struct mutex hci_cb_list_lock; #define hci_dev_test_and_clear_flag(hdev, nr) test_and_clear_bit((nr), (hdev)->dev_flags) #define hci_dev_test_and_change_flag(hdev, nr) test_and_change_bit((nr), (hdev)->dev_flags) -#define hci_dev_clear_volatile_flags(hdev) \ - do { \ - hci_dev_clear_flag(hdev, HCI_LE_SCAN); \ - hci_dev_clear_flag(hdev, HCI_LE_ADV); \ - hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\ - hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); \ - hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); \ +#define hci_dev_clear_volatile_flags(hdev) \ + do { \ + hci_dev_clear_flag((hdev), HCI_LE_SCAN); \ + hci_dev_clear_flag((hdev), HCI_LE_ADV); \ + hci_dev_clear_flag((hdev), HCI_LL_RPA_RESOLUTION); \ + hci_dev_clear_flag((hdev), HCI_PERIODIC_INQ); \ + hci_dev_clear_flag((hdev), HCI_QUALITY_REPORT); \ } while (0) #define hci_dev_le_state_simultaneous(hdev) \ - (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) && \ - (hdev->le_states[4] & 0x08) && /* Central */ \ - (hdev->le_states[4] & 0x40) && /* Peripheral */ \ - (hdev->le_states[3] & 0x10)) /* Simultaneous */ + (!hci_test_quirk((hdev), HCI_QUIRK_BROKEN_LE_STATES) && \ + ((hdev)->le_states[4] & 0x08) && /* Central */ \ + ((hdev)->le_states[4] & 0x40) && /* Peripheral */ \ + ((hdev)->le_states[3] & 0x10)) /* Simultaneous */ /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); int l2cap_disconn_ind(struct hci_conn *hcon); -void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #if IS_ENABLED(CONFIG_BT_BREDR) int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb); +int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb); #else static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) @@ -844,23 +873,30 @@ static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, return 0; } -static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +static inline int sco_recv_scodata(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb) { + kfree_skb(skb); + return -ENOENT; } #endif #if IS_ENABLED(CONFIG_BT_LE) int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #else static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { return 0; } -static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, - u16 flags) + +static inline int iso_recv(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb, u16 flags) { + kfree_skb(skb); + return -ENOENT; } #endif @@ -870,6 +906,7 @@ static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, static inline void discovery_init(struct hci_dev *hdev) { + spin_lock_init(&hdev->discovery.lock); hdev->discovery.state = DISCOVERY_STOPPED; INIT_LIST_HEAD(&hdev->discovery.all); INIT_LIST_HEAD(&hdev->discovery.unknown); @@ -884,8 +921,11 @@ static inline void hci_discovery_filter_clear(struct hci_dev *hdev) hdev->discovery.report_invalid_rssi = true; hdev->discovery.rssi = HCI_RSSI_INVALID; hdev->discovery.uuid_count = 0; + + spin_lock(&hdev->discovery.lock); kfree(hdev->discovery.uuids); hdev->discovery.uuids = NULL; + spin_unlock(&hdev->discovery.lock); } bool hci_discovery_active(struct hci_dev *hdev); @@ -988,8 +1028,14 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num++; break; - case ISO_LINK: - h->iso_num++; + case CIS_LINK: + h->cis_num++; + break; + case BIS_LINK: + h->bis_num++; + break; + case PA_LINK: + h->pa_num++; break; } } @@ -1014,8 +1060,14 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) case ESCO_LINK: h->sco_num--; break; - case ISO_LINK: - h->iso_num--; + case CIS_LINK: + h->cis_num--; + break; + case BIS_LINK: + h->bis_num--; + break; + case PA_LINK: + h->pa_num--; break; } } @@ -1031,8 +1083,12 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) case SCO_LINK: case ESCO_LINK: return h->sco_num; - case ISO_LINK: - return h->iso_num; + case CIS_LINK: + return h->cis_num; + case BIS_LINK: + return h->bis_num; + case PA_LINK: + return h->pa_num; default: return 0; } @@ -1042,7 +1098,15 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev) { struct hci_conn_hash *c = &hdev->conn_hash; - return c->acl_num + c->sco_num + c->le_num + c->iso_num; + return c->acl_num + c->sco_num + c->le_num + c->cis_num + c->bis_num + + c->pa_num; +} + +static inline unsigned int hci_iso_count(struct hci_dev *hdev) +{ + struct hci_conn_hash *c = &hdev->conn_hash; + + return c->cis_num + c->bis_num; } static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn) @@ -1092,7 +1156,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK) continue; if (c->iso_qos.bcast.bis == bis) { @@ -1105,10 +1169,8 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev, return NULL; } -static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev, - __u8 sid, - bdaddr_t *dst, - __u8 dst_type) +static inline struct hci_conn * +hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1116,8 +1178,10 @@ static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || bacmp(&c->dst, dst) || - c->dst_type != dst_type || c->sid != sid) + if (c->type != PA_LINK) + continue; + + if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags)) continue; rcu_read_unlock(); @@ -1140,8 +1204,8 @@ hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, ba) || c->type != ISO_LINK || - !test_bit(HCI_CONN_PER_ADV, &c->flags)) + if (bacmp(&c->dst, ba) || c->type != BIS_LINK || + !test_bit(HCI_CONN_PER_ADV, &c->flags)) continue; if (c->iso_qos.bcast.big == big && @@ -1194,6 +1258,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_conn_hash_lookup_role(struct hci_dev *hdev, + __u8 type, __u8 role, + bdaddr_t *ba) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == type && c->role == role && !bacmp(&c->dst, ba)) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev, bdaddr_t *ba, __u8 ba_type) @@ -1230,7 +1315,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; /* Match CIG ID if set */ @@ -1262,7 +1347,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY)) + if (c->type != CIS_LINK) continue; if (handle == c->iso_qos.ucast.cig) { @@ -1285,17 +1370,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) - continue; - - /* An ISO_LINK hcon with BDADDR_ANY as destination - * address is a Broadcast connection. A Broadcast - * slave connection is associated with a PA train, - * so the sync_handle can be used to differentiate - * from unicast. - */ - if (bacmp(&c->dst, BDADDR_ANY) && - c->sync_handle == HCI_SYNC_HANDLE_INVALID) + if (c->type != BIS_LINK) continue; if (handle == c->iso_qos.bcast.big) { @@ -1319,7 +1394,7 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != PA_LINK) continue; if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { @@ -1334,7 +1409,8 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, } static inline struct hci_conn * -hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) +hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state, + __u8 role) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1342,8 +1418,7 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK || - c->state != state) + if (c->type != BIS_LINK || c->state != state || c->role != role) continue; if (handle == c->iso_qos.bcast.big) { @@ -1366,8 +1441,8 @@ hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK || - !test_bit(HCI_CONN_PA_SYNC, &c->flags)) + if (c->type != BIS_LINK || + !test_bit(HCI_CONN_PA_SYNC, &c->flags)) continue; if (c->iso_qos.bcast.big == big) { @@ -1389,7 +1464,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != ISO_LINK) + if (c->type != PA_LINK) continue; /* Ignore the listen hcon, we are looking @@ -1409,26 +1484,6 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) return NULL; } -static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, - __u8 type, __u16 state) -{ - struct hci_conn_hash *h = &hdev->conn_hash; - struct hci_conn *c; - - rcu_read_lock(); - - list_for_each_entry_rcu(c, &h->list, list) { - if (c->type == type && c->state == state) { - rcu_read_unlock(); - return c; - } - } - - rcu_read_unlock(); - - return NULL; -} - typedef void (*hci_conn_func_t)(struct hci_conn *conn, void *data); static inline void hci_conn_hash_list_state(struct hci_dev *hdev, hci_conn_func_t func, __u8 type, @@ -1516,14 +1571,12 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); bool hci_iso_setup_path(struct hci_conn *conn); int hci_le_create_cis_pending(struct hci_dev *hdev); -int hci_pa_create_sync_pending(struct hci_dev *hdev); -int hci_le_big_create_sync_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, - u8 role, u16 handle); + u8 dst_type, u8 role, u16 handle); struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, - bdaddr_t *dst, u8 role); + bdaddr_t *dst, u8 dst_type, u8 role); void hci_conn_del(struct hci_conn *conn); void hci_conn_hash_flush(struct hci_dev *hdev); @@ -1547,20 +1600,24 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting, struct bt_codec *codec, u16 timeout); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); -struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout); +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base); + __u8 base_len, __u8 *base, u16 timeout); +int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos); -struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos, - __u8 data_len, __u8 *data); + u16 timeout); +struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, + __u8 dst_type, __u8 sid, + struct bt_iso_qos *qos, + __u8 data_len, __u8 *data, u16 timeout); struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); -int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, - struct bt_iso_qos *qos, - __u16 sync_handle, __u8 num_bis, __u8 bis[]); +int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, + struct bt_iso_qos *qos, __u16 sync_handle, + __u8 num_bis, __u8 bis[]); int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level); int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, @@ -1572,6 +1629,18 @@ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active); void hci_conn_failed(struct hci_conn *conn, u8 status); u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle); +void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb); +void hci_conn_tx_dequeue(struct hci_conn *conn); +void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset, + const struct sockcm_cookie *sockc); + +static inline void hci_sockcm_init(struct sockcm_cookie *sockc, struct sock *sk) +{ + *sockc = (struct sockcm_cookie) { + .tsflags = READ_ONCE(sk->sk_tsflags), + }; +} + /* * hci_conn_get() and hci_conn_put() are used to control the life-time of an * "hci_conn" object. They do not guarantee that the hci_conn object is running, @@ -1780,6 +1849,7 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, void hci_uuids_clear(struct hci_dev *hdev); void hci_link_keys_clear(struct hci_dev *hdev); +u8 *hci_conn_key_enc_size(struct hci_conn *conn); struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr); struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, @@ -1816,6 +1886,7 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, void hci_adv_instances_clear(struct hci_dev *hdev); struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance); +struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid); struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance); struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, @@ -1823,7 +1894,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle); -struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, +struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval); int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, @@ -1858,6 +1929,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define lmp_hold_capable(dev) ((dev)->features[0][0] & LMP_HOLD) #define lmp_sniff_capable(dev) ((dev)->features[0][0] & LMP_SNIFF) #define lmp_park_capable(dev) ((dev)->features[0][1] & LMP_PARK) +#define lmp_sco_capable(dev) ((dev)->features[0][1] & LMP_SCO) #define lmp_inq_rssi_capable(dev) ((dev)->features[0][3] & LMP_RSSI_INQ) #define lmp_esco_capable(dev) ((dev)->features[0][3] & LMP_ESCO) #define lmp_bredr_capable(dev) (!((dev)->features[0][4] & LMP_NO_BREDR)) @@ -1900,6 +1972,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); !hci_dev_test_flag(dev, HCI_RPA_EXPIRED)) #define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \ !adv->rpa_expired) +#define le_enabled(dev) (lmp_le_capable(dev) && \ + hci_dev_test_flag(dev, HCI_LE_ENABLED)) #define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M)) @@ -1910,36 +1984,41 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) #define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \ - !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \ - &(dev)->quirks)) + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_LE_CODED)) #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) +#define ll_privacy_enabled(dev) (le_enabled(dev) && ll_privacy_capable(dev)) #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ - (hdev->commands[39] & 0x04)) + ((dev)->commands[39] & 0x04)) #define read_key_size_capable(dev) \ ((dev)->commands[20] & 0x10 && \ - !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE)) + +#define read_voice_setting_capable(dev) \ + ((dev)->commands[9] & 0x04 && \ + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_VOICE_SETTING)) /* Use enhanced synchronous connection if command is supported and its quirk * has not been set. */ #define enhanced_sync_conn_capable(dev) \ (((dev)->commands[29] & 0x08) && \ - !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN)) /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_SCAN)) /* Use ext create connection if command is supported */ #define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) @@ -1954,25 +2033,54 @@ void hci_conn_del_sysfs(struct hci_conn *conn); */ #define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \ ext_adv_capable(dev)) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \ - &(dev)->quirks)) + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Periodic advertising support */ #define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV)) /* CIS Master/Slave and BIS support */ #define iso_capable(dev) (cis_capable(dev) || bis_capable(dev)) +#define iso_enabled(dev) (le_enabled(dev) && iso_capable(dev)) #define cis_capable(dev) \ (cis_central_capable(dev) || cis_peripheral_capable(dev)) +#define cis_enabled(dev) (le_enabled(dev) && cis_capable(dev)) #define cis_central_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_CENTRAL) +#define cis_central_enabled(dev) \ + (le_enabled(dev) && cis_central_capable(dev)) #define cis_peripheral_capable(dev) \ ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) +#define cis_peripheral_enabled(dev) \ + (le_enabled(dev) && cis_peripheral_capable(dev)) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) -#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define bis_enabled(dev) (le_enabled(dev) && bis_capable(dev)) +#define sync_recv_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) +#define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev)) +#define past_sender_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_PAST_SENDER) +#define past_receiver_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_PAST_RECEIVER) +#define past_capable(dev) \ + (past_sender_capable(dev) || past_receiver_capable(dev)) +#define past_sender_enabled(dev) \ + (le_enabled(dev) && past_sender_capable(dev)) +#define past_receiver_enabled(dev) \ + (le_enabled(dev) && past_receiver_capable(dev)) +#define past_enabled(dev) \ + (past_sender_enabled(dev) || past_receiver_enabled(dev)) +#define ll_ext_feature_capable(dev) \ + ((dev)->le_features[7] & HCI_LE_LL_EXT_FEATURE) + +/* Channel sounding support */ +#define le_cs_capable(dev) \ + ((dev)->le_features[5] & HCI_LE_CS) +#define le_cs_host_capable(dev) \ + ((dev)->le_features[5] & HCI_LE_CS_HOST) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ - (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) + (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 @@ -1988,7 +2096,9 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, case ESCO_LINK: return sco_connect_ind(hdev, bdaddr, flags); - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: + case PA_LINK: return iso_connect_ind(hdev, bdaddr, flags); default: @@ -2232,6 +2342,7 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode); void *hci_recv_event_data(struct hci_dev *hdev, __u8 event); u32 hci_conn_get_phy(struct hci_conn *conn); +int hci_conn_set_phy(struct hci_conn *conn, u32 phys); /* ----- HCI Sockets ----- */ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb); @@ -2354,8 +2465,6 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status); void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, u8 status); void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status); -void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status); -void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len, @@ -2381,13 +2490,12 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); -void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, bdaddr_t *bdaddr, u8 addr_type); int hci_abort_conn(struct hci_conn *conn, u8 reason); -u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, +void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, __u8 ltk[16], __u8 key_size); diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h new file mode 100644 index 000000000000..3fd6fdbdb02e --- /dev/null +++ b/include/net/bluetooth/hci_drv.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Google Corporation + */ + +#ifndef __HCI_DRV_H +#define __HCI_DRV_H + +#include <linux/types.h> + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci.h> + +struct hci_drv_cmd_hdr { + __le16 opcode; + __le16 len; +} __packed; + +struct hci_drv_ev_hdr { + __le16 opcode; + __le16 len; +} __packed; + +#define HCI_DRV_EV_CMD_STATUS 0x0000 +struct hci_drv_ev_cmd_status { + __le16 opcode; + __u8 status; +} __packed; + +#define HCI_DRV_EV_CMD_COMPLETE 0x0001 +struct hci_drv_ev_cmd_complete { + __le16 opcode; + __u8 status; + __u8 data[]; +} __packed; + +#define HCI_DRV_STATUS_SUCCESS 0x00 +#define HCI_DRV_STATUS_UNSPECIFIED_ERROR 0x01 +#define HCI_DRV_STATUS_UNKNOWN_COMMAND 0x02 +#define HCI_DRV_STATUS_INVALID_PARAMETERS 0x03 + +#define HCI_DRV_MAX_DRIVER_NAME_LENGTH 32 + +/* Common commands that make sense on all drivers start from 0x0000 */ +#define HCI_DRV_OP_READ_INFO 0x0000 +#define HCI_DRV_READ_INFO_SIZE 0 +struct hci_drv_rp_read_info { + __u8 driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH]; + __le16 num_supported_commands; + __le16 supported_commands[] __counted_by_le(num_supported_commands); +} __packed; + +/* Driver specific OGF (Opcode Group Field) + * Commands in this group may have different meanings across different drivers. + */ +#define HCI_DRV_OGF_DRIVER_SPECIFIC 0x01 + +int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status); +int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp, + size_t rp_len); +int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *cmd_skb); + +struct hci_drv_handler { + int (*func)(struct hci_dev *hdev, void *data, u16 data_len); + size_t data_len; +}; + +struct hci_drv { + size_t common_handler_count; + const struct hci_drv_handler *common_handlers; + + size_t specific_handler_count; + const struct hci_drv_handler *specific_handlers; +}; + +#endif /* __HCI_DRV_H */ diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h index 082f89531b88..bbd752494ef9 100644 --- a/include/net/bluetooth/hci_mon.h +++ b/include/net/bluetooth/hci_mon.h @@ -51,6 +51,8 @@ struct hci_mon_hdr { #define HCI_MON_CTRL_EVENT 17 #define HCI_MON_ISO_TX_PKT 18 #define HCI_MON_ISO_RX_PKT 19 +#define HCI_MON_DRV_TX_PKT 20 +#define HCI_MON_DRV_RX_PKT 21 struct hci_mon_new_index { __u8 type; diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 7e2cf0cca939..73e494b2591d 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -93,7 +93,7 @@ int hci_update_class_sync(struct hci_dev *hdev); int hci_update_eir_sync(struct hci_dev *hdev); int hci_update_class_sync(struct hci_dev *hdev); -int hci_update_name_sync(struct hci_dev *hdev); +int hci_update_name_sync(struct hci_dev *hdev, const u8 *name); int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode); int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, @@ -115,8 +115,8 @@ int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance); int hci_enable_advertising_sync(struct hci_dev *hdev); int hci_enable_advertising(struct hci_dev *hdev); -int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, - u8 *data, u32 flags, u16 min_interval, +int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid, + u8 data_len, u8 *data, u32 flags, u16 min_interval, u16 max_interval, u16 sync_interval); int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance); @@ -185,3 +185,12 @@ int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn); int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn); int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, struct hci_conn_params *params); + +int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_past_sync(struct hci_conn *conn, struct hci_conn *le); + +int hci_le_read_remote_features(struct hci_conn *conn); + +int hci_acl_change_pkt_type(struct hci_conn *conn, u16 pkt_type); +int hci_le_set_phy(struct hci_conn *conn, u8 tx_phys, u8 rx_phys); diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 9189354c568f..e0a1f2293679 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -33,6 +33,7 @@ /* L2CAP defaults */ #define L2CAP_DEFAULT_MTU 672 #define L2CAP_DEFAULT_MIN_MTU 48 +#define L2CAP_SIG_MTU 48 /* BR/EDR signaling MTU */ #define L2CAP_DEFAULT_FLUSH_TO 0xFFFF #define L2CAP_EFS_DEFAULT_FLUSH_TO 0xFFFFFFFF #define L2CAP_DEFAULT_TX_WINDOW 63 @@ -284,9 +285,9 @@ struct l2cap_conn_rsp { #define L2CAP_CR_LE_BAD_KEY_SIZE 0x0007 #define L2CAP_CR_LE_ENCRYPTION 0x0008 #define L2CAP_CR_LE_INVALID_SCID 0x0009 -#define L2CAP_CR_LE_SCID_IN_USE 0X000A -#define L2CAP_CR_LE_UNACCEPT_PARAMS 0X000B -#define L2CAP_CR_LE_INVALID_PARAMS 0X000C +#define L2CAP_CR_LE_SCID_IN_USE 0x000A +#define L2CAP_CR_LE_UNACCEPT_PARAMS 0x000B +#define L2CAP_CR_LE_INVALID_PARAMS 0x000C /* connect/create channel status */ #define L2CAP_CS_NO_INFO 0x0000 @@ -493,6 +494,8 @@ struct l2cap_ecred_reconf_req { #define L2CAP_RECONF_SUCCESS 0x0000 #define L2CAP_RECONF_INVALID_MTU 0x0001 #define L2CAP_RECONF_INVALID_MPS 0x0002 +#define L2CAP_RECONF_INVALID_CID 0x0003 +#define L2CAP_RECONF_INVALID_PARAMS 0x0004 struct l2cap_ecred_reconf_rsp { __le16 result; @@ -655,8 +658,8 @@ struct l2cap_conn { struct sk_buff *rx_skb; __u32 rx_len; + struct ida tx_ida; __u8 tx_ident; - struct mutex ident_lock; struct sk_buff_head pending_rx; struct work_struct pending_rx_work; @@ -955,7 +958,8 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason); int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst, u8 dst_type, u16 timeout); int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu); -int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len); +int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, + const struct sockcm_cookie *sockc); void l2cap_chan_busy(struct l2cap_chan *chan, int busy); void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail); int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index affac861efdc..8234915854b6 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -53,10 +53,15 @@ struct mgmt_hdr { } __packed; struct mgmt_tlv { - __le16 type; - __u8 length; + /* New members MUST be added within the __struct_group() macro below. */ + __struct_group(mgmt_tlv_hdr, __hdr, __packed, + __le16 type; + __u8 length; + ); __u8 value[]; } __packed; +static_assert(offsetof(struct mgmt_tlv, value) == sizeof(struct mgmt_tlv_hdr), + "struct member likely outside of __struct_group()"); struct mgmt_addr_info { bdaddr_t bdaddr; @@ -113,6 +118,9 @@ struct mgmt_rp_read_index_list { #define MGMT_SETTING_CIS_PERIPHERAL BIT(19) #define MGMT_SETTING_ISO_BROADCASTER BIT(20) #define MGMT_SETTING_ISO_SYNC_RECEIVER BIT(21) +#define MGMT_SETTING_LL_PRIVACY BIT(22) +#define MGMT_SETTING_PAST_SENDER BIT(23) +#define MGMT_SETTING_PAST_RECEIVER BIT(24) #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 @@ -774,7 +782,7 @@ struct mgmt_adv_pattern { __u8 ad_type; __u8 offset; __u8 length; - __u8 value[31]; + __u8 value[HCI_MAX_AD_LENGTH]; } __packed; #define MGMT_OP_ADD_ADV_PATTERNS_MONITOR 0x0052 @@ -847,7 +855,7 @@ struct mgmt_cp_set_mesh { __le16 window; __le16 period; __u8 num_ad_types; - __u8 ad_types[]; + __u8 ad_types[] __counted_by(num_ad_types); } __packed; #define MGMT_SET_MESH_RECEIVER_SIZE 6 diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index 2053cd8e788a..05572c19e14b 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -26,6 +26,7 @@ enum { BOND_AD_STABLE = 0, BOND_AD_BANDWIDTH = 1, BOND_AD_COUNT = 2, + BOND_AD_PRIO = 3, }; /* rx machine states(43.4.11 in the 802.3ad standard) */ @@ -242,7 +243,7 @@ typedef struct port { churn_state_t sm_churn_actor_state; churn_state_t sm_churn_partner_state; struct slave *slave; /* pointer to the bond slave that this port belongs to */ - struct aggregator *aggregator; /* pointer to an aggregator that this port related to */ + struct aggregator __rcu *aggregator; /* pointer to an aggregator that this port related to */ struct port *next_port_in_aggregator; /* Next port on the linked list of the parent aggregator */ u32 transaction_id; /* continuous number for identification of Marker PDU's; */ struct lacpdu lacpdu; /* the lacpdu that will be sent for this port */ @@ -274,6 +275,7 @@ struct ad_slave_info { struct port port; /* 802.3ad port structure */ struct bond_3ad_stats stats; u16 id; + u16 port_priority; }; static inline const char *bond_3ad_churn_desc(churn_state_t state) @@ -307,6 +309,7 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); int bond_3ad_set_carrier(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); +void bond_3ad_update_lacp_active(struct bonding *bond); void bond_3ad_update_ad_actor_settings(struct bonding *bond); int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats); size_t bond_3ad_stats_size(void); diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 18687ccf0638..e6eedf23aea1 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -77,6 +77,8 @@ enum { BOND_OPT_NS_TARGETS, BOND_OPT_PRIO, BOND_OPT_COUPLED_CONTROL, + BOND_OPT_BROADCAST_NEIGH, + BOND_OPT_ACTOR_PORT_PRIO, BOND_OPT_LAST }; diff --git a/include/net/bonding.h b/include/net/bonding.h index 8bb5f016969f..edd1942dcd73 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -69,9 +69,6 @@ #define bond_first_slave_rcu(bond) \ netdev_lower_get_first_private_rcu(bond->dev) -#define bond_is_first_slave(bond, pos) (pos == bond_first_slave(bond)) -#define bond_is_last_slave(bond, pos) (pos == bond_last_slave(bond)) - /** * bond_for_each_slave - iterate over all slaves * @bond: the bond holding this list @@ -91,22 +88,22 @@ NETIF_F_GSO_ESP) #ifdef CONFIG_NET_POLL_CONTROLLER -extern atomic_t netpoll_block_tx; +DECLARE_STATIC_KEY_FALSE(netpoll_block_tx); static inline void block_netpoll_tx(void) { - atomic_inc(&netpoll_block_tx); + static_branch_inc(&netpoll_block_tx); } static inline void unblock_netpoll_tx(void) { - atomic_dec(&netpoll_block_tx); + static_branch_dec(&netpoll_block_tx); } static inline int is_netpoll_tx_blocked(struct net_device *dev) { - if (unlikely(netpoll_tx_running(dev))) - return atomic_read(&netpoll_block_tx); + if (static_branch_unlikely(&netpoll_block_tx)) + return netpoll_tx_running(dev); return 0; } #else @@ -115,6 +112,8 @@ static inline int is_netpoll_tx_blocked(struct net_device *dev) #define is_netpoll_tx_blocked(dev) (0) #endif +DECLARE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled); + struct bond_params { int mode; int xmit_policy; @@ -124,7 +123,6 @@ struct bond_params { int arp_interval; int arp_validate; int arp_all_targets; - int use_carrier; int fail_over_mac; int updelay; int downdelay; @@ -149,6 +147,7 @@ struct bond_params { struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; #endif int coupled_control; + int broadcast_neighbor; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; @@ -252,6 +251,7 @@ struct bonding { struct delayed_work ad_work; struct delayed_work mcast_work; struct delayed_work slave_arr_work; + struct delayed_work peer_notify_work; #ifdef CONFIG_DEBUG_FS /* debugging support via debugfs */ struct dentry *debug_dir; @@ -519,13 +519,14 @@ static inline int bond_is_ip6_target_ok(struct in6_addr *addr) static inline unsigned long slave_oldest_target_arp_rx(struct bonding *bond, struct slave *slave) { + unsigned long tmp, ret = READ_ONCE(slave->target_last_arp_rx[0]); int i = 1; - unsigned long ret = slave->target_last_arp_rx[0]; - - for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++) - if (time_before(slave->target_last_arp_rx[i], ret)) - ret = slave->target_last_arp_rx[i]; + for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++) { + tmp = READ_ONCE(slave->target_last_arp_rx[i]); + if (time_before(tmp, ret)) + ret = tmp; + } return ret; } @@ -535,7 +536,7 @@ static inline unsigned long slave_last_rx(struct bonding *bond, if (bond->params.arp_all_targets == BOND_ARP_TARGETS_ALL) return slave_oldest_target_arp_rx(bond, slave); - return slave->last_rx; + return READ_ONCE(slave->last_rx); } static inline void slave_update_last_tx(struct slave *slave) @@ -695,6 +696,8 @@ void bond_debug_register(struct bonding *bond); void bond_debug_unregister(struct bonding *bond); void bond_debug_reregister(struct bonding *bond); const char *bond_mode_name(int mode); +bool __bond_xdp_check(int mode, int xmit_policy); +bool bond_xdp_check(struct bonding *bond, int mode); void bond_setup(struct net_device *bond_dev); unsigned int bond_get_num_tx_queues(void); int bond_netlink_init(void); @@ -706,7 +709,9 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, int level); int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); +void bond_peer_notify_work_rearm(struct bonding *bond, unsigned long delay); void bond_work_init_all(struct bonding *bond); +void bond_work_cancel_all(struct bonding *bond); #ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond); diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index c39a426ebf52..6e172d0f6ef5 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -24,6 +24,11 @@ */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) +static inline bool napi_id_valid(unsigned int napi_id) +{ + return napi_id >= MIN_NAPI_ID; +} + #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL @@ -114,7 +119,7 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); - if (napi_id >= MIN_NAPI_ID) + if (napi_id_valid(napi_id)) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); @@ -122,18 +127,24 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) } /* used in the NIC receive handler to mark the skb */ -static inline void skb_mark_napi_id(struct sk_buff *skb, - struct napi_struct *napi) +static inline void __skb_mark_napi_id(struct sk_buff *skb, + const struct gro_node *gro) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ - if (skb->napi_id < MIN_NAPI_ID) - skb->napi_id = napi->napi_id; + if (!napi_id_valid(skb->napi_id)) + skb->napi_id = gro->cached_napi_id; #endif } +static inline void skb_mark_napi_id(struct sk_buff *skb, + const struct napi_struct *napi) +{ + __skb_mark_napi_id(skb, &napi->gro); +} + /* used in the protocol handler to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { diff --git a/include/net/caif/caif_dev.h b/include/net/caif/caif_dev.h deleted file mode 100644 index b655d8666f55..000000000000 --- a/include/net/caif/caif_dev.h +++ /dev/null @@ -1,128 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CAIF_DEV_H_ -#define CAIF_DEV_H_ - -#include <net/caif/caif_layer.h> -#include <net/caif/cfcnfg.h> -#include <net/caif/caif_device.h> -#include <linux/caif/caif_socket.h> -#include <linux/if.h> -#include <linux/net.h> - -/** - * struct caif_param - CAIF parameters. - * @size: Length of data - * @data: Binary Data Blob - */ -struct caif_param { - u16 size; - u8 data[256]; -}; - -/** - * struct caif_connect_request - Request data for CAIF channel setup. - * @protocol: Type of CAIF protocol to use (at, datagram etc) - * @sockaddr: Socket address to connect. - * @priority: Priority of the connection. - * @link_selector: Link selector (high bandwidth or low latency) - * @ifindex: kernel index of the interface. - * @param: Connect Request parameters (CAIF_SO_REQ_PARAM). - * - * This struct is used when connecting a CAIF channel. - * It contains all CAIF channel configuration options. - */ -struct caif_connect_request { - enum caif_protocol_type protocol; - struct sockaddr_caif sockaddr; - enum caif_channel_priority priority; - enum caif_link_selector link_selector; - int ifindex; - struct caif_param param; -}; - -/** - * caif_connect_client - Connect a client to CAIF Core Stack. - * @config: Channel setup parameters, specifying what address - * to connect on the Modem. - * @client_layer: User implementation of client layer. This layer - * MUST have receive and control callback functions - * implemented. - * @ifindex: Link layer interface index used for this connection. - * @headroom: Head room needed by CAIF protocol. - * @tailroom: Tail room needed by CAIF protocol. - * - * This function connects a CAIF channel. The Client must implement - * the struct cflayer. This layer represents the Client layer and holds - * receive functions and control callback functions. Control callback - * function will receive information about connect/disconnect responses, - * flow control etc (see enum caif_control). - * E.g. CAIF Socket will call this function for each socket it connects - * and have one client_layer instance for each socket. - */ -int caif_connect_client(struct net *net, - struct caif_connect_request *conn_req, - struct cflayer *client_layer, int *ifindex, - int *headroom, int *tailroom); - -/** - * caif_disconnect_client - Disconnects a client from the CAIF stack. - * - * @client_layer: Client layer to be disconnected. - */ -int caif_disconnect_client(struct net *net, struct cflayer *client_layer); - - -/** - * caif_client_register_refcnt - register ref-count functions provided by client. - * - * @adapt_layer: Client layer using CAIF Stack. - * @hold: Function provided by client layer increasing ref-count - * @put: Function provided by client layer decreasing ref-count - * - * Client of the CAIF Stack must register functions for reference counting. - * These functions are called by the CAIF Stack for every upstream packet, - * and must therefore be implemented efficiently. - * - * Client should call caif_free_client when reference count degrease to zero. - */ - -void caif_client_register_refcnt(struct cflayer *adapt_layer, - void (*hold)(struct cflayer *lyr), - void (*put)(struct cflayer *lyr)); -/** - * caif_free_client - Free memory used to manage the client in the CAIF Stack. - * - * @client_layer: Client layer to be removed. - * - * This function must be called from client layer in order to free memory. - * Caller must guarantee that no packets are in flight upstream when calling - * this function. - */ -void caif_free_client(struct cflayer *adap_layer); - -/** - * struct caif_enroll_dev - Enroll a net-device as a CAIF Link layer - * @dev: Network device to enroll. - * @caifdev: Configuration information from CAIF Link Layer - * @link_support: Link layer support layer - * @head_room: Head room needed by link support layer - * @layer: Lowest layer in CAIF stack - * @rcv_fun: Receive function for CAIF stack. - * - * This function enroll a CAIF link layer into CAIF Stack and - * expects the interface to be able to handle CAIF payload. - * The link_support layer is used to add any Link Layer specific - * framing. - */ -int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, - struct cflayer *link_support, int head_room, - struct cflayer **layer, int (**rcv_func)( - struct sk_buff *, struct net_device *, - struct packet_type *, struct net_device *)); - -#endif /* CAIF_DEV_H_ */ diff --git a/include/net/caif/caif_device.h b/include/net/caif/caif_device.h deleted file mode 100644 index 91d1fd5b44a4..000000000000 --- a/include/net/caif/caif_device.h +++ /dev/null @@ -1,55 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CAIF_DEVICE_H_ -#define CAIF_DEVICE_H_ -#include <linux/kernel.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/caif/caif_socket.h> -#include <net/caif/caif_device.h> - -/** - * struct caif_dev_common - data shared between CAIF drivers and stack. - * @flowctrl: Flow Control callback function. This function is - * supplied by CAIF Core Stack and is used by CAIF - * Link Layer to send flow-stop to CAIF Core. - * The flow information will be distributed to all - * clients of CAIF. - * - * @link_select: Profile of device, either high-bandwidth or - * low-latency. This member is set by CAIF Link - * Layer Device in order to indicate if this device - * is a high bandwidth or low latency device. - * - * @use_frag: CAIF Frames may be fragmented. - * Is set by CAIF Link Layer in order to indicate if the - * interface receives fragmented frames that must be - * assembled by CAIF Core Layer. - * - * @use_fcs: Indicate if Frame CheckSum (fcs) is used. - * Is set if the physical interface is - * using Frame Checksum on the CAIF Frames. - * - * @use_stx: Indicate STart of frame eXtension (stx) in use. - * Is set if the CAIF Link Layer expects - * CAIF Frames to start with the STX byte. - * - * This structure is shared between the CAIF drivers and the CAIF stack. - * It is used by the device to register its behavior. - * CAIF Core layer must set the member flowctrl in order to supply - * CAIF Link Layer with the flow control function. - * - */ - struct caif_dev_common { - void (*flowctrl)(struct net_device *net, int on); - enum caif_link_selector link_select; - int use_frag; - int use_fcs; - int use_stx; -}; - -#endif /* CAIF_DEVICE_H_ */ diff --git a/include/net/caif/caif_layer.h b/include/net/caif/caif_layer.h deleted file mode 100644 index 053e7c6a6a66..000000000000 --- a/include/net/caif/caif_layer.h +++ /dev/null @@ -1,277 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CAIF_LAYER_H_ -#define CAIF_LAYER_H_ - -#include <linux/list.h> - -struct cflayer; -struct cfpkt; -struct caif_payload_info; - -#define CAIF_LAYER_NAME_SZ 16 - -/** - * caif_assert() - Assert function for CAIF. - * @assert: expression to evaluate. - * - * This function will print a error message and a do WARN_ON if the - * assertion fails. Normally this will do a stack up at the current location. - */ -#define caif_assert(assert) \ -do { \ - if (!(assert)) { \ - pr_err("caif:Assert detected:'%s'\n", #assert); \ - WARN_ON(!(assert)); \ - } \ -} while (0) - -/** - * enum caif_ctrlcmd - CAIF Stack Control Signaling sent in layer.ctrlcmd(). - * - * @CAIF_CTRLCMD_FLOW_OFF_IND: Flow Control is OFF, transmit function - * should stop sending data - * - * @CAIF_CTRLCMD_FLOW_ON_IND: Flow Control is ON, transmit function - * can start sending data - * - * @CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: Remote end modem has decided to close - * down channel - * - * @CAIF_CTRLCMD_INIT_RSP: Called initially when the layer below - * has finished initialization - * - * @CAIF_CTRLCMD_DEINIT_RSP: Called when de-initialization is - * complete - * - * @CAIF_CTRLCMD_INIT_FAIL_RSP: Called if initialization fails - * - * @_CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: CAIF Link layer temporarily cannot - * send more packets. - * @_CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: Called if CAIF Link layer is able - * to send packets again. - * @_CAIF_CTRLCMD_PHYIF_DOWN_IND: Called if CAIF Link layer is going - * down. - * - * These commands are sent upwards in the CAIF stack to the CAIF Client. - * They are used for signaling originating from the modem or CAIF Link Layer. - * These are either responses (*_RSP) or events (*_IND). - */ -enum caif_ctrlcmd { - CAIF_CTRLCMD_FLOW_OFF_IND, - CAIF_CTRLCMD_FLOW_ON_IND, - CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, - CAIF_CTRLCMD_INIT_RSP, - CAIF_CTRLCMD_DEINIT_RSP, - CAIF_CTRLCMD_INIT_FAIL_RSP, - _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND, - _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND, - _CAIF_CTRLCMD_PHYIF_DOWN_IND, -}; - -/** - * enum caif_modemcmd - Modem Control Signaling, sent from CAIF Client - * to the CAIF Link Layer or modem. - * - * @CAIF_MODEMCMD_FLOW_ON_REQ: Flow Control is ON, transmit function - * can start sending data. - * - * @CAIF_MODEMCMD_FLOW_OFF_REQ: Flow Control is OFF, transmit function - * should stop sending data. - * - * @_CAIF_MODEMCMD_PHYIF_USEFULL: Notify physical layer that it is in use - * - * @_CAIF_MODEMCMD_PHYIF_USELESS: Notify physical layer that it is - * no longer in use. - * - * These are requests sent 'downwards' in the stack. - * Flow ON, OFF can be indicated to the modem. - */ -enum caif_modemcmd { - CAIF_MODEMCMD_FLOW_ON_REQ = 0, - CAIF_MODEMCMD_FLOW_OFF_REQ = 1, - _CAIF_MODEMCMD_PHYIF_USEFULL = 3, - _CAIF_MODEMCMD_PHYIF_USELESS = 4 -}; - -/** - * enum caif_direction - CAIF Packet Direction. - * Indicate if a packet is to be sent out or to be received in. - * @CAIF_DIR_IN: Incoming packet received. - * @CAIF_DIR_OUT: Outgoing packet to be transmitted. - */ -enum caif_direction { - CAIF_DIR_IN = 0, - CAIF_DIR_OUT = 1 -}; - -/** - * struct cflayer - CAIF Stack layer. - * Defines the framework for the CAIF Core Stack. - * @up: Pointer up to the layer above. - * @dn: Pointer down to the layer below. - * @node: List node used when layer participate in a list. - * @receive: Packet receive function. - * @transmit: Packet transmit function. - * @ctrlcmd: Used for control signalling upwards in the stack. - * @modemcmd: Used for control signaling downwards in the stack. - * @id: The identity of this layer - * @name: Name of the layer. - * - * This structure defines the layered structure in CAIF. - * - * It defines CAIF layering structure, used by all CAIF Layers and the - * layers interfacing CAIF. - * - * In order to integrate with CAIF an adaptation layer on top of the CAIF stack - * and PHY layer below the CAIF stack - * must be implemented. These layer must follow the design principles below. - * - * Principles for layering of protocol layers: - * - All layers must use this structure. If embedding it, then place this - * structure first in the layer specific structure. - * - * - Each layer should not depend on any others layer's private data. - * - * - In order to send data upwards do - * layer->up->receive(layer->up, packet); - * - * - In order to send data downwards do - * layer->dn->transmit(layer->dn, info, packet); - */ -struct cflayer { - struct cflayer *up; - struct cflayer *dn; - struct list_head node; - - /* - * receive() - Receive Function (non-blocking). - * Contract: Each layer must implement a receive function passing the - * CAIF packets upwards in the stack. - * Packet handling rules: - * - The CAIF packet (cfpkt) ownership is passed to the - * called receive function. This means that the - * packet cannot be accessed after passing it to the - * above layer using up->receive(). - * - * - If parsing of the packet fails, the packet must be - * destroyed and negative error code returned - * from the function. - * EXCEPTION: If the framing layer (cffrml) returns - * -EILSEQ, the packet is not freed. - * - * - If parsing succeeds (and above layers return OK) then - * the function must return a value >= 0. - * - * Returns result < 0 indicates an error, 0 or positive value - * indicates success. - * - * @layr: Pointer to the current layer the receive function is - * implemented for (this pointer). - * @cfpkt: Pointer to CaifPacket to be handled. - */ - int (*receive)(struct cflayer *layr, struct cfpkt *cfpkt); - - /* - * transmit() - Transmit Function (non-blocking). - * Contract: Each layer must implement a transmit function passing the - * CAIF packet downwards in the stack. - * Packet handling rules: - * - The CAIF packet (cfpkt) ownership is passed to the - * transmit function. This means that the packet - * cannot be accessed after passing it to the below - * layer using dn->transmit(). - * - * - Upon error the packet ownership is still passed on, - * so the packet shall be freed where error is detected. - * Callers of the transmit function shall not free packets, - * but errors shall be returned. - * - * - Return value less than zero means error, zero or - * greater than zero means OK. - * - * Returns result < 0 indicates an error, 0 or positive value - * indicates success. - * - * @layr: Pointer to the current layer the receive function - * isimplemented for (this pointer). - * @cfpkt: Pointer to CaifPacket to be handled. - */ - int (*transmit) (struct cflayer *layr, struct cfpkt *cfpkt); - - /* - * cttrlcmd() - Control Function upwards in CAIF Stack (non-blocking). - * Used for signaling responses (CAIF_CTRLCMD_*_RSP) - * and asynchronous events from the modem (CAIF_CTRLCMD_*_IND) - * - * @layr: Pointer to the current layer the receive function - * is implemented for (this pointer). - * @ctrl: Control Command. - */ - void (*ctrlcmd) (struct cflayer *layr, enum caif_ctrlcmd ctrl, - int phyid); - - /* - * modemctrl() - Control Function used for controlling the modem. - * Used to signal down-wards in the CAIF stack. - * Returns 0 on success, < 0 upon failure. - * - * @layr: Pointer to the current layer the receive function - * is implemented for (this pointer). - * @ctrl: Control Command. - */ - int (*modemcmd) (struct cflayer *layr, enum caif_modemcmd ctrl); - - unsigned int id; - char name[CAIF_LAYER_NAME_SZ]; -}; - -/** - * layer_set_up() - Set the up pointer for a specified layer. - * @layr: Layer where up pointer shall be set. - * @above: Layer above. - */ -#define layer_set_up(layr, above) ((layr)->up = (struct cflayer *)(above)) - -/** - * layer_set_dn() - Set the down pointer for a specified layer. - * @layr: Layer where down pointer shall be set. - * @below: Layer below. - */ -#define layer_set_dn(layr, below) ((layr)->dn = (struct cflayer *)(below)) - -/** - * struct dev_info - Physical Device info information about physical layer. - * @dev: Pointer to native physical device. - * @id: Physical ID of the physical connection used by the - * logical CAIF connection. Used by service layers to - * identify their physical id to Caif MUX (CFMUXL)so - * that the MUX can add the correct physical ID to the - * packet. - */ -struct dev_info { - void *dev; - unsigned int id; -}; - -/** - * struct caif_payload_info - Payload information embedded in packet (sk_buff). - * - * @dev_info: Information about the receiving device. - * - * @hdr_len: Header length, used to align pay load on 32bit boundary. - * - * @channel_id: Channel ID of the logical CAIF connection. - * Used by mux to insert channel id into the caif packet. - */ -struct caif_payload_info { - struct dev_info *dev_info; - unsigned short hdr_len; - unsigned short channel_id; -}; - -#endif /* CAIF_LAYER_H_ */ diff --git a/include/net/caif/cfcnfg.h b/include/net/caif/cfcnfg.h deleted file mode 100644 index 8819ff4db35a..000000000000 --- a/include/net/caif/cfcnfg.h +++ /dev/null @@ -1,90 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CFCNFG_H_ -#define CFCNFG_H_ -#include <linux/spinlock.h> -#include <linux/netdevice.h> -#include <net/caif/caif_layer.h> -#include <net/caif/cfctrl.h> - -struct cfcnfg; - -/** - * enum cfcnfg_phy_preference - Physical preference HW Abstraction - * - * @CFPHYPREF_UNSPECIFIED: Default physical interface - * - * @CFPHYPREF_LOW_LAT: Default physical interface for low-latency - * traffic - * @CFPHYPREF_HIGH_BW: Default physical interface for high-bandwidth - * traffic - * @CFPHYPREF_LOOP: TEST only Loopback interface simulating modem - * responses. - * - */ -enum cfcnfg_phy_preference { - CFPHYPREF_UNSPECIFIED, - CFPHYPREF_LOW_LAT, - CFPHYPREF_HIGH_BW, - CFPHYPREF_LOOP -}; - -/** - * cfcnfg_create() - Get the CAIF configuration object given network. - * @net: Network for the CAIF configuration object. - */ -struct cfcnfg *get_cfcnfg(struct net *net); - -/** - * cfcnfg_create() - Create the CAIF configuration object. - */ -struct cfcnfg *cfcnfg_create(void); - -/** - * cfcnfg_remove() - Remove the CFCNFG object - * @cfg: config object - */ -void cfcnfg_remove(struct cfcnfg *cfg); - -/** - * cfcnfg_add_phy_layer() - Adds a physical layer to the CAIF stack. - * @cnfg: Pointer to a CAIF configuration object, created by - * cfcnfg_create(). - * @dev: Pointer to link layer device - * @phy_layer: Specify the physical layer. The transmit function - * MUST be set in the structure. - * @pref: The phy (link layer) preference. - * @link_support: Protocol implementation for link layer specific protocol. - * @fcs: Specify if checksum is used in CAIF Framing Layer. - * @head_room: Head space needed by link specific protocol. - */ -int -cfcnfg_add_phy_layer(struct cfcnfg *cnfg, - struct net_device *dev, struct cflayer *phy_layer, - enum cfcnfg_phy_preference pref, - struct cflayer *link_support, - bool fcs, int head_room); - -/** - * cfcnfg_del_phy_layer - Deletes an phy layer from the CAIF stack. - * - * @cnfg: Pointer to a CAIF configuration object, created by - * cfcnfg_create(). - * @phy_layer: Adaptation layer to be removed. - */ -int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer); - -/** - * cfcnfg_set_phy_state() - Set the state of the physical interface device. - * @cnfg: Configuration object - * @phy_layer: Physical Layer representation - * @up: State of device - */ -int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer, - bool up); - -#endif /* CFCNFG_H_ */ diff --git a/include/net/caif/cfctrl.h b/include/net/caif/cfctrl.h deleted file mode 100644 index 86d17315c8a1..000000000000 --- a/include/net/caif/cfctrl.h +++ /dev/null @@ -1,130 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CFCTRL_H_ -#define CFCTRL_H_ -#include <net/caif/caif_layer.h> -#include <net/caif/cfsrvl.h> - -/* CAIF Control packet commands */ -enum cfctrl_cmd { - CFCTRL_CMD_LINK_SETUP = 0, - CFCTRL_CMD_LINK_DESTROY = 1, - CFCTRL_CMD_LINK_ERR = 2, - CFCTRL_CMD_ENUM = 3, - CFCTRL_CMD_SLEEP = 4, - CFCTRL_CMD_WAKE = 5, - CFCTRL_CMD_LINK_RECONF = 6, - CFCTRL_CMD_START_REASON = 7, - CFCTRL_CMD_RADIO_SET = 8, - CFCTRL_CMD_MODEM_SET = 9, - CFCTRL_CMD_MASK = 0xf -}; - -/* Channel types */ -enum cfctrl_srv { - CFCTRL_SRV_DECM = 0, - CFCTRL_SRV_VEI = 1, - CFCTRL_SRV_VIDEO = 2, - CFCTRL_SRV_DBG = 3, - CFCTRL_SRV_DATAGRAM = 4, - CFCTRL_SRV_RFM = 5, - CFCTRL_SRV_UTIL = 6, - CFCTRL_SRV_MASK = 0xf -}; - -#define CFCTRL_RSP_BIT 0x20 -#define CFCTRL_ERR_BIT 0x10 - -struct cfctrl_rsp { - void (*linksetup_rsp)(struct cflayer *layer, u8 linkid, - enum cfctrl_srv serv, u8 phyid, - struct cflayer *adapt_layer); - void (*linkdestroy_rsp)(struct cflayer *layer, u8 linkid); - void (*linkerror_ind)(void); - void (*enum_rsp)(void); - void (*sleep_rsp)(void); - void (*wake_rsp)(void); - void (*restart_rsp)(void); - void (*radioset_rsp)(void); - void (*reject_rsp)(struct cflayer *layer, u8 linkid, - struct cflayer *client_layer); -}; - -/* Link Setup Parameters for CAIF-Links. */ -struct cfctrl_link_param { - enum cfctrl_srv linktype;/* (T3,T0) Type of Channel */ - u8 priority; /* (P4,P0) Priority of the channel */ - u8 phyid; /* (U2-U0) Physical interface to connect */ - u8 endpoint; /* (E1,E0) Endpoint for data channels */ - u8 chtype; /* (H1,H0) Channel-Type, applies to - * VEI, DEBUG */ - union { - struct { - u8 connid; /* (D7,D0) Video LinkId */ - } video; - - struct { - u32 connid; /* (N31,Ngit0) Connection ID used - * for Datagram */ - } datagram; - - struct { - u32 connid; /* Connection ID used for RFM */ - char volume[20]; /* Volume to mount for RFM */ - } rfm; /* Configuration for RFM */ - - struct { - u16 fifosize_kb; /* Psock FIFO size in KB */ - u16 fifosize_bufs; /* Psock # signal buffers */ - char name[16]; /* Name of the PSOCK service */ - u8 params[255]; /* Link setup Parameters> */ - u16 paramlen; /* Length of Link Setup - * Parameters */ - } utility; /* Configuration for Utility Links (Psock) */ - } u; -}; - -/* This structure is used internally in CFCTRL */ -struct cfctrl_request_info { - int sequence_no; - enum cfctrl_cmd cmd; - u8 channel_id; - struct cfctrl_link_param param; - struct cflayer *client_layer; - struct list_head list; -}; - -struct cfctrl { - struct cfsrvl serv; - struct cfctrl_rsp res; - atomic_t req_seq_no; - atomic_t rsp_seq_no; - struct list_head list; - /* Protects from simultaneous access to first_req list */ - spinlock_t info_list_lock; -#ifndef CAIF_NO_LOOP - u8 loop_linkid; - int loop_linkused[256]; - /* Protects simultaneous access to loop_linkid and loop_linkused */ - spinlock_t loop_linkid_lock; -#endif - -}; - -void cfctrl_enum_req(struct cflayer *cfctrl, u8 physlinkid); -int cfctrl_linkup_request(struct cflayer *cfctrl, - struct cfctrl_link_param *param, - struct cflayer *user_layer); -int cfctrl_linkdown_req(struct cflayer *cfctrl, u8 linkid, - struct cflayer *client); - -struct cflayer *cfctrl_create(void); -struct cfctrl_rsp *cfctrl_get_respfuncs(struct cflayer *layer); -int cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer); -void cfctrl_remove(struct cflayer *layr); - -#endif /* CFCTRL_H_ */ diff --git a/include/net/caif/cffrml.h b/include/net/caif/cffrml.h deleted file mode 100644 index 1ab8a80ede4d..000000000000 --- a/include/net/caif/cffrml.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CFFRML_H_ -#define CFFRML_H_ -#include <net/caif/caif_layer.h> -#include <linux/netdevice.h> - -struct cffrml; -struct cflayer *cffrml_create(u16 phyid, bool use_fcs); -void cffrml_free(struct cflayer *layr); -void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up); -void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn); -void cffrml_put(struct cflayer *layr); -void cffrml_hold(struct cflayer *layr); -int cffrml_refcnt_read(struct cflayer *layr); - -#endif /* CFFRML_H_ */ diff --git a/include/net/caif/cfmuxl.h b/include/net/caif/cfmuxl.h deleted file mode 100644 index 92ccb2648309..000000000000 --- a/include/net/caif/cfmuxl.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CFMUXL_H_ -#define CFMUXL_H_ -#include <net/caif/caif_layer.h> - -struct cfsrvl; -struct cffrml; - -struct cflayer *cfmuxl_create(void); -int cfmuxl_set_uplayer(struct cflayer *layr, struct cflayer *up, u8 linkid); -struct cflayer *cfmuxl_remove_dnlayer(struct cflayer *layr, u8 phyid); -int cfmuxl_set_dnlayer(struct cflayer *layr, struct cflayer *up, u8 phyid); -struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 linkid); - -#endif /* CFMUXL_H_ */ diff --git a/include/net/caif/cfpkt.h b/include/net/caif/cfpkt.h deleted file mode 100644 index acf664227d96..000000000000 --- a/include/net/caif/cfpkt.h +++ /dev/null @@ -1,232 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CFPKT_H_ -#define CFPKT_H_ -#include <net/caif/caif_layer.h> -#include <linux/types.h> -struct cfpkt; - -/* Create a CAIF packet. - * len: Length of packet to be created - * @return New packet. - */ -struct cfpkt *cfpkt_create(u16 len); - -/* - * Destroy a CAIF Packet. - * pkt Packet to be destroyed. - */ -void cfpkt_destroy(struct cfpkt *pkt); - -/* - * Extract header from packet. - * - * pkt Packet to extract header data from. - * data Pointer to copy the header data into. - * len Length of head data to copy. - * @return zero on success and error code upon failure - */ -int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len); - -static inline u8 cfpkt_extr_head_u8(struct cfpkt *pkt) -{ - u8 tmp; - - cfpkt_extr_head(pkt, &tmp, 1); - - return tmp; -} - -static inline u16 cfpkt_extr_head_u16(struct cfpkt *pkt) -{ - __le16 tmp; - - cfpkt_extr_head(pkt, &tmp, 2); - - return le16_to_cpu(tmp); -} - -static inline u32 cfpkt_extr_head_u32(struct cfpkt *pkt) -{ - __le32 tmp; - - cfpkt_extr_head(pkt, &tmp, 4); - - return le32_to_cpu(tmp); -} - -/* - * Peek header from packet. - * Reads data from packet without changing packet. - * - * pkt Packet to extract header data from. - * data Pointer to copy the header data into. - * len Length of head data to copy. - * @return zero on success and error code upon failure - */ -int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len); - -/* - * Extract header from trailer (end of packet). - * - * pkt Packet to extract header data from. - * data Pointer to copy the trailer data into. - * len Length of header data to copy. - * @return zero on success and error code upon failure - */ -int cfpkt_extr_trail(struct cfpkt *pkt, void *data, u16 len); - -/* - * Add header to packet. - * - * - * pkt Packet to add header data to. - * data Pointer to data to copy into the header. - * len Length of header data to copy. - * @return zero on success and error code upon failure - */ -int cfpkt_add_head(struct cfpkt *pkt, const void *data, u16 len); - -/* - * Add trailer to packet. - * - * - * pkt Packet to add trailer data to. - * data Pointer to data to copy into the trailer. - * len Length of trailer data to copy. - * @return zero on success and error code upon failure - */ -int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len); - -/* - * Pad trailer on packet. - * Moves data pointer in packet, no content copied. - * - * pkt Packet in which to pad trailer. - * len Length of padding to add. - * @return zero on success and error code upon failure - */ -int cfpkt_pad_trail(struct cfpkt *pkt, u16 len); - -/* - * Add a single byte to packet body (tail). - * - * pkt Packet in which to add byte. - * data Byte to add. - * @return zero on success and error code upon failure - */ -int cfpkt_addbdy(struct cfpkt *pkt, const u8 data); - -/* - * Add a data to packet body (tail). - * - * pkt Packet in which to add data. - * data Pointer to data to copy into the packet body. - * len Length of data to add. - * @return zero on success and error code upon failure - */ -int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len); - -/* - * Checks whether there are more data to process in packet. - * pkt Packet to check. - * @return true if more data are available in packet false otherwise - */ -bool cfpkt_more(struct cfpkt *pkt); - -/* - * Checks whether the packet is erroneous, - * i.e. if it has been attempted to extract more data than available in packet - * or writing more data than has been allocated in cfpkt_create(). - * pkt Packet to check. - * @return true on error false otherwise - */ -bool cfpkt_erroneous(struct cfpkt *pkt); - -/* - * Get the packet length. - * pkt Packet to get length from. - * @return Number of bytes in packet. - */ -u16 cfpkt_getlen(struct cfpkt *pkt); - -/* - * Set the packet length, by adjusting the trailer pointer according to length. - * pkt Packet to set length. - * len Packet length. - * @return Number of bytes in packet. - */ -int cfpkt_setlen(struct cfpkt *pkt, u16 len); - -/* - * cfpkt_append - Appends a packet's data to another packet. - * dstpkt: Packet to append data into, WILL BE FREED BY THIS FUNCTION - * addpkt: Packet to be appended and automatically released, - * WILL BE FREED BY THIS FUNCTION. - * expectlen: Packet's expected total length. This should be considered - * as a hint. - * NB: Input packets will be destroyed after appending and cannot be used - * after calling this function. - * @return The new appended packet. - */ -struct cfpkt *cfpkt_append(struct cfpkt *dstpkt, struct cfpkt *addpkt, - u16 expectlen); - -/* - * cfpkt_split - Split a packet into two packets at the specified split point. - * pkt: Packet to be split (will contain the first part of the data on exit) - * pos: Position to split packet in two parts. - * @return The new packet, containing the second part of the data. - */ -struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos); - -/* - * Iteration function, iterates the packet buffers from start to end. - * - * Checksum iteration function used to iterate buffers - * (we may have packets consisting of a chain of buffers) - * pkt: Packet to calculate checksum for - * iter_func: Function pointer to iteration function - * chks: Checksum calculated so far. - * buf: Pointer to the buffer to checksum - * len: Length of buf. - * data: Initial checksum value. - * @return Checksum of buffer. - */ - -int cfpkt_iterate(struct cfpkt *pkt, - u16 (*iter_func)(u16 chks, void *buf, u16 len), - u16 data); - -/* Map from a "native" packet (e.g. Linux Socket Buffer) to a CAIF packet. - * dir - Direction indicating whether this packet is to be sent or received. - * nativepkt - The native packet to be transformed to a CAIF packet - * @return The mapped CAIF Packet CFPKT. - */ -struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt); - -/* Map from a CAIF packet to a "native" packet (e.g. Linux Socket Buffer). - * pkt - The CAIF packet to be transformed into a "native" packet. - * @return The native packet transformed from a CAIF packet. - */ -void *cfpkt_tonative(struct cfpkt *pkt); - -/* - * Returns packet information for a packet. - * pkt Packet to get info from; - * @return Packet information - */ -struct caif_payload_info *cfpkt_info(struct cfpkt *pkt); - -/** cfpkt_set_prio - set priority for a CAIF packet. - * - * @pkt: The CAIF packet to be adjusted. - * @prio: one of TC_PRIO_ constants. - */ -void cfpkt_set_prio(struct cfpkt *pkt, int prio); - -#endif /* CFPKT_H_ */ diff --git a/include/net/caif/cfserl.h b/include/net/caif/cfserl.h deleted file mode 100644 index 67cce8757175..000000000000 --- a/include/net/caif/cfserl.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CFSERL_H_ -#define CFSERL_H_ -#include <net/caif/caif_layer.h> - -struct cflayer *cfserl_create(int instance, bool use_stx); -void cfserl_release(struct cflayer *layer); -#endif diff --git a/include/net/caif/cfsrvl.h b/include/net/caif/cfsrvl.h deleted file mode 100644 index a000dc45f966..000000000000 --- a/include/net/caif/cfsrvl.h +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Sjur Brendeland - */ - -#ifndef CFSRVL_H_ -#define CFSRVL_H_ -#include <linux/list.h> -#include <linux/stddef.h> -#include <linux/types.h> -#include <linux/kref.h> -#include <linux/rculist.h> - -struct cfsrvl { - struct cflayer layer; - bool open; - bool phy_flow_on; - bool modem_flow_on; - bool supports_flowctrl; - void (*release)(struct cflayer *layer); - struct dev_info dev_info; - void (*hold)(struct cflayer *lyr); - void (*put)(struct cflayer *lyr); - struct rcu_head rcu; -}; - -struct cflayer *cfvei_create(u8 linkid, struct dev_info *dev_info); -struct cflayer *cfdgml_create(u8 linkid, struct dev_info *dev_info); -struct cflayer *cfutill_create(u8 linkid, struct dev_info *dev_info); -struct cflayer *cfvidl_create(u8 linkid, struct dev_info *dev_info); -struct cflayer *cfrfml_create(u8 linkid, struct dev_info *dev_info, - int mtu_size); -struct cflayer *cfdbgl_create(u8 linkid, struct dev_info *dev_info); - -bool cfsrvl_phyid_match(struct cflayer *layer, int phyid); - -void cfsrvl_init(struct cfsrvl *service, - u8 channel_id, - struct dev_info *dev_info, - bool supports_flowctrl); -bool cfsrvl_ready(struct cfsrvl *service, int *err); - -static inline void cfsrvl_get(struct cflayer *layr) -{ - struct cfsrvl *s = container_of(layr, struct cfsrvl, layer); - if (layr == NULL || layr->up == NULL || s->hold == NULL) - return; - - s->hold(layr->up); -} - -static inline void cfsrvl_put(struct cflayer *layr) -{ - struct cfsrvl *s = container_of(layr, struct cfsrvl, layer); - if (layr == NULL || layr->up == NULL || s->hold == NULL) - return; - - s->put(layr->up); -} -#endif /* CFSRVL_H_ */ diff --git a/include/net/can.h b/include/net/can.h new file mode 100644 index 000000000000..6db9e826f0e0 --- /dev/null +++ b/include/net/can.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * net/can.h + * + * Definitions for the CAN network socket buffer extensions + * + * Copyright (C) 2026 Oliver Hartkopp <socketcan@hartkopp.net> + * + */ + +#ifndef _NET_CAN_H +#define _NET_CAN_H + +/** + * struct can_skb_ext - skb extensions for CAN specific content + * @can_iif: ifindex of the first interface the CAN frame appeared on + * @can_framelen: cached echo CAN frame length for bql + * @can_gw_hops: can-gw CAN frame time-to-live counter + * @can_ext_flags: CAN skb extensions flags + */ +struct can_skb_ext { + int can_iif; + u16 can_framelen; + u8 can_gw_hops; + u8 can_ext_flags; +}; + +#endif /* _NET_CAN_H */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 363d7dd2255a..9d3639ff9c28 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7,7 +7,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/ethtool.h> @@ -101,16 +101,6 @@ struct wiphy; * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted * on this channel. * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel. - * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted - * on this channel. - * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted - * on this channel. * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, * this flag indicates that a 320 MHz channel cannot use this * channel as the control or any of the secondary channels. @@ -127,6 +117,16 @@ struct wiphy; * even if it is otherwise disabled. * @IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP: Allow using this channel for AP operation * with very low power (VLP), even if otherwise set to NO_IR. + * @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel, + * even if otherwise set to NO_IR. + * @IEEE80211_CHAN_S1G_NO_PRIMARY: Prevents the channel for use as an S1G + * primary channel. Does not prevent the wider operating channel + * described by the chandef from being used. In order for a 2MHz primary + * to be used, both 1MHz subchannels shall not contain this flag. + * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel. + * @IEEE80211_CHAN_NO_UHR: UHR operation is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = BIT(0), @@ -143,11 +143,8 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_20MHZ = BIT(11), IEEE80211_CHAN_NO_10MHZ = BIT(12), IEEE80211_CHAN_NO_HE = BIT(13), - IEEE80211_CHAN_1MHZ = BIT(14), - IEEE80211_CHAN_2MHZ = BIT(15), - IEEE80211_CHAN_4MHZ = BIT(16), - IEEE80211_CHAN_8MHZ = BIT(17), - IEEE80211_CHAN_16MHZ = BIT(18), + /* can use free bits here */ + IEEE80211_CHAN_NO_UHR = BIT(18), IEEE80211_CHAN_NO_320MHZ = BIT(19), IEEE80211_CHAN_NO_EHT = BIT(20), IEEE80211_CHAN_DFS_CONCURRENT = BIT(21), @@ -155,6 +152,11 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT = BIT(23), IEEE80211_CHAN_CAN_MONITOR = BIT(24), IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25), + IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26), + IEEE80211_CHAN_S1G_NO_PRIMARY = BIT(27), + IEEE80211_CHAN_NO_4MHZ = BIT(28), + IEEE80211_CHAN_NO_8MHZ = BIT(29), + IEEE80211_CHAN_NO_16MHZ = BIT(30), }; #define IEEE80211_CHAN_NO_HT40 \ @@ -188,6 +190,8 @@ enum ieee80211_channel_flags { * on this channel. * @dfs_state_entered: timestamp (jiffies) when the dfs state was entered. * @dfs_cac_ms: DFS CAC time in milliseconds, this is valid for DFS channels. + * @cac_start_time: timestamp (CLOCK_BOOTTIME, nanoseconds) when CAC was + * started on this channel. Zero when CAC is not in progress. * @psd: power spectral density (in dBm) */ struct ieee80211_channel { @@ -205,6 +209,7 @@ struct ieee80211_channel { enum nl80211_dfs_state dfs_state; unsigned long dfs_state_entered; unsigned int dfs_cac_ms; + u64 cac_start_time; s8 psd; }; @@ -429,6 +434,18 @@ struct ieee80211_sta_eht_cap { u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN]; }; +/** + * struct ieee80211_sta_uhr_cap - STA's UHR capabilities + * @has_uhr: true iff UHR is supported and data is valid + * @mac: fixed MAC capabilities + * @phy: fixed PHY capabilities + */ +struct ieee80211_sta_uhr_cap { + bool has_uhr; + struct ieee80211_uhr_cap_mac mac; + struct ieee80211_uhr_cap_phy phy; +}; + /* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */ #ifdef __CHECKER__ /* @@ -454,6 +471,7 @@ struct ieee80211_sta_eht_cap { * @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a * 6 GHz band channel (and 0 may be valid value). * @eht_cap: STA's EHT capabilities + * @uhr_cap: STA's UHR capabilities * @vendor_elems: vendor element(s) to advertise * @vendor_elems.data: vendor element(s) data * @vendor_elems.len: vendor element(s) length @@ -463,6 +481,7 @@ struct ieee80211_sband_iftype_data { struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; struct ieee80211_sta_eht_cap eht_cap; + struct ieee80211_sta_uhr_cap uhr_cap; struct { const u8 *data; unsigned int len; @@ -557,7 +576,7 @@ struct ieee80211_sta_s1g_cap { * @vht_cap: VHT capabilities in this band * @s1g_cap: S1G capabilities in this band * @edmg_cap: EDMG capabilities in this band - * @s1g_cap: S1G capabilities in this band (S1B band only, of course) + * @s1g_cap: S1G capabilities in this band (S1G band only, of course) * @n_iftype_data: number of iftype data entries * @iftype_data: interface type data entries. Note that the bits in * @types_mask inside this structure cannot overlap (i.e. only @@ -630,7 +649,7 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, const struct ieee80211_sband_iftype_data *data; int i; - if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) + if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return NULL; if (iftype == NL80211_IFTYPE_AP_VLAN) @@ -685,7 +704,7 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, } /** - * ieee80211_get_eht_iftype_cap - return ETH capabilities for an sband's iftype + * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype * @sband: the sband to search for the iftype on * @iftype: enum nl80211_iftype * @@ -705,6 +724,26 @@ ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband, } /** + * ieee80211_get_uhr_iftype_cap - return UHR capabilities for an sband's iftype + * @sband: the sband to search for the iftype on + * @iftype: enum nl80211_iftype + * + * Return: pointer to the struct ieee80211_sta_uhr_cap, or NULL is none found + */ +static inline const struct ieee80211_sta_uhr_cap * +ieee80211_get_uhr_iftype_cap(const struct ieee80211_supported_band *sband, + enum nl80211_iftype iftype) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, iftype); + + if (data && data->uhr_cap.has_uhr) + return &data->uhr_cap; + + return NULL; +} + +/** * wiphy_read_of_freq_limits - read frequency limits from device tree * * @wiphy: the wireless device to get extra limits for @@ -786,8 +825,7 @@ struct vif_params { * @key: key material * @key_len: length of key material * @cipher: cipher suite selector - * @seq: sequence counter (IV/PN) for TKIP and CCMP keys, only used - * with the get_key() callback, must be in little endian, + * @seq: sequence counter (IV/PN), must be in little endian, * length given by @seq_len. * @seq_len: length of @seq. * @vlan_id: vlan_id for VLAN group key (if nonzero) @@ -818,6 +856,9 @@ struct key_params { * @punctured: mask of the punctured 20 MHz subchannels, with * bits turned on being disabled (punctured); numbered * from lower to higher frequency (like in the spec) + * @s1g_primary_2mhz: Indicates if the control channel pointed to + * by 'chan' exists as a 1MHz primary subchannel within an + * S1G 2MHz primary channel. */ struct cfg80211_chan_def { struct ieee80211_channel *chan; @@ -827,6 +868,7 @@ struct cfg80211_chan_def { struct ieee80211_edmg edmg; u16 freq1_offset; u16 punctured; + bool s1g_primary_2mhz; }; /* @@ -838,9 +880,12 @@ struct cfg80211_bitrate_mask { u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; u16 he_mcs[NL80211_HE_NSS_MAX]; + u16 eht_mcs[NL80211_EHT_NSS_MAX]; enum nl80211_txrate_gi gi; enum nl80211_he_gi he_gi; + enum nl80211_eht_gi eht_gi; enum nl80211_he_ltf he_ltf; + enum nl80211_eht_ltf eht_ltf; } control[NUM_NL80211_BANDS]; }; @@ -968,7 +1013,8 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1, chandef1->center_freq1 == chandef2->center_freq1 && chandef1->freq1_offset == chandef2->freq1_offset && chandef1->center_freq2 == chandef2->center_freq2 && - chandef1->punctured == chandef2->punctured); + chandef1->punctured == chandef2->punctured && + chandef1->s1g_primary_2mhz == chandef2->s1g_primary_2mhz); } /** @@ -985,6 +1031,18 @@ cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef) } /** + * cfg80211_chandef_is_s1g - check if chandef represents an S1G channel + * @chandef: the channel definition + * + * Return: %true if S1G. + */ +static inline bool +cfg80211_chandef_is_s1g(const struct cfg80211_chan_def *chandef) +{ + return chandef->chan->band == NL80211_BAND_S1GHZ; +} + +/** * cfg80211_chandef_compatible - check if two channel definitions are compatible * @chandef1: first channel definition * @chandef2: second channel definition @@ -996,6 +1054,7 @@ const struct cfg80211_chan_def * cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1, const struct cfg80211_chan_def *chandef2); + /** * nl80211_chan_width_to_mhz - get the channel width in MHz * @chan_width: the channel width from &enum nl80211_chan_width @@ -1006,6 +1065,17 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1, int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width); /** + * cfg80211_chandef_get_width - return chandef width in MHz + * @c: chandef to return bandwidth for + * Return: channel width in MHz for the given chandef; note that it returns + * 80 for 80+80 configurations + */ +static inline int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c) +{ + return nl80211_chan_width_to_mhz(c->width); +} + +/** * cfg80211_chandef_valid - check if a channel definition is valid * @chandef: the channel definition to check * Return: %true if the channel definition is valid. %false otherwise. @@ -1083,43 +1153,6 @@ int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef, int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef); /** - * ieee80211_chanwidth_rate_flags - return rate flags for channel width - * @width: the channel width of the channel - * - * In some channel types, not all rates may be used - for example CCK - * rates may not be used in 5/10 MHz channels. - * - * Returns: rate flags which apply for this channel width - */ -static inline enum ieee80211_rate_flags -ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width) -{ - switch (width) { - case NL80211_CHAN_WIDTH_5: - return IEEE80211_RATE_SUPPORTS_5MHZ; - case NL80211_CHAN_WIDTH_10: - return IEEE80211_RATE_SUPPORTS_10MHZ; - default: - break; - } - return 0; -} - -/** - * ieee80211_chandef_rate_flags - returns rate flags for a channel - * @chandef: channel definition for the channel - * - * See ieee80211_chanwidth_rate_flags(). - * - * Returns: rate flags which apply for this channel - */ -static inline enum ieee80211_rate_flags -ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef) -{ - return ieee80211_chanwidth_rate_flags(chandef->width); -} - -/** * ieee80211_chandef_max_power - maximum transmission power for the chandef * * In some regulations, the transmit power may depend on the configured channel @@ -1286,11 +1319,13 @@ struct cfg80211_crypto_settings { * struct cfg80211_mbssid_config - AP settings for multi bssid * * @tx_wdev: pointer to the transmitted interface in the MBSSID set + * @tx_link_id: link ID of the transmitted profile in an MLD. * @index: index of this AP in the multi bssid group. * @ema: set to true if the beacons should be sent out in EMA mode. */ struct cfg80211_mbssid_config { struct wireless_dev *tx_wdev; + u8 tx_link_id; u8 index; bool ema; }; @@ -1445,6 +1480,23 @@ struct cfg80211_unsol_bcast_probe_resp { }; /** + * struct cfg80211_s1g_short_beacon - S1G short beacon data. + * + * @update: Set to true if the feature configuration should be updated. + * @short_head: Short beacon head. + * @short_tail: Short beacon tail. + * @short_head_len: Short beacon head len. + * @short_tail_len: Short beacon tail len. + */ +struct cfg80211_s1g_short_beacon { + bool update; + const u8 *short_head; + const u8 *short_tail; + size_t short_head_len; + size_t short_tail_len; +}; + +/** * struct cfg80211_ap_settings - AP configuration * * Used to configure an AP interface. @@ -1473,6 +1525,7 @@ struct cfg80211_unsol_bcast_probe_resp { * @he_cap: HE capabilities (or %NULL if HE isn't enabled) * @eht_cap: EHT capabilities (or %NULL if EHT isn't enabled) * @eht_oper: EHT operation IE (or %NULL if EHT isn't enabled) + * @uhr_oper: UHR operation (or %NULL if UHR isn't enabled) * @ht_required: stations must support HT * @vht_required: stations must support VHT * @twt_responder: Enable Target Wait Time @@ -1484,6 +1537,8 @@ struct cfg80211_unsol_bcast_probe_resp { * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @mbssid_config: AP settings for multiple bssid + * @s1g_long_beacon_period: S1G long beacon period + * @s1g_short_beacon: S1G short beacon data */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1510,6 +1565,7 @@ struct cfg80211_ap_settings { const struct ieee80211_he_operation *he_oper; const struct ieee80211_eht_cap_elem *eht_cap; const struct ieee80211_eht_operation *eht_oper; + const struct ieee80211_uhr_operation *uhr_oper; bool ht_required, vht_required, he_required, sae_h2e_required; bool twt_responder; u32 flags; @@ -1517,6 +1573,8 @@ struct cfg80211_ap_settings { struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; struct cfg80211_mbssid_config mbssid_config; + u8 s1g_long_beacon_period; + struct cfg80211_s1g_short_beacon s1g_short_beacon; }; @@ -1528,11 +1586,13 @@ struct cfg80211_ap_settings { * @beacon: beacon data * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters + * @s1g_short_beacon: S1G short beacon data */ struct cfg80211_ap_update { struct cfg80211_beacon_data beacon; struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; + struct cfg80211_s1g_short_beacon s1g_short_beacon; }; /** @@ -1547,6 +1607,7 @@ struct cfg80211_ap_update { * @n_counter_offsets_beacon: number of csa counters the beacon (tail) * @n_counter_offsets_presp: number of csa counters in the probe response * @beacon_after: beacon data to be used on the new channel + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @radar_required: whether radar detection is required on the new channel * @block_tx: whether transmissions should be blocked while changing * @count: number of beacons until switch @@ -1561,6 +1622,7 @@ struct cfg80211_csa_settings { unsigned int n_counter_offsets_beacon; unsigned int n_counter_offsets_presp; struct cfg80211_beacon_data beacon_after; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; bool radar_required; bool block_tx; u8 count; @@ -1576,6 +1638,7 @@ struct cfg80211_csa_settings { * @counter_offset_beacon: offsets of the counters within the beacon (tail) * @counter_offset_presp: offsets of the counters within the probe response * @beacon_next: beacon data to be used after the color change + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @count: number of beacons until the color change * @color: the color used after the change * @link_id: defines the link on which color change is expected during MLO. @@ -1586,6 +1649,7 @@ struct cfg80211_color_change_settings { u16 counter_offset_beacon; u16 counter_offset_presp; struct cfg80211_beacon_data beacon_next; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; u8 count; u8 color; u8 link_id; @@ -1674,6 +1738,9 @@ struct sta_txpwr { * @he_6ghz_capa: HE 6 GHz Band capabilities of station * @eht_capa: EHT capabilities of station * @eht_capa_len: the length of the EHT capabilities + * @s1g_capa: S1G capabilities of station + * @uhr_capa: UHR capabilities of the station + * @uhr_capa_len: the length of the UHR capabilities */ struct link_station_parameters { const u8 *mld_mac; @@ -1692,6 +1759,9 @@ struct link_station_parameters { const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const struct ieee80211_eht_cap_elem *eht_capa; u8 eht_capa_len; + const struct ieee80211_s1g_cap *s1g_capa; + const struct ieee80211_uhr_cap *uhr_capa; + u8 uhr_capa_len; }; /** @@ -1756,7 +1826,12 @@ struct cfg80211_ttlm_params { * @supported_oper_classes_len: number of supported operating classes * @support_p2p_ps: information if station supports P2P PS mechanism * @airtime_weight: airtime scheduler weight for this station + * @eml_cap_present: Specifies if EML capabilities field (@eml_cap) is + * present/updated + * @eml_cap: EML capabilities of this station * @link_sta_params: link related params. + * @epp_peer: EPP peer indication + * @nmi_mac: MAC address of the NMI station of the NAN peer */ struct station_parameters { struct net_device *vlan; @@ -1780,7 +1855,11 @@ struct station_parameters { u8 supported_oper_classes_len; int support_p2p_ps; u16 airtime_weight; + bool eml_cap_present; + u16 eml_cap; struct link_station_parameters link_sta_params; + bool epp_peer; + const u8 *nmi_mac; }; /** @@ -1820,6 +1899,8 @@ struct station_del_parameters { * entry that is operating, has been marked authorized by userspace) * @CFG80211_STA_MESH_PEER_KERNEL: peer on mesh interface (kernel managed) * @CFG80211_STA_MESH_PEER_USER: peer on mesh interface (user managed) + * @CFG80211_STA_NAN_MGMT: NAN management interface station + * @CFG80211_STA_NAN_DATA: NAN data path station */ enum cfg80211_station_type { CFG80211_STA_AP_CLIENT, @@ -1831,6 +1912,8 @@ enum cfg80211_station_type { CFG80211_STA_TDLS_PEER_ACTIVE, CFG80211_STA_MESH_PEER_KERNEL, CFG80211_STA_MESH_PEER_USER, + CFG80211_STA_NAN_MGMT, + CFG80211_STA_NAN_DATA, }; /** @@ -1866,6 +1949,11 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information * @RATE_INFO_FLAGS_S1G_MCS: MCS field filled with S1G MCS + * @RATE_INFO_FLAGS_UHR_MCS: UHR MCS information + * @RATE_INFO_FLAGS_UHR_ELR_MCS: UHR ELR MCS was used + * (set together with @RATE_INFO_FLAGS_UHR_MCS) + * @RATE_INFO_FLAGS_UHR_IM: UHR Interference Mitigation + * was used */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), @@ -1877,6 +1965,9 @@ enum rate_info_flags { RATE_INFO_FLAGS_EXTENDED_SC_DMG = BIT(6), RATE_INFO_FLAGS_EHT_MCS = BIT(7), RATE_INFO_FLAGS_S1G_MCS = BIT(8), + RATE_INFO_FLAGS_UHR_MCS = BIT(9), + RATE_INFO_FLAGS_UHR_ELR_MCS = BIT(10), + RATE_INFO_FLAGS_UHR_IM = BIT(11), }; /** @@ -1892,7 +1983,7 @@ enum rate_info_flags { * @RATE_INFO_BW_160: 160 MHz bandwidth * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation * @RATE_INFO_BW_320: 320 MHz bandwidth - * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation + * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT/UHR RU allocation * @RATE_INFO_BW_1: 1 MHz bandwidth * @RATE_INFO_BW_2: 2 MHz bandwidth * @RATE_INFO_BW_4: 4 MHz bandwidth @@ -1923,7 +2014,7 @@ enum rate_info_bw { * * @flags: bitflag of flags from &enum rate_info_flags * @legacy: bitrate in 100kbit/s for 802.11abg - * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G rate + * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G/UHR rate * @nss: number of streams (VHT & HE only) * @bw: bandwidth (from &enum rate_info_bw) * @he_gi: HE guard interval (from &enum nl80211_he_gi) @@ -2034,6 +2125,99 @@ struct cfg80211_tid_stats { #define IEEE80211_MAX_CHAINS 4 /** + * struct link_station_info - link station information + * + * Link station information filled by driver for get_station() and + * dump_station(). + * @filled: bit flag of flags using the bits of &enum nl80211_sta_info to + * indicate the relevant values in this struct for them + * @connected_time: time(in secs) since a link of station is last connected + * @inactive_time: time since last activity for link station(tx/rx) + * in milliseconds + * @assoc_at: bootime (ns) of the last association of link of station + * @rx_bytes: bytes (size of MPDUs) received from this link of station + * @tx_bytes: bytes (size of MPDUs) transmitted to this link of station + * @signal: The signal strength, type depends on the wiphy's signal_type. + * For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_. + * @signal_avg: Average signal strength, type depends on the wiphy's + * signal_type. For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_ + * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg + * @chain_signal: per-chain signal strength of last received packet in dBm + * @chain_signal_avg: per-chain signal strength average in dBm + * @txrate: current unicast bitrate from this link of station + * @rxrate: current unicast bitrate to this link of station + * @rx_packets: packets (MSDUs & MMPDUs) received from this link of station + * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this link of station + * @tx_retries: cumulative retry counts (MPDUs) for this link of station + * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK) + * @rx_dropped_misc: Dropped for un-specified reason. + * @bss_param: current BSS parameters + * @beacon_loss_count: Number of times beacon loss event has triggered. + * @expected_throughput: expected throughput in kbps (including 802.11 headers) + * towards this station. + * @rx_beacon: number of beacons received from this peer + * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received + * from this peer + * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer + * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer + * @airtime_weight: current airtime scheduling weight + * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last + * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. + * Note that this doesn't use the @filled bit, but is used if non-NULL. + * @ack_signal: signal strength (in dBm) of the last ACK frame. + * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has + * been sent. + * @rx_mpdu_count: number of MPDUs received from this station + * @fcs_err_count: number of packets (MPDUs) received from this station with + * an FCS error. This counter should be incremented only when TA of the + * received packet with an FCS error matches the peer MAC address. + * @addr: For MLO STA connection, filled with address of the link of station. + */ +struct link_station_info { + u64 filled; + u32 connected_time; + u32 inactive_time; + u64 assoc_at; + u64 rx_bytes; + u64 tx_bytes; + s8 signal; + s8 signal_avg; + + u8 chains; + s8 chain_signal[IEEE80211_MAX_CHAINS]; + s8 chain_signal_avg[IEEE80211_MAX_CHAINS]; + + struct rate_info txrate; + struct rate_info rxrate; + u32 rx_packets; + u32 tx_packets; + u32 tx_retries; + u32 tx_failed; + u32 rx_dropped_misc; + struct sta_bss_parameters bss_param; + + u32 beacon_loss_count; + + u32 expected_throughput; + + u64 tx_duration; + u64 rx_duration; + u64 rx_beacon; + u8 rx_beacon_signal_avg; + + u16 airtime_weight; + + s8 ack_signal; + s8 avg_ack_signal; + struct cfg80211_tid_stats *pertid; + + u32 rx_mpdu_count; + u32 fcs_err_count; + + u8 addr[ETH_ALEN] __aligned(2); +}; + +/** * struct station_info - station information * * Station information filled by driver for get_station() and dump_station. @@ -2045,9 +2229,6 @@ struct cfg80211_tid_stats { * @assoc_at: bootime (ns) of the last association * @rx_bytes: bytes (size of MPDUs) received from this station * @tx_bytes: bytes (size of MPDUs) transmitted to this station - * @llid: mesh local link id - * @plid: mesh peer link id - * @plink_state: mesh peer link state * @signal: The signal strength, type depends on the wiphy's signal_type. * For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_. * @signal_avg: Average signal strength, type depends on the wiphy's signal_type. @@ -2067,14 +2248,20 @@ struct cfg80211_tid_stats { * This number should increase every time the list of stations * changes, i.e. when a station is added or removed, so that * userspace can tell whether it got a consistent snapshot. + * @beacon_loss_count: Number of times beacon loss event has triggered. * @assoc_req_ies: IEs from (Re)Association Request. * This is used only when in AP mode with drivers that do not use * user space MLME/SME implementation. The information is provided for * the cfg80211_new_sta() calls to notify user space of the IEs. * @assoc_req_ies_len: Length of assoc_req_ies buffer in octets. * @sta_flags: station flags mask & values - * @beacon_loss_count: Number of times beacon loss event has triggered. * @t_offset: Time offset of the station relative to this host. + * @llid: mesh local link id + * @plid: mesh peer link id + * @plink_state: mesh peer link state + * @connected_to_gate: true if mesh STA has a path to mesh gate + * @connected_to_as: true if mesh STA has a path to authentication server + * @airtime_link_metric: mesh airtime link metric. * @local_pm: local mesh STA power save mode * @peer_pm: peer mesh STA power save mode * @nonpeer_pm: non-peer mesh STA power save mode @@ -2083,7 +2270,6 @@ struct cfg80211_tid_stats { * @rx_beacon: number of beacons received from this peer * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received * from this peer - * @connected_to_gate: true if mesh STA has a path to mesh gate * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer * @airtime_weight: current airtime scheduling weight @@ -2097,8 +2283,6 @@ struct cfg80211_tid_stats { * @fcs_err_count: number of packets (MPDUs) received from this station with * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. - * @airtime_link_metric: mesh airtime link metric. - * @connected_to_as: true if mesh STA has a path to authentication server * @mlo_params_valid: Indicates @assoc_link_id and @mld_addr fields are filled * by driver. Drivers use this only in cfg80211_new_sta() calls when AP * MLD's MLME/SME is offload to driver. Drivers won't fill this @@ -2117,6 +2301,11 @@ struct cfg80211_tid_stats { * dump_station() callbacks. User space needs this information to determine * the accepted and rejected affiliated links of the connected station. * @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets. + * @valid_links: bitmap of valid links, or 0 for non-MLO. Drivers fill this + * information in cfg80211_new_sta(), cfg80211_del_sta_sinfo(), + * get_station() and dump_station() callbacks. + * @links: reference to Link sta entries for MLO STA, all link specific + * information is accessed through links[link_id]. */ struct station_info { u64 filled; @@ -2125,9 +2314,6 @@ struct station_info { u64 assoc_at; u64 rx_bytes; u64 tx_bytes; - u16 llid; - u16 plid; - u8 plink_state; s8 signal; s8 signal_avg; @@ -2147,41 +2333,46 @@ struct station_info { int generation; + u32 beacon_loss_count; + const u8 *assoc_req_ies; size_t assoc_req_ies_len; - u32 beacon_loss_count; s64 t_offset; + u16 llid; + u16 plid; + u8 plink_state; + u8 connected_to_gate; + u8 connected_to_as; + u32 airtime_link_metric; enum nl80211_mesh_power_mode local_pm; enum nl80211_mesh_power_mode peer_pm; enum nl80211_mesh_power_mode nonpeer_pm; u32 expected_throughput; - u64 tx_duration; - u64 rx_duration; - u64 rx_beacon; - u8 rx_beacon_signal_avg; - u8 connected_to_gate; + u16 airtime_weight; - struct cfg80211_tid_stats *pertid; s8 ack_signal; s8 avg_ack_signal; + struct cfg80211_tid_stats *pertid; - u16 airtime_weight; + u64 tx_duration; + u64 rx_duration; + u64 rx_beacon; + u8 rx_beacon_signal_avg; u32 rx_mpdu_count; u32 fcs_err_count; - u32 airtime_link_metric; - - u8 connected_to_as; - bool mlo_params_valid; u8 assoc_link_id; u8 mld_addr[ETH_ALEN] __aligned(2); const u8 *assoc_resp_ies; size_t assoc_resp_ies_len; + + u16 valid_links; + struct link_station_info *links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** @@ -2265,7 +2456,7 @@ static inline int cfg80211_get_station(struct net_device *dev, * @MONITOR_FLAG_PLCPFAIL: pass frames with bad PLCP * @MONITOR_FLAG_CONTROL: pass control frames * @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering - * @MONITOR_FLAG_COOK_FRAMES: report frames after processing + * @MONITOR_FLAG_COOK_FRAMES: deprecated, will unconditionally be refused * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address * @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames */ @@ -2344,6 +2535,29 @@ struct mpath_info { }; /** + * enum wiphy_bss_param_flags - bit positions for supported bss parameters. + * + * @WIPHY_BSS_PARAM_CTS_PROT: support changing CTS protection. + * @WIPHY_BSS_PARAM_SHORT_PREAMBLE: support changing short preamble usage. + * @WIPHY_BSS_PARAM_SHORT_SLOT_TIME: support changing short slot time usage. + * @WIPHY_BSS_PARAM_BASIC_RATES: support reconfiguring basic rates. + * @WIPHY_BSS_PARAM_AP_ISOLATE: support changing AP isolation. + * @WIPHY_BSS_PARAM_HT_OPMODE: support changing HT operating mode. + * @WIPHY_BSS_PARAM_P2P_CTWINDOW: support reconfiguring ctwindow. + * @WIPHY_BSS_PARAM_P2P_OPPPS: support changing P2P opportunistic power-save. + */ +enum wiphy_bss_param_flags { + WIPHY_BSS_PARAM_CTS_PROT = BIT(0), + WIPHY_BSS_PARAM_SHORT_PREAMBLE = BIT(1), + WIPHY_BSS_PARAM_SHORT_SLOT_TIME = BIT(2), + WIPHY_BSS_PARAM_BASIC_RATES = BIT(3), + WIPHY_BSS_PARAM_AP_ISOLATE = BIT(4), + WIPHY_BSS_PARAM_HT_OPMODE = BIT(5), + WIPHY_BSS_PARAM_P2P_CTWINDOW = BIT(6), + WIPHY_BSS_PARAM_P2P_OPPPS = BIT(7), +}; + +/** * struct bss_parameters - BSS parameters * * Used to change BSS parameters (mainly for AP mode). @@ -2662,15 +2876,16 @@ struct cfg80211_scan_6ghz_params { * @wiphy: the wiphy this was for * @scan_start: time (in jiffies) when the scan started * @wdev: the wireless device to scan for - * @info: (internal) information about completed scan - * @notified: (internal) scan request was notified as done or aborted * @no_cck: used to send probe requests at non CCK rate in 2GHz band * @mac_addr: MAC address used with randomisation * @mac_addr_mask: MAC address mask used with randomisation, bits that * are 0 in the mask should be randomised, bits that are 1 should * be taken from the @mac_addr * @scan_6ghz: relevant for split scan request only, - * true if this is the second scan request + * true if this is a 6 GHz scan request + * @first_part: %true if this is the first part of a split scan request or a + * scan that was not split. May be %true for a @scan_6ghz scan if no other + * channels were requested * @n_6ghz_params: number of 6 GHz params * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) @@ -2694,20 +2909,17 @@ struct cfg80211_scan_request { u8 mac_addr[ETH_ALEN] __aligned(2); u8 mac_addr_mask[ETH_ALEN] __aligned(2); u8 bssid[ETH_ALEN] __aligned(2); - - /* internal */ struct wiphy *wiphy; unsigned long scan_start; - struct cfg80211_scan_info info; - bool notified; bool no_cck; bool scan_6ghz; + bool first_part; u32 n_6ghz_params; struct cfg80211_scan_6ghz_params *scan_6ghz_params; s8 tsf_report_link_id; /* keep last */ - struct ieee80211_channel *channels[] __counted_by(n_channels); + struct ieee80211_channel *channels[]; }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) @@ -2947,6 +3159,7 @@ struct cfg80211_bss_ies { * @nontrans_list: list of non-transmitted BSS, if this is a transmitted one * (multi-BSSID support) * @signal: signal strength value (type depends on the wiphy's signal_type) + * @ts_boottime: timestamp of the last BSS update in nanoseconds since boot * @chains: bitmask for filled values in @chain_signal. * @chain_signal: per-chain signal strength of last received BSS in dBm. * @bssid_index: index in the multiple BSS set @@ -2971,6 +3184,8 @@ struct cfg80211_bss { s32 signal; + u64 ts_boottime; + u16 beacon_interval; u16 capability; @@ -3067,8 +3282,6 @@ struct cfg80211_auth_request { * if this is %NULL for a link, that link is not requested * @elems: extra elements for the per-STA profile for this link * @elems_len: length of the elements - * @disabled: If set this link should be included during association etc. but it - * should not be used until enabled by the AP MLD. * @error: per-link error code, must be <= 0. If there is an error, then the * operation as a whole must fail. */ @@ -3076,11 +3289,23 @@ struct cfg80211_assoc_link { struct cfg80211_bss *bss; const u8 *elems; size_t elems_len; - bool disabled; int error; }; /** + * struct cfg80211_ml_reconf_req - MLO link reconfiguration request + * @add_links: data for links to add, see &struct cfg80211_assoc_link + * @rem_links: bitmap of links to remove + * @ext_mld_capa_ops: extended MLD capabilities and operations set by + * userspace for the ML reconfiguration action frame + */ +struct cfg80211_ml_reconf_req { + struct cfg80211_assoc_link add_links[IEEE80211_MLD_MAX_NUM_LINKS]; + u16 rem_links; + u16 ext_mld_capa_ops; +}; + +/** * enum cfg80211_assoc_req_flags - Over-ride default behaviour in association. * * @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n) @@ -3096,6 +3321,7 @@ struct cfg80211_assoc_link { * Drivers shall disable MLO features for the current association if this * flag is not set. * @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any) + * @ASSOC_REQ_DISABLE_UHR: Disable UHR */ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_HT = BIT(0), @@ -3106,6 +3332,7 @@ enum cfg80211_assoc_req_flags { ASSOC_REQ_DISABLE_EHT = BIT(5), CONNECT_REQ_MLO_SUPPORT = BIT(6), ASSOC_REQ_SPP_AMSDU = BIT(7), + ASSOC_REQ_DISABLE_UHR = BIT(8), }; /** @@ -3130,10 +3357,10 @@ enum cfg80211_assoc_req_flags { * included in the Current AP address field of the Reassociation Request * frame. * @flags: See &enum cfg80211_assoc_req_flags - * @supported_selectors: supported selectors in IEEE 802.11 format + * @supported_selectors: supported BSS selectors in IEEE 802.11 format * (or %NULL for no change). * If %NULL, then support for SAE_H2E should be assumed. - * @supported_selectors_len: Length of supported_selectors in octets. + * @supported_selectors_len: number of supported BSS selectors * @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask * will be used in ht_capa. Un-supported values will be ignored. * @ht_capa_mask: The bits of ht_capa which are to be used. @@ -3152,6 +3379,8 @@ enum cfg80211_assoc_req_flags { * the link on which the association request should be sent * @ap_mld_addr: AP MLD address in case of MLO association request, * valid iff @link_id >= 0 + * @ext_mld_capa_ops: extended MLD capabilities and operations set by + * userspace for the association */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -3172,6 +3401,7 @@ struct cfg80211_assoc_request { struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS]; const u8 *ap_mld_addr; s8 link_id; + u16 ext_mld_capa_ops; }; /** @@ -3757,6 +3987,109 @@ struct cfg80211_qos_map { }; /** + * DOC: Neighbor Awareness Networking (NAN) + * + * NAN uses two interface types: + * + * - %NL80211_IFTYPE_NAN: a non-netdev interface. This has two roles: (1) holds + * the configuration of all NAN activities (DE parameters, synchronisation + * parameters, local schedule, etc.), and (2) uses as the NAN Management + * Interface (NMI), which is used for NAN management communication. + * + * - %NL80211_IFTYPE_NAN_DATA: The NAN Data Interface (NDI), used for data + * communication with NAN peers. + * + * An NDI interface can only be started (IFF_UP) if the NMI one is running and + * NAN is started. Before NAN is stopped, all associated NDI interfaces + * must be stopped first. + * + * The local schedule specifies which channels the device is available on and + * when. Must be cancelled before NAN is stopped. + * + * NAN Stations + * ~~~~~~~~~~~~ + * + * There are two types of stations corresponding to the two interface types: + * + * - NMI station: Represents the NAN peer. Peer-specific data such as the peer's + * schedule and the HT, VHT and HE capabilities belongs to the NMI station. + * Also used for Tx/Rx of NAN management frames to/from the peer. + * Added on the %NL80211_IFTYPE_NAN interface. + * + * - NDI station: Used for Tx/Rx of data frames (and non-NAN management frames) + * for a specific NDP established with the NAN peer. Added on the + * %NL80211_IFTYPE_NAN_DATA interface. + * + * A peer may reuse its NMI address as the NDI address. In that case, two + * separate stations should be added even though they share the same MAC + * address. + * + * HT, VHT and HE capabilities should not changes after it was set. It is the + * driver's responsibility to check that. + * + * An NDI station can only be added if the corresponding NMI station has already + * been configured with HT (and possibly VHT and HE) capabilities. It is the + * driver's responsibility to check that. + * + * All NDI stations must be removed before corresponding NMI station is removed. + * Therefore, removing a NMI station implies that the associated NDI station(s) + * (if any) will be removed first. + * + * NAN Dependencies + * ~~~~~~~~~~~~~~~~ + * + * The following diagram shows the dependencies between NAN components. + * An arrow from A to B means A must be started/added before B, and B must be + * stopped/removed before A: + * + * +-------------+ + * | NMI iface |---(local schedule) + * +------+------+ + * / \ + * v v + * +-----------+ +-------------+ + * | NDI iface | | NMI sta |---(peer schedule) + * +-----+-----+ +------+------+ + * \ / + * v v + * +----------+ + * | NDI sta | + * +----------+ + */ + +/** + * struct cfg80211_nan_band_config - NAN band specific configuration + * + * @chan: Pointer to the IEEE 802.11 channel structure. The channel to be used + * for NAN operations on this band. For 2.4 GHz band, this is always + * channel 6. For 5 GHz band, the channel is either 44 or 149, according + * to the regulatory constraints. If chan pointer is NULL the entire band + * configuration entry is considered invalid and should not be used. + * @rssi_close: RSSI close threshold used for NAN state transition algorithm + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should + * be greater than -60 dBm. + * @rssi_middle: RSSI middle threshold used for NAN state transition algorithm. + * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State + * Transition" of Wi-Fi Aware Specification v4.0. If not + * specified (set to 0), default device value is used. The value should be + * greater than -75 dBm and less than rssi_close. + * @awake_dw_interval: Committed DW interval. Valid values range: 0-5. 0 + * indicates no wakeup for DW and can't be used on 2.4GHz band, otherwise + * 2^(n-1). + * @disable_scan: If true, the device will not scan this band for cluster + * merge. Disabling scan on 2.4 GHz band is not allowed. + */ +struct cfg80211_nan_band_config { + struct ieee80211_channel *chan; + s8 rssi_close; + s8 rssi_middle; + u8 awake_dw_interval; + bool disable_scan; +}; + +/** * struct cfg80211_nan_conf - NAN configuration * * This struct defines NAN configuration parameters @@ -3765,10 +4098,129 @@ struct cfg80211_qos_map { * @bands: operating bands, a bitmap of &enum nl80211_band values. * For instance, for NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address + * that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF. + * @scan_period: period (in seconds) between NAN scans. + * @scan_dwell_time: dwell time (in milliseconds) for NAN scans. + * @discovery_beacon_interval: interval (in TUs) for discovery beacons. + * @enable_dw_notification: flag to enable/disable discovery window + * notifications. + * @band_cfgs: array of band specific configurations, indexed by + * &enum nl80211_band values. + * @extra_nan_attrs: pointer to additional NAN attributes. + * @extra_nan_attrs_len: length of the additional NAN attributes. + * @vendor_elems: pointer to vendor-specific elements. + * @vendor_elems_len: length of the vendor-specific elements. */ struct cfg80211_nan_conf { u8 master_pref; u8 bands; + u8 cluster_id[ETH_ALEN] __aligned(2); + u16 scan_period; + u16 scan_dwell_time; + u8 discovery_beacon_interval; + bool enable_dw_notification; + struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS]; + const u8 *extra_nan_attrs; + u16 extra_nan_attrs_len; + const u8 *vendor_elems; + u16 vendor_elems_len; +}; + +#define CFG80211_NAN_SCHED_NUM_TIME_SLOTS 32 + +/** + * struct cfg80211_nan_channel - NAN channel configuration + * + * This struct defines a NAN channel configuration + * + * @chandef: the channel definition + * @channel_entry: pointer to the Channel Entry blob as defined in Wi-Fi Aware + * (TM) 4.0 specification Table 100 (Channel Entry format for the NAN + * Availability attribute). + * @rx_nss: number of spatial streams supported on this channel + */ +struct cfg80211_nan_channel { + struct cfg80211_chan_def chandef; + const u8 *channel_entry; + u8 rx_nss; +}; + +/** + * struct cfg80211_nan_local_sched - NAN local schedule + * + * This struct defines NAN local schedule parameters + * + * @schedule: a mapping of time slots to chandef indexes in %nan_channels. + * An unscheduled slot will be set to %NL80211_NAN_SCHED_NOT_AVAIL_SLOT. + * @n_channels: number of channel definitions in %nan_channels. + * @nan_avail_blob: pointer to NAN Availability attribute blob. + * See %NL80211_ATTR_NAN_AVAIL_BLOB for more details. + * @nan_avail_blob_len: length of the @nan_avail_blob in bytes. + * @deferred: if true, the command containing this schedule configuration is a + * request from the device to perform an announced schedule update. This + * means that it needs to send the updated NAN availability to the peers, + * and do the actual switch on the right time (i.e. at the end of the slot + * after the slot in which the updated NAN Availability was sent). + * See %NL80211_ATTR_NAN_SCHED_DEFERRED for more details. + * If false, the schedule is applied immediately. + * @nan_channels: array of NAN channel definitions that can be scheduled. + */ +struct cfg80211_nan_local_sched { + u8 schedule[CFG80211_NAN_SCHED_NUM_TIME_SLOTS]; + u8 n_channels; + const u8 *nan_avail_blob; + u16 nan_avail_blob_len; + bool deferred; + struct cfg80211_nan_channel nan_channels[] __counted_by(n_channels); +}; + +/** + * struct cfg80211_nan_peer_map - NAN peer schedule map + * + * This struct defines a single NAN peer schedule map + * + * @map_id: map ID of this schedule map + * @schedule: a mapping of time slots to chandef indexes in the schedule's + * @nan_channels. Each slot lasts 16TUs. An unscheduled slot will be + * set to %NL80211_NAN_SCHED_NOT_AVAIL_SLOT. + */ +struct cfg80211_nan_peer_map { + u8 map_id; + u8 schedule[CFG80211_NAN_SCHED_NUM_TIME_SLOTS]; +}; + +#define CFG80211_NAN_MAX_PEER_MAPS 2 +#define CFG80211_NAN_INVALID_MAP_ID 0xff + +/** + * struct cfg80211_nan_peer_sched - NAN peer schedule + * + * This struct defines NAN peer schedule parameters for a peer. + * + * @peer_addr: MAC address of the peer (NMI address) + * @seq_id: sequence ID of the peer schedule. + * @committed_dw: committed DW as published by the peer. + * See %NL80211_ATTR_NAN_COMMITTED_DW + * @max_chan_switch: maximum channel switch time in microseconds as published + * by the peer. See %NL80211_ATTR_NAN_MAX_CHAN_SWITCH_TIME. + * @init_ulw: initial ULWs as published by the peer. + * @ulw_size: number of bytes in @init_ulw. + * @n_channels: number of channel definitions in @nan_channels. + * @nan_channels: array of NAN channel definitions for this schedule. + * @maps: array of peer schedule maps. Unused entries have + * map_id = %CFG80211_NAN_INVALID_MAP_ID. + */ +struct cfg80211_nan_peer_sched { + const u8 *peer_addr; + u8 seq_id; + u16 committed_dw; + u16 max_chan_switch; + const u8 *init_ulw; + u16 ulw_size; + u8 n_channels; + struct cfg80211_nan_channel *nan_channels; + struct cfg80211_nan_peer_map maps[CFG80211_NAN_MAX_PEER_MAPS]; }; /** @@ -3777,10 +4229,17 @@ struct cfg80211_nan_conf { * * @CFG80211_NAN_CONF_CHANGED_PREF: master preference * @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands + * @CFG80211_NAN_CONF_CHANGED_CONFIG: changed additional configuration. + * When this flag is set, it indicates that some additional attribute(s) + * (other then master_pref and bands) have been changed. In this case, + * all the unchanged attributes will be properly configured to their + * previous values. The driver doesn't need to store any + * previous configuration besides master_pref and bands. */ enum cfg80211_nan_conf_changes { CFG80211_NAN_CONF_CHANGED_PREF = BIT(0), CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1), + CFG80211_NAN_CONF_CHANGED_CONFIG = BIT(2), }; /** @@ -3957,6 +4416,7 @@ struct cfg80211_ftm_responder_stats { * @num_bursts_exp: actual number of bursts exponent negotiated * @burst_duration: actual burst duration negotiated * @ftms_per_burst: actual FTMs per burst negotiated + * @burst_period: actual burst period negotiated in units of 100ms * @lci_len: length of LCI information (if present) * @civicloc_len: length of civic location information (if present) * @lci: LCI data (may be %NULL) @@ -3998,6 +4458,7 @@ struct cfg80211_pmsr_ftm_result { u8 num_bursts_exp; u8 burst_duration; u8 ftms_per_burst; + u16 burst_period; s32 rssi_avg; s32 rssi_spread; struct rate_info tx_rate, rx_rate; @@ -4060,7 +4521,9 @@ struct cfg80211_pmsr_result { * @burst_period: burst period to use * @asap: indicates to use ASAP mode * @num_bursts_exp: number of bursts exponent - * @burst_duration: burst duration + * @burst_duration: burst duration. If @trigger_based or @non_trigger_based is + * set, this is the burst duration in milliseconds, and zero means the + * device should pick an appropriate value based on @ftms_per_burst. * @ftms_per_burst: number of FTMs per burst * @ftmr_retries: number of retries for FTM request * @request_lci: request LCI information @@ -4073,6 +4536,8 @@ struct cfg80211_pmsr_result { * EDCA based ranging will be used. * @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either * @trigger_based or @non_trigger_based is set. + * @rsta: Operate as the RSTA in the measurement. Only valid if @lmr_feedback + * and either @trigger_based or @non_trigger_based is set. * @bss_color: the bss color of the responder. Optional. Set to zero to * indicate the driver should set the BSS color. Only valid if * @non_trigger_based or @trigger_based is set. @@ -4088,7 +4553,8 @@ struct cfg80211_pmsr_ftm_request_peer { request_civicloc:1, trigger_based:1, non_trigger_based:1, - lmr_feedback:1; + lmr_feedback:1, + rsta:1; u8 num_bursts_exp; u8 burst_duration; u8 ftms_per_burst; @@ -4537,6 +5003,19 @@ struct mgmt_frame_regs { * @nan_change_conf: changes NAN configuration. The changed parameters must * be specified in @changes (using &enum cfg80211_nan_conf_changes); * All other parameters must be ignored. + * @nan_set_local_sched: configure the local schedule for NAN. The schedule + * consists of an array of %cfg80211_nan_channel and the schedule itself, + * in which each entry maps each time slot to the channel on which the + * radio should operate on. If the chandef of a NAN channel is not + * changed, the channel entry must also remain unchanged. It is the + * driver's responsibility to verify this. + * @nan_set_peer_sched: configure the peer schedule for NAN. The schedule + * consists of an array of %cfg80211_nan_channel and the schedule itself, + * in which each entry maps each time slot to a channel on which the + * radio should operate on. In addition, it contains more peer's schedule + * information such as committed DW, etc. When updating an existing peer + * schedule, the full new schedule is provided - partial updates are not + * supported, and the new schedule completely replaces the previous one. * * @set_multicast_to_unicast: configure multicast to unicast conversion for BSS * @@ -4631,24 +5110,24 @@ struct cfg80211_ops { struct wireless_dev *wdev, unsigned int link_id); - int (*add_key)(struct wiphy *wiphy, struct net_device *netdev, + int (*add_key)(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params); - int (*get_key)(struct wiphy *wiphy, struct net_device *netdev, + int (*get_key)(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)); - int (*del_key)(struct wiphy *wiphy, struct net_device *netdev, + int (*del_key)(struct wiphy *wiphy, struct wireless_dev *wdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr); int (*set_default_key)(struct wiphy *wiphy, struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast); int (*set_default_mgmt_key)(struct wiphy *wiphy, - struct net_device *netdev, int link_id, + struct wireless_dev *wdev, int link_id, u8 key_index); int (*set_default_beacon_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct wireless_dev *wdev, int link_id, u8 key_index); @@ -4660,17 +5139,17 @@ struct cfg80211_ops { unsigned int link_id); - int (*add_station)(struct wiphy *wiphy, struct net_device *dev, + int (*add_station)(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params); - int (*del_station)(struct wiphy *wiphy, struct net_device *dev, + int (*del_station)(struct wiphy *wiphy, struct wireless_dev *wdev, struct station_del_parameters *params); - int (*change_station)(struct wiphy *wiphy, struct net_device *dev, + int (*change_station)(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_parameters *params); - int (*get_station)(struct wiphy *wiphy, struct net_device *dev, + int (*get_station)(struct wiphy *wiphy, struct wireless_dev *wdev, const u8 *mac, struct station_info *sinfo); - int (*dump_station)(struct wiphy *wiphy, struct net_device *dev, + int (*dump_station)(struct wiphy *wiphy, struct wireless_dev *wdev, int idx, u8 *mac, struct station_info *sinfo); int (*add_mpath)(struct wiphy *wiphy, struct net_device *dev, @@ -4750,12 +5229,14 @@ struct cfg80211_ops { int (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev, int rate[NUM_NL80211_BANDS]); - int (*set_wiphy_params)(struct wiphy *wiphy, u32 changed); + int (*set_wiphy_params)(struct wiphy *wiphy, int radio_idx, + u32 changed); int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, enum nl80211_tx_power_setting type, int mbm); int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, int *dbm); + int radio_idx, unsigned int link_id, int *dbm); void (*rfkill_poll)(struct wiphy *wiphy); @@ -4817,8 +5298,10 @@ struct cfg80211_ops { struct wireless_dev *wdev, struct mgmt_frame_regs *upd); - int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct wiphy *wiphy, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct wiphy *wiphy, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*sched_scan_start)(struct wiphy *wiphy, struct net_device *dev, @@ -4910,7 +5393,12 @@ struct cfg80211_ops { struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes); - + int (*nan_set_local_sched)(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_nan_local_sched *sched); + int (*nan_set_peer_sched)(struct wiphy *wiphy, + struct wireless_dev *wdev, + struct cfg80211_nan_peer_sched *sched); int (*set_multicast_to_unicast)(struct wiphy *wiphy, struct net_device *dev, const bool enabled); @@ -4970,8 +5458,7 @@ struct cfg80211_ops { struct cfg80211_ttlm_params *params); u32 (*get_radio_mask)(struct wiphy *wiphy, struct net_device *dev); int (*assoc_ml_reconf)(struct wiphy *wiphy, struct net_device *dev, - struct cfg80211_assoc_link *add_links, - u16 rem_links); + struct cfg80211_ml_reconf_req *req); int (*set_epcs)(struct wiphy *wiphy, struct net_device *dev, bool val); }; @@ -5405,6 +5892,18 @@ cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type); * not limited) * @ftm.trigger_based: trigger based ranging measurement is supported * @ftm.non_trigger_based: non trigger based ranging measurement is supported + * @ftm.support_6ghz: supports ranging in 6 GHz band + * @ftm.max_tx_ltf_rep: maximum number of TX LTF repetitions supported (0 means + * only one LTF, no repetitions) + * @ftm.max_rx_ltf_rep: maximum number of RX LTF repetitions supported (0 means + * only one LTF, no repetitions) + * @ftm.max_tx_sts: maximum number of TX STS supported (zero based) + * @ftm.max_rx_sts: maximum number of RX STS supported (zero based) + * @ftm.max_total_ltf_tx: maximum total number of LTFs that can be transmitted + * (0 means unknown) + * @ftm.max_total_ltf_rx: maximum total number of LTFs that can be received + * (0 means unknown) + * @ftm.support_rsta: supports operating as RSTA in PMSR FTM request */ struct cfg80211_pmsr_capabilities { unsigned int max_peers; @@ -5422,7 +5921,15 @@ struct cfg80211_pmsr_capabilities { request_lci:1, request_civicloc:1, trigger_based:1, - non_trigger_based:1; + non_trigger_based:1, + support_6ghz:1; + u8 max_tx_ltf_rep; + u8 max_rx_ltf_rep; + u8 max_tx_sts; + u8 max_rx_sts; + u8 max_total_ltf_tx; + u8 max_total_ltf_rx; + u8 support_rsta:1; } ftm; }; @@ -5442,6 +5949,22 @@ struct wiphy_iftype_akm_suites { }; /** + * struct wiphy_radio_cfg - physical radio config of a wiphy + * This structure describes the configurations of a physical radio in a + * wiphy. It is used to denote per-radio attributes belonging to a wiphy. + * + * @rts_threshold: RTS threshold (dot11RTSThreshold); + * -1 (default) = RTS/CTS disabled + * @radio_debugfsdir: Pointer to debugfs directory containing the radio- + * specific parameters. + * NULL (default) = Debugfs directory not created + */ +struct wiphy_radio_cfg { + u32 rts_threshold; + struct dentry *radio_debugfsdir; +}; + +/** * struct wiphy_radio_freq_range - wiphy frequency range * @start_freq: start range edge frequency (kHz) * @end_freq: end range edge frequency (kHz) @@ -5477,6 +6000,53 @@ struct wiphy_radio { u32 antenna_mask; }; +/** + * enum wiphy_nan_flags - NAN capabilities + * + * @WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC: Device supports NAN configurable + * synchronization. + * @WIPHY_NAN_FLAGS_USERSPACE_DE: Device doesn't support DE offload. + */ +enum wiphy_nan_flags { + WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC = BIT(0), + WIPHY_NAN_FLAGS_USERSPACE_DE = BIT(1), +}; + +/** + * struct wiphy_nan_capa - NAN capabilities + * + * This structure describes the NAN capabilities of a wiphy. + * + * @flags: NAN capabilities flags, see &enum wiphy_nan_flags + * @op_mode: NAN operation mode, as defined in Wi-Fi Aware (TM) specification + * Table 81. + * @n_antennas: number of antennas supported by the device for Tx/Rx. Lower + * nibble indicates the number of TX antennas and upper nibble indicates the + * number of RX antennas. Value 0 indicates the information is not + * available. + * @max_channel_switch_time: maximum channel switch time in milliseconds. + * @dev_capabilities: NAN device capabilities as defined in Wi-Fi Aware (TM) + * specification Table 79 (Capabilities field). + * @phy: Band-agnostic capabilities for NAN data interfaces. Since NAN + * operates on multiple channels simultaneously, these capabilities apply + * across all bands. Valid only if NL80211_IFTYPE_NAN_DATA is supported. + * @phy.ht: HT capabilities (mandatory for NAN data) + * @phy.vht: VHT capabilities (optional) + * @phy.he: HE capabilities (optional) + */ +struct wiphy_nan_capa { + u32 flags; + u8 op_mode; + u8 n_antennas; + u16 max_channel_switch_time; + u8 dev_capabilities; + struct { + struct ieee80211_sta_ht_cap ht; + struct ieee80211_sta_vht_cap vht; + struct ieee80211_sta_he_cap he; + } phy; +}; + #define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff /** @@ -5637,6 +6207,11 @@ struct wiphy_radio { * and probe responses. This value should be set if the driver * wishes to limit the number of csa counters. Default (0) means * infinite. + * @bss_param_support: bitmask indicating which bss_parameters as defined in + * &struct bss_parameters the driver can actually handle in the + * .change_bss() callback. The bit positions are defined in &enum + * wiphy_bss_param_flags. + * * @bss_select_support: bitmask indicating the BSS selection criteria supported * by the driver in the .connect() callback. The bit position maps to the * attribute indices defined in &enum nl80211_bss_select_attr. @@ -5645,6 +6220,7 @@ struct wiphy_radio { * bitmap of &enum nl80211_band values. For instance, for * NL80211_BAND_2GHZ, bit 0 would be set * (i.e. BIT(NL80211_BAND_2GHZ)). + * @nan_capa: NAN capabilities * * @txq_limit: configuration of internal TX queue frame limit * @txq_memory_limit: configuration internal TX queue memory limit @@ -5696,6 +6272,10 @@ struct wiphy_radio { * supports enabling HW timestamping for all peers (i.e. no need to * specify a mac address). * + * @radio_cfg: configuration of radios belonging to a muli-radio wiphy. This + * struct contains a list of all radio specific attributes and should be + * used only for multi-radio wiphy. + * * @radio: radios belonging to this wiphy * @n_radio: number of radios */ @@ -5785,6 +6365,8 @@ struct wiphy { void (*reg_notifier)(struct wiphy *wiphy, struct regulatory_request *request); + struct wiphy_radio_cfg *radio_cfg; + /* fields below are read-only, assigned by cfg80211 */ const struct ieee80211_regdomain __rcu *regd; @@ -5816,9 +6398,11 @@ struct wiphy { u8 max_num_csa_counters; + u32 bss_param_support; u32 bss_select_support; u8 nan_supported_bands; + struct wiphy_nan_capa nan_capa; u32 txq_limit; u32 txq_memory_limit; @@ -6138,6 +6722,11 @@ static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork, * after wiphy_lock() was called. Therefore, wiphy_cancel_work() can * use just cancel_work() instead of cancel_work_sync(), it requires * being in a section protected by wiphy_lock(). + * + * Note that these are scheduled with a timer where the accuracy + * becomes less the longer in the future the scheduled timer is. Use + * wiphy_hrtimer_work_queue() if the timer must be not be late by more + * than approximately 10 percent. */ void wiphy_delayed_work_queue(struct wiphy *wiphy, struct wiphy_delayed_work *dwork, @@ -6209,6 +6798,79 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy, bool wiphy_delayed_work_pending(struct wiphy *wiphy, struct wiphy_delayed_work *dwork); +struct wiphy_hrtimer_work { + struct wiphy_work work; + struct wiphy *wiphy; + struct hrtimer timer; +}; + +enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t); + +static inline void wiphy_hrtimer_work_init(struct wiphy_hrtimer_work *hrwork, + wiphy_work_func_t func) +{ + hrtimer_setup(&hrwork->timer, wiphy_hrtimer_work_timer, + CLOCK_BOOTTIME, HRTIMER_MODE_REL); + wiphy_work_init(&hrwork->work, func); +} + +/** + * wiphy_hrtimer_work_queue - queue hrtimer work for the wiphy + * @wiphy: the wiphy to queue for + * @hrwork: the high resolution timer worker + * @delay: the delay given as a ktime_t + * + * Please refer to wiphy_delayed_work_queue(). The difference is that + * the hrtimer work uses a high resolution timer for scheduling. This + * may be needed if timeouts might be scheduled further in the future + * and the accuracy of the normal timer is not sufficient. + * + * Expect a delay of a few milliseconds as the timer is scheduled + * with some slack and some more time may pass between queueing the + * work and its start. + */ +void wiphy_hrtimer_work_queue(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork, + ktime_t delay); + +/** + * wiphy_hrtimer_work_cancel - cancel previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrtimer: the hrtimer work to cancel + * + * Cancel the work *without* waiting for it, this assumes being + * called under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrtimer); + +/** + * wiphy_hrtimer_work_flush - flush previously queued hrtimer work + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work to flush + * + * Flush the work (i.e. run it if pending). This must be called + * under the wiphy mutex acquired by wiphy_lock(). + */ +void wiphy_hrtimer_work_flush(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + +/** + * wiphy_hrtimer_work_pending - Find out whether a wiphy hrtimer + * work item is currently pending. + * + * @wiphy: the wiphy, for debug purposes + * @hrwork: the hrtimer work in question + * + * Return: true if timer is pending, false otherwise + * + * Please refer to the wiphy_delayed_work_pending() documentation as + * this is the equivalent function for hrtimer based delayed work + * items. + */ +bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork); + /** * enum ieee80211_ap_reg_power - regulatory power for an Access Point * @@ -6276,8 +6938,8 @@ enum ieee80211_ap_reg_power { * the P2P Device. * @ps: powersave mode is enabled * @ps_timeout: dynamic powersave timeout - * @ap_unexpected_nlportid: (private) netlink port ID of application - * registered for unexpected class 3 frames (AP mode) + * @unexpected_nlportid: (private) netlink port ID of application + * registered for unexpected frames (AP mode or NAN_DATA mode) * @conn: (private) cfg80211 software SME connection state machine data * @connect_keys: (private) keys to set after connection is established * @conn_bss_type: connecting/connected BSS type @@ -6339,7 +7001,7 @@ struct wireless_dev { bool ps; int ps_timeout; - u32 ap_unexpected_nlportid; + u32 unexpected_nlportid; u32 owner_nlportid; bool nl_owner_dead; @@ -6397,6 +7059,12 @@ struct wireless_dev { struct { struct cfg80211_chan_def chandef; } ocb; + struct { + u8 cluster_id[ETH_ALEN] __aligned(2); + u8 n_channels; + struct cfg80211_chan_def *chandefs; + bool sched_update_pending; + } nan; } u; struct { @@ -6505,16 +7173,6 @@ ieee80211_channel_to_khz(const struct ieee80211_channel *chan) } /** - * ieee80211_s1g_channel_width - get allowed channel width from @chan - * - * Only allowed for band NL80211_BAND_S1GHZ - * @chan: channel - * Return: The allowed channel width for this center_freq - */ -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan); - -/** * ieee80211_channel_to_freq_khz - convert channel number to frequency * @chan: channel number * @band: band, necessary due to channel number overlap @@ -6593,6 +7251,19 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) } /** + * ieee80211_radio_freq_range_valid - Check if the radio supports the + * specified frequency range + * + * @radio: wiphy radio + * @freq: the frequency (in KHz) to be queried + * @width: the bandwidth (in KHz) to be queried + * + * Return: whether or not the given frequency range is valid for the given radio + */ +bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, + u32 freq, u32 width); + +/** * cfg80211_radio_chandef_valid - Check if the radio supports the chandef * * @radio: wiphy radio @@ -8465,6 +9136,17 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); /** + * cfg80211_link_sinfo_alloc_tid_stats - allocate per-tid statistics. + * + * @link_sinfo: the link station information + * @gfp: allocation flags + * + * Return: 0 on success. Non-zero on error. + */ +int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo, + gfp_t gfp); + +/** * cfg80211_sinfo_release_content - release contents of station info * @sinfo: the station information * @@ -8475,40 +9157,47 @@ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); static inline void cfg80211_sinfo_release_content(struct station_info *sinfo) { kfree(sinfo->pertid); + + for (int link_id = 0; link_id < ARRAY_SIZE(sinfo->links); link_id++) { + if (sinfo->links[link_id]) { + kfree(sinfo->links[link_id]->pertid); + kfree(sinfo->links[link_id]); + } + } } /** * cfg80211_new_sta - notify userspace about station * - * @dev: the netdev + * @wdev: the wireless device * @mac_addr: the station's address * @sinfo: the station information * @gfp: allocation flags */ -void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, +void cfg80211_new_sta(struct wireless_dev *wdev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp); /** * cfg80211_del_sta_sinfo - notify userspace about deletion of a station - * @dev: the netdev + * @wdev: the wireless device * @mac_addr: the station's address. For MLD station, MLD address is used. * @sinfo: the station information/statistics * @gfp: allocation flags */ -void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, +void cfg80211_del_sta_sinfo(struct wireless_dev *wdev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp); /** * cfg80211_del_sta - notify userspace about deletion of a station * - * @dev: the netdev + * @wdev: the wireless device * @mac_addr: the station's address. For MLD station, MLD address is used. * @gfp: allocation flags */ -static inline void cfg80211_del_sta(struct net_device *dev, +static inline void cfg80211_del_sta(struct wireless_dev *wdev, const u8 *mac_addr, gfp_t gfp) { - cfg80211_del_sta_sinfo(dev, mac_addr, NULL, gfp); + cfg80211_del_sta_sinfo(wdev, mac_addr, NULL, gfp); } /** @@ -8879,22 +9568,25 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, /** * cfg80211_rx_spurious_frame - inform userspace about a spurious frame * @dev: The device the frame matched to + * @link_id: the link the frame was received on, -1 if not applicable or unknown * @addr: the transmitter address * @gfp: context flags * - * This function is used in AP mode (only!) to inform userspace that - * a spurious class 3 frame was received, to be able to deauth the - * sender. + * This function is used in AP mode to inform userspace that a spurious + * class 3 frame was received, to be able to deauth the sender. + * It is also used in NAN_DATA mode to report frames from unknown peers + * (A2 not assigned to any active NDP), per Wi-Fi Aware (TM) 4.0 specification 6.2.5. * Return: %true if the frame was passed to userspace (or this failed * for a reason other than not having a subscription.) */ -bool cfg80211_rx_spurious_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); +bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp); /** * cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame * @dev: The device the frame matched to * @addr: the transmitter address + * @link_id: the link the frame was received on, -1 if not applicable or unknown * @gfp: context flags * * This function is used in AP mode (only!) to inform userspace that @@ -8904,8 +9596,8 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev, * Return: %true if the frame was passed to userspace (or this failed * for a reason other than not having a subscription.) */ -bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); +bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp); /** * cfg80211_probe_status - notify userspace about probe status @@ -9371,6 +10063,32 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data); +/** + * cfg80211_get_radio_idx_by_chan - get the radio index by the channel + * + * @wiphy: the wiphy + * @chan: channel for which the supported radio index is required + * + * Return: radio index on success or -EINVAL otherwise + */ +int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, + const struct ieee80211_channel *chan); + +/** + * cfg80211_stop_link - stop AP/P2P_GO link if link_id is non-negative or stops + * all links on the interface. + * + * @wiphy: the wiphy + * @wdev: wireless device + * @link_id: valid link ID in case of MLO AP/P2P_GO Operation or else -1 + * @gfp: context flags + * + * If link_id is set during MLO operation, stops only the specified AP/P2P_GO + * link and if link_id is set to -1 or last link is stopped, the entire + * interface is stopped as if AP was stopped, IBSS/mesh left, STA disconnected. + */ +void cfg80211_stop_link(struct wiphy *wiphy, struct wireless_dev *wdev, + int link_id, gfp_t gfp); /** * cfg80211_stop_iface - trigger interface disconnection @@ -9384,8 +10102,11 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, * * Note: This doesn't need any locks and is asynchronous. */ -void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, - gfp_t gfp); +static inline void +cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, gfp_t gfp) +{ + cfg80211_stop_link(wiphy, wdev, -1, gfp); +} /** * cfg80211_shutdown_all_interfaces - shut down all interfaces for a wiphy @@ -9501,6 +10222,18 @@ void cfg80211_nan_func_terminated(struct wireless_dev *wdev, enum nl80211_nan_func_term_reason reason, u64 cookie, gfp_t gfp); +/** + * cfg80211_nan_sched_update_done - notify deferred schedule update completion + * @wdev: the wireless device reporting the event + * @success: whether or not the schedule update was successful + * @gfp: allocation flags + * + * This function notifies user space that a deferred local NAN schedule update + * (requested with %NL80211_ATTR_NAN_SCHED_DEFERRED) has been completed. + */ +void cfg80211_nan_sched_update_done(struct wireless_dev *wdev, bool success, + gfp_t gfp); + /* ethtool helper */ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info); @@ -9719,6 +10452,36 @@ static inline int cfg80211_color_change_notify(struct net_device *dev, } /** + * cfg80211_6ghz_power_type - determine AP regulatory power type + * @control: control flags + * @client_flags: &enum ieee80211_channel_flags for station mode to enable + * SP to LPI fallback, zero otherwise. + * + * Return: regulatory power type from &enum ieee80211_ap_reg_power + */ +static inline enum ieee80211_ap_reg_power +cfg80211_6ghz_power_type(u8 control, u32 client_flags) +{ + switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { + case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: + return IEEE80211_REG_LPI_AP; + case IEEE80211_6GHZ_CTRL_REG_SP_AP: + return IEEE80211_REG_SP_AP; + case IEEE80211_6GHZ_CTRL_REG_VLP_AP: + return IEEE80211_REG_VLP_AP; + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: + if (client_flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT) + return IEEE80211_REG_LPI_AP; + return IEEE80211_REG_SP_AP; + default: + return IEEE80211_REG_UNSET_AP; + } +} + +/** * cfg80211_links_removed - Notify about removed STA MLD setup links. * @dev: network device. * @link_mask: BIT mask of removed STA MLD setup link IDs. @@ -9735,6 +10498,11 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); * struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data * @buf: MLO Reconfiguration Response frame (header + body) * @len: length of the frame data + * @driver_initiated: Indicates whether the add links request is initiated by + * driver. This is set to true when the link reconfiguration request + * initiated by driver due to AP link recommendation requests + * (Ex: BTM (BSS Transition Management) request) handling offloaded to + * driver. * @added_links: BIT mask of links successfully added to the association * @links: per-link information indexed by link ID * @links.bss: the BSS that MLO reconfiguration was requested for, ownership of @@ -9747,9 +10515,11 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); struct cfg80211_mlo_reconf_done_data { const u8 *buf; size_t len; + bool driver_initiated; u16 added_links; struct { struct cfg80211_bss *bss; + u8 *addr; } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; @@ -9781,6 +10551,62 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev); */ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled); +/** + * cfg80211_next_nan_dw_notif - Notify about the next NAN Discovery Window (DW) + * @wdev: Pointer to the wireless device structure + * @chan: DW channel (6, 44 or 149) + * @gfp: Memory allocation flags + */ +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp); + +/** + * cfg80211_nan_cluster_joined - Notify about NAN cluster join + * @wdev: Pointer to the wireless device structure + * @cluster_id: Cluster ID of the NAN cluster that was joined or started + * @new_cluster: Indicates if this is a new cluster or an existing one + * @gfp: Memory allocation flags + * + * This function is used to notify user space when a NAN cluster has been + * joined, providing the cluster ID and a flag whether it is a new cluster. + */ +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp); + +/** + * cfg80211_nan_ulw_update - Notify user space about ULW update + * @wdev: Pointer to the wireless device structure + * @ulw: Pointer to the ULW blob data + * @ulw_len: Length of the ULW blob in bytes + * @gfp: Memory allocation flags + * + * This function is used by drivers to notify user space when the device's + * ULW (Unaligned Schedule) blob has been updated. User space can use this + * blob to attach to frames sent to peers. + */ +void cfg80211_nan_ulw_update(struct wireless_dev *wdev, + const u8 *ulw, size_t ulw_len, gfp_t gfp); + +/** + * cfg80211_nan_channel_evac - Notify user space about NAN channel evacuation + * @wdev: Pointer to the wireless device structure + * @chandef: Pointer to the channel definition of the NAN channel that was + * evacuated + * @gfp: Memory allocation flags + * + * This function is used by drivers to notify user space when a NAN + * channel has been evacuated (i.e. ULWed) due to channel resource conflicts + * with other interfaces. + * This can happen when another interface sharing the channel resource with NAN + * needs to move to a different channel (e.g. due to channel switch or link + * switch). User space may reconfigure the local schedule to exclude the + * evacuated channel. + */ +void cfg80211_nan_channel_evac(struct wireless_dev *wdev, + const struct cfg80211_chan_def *chandef, + gfp_t gfp); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs @@ -9831,4 +10657,95 @@ ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file, void *data); #endif +/** + * cfg80211_s1g_get_start_freq_khz - get S1G chandef start frequency + * @chandef: the chandef to use + * + * Return: the chandefs starting frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_start_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz - bw_mhz * 500 + 500; +} + +/** + * cfg80211_s1g_get_end_freq_khz - get S1G chandef end frequency + * @chandef: the chandef to use + * + * Return: the chandefs ending frequency in KHz + */ +static inline u32 +cfg80211_s1g_get_end_freq_khz(const struct cfg80211_chan_def *chandef) +{ + u32 bw_mhz = cfg80211_chandef_get_width(chandef); + u32 center_khz = + MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset; + return center_khz + bw_mhz * 500 - 500; +} + +/** + * cfg80211_s1g_get_primary_sibling - retrieve the sibling 1MHz subchannel + * for an S1G chandef using a 2MHz primary channel. + * @wiphy: wiphy the channel belongs to + * @chandef: the chandef to use + * + * When chandef::s1g_primary_2mhz is set to true, we are operating on a 2MHz + * primary channel. The 1MHz subchannel designated by the primary channel + * location exists within chandef::chan, whilst the 'sibling' is denoted as + * being the other 1MHz subchannel that make up the 2MHz primary channel. + * + * Returns: the sibling 1MHz &struct ieee80211_channel, or %NULL on failure. + */ +static inline struct ieee80211_channel * +cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 pri_1mhz_khz, sibling_1mhz_khz, op_low_1mhz_khz, pri_index; + + if (!chandef->s1g_primary_2mhz || width_mhz < 2) + return NULL; + + pri_1mhz_khz = ieee80211_channel_to_khz(chandef->chan); + op_low_1mhz_khz = cfg80211_s1g_get_start_freq_khz(chandef); + + /* + * Compute the index of the primary 1 MHz subchannel within the + * operating channel, relative to the lowest 1 MHz center frequency. + * Flip the least significant bit to select the even/odd sibling, + * then translate that index back into a channel frequency. + */ + pri_index = (pri_1mhz_khz - op_low_1mhz_khz) / 1000; + sibling_1mhz_khz = op_low_1mhz_khz + ((pri_index ^ 1) * 1000); + + return ieee80211_get_channel_khz(wiphy, sibling_1mhz_khz); +} + + +/** + * cfg80211_incumbent_signal_notify - Notify userspace of incumbent signal detection + * @wiphy: the wiphy to use + * @chandef: channel definition in which the interference was detected + * @signal_interference_bitmap: bitmap indicating interference across 20 MHz segments + * @gfp: allocation context for message creation and multicast; pass GFP_ATOMIC + * if called from atomic context (e.g. firmware event handler), otherwise + * GFP_KERNEL + * + * Use this function to notify userspace when an incumbent signal is detected on + * the operating channel in the 6 GHz band. The notification includes the + * current channel definition and a bitmap representing interference across + * the operating bandwidth. Each bit in the bitmap corresponds to a 20 MHz + * segment, with the lowest bit representing the lowest frequency segment. + * Punctured sub-channels are included in the bitmap structure but are always + * set to zero since interference detection is not performed on them. + */ +void cfg80211_incumbent_signal_notify(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef, + u32 signal_interference_bitmap, + gfp_t gfp); + #endif /* __NET_CFG80211_H */ diff --git a/include/net/checksum.h b/include/net/checksum.h index 243f972267b8..3cbab35de5ab 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -99,12 +99,6 @@ csum_block_add(__wsum csum, __wsum csum2, int offset) } static __always_inline __wsum -csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) -{ - return csum_block_add(csum, csum2, offset); -} - -static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); @@ -115,12 +109,6 @@ static __always_inline __wsum csum_unfold(__sum16 n) return (__force __wsum)n; } -static __always_inline -__wsum csum_partial_ext(const void *buff, int len, __wsum sum) -{ - return csum_partial(buff, len, sum); -} - #define CSUM_MANGLED_0 ((__force __sum16)0xffff) static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) @@ -164,7 +152,7 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr); + __wsum diff, bool pseudohdr, bool ipv6); static __always_inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 7e78e7d6f015..668aeee9b3f6 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -63,7 +63,7 @@ static inline u32 task_get_classid(const struct sk_buff *skb) * calls by looking at the number of nested bh disable calls because * softirqs always disables bh. */ - if (in_serving_softirq()) { + if (softirq_count()) { struct sock *sk = skb_to_full_sk(skb); /* If there is an sock_cgroup_classid we'll use that. */ diff --git a/include/net/codel_impl.h b/include/net/codel_impl.h index 78a27ac73070..2c1f0ec309e9 100644 --- a/include/net/codel_impl.h +++ b/include/net/codel_impl.h @@ -120,10 +120,10 @@ static bool codel_should_drop(const struct sk_buff *skb, } skb_len = skb_len_func(skb); - vars->ldelay = now - skb_time_func(skb); + WRITE_ONCE(vars->ldelay, now - skb_time_func(skb)); if (unlikely(skb_len > stats->maxpacket)) - stats->maxpacket = skb_len; + WRITE_ONCE(stats->maxpacket, skb_len); if (codel_time_before(vars->ldelay, params->target) || *backlog <= params->mtu) { @@ -158,7 +158,8 @@ static struct sk_buff *codel_dequeue(void *ctx, bool drop; if (!skb) { - vars->dropping = false; + vars->first_above_time = 0; + WRITE_ONCE(vars->dropping, false); return skb; } now = codel_get_time(); @@ -167,7 +168,7 @@ static struct sk_buff *codel_dequeue(void *ctx, if (vars->dropping) { if (!drop) { /* sojourn time below target - leave dropping state */ - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); } else if (codel_time_after_eq(now, vars->drop_next)) { /* It's time for the next drop. Drop the current * packet and dequeue the next. The dequeue might @@ -179,16 +180,18 @@ static struct sk_buff *codel_dequeue(void *ctx, */ while (vars->dropping && codel_time_after_eq(now, vars->drop_next)) { - vars->count++; /* dont care of possible wrap - * since there is no more divide - */ + /* dont care of possible wrap + * since there is no more divide. + */ + WRITE_ONCE(vars->count, vars->count + 1); codel_Newton_step(vars); if (params->ecn && INET_ECN_set_ce(skb)) { - stats->ecn_mark++; - vars->drop_next = + WRITE_ONCE(stats->ecn_mark, + stats->ecn_mark + 1); + WRITE_ONCE(vars->drop_next, codel_control_law(vars->drop_next, params->interval, - vars->rec_inv_sqrt); + vars->rec_inv_sqrt)); goto end; } stats->drop_len += skb_len_func(skb); @@ -201,13 +204,13 @@ static struct sk_buff *codel_dequeue(void *ctx, skb_time_func, backlog, now)) { /* leave dropping state */ - vars->dropping = false; + WRITE_ONCE(vars->dropping, false); } else { /* and schedule the next drop */ - vars->drop_next = + WRITE_ONCE(vars->drop_next, codel_control_law(vars->drop_next, params->interval, - vars->rec_inv_sqrt); + vars->rec_inv_sqrt)); } } } @@ -215,7 +218,7 @@ static struct sk_buff *codel_dequeue(void *ctx, u32 delta; if (params->ecn && INET_ECN_set_ce(skb)) { - stats->ecn_mark++; + WRITE_ONCE(stats->ecn_mark, stats->ecn_mark + 1); } else { stats->drop_len += skb_len_func(skb); drop_func(skb, ctx); @@ -226,7 +229,7 @@ static struct sk_buff *codel_dequeue(void *ctx, stats, skb_len_func, skb_time_func, backlog, now); } - vars->dropping = true; + WRITE_ONCE(vars->dropping, true); /* if min went above target close to when we last went below it * assume that the drop rate that controlled the queue on the * last cycle is a good starting point to control it now. @@ -235,19 +238,20 @@ static struct sk_buff *codel_dequeue(void *ctx, if (delta > 1 && codel_time_before(now - vars->drop_next, 16 * params->interval)) { - vars->count = delta; + WRITE_ONCE(vars->count, delta); /* we dont care if rec_inv_sqrt approximation * is not very precise : * Next Newton steps will correct it quadratically. */ codel_Newton_step(vars); } else { - vars->count = 1; + WRITE_ONCE(vars->count, 1); vars->rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT; } - vars->lastcount = vars->count; - vars->drop_next = codel_control_law(now, params->interval, - vars->rec_inv_sqrt); + WRITE_ONCE(vars->lastcount, vars->count); + WRITE_ONCE(vars->drop_next, + codel_control_law(now, params->interval, + vars->rec_inv_sqrt)); } end: if (skb && codel_time_after(vars->ldelay, params->ce_threshold)) { @@ -261,7 +265,7 @@ end: params->ce_threshold_selector)); } if (set_ce && INET_ECN_set_ce(skb)) - stats->ce_mark++; + WRITE_ONCE(stats->ce_mark, stats->ce_mark + 1); } return skb; } diff --git a/include/net/devlink.h b/include/net/devlink.h index b8783126c1ed..bcd31de1f890 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -78,6 +78,9 @@ struct devlink_port_pci_sf_attrs { * @flavour: flavour of the port * @split: indicates if this is split port * @splittable: indicates if the port can be split. + * @no_phys_port_name: skip automatic phys_port_name generation; for + * compatibility only, newly added driver/port instance + * should never set this. * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. * @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL * @phys: physical port attributes @@ -87,7 +90,8 @@ struct devlink_port_pci_sf_attrs { */ struct devlink_port_attrs { u8 split:1, - splittable:1; + splittable:1, + no_phys_port_name:1; u32 lanes; enum devlink_port_flavour flavour; struct netdev_phys_item_id switch_id; @@ -118,11 +122,14 @@ struct devlink_rate { u32 tx_priority; u32 tx_weight; + + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; struct devlink_port { struct list_head list; struct list_head region_list; + struct list_head resource_list; struct devlink *devlink; const struct devlink_port_ops *ops; unsigned int index; @@ -420,17 +427,19 @@ typedef u64 devlink_resource_occ_get_t(void *priv); #define __DEVLINK_PARAM_MAX_STRING_VALUE 32 enum devlink_param_type { - DEVLINK_PARAM_TYPE_U8, - DEVLINK_PARAM_TYPE_U16, - DEVLINK_PARAM_TYPE_U32, - DEVLINK_PARAM_TYPE_STRING, - DEVLINK_PARAM_TYPE_BOOL, + DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8, + DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16, + DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32, + DEVLINK_PARAM_TYPE_U64 = DEVLINK_VAR_ATTR_TYPE_U64, + DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING, + DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG, }; union devlink_param_value { u8 vu8; u16 vu16; u32 vu32; + u64 vu64; char vstr[__DEVLINK_PARAM_MAX_STRING_VALUE]; bool vbool; }; @@ -471,6 +480,10 @@ struct devlink_flash_notify { * @set: set parameter value, used for runtime and permanent * configuration modes * @validate: validate input value is applicable (within value range, etc.) + * @get_default: get parameter default value, used for runtime and permanent + * configuration modes + * @reset_default: reset parameter to default value, used for runtime and permanent + * configuration modes * * This struct should be used by the driver to fill the data for * a parameter it registers. @@ -482,13 +495,20 @@ struct devlink_param { enum devlink_param_type type; unsigned long supported_cmodes; int (*get)(struct devlink *devlink, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int (*set)(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); int (*validate)(struct devlink *devlink, u32 id, union devlink_param_value val, struct netlink_ext_ack *extack); + int (*get_default)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); + int (*reset_default)(struct devlink *devlink, u32 id, + enum devlink_param_cmode cmode, + struct netlink_ext_ack *extack); }; struct devlink_param_item { @@ -500,6 +520,7 @@ struct devlink_param_item { * until reload. */ bool driverinit_value_new_valid; + union devlink_param_value driverinit_default; }; enum devlink_param_generic_id { @@ -520,6 +541,11 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, + DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, + DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, + DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -578,6 +604,21 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME "event_eq_size" #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME "enable_phc" +#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE DEVLINK_PARAM_TYPE_BOOL + +#define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id" +#define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64 + +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs" +#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32 + +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME "num_doorbells" +#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE DEVLINK_PARAM_TYPE_U32 + +#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME "max_mac_per_vf" +#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ @@ -601,6 +642,37 @@ enum devlink_param_generic_id { .validate = _validate, \ } +#define DEVLINK_PARAM_GENERIC_WITH_DEFAULTS(_id, _cmodes, _get, _set, \ + _validate, _get_default, \ + _reset_default) \ +{ \ + .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ + .name = DEVLINK_PARAM_GENERIC_##_id##_NAME, \ + .type = DEVLINK_PARAM_GENERIC_##_id##_TYPE, \ + .generic = true, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ + .get_default = _get_default, \ + .reset_default = _reset_default, \ +} + +#define DEVLINK_PARAM_DRIVER_WITH_DEFAULTS(_id, _name, _type, _cmodes, \ + _get, _set, _validate, \ + _get_default, _reset_default) \ +{ \ + .id = _id, \ + .name = _name, \ + .type = _type, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ + .get_default = _get_default, \ + .reset_default = _reset_default, \ +} + /* Identifier of board design */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID "board.id" /* Revision of board design */ @@ -730,6 +802,10 @@ enum devlink_health_reporter_state { * if priv_ctx is NULL, run a full dump * @diagnose: callback to diagnose the current status * @test: callback to trigger a test event + * @default_graceful_period: default min time (in msec) + * between recovery attempts + * @default_burst_period: default time (in msec) for + * error recoveries before starting the grace period */ struct devlink_health_reporter_ops { @@ -744,6 +820,8 @@ struct devlink_health_reporter_ops { struct netlink_ext_ack *extack); int (*test)(struct devlink_health_reporter *reporter, struct netlink_ext_ack *extack); + u64 default_graceful_period; + u64 default_burst_period; }; /** @@ -1482,6 +1560,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, @@ -1490,6 +1571,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, @@ -1528,6 +1612,9 @@ struct devlink_ops { void *devlink_priv(struct devlink *devlink); struct devlink *priv_to_devlink(void *priv); struct device *devlink_to_dev(const struct devlink *devlink); +const char *devlink_bus_name(const struct devlink *devlink); +const char *devlink_dev_name(const struct devlink *devlink); +const char *devlink_dev_driver_name(const struct devlink *devlink); /* Devlink instance explicit locking */ void devl_lock(struct devlink *devlink); @@ -1561,6 +1648,13 @@ void devlink_register(struct devlink *devlink); void devlink_unregister(struct devlink *devlink); void devlink_free(struct devlink *devlink); +struct devlink *devlink_shd_get(const char *id, + const struct devlink_ops *ops, + size_t priv_size, + const struct device_driver *driver); +void devlink_shd_put(struct devlink *devlink); +void *devlink_shd_get_priv(struct devlink *devlink); + /** * struct devlink_port_ops - Port operations * @port_split: Callback used to split the port into multiple ones. @@ -1721,7 +1815,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *devlink_port_attrs); + const struct devlink_port_attrs *attrs); void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, @@ -1792,12 +1886,19 @@ int devl_resource_register(struct devlink *devlink, u64 resource_size, u64 resource_id, u64 parent_resource_id, - const struct devlink_resource_size_params *size_params); + const struct devlink_resource_size_params *params); void devl_resources_unregister(struct devlink *devlink); void devlink_resources_unregister(struct devlink *devlink); int devl_resource_size_get(struct devlink *devlink, u64 resource_id, u64 *p_resource_size); +int +devl_port_resource_register(struct devlink_port *devlink_port, + const char *resource_name, + u64 resource_size, u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *params); +void devl_port_resources_unregister(struct devlink_port *devlink_port); int devl_dpipe_table_resource_set(struct devlink *devlink, const char *table_name, u64 resource_id, u64 resource_units); @@ -1906,22 +2007,22 @@ void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv); + void *priv); void devl_health_reporter_destroy(struct devlink_health_reporter *reporter); diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 32a34dfe8cc5..2f312d1f67d6 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -40,8 +40,12 @@ FN(TCP_OFOMERGE) \ FN(TCP_RFC7323_PAWS) \ FN(TCP_RFC7323_PAWS_ACK) \ + FN(TCP_RFC7323_TW_PAWS) \ + FN(TCP_RFC7323_TSECR) \ + FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ + FN(TCP_INVALID_END_SEQUENCE) \ FN(TCP_INVALID_ACK_SEQUENCE) \ FN(TCP_RESET) \ FN(TCP_INVALID_SYN) \ @@ -59,21 +63,20 @@ FN(NEIGH_FAILED) \ FN(NEIGH_QUEUEFULL) \ FN(NEIGH_DEAD) \ + FN(NEIGH_HH_FILLFAIL) \ FN(TC_EGRESS) \ FN(SECURITY_HOOK) \ FN(QDISC_DROP) \ - FN(QDISC_OVERLIMIT) \ - FN(QDISC_CONGESTED) \ - FN(CAKE_FLOOD) \ - FN(FQ_BAND_LIMIT) \ - FN(FQ_HORIZON_LIMIT) \ - FN(FQ_FLOW_LIMIT) \ + FN(QDISC_BURST_DROP) \ FN(CPU_BACKLOG) \ + FN(MACVLAN_BROADCAST_BACKLOG) \ + FN(IPVLAN_MULTICAST_BACKLOG) \ FN(XDP) \ FN(TC_INGRESS) \ FN(UNHANDLED_PROTO) \ FN(SKB_CSUM) \ FN(SKB_GSO_SEG) \ + FN(SKB_BAD_GSO) \ FN(SKB_UCOPY_FAULT) \ FN(DEV_HDR) \ FN(DEV_READY) \ @@ -96,6 +99,7 @@ FN(FRAG_TOO_FAR) \ FN(TCP_MINTTL) \ FN(IPV6_BAD_EXTHDR) \ + FN(IPV6_TOO_MANY_EXTHDRS) \ FN(IPV6_NDISC_FRAG) \ FN(IPV6_NDISC_HOP_LIMIT) \ FN(IPV6_NDISC_BAD_CODE) \ @@ -117,6 +121,13 @@ FN(ARP_PVLAN_DISABLE) \ FN(MAC_IEEE_MAC_CONTROL) \ FN(BRIDGE_INGRESS_STP_STATE) \ + FN(CAN_RX_INVALID_FRAME) \ + FN(CANFD_RX_INVALID_FRAME) \ + FN(CANXL_RX_INVALID_FRAME) \ + FN(PFMEMALLOC) \ + FN(PSP_INPUT) \ + FN(PSP_OUTPUT) \ + FN(RECURSION_LIMIT) \ FNe(MAX) /** @@ -281,11 +292,30 @@ enum skb_drop_reason { * Corresponds to LINUX_MIB_PAWS_OLD_ACK. */ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in + * TIME_WAIT state. + * Corresponds to LINUX_MIB_PAWS_TW_REJECTED. + */ + SKB_DROP_REASON_TCP_RFC7323_TW_PAWS, + /** + * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr. + * Corresponds to LINUX_MIB_TSECRREJECTED. + */ + SKB_DROP_REASON_TCP_RFC7323_TSECR, + /** @SKB_DROP_REASON_TCP_LISTEN_OVERFLOW: listener queue full. */ + SKB_DROP_REASON_TCP_LISTEN_OVERFLOW, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ SKB_DROP_REASON_TCP_OLD_SEQUENCE, - /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ + /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field. */ SKB_DROP_REASON_TCP_INVALID_SEQUENCE, /** + * @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE: + * Not acceptable END_SEQ field. + * Corresponds to LINUX_MIB_BEYOND_WINDOW. + */ + SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE, + /** * @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ * field because ack sequence is not in the window between snd_una * and snd_nxt @@ -332,51 +362,40 @@ enum skb_drop_reason { SKB_DROP_REASON_NEIGH_QUEUEFULL, /** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */ SKB_DROP_REASON_NEIGH_DEAD, + /** @SKB_DROP_REASON_NEIGH_HH_FILLFAIL: failed to fill the device hard header */ + SKB_DROP_REASON_NEIGH_HH_FILLFAIL, /** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */ SKB_DROP_REASON_TC_EGRESS, /** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */ SKB_DROP_REASON_SECURITY_HOOK, /** - * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting ( - * failed to enqueue to current qdisc) + * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc during enqueue or + * dequeue. More specific drop reasons are available via the + * qdisc:qdisc_drop tracepoint, which also provides qdisc handle + * and name for identifying the source. */ SKB_DROP_REASON_QDISC_DROP, /** - * @SKB_DROP_REASON_QDISC_OVERLIMIT: dropped by qdisc when a qdisc - * instance exceeds its total buffer size limit. - */ - SKB_DROP_REASON_QDISC_OVERLIMIT, - /** - * @SKB_DROP_REASON_QDISC_CONGESTED: dropped by a qdisc AQM algorithm - * due to congestion. - */ - SKB_DROP_REASON_QDISC_CONGESTED, - /** - * @SKB_DROP_REASON_CAKE_FLOOD: dropped by the flood protection part of - * CAKE qdisc AQM algorithm (BLUE). - */ - SKB_DROP_REASON_CAKE_FLOOD, - /** - * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band - * limit is reached. - */ - SKB_DROP_REASON_FQ_BAND_LIMIT, - /** - * @SKB_DROP_REASON_FQ_HORIZON_LIMIT: dropped by fq qdisc when packet - * timestamp is too far in the future. - */ - SKB_DROP_REASON_FQ_HORIZON_LIMIT, - /** - * @SKB_DROP_REASON_FQ_FLOW_LIMIT: dropped by fq qdisc when a flow - * exceeds its limits. + * @SKB_DROP_REASON_QDISC_BURST_DROP: dropped when net.core.qdisc_max_burst + * limit is hit. */ - SKB_DROP_REASON_FQ_FLOW_LIMIT, + SKB_DROP_REASON_QDISC_BURST_DROP, /** * @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU * backlog queue. This can be caused by backlog queue full (see * netdev_max_backlog in net.rst) or RPS flow limit */ SKB_DROP_REASON_CPU_BACKLOG, + /** + * @SKB_DROP_REASON_MACVLAN_BROADCAST_BACKLOG: failed to enqueue the skb + * to macvlan broadcast queue. + */ + SKB_DROP_REASON_MACVLAN_BROADCAST_BACKLOG, + /** + * @SKB_DROP_REASON_IPVLAN_MULTICAST_BACKLOG: failed to enqueue the skb + * to ipvlan multicast queue. + */ + SKB_DROP_REASON_IPVLAN_MULTICAST_BACKLOG, /** @SKB_DROP_REASON_XDP: dropped by XDP in input path */ SKB_DROP_REASON_XDP, /** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */ @@ -387,6 +406,8 @@ enum skb_drop_reason { SKB_DROP_REASON_SKB_CSUM, /** @SKB_DROP_REASON_SKB_GSO_SEG: gso segmentation error */ SKB_DROP_REASON_SKB_GSO_SEG, + /** @SKB_DROP_REASON_SKB_BAD_GSO: malicious gso packet. */ + SKB_DROP_REASON_SKB_BAD_GSO, /** * @SKB_DROP_REASON_SKB_UCOPY_FAULT: failed to copy data from user space, * e.g., via zerocopy_sg_from_iter() or skb_orphan_frags_rx() @@ -474,6 +495,11 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_MINTTL, /** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */ SKB_DROP_REASON_IPV6_BAD_EXTHDR, + /** + * @SKB_DROP_REASON_IPV6_TOO_MANY_EXTHDRS: Number of IPv6 extension + * headers in the packet exceeds IP6_MAX_EXT_HDRS_CNT. + */ + SKB_DROP_REASON_IPV6_TOO_MANY_EXTHDRS, /** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */ SKB_DROP_REASON_IPV6_NDISC_FRAG, /** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. */ @@ -555,6 +581,32 @@ enum skb_drop_reason { */ SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE, /** + * @SKB_DROP_REASON_CAN_RX_INVALID_FRAME: received + * non conform CAN frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CAN_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_CANFD_RX_INVALID_FRAME: received + * non conform CAN-FD frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CANFD_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_CANXL_RX_INVALID_FRAME: received + * non conform CAN-XL frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CANXL_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve + * reached a path or socket not eligible for use of memory reserves + */ + SKB_DROP_REASON_PFMEMALLOC, + /** @SKB_DROP_REASON_PSP_INPUT: PSP input checks failed */ + SKB_DROP_REASON_PSP_INPUT, + /** @SKB_DROP_REASON_PSP_OUTPUT: PSP output checks failed */ + SKB_DROP_REASON_PSP_OUTPUT, + /** @SKB_DROP_REASON_RECURSION_LIMIT: Dead loop on virtual device. */ + SKB_DROP_REASON_RECURSION_LIMIT, + /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen */ diff --git a/include/net/dropreason-qdisc.h b/include/net/dropreason-qdisc.h new file mode 100644 index 000000000000..fb151cd31751 --- /dev/null +++ b/include/net/dropreason-qdisc.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _LINUX_DROPREASON_QDISC_H +#define _LINUX_DROPREASON_QDISC_H +#include <net/dropreason.h> + +#define DEFINE_QDISC_DROP_REASON(FN, FNe) \ + FN(UNSPEC) \ + FN(GENERIC) \ + FN(OVERLIMIT) \ + FN(CONGESTED) \ + FN(MAXFLOWS) \ + FN(FLOOD_PROTECTION) \ + FN(BAND_LIMIT) \ + FN(HORIZON_LIMIT) \ + FN(FLOW_LIMIT) \ + FN(L4S_STEP_NON_ECN) \ + FNe(MAX) + +#undef FN +#undef FNe +#define FN(reason) QDISC_DROP_##reason, +#define FNe(reason) QDISC_DROP_##reason + +/** + * enum qdisc_drop_reason - reason why a qdisc dropped a packet + * + * Qdisc-specific drop reasons for packet drops that occur within the + * traffic control (TC) queueing discipline layer. These reasons provide + * detailed diagnostics about why packets were dropped by various qdisc + * algorithms, enabling fine-grained monitoring and troubleshooting of + * queue behavior. + */ +enum qdisc_drop_reason { + /** + * @QDISC_DROP_UNSPEC: unspecified/invalid qdisc drop reason. + * Value 0 serves as analogous to SKB_NOT_DROPPED_YET for enum skb_drop_reason. + * Used for catching zero-initialized drop_reason fields. + */ + QDISC_DROP_UNSPEC = 0, + /** + * @__QDISC_DROP_REASON: subsystem base value for qdisc drop reasons + */ + __QDISC_DROP_REASON = SKB_DROP_REASON_SUBSYS_QDISC << + SKB_DROP_REASON_SUBSYS_SHIFT, + /** + * @QDISC_DROP_GENERIC: generic/default qdisc drop, used when no + * more specific reason applies + */ + QDISC_DROP_GENERIC, + /** + * @QDISC_DROP_OVERLIMIT: packet dropped because the qdisc queue + * length exceeded its configured limit (sch->limit). This typically + * indicates the queue is full and cannot accept more packets. + */ + QDISC_DROP_OVERLIMIT, + /** + * @QDISC_DROP_CONGESTED: packet dropped due to active congestion + * control algorithms (e.g., CoDel, PIE, RED) detecting network + * congestion. The qdisc proactively dropped the packet to signal + * congestion to the sender and prevent bufferbloat. + */ + QDISC_DROP_CONGESTED, + /** + * @QDISC_DROP_MAXFLOWS: packet dropped because the qdisc's flow + * tracking table is full and no free slots are available to allocate + * for a new flow. This indicates flow table exhaustion in flow-based + * qdiscs that maintain per-flow state (e.g., SFQ). + */ + QDISC_DROP_MAXFLOWS, + /** + * @QDISC_DROP_FLOOD_PROTECTION: packet dropped by flood protection + * mechanism detecting unresponsive flows (potential DoS/flood). + * Used by qdiscs implementing probabilistic drop algorithms like + * BLUE (e.g., CAKE's Cobalt AQM). + */ + QDISC_DROP_FLOOD_PROTECTION, + /** + * @QDISC_DROP_BAND_LIMIT: packet dropped because the priority band's + * limit was reached. Used by qdiscs with priority bands that have + * per-band packet limits (e.g., FQ). + */ + QDISC_DROP_BAND_LIMIT, + /** + * @QDISC_DROP_HORIZON_LIMIT: packet dropped because its timestamp + * is too far in the future (beyond the configured horizon). + * Used by qdiscs with time-based scheduling (e.g., FQ). + */ + QDISC_DROP_HORIZON_LIMIT, + /** + * @QDISC_DROP_FLOW_LIMIT: packet dropped because an individual flow + * exceeded its per-flow packet/depth limit. Used by FQ and SFQ qdiscs + * to enforce per-flow fairness and prevent a single flow from + * monopolizing queue resources. + */ + QDISC_DROP_FLOW_LIMIT, + /** + * @QDISC_DROP_L4S_STEP_NON_ECN: DualPI2 qdisc dropped a non-ECN-capable + * packet because the L4S queue delay exceeded the step threshold. + * Since the packet cannot be ECN-marked, it must be dropped to signal + * congestion. See RFC 9332 for the DualQ Coupled AQM step mechanism. + */ + QDISC_DROP_L4S_STEP_NON_ECN, + /** + * @QDISC_DROP_MAX: the maximum of qdisc drop reasons, which + * shouldn't be used as a real 'reason' - only for tracing code gen + */ + QDISC_DROP_MAX, +}; + +#undef FN +#undef FNe + +#endif diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 56cb7be92244..1df60645fb27 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -18,17 +18,17 @@ enum skb_drop_reason_subsys { SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE, /** - * @SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR: mac80211 drop reasons - * for frames still going to monitor, see net/mac80211/drop.h - */ - SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR, - - /** * @SKB_DROP_REASON_SUBSYS_OPENVSWITCH: openvswitch drop reasons, * see net/openvswitch/drop.h */ SKB_DROP_REASON_SUBSYS_OPENVSWITCH, + /** + * @SKB_DROP_REASON_SUBSYS_QDISC: TC qdisc drop reasons, + * see include/net/dropreason-qdisc.h + */ + SKB_DROP_REASON_SUBSYS_QDISC, + /** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */ SKB_DROP_REASON_SUBSYS_NUM }; diff --git a/include/net/dsa.h b/include/net/dsa.h index a0a9481c52c2..8b6d34e8a6f0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -54,11 +54,16 @@ struct tc_action; #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 +#define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29 +#define DSA_TAG_PROTO_YT921X_VALUE 30 +#define DSA_TAG_PROTO_MXL_GSW1XX_VALUE 31 +#define DSA_TAG_PROTO_MXL862_VALUE 32 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, + DSA_TAG_PROTO_BRCM_LEGACY_FCS = DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, @@ -85,6 +90,9 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, + DSA_TAG_PROTO_YT921X = DSA_TAG_PROTO_YT921X_VALUE, + DSA_TAG_PROTO_MXL_GSW1XX = DSA_TAG_PROTO_MXL_GSW1XX_VALUE, + DSA_TAG_PROTO_MXL862 = DSA_TAG_PROTO_MXL862_VALUE, }; struct dsa_switch; @@ -210,12 +218,6 @@ struct dsa_mall_mirror_tc_entry { bool ingress; }; -/* TC port policer entry */ -struct dsa_mall_policer_tc_entry { - u32 burst; - u64 rate_bytes_per_sec; -}; - /* TC matchall entry */ struct dsa_mall_tc_entry { struct list_head list; @@ -223,7 +225,7 @@ struct dsa_mall_tc_entry { enum dsa_port_mall_action_type type; union { struct dsa_mall_mirror_tc_entry mirror; - struct dsa_mall_policer_tc_entry policer; + struct flow_action_police policer; }; }; @@ -296,6 +298,7 @@ struct dsa_port { struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; + netdevice_tracker conduit_tracker; struct dsa_lag *lag; struct net_device *hsr_dev; @@ -828,6 +831,22 @@ dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst, return false; } +#define dsa_switch_for_each_bridge_member(_dp, _ds, _bdev) \ + dsa_switch_for_each_user_port(_dp, _ds) \ + if (dsa_port_offloads_bridge_dev(_dp, _bdev)) + +static inline u32 +dsa_bridge_ports(struct dsa_switch *ds, const struct net_device *bdev) +{ + struct dsa_port *dp; + u32 mask = 0; + + dsa_switch_for_each_bridge_member(dp, ds, bdev) + mask |= BIT(dp->index); + + return mask; +} + static inline bool dsa_port_tree_same(const struct dsa_port *a, const struct dsa_port *b) { @@ -1103,7 +1122,7 @@ struct dsa_switch_ops { void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, - struct dsa_mall_policer_tc_entry *policer); + const struct flow_action_police *policer); void (*port_policer_del)(struct dsa_switch *ds, int port); int (*port_setup_tc)(struct dsa_switch *ds, int port, enum tc_setup_type type, void *type_data); @@ -1131,9 +1150,10 @@ struct dsa_switch_ops { * PTP functionality */ int (*port_hwtstamp_get)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config); int (*port_hwtstamp_set)(struct dsa_switch *ds, int port, - struct ifreq *ifr); + struct kernel_hwtstamp_config *config, + struct netlink_ext_ack *extack); void (*port_txtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, @@ -1244,7 +1264,8 @@ struct dsa_switch_ops { dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); @@ -1307,11 +1328,6 @@ static inline int dsa_devlink_port_to_port(struct devlink_port *port) return port->index; } -struct dsa_switch_driver { - struct list_head list; - const struct dsa_switch_ops *ops; -}; - bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); @@ -1319,6 +1335,15 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); +int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack); +int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack); +int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port, + struct net_device *hsr); + /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { diff --git a/include/net/dst.h b/include/net/dst.h index 78c78cdce0e9..307073eae7f8 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -24,7 +24,10 @@ struct sk_buff; struct dst_entry { - struct net_device *dev; + union { + struct net_device *dev; + struct net_device __rcu *dev_rcu; + }; struct dst_ops *ops; unsigned long _metrics; unsigned long expires; @@ -216,6 +219,12 @@ static inline u32 dst_mtu(const struct dst_entry *dst) return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst); } +/* Variant of dst_mtu() for IPv4 users. */ +static inline u32 dst4_mtu(const struct dst_entry *dst) +{ + return INDIRECT_CALL_1(dst->ops->mtu, ipv4_mtu, dst); +} + /* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric) { @@ -240,9 +249,9 @@ static inline void dst_hold(struct dst_entry *dst) static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - if (unlikely(time != dst->lastuse)) { + if (unlikely(time != READ_ONCE(dst->lastuse))) { dst->__use++; - dst->lastuse = time; + WRITE_ONCE(dst->lastuse, time); } } @@ -431,13 +440,15 @@ static inline void dst_link_failure(struct sk_buff *skb) static inline void dst_set_expires(struct dst_entry *dst, int timeout) { - unsigned long expires = jiffies + timeout; + unsigned long old, expires = jiffies + timeout; if (expires == 0) expires = 1; - if (dst->expires == 0 || time_before(expires, dst->expires)) - dst->expires = expires; + old = READ_ONCE(dst->expires); + + if (!old || time_before(expires, old)) + WRITE_ONCE(dst->expires, expires); } static inline unsigned int dst_dev_overhead(struct dst_entry *dst, @@ -456,7 +467,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return INDIRECT_CALL_INET(skb_dst(skb)->output, + return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output), ip6_output, ip_output, net, sk, skb); } @@ -466,7 +477,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. */ static inline int dst_input(struct sk_buff *skb) { - return INDIRECT_CALL_INET(skb_dst(skb)->input, + return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input), ip6_input, ip_local_deliver, skb); } @@ -476,7 +487,7 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { - if (dst->obsolete) + if (READ_ONCE(dst->obsolete)) dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie); return dst; @@ -561,6 +572,41 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } +static inline struct net_device *dst_dev(const struct dst_entry *dst) +{ + return READ_ONCE(dst->dev); +} + +static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst) +{ + return rcu_dereference(dst->dev_rcu); +} + +static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst) +{ + return dev_net_rcu(dst_dev_rcu(dst)); +} + +static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) +{ + return dst_dev(skb_dst(skb)); +} + +static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb) +{ + return dst_dev_rcu(skb_dst(skb)); +} + +static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) +{ + return dev_net(skb_dst_dev(skb)); +} + +static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) +{ + return dev_net_rcu(skb_dst_dev_rcu(skb)); +} + struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 84c15402931c..1fc2fb03ce3f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -3,6 +3,7 @@ #define __NET_DST_METADATA_H 1 #include <linux/skbuff.h> +#include <net/ip.h> #include <net/ip_tunnels.h> #include <net/macsec.h> #include <net/dst.h> @@ -163,11 +164,8 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb) if (!new_md) return ERR_PTR(-ENOMEM); - unsafe_memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, - sizeof(struct ip_tunnel_info) + md_size, - /* metadata_dst_alloc() reserves room (md_size bytes) for - * options right after the ip_tunnel_info struct. - */); + memcpy(&new_md->u.tun_info, &md_dst->u.tun_info, + sizeof(struct ip_tunnel_info) + md_size); #ifdef CONFIG_DST_CACHE /* Unclone the dst cache if there is one */ if (new_md->u.tun_info.dst_cache.cache) { @@ -223,9 +221,15 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, int md_size) { const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; + + tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, flags, tunnel_id, md_size); - return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, flags, tunnel_id, md_size); + if (tun_dst && (iph->frag_off & htons(IP_DF))) + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, + tun_dst->u.tun_info.key.tun_flags); + return tun_dst; } static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr, diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 04383d90a1e3..6e68e359ad18 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -43,6 +43,10 @@ struct fib_rule { struct fib_kuid_range uid_range; struct fib_rule_port_range sport_range; struct fib_rule_port_range dport_range; + u16 sport_mask; + u16 dport_mask; + u8 iif_is_l3_master; + u8 oif_is_l3_master; struct rcu_head rcu; }; @@ -146,6 +150,17 @@ static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, ntohs(port) <= a->end; } +static inline bool fib_rule_port_match(const struct fib_rule_port_range *range, + u16 port_mask, __be16 port) +{ + if ((range->start ^ ntohs(port)) & port_mask) + return false; + if (!port_mask && fib_rule_port_range_set(range) && + !fib_rule_port_inrange(range, port)) + return false; + return true; +} + static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) { return a->start != 0 && a->end != 0 && a->end < 0xffff && @@ -159,6 +174,12 @@ static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, a->end == b->end; } +static inline bool +fib_rule_port_is_range(const struct fib_rule_port_range *range) +{ + return range->start != range->end; +} + static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) { return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto || @@ -178,10 +199,10 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(const struct net *net, int family); -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack); +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held); +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held); INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); diff --git a/include/net/flow.h b/include/net/flow.h index 335bbc52171c..ae9481c40063 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -12,6 +12,7 @@ #include <linux/atomic.h> #include <linux/container_of.h> #include <linux/uidgid.h> +#include <net/inet_dscp.h> struct flow_keys; @@ -32,12 +33,14 @@ struct flowi_common { int flowic_iif; int flowic_l3mdev; __u32 flowic_mark; - __u8 flowic_tos; + dscp_t flowic_dscp; __u8 flowic_scope; __u8 flowic_proto; __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 +#define FLOWI_FLAG_L3MDEV_OIF 0x04 +#define FLOWI_FLAG_ANY_SPORT 0x08 __u32 flowic_secid; kuid_t flowic_uid; __u32 flowic_multipath_hash; @@ -68,7 +71,7 @@ struct flowi4 { #define flowi4_iif __fl_common.flowic_iif #define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark -#define flowi4_tos __fl_common.flowic_tos +#define flowi4_dscp __fl_common.flowic_dscp #define flowi4_scope __fl_common.flowic_scope #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags @@ -101,7 +104,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_scope = scope; fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; @@ -139,7 +142,7 @@ struct flowi6 { #define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; - /* Note: flowi6_tos is encoded in flowlabel, too. */ + /* Note: flowi6_dscp is encoded in flowlabel, too. */ __be32 flowlabel; union flowi_uli uli; #define fl6_sport uli.ports.sport @@ -161,7 +164,7 @@ struct flowi { #define flowi_iif u.__fl_common.flowic_iif #define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark -#define flowi_tos u.__fl_common.flowic_tos +#define flowi_dscp u.__fl_common.flowic_dscp #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 596ab9791e4d..70a02ee14308 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -231,6 +231,21 @@ struct flow_action_cookie *flow_action_cookie_create(void *data, gfp_t gfp); void flow_action_cookie_destroy(struct flow_action_cookie *cookie); +struct flow_action_police { + u32 burst; + u64 rate_bytes_ps; + u64 peakrate_bytes_ps; + u32 avrate; + u16 overhead; + u64 burst_pkt; + u64 rate_pkt_ps; + u32 mtu; + struct { + enum flow_action_id act_id; + u32 extval; + } exceed, notexceed; +}; + struct flow_action_entry { enum flow_action_id id; u32 hw_index; @@ -275,20 +290,7 @@ struct flow_action_entry { u32 trunc_size; bool truncate; } sample; - struct { /* FLOW_ACTION_POLICE */ - u32 burst; - u64 rate_bytes_ps; - u64 peakrate_bytes_ps; - u32 avrate; - u16 overhead; - u64 burst_pkt; - u64 rate_pkt_ps; - u32 mtu; - struct { - enum flow_action_id act_id; - u32 extval; - } exceed, notexceed; - } police; + struct flow_action_police police; /* FLOW_ACTION_POLICE */ struct { /* FLOW_ACTION_CT */ int action; u16 zone; @@ -526,7 +528,7 @@ static inline bool flow_rule_has_enc_control_flags(const u32 enc_ctrl_flags, * * Return: true if control flags are set, false otherwise. */ -static inline bool flow_rule_match_has_control_flags(struct flow_rule *rule, +static inline bool flow_rule_match_has_control_flags(const struct flow_rule *rule, struct netlink_ext_ack *extack) { struct flow_match_control match; @@ -718,7 +720,7 @@ struct flow_offload_action { struct flow_offload_action *offload_action_alloc(unsigned int num_actions); static inline struct flow_rule * -flow_cls_offload_flow_rule(struct flow_cls_offload *flow_cmd) +flow_cls_offload_flow_rule(const struct flow_cls_offload *flow_cmd) { return flow_cmd->rule; } diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 9467e33dfb36..8aca881937da 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -358,7 +358,7 @@ static int fq_init(struct fq *fq, int flows_cnt) fq->limit = 8192; fq->memory_limit = 16 << 20; /* 16 MBytes */ - fq->flows = kvcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL); + fq->flows = kvzalloc_objs(fq->flows[0], fq->flows_cnt); if (!fq->flows) return -ENOMEM; diff --git a/include/net/genetlink.h b/include/net/genetlink.h index a03d56765832..d70510ac31ab 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -62,7 +62,7 @@ struct genl_info; * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family * @split_ops: the split do/dump form of operation definition - * @n_split_ops: number of entries in @split_ops, not that with split do/dump + * @n_split_ops: number of entries in @split_ops, note that with split do/dump * ops the number of entries is not the same as number of commands * @sock_priv_size: the size of per-socket private memory * @sock_priv_init: the per-socket private memory initializer @@ -489,8 +489,10 @@ genlmsg_multicast_netns_filtered(const struct genl_family *family, netlink_filter_fn filter, void *filter_data) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) + if (WARN_ON_ONCE(group >= family->n_mcgrps)) { + nlmsg_free(skb); return -EINVAL; + } group = family->mcgrp_offset + group; return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, flags, filter, filter_data); diff --git a/include/net/gro.h b/include/net/gro.h index 7b548f91754b..2300b6da05b2 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -71,14 +71,11 @@ struct napi_gro_cb { /* Free the skb? */ u8 free:2; - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; - /* Used in GRE, set in fou/gue_gro_receive */ u8 is_fou:1; /* Used to determine if ipid_offset can be ignored */ - u8 ip_fixedid:1; + u8 ip_fixedid:2; /* Number of gro_receive callbacks this packet already went through */ u8 recursion_counter:4; @@ -408,9 +405,8 @@ INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, - struct sk_buff *)); -INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); +struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *); +int udp6_gro_complete(struct sk_buff *, int); #define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \ ({ \ @@ -445,29 +441,25 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb, } static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2, - struct sk_buff *p, bool outer) + struct sk_buff *p, bool inner) { const u32 id = ntohl(*(__be32 *)&iph->id); const u32 id2 = ntohl(*(__be32 *)&iph2->id); const u16 ipid_offset = (id >> 16) - (id2 >> 16); const u16 count = NAPI_GRO_CB(p)->count; - const u32 df = id & IP_DF; - int flush; /* All fields must match except length and checksum. */ - flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF)); - - if (flush | (outer && df)) - return flush; + if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | ((id ^ id2) & IP_DF)) + return true; /* When we receive our second frame we can make a decision on if we * continue this flow as an atomic flow with a fixed ID or if we use * an incrementing ID. */ - if (count == 1 && df && !ipid_offset) - NAPI_GRO_CB(p)->ip_fixedid = true; + if (count == 1 && !ipid_offset) + NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner; - return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid); + return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner))); } static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2) @@ -482,7 +474,7 @@ static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr static inline int __gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p, const u16 diff, - bool outer) + bool inner) { const void *nh = th - diff; const void *nh2 = th2 - diff; @@ -490,47 +482,70 @@ static inline int __gro_receive_network_flush(const void *th, const void *th2, if (((struct iphdr *)nh)->version == 6) return ipv6_gro_flush(nh, nh2); else - return inet_gro_flush(nh, nh2, p, outer); + return inet_gro_flush(nh, nh2, p, inner); } static inline int gro_receive_network_flush(const void *th, const void *th2, struct sk_buff *p) { - const bool encap_mark = NAPI_GRO_CB(p)->encap_mark; int off = skb_transport_offset(p); int flush; - flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark); - if (encap_mark) - flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false); + flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, false); + if (NAPI_GRO_CB(p)->encap_mark) + flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, true); return flush; } int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); +void __gro_flush(struct gro_node *gro, bool flush_old); + +static inline void gro_flush(struct gro_node *gro, bool flush_old) +{ + if (!gro->bitmask) + return; + + __gro_flush(gro, flush_old); +} + +static inline void napi_gro_flush(struct napi_struct *napi, bool flush_old) +{ + gro_flush(&napi->gro, flush_old); +} /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ -static inline void gro_normal_list(struct napi_struct *napi) +static inline void gro_normal_list(struct gro_node *gro) { - if (!napi->rx_count) + if (!gro->rx_count) return; - netif_receive_skb_list_internal(&napi->rx_list); - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; + netif_receive_skb_list_internal(&gro->rx_list); + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; +} + +static inline void gro_flush_normal(struct gro_node *gro, bool flush_old) +{ + gro_flush(gro, flush_old); + gro_normal_list(gro); } /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ -static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs) +static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb, + int segs) { - list_add_tail(&skb->list, &napi->rx_list); - napi->rx_count += segs; - if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) - gro_normal_list(napi); + list_add_tail(&skb->list, &gro->rx_list); + gro->rx_count += segs; + if (gro->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch)) + gro_normal_list(gro); } +void gro_init(struct gro_node *gro); +void gro_cleanup(struct gro_node *gro); + /* This function is the alternative of 'inet_iif' and 'inet_sdif' * functions in case we can not rely on fields of IPCB. * @@ -577,4 +592,31 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int * struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); +static inline struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) +{ + unsigned int thlen, hlen, off; + struct tcphdr *th; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*th); + th = skb_gro_header(skb, hlen, off); + if (unlikely(!th)) + return NULL; + + thlen = th->doff * 4; + if (unlikely(thlen < sizeof(*th))) + return NULL; + + hlen = off + thlen; + if (!skb_gro_may_pull(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + return NULL; + } + + skb_gro_pull(skb, thlen); + + return th; +} + #endif /* _NET_GRO_H */ diff --git a/include/net/hotdata.h b/include/net/hotdata.h index 30e9570beb2a..62534d1f3c70 100644 --- a/include/net/hotdata.h +++ b/include/net/hotdata.h @@ -2,9 +2,18 @@ #ifndef _NET_HOTDATA_H #define _NET_HOTDATA_H +#include <linux/llist.h> #include <linux/types.h> #include <linux/netdevice.h> #include <net/protocol.h> +#ifdef CONFIG_RPS +#include <net/rps-types.h> +#endif + +struct skb_defer_node { + struct llist_head defer_list; + atomic_long_t defer_count; +} ____cacheline_aligned_in_smp; /* Read mostly data used in network fast paths. */ struct net_hotdata { @@ -23,19 +32,20 @@ struct net_hotdata { struct net_offload udpv6_offload; #endif struct list_head offload_base; - struct list_head ptype_all; struct kmem_cache *skbuff_cache; struct kmem_cache *skbuff_fclone_cache; struct kmem_cache *skb_small_head_cache; #ifdef CONFIG_RPS - struct rps_sock_flow_table __rcu *rps_sock_flow_table; + rps_tag_ptr rps_sock_flow_table; u32 rps_cpu_mask; #endif + struct skb_defer_node __percpu *skb_defer_nodes; int gro_normal_batch; int netdev_budget; int netdev_budget_usecs; int tstamp_prequeue; int max_backlog; + int qdisc_max_burst; int dev_tx_weight; int dev_rx_weight; int sysctl_max_skb_frags; diff --git a/include/net/icmp.h b/include/net/icmp.h index caddf4a59ad1..935ee13d9ae9 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -37,10 +37,10 @@ struct sk_buff; struct net; void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt); + const struct inet_skb_parm *parm); static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); + __icmp_send(skb_in, type, code, info, IPCB(skb_in)); } #if IS_ENABLED(CONFIG_NF_NAT) @@ -48,8 +48,10 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); #else static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { - struct ip_options opts = { 0 }; - __icmp_send(skb_in, type, code, info, &opts); + struct inet_skb_parm parm; + + memset(&parm, 0, sizeof(parm)); + __icmp_send(skb_in, type, code, info, &parm); } #endif diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index 813e163ce27c..c60867e7e43c 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2017 Intel Deutschland GmbH - * Copyright (c) 2018-2019, 2021-2022 Intel Corporation + * Copyright (c) 2018-2019, 2021-2022, 2025 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -202,6 +202,24 @@ enum ieee80211_radiotap_vht_coding { IEEE80211_RADIOTAP_CODING_LDPC_USER3 = 0x08, }; +enum ieee80211_radiotap_vht_bandwidth { + /* Note: more values are defined but can't really be used */ + IEEE80211_RADIOTAP_VHT_BW_20 = 0, + IEEE80211_RADIOTAP_VHT_BW_40 = 1, + IEEE80211_RADIOTAP_VHT_BW_80 = 4, + IEEE80211_RADIOTAP_VHT_BW_160 = 11, +}; + +struct ieee80211_radiotap_vht { + __le16 known; + u8 flags; + u8 bandwidth; + u8 mcs_nss[4]; + u8 coding; + u8 group_id; + __le16 partial_aid; +} __packed; + /* for IEEE80211_RADIOTAP_TIMESTAMP */ enum ieee80211_radiotap_timestamp_unit_spos { IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MASK = 0x000F, diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index 025bd8d3c769..b814e1acc512 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -18,12 +18,14 @@ struct sk_buff; struct sock; struct sockaddr; -struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, - const struct request_sock *req, u8 proto); +struct dst_entry *inet6_csk_route_socket(struct sock *sk, + struct flowi6 *fl6); -void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); +struct dst_entry *inet6_csk_route_req(const struct sock *sk, + struct dst_entry *dst, + struct flowi6 *fl6, + const struct request_sock *req, u8 proto); int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); -struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu); #endif /* _INET6_CONNECTION_SOCK_H */ diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 74dd90ff5f12..2cc5d416bbb5 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -24,6 +24,8 @@ struct inet_hashinfo; +void inet6_init_ehash_secret(void); + static inline unsigned int __inet6_ehashfn(const u32 lhash, const u16 lport, const u32 fhash, @@ -41,7 +43,6 @@ static inline unsigned int __inet6_ehashfn(const u32 lhash, * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, @@ -65,7 +66,6 @@ struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk, inet6_ehashfn_t *ehashfn); struct sock *inet6_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -83,7 +83,6 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net, inet6_ehashfn_t *ehashfn); static inline struct sock *__inet6_lookup(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, @@ -92,14 +91,14 @@ static inline struct sock *__inet6_lookup(const struct net *net, const int dif, const int sdif, bool *refcounted) { - struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr, - sport, daddr, hnum, + struct sock *sk = __inet6_lookup_established(net, saddr, sport, + daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return inet6_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } @@ -143,14 +142,13 @@ struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +static inline struct sock *__inet6_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, int iif, int sdif, bool *refcounted) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net_rcu(skb); const struct ipv6hdr *ip6h = ipv6_hdr(skb); struct sock *sk; @@ -161,20 +159,16 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet6_lookup(net, hashinfo, skb, - doff, &ip6h->saddr, sport, + return __inet6_lookup(net, skb, doff, &ip6h->saddr, sport, &ip6h->daddr, ntohs(dport), iif, sdif, refcounted); } -struct sock *inet6_lookup(const struct net *net, struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, +struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, const int dif); -int inet6_hash(struct sock *sk); - static inline bool inet6_match(const struct net *net, const struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, @@ -183,7 +177,7 @@ static inline bool inet6_match(const struct net *net, const struct sock *sk, { if (!net_eq(sock_net(sk), net) || sk->sk_family != AF_INET6 || - sk->sk_portpair != ports || + READ_ONCE(sk->sk_portpair) != ports || !ipv6_addr_equal(&sk->sk_v6_daddr, saddr) || !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) return false; diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c17a6585d0b0..3d747896be30 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -19,15 +19,14 @@ struct msghdr; struct net; struct page; struct sock; -struct sockaddr; struct socket; int inet_release(struct socket *sock); -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); -int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags, int is_sendmsg); -int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); int inet_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg); @@ -42,8 +41,8 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); -int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); +int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ @@ -52,7 +51,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); #define BIND_FROM_BPF (1 << 2) /* Skip CAP_NET_BIND_SERVICE check. */ #define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3) -int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, +int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); @@ -60,8 +59,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net); -int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, - int *addr_len); +int inet_recv_error(struct sock *sk, struct msghdr *msg, int len); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb); int inet_gro_complete(struct sk_buff *skb, int nhoff); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c7f42844c79a..433c2df23076 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -34,7 +34,7 @@ struct tcp_congestion_ops; */ struct inet_connection_sock_af_ops { int (*queue_xmit)(struct sock *sk, struct sk_buff *skb, struct flowi *fl); - void (*send_check)(struct sock *sk, struct sk_buff *skb); + u16 net_header_len; int (*rebuild_header)(struct sock *sk); void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb); int (*conn_request)(struct sock *sk, struct sk_buff *skb); @@ -42,14 +42,13 @@ struct inet_connection_sock_af_ops { struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, - bool *own_req); - u16 net_header_len; - u16 sockaddr_len; + bool *own_req, + void (*opt_child_init)(struct sock *newsk, + const struct sock *sk)); int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); - void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); void (*mtu_reduced)(struct sock *sk); }; @@ -58,15 +57,15 @@ struct inet_connection_sock_af_ops { * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node * @icsk_bind2_hash: Bind node in the bhash2 table - * @icsk_timeout: Timeout - * @icsk_retransmit_timer: Resend (no ack) + * @icsk_delack_timer: Delayed ACK timer + * @icsk_keepalive_timer: Keepalive timer + * @mptcp_tout_timer: mptcp timer * @icsk_rto: Retransmit timeout * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ulp_ops Pluggable ULP control hook * @icsk_ulp_data ULP private data - * @icsk_clean_acked Clean acked data hook * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -85,18 +84,20 @@ struct inet_connection_sock { struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; struct inet_bind2_bucket *icsk_bind2_hash; - unsigned long icsk_timeout; - struct timer_list icsk_retransmit_timer; - struct timer_list icsk_delack_timer; + struct timer_list icsk_delack_timer; + union { + struct timer_list icsk_keepalive_timer; + struct timer_list mptcp_tout_timer; + }; __u32 icsk_rto; __u32 icsk_rto_min; + u32 icsk_rto_max; __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; const struct tcp_ulp_ops *icsk_ulp_ops; void __rcu *icsk_ulp_data; - void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state:5, icsk_ca_initialized:1, @@ -116,8 +117,8 @@ struct inet_connection_sock { #define ATO_BITS 8 __u32 ato:ATO_BITS, /* Predicted tick of soft clock */ lrcv_flowlabel:20, /* last received ipv6 flowlabel */ - unused:4; - unsigned long timeout; /* Currently scheduled timeout */ + dst_quick_ack:1, /* cache dst RTAX_QUICKACK */ + unused:3; __u32 lrcvtime; /* timestamp of last received data packet */ __u16 last_seg_size; /* Size of last incoming segment */ __u16 rcv_mss; /* MSS used for delayed ACK decisions */ @@ -189,8 +190,16 @@ static inline void inet_csk_delack_init(struct sock *sk) memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); } -void inet_csk_delete_keepalive_timer(struct sock *sk); -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout); +static inline unsigned long tcp_timeout_expires(const struct sock *sk) +{ + return READ_ONCE(sk->tcp_retransmit_timer.expires); +} + +static inline unsigned long +icsk_delack_timeout(const struct inet_connection_sock *icsk) +{ + return READ_ONCE(icsk->icsk_delack_timer.expires); +} static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) { @@ -199,7 +208,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { smp_store_release(&icsk->icsk_pending, 0); #ifdef INET_CSK_CLEAR_TIMERS - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &sk->tcp_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, 0); @@ -227,16 +236,15 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, when = max_when; } + when += jiffies; if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { smp_store_release(&icsk->icsk_pending, what); - icsk->icsk_timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + sk_reset_timer(sk, &sk->tcp_retransmit_timer, when); } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_TIMER); - icsk->icsk_ack.timeout = jiffies + when; - sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + sk_reset_timer(sk, &icsk->icsk_delack_timer, when); } else { pr_debug("inet_csk BUG: unknown timer value\n"); } @@ -264,8 +272,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child); -bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout); +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req); @@ -288,22 +295,8 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); -static inline unsigned long -reqsk_timeout(struct request_sock *req, unsigned long max_timeout) -{ - u64 timeout = (u64)req->timeout << req->num_timeout; - - return (unsigned long)min_t(u64, timeout, max_timeout); -} - -static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) -{ - /* The below has to be done to allow calling inet_csk_destroy_sock */ - sock_set_flag(sk, SOCK_DEAD); - this_cpu_inc(*sk->sk_prot->orphan_count); -} - void inet_csk_destroy_sock(struct sock *sk); +void inet_csk_prepare_for_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); /* @@ -318,11 +311,10 @@ static inline __poll_t inet_csk_listen_poll(const struct sock *sk) int inet_csk_listen_start(struct sock *sk); void inet_csk_listen_stop(struct sock *sk); -void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); - /* update the fast reuse flag when adding a socket */ -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk); +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2); struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu); diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h index 72f250dffada..1aa9f04ed1ab 100644 --- a/include/net/inet_dscp.h +++ b/include/net/inet_dscp.h @@ -39,6 +39,12 @@ typedef u8 __bitwise dscp_t; #define INET_DSCP_MASK 0xfc +/* A few places in the IPv4 code need to ignore the three high order bits of + * DSCP because of backward compatibility (as these bits used to represent the + * IPv4 Precedence in RFC 791's TOS field and were ignored). + */ +#define INET_DSCP_LEGACY_TOS_MASK ((__force dscp_t)0x1c) + static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield) { return (__force dscp_t)(dsfield & INET_DSCP_MASK); diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index ea32393464a2..827b87a95dab 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -51,11 +51,25 @@ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) return outer; } +/* Apply either ECT(0) or ECT(1) */ +static inline void __INET_ECN_xmit(struct sock *sk, bool use_ect_1) +{ + __u8 ect = use_ect_1 ? INET_ECN_ECT_1 : INET_ECN_ECT_0; + + /* Mask the complete byte in case the connection alternates between + * ECT(0) and ECT(1). + */ + inet_sk(sk)->tos &= ~INET_ECN_MASK; + inet_sk(sk)->tos |= ect; + if (inet6_sk(sk)) { + inet6_sk(sk)->tclass &= ~INET_ECN_MASK; + inet6_sk(sk)->tclass |= ect; + } +} + static inline void INET_ECN_xmit(struct sock *sk) { - inet_sk(sk)->tos |= INET_ECN_ECT_0; - if (inet6_sk(sk) != NULL) - inet6_sk(sk)->tclass |= INET_ECN_ECT_0; + __INET_ECN_xmit(sk, false); } static inline void INET_ECN_dontxmit(struct sock *sk) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 5af6eb14c5db..365925c9d262 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -123,31 +123,19 @@ void inet_frags_fini(struct inet_frags *); int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); -static inline void fqdir_pre_exit(struct fqdir *fqdir) -{ - /* Prevent creation of new frags. - * Pairs with READ_ONCE() in inet_frag_find(). - */ - WRITE_ONCE(fqdir->high_thresh, 0); - - /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire() - * and ip6frag_expire_frag_queue(). - */ - WRITE_ONCE(fqdir->dead, true); -} +void fqdir_pre_exit(struct fqdir *fqdir); void fqdir_exit(struct fqdir *fqdir); -void inet_frag_kill(struct inet_frag_queue *q); +void inet_frag_kill(struct inet_frag_queue *q, int *refs); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); -/* Free all skbs in the queue; return the sum of their truesizes. */ -unsigned int inet_frag_rbtree_purge(struct rb_root *root, - enum skb_drop_reason reason); +void inet_frag_queue_flush(struct inet_frag_queue *q, + enum skb_drop_reason reason); -static inline void inet_frag_put(struct inet_frag_queue *q) +static inline void inet_frag_putn(struct inet_frag_queue *q, int refs) { - if (refcount_dec_and_test(&q->refcnt)) + if (refs && refcount_sub_and_test(refs, &q->refcnt)) inet_frag_destroy(q); } diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 5eea47f135a4..6e2fe186d0dc 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -89,6 +89,7 @@ struct inet_bind_bucket { bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; + struct rcu_head rcu; }; struct inet_bind2_bucket { @@ -107,6 +108,8 @@ struct inet_bind2_bucket { struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; + signed char fastreuse; + signed char fastreuseport; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) @@ -174,14 +177,9 @@ struct inet_hashinfo { bool pernet; } ____cacheline_aligned_in_smp; -static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) +static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk) { -#if IS_ENABLED(CONFIG_IP_DCCP) - return sk->sk_prot->h.hashinfo ? : - sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#else return sock_net(sk)->ipv4.tcp_death_row.hashinfo; -#endif } static inline struct inet_listen_hashbucket * @@ -206,12 +204,6 @@ static inline spinlock_t *inet_ehash_lockp( int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); -static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) -{ - kfree(h->lhash2); - h->lhash2 = NULL; -} - static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); @@ -226,8 +218,7 @@ struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); -void inet_bind_bucket_destroy(struct kmem_cache *cachep, - struct inet_bind_bucket *tb); +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, @@ -273,6 +264,20 @@ inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } +static inline bool inet_use_hash2_on_bind(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + return false; + + if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) + return true; + } +#endif + return sk->sk_rcv_saddr != htonl(INADDR_ANY); +} + struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); @@ -295,17 +300,14 @@ void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit); -int inet_hashinfo2_init_mod(struct inet_hashinfo *h); bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk); -int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); struct sock *__inet_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, @@ -313,12 +315,12 @@ struct sock *__inet_lookup_listener(const struct net *net, const int dif, const int sdif); static inline struct sock *inet_lookup_listener(struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - __be32 saddr, __be16 sport, - __be32 daddr, __be16 dport, int dif, int sdif) + struct sk_buff *skb, int doff, + __be32 saddr, __be16 sport, + __be32 daddr, __be16 dport, + int dif, int sdif) { - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, sdif); } @@ -356,7 +358,7 @@ static inline bool inet_match(const struct net *net, const struct sock *sk, int dif, int sdif) { if (!net_eq(sock_net(sk), net) || - sk->sk_portpair != ports || + READ_ONCE(sk->sk_portpair) != ports || sk->sk_addrpair != cookie) return false; @@ -369,7 +371,6 @@ static inline bool inet_match(const struct net *net, const struct sock *sk, * not check it for lookups anymore, thanks Alexey. -DaveM */ struct sock *__inet_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 hnum, const int dif, const int sdif); @@ -395,18 +396,16 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net, __be32 daddr, u16 hnum, const int dif, inet_ehashfn_t *ehashfn); -static inline struct sock * - inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const __be16 dport, - const int dif) +static inline struct sock *inet_lookup_established(struct net *net, + const __be32 saddr, const __be16 sport, + const __be32 daddr, const __be16 dport, + const int dif) { - return __inet_lookup_established(net, hashinfo, saddr, sport, daddr, + return __inet_lookup_established(net, saddr, sport, daddr, ntohs(dport), dif, 0); } static inline struct sock *__inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -416,18 +415,17 @@ static inline struct sock *__inet_lookup(struct net *net, u16 hnum = ntohs(dport); struct sock *sk; - sk = __inet_lookup_established(net, hashinfo, saddr, sport, + sk = __inet_lookup_established(net, saddr, sport, daddr, hnum, dif, sdif); *refcounted = true; if (sk) return sk; *refcounted = false; - return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, + return __inet_lookup_listener(net, skb, doff, saddr, sport, daddr, hnum, dif, sdif); } static inline struct sock *inet_lookup(struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, const __be16 sport, const __be32 daddr, const __be16 dport, @@ -436,7 +434,7 @@ static inline struct sock *inet_lookup(struct net *net, struct sock *sk; bool refcounted; - sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet_lookup(net, skb, doff, saddr, sport, daddr, dport, dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) @@ -484,15 +482,14 @@ struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff, return reuse_sk; } -static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, - struct sk_buff *skb, +static inline struct sock *__inet_lookup_skb(struct sk_buff *skb, int doff, const __be16 sport, const __be16 dport, const int sdif, bool *refcounted) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net_rcu(skb); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; @@ -503,8 +500,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, if (sk) return sk; - return __inet_lookup(net, hashinfo, skb, - doff, iph->saddr, sport, + return __inet_lookup(net, skb, doff, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), sdif, refcounted); } @@ -527,9 +523,12 @@ static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr) int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, + u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, - struct inet_timewait_sock **)); + struct inet_timewait_sock **, + bool rcu_lookup, + u32 hash)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 1086256549fa..7cdcbed3e5cb 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -26,6 +26,8 @@ #include <net/tcp_states.h> #include <net/l3mdev.h> +#define IP_OPTIONS_DATA_FIXED_SIZE 40 + /** struct ip_options - IP Options * * @faddr - Saved first hop address @@ -58,12 +60,9 @@ struct ip_options { struct ip_options_rcu { struct rcu_head rcu; - struct ip_options opt; -}; -struct ip_options_data { - struct ip_options_rcu opt; - char data[40]; + /* Must be last as it ends in a flexible-array member. */ + struct ip_options opt; }; struct inet_request_sock { @@ -101,10 +100,7 @@ struct inet_request_sock { }; }; -static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) -{ - return (struct inet_request_sock *)sk; -} +#define inet_rsk(ptr) container_of_const(ptr, struct inet_request_sock, req) static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { @@ -163,6 +159,13 @@ static inline bool inet_sk_bound_dev_eq(const struct net *net, #endif } +struct inet6_cork { + struct ipv6_txoptions *opt; + u8 hop_limit; + u8 tclass; + u8 dontfrag:1; +}; + struct inet_cork { unsigned int flags; __be32 addr; @@ -183,6 +186,9 @@ struct inet_cork { struct inet_cork_full { struct inet_cork base; struct flowi fl; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_cork base6; +#endif }; struct ip_mc_socklist; @@ -214,6 +220,7 @@ struct inet_sock { struct sock sk; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *pinet6; + struct ipv6_fl_socklist __rcu *ipv6_fl_list; #endif /* Socket demultiplex comparisons on incoming packets. */ #define inet_daddr sk.__sk_common.skc_daddr @@ -354,14 +361,6 @@ static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) #define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk) -static inline void __inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from, - const int ancestor_size) -{ - memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, - sk_from->sk_prot->obj_size - ancestor_size); -} - int inet_sk_rebuild_header(struct sock *sk); /** diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 67a313575780..63a644ff30de 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -70,7 +70,8 @@ struct inet_timewait_sock { unsigned int tw_transparent : 1, tw_flowlabel : 20, tw_usec_ts : 1, - tw_pad : 2, /* 2 bits hole */ + tw_connect_bind : 1, + tw_pad : 1, /* 1 bit hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; @@ -81,6 +82,14 @@ struct inet_timewait_sock { struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; +#endif +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff* (*tw_validate_xmit_skb)(struct sock *sk, + struct net_device *dev, + struct sk_buff *skb); +#endif }; #define tw_tclass tw_tos diff --git a/include/net/ioam6.h b/include/net/ioam6.h index 2cbbee6e806a..a75912fe247e 100644 --- a/include/net/ioam6.h +++ b/include/net/ioam6.h @@ -60,6 +60,8 @@ void ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_trace_hdr *trace, bool is_input); +u8 ioam6_trace_compute_nodelen(u32 trace_type); + int ioam6_init(void); void ioam6_exit(void); diff --git a/include/net/ip.h b/include/net/ip.h index ba7b43447775..7f2fe1a8401b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -59,6 +59,7 @@ struct inet_skb_parm { #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) #define IPSKB_MULTIPATH BIT(9) +#define IPSKB_MCROUTE BIT(10) u16 frag_max_size; }; @@ -92,14 +93,15 @@ static inline void ipcm_init(struct ipcm_cookie *ipcm) static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, const struct inet_sock *inet) { - ipcm_init(ipcm); + *ipcm = (struct ipcm_cookie) { + .tos = READ_ONCE(inet->tos), + }; + + sockcm_init(&ipcm->sockc, &inet->sk); - ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); - ipcm->sockc.priority = READ_ONCE(inet->sk.sk_priority); - ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; - ipcm->protocol = inet->inet_num; + ipcm->protocol = READ_ONCE(inet->inet_num); } #define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb)) @@ -166,6 +168,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); +int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -257,16 +260,9 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, return RT_SCOPE_UNIVERSE; } -static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet) -{ - u8 dsfield = ipc->tos != -1 ? ipc->tos : READ_ONCE(inet->tos); - - return dsfield & INET_DSCP_MASK; -} - /* datagram.c */ -int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); -int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); +int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); void ip4_datagram_release_cb(struct sock *sk); @@ -330,11 +326,12 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } #endif -#define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \ +#define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt, \ + mib_statistic, offset) \ { \ int i, c; \ for_each_possible_cpu(c) { \ - for (i = 0; stats_list[i].name; i++) \ + for (i = 0; i < cnt; i++) \ buff64[i] += snmp_get_cpu_field64( \ mib_statistic, \ c, stats_list[i].entry, \ @@ -342,11 +339,11 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } -#define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \ +#define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \ { \ int i, c; \ for_each_possible_cpu(c) { \ - for (i = 0; stats_list[i].name; i++) \ + for (i = 0; i < cnt; i++) \ buff[i] += snmp_get_cpu_field( \ mib_statistic, \ c, stats_list[i].entry); \ @@ -363,7 +360,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL -static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) +static inline bool inet_is_local_reserved_port(const struct net *net, unsigned short port) { if (!net->ipv4.sysctl_local_reserved_ports) return false; @@ -471,17 +468,19 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { const struct rtable *rt = dst_rtable(dst); + const struct net_device *dev; unsigned int mtu, res; struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + dev = dst_dev_rcu(dst); + net = dev_net_rcu(dev); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { mtu = rt->rt_pmtu; - if (mtu && time_before(jiffies, rt->dst.expires)) + if (mtu && time_before(jiffies, READ_ONCE(rt->dst.expires))) goto out; } @@ -490,7 +489,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, if (mtu) goto out; - mtu = READ_ONCE(dst->dev->mtu); + mtu = READ_ONCE(dev->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) @@ -510,16 +509,17 @@ out: static inline unsigned int ip_skb_dst_mtu(struct sock *sk, const struct sk_buff *skb) { + const struct dst_entry *dst = skb_dst(skb); unsigned int mtu; if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; - return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); + return ip_dst_mtu_maybe_forward(dst, forwarding); } - mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); - return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); + mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU); + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, @@ -804,7 +804,7 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)); -int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len); void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h index c8a96b888277..6677b3cc3972 100644 --- a/include/net/ip6_checksum.h +++ b/include/net/ip6_checksum.h @@ -82,6 +82,4 @@ static inline __sum16 udp_v6_check(int len, void udp6_set_csum(bool nocheck, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, int len); - -int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto); #endif diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 7c87873ae211..9cd27e1b9b69 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -198,6 +198,7 @@ struct fib6_info { fib6_destroying:1, unused:4; + struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; @@ -485,11 +486,30 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) rcu_read_unlock(); } +#if IS_ENABLED(CONFIG_IPV6) int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); void fib6_nh_release_dsts(struct fib6_nh *fib6_nh); +#else +static inline int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel"); + return -EAFNOSUPPORT; +} + +static inline void fib6_nh_release(struct fib6_nh *fib6_nh) +{ +} + +static inline void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) +{ +} +#endif + int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, @@ -501,17 +521,26 @@ int call_fib6_multipath_entry_notifiers(struct net *net, unsigned int nsiblings, struct netlink_ext_ack *extack); int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt); +#if IS_ENABLED(CONFIG_IPV6) void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info); +#else +static inline void fib6_rt_update(struct net *net, struct fib6_info *rt, + struct nl_info *info) +{ +} +#endif void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); +void fib6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now); void fib6_run_gc(unsigned long expires, struct net *net, bool force); - void fib6_gc_cleanup(void); int fib6_init(void); +#if IS_ENABLED(CONFIG_IPV6) /* Add the route to the gc list if it is not already there * * The callers should hold f6i->fib6_table->tb6_lock. @@ -544,6 +573,23 @@ static inline void fib6_remove_gc_list(struct fib6_info *f6i) hlist_del_init(&f6i->gc_link); } +static inline void fib6_may_remove_gc_list(struct net *net, + struct fib6_info *f6i) +{ + struct fib6_gc_args gc_args; + + if (hlist_unhashed(&f6i->gc_link)) + return; + + gc_args.timeout = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval); + gc_args.more = 0; + + rcu_read_lock(); + fib6_age_exceptions(f6i, &gc_args, jiffies); + rcu_read_unlock(); +} +#endif + struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; @@ -568,8 +614,13 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); void fib6_update_sernum(struct net *net, struct fib6_info *rt); +#if IS_ENABLED(CONFIG_IPV6) void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); -void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i); +#else +static inline void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) +{ +} +#endif void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) @@ -579,7 +630,7 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed); -#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +#if IS_ENABLED(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) struct bpf_iter__ipv6_route { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct fib6_info *, rt); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 6dbdf60b342f..09ffe0f13ce7 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -77,7 +77,14 @@ static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) f6i->fib6_nh->fib_nh_gw_family; } +#if IS_ENABLED(CONFIG_IPV6) void ip6_route_input(struct sk_buff *skb); +#else +static inline void ip6_route_input(struct sk_buff *skb) +{ +} +#endif + struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, struct flowi6 *fl6, @@ -119,7 +126,15 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct fib6_info *f6i); +#if IS_ENABLED(CONFIG_IPV6) int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify); +#else +static inline int ip6_del_rt(struct net *net, struct fib6_info *f6i, + bool skip_notify) +{ + return -EAFNOSUPPORT; +} +#endif void rt6_flush_exceptions(struct fib6_info *f6i); void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, @@ -229,16 +244,16 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, - const struct in6_addr *daddr, - const struct in6_addr *saddr) + bool daddr_set, + bool saddr_set) { struct ipv6_pinfo *np = inet6_sk(sk); np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); sk_setup_caps(sk, dst); - np->daddr_cache = daddr; + np->daddr_cache = daddr_set; #ifdef CONFIG_IPV6_SUBTREES - np->saddr_cache = saddr; + np->saddr_cache = saddr_set; #endif } @@ -252,19 +267,43 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb) return rt->rt6i_flags & RTF_LOCAL; } +static inline bool __ipv6_anycast_destination(const struct rt6key *rt6i_dst, + u32 rt6i_flags, + const struct in6_addr *daddr) +{ + return rt6i_flags & RTF_ANYCAST || + (rt6i_dst->plen < 127 && + !(rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && + ipv6_addr_equal(&rt6i_dst->addr, daddr)); +} + static inline bool ipv6_anycast_destination(const struct dst_entry *dst, const struct in6_addr *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); - return rt->rt6i_flags & RTF_ANYCAST || - (rt->rt6i_dst.plen < 127 && - !(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && - ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)); + return __ipv6_anycast_destination(&rt->rt6i_dst, rt->rt6i_flags, daddr); } +#if IS_ENABLED(CONFIG_IPV6) int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); +#else +static inline int ip6_fragment(struct net *net, struct sock *sk, + struct sk_buff *skb, + int (*output)(struct net *, struct sock *, + struct sk_buff *)) +{ + kfree_skb(skb); + return -EAFNOSUPPORT; +} +#endif + +/* Variant of dst_mtu() for IPv6 users */ +static inline u32 dst6_mtu(const struct dst_entry *dst) +{ + return INDIRECT_CALL_1(dst->ops->mtu, ip6_mtu, dst); +} static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) { @@ -274,7 +313,7 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) unsigned int mtu; if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) { - mtu = READ_ONCE(dst->dev->mtu); + mtu = READ_ONCE(dst_dev(dst)->mtu); mtu -= lwtunnel_headroom(dst->lwtstate, mtu); } else { mtu = dst_mtu(dst); @@ -337,7 +376,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst mtu = IPV6_MIN_MTU; rcu_read_lock(); - idev = __in6_dev_get(dst->dev); + idev = __in6_dev_get(dst_dev_rcu(dst)); if (idev) mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 399592405c72..b99805ee2fd1 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -152,19 +152,34 @@ int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, - struct net_device *dev) + struct net_device *dev, u16 ip6cb_flags) { int pkt_len, err; + if (unlikely(dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT)) { + if (dev) { + net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", + dev->name); + DEV_STATS_INC(dev, tx_errors); + } + kfree_skb_reason(skb, SKB_DROP_REASON_RECURSION_LIMIT); + return; + } + + dev_xmit_recursion_inc(); + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + IP6CB(skb)->flags = ip6cb_flags; pkt_len = skb->len - skb_inner_network_offset(skb); - err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); + err = ip6_local_out(skb_dst_dev_net(skb), sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) pkt_len = -1; iptunnel_xmit_stats(dev, pkt_len); } + + dev_xmit_recursion_dec(); } #endif #endif diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index a113c11ab56b..318593743b6e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -162,6 +162,8 @@ struct fib_info { struct fib_nh fib_nh[] __counted_by(fib_nhs); }; +int __net_init fib4_semantics_init(struct net *net); +void __net_exit fib4_semantics_exit(struct net *net); #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; @@ -438,7 +440,7 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4) { - return dscp == inet_dsfield_to_dscp(RT_TOS(fl4->flowi4_tos)); + return dscp == (fl4->flowi4_dscp & INET_DSCP_LEGACY_TOS_MASK); } /* Exported by fib_frontend.c */ @@ -557,7 +559,7 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net, siphash_aligned_key_t hash_key; u32 mp_seed; - mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed; + mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed); fib_multipath_hash_construct_key(&hash_key, mp_seed); return flow_hash_from_keys_seed(keys, &hash_key); @@ -572,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net, int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); -void fib_select_multipath(struct fib_result *res, int hash); +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 1aa31bdb2b31..d708b66e55cd 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -11,12 +11,15 @@ #include <linux/bitops.h> #include <net/dsfield.h> +#include <net/flow.h> #include <net/gro_cells.h> +#include <net/inet_dscp.h> #include <net/inet_ecn.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/lwtunnel.h> #include <net/dst_cache.h> +#include <net/netdev_lock.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -24,6 +27,13 @@ #include <net/ip6_route.h> #endif +/* Recursion limit for tunnel xmit to detect routing loops. + * Unlike XMIT_RECURSION_LIMIT (8) used in the no-qdisc path, tunnel + * recursion involves route lookups and full IP output, consuming much + * more stack per level, so a lower limit is needed. + */ +#define IP_TUNNEL_RECURSION_LIMIT 5 + /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) @@ -95,8 +105,8 @@ struct ip_tunnel_encap { #define ip_tunnel_info_opts(info) \ _Generic(info, \ - const struct ip_tunnel_info * : ((const void *)((info) + 1)),\ - struct ip_tunnel_info * : ((void *)((info) + 1))\ + const struct ip_tunnel_info * : ((const void *)(info)->options),\ + struct ip_tunnel_info * : ((void *)(info)->options)\ ) struct ip_tunnel_info { @@ -107,6 +117,7 @@ struct ip_tunnel_info { #endif u8 options_len; u8 mode; + u8 options[] __aligned_largest __counted_by(options_len); }; /* 6rd prefix/relay information */ @@ -361,7 +372,7 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, fl4->daddr = daddr; fl4->saddr = saddr; - fl4->flowi4_tos = tos; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; fl4->flowi4_mark = mark; @@ -369,17 +380,26 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4, fl4->flowi4_flags = flow_flags; } -int ip_tunnel_init(struct net_device *dev); +int __ip_tunnel_init(struct net_device *dev); +#define ip_tunnel_init(DEV) \ +({ \ + struct net_device *__dev = (DEV); \ + int __res = __ip_tunnel_init(__dev); \ + \ + if (!__res) \ + netdev_lockdep_set_classes(__dev);\ + __res; \ +}) + void ip_tunnel_uninit(struct net_device *dev); void ip_tunnel_dellink(struct net_device *dev, struct list_head *head); struct net *ip_tunnel_get_link_net(const struct net_device *dev); int ip_tunnel_get_iflink(const struct net_device *dev); int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, struct rtnl_link_ops *ops, char *devname); - -void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill); +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill); void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, const u8 protocol); @@ -406,8 +426,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm_kern *p, __u32 fwmark); -int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm_kern *p, __u32 fwmark); +int ip_tunnel_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct ip_tunnel_parm_kern *p, + __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], @@ -602,12 +623,27 @@ static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, - u8 tos, u8 ttl, __be16 df, bool xnet); + u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, int headroom, bool reply); +static inline void ip_tunnel_adj_headroom(struct net_device *dev, + unsigned int headroom) +{ + /* we must cap headroom to some upperlimit, else pskb_expand_head + * will overflow header offsets in skb_headers_offset_update(). + */ + const unsigned int max_allowed = 512; + + if (headroom > max_allowed) + headroom = max_allowed; + + if (headroom > READ_ONCE(dev->needed_headroom)) + WRITE_ONCE(dev->needed_headroom, headroom); +} + int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask); static inline int iptunnel_pull_offloads(struct sk_buff *skb) @@ -629,13 +665,29 @@ static inline int iptunnel_pull_offloads(struct sk_buff *skb) static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) { if (pkt_len > 0) { - struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); - - u64_stats_update_begin(&tstats->syncp); - u64_stats_add(&tstats->tx_bytes, pkt_len); - u64_stats_inc(&tstats->tx_packets); - u64_stats_update_end(&tstats->syncp); - put_cpu_ptr(tstats); + if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) { + struct pcpu_dstats *dstats = get_cpu_ptr(dev->dstats); + + u64_stats_update_begin(&dstats->syncp); + u64_stats_add(&dstats->tx_bytes, pkt_len); + u64_stats_inc(&dstats->tx_packets); + u64_stats_update_end(&dstats->syncp); + put_cpu_ptr(dstats); + return; + } + if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) { + struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + u64_stats_add(&tstats->tx_bytes, pkt_len); + u64_stats_inc(&tstats->tx_packets); + u64_stats_update_end(&tstats->syncp); + put_cpu_ptr(tstats); + return; + } + pr_err_once("iptunnel_xmit_stats pcpu_stat_type=%d\n", + dev->pcpu_stat_type); + WARN_ON_ONCE(1); return; } @@ -650,7 +702,7 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) static inline void ip_tunnel_info_opts_get(void *to, const struct ip_tunnel_info *info) { - memcpy(to, info + 1, info->options_len); + memcpy(to, ip_tunnel_info_opts(info), info->options_len); } static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff406ef4fd4a..e517eaaa177b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -11,6 +11,7 @@ #include <asm/types.h> /* for __uXX types */ #include <linux/list.h> /* for struct list_head */ +#include <linux/rculist_bl.h> /* for struct hlist_bl_head */ #include <linux/spinlock.h> /* for struct rwlock_t */ #include <linux/atomic.h> /* for struct atomic_t */ #include <linux/refcount.h> /* for struct refcount_t */ @@ -30,10 +31,23 @@ #endif #include <net/net_namespace.h> /* Netw namespace */ #include <linux/sched/isolation.h> +#include <linux/siphash.h> #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 +/* conn_tab limits (as per Kconfig) */ +#define IP_VS_CONN_TAB_MIN_BITS 8 +#if BITS_PER_LONG > 32 +#define IP_VS_CONN_TAB_MAX_BITS 27 +#else +#define IP_VS_CONN_TAB_MAX_BITS 20 +#endif + +/* svc_table limits */ +#define IP_VS_SVC_TAB_MIN_BITS 4 +#define IP_VS_SVC_TAB_MAX_BITS 20 + /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) { @@ -43,8 +57,6 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; -extern struct mutex __ip_vs_mutex; - struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -265,6 +277,29 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) +struct ip_vs_aligned_lock { + spinlock_t l; /* Protect buckets */ +} ____cacheline_aligned_in_smp; + +/* For arrays per family */ +enum { + IP_VS_AF_INET, + IP_VS_AF_INET6, + IP_VS_AF_MAX +}; + +static inline int ip_vs_af_index(int af) +{ + return af == AF_INET6 ? IP_VS_AF_INET6 : IP_VS_AF_INET; +} + +/* work_flags */ +enum { + IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */ + IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */ + IP_VS_WORK_CONN_RESIZE, /* Schedule conn_resize_work */ +}; + /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) @@ -456,6 +491,7 @@ struct ip_vs_est_kt_data { DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ unsigned long est_timer; /* estimation timer (jiffies) */ struct ip_vs_stats *calc_stats; /* Used for calculation */ + int needed; /* task is needed */ int tick_len[IPVS_EST_NTICKS]; /* est count */ int id; /* ktid per netns */ int chain_max; /* max ests per tick chain */ @@ -466,6 +502,198 @@ struct ip_vs_est_kt_data { int est_row; /* estimated row */ }; +/* IPVS resizable hash tables */ +struct ip_vs_rht { + struct hlist_bl_head *buckets; + struct ip_vs_rht __rcu *new_tbl; /* New/Same table */ + seqcount_t *seqc; /* Protects moves */ + struct ip_vs_aligned_lock *lock; /* Protect seqc */ + int mask; /* Buckets mask */ + int size; /* Buckets */ + int seqc_mask; /* seqc mask */ + int lock_mask; /* lock mask */ + u32 table_id; + int u_thresh; /* upper threshold */ + int l_thresh; /* lower threshold */ + int lfactor; /* Load Factor (shift)*/ + int bits; /* size = 1 << bits */ + siphash_key_t hash_key; + struct rcu_head rcu_head; +}; + +/** + * ip_vs_rht_for_each_table() - Walk the hash tables + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * Walk tables assuming others can not change the installed tables + */ +#define ip_vs_rht_for_each_table(table, t, p) \ + for (p = NULL, t = rcu_dereference_protected(table, 1); \ + t != p; \ + p = t, t = rcu_dereference_protected(t->new_tbl, 1)) + +/** + * ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock + * @table: struct ip_vs_rht __rcu *table + * @t: current table, used as cursor, struct ip_vs_rht *var + * @p: previous table, temp struct ip_vs_rht *var + * + * We usually search in one table and also in second table on resizing + */ +#define ip_vs_rht_for_each_table_rcu(table, t, p) \ + for (p = NULL, t = rcu_dereference(table); \ + t != p; \ + p = t, t = rcu_dereference(t->new_tbl)) + +/** + * ip_vs_rht_for_each_bucket() - Walk all table buckets + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: bucket index, used as cursor, u32 var + * @head: bucket address, used as cursor, struct hlist_bl_head *var + */ +#define ip_vs_rht_for_each_bucket(t, bucket, head) \ + for (bucket = 0, head = (t)->buckets; \ + bucket < t->size; bucket++, head++) + +/** + * ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved + * @t: current table, used as cursor, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @sc: temp seqcount_t *var + * @seq: temp unsigned int var for sequence count + * @retry: temp int var + */ +#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \ + for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \ + retry && ({ seq = read_seqcount_begin(sc); 1; }); \ + retry = read_seqcount_retry(sc, seq)) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \ + struct ip_vs_rht *_t, *_p; \ + unsigned int _seq; \ + seqcount_t *_sc; \ + u32 _bucket; \ + int _retry +/** + * ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_buckets_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) \ + ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \ + _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_bucket_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \ + unsigned int _seq; \ + seqcount_t *_sc; \ + int _retry +/** + * ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock + * @t: current table, struct ip_vs_rht *var + * @bucket: index of current bucket or hash key + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete/move entries + * Not suitable if duplicates are not desired + * Possible cases for reader that uses cond_resched_rcu() in the loop: + * - new table can not be installed, no need to repeat + * - new table can be installed => check and repeat if new table is + * installed, needed for !PREEMPT_RCU + */ +#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \ + if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \ + {} \ + else \ + ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets_safe_rcu + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket +/** + * ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Can be used while others add/delete entries but moving is disabled + * Using cond_resched_rcu() should be safe if tables do not change + */ +#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \ + ip_vs_rht_for_each_table_rcu(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/** + * DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables + * + * Variables for ip_vs_rht_walk_buckets + */ +#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \ + struct ip_vs_rht *_t, *_p; \ + u32 _bucket + +/** + * ip_vs_rht_walk_buckets() - Walk all buckets + * @table: struct ip_vs_rht __rcu *table + * @head: bucket address, used as cursor, struct hlist_bl_head *var + * + * Use if others can not add/delete/move entries + */ +#define ip_vs_rht_walk_buckets(table, head) \ + ip_vs_rht_for_each_table(table, _t, _p) \ + ip_vs_rht_for_each_bucket(_t, _bucket, head) + +/* Entries can be in one of two tables, so we flip bit when new table is + * created and store it as highest bit in hash keys + */ +#define IP_VS_RHT_TABLE_ID_MASK BIT(31) + +/* Check if hash key is from this table */ +static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key) +{ + return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK); +} + +/* Build per-table hash key from hash value */ +static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash) +{ + return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK); +} + +void ip_vs_rht_free(struct ip_vs_rht *t); +void ip_vs_rht_rcu_free(struct rcu_head *head); +struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks); +int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, + int lfactor, int min_bits, int max_bits); +void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor, + int min_bits, int max_bits); +u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af, + const union nf_inet_addr *addr, u32 v1, u32 v2); + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -559,50 +787,48 @@ struct ip_vs_conn_param { __u8 pe_data_len; }; +/* Hash node in conn_tab */ +struct ip_vs_conn_hnode { + struct hlist_bl_node node; /* node in conn_tab */ + u32 hash_key; /* Key for the hash table */ + u8 dir; /* 0=out->in, 1=in->out */ +} __packed; + /* IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { - struct hlist_node c_list; /* hashed list heads */ - /* Protocol, addresses and port numbers */ + /* Cacheline for hash table nodes - rarely modified */ + + struct ip_vs_conn_hnode hn0; /* Original direction */ + u8 af; /* address family */ __be16 cport; + struct ip_vs_conn_hnode hn1; /* Reply direction */ + u8 daf; /* Address family of the dest */ __be16 dport; - __be16 vport; - u16 af; /* address family */ - union nf_inet_addr caddr; /* client address */ - union nf_inet_addr vaddr; /* virtual address */ - union nf_inet_addr daddr; /* destination address */ + struct ip_vs_dest *dest; /* real server */ + atomic_t n_control; /* Number of controlled ones */ volatile __u32 flags; /* status flags */ - __u16 protocol; /* Which protocol (TCP/UDP) */ - __u16 daf; /* Address family of the dest */ - struct netns_ipvs *ipvs; - - /* counter and timer */ - refcount_t refcnt; /* reference count */ - struct timer_list timer; /* Expiration timer */ - volatile unsigned long timeout; /* timeout */ + /* 44/64 */ - /* Flags and state transition */ - spinlock_t lock; /* lock for state transition */ + struct ip_vs_conn *control; /* Master control connection */ + const struct ip_vs_pe *pe; + char *pe_data; + __u8 pe_data_len; volatile __u16 state; /* state info */ volatile __u16 old_state; /* old state, to be used for * state transition triggered * synchronization */ - __u32 fwmark; /* Fire wall mark from skb */ - unsigned long sync_endtime; /* jiffies + sent_retries */ + /* 2-byte hole */ + /* 64/96 */ - /* Control members */ - struct ip_vs_conn *control; /* Master control connection */ - atomic_t n_control; /* Number of controlled ones */ - struct ip_vs_dest *dest; /* real server */ - atomic_t in_pkts; /* incoming packet counter */ + union nf_inet_addr caddr; /* client address */ + union nf_inet_addr vaddr; /* virtual address */ + /* 96/128 */ - /* Packet transmitter for different forwarding methods. If it - * mangles the packet, it must return NF_DROP or better NF_STOLEN, - * otherwise this must be changed to a sk_buff **. - * NF_ACCEPT can be returned when destination is local. - */ - int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); + union nf_inet_addr daddr; /* destination address */ + __u32 fwmark; /* Fire wall mark from skb */ + __be16 vport; + __u16 protocol; /* Which protocol (TCP/UDP) */ /* Note: we can group the following members into a structure, * in order to save more space, and the following members are @@ -610,14 +836,31 @@ struct ip_vs_conn { */ struct ip_vs_app *app; /* bound ip_vs_app object */ void *app_data; /* Application private data */ + /* 128/168 */ struct_group(sync_conn_opt, struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. struct */ ); + /* 152/192 */ - const struct ip_vs_pe *pe; - char *pe_data; - __u8 pe_data_len; + struct timer_list timer; /* Expiration timer */ + volatile unsigned long timeout; /* timeout */ + spinlock_t lock; /* lock for state transition */ + refcount_t refcnt; /* reference count */ + atomic_t in_pkts; /* incoming packet counter */ + /* 64-bit: 4-byte gap */ + + /* 188/256 */ + unsigned long sync_endtime; /* jiffies + sent_retries */ + struct netns_ipvs *ipvs; + + /* Packet transmitter for different forwarding methods. If it + * mangles the packet, it must return NF_DROP or better NF_STOLEN, + * otherwise this must be changed to a sk_buff **. + * NF_ACCEPT can be returned when destination is local. + */ + int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); struct rcu_head rcu_head; }; @@ -673,15 +916,15 @@ struct ip_vs_dest_user_kern { * forwarding entries. */ struct ip_vs_service { - struct hlist_node s_list; /* for normal service table */ - struct hlist_node f_list; /* for fwmark-based service table */ - atomic_t refcnt; /* reference counter */ - + struct hlist_bl_node s_list; /* node in service table */ + u32 hash_key; /* Key for the hash table */ u16 af; /* address family */ __u16 protocol; /* which protocol (TCP/UDP) */ + union nf_inet_addr addr; /* IP address for virtual service */ - __be16 port; /* port number for the service */ __u32 fwmark; /* firewall mark of the service */ + atomic_t refcnt; /* reference counter */ + __be16 port; /* port number for the service */ unsigned int flags; /* service status flags */ unsigned int timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity, mask/plen */ @@ -791,8 +1034,8 @@ struct ip_vs_pe { int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb); bool (*ct_match)(const struct ip_vs_conn_param *p, struct ip_vs_conn *ct); - u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, - bool inverse); + u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, + struct ip_vs_rht *t, bool inverse); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); /* create connections for real-server outgoing packets */ struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc, @@ -931,21 +1174,28 @@ struct netns_ipvs { #endif /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ + atomic_t no_cport_conns[IP_VS_AF_MAX]; + struct delayed_work conn_resize_work;/* resize conn_tab */ /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ - int num_services; /* no of virtual services */ - int num_services6; /* IPv6 virtual services */ - /* Trash for destinations */ struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ + struct mutex service_mutex; /* service reconfig */ + struct rw_semaphore svc_resize_sem; /* svc_table resizing */ + struct rw_semaphore svc_replace_sem; /* svc_table replace */ + struct delayed_work svc_resize_work; /* resize svc_table */ + atomic_t svc_table_changes;/* ++ on table changes */ /* Service counters */ - atomic_t ftpsvc_counter; - atomic_t nullsvc_counter; - atomic_t conn_out_counter; + atomic_t num_services[IP_VS_AF_MAX]; /* Services */ + atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ + atomic_t nonfwm_services[IP_VS_AF_MAX];/* Services */ + atomic_t ftpsvc_counter[IP_VS_AF_MAX]; /* FTPPORT */ + atomic_t nullsvc_counter[IP_VS_AF_MAX];/* Zero port */ + atomic_t conn_out_counter[IP_VS_AF_MAX];/* out conn */ #ifdef CONFIG_SYSCTL /* delayed work for expiring no dest connections */ @@ -956,6 +1206,7 @@ struct netns_ipvs { int drop_counter; int old_secure_tcp; atomic_t dropentry; + s8 dropentry_counters[8]; /* locks in ctl.c */ spinlock_t dropentry_lock; /* drop entry handling */ spinlock_t droppacket_lock; /* drop packet handling */ @@ -1002,6 +1253,8 @@ struct netns_ipvs { int sysctl_est_nice; /* kthread nice */ int est_stopped; /* stop tasks */ #endif + int sysctl_conn_lfactor; + int sysctl_svc_lfactor; /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1011,6 +1264,7 @@ struct netns_ipvs { int sysctl_lblcr_expiration; struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; + unsigned long work_flags; /* IP_VS_WORK_* flags */ /* ip_vs_est */ struct delayed_work est_reload_work;/* Reload kthread tasks */ struct mutex est_mutex; /* protect kthread tasks */ @@ -1041,6 +1295,10 @@ struct netns_ipvs { */ unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ + + struct ip_vs_rht __rcu *svc_table; /* Services */ + struct ip_vs_rht __rcu *conn_tab; /* Connections */ + atomic_t conn_tab_changes;/* ++ on new table */ }; #define DEFAULT_SYNC_THRESHOLD 3 @@ -1155,7 +1413,7 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return ipvs->sysctl_run_estimation; } -static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs) { if (ipvs->est_cpulist_valid) return ipvs->sysctl_est_cpulist; @@ -1163,6 +1421,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) return housekeeping_cpumask(HK_TYPE_KTHREAD); } +static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return NULL; +} + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return ipvs->sysctl_est_nice; @@ -1265,11 +1531,16 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return 1; } -static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs) { return housekeeping_cpumask(HK_TYPE_KTHREAD); } +static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs) +{ + return NULL; +} + static inline int sysctl_est_nice(struct netns_ipvs *ipvs) { return IPVS_EST_NICE; @@ -1277,6 +1548,36 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs) #endif +/* Get load factor to map conn_count/u_thresh to t->size */ +static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_conn_lfactor); +} + +/* Get load factor to map num_services/u_thresh to t->size + * Smaller value decreases u_thresh to reduce collisions but increases + * the table size + * Returns factor where: + * - <0: u_thresh = size >> -factor, eg. lfactor -2 = 25% load + * - >=0: u_thresh = size << factor, eg. lfactor 1 = 200% load + */ +static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs) +{ + return READ_ONCE(ipvs->sysctl_svc_lfactor); +} + +static inline bool sysctl_est_cpulist_empty(struct netns_ipvs *ipvs) +{ + guard(rcu)(); + return cpumask_empty(__sysctl_est_cpulist(ipvs)); +} + +static inline unsigned int sysctl_est_cpulist_weight(struct netns_ipvs *ipvs) +{ + guard(rcu)(); + return cpumask_weight(__sysctl_est_cpulist(ipvs)); +} + /* IPVS core functions * (from ip_vs_core.c) */ @@ -1350,6 +1651,23 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); +int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, + int lfactor); +struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets, + int lfactor); + +static inline struct ip_vs_conn * +ip_vs_hn0_to_conn(struct ip_vs_conn_hnode *hn) +{ + return container_of(hn, struct ip_vs_conn, hn0); +} + +static inline struct ip_vs_conn * +ip_vs_hn_to_conn(struct ip_vs_conn_hnode *hn) +{ + return hn->dir ? container_of(hn, struct ip_vs_conn, hn1) : + container_of(hn, struct ip_vs_conn, hn0); +} struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, @@ -1506,8 +1824,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler); int ip_vs_bind_scheduler(struct ip_vs_service *svc, struct ip_vs_scheduler *scheduler); -void ip_vs_unbind_scheduler(struct ip_vs_service *svc, - struct ip_vs_scheduler *sched); +void ip_vs_unbind_scheduler(struct ip_vs_service *svc); struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); struct ip_vs_conn * @@ -1580,18 +1897,26 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); -void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart); int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); +static inline void ip_vs_stop_estimator_tot_stats(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + ipvs->tot_stats->s.est.ktid = -2; +#endif +} + static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL /* Stop tasks while cpulist is empty or if disabled with flag */ ipvs->est_stopped = !sysctl_run_estimation(ipvs) || (ipvs->est_cpulist_valid && - cpumask_empty(sysctl_est_cpulist(ipvs))); + sysctl_est_cpulist_empty(ipvs)); #endif } @@ -1607,7 +1932,7 @@ static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) { unsigned int limit = IPVS_EST_CPU_KTHREADS * - cpumask_weight(sysctl_est_cpulist(ipvs)); + sysctl_est_cpulist_weight(ipvs); return max(1U, limit); } @@ -1703,6 +2028,13 @@ static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp) return fwd; } +/* Check if connection uses double hashing */ +static inline bool ip_vs_conn_use_hash2(struct ip_vs_conn *cp) +{ + return IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ && + !(cp->flags & IP_VS_CONN_F_TEMPLATE); +} + void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int dir); diff --git a/include/net/ipcomp.h b/include/net/ipcomp.h index 8660a2a6d1fc..51401f01e2a5 100644 --- a/include/net/ipcomp.h +++ b/include/net/ipcomp.h @@ -3,20 +3,9 @@ #define _NET_IPCOMP_H #include <linux/skbuff.h> -#include <linux/types.h> - -#define IPCOMP_SCRATCH_SIZE 65400 - -struct crypto_comp; -struct ip_comp_hdr; - -struct ipcomp_data { - u16 threshold; - struct crypto_comp * __percpu *tfms; -}; struct ip_comp_hdr; -struct sk_buff; +struct netlink_ext_ack; struct xfrm_state; int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index f5c43ad1565e..1dec81faff28 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -25,8 +25,6 @@ struct ip_tunnel_info; #define SIN6_LEN_RFC2133 24 -#define IPV6_MAXPLEN 65535 - /* * NextHeader field of IPv6 header */ @@ -92,6 +90,9 @@ struct ip_tunnel_info; #define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ #define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ +/* Hard limit on traversed IPv6 extension headers */ +#define IP6_MAX_EXT_HDRS_CNT 12 + /* * Addr type * @@ -151,17 +152,6 @@ struct frag_hdr { __be32 identification; }; -/* - * Jumbo payload option, as described in RFC 2675 2. - */ -struct hop_jumbo_hdr { - u8 nexthdr; - u8 hdrlen; - u8 tlv_type; /* IPV6_TLV_JUMBO, 0xC2 */ - u8 tlv_len; /* 4 */ - __be32 jumbo_payload_len; -}; - #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 @@ -246,17 +236,20 @@ extern int sysctl_mld_qrv; #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ + unsigned long _field = (field); \ + unsigned long _val = (val); \ if (likely(_idev != NULL)) \ - mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \ - mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\ + mod##SNMP_ADD_STATS((_idev)->stats.statname, _field, _val); \ + mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, _field, _val);\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ + unsigned long _val = (val); \ if (likely(_idev != NULL)) \ - mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \ - mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\ + mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, _val); \ + mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, _val);\ }) /* MIBs */ @@ -363,15 +356,6 @@ struct ipcm6_cookie { struct ipv6_txoptions *opt; }; -static inline void ipcm6_init(struct ipcm6_cookie *ipc6) -{ - *ipc6 = (struct ipcm6_cookie) { - .hlimit = -1, - .tclass = -1, - .dontfrag = -1, - }; -} - static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct sock *sk) { @@ -380,6 +364,8 @@ static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, .tclass = inet6_sk(sk)->tclass, .dontfrag = inet6_test_bit(DONTFRAG, sk), }; + + sockcm_init(&ipc6->sockc, sk); } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) @@ -468,72 +454,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); -/* This helper is specialized for BIG TCP needs. - * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header. - * It assumes headers are already in skb->head. - * Returns: 0, or IPPROTO_TCP if a BIG TCP packet is there. - */ -static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) -{ - const struct hop_jumbo_hdr *jhdr; - const struct ipv6hdr *nhdr; - - if (likely(skb->len <= GRO_LEGACY_MAX_SIZE)) - return 0; - - if (skb->protocol != htons(ETH_P_IPV6)) - return 0; - - if (skb_network_offset(skb) + - sizeof(struct ipv6hdr) + - sizeof(struct hop_jumbo_hdr) > skb_headlen(skb)) - return 0; - - nhdr = ipv6_hdr(skb); - - if (nhdr->nexthdr != NEXTHDR_HOP) - return 0; - - jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1); - if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 || - jhdr->nexthdr != IPPROTO_TCP) - return 0; - return jhdr->nexthdr; -} - -/* Return 0 if HBH header is successfully removed - * Or if HBH removal is unnecessary (packet is not big TCP) - * Return error to indicate dropping the packet - */ -static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) -{ - const int hophdr_len = sizeof(struct hop_jumbo_hdr); - int nexthdr = ipv6_has_hopopt_jumbo(skb); - struct ipv6hdr *h6; - - if (!nexthdr) - return 0; - - if (skb_cow_head(skb, 0)) - return -1; - - /* Remove the HBH header. - * Layout: [Ethernet header][IPv6 header][HBH][L4 Header] - */ - memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb), - skb_network_header(skb) - skb_mac_header(skb) + - sizeof(struct ipv6hdr)); - - __skb_pull(skb, hophdr_len); - skb->network_header += hophdr_len; - skb->mac_header += hophdr_len; - - h6 = ipv6_hdr(skb); - h6->nexthdr = nexthdr; - - return 0; -} - static inline bool ipv6_accept_ra(const struct inet6_dev *idev) { s32 accept_ra = READ_ONCE(idev->cnf.accept_ra); @@ -935,10 +855,10 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, #if IS_ENABLED(CONFIG_IPV6) -static inline bool ipv6_can_nonlocal_bind(struct net *net, - struct inet_sock *inet) +static inline bool ipv6_can_nonlocal_bind(const struct net *net, + const struct inet_sock *inet) { - return net->ipv6.sysctl.ip_nonlocal_bind || + return READ_ONCE(net->ipv6.sysctl.ip_nonlocal_bind) || test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) || test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags); } @@ -953,10 +873,12 @@ static inline bool ipv6_can_nonlocal_bind(struct net *net, #define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT -static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, +static inline __be32 ip6_make_flowlabel(const struct net *net, + struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { + u8 auto_flowlabels; u32 hash; /* @flowlabel may include more than a flow label, eg, the traffic class. @@ -964,10 +886,12 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, */ flowlabel &= IPV6_FLOWLABEL_MASK; - if (flowlabel || - net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || - (!autolabel && - net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) + if (flowlabel) + return flowlabel; + + auto_flowlabels = READ_ONCE(net->ipv6.sysctl.auto_flowlabels); + if (auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || + (!autolabel && auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) return flowlabel; hash = skb_get_hash_flowi6(skb, fl6); @@ -980,15 +904,15 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; - if (net->ipv6.sysctl.flowlabel_state_ranges) + if (READ_ONCE(net->ipv6.sysctl.flowlabel_state_ranges)) flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; return flowlabel; } -static inline int ip6_default_np_autolabel(struct net *net) +static inline int ip6_default_np_autolabel(const struct net *net) { - switch (net->ipv6.sysctl.auto_flowlabels) { + switch (READ_ONCE(net->ipv6.sysctl.auto_flowlabels)) { case IP6_AUTO_FLOW_LABEL_OFF: case IP6_AUTO_FLOW_LABEL_OPTIN: default: @@ -999,13 +923,13 @@ static inline int ip6_default_np_autolabel(struct net *net) } } #else -static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, +static inline __be32 ip6_make_flowlabel(const struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { return flowlabel; } -static inline int ip6_default_np_autolabel(struct net *net) +static inline int ip6_default_np_autolabel(const struct net *net) { return 0; } @@ -1014,11 +938,11 @@ static inline int ip6_default_np_autolabel(struct net *net) #if IS_ENABLED(CONFIG_IPV6) static inline int ip6_multipath_hash_policy(const struct net *net) { - return net->ipv6.sysctl.multipath_hash_policy; + return READ_ONCE(net->ipv6.sysctl.multipath_hash_policy); } static inline u32 ip6_multipath_hash_fields(const struct net *net) { - return net->ipv6.sysctl.multipath_hash_fields; + return READ_ONCE(net->ipv6.sysctl.multipath_hash_fields); } #else static inline int ip6_multipath_hash_policy(const struct net *net) @@ -1107,8 +1031,7 @@ void ip6_flush_pending_frames(struct sock *sk); int ip6_send_skb(struct sk_buff *skb); struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, - struct inet_cork_full *cork, - struct inet6_cork *v6_cork); + struct inet_cork_full *cork); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), @@ -1119,14 +1042,23 @@ struct sk_buff *ip6_make_skb(struct sock *sk, static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { - return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork, - &inet6_sk(sk)->cork); + return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork); } int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); +#if IS_ENABLED(CONFIG_IPV6) struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); +#else +static inline struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, + struct flowi6 *fl6, + const struct in6_addr *final_dst) +{ + return ERR_PTR(-EAFNOSUPPORT); +} +#endif + struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); @@ -1151,11 +1083,11 @@ int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); * Extension header (options) processing */ -void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, - u8 *proto, struct in6_addr **daddr_p, - struct in6_addr *saddr); -void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, - u8 *proto); +u8 ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 proto, struct in6_addr **daddr_p, + struct in6_addr *saddr); +u8 ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 proto); int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp, __be16 *frag_offp); @@ -1174,9 +1106,19 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type); -struct in6_addr *fl6_update_dst(struct flowi6 *fl6, - const struct ipv6_txoptions *opt, - struct in6_addr *orig); +struct in6_addr *__fl6_update_dst(struct flowi6 *fl6, + const struct ipv6_txoptions *opt, + struct in6_addr *orig); + +static inline struct in6_addr * +fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, + struct in6_addr *orig) +{ + if (likely(!opt)) + return NULL; + + return __fl6_update_dst(fl6, opt, orig); +} /* * socket options (ipv6_sockglue.c) @@ -1192,18 +1134,16 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, +int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); -int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); -int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, +int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); +int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock *sk); -int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, - int *addr_len); -int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, - int *addr_len); +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len); +int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); @@ -1212,8 +1152,10 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); -int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int __inet6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, + u32 flags); +int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); +int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); @@ -1252,8 +1194,6 @@ int tcp6_proc_init(struct net *net); void tcp6_proc_exit(struct net *net); int udp6_proc_init(struct net *net); void udp6_proc_exit(struct net *net); -int udplite6_proc_init(void); -void udplite6_proc_exit(void); int ipv6_misc_proc_init(void); void ipv6_misc_proc_exit(void); int snmp6_register_dev(struct inet6_dev *idev); @@ -1284,12 +1224,15 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, static inline int ip6_sock_set_v6only(struct sock *sk) { - if (inet_sk(sk)->inet_num) - return -EINVAL; + int ret = 0; + lock_sock(sk); - sk->sk_ipv6only = true; + if (inet_sk(sk)->inet_num) + ret = -EINVAL; + else + sk->sk_ipv6only = true; release_sock(sk); - return 0; + return ret; } static inline void ip6_sock_set_recverr(struct sock *sk) diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 7321ffe3a108..41d9fc6965f9 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -66,18 +66,22 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) { struct net_device *dev = NULL; struct sk_buff *head; + int refs = 1; rcu_read_lock(); - /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ - if (READ_ONCE(fq->q.fqdir->dead)) - goto out_rcu_unlock; spin_lock(&fq->q.lock); if (fq->q.flags & INET_FRAG_COMPLETE) goto out; fq->q.flags |= INET_FRAG_DROP; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, &refs); + + /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ + if (READ_ONCE(fq->q.fqdir->dead)) { + inet_frag_queue_flush(&fq->q, 0); + goto out; + } dev = dev_get_by_index_rcu(net, fq->iif); if (!dev) @@ -109,7 +113,7 @@ out: spin_unlock(&fq->q.lock); out_rcu_unlock: rcu_read_unlock(); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); } /* Check if the upper layer header is truncated in the first fragment. */ diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h deleted file mode 100644 index 8a3465c8c2c5..000000000000 --- a/include/net/ipv6_stubs.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _IPV6_STUBS_H -#define _IPV6_STUBS_H - -#include <linux/in6.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <net/dst.h> -#include <net/flow.h> -#include <net/neighbour.h> -#include <net/sock.h> -#include <net/ipv6.h> - -/* structs from net/ip6_fib.h */ -struct fib6_info; -struct fib6_nh; -struct fib6_config; -struct fib6_result; - -/* This is ugly, ideally these symbols should be built - * into the core kernel. - */ -struct ipv6_stub { - int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex, - const struct in6_addr *addr); - int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, - const struct in6_addr *addr); - struct dst_entry *(*ipv6_dst_lookup_flow)(struct net *net, - const struct sock *sk, - struct flowi6 *fl6, - const struct in6_addr *final_dst); - int (*ipv6_route_input)(struct sk_buff *skb); - - struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); - int (*fib6_lookup)(struct net *net, int oif, struct flowi6 *fl6, - struct fib6_result *res, int flags); - int (*fib6_table_lookup)(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, - struct fib6_result *res, int flags); - void (*fib6_select_path)(const struct net *net, struct fib6_result *res, - struct flowi6 *fl6, int oif, bool oif_match, - const struct sk_buff *skb, int strict); - u32 (*ip6_mtu_from_fib6)(const struct fib6_result *res, - const struct in6_addr *daddr, - const struct in6_addr *saddr); - - int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh, - struct fib6_config *cfg, gfp_t gfp_flags, - struct netlink_ext_ack *extack); - void (*fib6_nh_release)(struct fib6_nh *fib6_nh); - void (*fib6_nh_release_dsts)(struct fib6_nh *fib6_nh); - void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt); - int (*ip6_del_rt)(struct net *net, struct fib6_info *rt, bool skip_notify); - void (*fib6_rt_update)(struct net *net, struct fib6_info *rt, - struct nl_info *info); - - void (*udpv6_encap_enable)(void); - void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, - const struct in6_addr *solicited_addr, - bool router, bool solicited, bool override, bool inc_opt); -#if IS_ENABLED(CONFIG_XFRM) - void (*xfrm6_local_rxpmtu)(struct sk_buff *skb, u32 mtu); - int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb); - struct sk_buff *(*xfrm6_gro_udp_encap_rcv)(struct sock *sk, - struct list_head *head, - struct sk_buff *skb); - int (*xfrm6_rcv_encap)(struct sk_buff *skb, int nexthdr, __be32 spi, - int encap_type); -#endif - struct neigh_table *nd_tbl; - - int (*ipv6_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, - int (*output)(struct net *, struct sock *, struct sk_buff *)); - struct net_device *(*ipv6_dev_find)(struct net *net, const struct in6_addr *addr, - struct net_device *dev); - int (*ip6_xmit)(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, - __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); -}; -extern const struct ipv6_stub *ipv6_stub __read_mostly; - -/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ -struct ipv6_bpf_stub { - int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, - u32 flags); - struct sock *(*udp6_lib_lookup)(const struct net *net, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, __be16 dport, - int dif, int sdif, struct udp_table *tbl, - struct sk_buff *skb); - int (*ipv6_setsockopt)(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); - int (*ipv6_getsockopt)(struct sock *sk, int level, int optname, - sockptr_t optval, sockptr_t optlen); - int (*ipv6_dev_get_saddr)(struct net *net, - const struct net_device *dst_dev, - const struct in6_addr *daddr, - unsigned int prefs, - struct in6_addr *saddr); -}; -extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; - -#endif diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index 9804fa5d9c67..18f511da9a09 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -195,35 +195,15 @@ struct iucv_handler { struct list_head paths; }; -/** - * iucv_register: - * @handler: address of iucv handler structure - * @smp: != 0 indicates that the handler can deal with out of order messages - * - * Registers a driver with IUCV. - * - * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid - * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus. - */ int iucv_register(struct iucv_handler *handler, int smp); +void iucv_unregister(struct iucv_handler *handler, int smp); /** - * iucv_unregister - * @handler: address of iucv handler structure - * @smp: != 0 indicates that the handler can deal with out of order messages - * - * Unregister driver from IUCV. - */ -void iucv_unregister(struct iucv_handler *handle, int smp); - -/** - * iucv_path_alloc + * iucv_path_alloc - Allocate a new path structure for use with iucv_connect. * @msglim: initial message limit * @flags: initial flags * @gfp: kmalloc allocation flag * - * Allocate a new path structure for use with iucv_connect. - * * Returns: NULL if the memory allocation failed or a pointer to the * path structure. */ @@ -231,7 +211,7 @@ static inline struct iucv_path *iucv_path_alloc(u16 msglim, u8 flags, gfp_t gfp) { struct iucv_path *path; - path = kzalloc(sizeof(struct iucv_path), gfp); + path = kzalloc_obj(struct iucv_path, gfp); if (path) { path->msglim = msglim; path->flags = flags; @@ -240,229 +220,48 @@ static inline struct iucv_path *iucv_path_alloc(u16 msglim, u8 flags, gfp_t gfp) } /** - * iucv_path_free + * iucv_path_free - Frees a path structure. * @path: address of iucv path structure - * - * Frees a path structure. */ static inline void iucv_path_free(struct iucv_path *path) { kfree(path); } -/** - * iucv_path_accept - * @path: address of iucv path structure - * @handler: address of iucv handler structure - * @userdata: 16 bytes of data reflected to the communication partner - * @private: private data passed to interrupt handlers for this path - * - * This function is issued after the user received a connection pending - * external interrupt and now wishes to complete the IUCV communication path. - * - * Returns: the result of the CP IUCV call. - */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, u8 *userdata, void *private); -/** - * iucv_path_connect - * @path: address of iucv path structure - * @handler: address of iucv handler structure - * @userid: 8-byte user identification - * @system: 8-byte target system identification - * @userdata: 16 bytes of data reflected to the communication partner - * @private: private data passed to interrupt handlers for this path - * - * This function establishes an IUCV path. Although the connect may complete - * successfully, you are not able to use the path until you receive an IUCV - * Connection Complete external interrupt. - * - * Returns: the result of the CP IUCV call. - */ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, u8 *userid, u8 *system, u8 *userdata, void *private); -/** - * iucv_path_quiesce: - * @path: address of iucv path structure - * @userdata: 16 bytes of data reflected to the communication partner - * - * This function temporarily suspends incoming messages on an IUCV path. - * You can later reactivate the path by invoking the iucv_resume function. - * - * Returns: the result from the CP IUCV call. - */ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata); -/** - * iucv_path_resume: - * @path: address of iucv path structure - * @userdata: 16 bytes of data reflected to the communication partner - * - * This function resumes incoming messages on an IUCV path that has - * been stopped with iucv_path_quiesce. - * - * Returns: the result from the CP IUCV call. - */ int iucv_path_resume(struct iucv_path *path, u8 *userdata); -/** - * iucv_path_sever - * @path: address of iucv path structure - * @userdata: 16 bytes of data reflected to the communication partner - * - * This function terminates an IUCV path. - * - * Returns: the result from the CP IUCV call. - */ int iucv_path_sever(struct iucv_path *path, u8 *userdata); -/** - * iucv_message_purge - * @path: address of iucv path structure - * @msg: address of iucv msg structure - * @srccls: source class of message - * - * Cancels a message you have sent. - * - * Returns: the result from the CP IUCV call. - */ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, u32 srccls); -/** - * iucv_message_receive - * @path: address of iucv path structure - * @msg: address of iucv msg structure - * @flags: flags that affect how the message is received (IUCV_IPBUFLST) - * @buffer: address of data buffer or address of struct iucv_array - * @size: length of data buffer - * @residual: - * - * This function receives messages that are being sent to you over - * established paths. This function will deal with RMDATA messages - * embedded in struct iucv_message as well. - * - * Locking: local_bh_enable/local_bh_disable - * - * Returns: the result from the CP IUCV call. - */ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual); -/** - * __iucv_message_receive - * @path: address of iucv path structure - * @msg: address of iucv msg structure - * @flags: flags that affect how the message is received (IUCV_IPBUFLST) - * @buffer: address of data buffer or address of struct iucv_array - * @size: length of data buffer - * @residual: - * - * This function receives messages that are being sent to you over - * established paths. This function will deal with RMDATA messages - * embedded in struct iucv_message as well. - * - * Locking: no locking. - * - * Returns: the result from the CP IUCV call. - */ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual); -/** - * iucv_message_reject - * @path: address of iucv path structure - * @msg: address of iucv msg structure - * - * The reject function refuses a specified message. Between the time you - * are notified of a message and the time that you complete the message, - * the message may be rejected. - * - * Returns: the result from the CP IUCV call. - */ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg); -/** - * iucv_message_reply - * @path: address of iucv path structure - * @msg: address of iucv msg structure - * @flags: how the reply is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) - * @reply: address of data buffer or address of struct iucv_array - * @size: length of reply data buffer - * - * This function responds to the two-way messages that you receive. You - * must identify completely the message to which you wish to reply. ie, - * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into - * the parameter list. - * - * Returns: the result from the CP IUCV call. - */ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *reply, size_t size); -/** - * iucv_message_send - * @path: address of iucv path structure - * @msg: address of iucv msg structure - * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) - * @srccls: source class of message - * @buffer: address of data buffer or address of struct iucv_array - * @size: length of send buffer - * - * This function transmits data to another application. Data to be - * transmitted is in a buffer and this is a one-way message and the - * receiver will not reply to the message. - * - * Locking: local_bh_enable/local_bh_disable - * - * Returns: the result from the CP IUCV call. - */ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); -/** - * __iucv_message_send - * @path: address of iucv path structure - * @msg: address of iucv msg structure - * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) - * @srccls: source class of message - * @buffer: address of data buffer or address of struct iucv_array - * @size: length of send buffer - * - * This function transmits data to another application. Data to be - * transmitted is in a buffer and this is a one-way message and the - * receiver will not reply to the message. - * - * Locking: no locking. - * - * Returns: the result from the CP IUCV call. - */ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size); -/** - * iucv_message_send2way - * @path: address of iucv path structure - * @msg: address of iucv msg structure - * @flags: how the message is sent and the reply is received - * (IUCV_IPRMDATA, IUCV_IPBUFLST, IUCV_IPPRTY, IUCV_ANSLST) - * @srccls: source class of message - * @buffer: address of data buffer or address of struct iucv_array - * @size: length of send buffer - * @ansbuf: address of answer buffer or address of struct iucv_array - * @asize: size of reply buffer - * - * This function transmits data to another application. Data to be - * transmitted is in a buffer. The receiver of the send is expected to - * reply to the message and a buffer is provided into which IUCV moves - * the reply to this message. - * - * Returns: the result from the CP IUCV call. - */ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size, void *answer, size_t asize, size_t *residual); diff --git a/include/net/kcm.h b/include/net/kcm.h index 441e993be634..d9c35e71ecea 100644 --- a/include/net/kcm.h +++ b/include/net/kcm.h @@ -71,7 +71,6 @@ struct kcm_sock { struct list_head wait_psock_list; struct sk_buff *seq_skb; struct mutex tx_mutex; - u32 tx_stopped : 1; /* Don't use bit fields here, these are set under different locks */ bool tx_wait; diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index f7fe796e8429..710e98665eb3 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -59,6 +59,20 @@ int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net, int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, struct fib_lookup_arg *arg); +static inline +bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) +{ + return !(fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF) && + fl->flowi_l3mdev == iifindex; +} + +static inline +bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) +{ + return fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF && + fl->flowi_l3mdev == oifindex; +} + void l3mdev_update_flow(struct net *net, struct flowi *fl); int l3mdev_master_ifindex_rcu(const struct net_device *dev); @@ -193,18 +207,19 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb) static inline struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev; + rcu_read_lock(); + dev = skb_dst_dev_rcu(skb); if (netif_is_l3_slave(dev)) { struct net_device *master; - rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); if (master && master->l3mdev_ops->l3mdev_l3_out) skb = master->l3mdev_ops->l3mdev_l3_out(master, sk, skb, proto); - rcu_read_unlock(); } + rcu_read_unlock(); return skb; } @@ -327,6 +342,19 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl, { return 1; } + +static inline +bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex) +{ + return false; +} + +static inline +bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex) +{ + return false; +} + static inline void l3mdev_update_flow(struct net *net, struct flowi *fl) { diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h index 43574bd6612f..5d991404845e 100644 --- a/include/net/libeth/rx.h +++ b/include/net/libeth/rx.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_RX_H #define __LIBETH_RX_H @@ -13,8 +13,10 @@ /* Space reserved in front of each frame */ #define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +#define LIBETH_XDP_HEADROOM (ALIGN(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \ + NET_IP_ALIGN) /* Maximum headroom for worst-case calculations */ -#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM +#define LIBETH_MAX_HEADROOM LIBETH_XDP_HEADROOM /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ #define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) /* Maximum supported L2-L4 header length */ @@ -31,7 +33,7 @@ /** * struct libeth_fqe - structure representing an Rx buffer (fill queue element) - * @page: page holding the buffer + * @netmem: network memory reference holding the buffer * @offset: offset from the page start (to the headroom) * @truesize: total space occupied by the buffer (w/ headroom and tailroom) * @@ -40,7 +42,7 @@ * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```. */ struct libeth_fqe { - struct page *page; + netmem_ref netmem; u32 offset; u32 truesize; } __aligned_largest; @@ -66,6 +68,7 @@ enum libeth_fqe_type { * @count: number of descriptors/buffers the queue has * @type: type of the buffers this queue has * @hsplit: flag whether header split is enabled + * @xdp: flag indicating whether XDP is enabled * @buf_len: HW-writeable length per each buffer * @nid: ID of the closest NUMA node with memory */ @@ -81,6 +84,7 @@ struct libeth_fq { /* Cold fields */ enum libeth_fqe_type type:2; bool hsplit:1; + bool xdp:1; u32 buf_len; int nid; @@ -102,15 +106,16 @@ static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i) struct libeth_fqe *buf = &fq->fqes[i]; buf->truesize = fq->truesize; - buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize); - if (unlikely(!buf->page)) + buf->netmem = page_pool_dev_alloc_netmem(fq->pp, &buf->offset, + &buf->truesize); + if (unlikely(!buf->netmem)) return DMA_MAPPING_ERROR; - return page_pool_get_dma_addr(buf->page) + buf->offset + + return page_pool_get_dma_addr_netmem(buf->netmem) + buf->offset + fq->pp->p.offset; } -void libeth_rx_recycle_slow(struct page *page); +void libeth_rx_recycle_slow(netmem_ref netmem); /** * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA @@ -126,18 +131,19 @@ void libeth_rx_recycle_slow(struct page *page); static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe, u32 len) { - struct page *page = fqe->page; + netmem_ref netmem = fqe->netmem; /* Very rare, but possible case. The most common reason: * the last fragment contained FCS only, which was then * stripped by the HW. */ if (unlikely(!len)) { - libeth_rx_recycle_slow(page); + libeth_rx_recycle_slow(netmem); return false; } - page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len); + page_pool_dma_sync_netmem_for_cpu(netmem_get_pp(netmem), netmem, + fqe->offset, len); return true; } @@ -198,6 +204,53 @@ struct libeth_rx_pt { enum xdp_rss_hash_type hash_type:16; }; +/** + * struct libeth_rx_csum - checksum offload bits decoded from the Rx descriptor + * @l3l4p: detectable L3 and L4 integrity check is processed by the hardware + * @ipe: IP checksum error + * @eipe: external (outermost) IP header (only for tunels) + * @eudpe: external (outermost) UDP checksum error (only for tunels) + * @ipv6exadd: IPv6 header with extension headers + * @l4e: L4 integrity error + * @pprs: set for packets that skip checksum calculation in the HW pre parser + * @nat: the packet is a UDP tunneled packet + * @raw_csum_valid: set if raw checksum is valid + * @pad: padding to naturally align raw_csum field + * @raw_csum: raw checksum + */ +struct libeth_rx_csum { + u32 l3l4p:1; + u32 ipe:1; + u32 eipe:1; + u32 eudpe:1; + u32 ipv6exadd:1; + u32 l4e:1; + u32 pprs:1; + u32 nat:1; + + u32 raw_csum_valid:1; + u32 pad:7; + u32 raw_csum:16; +}; + +/** + * struct libeth_rqe_info - receive queue element info + * @len: packet length + * @ptype: packet type based on types programmed into the device + * @eop: whether it's the last fragment of the packet + * @rxe: MAC errors: CRC, Alignment, Oversize, Undersizes, Length error + * @vlan: C-VLAN or S-VLAN tag depending on the VLAN offload configuration + */ +struct libeth_rqe_info { + u32 len; + + u32 ptype:14; + u32 eop:1; + u32 rxe:1; + + u32 vlan:16; +}; + void libeth_rx_pt_gen_hash_type(struct libeth_rx_pt *pt); /** diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h index 35614f9523f6..c3db5c6f1641 100644 --- a/include/net/libeth/tx.h +++ b/include/net/libeth/tx.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_TX_H #define __LIBETH_TX_H @@ -12,11 +12,17 @@ /** * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion - * @LIBETH_SQE_EMPTY: unused/empty, no action required + * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX/XSk frame, no action required * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats + * @__LIBETH_SQE_XDP_START: separator between skb and XDP types + * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats + * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats + * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA + * @LIBETH_SQE_XSK_TX: &libeth_xdp_buff on XSk queue, xsk_buff_free(), stats + * @LIBETH_SQE_XSK_TX_FRAG: &libeth_xdp_buff frag on XSk queue, xsk_buff_free() */ enum libeth_sqe_type { LIBETH_SQE_EMPTY = 0U, @@ -24,6 +30,13 @@ enum libeth_sqe_type { LIBETH_SQE_SLAB, LIBETH_SQE_FRAG, LIBETH_SQE_SKB, + + __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_XMIT, + LIBETH_SQE_XDP_XMIT_FRAG, + LIBETH_SQE_XSK_TX, + LIBETH_SQE_XSK_TX_FRAG, }; /** @@ -32,6 +45,9 @@ enum libeth_sqe_type { * @rs_idx: index of the last buffer from the batch this one was sent in * @raw: slab buffer to free via kfree() * @skb: &sk_buff to consume + * @sinfo: skb shared info of an XDP_TX frame + * @xdpf: XDP frame from ::ndo_xdp_xmit() + * @xsk: XSk Rx frame from XDP_TX action * @dma: DMA address to unmap * @len: length of the mapped region to unmap * @nr_frags: number of frags in the frame this buffer belongs to @@ -46,6 +62,9 @@ struct libeth_sqe { union { void *raw; struct sk_buff *skb; + struct skb_shared_info *sinfo; + struct xdp_frame *xdpf; + struct libeth_xdp_buff *xsk; }; DEFINE_DMA_UNMAP_ADDR(dma); @@ -71,7 +90,10 @@ struct libeth_sqe { /** * struct libeth_cq_pp - completion queue poll params * @dev: &device to perform DMA unmapping + * @bq: XDP frame bulk to combine return operations * @ss: onstack NAPI stats to fill + * @xss: onstack XDPSQ NAPI stats to fill + * @xdp_tx: number of XDP-not-XSk frames processed * @napi: whether it's called from the NAPI context * * libeth uses this structure to access objects needed for performing full @@ -80,7 +102,13 @@ struct libeth_sqe { */ struct libeth_cq_pp { struct device *dev; - struct libeth_sq_napi_stats *ss; + struct xdp_frame_bulk *bq; + + union { + struct libeth_sq_napi_stats *ss; + struct libeth_xdpsq_napi_stats *xss; + }; + u32 xdp_tx; bool napi; }; @@ -126,4 +154,6 @@ static inline void libeth_tx_complete(struct libeth_sqe *sqe, sqe->type = LIBETH_SQE_EMPTY; } +void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp); + #endif /* __LIBETH_TX_H */ diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h index 603825e45133..cf1d78a9dc38 100644 --- a/include/net/libeth/types.h +++ b/include/net/libeth/types.h @@ -1,10 +1,32 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_TYPES_H #define __LIBETH_TYPES_H -#include <linux/types.h> +#include <linux/workqueue.h> + +/* Stats */ + +/** + * struct libeth_rq_napi_stats - "hot" counters to update in Rx polling loop + * @packets: received frames counter + * @bytes: sum of bytes of received frames above + * @fragments: sum of fragments of received S/G frames + * @hsplit: number of frames the device performed the header split for + * @raw: alias to access all the fields as an array + */ +struct libeth_rq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + u32 hsplit; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; /** * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop @@ -22,4 +44,84 @@ struct libeth_sq_napi_stats { }; }; +/** + * struct libeth_xdpsq_napi_stats - "hot" counters to update in XDP Tx + * completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @fragments: sum of fragments of completed S/G frames + * @raw: alias to access all the fields as an array + */ +struct libeth_xdpsq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +/* XDP */ + +/* + * The following structures should be embedded into driver's queue structure + * and passed to the libeth_xdp helpers, never used directly. + */ + +/* XDPSQ sharing */ + +/** + * struct libeth_xdpsq_lock - locking primitive for sharing XDPSQs + * @lock: spinlock for locking the queue + * @share: whether this particular queue is shared + */ +struct libeth_xdpsq_lock { + spinlock_t lock; + bool share; +}; + +/* XDPSQ clean-up timers */ + +/** + * struct libeth_xdpsq_timer - timer for cleaning up XDPSQs w/o interrupts + * @xdpsq: queue this timer belongs to + * @lock: lock for the queue + * @dwork: work performing cleanups + * + * XDPSQs not using interrupts but lazy cleaning, i.e. only when there's no + * space for sending the current queued frame/bulk, must fire up timers to + * make sure there are no stale buffers to free. + */ +struct libeth_xdpsq_timer { + void *xdpsq; + struct libeth_xdpsq_lock *lock; + + struct delayed_work dwork; +}; + +/* Rx polling path */ + +/** + * struct libeth_xdp_buff_stash - struct for stashing &xdp_buff onto a queue + * @data: pointer to the start of the frame, xdp_buff.data + * @headroom: frame headroom, xdp_buff.data - xdp_buff.data_hard_start + * @len: frame linear space length, xdp_buff.data_end - xdp_buff.data + * @frame_sz: truesize occupied by the frame, xdp_buff.frame_sz + * @flags: xdp_buff.flags + * + * &xdp_buff is 56 bytes long on x64, &libeth_xdp_buff is 64 bytes. This + * structure carries only necessary fields to save/restore a partially built + * frame on the queue structure to finish it during the next NAPI poll. + */ +struct libeth_xdp_buff_stash { + void *data; + u16 headroom; + u16 len; + + u32 frame_sz:24; + u32 flags:8; +} __aligned_largest; + #endif /* __LIBETH_TYPES_H */ diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h new file mode 100644 index 000000000000..898723ab62e8 --- /dev/null +++ b/include/net/libeth/xdp.h @@ -0,0 +1,1870 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Intel Corporation */ + +#ifndef __LIBETH_XDP_H +#define __LIBETH_XDP_H + +#include <linux/bpf_trace.h> +#include <linux/unroll.h> + +#include <net/libeth/rx.h> +#include <net/libeth/tx.h> +#include <net/xsk_buff_pool.h> + +/* + * Defined as bits to be able to use them as a mask on Rx. + * Also used as internal return values on Tx. + */ +enum { + LIBETH_XDP_PASS = 0U, + LIBETH_XDP_DROP = BIT(0), + LIBETH_XDP_ABORTED = BIT(1), + LIBETH_XDP_TX = BIT(2), + LIBETH_XDP_REDIRECT = BIT(3), +}; + +/* + * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to, + * pick maximum pointer-compatible alignment. + */ +#define __LIBETH_XDP_BUFF_ALIGN \ + (IS_ALIGNED(sizeof(struct xdp_buff_xsk), 16) ? 16 : \ + IS_ALIGNED(sizeof(struct xdp_buff_xsk), 8) ? 8 : \ + sizeof(long)) + +/** + * struct libeth_xdp_buff - libeth extension over &xdp_buff + * @base: main &xdp_buff + * @data: shortcut for @base.data + * @desc: RQ descriptor containing metadata for this buffer + * @priv: driver-private scratchspace + * + * The main reason for this is to have a pointer to the descriptor to be able + * to quickly get frame metadata from xdpmo and driver buff-to-xdp callbacks + * (as well as bigger alignment). + * Pointer/layout-compatible with &xdp_buff and &xdp_buff_xsk. + */ +struct libeth_xdp_buff { + union { + struct xdp_buff base; + void *data; + }; + + const void *desc; + unsigned long priv[] + __aligned(__LIBETH_XDP_BUFF_ALIGN); +} __aligned(__LIBETH_XDP_BUFF_ALIGN); +static_assert(offsetof(struct libeth_xdp_buff, data) == + offsetof(struct xdp_buff_xsk, xdp.data)); +static_assert(offsetof(struct libeth_xdp_buff, desc) == + offsetof(struct xdp_buff_xsk, cb)); +static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk), + __alignof(struct libeth_xdp_buff))); + +/** + * __LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: sizeof() of the driver-private data + */ +#define __LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + ___LIBETH_XDP_ONSTACK_BUFF(name, ##__VA_ARGS__) +/** + * LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: type or variable name of the driver-private data + */ +#define LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + __LIBETH_XDP_ONSTACK_BUFF(name, __libeth_xdp_priv_sz(__VA_ARGS__)) + +#define ___LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + __DEFINE_FLEX(struct libeth_xdp_buff, name, priv, \ + LIBETH_XDP_PRIV_SZ(__VA_ARGS__ + 0), \ + __uninitialized); \ + LIBETH_XDP_ASSERT_PRIV_SZ(__VA_ARGS__ + 0) + +#define __libeth_xdp_priv_sz(...) \ + CONCATENATE(__libeth_xdp_psz, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#define __libeth_xdp_psz0(...) +#define __libeth_xdp_psz1(...) sizeof(__VA_ARGS__) + +#define LIBETH_XDP_PRIV_SZ(sz) \ + (ALIGN(sz, __alignof(struct libeth_xdp_buff)) / sizeof(long)) + +/* Performs XSK_CHECK_PRIV_TYPE() */ +#define LIBETH_XDP_ASSERT_PRIV_SZ(sz) \ + static_assert(offsetofend(struct xdp_buff_xsk, cb) >= \ + struct_size_t(struct libeth_xdp_buff, priv, \ + LIBETH_XDP_PRIV_SZ(sz))) + +/* XDPSQ sharing */ + +DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share); + +/** + * libeth_xdpsq_num - calculate optimal number of XDPSQs for this device + sys + * @rxq: current number of active Rx queues + * @txq: current number of active Tx queues + * @max: maximum number of Tx queues + * + * Each RQ must have its own XDPSQ for XSk pairs, each CPU must have own XDPSQ + * for lockless sending (``XDP_TX``, .ndo_xdp_xmit()). Cap the maximum of these + * two with the number of SQs the device can have (minus used ones). + * + * Return: number of XDP Tx queues the device needs to use. + */ +static inline u32 libeth_xdpsq_num(u32 rxq, u32 txq, u32 max) +{ + return min(max(nr_cpu_ids, rxq), max - txq); +} + +/** + * libeth_xdpsq_shared - whether XDPSQs can be shared between several CPUs + * @num: number of active XDPSQs + * + * Return: true if there's no 1:1 XDPSQ/CPU association, false otherwise. + */ +static inline bool libeth_xdpsq_shared(u32 num) +{ + return num < nr_cpu_ids; +} + +/** + * libeth_xdpsq_id - get XDPSQ index corresponding to this CPU + * @num: number of active XDPSQs + * + * Helper for libeth_xdp routines, do not use in drivers directly. + * + * Return: XDPSQ index needs to be used on this CPU. + */ +static inline u32 libeth_xdpsq_id(u32 num) +{ + u32 ret = raw_smp_processor_id(); + + if (static_branch_unlikely(&libeth_xdpsq_share) && + libeth_xdpsq_shared(num)) + ret %= num; + + return ret; +} + +void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); +void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); + +/** + * libeth_xdpsq_get - initialize &libeth_xdpsq_lock + * @lock: lock to initialize + * @dev: netdev which this lock belongs to + * @share: whether XDPSQs can be shared + * + * Tracks the current XDPSQ association and enables the static lock + * if needed. + */ +static inline void libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev, + bool share) +{ + if (unlikely(share)) + __libeth_xdpsq_get(lock, dev); +} + +/** + * libeth_xdpsq_put - deinitialize &libeth_xdpsq_lock + * @lock: lock to deinitialize + * @dev: netdev which this lock belongs to + * + * Tracks the current XDPSQ association and disables the static lock + * if needed. + */ +static inline void libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_put(lock, dev); +} + +void __libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock); +void __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock); + +/** + * libeth_xdpsq_lock - grab &libeth_xdpsq_lock if needed + * @lock: lock to take + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. + */ +static inline void libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_lock(lock); +} + +/** + * libeth_xdpsq_unlock - free &libeth_xdpsq_lock if needed + * @lock: lock to free + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. + */ +static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_unlock(lock); +} + +/* XDPSQ clean-up timers */ + +void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq, + struct libeth_xdpsq_lock *lock, + void (*poll)(struct work_struct *work)); + +/** + * libeth_xdpsq_deinit_timer - deinitialize &libeth_xdpsq_timer + * @timer: timer to deinitialize + * + * Flush and disable the underlying workqueue. + */ +static inline void libeth_xdpsq_deinit_timer(struct libeth_xdpsq_timer *timer) +{ + cancel_delayed_work_sync(&timer->dwork); +} + +/** + * libeth_xdpsq_queue_timer - run &libeth_xdpsq_timer + * @timer: timer to queue + * + * Should be called after the queue was filled and the transmission was run + * to complete the pending buffers if no further sending will be done in a + * second (-> lazy cleaning won't happen). + * If the timer was already run, it will be requeued back to one second + * timeout again. + */ +static inline void libeth_xdpsq_queue_timer(struct libeth_xdpsq_timer *timer) +{ + mod_delayed_work_on(raw_smp_processor_id(), system_bh_highpri_wq, + &timer->dwork, HZ); +} + +/** + * libeth_xdpsq_run_timer - wrapper to run a queue clean-up on a timer event + * @work: workqueue belonging to the corresponding timer + * @poll: driver-specific completion queue poll function + * + * Run the polling function on the locked queue and requeue the timer if + * there's more work to do. + * Designed to be used via LIBETH_XDP_DEFINE_TIMER() below. + */ +static __always_inline void +libeth_xdpsq_run_timer(struct work_struct *work, + u32 (*poll)(void *xdpsq, u32 budget)) +{ + struct libeth_xdpsq_timer *timer = container_of(work, typeof(*timer), + dwork.work); + + libeth_xdpsq_lock(timer->lock); + + if (poll(timer->xdpsq, U32_MAX)) + libeth_xdpsq_queue_timer(timer); + + libeth_xdpsq_unlock(timer->lock); +} + +/* Common Tx bits */ + +/** + * enum - libeth_xdp internal Tx flags + * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue + * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled + * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent + * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit() + * @LIBETH_XDP_TX_XSK: whether the function is called for ``XDP_TX`` for XSk + */ +enum { + LIBETH_XDP_TX_BULK = DEV_MAP_BULK_SIZE, + LIBETH_XDP_TX_BATCH = 8, + + LIBETH_XDP_TX_DROP = BIT(0), + LIBETH_XDP_TX_NDO = BIT(1), + LIBETH_XDP_TX_XSK = BIT(2), +}; + +/** + * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags + * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length + * @LIBETH_XDP_TX_CSUM: for XSk xmit, enable checksum offload + * @LIBETH_XDP_TX_XSKMD: for XSk xmit, mask of the metadata bits + * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame + * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame + * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags + * @LIBETH_XDP_TX_FLAGS: only for ``XDP_TX``, [31:16] of ::len_fl is flags + */ +enum { + LIBETH_XDP_TX_LEN = GENMASK(15, 0), + + LIBETH_XDP_TX_CSUM = XDP_TXMD_FLAGS_CHECKSUM, + LIBETH_XDP_TX_XSKMD = LIBETH_XDP_TX_LEN, + + LIBETH_XDP_TX_FIRST = BIT(16), + LIBETH_XDP_TX_LAST = BIT(17), + LIBETH_XDP_TX_MULTI = BIT(18), + + LIBETH_XDP_TX_FLAGS = GENMASK(31, 16), +}; + +/** + * struct libeth_xdp_tx_frame - represents one XDP Tx element + * @data: frame start pointer for ``XDP_TX`` + * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed + * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info + * @frag: one (non-head) frag for ``XDP_TX`` + * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit() + * @dma: DMA address of the non-head frag for .ndo_xdp_xmit() + * @xsk: ``XDP_TX`` for XSk, XDP buffer for any frag + * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit() + * @flags: Tx flags for the above + * @opts: combined @len + @flags for the above for speed + * @desc: XSk xmit descriptor for direct casting + */ +struct libeth_xdp_tx_frame { + union { + /* ``XDP_TX`` */ + struct { + void *data; + u32 len_fl; + u32 soff; + }; + + /* ``XDP_TX`` frag */ + skb_frag_t frag; + + /* .ndo_xdp_xmit(), XSk ``XDP_TX`` */ + struct { + union { + struct xdp_frame *xdpf; + dma_addr_t dma; + + struct libeth_xdp_buff *xsk; + }; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; + }; + + /* XSk xmit */ + struct xdp_desc desc; + }; +} __aligned(sizeof(struct xdp_desc)); +static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == + offsetof(struct libeth_xdp_tx_frame, len_fl)); +static_assert(sizeof(struct libeth_xdp_tx_frame) == sizeof(struct xdp_desc)); + +/** + * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending + * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit() + * @dev: &net_device which the frames are transmitted on + * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure + * @act_mask: Rx only, mask of all the XDP prog verdicts for that NAPI session + * @count: current number of frames in @bulk + * @bulk: array of queued frames for bulk Tx + * + * All XDP Tx operations except XSk xmit queue each frame to the bulk first + * and flush it when @count reaches the array end. Bulk is always placed on + * the stack for performance. One bulk element contains all the data necessary + * for sending a frame and then freeing it on completion. + * For XSk xmit, Tx descriptor array from &xsk_buff_pool is casted directly + * to &libeth_xdp_tx_frame as they are compatible and the bulk structure is + * not used. + */ +struct libeth_xdp_tx_bulk { + const struct bpf_prog *prog; + struct net_device *dev; + void *xdpsq; + + u32 act_mask; + u32 count; + struct libeth_xdp_tx_frame bulk[LIBETH_XDP_TX_BULK]; +} __aligned(sizeof(struct libeth_xdp_tx_frame)); + +/** + * LIBETH_XDP_ONSTACK_BULK - declare &libeth_xdp_tx_bulk on the stack + * @bq: name of the variable to declare + * + * Helper to declare a bulk on the stack with a compiler hint that it should + * not be initialized automatically (with `CONFIG_INIT_STACK_ALL_*`) for + * performance reasons. + */ +#define LIBETH_XDP_ONSTACK_BULK(bq) \ + struct libeth_xdp_tx_bulk bq __uninitialized + +/** + * struct libeth_xdpsq - abstraction for an XDPSQ + * @pool: XSk buffer pool for XSk ``XDP_TX`` and xmit + * @sqes: array of Tx buffers from the actual queue struct + * @descs: opaque pointer to the HW descriptor array + * @ntu: pointer to the next free descriptor index + * @count: number of descriptors on that queue + * @pending: pointer to the number of sent-not-completed descs on that queue + * @xdp_tx: pointer to the above, but only for non-XSk-xmit frames + * @lock: corresponding XDPSQ lock + * + * Abstraction for driver-independent implementation of Tx. Placed on the stack + * and filled by the driver before the transmission, so that the generic + * functions can access and modify driver-specific resources. + */ +struct libeth_xdpsq { + struct xsk_buff_pool *pool; + struct libeth_sqe *sqes; + void *descs; + + u32 *ntu; + u32 count; + + u32 *pending; + u32 *xdp_tx; + struct libeth_xdpsq_lock *lock; +}; + +/** + * struct libeth_xdp_tx_desc - abstraction for an XDP Tx descriptor + * @addr: DMA address of the frame + * @len: length of the frame + * @flags: XDP Tx flags + * @opts: combined @len + @flags for speed + * + * Filled by the generic functions and then passed to driver-specific functions + * to fill a HW Tx descriptor, always placed on the [function] stack. + */ +struct libeth_xdp_tx_desc { + dma_addr_t addr; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; +} __aligned_largest; + +/** + * libeth_xdp_ptr_to_priv - convert pointer to a libeth_xdp u64 priv + * @ptr: pointer to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when you want to pass a pointer there. + */ +#define libeth_xdp_ptr_to_priv(ptr) ({ \ + typecheck_pointer(ptr); \ + ((u64)(uintptr_t)(ptr)); \ +}) +/** + * libeth_xdp_priv_to_ptr - convert libeth_xdp u64 priv to a pointer + * @priv: private data to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when your callback takes this u64 and you want to convert + * it back to a pointer. + */ +#define libeth_xdp_priv_to_ptr(priv) ({ \ + static_assert(__same_type(priv, u64)); \ + ((const void *)(uintptr_t)(priv)); \ +}) + +/* + * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len + * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead. + */ +#ifdef __LITTLE_ENDIAN +#define __LIBETH_WORD_ACCESS 1 +#endif +#ifdef __LIBETH_WORD_ACCESS +#define __libeth_xdp_tx_len(flen, ...) \ + .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0))) +#else +#define __libeth_xdp_tx_len(flen, ...) \ + .len = (flen), .flags = (__VA_ARGS__ + 0) +#endif + +/** + * libeth_xdp_tx_xmit_bulk - main XDP Tx function + * @bulk: array of frames to send + * @xdpsq: pointer to the driver-specific XDPSQ struct + * @n: number of frames to send + * @unroll: whether to unroll the queue filling loop for speed + * @priv: driver-specific private data + * @prep: callback for cleaning the queue and filling abstract &libeth_xdpsq + * @fill: internal callback for filling &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: callback for filling a HW descriptor with the frame info + * + * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for + * all types of frames: ``XDP_TX``, .ndo_xdp_xmit(), XSk ``XDP_TX``, and XSk + * xmit. + * @prep must lock the queue as this function releases it at the end. @unroll + * greatly increases the object code size, but also greatly increases XSk xmit + * performance; for other types of frames, it's not enabled. + * The compilers inline all those onstack abstractions to direct data accesses. + * + * Return: number of frames actually placed on the queue, <= @n. The function + * can't fail, but can send less frames if there's no enough free descriptors + * available. The actual free space is returned by @prep from the driver. + */ +static __always_inline __nocfi_generic u32 +libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq, + u32 n, bool unroll, u64 priv, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv)) +{ + struct libeth_xdpsq sq __uninitialized; + u32 this, batched, off = 0; + u32 ntu, i = 0; + + n = min(n, prep(xdpsq, &sq)); + if (unlikely(!n)) + goto unlock; + + ntu = *sq.ntu; + + this = sq.count - ntu; + if (likely(this > n)) + this = n; + +again: + if (!unroll) + goto linear; + + batched = ALIGN_DOWN(this, LIBETH_XDP_TX_BATCH); + + for ( ; i < off + batched; i += LIBETH_XDP_TX_BATCH) { + u32 base = ntu + i - off; + + unrolled_count(LIBETH_XDP_TX_BATCH) + for (u32 j = 0; j < LIBETH_XDP_TX_BATCH; j++) + xmit(fill(bulk[i + j], base + j, &sq, priv), + base + j, &sq, priv); + } + + if (batched < this) { +linear: + for ( ; i < off + this; i++) + xmit(fill(bulk[i], ntu + i - off, &sq, priv), + ntu + i - off, &sq, priv); + } + + ntu += this; + if (likely(ntu < sq.count)) + goto out; + + ntu = 0; + + if (i < n) { + this = n - i; + off = i; + + goto again; + } + +out: + *sq.ntu = ntu; + *sq.pending += n; + if (sq.xdp_tx) + *sq.xdp_tx += n; + +unlock: + libeth_xdpsq_unlock(sq.lock); + + return n; +} + +/* ``XDP_TX`` bulking */ + +void libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp); + +/** + * libeth_xdp_tx_queue_head - internal helper for queueing one ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XDP buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. + */ +static inline bool libeth_xdp_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + const struct libeth_xdp_buff *xdp) +{ + const struct xdp_buff *base = &xdp->base; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .data = xdp->data, + .len_fl = (base->data_end - xdp->data) | LIBETH_XDP_TX_FIRST, + .soff = xdp_data_hard_end(base) - xdp->data, + }; + + if (!xdp_buff_has_frags(base)) + return false; + + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xdp_tx_queue_frag - internal helper for queueing one ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + */ +static inline void libeth_xdp_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag) +{ + bq->bulk[bq->count++].frag = *frag; +} + +/** + * libeth_xdp_tx_queue_bulk - internal helper for queueing one ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XDP buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. + */ +static __always_inline bool +libeth_xdp_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + const struct skb_shared_info *sinfo; + bool ret = true; + u32 nr_frags; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + libeth_xdp_return_buff_slow(xdp); + return false; + } + + if (!libeth_xdp_tx_queue_head(bq, xdp)) + goto out; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + nr_frags = sinfo->nr_frags; + + for (u32 i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + ret = false; + break; + } + + libeth_xdp_tx_queue_frag(bq, &sinfo->frags[i]); + } + +out: + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_LAST; + xdp->data = NULL; + + return ret; +} + +/** + * libeth_xdp_tx_fill_stats - fill &libeth_sqe with ``XDP_TX`` frame stats + * @sqe: SQ element to fill + * @desc: libeth_xdp Tx descriptor + * @sinfo: &skb_shared_info for this frame + * + * Internal helper for filling an SQE with the frame stats, do not use in + * drivers. Fills the number of frags and bytes for this frame. + */ +#define libeth_xdp_tx_fill_stats(sqe, desc, sinfo) \ + __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, __UNIQUE_ID(sqe_), \ + __UNIQUE_ID(desc_), __UNIQUE_ID(sinfo_)) + +#define __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, ue, ud, us) do { \ + const struct libeth_xdp_tx_desc *ud = (desc); \ + const struct skb_shared_info *us; \ + struct libeth_sqe *ue = (sqe); \ + \ + ue->nr_frags = 1; \ + ue->bytes = ud->len; \ + \ + if (ud->flags & LIBETH_XDP_TX_MULTI) { \ + us = (sinfo); \ + ue->nr_frags += us->nr_frags; \ + ue->bytes += us->xdp_frags_size; \ + } \ +} while (0) + +/** + * libeth_xdp_tx_fill_buf - internal helper to fill one ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct skb_shared_info *sinfo; + skb_frag_t *frag = &frm.frag; + struct libeth_sqe *sqe; + netmem_ref netmem; + + if (frm.len_fl & LIBETH_XDP_TX_FIRST) { + sinfo = frm.data + frm.soff; + skb_frag_fill_netmem_desc(frag, virt_to_netmem(frm.data), + offset_in_page(frm.data), + frm.len_fl); + } else { + sinfo = NULL; + } + + netmem = skb_frag_netmem(frag); + desc = (typeof(desc)){ + .addr = page_pool_get_dma_addr_netmem(netmem) + + skb_frag_off(frag), + .len = skb_frag_size(frag) & LIBETH_XDP_TX_LEN, + .flags = skb_frag_size(frag) & LIBETH_XDP_TX_FLAGS, + }; + + dma_sync_single_for_device(__netmem_get_pp(netmem)->p.dev, desc.addr, + desc.len, DMA_BIDIRECTIONAL); + + if (!sinfo) + return desc; + + sqe = &sq->sqes[i]; + sqe->type = LIBETH_SQE_XDP_TX; + sqe->sinfo = sinfo; + libeth_xdp_tx_fill_stats(sqe, &desc, sinfo); + + return desc; +} + +void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, + u32 flags); + +/** + * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk + * @bq: bulk to flush + * @flags: XDP TX flags (.ndo_xdp_xmit(), XSk etc.) + * @prep: driver-specific callback to prepare the queue for sending + * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: driver callback to fill a HW descriptor + * + * Internal abstraction to create bulk flush functions for drivers. Used for + * everything except XSk xmit. + * + * Return: true if anything was sent, false otherwise. + */ +static __always_inline bool +__libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, + u64 priv)) +{ + u32 sent, drops; + int err = 0; + + sent = libeth_xdp_tx_xmit_bulk(bq->bulk, bq->xdpsq, + min(bq->count, LIBETH_XDP_TX_BULK), + false, 0, prep, fill, xmit); + drops = bq->count - sent; + + if (unlikely(drops)) { + libeth_xdp_tx_exception(bq, sent, flags); + err = -ENXIO; + } else { + bq->count = 0; + } + + trace_xdp_bulk_tx(bq->dev, sent, drops, err); + + return likely(sent); +} + +/** + * libeth_xdp_tx_flush_bulk - wrapper to define flush of one ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see above + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_TX() to define an ``XDP_TX`` driver + * callback. + */ +#define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf, \ + xmit) + +/* .ndo_xdp_xmit() implementation */ + +/** + * libeth_xdp_xmit_init_bulk - internal helper to initialize bulk for XDP xmit + * @bq: bulk to initialize + * @dev: target &net_device + * @xdpsqs: array of driver-specific XDPSQ structs + * @num: number of active XDPSQs (the above array length) + */ +#define libeth_xdp_xmit_init_bulk(bq, dev, xdpsqs, num) \ + __libeth_xdp_xmit_init_bulk(bq, dev, (xdpsqs)[libeth_xdpsq_id(num)]) + +static inline void __libeth_xdp_xmit_init_bulk(struct libeth_xdp_tx_bulk *bq, + struct net_device *dev, + void *xdpsq) +{ + bq->dev = dev; + bq->xdpsq = xdpsq; + bq->count = 0; +} + +/** + * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame + * @xf: pointer to the XDP frame + * + * There's no place in &libeth_xdp_tx_frame to store DMA address for an + * &xdp_frame head. The headroom is used then, the address is placed right + * after the frame struct, naturally aligned. + * + * Return: pointer to the DMA address to use. + */ +#define libeth_xdp_xmit_frame_dma(xf) \ + _Generic((xf), \ + const struct xdp_frame *: \ + (const dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf), \ + struct xdp_frame *: \ + (dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf) \ + ) + +static inline void *__libeth_xdp_xmit_frame_dma(const struct xdp_frame *xdpf) +{ + void *addr = (void *)(xdpf + 1); + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + __alignof(*xdpf) < sizeof(dma_addr_t)) + addr = PTR_ALIGN(addr, sizeof(dma_addr_t)); + + return addr; +} + +/** + * libeth_xdp_xmit_queue_head - internal helper for queueing one XDP xmit head + * @bq: XDP Tx bulk to queue the head frag to + * @xdpf: XDP frame with the head to queue + * @dev: device to perform DMA mapping + * + * Return: ``LIBETH_XDP_DROP`` on DMA mapping error, + * ``LIBETH_XDP_PASS`` if it's the only frag in the frame, + * ``LIBETH_XDP_TX`` if it's an S/G frame. + */ +static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + struct device *dev) +{ + dma_addr_t dma; + + dma = dma_map_single(dev, xdpf->data, xdpf->len, DMA_TO_DEVICE); + if (dma_mapping_error(dev, dma)) + return LIBETH_XDP_DROP; + + *libeth_xdp_xmit_frame_dma(xdpf) = dma; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xdpf = xdpf, + __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST), + }; + + if (!xdp_frame_has_frags(xdpf)) + return LIBETH_XDP_PASS; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return LIBETH_XDP_TX; +} + +/** + * libeth_xdp_xmit_queue_frag - internal helper for queueing one XDP xmit frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + * @dev: device to perform DMA mapping + * + * Return: true on success, false on DMA mapping error. + */ +static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag, + struct device *dev) +{ + dma_addr_t dma; + + dma = skb_frag_dma_map(dev, frag); + if (dma_mapping_error(dev, dma)) + return false; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .dma = dma, + __libeth_xdp_tx_len(skb_frag_size(frag)), + }; + + return true; +} + +/** + * libeth_xdp_xmit_queue_bulk - internal helper for queueing one XDP xmit frame + * @bq: XDP Tx bulk to queue the frame to + * @xdpf: XDP frame to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: ``LIBETH_XDP_TX`` on success, + * ``LIBETH_XDP_DROP`` if the frame should be dropped by the stack, + * ``LIBETH_XDP_ABORTED`` if the frame will be dropped by libeth_xdp. + */ +static __always_inline u32 +libeth_xdp_xmit_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 head, nr_frags, i, ret = LIBETH_XDP_TX; + struct device *dev = bq->dev->dev.parent; + const struct skb_shared_info *sinfo; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + return LIBETH_XDP_DROP; + + head = libeth_xdp_xmit_queue_head(bq, xdpf, dev); + if (head == LIBETH_XDP_PASS) + goto out; + else if (head == LIBETH_XDP_DROP) + return LIBETH_XDP_DROP; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + nr_frags = sinfo->nr_frags; + + for (i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + break; + + if (!libeth_xdp_xmit_queue_frag(bq, &sinfo->frags[i], dev)) + break; + } + + if (unlikely(i < nr_frags)) + ret = LIBETH_XDP_ABORTED; + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xdp_xmit_fill_buf - internal helper to fill one XDP xmit &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the mapped DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct libeth_sqe *sqe; + struct xdp_frame *xdpf; + + if (frm.flags & LIBETH_XDP_TX_FIRST) { + xdpf = frm.xdpf; + desc.addr = *libeth_xdp_xmit_frame_dma(xdpf); + } else { + xdpf = NULL; + desc.addr = frm.dma; + } + desc.opts = frm.opts; + + sqe = &sq->sqes[i]; + dma_unmap_addr_set(sqe, dma, desc.addr); + dma_unmap_len_set(sqe, len, desc.len); + + if (!xdpf) { + sqe->type = LIBETH_SQE_XDP_XMIT_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XDP_XMIT; + sqe->xdpf = xdpf; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_frame(xdpf)); + + return desc; +} + +/** + * libeth_xdp_xmit_flush_bulk - wrapper to define flush of one XDP xmit bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_XMIT() to define an XDP xmit driver + * callback. + */ +#define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep, \ + libeth_xdp_xmit_fill_buf, xmit) + +u32 libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count, const struct net_device *dev); + +/** + * __libeth_xdp_xmit_do_bulk - internal function to implement .ndo_xdp_xmit() + * @bq: XDP Tx bulk to queue frames to + * @frames: XDP frames passed by the stack + * @n: number of frames + * @flags: flags passed by the stack + * @flush_bulk: driver callback to flush an XDP xmit bulk + * @finalize: driver callback to finalize sending XDP Tx frames on the queue + * + * Perform common checks, map the frags and queue them to the bulk, then flush + * the bulk to the XDPSQ. If requested by the stack, finalize the queue. + * + * Return: number of frames send or -errno on error. + */ +static __always_inline int +__libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame **frames, u32 n, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + u32 nxmit = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + for (u32 i = 0; likely(i < n); i++) { + u32 ret; + + ret = libeth_xdp_xmit_queue_bulk(bq, frames[i], flush_bulk); + if (unlikely(ret != LIBETH_XDP_TX)) { + nxmit += ret == LIBETH_XDP_ABORTED; + break; + } + + nxmit++; + } + + if (bq->count) { + flush_bulk(bq, LIBETH_XDP_TX_NDO); + if (unlikely(bq->count)) + nxmit -= libeth_xdp_xmit_return_bulk(bq->bulk, + bq->count, + bq->dev); + } + + finalize(bq->xdpsq, nxmit, flags & XDP_XMIT_FLUSH); + + return nxmit; +} + +/** + * libeth_xdp_xmit_do_bulk - implement full .ndo_xdp_xmit() in driver + * @dev: target &net_device + * @n: number of frames to send + * @fr: XDP frames to send + * @f: flags passed by the stack + * @xqs: array of XDPSQs driver structs + * @nqs: number of active XDPSQs, the above array length + * @fl: driver callback to flush an XDP xmit bulk + * @fin: driver cabback to finalize the queue + * + * If the driver has active XDPSQs, perform common checks and send the frames. + * Finalize the queue, if requested. + * + * Return: number of frames sent or -errno on error. + */ +#define libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin) \ + _libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(ret_), \ + __UNIQUE_ID(nqs_)) + +#define _libeth_xdp_xmit_do_bulk(d, n, fr, f, xqs, nqs, fl, fin, ub, ur, un) \ +({ \ + u32 un = (nqs); \ + int ur; \ + \ + if (likely(un)) { \ + LIBETH_XDP_ONSTACK_BULK(ub); \ + \ + libeth_xdp_xmit_init_bulk(&ub, d, xqs, un); \ + ur = __libeth_xdp_xmit_do_bulk(&ub, fr, n, f, fl, fin); \ + } else { \ + ur = -ENXIO; \ + } \ + \ + ur; \ +}) + +/* Rx polling path */ + +/** + * libeth_xdp_tx_init_bulk - initialize an XDP Tx bulk for Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (can be %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. If @num == 0, + * assumes XDP is not enabled. + * Do not use for XSk, it has its own optimized helper. + */ +#define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, xsk, ub, un) do { \ + typeof(bq) ub = (bq); \ + u32 un = (num); \ + \ + rcu_read_lock(); \ + \ + if (un || (xsk)) { \ + ub->prog = rcu_dereference(pr); \ + ub->dev = (d); \ + ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)]; \ + } else { \ + ub->prog = NULL; \ + } \ + \ + ub->act_mask = 0; \ + ub->count = 0; \ +} while (0) + +void libeth_xdp_load_stash(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src); +void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src); +void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash); + +/** + * libeth_xdp_init_buff - initialize a &libeth_xdp_buff for Rx NAPI poll + * @dst: onstack buffer to initialize + * @src: XDP buffer stash placed on the queue + * @rxq: registered &xdp_rxq_info corresponding to this queue + * + * Should be called before the main NAPI polling loop. Loads the content of + * the previously saved stash or initializes the buffer from scratch. + * Do not use for XSk. + */ +static inline void +libeth_xdp_init_buff(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src, + struct xdp_rxq_info *rxq) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_load_stash(dst, src); + + dst->base.rxq = rxq; +} + +/** + * libeth_xdp_save_buff - save a partially built buffer on a queue + * @dst: XDP buffer stash placed on the queue + * @src: onstack buffer to save + * + * Should be called after the main NAPI polling loop. If the loop exited before + * the buffer was finished, saves its content on the queue, so that it can be + * completed during the next poll. Otherwise, clears the stash. + */ +static inline void libeth_xdp_save_buff(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_save_stash(dst, src); +} + +/** + * libeth_xdp_return_stash - free an XDP buffer stash from a queue + * @stash: stash to free + * + * If the queue is about to be destroyed, but it still has an incompleted + * buffer stash, this helper should be called to free it. + */ +static inline void libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash) +{ + if (stash->data) + __libeth_xdp_return_stash(stash); +} + +static inline void libeth_xdp_return_va(const void *data, bool napi) +{ + netmem_ref netmem = virt_to_netmem(data); + + page_pool_put_full_netmem(__netmem_get_pp(netmem), netmem, napi); +} + +static inline void libeth_xdp_return_frags(const struct skb_shared_info *sinfo, + bool napi) +{ + for (u32 i = 0; i < sinfo->nr_frags; i++) { + netmem_ref netmem = skb_frag_netmem(&sinfo->frags[i]); + + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi); + } +} + +/** + * libeth_xdp_return_buff - free/recycle &libeth_xdp_buff + * @xdp: buffer to free + * + * Hotpath helper to free &libeth_xdp_buff. Comparing to xdp_return_buff(), + * it's faster as it gets inlined and always assumes order-0 pages and safe + * direct recycling. Zeroes @xdp->data to avoid UAFs. + */ +#define libeth_xdp_return_buff(xdp) __libeth_xdp_return_buff(xdp, true) + +static inline void __libeth_xdp_return_buff(struct libeth_xdp_buff *xdp, + bool napi) +{ + if (!xdp_buff_has_frags(&xdp->base)) + goto out; + + libeth_xdp_return_frags(xdp_get_shared_info_from_buff(&xdp->base), + napi); + +out: + libeth_xdp_return_va(xdp->data, napi); + xdp->data = NULL; +} + +bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len); + +/** + * libeth_xdp_prepare_buff - fill &libeth_xdp_buff with head FQE data + * @xdp: XDP buffer to attach the head to + * @fqe: FQE containing the head buffer + * @len: buffer len passed from HW + * + * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer + * head with the Rx buffer data: data pointer, length, headroom, and + * truesize/tailroom. Zeroes the flags. + */ +static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + const struct page *page = __netmem_to_page(fqe->netmem); + + xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, + pp_page_to_nmdesc(page)->pp->p.offset, len, true); + xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); +} + +/** + * libeth_xdp_process_buff - attach Rx buffer to &libeth_xdp_buff + * @xdp: XDP buffer to attach the Rx buffer to + * @fqe: Rx buffer to process + * @len: received data length from the descriptor + * + * If the XDP buffer is empty, attaches the Rx buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. + * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: true on success, false if the descriptor must be skipped (empty or + * no space for a new frag). + */ +static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + if (!libeth_rx_sync_for_cpu(fqe, len)) + return false; + + if (xdp->data) + return libeth_xdp_buff_add_frag(xdp, fqe, len); + + libeth_xdp_prepare_buff(xdp, fqe, len); + + prefetch(xdp->data); + + return true; +} + +/** + * libeth_xdp_buff_stats_frags - update onstack RQ stats with XDP frags info + * @ss: onstack stats to update + * @xdp: buffer to account + * + * Internal helper used by __libeth_xdp_run_pass(), do not call directly. + * Adds buffer's frags count and total len to the onstack stats. + */ +static inline void +libeth_xdp_buff_stats_frags(struct libeth_rq_napi_stats *ss, + const struct libeth_xdp_buff *xdp) +{ + const struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + ss->bytes += sinfo->xdp_frags_size; + ss->fragments += sinfo->nr_frags + 1; +} + +u32 libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + enum xdp_action act, int ret); + +/** + * __libeth_xdp_run_prog - run XDP program on an XDP buffer + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program. Handles ``XDP_DROP`` + * and ``XDP_REDIRECT`` only, the rest is processed levels up. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xdp_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act < XDP_DROP || act > XDP_REDIRECT)) + goto out; + + switch (act) { + case XDP_PASS: + return LIBETH_XDP_PASS; + case XDP_DROP: + libeth_xdp_return_buff(xdp); + + return LIBETH_XDP_DROP; + case XDP_TX: + return LIBETH_XDP_TX; + case XDP_REDIRECT: + if (unlikely(xdp_do_redirect(bq->dev, &xdp->base, bq->prog))) + break; + + xdp->data = NULL; + + return LIBETH_XDP_REDIRECT; + default: + break; + } + +out: + return libeth_xdp_prog_exception(bq, xdp, act, 0); +} + +/** + * __libeth_xdp_run_flush - run XDP program and handle ``XDP_TX`` verdict + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * @run: internal callback for running XDP program + * @queue: internal callback for queuing ``XDP_TX`` frame + * @flush_bulk: driver callback for flushing a bulk + * + * Internal inline abstraction to run XDP program and additionally handle + * ``XDP_TX`` verdict. Used by both XDP and XSk, hence @run and @queue. + * Do not use directly. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xdp_run_flush(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, + u32 (*run)(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq), + bool (*queue)(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk) + (struct libeth_xdp_tx_bulk *bq, + u32 flags)), + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 act; + + act = run(xdp, bq); + if (act == LIBETH_XDP_TX && unlikely(!queue(bq, xdp, flush_bulk))) + act = LIBETH_XDP_DROP; + + bq->act_mask |= act; + + return act; +} + +/** + * libeth_xdp_run_prog - run XDP program (non-XSk path) and handle all verdicts + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. XSk has its + * own version. + * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}(). + * + * Return: true if the buffer should be passed up the stack, false if the poll + * should go to the next buffer. + */ +#define libeth_xdp_run_prog(xdp, bq, fl) \ + (__libeth_xdp_run_flush(xdp, bq, __libeth_xdp_run_prog, \ + libeth_xdp_tx_queue_bulk, \ + fl) == LIBETH_XDP_PASS) + +/** + * __libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XDP buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction that does the following (non-XSk path): + * 1) adds frame size and frag number (if needed) to the onstack stats; + * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff + * 3) runs XDP program if present; + * 4) handles all possible verdicts; + * 5) on ``XDP_PASS`, builds an skb from the buffer; + * 6) populates it with the descriptor metadata; + * 7) passes it up the stack. + * + * In most cases, number 2 means just writing the pointer to the HW descriptor + * to the XDP buffer. If so, please use LIBETH_XDP_DEFINE_RUN{,_PASS}() + * wrappers to build a driver function. + */ +static __always_inline void +__libeth_xdp_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + bool (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (xdp_buff_has_frags(&xdp->base)) + libeth_xdp_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + if (!bq || !run || !bq->prog) + goto build; + + if (!run(xdp, bq)) + return; + +build: + skb = xdp_build_skb_from_buff(&xdp->base); + if (unlikely(!skb)) { + libeth_xdp_return_buff_slow(xdp); + return; + } + + xdp->data = NULL; + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return; + } + + napi_gro_receive(napi, skb); +} + +static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp, + const void *desc) +{ + xdp->desc = desc; +} + +/** + * libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @ss: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. + */ +#define libeth_xdp_run_pass(xdp, bq, napi, ss, desc, run, populate) \ + __libeth_xdp_run_pass(xdp, bq, napi, ss, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop (non-XSk) + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xdp_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, 0, flush, finalize) + +static __always_inline void +__libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + if (bq->act_mask & LIBETH_XDP_TX) { + if (bq->count) + flush_bulk(bq, flags | LIBETH_XDP_TX_DROP); + finalize(bq->xdpsq, true, true); + } + if (bq->act_mask & LIBETH_XDP_REDIRECT) + xdp_do_flush(); + + rcu_read_unlock(); +} + +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XDP_DEFINE_FLUSH_TX(static driver_xdp_flush_tx, driver_xdp_tx_prep, + * driver_xdp_xmit); + * LIBETH_XDP_DEFINE_RUN(static driver_xdp_run, driver_xdp_run_prog, + * driver_xdp_flush_tx, driver_populate_skb); + * LIBETH_XDP_DEFINE_FINALIZE(static driver_xdp_finalize_rx, + * driver_xdp_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... + * driver_xdp_run(xdp, &bq, napi, &rs, desc); + * } + * driver_xdp_finalize_rx(&bq); + */ + +#define LIBETH_XDP_DEFINE_START() \ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wold-style-declaration", \ + "Allow specifying \'static\' after the return type") + +/** + * LIBETH_XDP_DEFINE_TIMER - define a driver XDPSQ cleanup timer callback + * @name: name of the function to define + * @poll: Tx polling/completion function + */ +#define LIBETH_XDP_DEFINE_TIMER(name, poll) \ +void name(struct work_struct *work) \ +{ \ + libeth_xdpsq_run_timer(work, poll); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_TX - define a driver ``XDP_TX`` bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xdp) + +#define __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, pfx) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_##pfx##_tx_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_XMIT - define a driver XDP xmit bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_XMIT(name, prep, xmit) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + */ +#define LIBETH_XDP_DEFINE_RUN_PROG(name, flush) \ + bool __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq) \ +{ \ + return libeth_##pfx##_run_prog(xdp, bq, flush); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate) \ + void __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq, \ + struct napi_struct *napi, struct libeth_rq_napi_stats *ss, \ + const void *desc) \ +{ \ + return libeth_##pfx##_run_pass(xdp, bq, napi, ss, desc, run, \ + populate); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XDP) + +#define __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, pfx) \ + LIBETH_##pfx##_DEFINE_RUN_PROG(static run, flush); \ + LIBETH_##pfx##_DEFINE_RUN_PASS(name, run, populate) + +/** + * LIBETH_XDP_DEFINE_FINALIZE - define a driver Rx NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xdp) + +#define __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, pfx) \ +void name(struct libeth_xdp_tx_bulk *bq) \ +{ \ + libeth_##pfx##_finalize_rx(bq, flush, finalize); \ +} + +#define LIBETH_XDP_DEFINE_END() __diag_pop() + +/* XMO */ + +/** + * libeth_xdp_buff_to_rq - get RQ pointer from an XDP buffer pointer + * @xdp: &libeth_xdp_buff corresponding to the queue + * @type: typeof() of the driver Rx queue structure + * @member: name of &xdp_rxq_info inside @type + * + * Often times, pointer to the RQ is needed when reading/filling metadata from + * HW descriptors. The helper can be used to quickly jump from an XDP buffer + * to the queue corresponding to its &xdp_rxq_info without introducing + * additional fields (&libeth_xdp_buff is precisely 1 cacheline long on x64). + */ +#define libeth_xdp_buff_to_rq(xdp, type, member) \ + container_of_const((xdp)->base.rxq, type, member) + +/** + * libeth_xdpmo_rx_hash - convert &libeth_rx_pt to an XDP RSS hash metadata + * @hash: pointer to the variable to write the hash to + * @rss_type: pointer to the variable to write the hash type to + * @val: hash value from the HW descriptor + * @pt: libeth parsed packet type + * + * Handle zeroed/non-available hash and convert libeth parsed packet type to + * the corresponding XDP RSS hash type. To be called at the end of + * xdp_metadata_ops idpf_xdpmo::xmo_rx_hash() implementation. + * Note that if the driver doesn't use a constant packet type lookup table but + * generates it at runtime, it must call libeth_rx_pt_gen_hash_type(pt) to + * generate XDP RSS hash type for each packet type. + * + * Return: 0 on success, -ENODATA when the hash is not available. + */ +static inline int libeth_xdpmo_rx_hash(u32 *hash, + enum xdp_rss_hash_type *rss_type, + u32 val, struct libeth_rx_pt pt) +{ + if (unlikely(!val)) + return -ENODATA; + + *hash = val; + *rss_type = pt.hash_type; + + return 0; +} + +/* Tx buffer completion */ + +void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, + struct xdp_frame_bulk *bq, bool frags); +void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp); + +/** + * __libeth_xdp_complete_tx - complete sent XDPSQE + * @sqe: SQ element / Tx buffer to complete + * @cp: Tx polling/completion params + * @bulk: internal callback to bulk-free ``XDP_TX`` buffers + * @xsk: internal callback to free XSk ``XDP_TX`` buffers + * + * Use the non-underscored version in drivers instead. This one is shared + * internally with libeth_tx_complete_any(). + * Complete an XDPSQE of any type of XDP frame. This includes DMA unmapping + * when needed, buffer freeing, stats update, and SQE invalidation. + */ +static __always_inline void +__libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp, + typeof(libeth_xdp_return_buff_bulk) bulk, + typeof(libeth_xsk_buff_free_slow) xsk) +{ + enum libeth_sqe_type type = sqe->type; + + switch (type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XDP_XMIT_FRAG: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + bulk(sqe->sinfo, cp->bq, sqe->nr_frags != 1); + break; + case LIBETH_SQE_XDP_XMIT: + xdp_return_frame_bulk(sqe->xdpf, cp->bq); + break; + case LIBETH_SQE_XSK_TX: + case LIBETH_SQE_XSK_TX_FRAG: + xsk(sqe->xsk); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XSK_TX: + cp->xdp_tx -= sqe->nr_frags; + + cp->xss->packets++; + cp->xss->bytes += sqe->bytes; + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe, + struct libeth_cq_pp *cp) +{ + __libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk, + libeth_xsk_buff_free_slow); +} + +/* Misc */ + +u32 libeth_xdp_queue_threshold(u32 count); + +void __libeth_xdp_set_features(struct net_device *dev, + const struct xdp_metadata_ops *xmo, + u32 zc_segs, + const struct xsk_tx_metadata_ops *tmo); +void libeth_xdp_set_redirect(struct net_device *dev, bool enable); + +/** + * libeth_xdp_set_features - set XDP features for netdev + * @dev: &net_device to configure + * @...: optional params, see __libeth_xdp_set_features() + * + * Set all the features libeth_xdp supports, including .ndo_xdp_xmit(). That + * said, it should be used only when XDPSQs are always available regardless + * of whether an XDP prog is attached to @dev. + */ +#define libeth_xdp_set_features(dev, ...) \ + CONCATENATE(__libeth_xdp_feat, \ + COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__) + +#define __libeth_xdp_feat0(dev) \ + __libeth_xdp_set_features(dev, NULL, 0, NULL) +#define __libeth_xdp_feat1(dev, xmo) \ + __libeth_xdp_set_features(dev, xmo, 0, NULL) +#define __libeth_xdp_feat2(dev, xmo, zc_segs) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, NULL) +#define __libeth_xdp_feat3(dev, xmo, zc_segs, tmo) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, tmo) + +/** + * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir + * @dev: target &net_device + * @...: optional params, see __libeth_xdp_set_features() + * + * Enable everything except the .ndo_xdp_xmit() feature, use when XDPSQs are + * not available right after netdev registration. + */ +#define libeth_xdp_set_features_noredir(dev, ...) \ + __libeth_xdp_set_features_noredir(dev, __UNIQUE_ID(dev_), \ + ##__VA_ARGS__) + +#define __libeth_xdp_set_features_noredir(dev, ud, ...) do { \ + struct net_device *ud = (dev); \ + \ + libeth_xdp_set_features(ud, ##__VA_ARGS__); \ + libeth_xdp_set_redirect(ud, false); \ +} while (0) + +#define libeth_xsktmo ((const void *)GOLDEN_RATIO_PRIME) + +#endif /* __LIBETH_XDP_H */ diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h new file mode 100644 index 000000000000..82b5d21aae87 --- /dev/null +++ b/include/net/libeth/xsk.h @@ -0,0 +1,688 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Intel Corporation */ + +#ifndef __LIBETH_XSK_H +#define __LIBETH_XSK_H + +#include <net/libeth/xdp.h> +#include <net/xdp_sock_drv.h> + +/* ``XDP_TXMD_FLAGS_VALID`` is defined only under ``CONFIG_XDP_SOCKETS`` */ +#ifdef XDP_TXMD_FLAGS_VALID +static_assert(XDP_TXMD_FLAGS_VALID <= LIBETH_XDP_TX_XSKMD); +#endif + +/* ``XDP_TX`` bulking */ + +/** + * libeth_xsk_tx_queue_head - internal helper for queueing XSk ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XSk buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. + */ +static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = xdp, + __libeth_xdp_tx_len(xdp->base.data_end - xdp->data, + LIBETH_XDP_TX_FIRST), + }; + + if (likely(!xdp_buff_has_frags(&xdp->base))) + return false; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xsk_tx_queue_frag - internal helper for queueing XSk ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: XSk frag to queue + */ +static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *frag) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = frag, + __libeth_xdp_tx_len(frag->base.data_end - frag->data), + }; +} + +/** + * libeth_xsk_tx_queue_bulk - internal helper for queueing XSk ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XSk buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. + */ +static __always_inline bool +libeth_xsk_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + bool ret = true; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + libeth_xsk_buff_free_slow(xdp); + return false; + } + + if (!libeth_xsk_tx_queue_head(bq, xdp)) + goto out; + + for (const struct libeth_xdp_buff *head = xdp; ; ) { + xdp = container_of(xsk_buff_get_frag(&head->base), + typeof(*xdp), base); + if (!xdp) + break; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + ret = false; + break; + } + + libeth_xsk_tx_queue_frag(bq, xdp); + } + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xsk_tx_fill_buf - internal helper to fill XSk ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_buff *xdp = frm.xsk; + struct libeth_xdp_tx_desc desc = { + .addr = xsk_buff_xdp_get_dma(&xdp->base), + .opts = frm.opts, + }; + struct libeth_sqe *sqe; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + sqe = &sq->sqes[i]; + sqe->xsk = xdp; + + if (!(desc.flags & LIBETH_XDP_TX_FIRST)) { + sqe->type = LIBETH_SQE_XSK_TX_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XSK_TX; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_buff(&xdp->base)); + + return desc; +} + +/** + * libeth_xsk_tx_flush_bulk - wrapper to define flush of XSk ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XSK_DEFINE_FLUSH_TX() to define an XSk ``XDP_TX`` driver + * callback. + */ +#define libeth_xsk_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep, \ + libeth_xsk_tx_fill_buf, xmit) + +/* XSk TMO */ + +/** + * libeth_xsktmo_req_csum - XSk Tx metadata op to request checksum offload + * @csum_start: unused + * @csum_offset: unused + * @priv: &libeth_xdp_tx_desc from the filling helper + * + * Generic implementation of ::tmo_request_checksum. Works only when HW doesn't + * require filling checksum offsets and other parameters beside the checksum + * request bit. + * Consider using within @libeth_xsktmo unless the driver requires HW-specific + * callbacks. + */ +static inline void libeth_xsktmo_req_csum(u16 csum_start, u16 csum_offset, + void *priv) +{ + ((struct libeth_xdp_tx_desc *)priv)->flags |= LIBETH_XDP_TX_CSUM; +} + +/* Only to inline the callbacks below, use @libeth_xsktmo in drivers instead */ +static const struct xsk_tx_metadata_ops __libeth_xsktmo = { + .tmo_request_checksum = libeth_xsktmo_req_csum, +}; + +/** + * __libeth_xsk_xmit_fill_buf_md - internal helper to prepare XSk xmit w/meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Same as __libeth_xsk_xmit_fill_buf(), but requests metadata pointer and + * fills additional fields in &libeth_xdp_tx_desc to ask for metadata offload. + * + * Return: XDP Tx descriptor with the DMA, metadata request bits, and other + * info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq, + u64 priv) +{ + const struct xsk_tx_metadata_ops *tmo = libeth_xdp_priv_to_ptr(priv); + struct libeth_xdp_tx_desc desc; + struct xdp_desc_ctx ctx; + + ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr); + desc = (typeof(desc)){ + .addr = ctx.dma, + __libeth_xdp_tx_len(xdesc->len), + }; + + BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); + tmo = tmo == libeth_xsktmo ? &__libeth_xsktmo : tmo; + + xsk_tx_metadata_request(ctx.meta, tmo, &desc); + + return desc; +} + +/* XSk xmit implementation */ + +/** + * __libeth_xsk_xmit_fill_buf - internal helper to prepare XSk xmit w/o meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * + * Return: XDP Tx descriptor with the DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq) +{ + return (struct libeth_xdp_tx_desc){ + .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr), + __libeth_xdp_tx_len(xdesc->len), + }; +} + +/** + * libeth_xsk_xmit_fill_buf - internal helper to prepare an XSk xmit + * @frm: &xdp_desc from the XSk buffer pool + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Depending on the metadata ops presence (determined at compile time), calls + * the quickest helper to build a libeth XDP Tx descriptor. + * + * Return: XDP Tx descriptor with the synced DMA, metadata request bits, + * and other info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +libeth_xsk_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + + if (priv) + desc = __libeth_xsk_xmit_fill_buf_md(&frm.desc, sq, priv); + else + desc = __libeth_xsk_xmit_fill_buf(&frm.desc, sq); + + desc.flags |= xsk_is_eop_desc(&frm.desc) ? LIBETH_XDP_TX_LAST : 0; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + return desc; +} + +/** + * libeth_xsk_xmit_do_bulk - send XSk xmit frames + * @pool: XSk buffer pool containing the frames to send + * @xdpsq: opaque pointer to driver's XDPSQ struct + * @budget: maximum number of frames can be sent + * @tmo: optional XSk Tx metadata ops + * @prep: driver callback to build a &libeth_xdpsq + * @xmit: driver callback to put frames to a HW queue + * @finalize: driver callback to start a transmission + * + * Implements generic XSk xmit. Always turns on XSk Tx wakeup as it's assumed + * lazy cleaning is used and interrupts are disabled for the queue. + * HW descriptor filling is unrolled by ``LIBETH_XDP_TX_BATCH`` to optimize + * writes. + * Note that unlike other XDP Tx ops, the queue must be locked and cleaned + * prior to calling this function to already know available @budget. + * @prepare must only build a &libeth_xdpsq and return ``U32_MAX``. + * + * Return: false if @budget was exhausted, true otherwise. + */ +static __always_inline bool +libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget, + const struct xsk_tx_metadata_ops *tmo, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + const struct libeth_xdp_tx_frame *bulk; + bool wake; + u32 n; + + wake = xsk_uses_need_wakeup(pool); + if (wake) + xsk_clear_tx_need_wakeup(pool); + + n = xsk_tx_peek_release_desc_batch(pool, budget); + bulk = container_of(&pool->tx_descs[0], typeof(*bulk), desc); + + libeth_xdp_tx_xmit_bulk(bulk, xdpsq, n, true, + libeth_xdp_ptr_to_priv(tmo), prep, + libeth_xsk_xmit_fill_buf, xmit); + finalize(xdpsq, n, true); + + if (wake) + xsk_set_tx_need_wakeup(pool); + + return n < budget; +} + +/* Rx polling path */ + +/** + * libeth_xsk_tx_init_bulk - initialize XDP Tx bulk for an XSk Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (never %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the XSk NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. + * Never checks if @prog is %NULL or @num == 0 as XDP must always be enabled + * when hitting this path. + */ +#define libeth_xsk_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, true, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp); + +/** + * libeth_xsk_process_buff - attach XSk Rx buffer to &libeth_xdp_buff + * @head: head XSk buffer to attach the XSk buffer to (or %NULL) + * @xdp: XSk buffer to process + * @len: received data length from the descriptor + * + * If @head == %NULL, treats the XSk buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. + * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: head XSk buffer on success or if the descriptor must be skipped + * (empty), %NULL if there is no space for a new frag. + */ +static inline struct libeth_xdp_buff * +libeth_xsk_process_buff(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp, u32 len) +{ + if (unlikely(!len)) { + libeth_xsk_buff_free_slow(xdp); + return head; + } + + xsk_buff_set_size(&xdp->base, len); + xsk_buff_dma_sync_for_cpu(&xdp->base); + + if (head) + return libeth_xsk_buff_add_frag(head, xdp); + + prefetch(xdp->data); + + return xdp; +} + +void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs, + const struct libeth_xdp_buff *xdp); + +u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq, + enum xdp_action act, int ret); + +/** + * __libeth_xsk_run_prog - run XDP program on XSk buffer + * @xdp: XSk buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program on XSk Rx path. Handles + * only the most common ``XDP_REDIRECT`` inline, the rest is processed + * externally. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xsk_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + int ret = 0; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act != XDP_REDIRECT)) +rest: + return __libeth_xsk_run_prog_slow(xdp, bq, act, ret); + + ret = xdp_do_redirect(bq->dev, &xdp->base, bq->prog); + if (unlikely(ret)) + goto rest; + + return LIBETH_XDP_REDIRECT; +} + +/** + * libeth_xsk_run_prog - run XDP program on XSk path and handle all verdicts + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. + * Prefer using it via LIBETH_XSK_DEFINE_RUN{,_PASS,_PROG}(). + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +#define libeth_xsk_run_prog(xdp, bq, fl) \ + __libeth_xdp_run_flush(xdp, bq, __libeth_xsk_run_prog, \ + libeth_xsk_tx_queue_bulk, fl) + +/** + * __libeth_xsk_run_pass - helper to run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XSk buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction, XSk's counterpart of __libeth_xdp_run_pass(), see its + * doc for details. + * + * Return: false if the polling loop must be exited due to lack of free + * buffers, true otherwise. + */ +static __always_inline bool +__libeth_xsk_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + u32 (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + u32 act; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (unlikely(xdp_buff_has_frags(&xdp->base))) + libeth_xsk_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + act = run(xdp, bq); + if (likely(act == LIBETH_XDP_REDIRECT)) + return true; + + if (act != LIBETH_XDP_PASS) + return act != LIBETH_XDP_ABORTED; + + skb = xdp_build_skb_from_zc(&xdp->base); + if (unlikely(!skb)) { + libeth_xsk_buff_free_slow(xdp); + return true; + } + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return true; + } + + napi_gro_receive(napi, skb); + + return true; +} + +/** + * libeth_xsk_run_pass - helper to run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. + */ +#define libeth_xsk_run_pass(xdp, bq, napi, rs, desc, run, populate) \ + __libeth_xsk_run_pass(xdp, bq, napi, rs, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xsk_finalize_rx - finalize XDPSQ after an XSk NAPI polling loop + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xsk_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, LIBETH_XDP_TX_XSK, flush, finalize) + +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver XSk Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XSK_DEFINE_FLUSH_TX(static driver_xsk_flush_tx, driver_xsk_tx_prep, + * driver_xdp_xmit); + * LIBETH_XSK_DEFINE_RUN(static driver_xsk_run, driver_xsk_run_prog, + * driver_xsk_flush_tx, driver_populate_skb); + * LIBETH_XSK_DEFINE_FINALIZE(static driver_xsk_finalize_rx, + * driver_xsk_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... + * if (!driver_xsk_run(xdp, &bq, napi, &rs, desc)) + * break; + * } + * driver_xsk_finalize_rx(&bq); + */ + +/** + * LIBETH_XSK_DEFINE_FLUSH_TX - define a driver XSk ``XDP_TX`` flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XSK_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + */ +#define LIBETH_XSK_DEFINE_RUN_PROG(name, flush) \ + u32 __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN_PASS(name, run, populate) \ + bool __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XSK) + +/** + * LIBETH_XSK_DEFINE_FINALIZE - define a driver XSk NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk) + +/* Refilling */ + +/** + * struct libeth_xskfq - structure representing an XSk buffer (fill) queue + * @fp: hotpath part of the structure + * @pool: &xsk_buff_pool for buffer management + * @fqes: array of XSk buffer pointers + * @descs: opaque pointer to the HW descriptor array + * @ntu: index of the next buffer to poll + * @count: number of descriptors/buffers the queue has + * @pending: current number of XSkFQEs to refill + * @thresh: threshold below which the queue is refilled + * @buf_len: HW-writeable length per each buffer + * @truesize: step between consecutive buffers, 0 if none exists + * @nid: ID of the closest NUMA node with memory + */ +struct libeth_xskfq { + struct_group_tagged(libeth_xskfq_fp, fp, + struct xsk_buff_pool *pool; + struct libeth_xdp_buff **fqes; + void *descs; + + u32 ntu; + u32 count; + ); + + /* Cold fields */ + u32 pending; + u32 thresh; + + u32 buf_len; + u32 truesize; + + int nid; +}; + +int libeth_xskfq_create(struct libeth_xskfq *fq); +void libeth_xskfq_destroy(struct libeth_xskfq *fq); + +/** + * libeth_xsk_buff_xdp_get_dma - get DMA address of XSk &libeth_xdp_buff + * @xdp: buffer to get the DMA addr for + */ +#define libeth_xsk_buff_xdp_get_dma(xdp) \ + xsk_buff_xdp_get_dma(&(xdp)->base) + +/** + * libeth_xskfqe_alloc - allocate @n XSk Rx buffers + * @fq: hotpath part of the XSkFQ, usually onstack + * @n: number of buffers to allocate + * @fill: driver callback to write DMA addresses to HW descriptors + * + * Note that @fq->ntu gets updated, but ::pending must be recalculated + * by the caller. + * + * Return: number of buffers refilled. + */ +static __always_inline u32 +libeth_xskfqe_alloc(struct libeth_xskfq_fp *fq, u32 n, + void (*fill)(const struct libeth_xskfq_fp *fq, u32 i)) +{ + u32 this, ret, done = 0; + struct xdp_buff **xskb; + + this = fq->count - fq->ntu; + if (likely(this > n)) + this = n; + +again: + xskb = (typeof(xskb))&fq->fqes[fq->ntu]; + ret = xsk_buff_alloc_batch(fq->pool, xskb, this); + + for (u32 i = 0, ntu = fq->ntu; likely(i < ret); i++) + fill(fq, ntu + i); + + done += ret; + fq->ntu += ret; + + if (likely(fq->ntu < fq->count) || unlikely(ret < this)) + goto out; + + fq->ntu = 0; + + if (this < n) { + this = n - this; + goto again; + } + +out: + return done; +} + +/* .ndo_xsk_wakeup */ + +void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi); +void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid); + +/* Pool setup */ + +int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable); + +#endif /* __LIBETH_XSK_H */ diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 53bd2d02a4f0..26232f603e33 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -138,12 +138,12 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, static inline void lwtunnel_set_redirect(struct dst_entry *dst) { if (lwtunnel_output_redirect(dst->lwtstate)) { - dst->lwtstate->orig_output = dst->output; - dst->output = lwtunnel_output; + dst->lwtstate->orig_output = READ_ONCE(dst->output); + WRITE_ONCE(dst->output, lwtunnel_output); } if (lwtunnel_input_redirect(dst->lwtstate)) { - dst->lwtstate->orig_input = dst->input; - dst->input = lwtunnel_input; + dst->lwtstate->orig_input = READ_ONCE(dst->input); + WRITE_ONCE(dst->input, lwtunnel_input); } } #else @@ -206,6 +206,7 @@ static inline int lwtunnel_valid_encap_type(u16 encap_type, NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } + static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, struct netlink_ext_ack *extack) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c3ed2fcff8b7..40cb20d9309c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018 - 2026 Intel Corporation */ #ifndef MAC80211_H @@ -365,6 +365,7 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed. * @BSS_CHANGED_MLD_TTLM: negotiated TID to link mapping was changed * @BSS_CHANGED_TPE: transmit power envelope changed + * @BSS_CHANGED_NAN_LOCAL_SCHED: NAN local schedule changed (NAN mode only) */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -402,6 +403,7 @@ enum ieee80211_bss_change { BSS_CHANGED_MLD_VALID_LINKS = BIT_ULL(33), BSS_CHANGED_MLD_TTLM = BIT_ULL(34), BSS_CHANGED_TPE = BIT_ULL(35), + BSS_CHANGED_NAN_LOCAL_SCHED = BIT_ULL(36), /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -682,6 +684,9 @@ struct ieee80211_parsed_tpe { * responder functionality. * @ftmr_params: configurable lci/civic parameter when enabling FTM responder. * @nontransmitted: this BSS is a nontransmitted BSS profile + * @tx_bss_conf: Pointer to the BSS configuration of transmitting interface + * if MBSSID is enabled. This pointer is RCU-protected due to CSA finish + * and BSS color change flows accessing it. * @transmitter_bssid: the address of transmitter AP * @bssid_index: index inside the multiple BSSID set * @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set @@ -702,6 +707,8 @@ struct ieee80211_parsed_tpe { * @tpe: transmit power envelope information * @pwr_reduction: power constraint of BSS. * @eht_support: does this BSS support EHT + * @epcs_support: does this BSS support EPCS + * @uhr_support: does this BSS support UHR * @csa_active: marks whether a channel switch is going on. * @mu_mimo_owner: indicates interface owns MU-MIMO capability * @chanctx_conf: The channel context this interface is assigned to, or %NULL @@ -740,6 +747,7 @@ struct ieee80211_parsed_tpe { * @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the * reception of an EHT TB PPDU on an RU that spans the entire PPDU * bandwidth + * @eht_disable_mcs15: disable EHT-MCS 15 reception capability. * @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This * information is the latest known value. It can come from this link's * beacon or from a beacon sent by another link. @@ -753,6 +761,8 @@ struct ieee80211_parsed_tpe { * be updated to 1, even if bss_param_ch_cnt didn't change. This allows * the link to know that it heard the latest value from its own beacon * (as opposed to hearing its value from another link's beacon). + * @s1g_long_beacon_period: number of beacon intervals between each long + * beacon transmission. */ struct ieee80211_bss_conf { struct ieee80211_vif *vif; @@ -803,6 +813,7 @@ struct ieee80211_bss_conf { struct ieee80211_ftm_responder_params *ftmr_params; /* Multiple BSSID data */ bool nontransmitted; + struct ieee80211_bss_conf __rcu *tx_bss_conf; u8 transmitter_bssid[ETH_ALEN]; u8 bssid_index; u8 bssid_indicator; @@ -823,6 +834,8 @@ struct ieee80211_bss_conf { u8 pwr_reduction; bool eht_support; + bool epcs_support; + bool uhr_support; bool csa_active; @@ -847,8 +860,80 @@ struct ieee80211_bss_conf { bool eht_su_beamformee; bool eht_mu_beamformer; bool eht_80mhz_full_bw_ul_mumimo; + bool eht_disable_mcs15; + u8 bss_param_ch_cnt; u8 bss_param_ch_cnt_link_id; + + u8 s1g_long_beacon_period; +}; + +#define IEEE80211_NAN_MAX_CHANNELS 3 + +/** + * struct ieee80211_nan_channel - NAN channel information + * + * @chanreq: channel request for this NAN channel. Even though this chanreq::ap + * is irrelevant for NAN, still store it for convenience - some functions + * require it as an argument. + * @needed_rx_chains: number of RX chains needed for this NAN channel + * @chanctx_conf: chanctx_conf assigned to this NAN channel. + * If a local channel is being ULWed (because we needed this chanctx for + * something else), the local NAN channel that used this chanctx, + * will have this pointer set to %NULL. + * A peer NAN channel should never have this pointer set to %NULL. + * @channel_entry: the Channel Entry blob as defined in Wi-Fi Aware + * (TM) 4.0 specification Table 100 (Channel Entry format for the NAN + * Availability attribute). + */ +struct ieee80211_nan_channel { + struct ieee80211_chan_req chanreq; + u8 needed_rx_chains; + struct ieee80211_chanctx_conf *chanctx_conf; + u8 channel_entry[6]; +}; + +/** + * struct ieee80211_nan_peer_map - NAN peer schedule map + * + * This stores a single map from a peer's schedule. Each peer can have + * multiple maps. + * + * @map_id: the map ID from the peer schedule, %CFG80211_NAN_INVALID_MAP_ID + * if unused + * @slots: mapping of time slots to channel configurations in the schedule's + * channels array + */ +struct ieee80211_nan_peer_map { + u8 map_id; + struct ieee80211_nan_channel *slots[CFG80211_NAN_SCHED_NUM_TIME_SLOTS]; +}; + +/** + * struct ieee80211_nan_peer_sched - NAN peer schedule + * + * This stores the complete schedule from a peer. Contains peer-level + * parameters and an array of schedule maps. + * + * @seq_id: the sequence ID from the peer schedule + * @committed_dw: committed DW as published by the peer + * @max_chan_switch: maximum channel switch time in microseconds + * @init_ulw: initial ULWs as published by the peer (copied) + * @ulw_size: number of bytes in @init_ulw + * @maps: array of peer schedule maps. Invalid slots have map_id set to + * %CFG80211_NAN_INVALID_MAP_ID. + * @n_channels: number of valid channel entries in @channels + * @channels: flexible array of negotiated peer channels for this schedule + */ +struct ieee80211_nan_peer_sched { + u8 seq_id; + u16 committed_dw; + u16 max_chan_switch; + const u8 *init_ulw; + u16 ulw_size; + struct ieee80211_nan_peer_map maps[CFG80211_NAN_MAX_PEER_MAPS]; + u8 n_channels; + struct ieee80211_nan_channel channels[] __counted_by(n_channels); }; /** @@ -1517,6 +1602,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * known the frame shouldn't be reported. * @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by * hardware or driver) + * @RX_FLAG_RADIOTAP_VHT: VHT radiotap data is present */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), @@ -1552,6 +1638,7 @@ enum mac80211_rx_flags { RX_FLAG_RADIOTAP_LSIG = BIT(28), RX_FLAG_NO_PSDU = BIT(29), RX_FLAG_8023 = BIT(30), + RX_FLAG_RADIOTAP_VHT = BIT(31), }; /** @@ -1584,6 +1671,7 @@ enum mac80211_rx_encoding { RX_ENC_VHT, RX_ENC_HE, RX_ENC_EHT, + RX_ENC_UHR, }; /** @@ -1617,7 +1705,7 @@ enum mac80211_rx_encoding { * @antenna: antenna used * @rate_idx: index of data rate into band's supported rates or MCS index if * HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT) - * @nss: number of streams (VHT, HE and EHT only) + * @nss: number of streams (VHT, HE, EHT and UHR only) * @flag: %RX_FLAG_\* * @encoding: &enum mac80211_rx_encoding * @bw: &enum rate_info_bw @@ -1628,6 +1716,11 @@ enum mac80211_rx_encoding { * @eht: EHT specific rate information * @eht.ru: EHT RU, from &enum nl80211_eht_ru_alloc * @eht.gi: EHT GI, from &enum nl80211_eht_gi + * @uhr: UHR specific rate information + * @uhr.ru: UHR RU, from &enum nl80211_eht_ru_alloc + * @uhr.gi: UHR GI, from &enum nl80211_eht_gi + * @uhr.elr: UHR ELR MCS was used + * @uhr.im: UHR interference mitigation was used * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU @@ -1659,6 +1752,12 @@ struct ieee80211_rx_status { u8 ru:4; u8 gi:2; } eht; + struct { + u8 ru:4; + u8 gi:2; + u8 elr:1; + u8 im:1; + } uhr; }; u8 rate_idx; u8 nss; @@ -1888,6 +1987,59 @@ enum ieee80211_offload_flags { IEEE80211_OFFLOAD_DECAP_ENABLED = BIT(2), }; +#define IEEE80211_NAN_AVAIL_BLOB_MAX_LEN 54 + +/** + * struct ieee80211_eml_params - EHT Operating mode notification parameters + * + * EML Operating mode notification parameters received in the Operating mode + * notification frame. This struct is used as a container to pass the info to + * the underlay driver. + * + * @link_id: the link ID where the Operating mode notification frame has been + * received. + * @control: EML control field defined in P802.11be section 9.4.1.76. + * @link_bitmap: eMLSR/eMLMR enabled links defined in P802.11be + * section 9.4.1.76. + * @emlmr_mcs_map_count: eMLMR number of valid mcs_map_bw fields according to + * P802.11be section 9.4.1.76 (valid if eMLMR mode control bit is set). + * @emlmr_mcs_map_bw: eMLMR supported MCS and NSS set subfileds defined in + * P802.11be section 9.4.1.76 (valid if eMLMR mode control bit is set). + */ +struct ieee80211_eml_params { + u8 link_id; + u8 control; + u16 link_bitmap; + u8 emlmr_mcs_map_count; + u8 emlmr_mcs_map_bw[9]; +}; + +/** + * struct ieee80211_nan_sched_cfg - NAN schedule configuration + * @channels: array of NAN channels. A channel entry is in use if + * channels[i].chanreq.oper.chan is not NULL. + * @schedule: NAN local schedule - mapping of each 16TU time slot to + * the NAN channel on which the radio will operate. NULL if unscheduled. + * @avail_blob: NAN Availability attribute blob. + * @avail_blob_len: length of the @avail_blob in bytes. + * @deferred: indicates that the driver should notify peers before applying the + * new NAN schedule, and apply the new schedule the second NAN Slot + * boundary after it notified the peers, as defined in Wi-Fi Aware (TM) 4.0 + * specification, section 5.2.2. + * The driver must call ieee80211_nan_sched_update_done() after the + * schedule has been applied. + * If a HW restart happened while a deferred schedule update was pending, + * mac80211 will reconfigure the deferred schedule (and wait for the driver + * to notify that the schedule has been applied). + */ +struct ieee80211_nan_sched_cfg { + struct ieee80211_nan_channel channels[IEEE80211_NAN_MAX_CHANNELS]; + struct ieee80211_nan_channel *schedule[CFG80211_NAN_SCHED_NUM_TIME_SLOTS]; + u8 avail_blob[IEEE80211_NAN_AVAIL_BLOB_MAX_LEN]; + u16 avail_blob_len; + bool deferred; +}; + /** * struct ieee80211_vif_cfg - interface configuration * @assoc: association status @@ -1916,6 +2068,7 @@ enum ieee80211_offload_flags { * your driver/device needs to do. * @ap_addr: AP MLD address, or BSSID for non-MLO connections * (station mode only) + * @nan_sched: NAN schedule parameters. &struct ieee80211_nan_sched_cfg */ struct ieee80211_vif_cfg { /* association related data */ @@ -1934,6 +2087,8 @@ struct ieee80211_vif_cfg { bool s1g; bool idle; u8 ap_addr[ETH_ALEN] __aligned(2); + /* Protected by the wiphy mutex */ + struct ieee80211_nan_sched_cfg nan_sched; }; #define IEEE80211_TTLM_NUM_TIDS 8 @@ -2020,9 +2175,9 @@ enum ieee80211_neg_ttlm_res { * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). * @txq: the multicast data TX queue + * @txq_mgmt: the mgmt frame TX queue, currently only exists for NAN devices * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see * &enum ieee80211_offload_flags. - * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled. */ struct ieee80211_vif { enum nl80211_iftype type; @@ -2039,6 +2194,7 @@ struct ieee80211_vif { u8 hw_queue[IEEE80211_NUM_ACS]; struct ieee80211_txq *txq; + struct ieee80211_txq *txq_mgmt; netdev_features_t netdev_features; u32 driver_flags; @@ -2051,8 +2207,6 @@ struct ieee80211_vif { bool probe_req_reg; bool rx_mcast_action_reg; - struct ieee80211_vif *mbssid_tx_vif; - /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; @@ -2423,12 +2577,18 @@ struct ieee80211_sta_aggregates { * @he_cap: HE capabilities of this STA * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @eht_cap: EHT capabilities of this STA + * @uhr_cap: UHR capabilities of this STA + * @s1g_cap: S1G capabilities of this STA * @agg: per-link data for multi-link aggregation - * @bandwidth: current bandwidth the station can receive with + * @bandwidth: current bandwidth the station can receive with. + * This is the minimum between the peer's capabilities and our own + * operating channel width; Invalid for NAN since that is operating on + * multiple channels. * @rx_nss: in HT/VHT, the maximum number of spatial streams the * station can receive at the moment, changed by operating mode * notifications and capabilities. The value is only valid after - * the station moves to associated state. + * the station moves to associated state. Invalid for NAN since it + * operates on multiple configurations of rx_nss. * @txpwr: the station tx power configuration * */ @@ -2445,6 +2605,8 @@ struct ieee80211_link_sta { struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; struct ieee80211_sta_eht_cap eht_cap; + struct ieee80211_sta_uhr_cap uhr_cap; + struct ieee80211_sta_s1g_cap s1g_cap; struct ieee80211_sta_aggregates agg; @@ -2487,6 +2649,7 @@ struct ieee80211_link_sta { * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single * A-MSDU. Taken from the Extended Capabilities element. 0 means * unlimited. + * @eml_cap: EML capabilities of this MLO station * @cur: currently valid data as aggregated from the active links * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. @@ -2506,6 +2669,9 @@ struct ieee80211_link_sta { * by the AP. * @valid_links: bitmap of valid links, or 0 for non-MLO * @spp_amsdu: indicates whether the STA uses SPP A-MSDU or not. + * @epp_peer: indicates that the peer is an EPP peer. + * @nmi: For NDI stations, pointer to the NMI station of the peer. + * @nan_sched: NAN peer schedule for this station. Valid only for NMI stations. */ struct ieee80211_sta { u8 addr[ETH_ALEN] __aligned(2); @@ -2521,6 +2687,7 @@ struct ieee80211_sta { bool mlo; bool spp_amsdu; u8 max_amsdu_subframes; + u16 eml_cap; struct ieee80211_sta_aggregates *cur; @@ -2529,9 +2696,15 @@ struct ieee80211_sta { struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; u16 valid_links; + bool epp_peer; struct ieee80211_link_sta deflink; struct ieee80211_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS]; + struct ieee80211_sta __rcu *nmi; + + /* should only be accessed with the wiphy mutex held */ + struct ieee80211_nan_peer_sched *nan_sched; + /* must be last */ u8 drv_priv[] __aligned(sizeof(void *)); }; @@ -2765,6 +2938,8 @@ struct ieee80211_txq { * station has a unique address, i.e. each station entry can be identified * by just its MAC address; this prevents, for example, the same station * from connecting to two virtual AP interfaces at the same time. + * Note that this doesn't apply for NAN, in which the peer's NMI address + * can be equal to its NDI address. * * @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the * reordering buffer internally, guaranteeing mac80211 receives frames in @@ -2843,14 +3018,20 @@ struct ieee80211_txq { * * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT * and connecting with a lower bandwidth instead - * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in - * EHT in 5 GHz and connecting with a lower bandwidth instead * * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so * no need to stop queues. This really should be set by a driver that * implements MLO, so operation can continue on other links when one * link is switching. * + * @IEEE80211_HW_STRICT: strictly enforce certain things mandated by the spec + * but otherwise ignored/worked around for interoperability. This is a + * HW flag so drivers can opt in according to their own control, e.g. in + * testing. + * + * @IEEE80211_HW_SUPPORTS_NDP_BLOCKACK: HW can transmit/receive S1G NDP + * BlockAck frames. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2909,8 +3090,9 @@ enum ieee80211_hw_flags { IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, IEEE80211_HW_DISALLOW_PUNCTURING, - IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ, IEEE80211_HW_HANDLES_QUIET_CSA, + IEEE80211_HW_STRICT, + IEEE80211_HW_SUPPORTS_NDP_BLOCKACK, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS @@ -3176,6 +3358,10 @@ ieee80211_get_tx_rate(const struct ieee80211_hw *hw, { if (WARN_ON_ONCE(c->control.rates[0].idx < 0)) return NULL; + + if (c->band >= NUM_NL80211_BANDS) + return NULL; + return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[0].idx]; } @@ -3823,7 +4009,7 @@ enum ieee80211_reconfig_type { * @was_assoc: set if this call is due to deauth/disassoc * while just having been associated * @link_id: the link id on which the frame will be TX'ed. - * Only used with the mgd_prepare_tx() method. + * 0 for a non-MLO connection. */ struct ieee80211_prep_tx_info { u16 duration; @@ -4120,6 +4306,15 @@ struct ieee80211_prep_tx_info { * Statistics that the driver doesn't fill will be filled by mac80211. * The callback can sleep. * + * @link_sta_statistics: Get link statistics for this station. For example with + * beacon filtering, the statistics kept by mac80211 might not be + * accurate, so let the driver pre-fill the statistics. The driver can + * fill most of the values (indicating which by setting the filled + * bitmap), but not all of them make sense - see the source for which + * ones are possible. + * Statistics that the driver doesn't fill will be filled by mac80211. + * The callback can sleep. + * * @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max), * bursting) for a hardware TX queue. * Returns a negative error code on failure. @@ -4289,6 +4484,8 @@ struct ieee80211_prep_tx_info { * @mgd_complete_tx: Notify the driver that the response frame for a previously * transmitted frame announced with @mgd_prepare_tx was received, the data * is filled similarly to @mgd_prepare_tx though the duration is not used. + * Note that this isn't always called for each mgd_prepare_tx() call, for + * example for SAE the 'confirm' messages can be on the air in any order. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending * a TDLS discovery-request, we expect a reply to arrive on the AP's @@ -4409,6 +4606,12 @@ struct ieee80211_prep_tx_info { * @del_nan_func: Remove a NAN function. The driver must call * ieee80211_nan_func_terminated() with * NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST reason code upon removal. + * @nan_peer_sched_changed: Notifies the driver that the peer NAN schedule + * has changed. The new schedule is available via sta->nan_sched. + * Note that the channel_entry blob might not match the actual chandef + * since the bandwidth of the chandef is the minimum of the local and peer + * bandwidth. It is the driver responsibility to remove the peer schedule + * when the NMI station is removed. * @can_aggregate_in_amsdu: Called in order to determine if HW supports * aggregating two specific frames in the same A-MSDU. The relation * between the skbs should be symmetric and transitive. Note that while @@ -4453,6 +4656,8 @@ struct ieee80211_prep_tx_info { * new links bitmaps may be 0 if going from/to a non-MLO situation. * The @old array contains pointers to the old bss_conf structures * that were already removed, in case they're needed. + * Note that removal of link should always succeed, so the return value + * will be ignored in a removal only case. * This callback can sleep. * @change_sta_links: Change the valid links of a station, similar to * @change_vif_links. This callback can sleep. @@ -4476,6 +4681,9 @@ struct ieee80211_prep_tx_info { * interface with the specified type would be added, and thus drivers that * implement this callback need to handle such cases. The type is the full * &enum nl80211_iftype. + * @set_eml_op_mode: Configure eMLSR/eMLMR operation mode in the underlay + * driver according to the parameter received in the EML Operating mode + * notification frame. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -4495,7 +4703,7 @@ struct ieee80211_ops { enum nl80211_iftype new_type, bool p2p); void (*remove_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); - int (*config)(struct ieee80211_hw *hw, u32 changed); + int (*config)(struct ieee80211_hw *hw, int radio_idx, u32 changed); void (*bss_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, @@ -4558,8 +4766,10 @@ struct ieee80211_ops { void (*get_key_seq)(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); - int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value); - int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value); + int (*set_frag_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); + int (*set_rts_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4614,6 +4824,10 @@ struct ieee80211_ops { s64 offset); void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*tx_last_beacon)(struct ieee80211_hw *hw); + void (*link_sta_statistics)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + struct link_station_info *link_sinfo); /** * @ampdu_action: @@ -4652,7 +4866,8 @@ struct ieee80211_ops { int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); - void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); + void (*set_coverage_class)(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); @@ -4667,8 +4882,10 @@ struct ieee80211_ops { void (*channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); - int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4800,6 +5017,8 @@ struct ieee80211_ops { void (*del_nan_func)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u8 instance_id); + int (*nan_peer_sched_changed)(struct ieee80211_hw *hw, + struct ieee80211_sta *sta); bool (*can_aggregate_in_amsdu)(struct ieee80211_hw *hw, struct sk_buff *head, struct sk_buff *skb); @@ -4862,6 +5081,10 @@ struct ieee80211_ops { struct ieee80211_neg_ttlm *ttlm); void (*prep_add_interface)(struct ieee80211_hw *hw, enum nl80211_iftype type); + int (*set_eml_op_mode)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta, + struct ieee80211_eml_params *eml_params); }; /** @@ -5340,22 +5563,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, int max_rates); /** - * ieee80211_sta_set_expected_throughput - set the expected tpt for a station - * - * Call this function to notify mac80211 about a change in expected throughput - * to a station. A driver for a device that does rate control in firmware can - * call this function when the expected throughput estimate towards a station - * changes. The information is used to tune the CoDel AQM applied to traffic - * going towards that station (which can otherwise be too aggressive and cause - * slow stations to starve). - * - * @pubsta: the station to set throughput for. - * @thr: the current expected throughput in kbps. - */ -void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, - u32 thr); - -/** * ieee80211_tx_rate_update - transmit rate update callback * * Drivers should call this functions with a non-NULL pub sta @@ -6011,21 +6218,12 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); /** - * ieee80211_remove_key - remove the given key - * @keyconf: the parameter passed with the set key - * - * Context: Must be called with the wiphy mutex held. - * - * Remove the given key. If the key was uploaded to the hardware at the - * time this function is called, it is not deleted in the hardware but - * instead assumed to have been removed already. - */ -void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); - -/** * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN * @vif: the virtual interface to add the key on - * @keyconf: new key data + * @idx: the keyidx of the key + * @key_data: the key data + * @key_len: the key data. Might be bigger than the actual key length, + * but not smaller (for the driver convinence) * @link_id: the link id of the key or -1 for non-MLO * * When GTK rekeying was done while the system was suspended, (a) new @@ -6048,13 +6246,11 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); * for the new key for each TID to set up sequence counters properly. * * IMPORTANT: If this replaces a key that is present in the hardware, - * then it will attempt to remove it during this call. In many cases - * this isn't what you want, so call ieee80211_remove_key() first for - * the key that's being replaced. + * then it will attempt to remove it during this call. */ struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, - struct ieee80211_key_conf *keyconf, + u8 idx, u8 *key_data, u8 key_len, int link_id); /** @@ -6255,6 +6451,30 @@ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw, struct ieee80211_vif *vif), void *data); +struct ieee80211_vif * +__ieee80211_iterate_interfaces(struct ieee80211_hw *hw, + struct ieee80211_vif *prev, + u32 iter_flags); + +/** + * for_each_interface - iterate interfaces under wiphy mutex + * @vif: the iterator variable + * @hw: the HW to iterate for + * @flags: the iteration flags, see &enum ieee80211_interface_iteration_flags + */ +#define for_each_interface(vif, hw, flags) \ + for (vif = __ieee80211_iterate_interfaces(hw, NULL, flags); \ + vif; \ + vif = __ieee80211_iterate_interfaces(hw, vif, flags)) + +/** + * for_each_active_interface - iterate active interfaces under wiphy mutex + * @vif: the iterator variable + * @hw: the HW to iterate for + */ +#define for_each_active_interface(vif, hw) \ + for_each_interface(vif, hw, IEEE80211_IFACE_ITER_ACTIVE) + /** * ieee80211_iterate_active_interfaces_mtx - iterate active interfaces * @@ -6267,12 +6487,18 @@ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw, * @iterator: the iterator function to call, cannot sleep * @data: first argument of the iterator function */ -void ieee80211_iterate_active_interfaces_mtx(struct ieee80211_hw *hw, - u32 iter_flags, - void (*iterator)(void *data, - u8 *mac, - struct ieee80211_vif *vif), - void *data); +static inline void +ieee80211_iterate_active_interfaces_mtx(struct ieee80211_hw *hw, + u32 iter_flags, + void (*iterator)(void *data, u8 *mac, + struct ieee80211_vif *vif), + void *data) +{ + struct ieee80211_vif *vif; + + for_each_interface(vif, hw, iter_flags | IEEE80211_IFACE_ITER_ACTIVE) + iterator(data, vif->addr, vif); +} /** * ieee80211_iterate_stations_atomic - iterate stations @@ -6291,6 +6517,20 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, struct ieee80211_sta *sta), void *data); +struct ieee80211_sta * +__ieee80211_iterate_stations(struct ieee80211_hw *hw, + struct ieee80211_sta *prev); + +/** + * for_each_station - iterate stations under wiphy mutex + * @sta: the iterator variable + * @hw: the HW to iterate for + */ +#define for_each_station(sta, hw) \ + for (sta = __ieee80211_iterate_stations(hw, NULL); \ + sta; \ + sta = __ieee80211_iterate_stations(hw, sta)) + /** * ieee80211_iterate_stations_mtx - iterate stations * @@ -6303,10 +6543,17 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, * @iterator: the iterator function to call * @data: first argument of the iterator function */ -void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, - void (*iterator)(void *data, - struct ieee80211_sta *sta), - void *data); +static inline void +ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, + void (*iterator)(void *data, + struct ieee80211_sta *sta), + void *data) +{ + struct ieee80211_sta *sta; + + for_each_station(sta, hw) + iterator(data, sta); +} /** * ieee80211_queue_work - add work onto the mac80211 workqueue @@ -6665,6 +6912,31 @@ void ieee80211_iter_chan_contexts_atomic( void *iter_data); /** + * ieee80211_iter_chan_contexts_mtx - iterate channel contexts + * @hw: pointer obtained from ieee80211_alloc_hw(). + * @iter: iterator function + * @iter_data: data passed to iterator function + * + * Iterate all active channel contexts. This function can only be used while + * holding the wiphy mutex. + * + * The iterator will not find a context that's being added (during + * the driver callback to add it) but will find it while it's being + * removed. + * + * Note that during hardware restart, all contexts that existed + * before the restart are considered already present so will be + * found while iterating, whether they've been re-added already + * or not. + */ +void ieee80211_iter_chan_contexts_mtx( + struct ieee80211_hw *hw, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf, + void *data), + void *iter_data); + +/** * ieee80211_ap_probereq_get - retrieve a Probe Request template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. @@ -7181,7 +7453,7 @@ ieee80211_get_he_6ghz_capa_vif(const struct ieee80211_supported_band *sband, } /** - * ieee80211_get_eht_iftype_cap_vif - return ETH capabilities for sband/vif + * ieee80211_get_eht_iftype_cap_vif - return EHT capabilities for sband/vif * @sband: the sband to search for the iftype on * @vif: the vif to get the iftype from * @@ -7195,6 +7467,20 @@ ieee80211_get_eht_iftype_cap_vif(const struct ieee80211_supported_band *sband, } /** + * ieee80211_get_uhr_iftype_cap_vif - return UHR capabilities for sband/vif + * @sband: the sband to search for the iftype on + * @vif: the vif to get the iftype from + * + * Return: pointer to the struct ieee80211_sta_uhr_cap, or %NULL is none found + */ +static inline const struct ieee80211_sta_uhr_cap * +ieee80211_get_uhr_iftype_cap_vif(const struct ieee80211_supported_band *sband, + struct ieee80211_vif *vif) +{ + return ieee80211_get_uhr_iftype_cap(sband, ieee80211_vif_type_p2p(vif)); +} + +/** * ieee80211_update_mu_groups - set the VHT MU-MIMO groud data * * @vif: the specified virtual interface @@ -7220,13 +7506,32 @@ void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif); * ieee80211_ave_rssi - report the average RSSI for the specified interface * * @vif: the specified virtual interface + * @link_id: the link ID for MLO, or -1 for non-MLO * * Note: This function assumes that the given vif is valid. * * Return: The average RSSI value for the requested interface, or 0 if not * applicable. */ -int ieee80211_ave_rssi(struct ieee80211_vif *vif); +int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id); + +/** + * ieee80211_calculate_rx_timestamp - calculate timestamp in frame + * @hw: pointer as obtained from ieee80211_alloc_hw() + * @status: RX status + * @mpdu_len: total MPDU length (including FCS) + * @mpdu_offset: offset into MPDU to calculate timestamp at + * + * This function calculates the RX timestamp at the given MPDU offset, taking + * into account what the RX timestamp was. An offset of 0 will just normalize + * the timestamp to TSF at beginning of MPDU reception. + * + * Returns: the calculated timestamp + */ +u64 ieee80211_calculate_rx_timestamp(struct ieee80211_hw *hw, + struct ieee80211_rx_status *status, + unsigned int mpdu_len, + unsigned int mpdu_offset); /** * ieee80211_report_wowlan_wakeup - report WoWLAN wakeup @@ -7248,7 +7553,9 @@ void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif, * @band: the band to transmit on * @sta: optional pointer to get the station to send the frame to * - * Return: %true if the skb was prepared, %false otherwise + * Return: %true if the skb was prepared, %false otherwise. + * On failure, the skb is freed by this function; callers must not + * free it again. * * Note: must be called under RCU lock */ @@ -7572,6 +7879,17 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif, gfp_t gfp); /** + * ieee80211_nan_sched_update_done - notify that NAN schedule update is done + * + * This function is called by the driver to notify mac80211 that the NAN + * schedule update has been applied. + * Must be called with wiphy mutex held. May sleep. + * + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + */ +void ieee80211_nan_sched_update_done(struct ieee80211_vif *vif); + +/** * ieee80211_calc_rx_airtime - calculate estimated transmission airtime for RX. * * This function calculates the estimated airtime usage of a frame based on the @@ -7607,19 +7925,22 @@ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO. * * The driver is responsible for freeing the returned skb. * * Return: FILS discovery template. %NULL on error. */ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + unsigned int link_id); /** * ieee80211_get_unsol_bcast_probe_resp_tmpl - Get unsolicited broadcast * probe response template. * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: valid link_id during MLO or 0 for non-MLO. * * The driver is responsible for freeing the returned skb. * @@ -7627,7 +7948,8 @@ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, */ struct sk_buff * ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, - struct ieee80211_vif *vif); + struct ieee80211_vif *vif, + unsigned int link_id); /** * ieee80211_obss_color_collision_notify - notify userland about a BSS color @@ -7797,4 +8119,17 @@ int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw, int n_vifs, enum ieee80211_chanctx_switch_mode mode); +/** + * ieee80211_vif_nan_started - Return whether a NAN vif is started + * @vif: the vif + * Return: %true iff the vif is a NAN interface and NAN is started + */ +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif); + +/** + * ieee80211_encrypt_tx_skb - Encrypt the transmit skb + * @skb: the skb + * Return: 0 if success and non-zero on error + */ +int ieee80211_encrypt_tx_skb(struct sk_buff *skb); #endif /* MAC80211_H */ diff --git a/include/net/macsec.h b/include/net/macsec.h index bc7de5b53e54..d962093ee923 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -9,6 +9,7 @@ #include <linux/u64_stats_sync.h> #include <linux/if_vlan.h> +#include <linux/workqueue.h> #include <uapi/linux/if_link.h> #include <uapi/linux/if_macsec.h> @@ -123,6 +124,7 @@ struct macsec_dev_stats { * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats + * @destroy_work: deferred work to free the SA in process context after RCU grace period */ struct macsec_rx_sa { struct macsec_key key; @@ -136,7 +138,7 @@ struct macsec_rx_sa { bool active; struct macsec_rx_sa_stats __percpu *stats; struct macsec_rx_sc *sc; - struct rcu_head rcu; + struct rcu_work destroy_work; }; struct pcpu_rx_sc_stats { @@ -174,6 +176,7 @@ struct macsec_rx_sc { * @key: key structure * @ssci: short secure channel identifier * @stats: per-SA stats + * @destroy_work: deferred work to free the SA in process context after RCU grace period */ struct macsec_tx_sa { struct macsec_key key; @@ -186,7 +189,7 @@ struct macsec_tx_sa { refcount_t refcnt; bool active; struct macsec_tx_sa_stats __percpu *stats; - struct rcu_head rcu; + struct rcu_work destroy_work; }; /** diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 62e9d7673862..6d836060976a 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -10,6 +10,7 @@ #include "shm_channel.h" #define GDMA_STATUS_MORE_ENTRIES 0x00000105 +#define GDMA_STATUS_CMD_UNSUPPORTED 0xffffffff /* Structures labeled with "HW DATA" are exchanged with the hardware. All of * them are naturally aligned and hence don't need __packed. @@ -34,6 +35,8 @@ enum gdma_request_type { GDMA_CREATE_MR = 31, GDMA_DESTROY_MR = 32, GDMA_QUERY_HWC_TIMEOUT = 84, /* 0x54 */ + GDMA_ALLOC_DM = 96, /* 0x60 */ + GDMA_DESTROY_DM = 97, /* 0x61 */ }; #define GDMA_RESOURCE_DOORBELL_PAGE 27 @@ -58,8 +61,10 @@ enum gdma_eqe_type { GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, GDMA_EQE_HWC_INIT_DATA = 130, GDMA_EQE_HWC_INIT_DONE = 131, - GDMA_EQE_HWC_SOC_RECONFIG = 132, + GDMA_EQE_HWC_FPGA_RECONFIG = 132, GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, + GDMA_EQE_HWC_SOC_SERVICE = 134, + GDMA_EQE_HWC_RESET_REQUEST = 135, GDMA_EQE_RNIC_QP_FATAL = 176, }; @@ -70,6 +75,18 @@ enum { GDMA_DEVICE_MANA_IB = 3, }; +enum gdma_service_type { + GDMA_SERVICE_TYPE_NONE = 0, + GDMA_SERVICE_TYPE_RDMA_SUSPEND = 1, + GDMA_SERVICE_TYPE_RDMA_RESUME = 2, +}; + +struct mana_service_work { + struct work_struct work; + struct gdma_dev *gdma_dev; + enum gdma_service_type event; +}; + struct gdma_resource { /* Protect the bitmap */ spinlock_t lock; @@ -152,6 +169,7 @@ struct gdma_general_req { #define GDMA_MESSAGE_V1 1 #define GDMA_MESSAGE_V2 2 #define GDMA_MESSAGE_V3 3 +#define GDMA_MESSAGE_V4 4 struct gdma_general_resp { struct gdma_resp_hdr hdr; @@ -197,6 +215,12 @@ enum gdma_page_type { #define GDMA_INVALID_DMA_REGION 0 +struct mana_serv_work { + struct work_struct serv_work; + struct pci_dev *pdev; + enum gdma_eqe_type type; +}; + struct gdma_mem_info { struct device *dev; @@ -223,6 +247,8 @@ struct gdma_dev { void *driver_data; struct auxiliary_device *adev; + bool is_suspended; + bool rdma_teardown; }; /* MANA_PAGE_SIZE is the DMA unit */ @@ -364,6 +390,11 @@ struct gdma_irq_context { char name[MANA_IRQ_NAME_SZ]; }; +enum gdma_context_flags { + GC_PROBE_SUCCEEDED = 0, + GC_IN_SERVICE = 1, +}; + struct gdma_context { struct device *dev; struct dentry *mana_pci_debugfs; @@ -372,7 +403,7 @@ struct gdma_context { unsigned int max_num_queues; unsigned int max_num_msix; unsigned int num_msix_usable; - struct gdma_irq_context *irq_contexts; + struct xarray irq_contexts; /* L2 MTU */ u16 adapter_mtu; @@ -387,12 +418,15 @@ struct gdma_context { u32 test_event_eq_id; bool is_pf; + phys_addr_t bar0_pa; void __iomem *bar0_va; + resource_size_t bar0_size; void __iomem *shm_base; void __iomem *db_page_base; phys_addr_t phys_db_page_base; - u32 db_page_size; + u64 db_page_off; + u64 db_page_size; int numa_node; /* Shared memory chanenl (used to bootstrap HWC) */ @@ -406,6 +440,12 @@ struct gdma_context { /* Azure RDMA adapter */ struct gdma_dev mana_ib; + + u64 pf_cap_flags1; + + struct workqueue_struct *service_wq; + + unsigned long flags; }; static inline bool mana_gd_is_mana(struct gdma_dev *gd) @@ -441,6 +481,8 @@ int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe); void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit); +int mana_schedule_serv_work(struct gdma_context *gc, enum gdma_eqe_type type); + struct gdma_wqe { u32 reserved :24; u32 last_vbytes :8; @@ -462,6 +504,8 @@ struct gdma_wqe { #define INLINE_OOB_SMALL_SIZE 8 #define INLINE_OOB_LARGE_SIZE 24 +#define MANA_MAX_TX_WQE_SGL_ENTRIES 30 + #define MAX_TX_WQE_SIZE 512 #define MAX_RX_WQE_SIZE 256 @@ -552,17 +596,53 @@ enum { */ #define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2) #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3) +#define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4) #define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5) /* Driver can handle holes (zeros) in the device list */ #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) +/* Driver supports dynamic MSI-X vector allocation */ +#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13) + +/* Driver can self reset on EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14) + +/* Driver can self reset on FPGA Reconfig EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) + +/* Driver detects stalled send queues and recovers them */ +#define GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY BIT(18) + +#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6) + +/* Driver supports linearizing the skb when num_sge exceeds hardware limit */ +#define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20) + +/* Driver can send HWC periodically to query stats */ +#define GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY BIT(21) + +/* Driver can handle hardware recovery events during probe */ +#define GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY BIT(22) + +/* Driver supports self recovery on Hardware Channel timeouts */ +#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY BIT(25) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ - GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP) + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ + GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ + GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \ + GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \ + GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \ + GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \ + GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY | \ + GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY | \ + GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY) #define GDMA_DRV_CAP_FLAGS2 0 @@ -706,26 +786,13 @@ struct gdma_query_hwc_timeout_resp { u32 reserved; }; -enum atb_page_size { - ATB_PAGE_SIZE_4K, - ATB_PAGE_SIZE_8K, - ATB_PAGE_SIZE_16K, - ATB_PAGE_SIZE_32K, - ATB_PAGE_SIZE_64K, - ATB_PAGE_SIZE_128K, - ATB_PAGE_SIZE_256K, - ATB_PAGE_SIZE_512K, - ATB_PAGE_SIZE_1M, - ATB_PAGE_SIZE_2M, - ATB_PAGE_SIZE_MAX, -}; - enum gdma_mr_access_flags { GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0), GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1), GDMA_ACCESS_FLAG_REMOTE_READ = BIT_ULL(2), GDMA_ACCESS_FLAG_REMOTE_WRITE = BIT_ULL(3), GDMA_ACCESS_FLAG_REMOTE_ATOMIC = BIT_ULL(4), + GDMA_ACCESS_FLAG_BIND_MW = BIT_ULL(5), }; /* GDMA_CREATE_DMA_REGION */ @@ -778,6 +845,7 @@ struct gdma_destroy_dma_region_req { enum gdma_pd_flags { GDMA_PD_FLAG_INVALID = 0, + GDMA_PD_FLAG_ALLOW_GPA_MR = 1, }; struct gdma_create_pd_req { @@ -803,11 +871,24 @@ struct gdma_destory_pd_resp { };/* HW DATA */ enum gdma_mr_type { + /* + * Guest Physical Address - MRs of this type allow access + * to any DMA-mapped memory using bus-logical address + */ + GDMA_MR_TYPE_GPA = 1, /* Guest Virtual Address - MRs of this type allow access * to memory mapped by PTEs associated with this MR using a virtual * address that is set up in the MST */ GDMA_MR_TYPE_GVA = 2, + /* Guest zero-based address MRs */ + GDMA_MR_TYPE_ZBVA = 4, + /* Device address MRs */ + GDMA_MR_TYPE_DM = 5, + /* Memory Window type 1 */ + GDMA_MR_TYPE_MW1 = 6, + /* Memory Window type 2 */ + GDMA_MR_TYPE_MW2 = 7, }; struct gdma_create_mr_params { @@ -819,6 +900,16 @@ struct gdma_create_mr_params { u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } zbva; + struct { + u64 dm_handle; + u64 offset; + u64 length; + enum gdma_mr_access_flags access_flags; + } da; }; }; @@ -833,10 +924,23 @@ struct gdma_create_mr_request { u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; - } gva; - - }; + } __packed gva; + struct { + u64 dma_region_handle; + enum gdma_mr_access_flags access_flags; + } __packed zbva; + struct { + u64 dm_handle; + u64 offset; + enum gdma_mr_access_flags access_flags; + } __packed da; + } __packed; u32 reserved_2; + union { + struct { + u64 length; + } da_ext; + }; };/* HW DATA */ struct gdma_create_mr_response { @@ -855,6 +959,27 @@ struct gdma_destroy_mr_response { struct gdma_resp_hdr hdr; };/* HW DATA */ +struct gdma_alloc_dm_req { + struct gdma_req_hdr hdr; + u64 length; + u32 alignment; + u32 flags; +}; /* HW Data */ + +struct gdma_alloc_dm_resp { + struct gdma_resp_hdr hdr; + u64 dm_handle; +}; /* HW Data */ + +struct gdma_destroy_dm_req { + struct gdma_req_hdr hdr; + u64 dm_handle; +}; /* HW Data */ + +struct gdma_destroy_dm_resp { + struct gdma_resp_hdr hdr; +}; /* HW Data */ + int mana_gd_verify_vf_version(struct pci_dev *pdev); int mana_gd_register_device(struct gdma_dev *gd); @@ -886,4 +1011,11 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); void mana_register_debugfs(void); void mana_unregister_debugfs(void); +int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event); + +int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state); +int mana_gd_resume(struct pci_dev *pdev); + +bool mana_need_log(struct gdma_context *gc, int err); + #endif /* _GDMA_H */ diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h index 158b125692c2..16feb39616c1 100644 --- a/include/net/mana/hw_channel.h +++ b/include/net/mana/hw_channel.h @@ -24,6 +24,8 @@ #define HWC_INIT_DATA_PF_DEST_CQ_ID 11 #define HWC_DATA_CFG_HWC_TIMEOUT 1 +#define HWC_DATA_HW_LINK_CONNECT 2 +#define HWC_DATA_HW_LINK_DISCONNECT 3 #define HW_CHANNEL_WAIT_RESOURCE_TIMEOUT_MS 30000 @@ -49,6 +51,15 @@ union hwc_init_type_data { }; }; /* HW DATA */ +union hwc_init_soc_service_type { + u32 as_uint32; + + struct { + u32 value : 28; + u32 type : 4; + }; +}; /* HW DATA */ + struct hwc_rx_oob { u32 type : 6; u32 eom : 1; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 0d00b24eacaf..8f721cd4e4a7 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -5,6 +5,7 @@ #define _MANA_H #include <net/xdp.h> +#include <net/net_shaper.h> #include "gdma.h" #include "hw_channel.h" @@ -60,16 +61,23 @@ enum TRI_STATE { #define MAX_PORTS_IN_MANA_DEV 256 +/* Maximum number of packets per coalesced CQE */ +#define MANA_RXCOMP_OOB_NUM_PPI 4 + /* Update this count whenever the respective structures are changed */ -#define MANA_STATS_RX_COUNT 5 +#define MANA_STATS_RX_COUNT (6 + MANA_RXCOMP_OOB_NUM_PPI - 1) #define MANA_STATS_TX_COUNT 11 +#define MANA_RX_FRAG_ALIGNMENT 64 + struct mana_stats_rx { u64 packets; u64 bytes; u64 xdp_drop; u64 xdp_tx; u64 xdp_redirect; + u64 pkt_len0_err; + u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1]; struct u64_stats_sync syncp; }; @@ -224,8 +232,6 @@ struct mana_rxcomp_perpkt_info { u32 pkt_hash; }; /* HW DATA */ -#define MANA_RXCOMP_OOB_NUM_PPI 4 - /* Receive completion OOB */ struct mana_rxcomp_oob { struct mana_cqe_header cqe_hdr; @@ -327,6 +333,7 @@ struct mana_rxq { u32 datasize; u32 alloc_size; u32 headroom; + u32 frag_count; mana_handle_t rxobj; @@ -371,6 +378,13 @@ struct mana_tx_qp { struct mana_ethtool_stats { u64 stop_queue; u64 wake_queue; + u64 tx_cqe_err; + u64 tx_cqe_unknown_type; + u64 tx_linear_pkt_cnt; + u64 rx_cqe_unknown_type; +}; + +struct mana_ethtool_hc_stats { u64 hc_rx_discards_no_wqe; u64 hc_rx_err_vport_disabled; u64 hc_rx_bytes; @@ -398,26 +412,92 @@ struct mana_ethtool_stats { u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; u64 hc_tx_err_gdma; - u64 tx_cqe_err; - u64 tx_cqe_unknown_type; - u64 rx_coalesced_err; - u64 rx_cqe_unknown_type; +}; + +struct mana_ethtool_phy_stats { + /* Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; }; struct mana_context { struct gdma_dev *gdma_dev; u16 num_ports; + u8 bm_hostmode; + struct mana_ethtool_hc_stats hc_stats; struct mana_eq *eqs; struct dentry *mana_eqs_debugfs; + struct workqueue_struct *per_port_queue_reset_wq; + /* Workqueue for querying hardware stats */ + struct delayed_work gf_stats_work; + bool hwc_timeout_occurred; struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; + + /* Link state change work */ + struct work_struct link_change_work; + u32 link_event; }; struct mana_port_context { struct mana_context *ac; struct net_device *ndev; + struct work_struct queue_reset_work; u8 mac_addr[ETH_ALEN]; @@ -449,6 +529,7 @@ struct mana_port_context { u32 rxbpre_datasize; u32 rxbpre_alloc_size; u32 rxbpre_headroom; + u32 rxbpre_frag_count; struct bpf_prog *bpf_prog; @@ -466,13 +547,25 @@ struct mana_port_context { struct mutex vport_mutex; int vport_use_count; + /* Net shaper handle*/ + struct net_shaper_handle handle; + u16 port_idx; + /* Currently configured speed (mbps) */ + u32 speed; + /* Maximum speed supported by the SKU (mbps) */ + u32 max_speed; bool port_is_up; bool port_st_save; /* Saved port state */ + u8 cqe_coalescing_enable; + u32 cqe_coalescing_timeout_ns; + struct mana_ethtool_stats eth_stats; + struct mana_ethtool_phy_stats phy_stats; + /* Debugfs */ struct dentry *mana_port_debugfs; }; @@ -480,6 +573,7 @@ struct mana_port_context { netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev); int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx, bool update_hash, bool update_tab); +int mana_disable_vport_rx(struct mana_port_context *apc); int mana_alloc_queues(struct net_device *ndev); int mana_attach(struct net_device *ndev); @@ -488,6 +582,9 @@ int mana_detach(struct net_device *ndev, bool from_close); int mana_probe(struct gdma_dev *gd, bool resuming); void mana_remove(struct gdma_dev *gd, bool suspending); +int mana_rdma_probe(struct gdma_dev *gd); +void mana_rdma_remove(struct gdma_dev *gd); + void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev); int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, u32 flags); @@ -496,9 +593,14 @@ u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); -void mana_query_gf_stats(struct mana_port_context *apc); +int mana_query_gf_stats(struct mana_context *ac); +int mana_query_link_cfg(struct mana_port_context *apc); +int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, + int enable_clamping); +void mana_query_phy_stats(struct mana_port_context *apc); int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); +void mana_unmap_skb(struct sk_buff *skb, struct mana_port_context *apc); extern const struct ethtool_ops mana_ethtool_ops; extern struct dentry *mana_debugfs_root; @@ -523,6 +625,9 @@ enum mana_command_code { MANA_FENCE_RQ = 0x20006, MANA_CONFIG_VPORT_RX = 0x20007, MANA_QUERY_VPORT_CONFIG = 0x20008, + MANA_QUERY_LINK_CONFIG = 0x2000A, + MANA_SET_BW_CLAMP = 0x2000B, + MANA_QUERY_PHY_STAT = 0x2000c, /* Privileged commands for the PF mode */ MANA_REGISTER_FILTER = 0x28000, @@ -531,6 +636,35 @@ enum mana_command_code { MANA_DEREGISTER_HW_PORT = 0x28004, }; +/* Query Link Configuration*/ +struct mana_query_link_config_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; +}; /* HW DATA */ + +struct mana_query_link_config_resp { + struct gdma_resp_hdr hdr; + u32 qos_speed_mbps; + u8 qos_unconfigured; + u8 reserved1[3]; + u32 link_speed_mbps; + u8 reserved2[4]; +}; /* HW DATA */ + +/* Set Bandwidth Clamp*/ +struct mana_set_bw_clamp_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + enum TRI_STATE enable_clamping; + u32 link_speed_mbps; +}; /* HW DATA */ + +struct mana_set_bw_clamp_resp { + struct gdma_resp_hdr hdr; + u8 qos_unconfigured; + u8 reserved[7]; +}; /* HW DATA */ + /* Query Device Configuration */ struct mana_query_device_cfg_req { struct gdma_req_hdr hdr; @@ -557,7 +691,8 @@ struct mana_query_device_cfg_resp { u64 pf_cap_flags4; u16 max_num_vports; - u16 reserved; + u8 bm_hostmode; /* response v3: Bare Metal Host Mode */ + u8 reserved; u32 max_num_eqs; /* response v2: */ @@ -684,6 +819,74 @@ struct mana_query_gf_stat_resp { u64 tx_err_gdma; }; /* HW DATA */ +/* Query phy stats */ +struct mana_query_phy_stat_req { + struct gdma_req_hdr hdr; + u64 req_stats; +}; /* HW DATA */ + +struct mana_query_phy_stat_resp { + struct gdma_resp_hdr hdr; + u64 reported_stats; + + /* Aggregate Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC(Traffic class) traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC(Traffic Class) pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; +}; /* HW DATA */ + /* Configure vPort Rx Steering */ struct mana_cfg_rx_steer_req_v2 { struct gdma_req_hdr hdr; @@ -705,6 +908,10 @@ struct mana_cfg_rx_steer_req_v2 { struct mana_cfg_rx_steer_resp { struct gdma_resp_hdr hdr; + + /* V2 */ + u32 cqe_coalescing_timeout_ns; + u32 reserved1; }; /* HW DATA */ /* Register HW vPort */ @@ -801,6 +1008,7 @@ struct mana_deregister_filter_resp { #define STATISTICS_FLAGS_TX_ERRORS_GDMA_ERROR 0x0000000004000000 #define MANA_MAX_NUM_QUEUES 64 +#define MANA_DEF_NUM_QUEUES 16 #define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1) @@ -827,5 +1035,7 @@ int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, u32 doorbell_pg_id); void mana_uncfg_vport(struct mana_port_context *apc); -struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index); +struct net_device *mana_get_primary_netdev(struct mana_context *ac, + u32 port_index, + netdevice_tracker *tracker); #endif /* _MANA_H */ diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h index 5199b41497ff..dbabcfb95daf 100644 --- a/include/net/mana/shm_channel.h +++ b/include/net/mana/shm_channel.h @@ -4,6 +4,12 @@ #ifndef _SHM_CHANNEL_H #define _SHM_CHANNEL_H +#define SMC_APERTURE_BITS 256 +#define SMC_BASIC_UNIT (sizeof(u32)) +#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8)) +#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1) +#define SMC_APERTURE_SIZE (SMC_APERTURE_BITS / 8) + struct shm_channel { struct device *dev; void __iomem *base; diff --git a/include/net/mctp.h b/include/net/mctp.h index 1ecbff7116f6..d8bf9074110d 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -26,6 +26,9 @@ struct mctp_hdr { #define MCTP_VER_MIN 1 #define MCTP_VER_MAX 1 +/* Definitions for ver field */ +#define MCTP_HDR_VER_MASK GENMASK(3, 0) + /* Definitions for flags_seq_tag field */ #define MCTP_HDR_FLAG_SOM BIT(7) #define MCTP_HDR_FLAG_EOM BIT(6) @@ -69,7 +72,10 @@ struct mctp_sock { /* bind() params */ unsigned int bind_net; - mctp_eid_t bind_addr; + mctp_eid_t bind_local_addr; + mctp_eid_t bind_peer_addr; + unsigned int bind_peer_net; + bool bind_peer_set; __u8 bind_type; /* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */ @@ -183,8 +189,8 @@ struct mctp_sk_key { struct mctp_skb_cb { unsigned int magic; unsigned int net; - int ifindex; /* extended/direct addressing if set */ - mctp_eid_t src; + /* fields below provide extended addressing for ingress to recvmsg() */ + int ifindex; unsigned char halen; unsigned char haddr[MAX_ADDR_LEN]; }; @@ -212,7 +218,7 @@ static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb) BUILD_BUG_ON(sizeof(struct mctp_skb_cb) > sizeof(skb->cb)); WARN_ON(cb->magic != 0x4d435450); - return (void *)(skb->cb); + return cb; } /* If CONFIG_MCTP_FLOWS, we may add one of these as a SKB extension, @@ -222,6 +228,8 @@ struct mctp_flow { struct mctp_sk_key *key; }; +struct mctp_dst; + /* Route definition. * * These are held in the pernet->mctp.routes list, with RCU protection for @@ -229,16 +237,25 @@ struct mctp_flow { * dropped on NETDEV_UNREGISTER events. * * Updates to the route table are performed under rtnl; all reads under RCU, - * so routes cannot be referenced over a RCU grace period. Specifically: A - * caller cannot block between mctp_route_lookup and mctp_route_release() + * so routes cannot be referenced over a RCU grace period. */ struct mctp_route { mctp_eid_t min, max; unsigned char type; + unsigned int mtu; - struct mctp_dev *dev; - int (*output)(struct mctp_route *route, + + enum { + MCTP_ROUTE_DIRECT, + MCTP_ROUTE_GATEWAY, + } dst_type; + union { + struct mctp_dev *dev; + struct mctp_fq_addr gateway; + }; + + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); struct list_head list; @@ -246,12 +263,36 @@ struct mctp_route { struct rcu_head rcu; }; +/* Route lookup result: dst. Represents the results of a routing decision, + * but is only held over the individual routing operation. + * + * Will typically be stored on the caller stack, and must be released after + * usage. + */ +struct mctp_dst { + struct mctp_dev *dev; + unsigned int mtu; + mctp_eid_t nexthop; + mctp_eid_t saddr; + + /* set for direct addressing */ + unsigned char halen; + unsigned char haddr[MAX_ADDR_LEN]; + + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); +}; + +int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, + unsigned char halen, const unsigned char *haddr); + /* route interfaces */ -struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, - mctp_eid_t daddr); +int mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr, struct mctp_dst *dst); + +void mctp_dst_release(struct mctp_dst *dst); /* always takes ownership of skb */ -int mctp_local_output(struct sock *sk, struct mctp_route *rt, +int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 814b5f2e3ed5..ee70f597a4de 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -14,6 +14,7 @@ struct mptcp_info; struct mptcp_sock; +struct mptcp_pm_addr_entry; struct seq_file; /* MPTCP sk_buff extension data */ @@ -26,7 +27,9 @@ struct mptcp_ext { u32 subflow_seq; u16 data_len; __sum16 csum; - u8 use_map:1, + + struct_group(flags, + u8 use_map:1, dsn64:1, data_fin:1, use_ack:1, @@ -34,9 +37,10 @@ struct mptcp_ext { mpc_map:1, frozen:1, reset_transient:1; - u8 reset_reason:4, + u8 reset_reason:4, csum_reqd:1, infinite_map:1; + ); /* end of flags group */ }; #define MPTCPOPT_HMAC_LEN 20 @@ -100,17 +104,9 @@ struct mptcp_out_options { #define MPTCP_SCHED_MAX 128 #define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX) -#define MPTCP_SUBFLOWS_MAX 8 - -struct mptcp_sched_data { - bool reinject; - u8 subflows; - struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; -}; - struct mptcp_sched_ops { - int (*get_subflow)(struct mptcp_sock *msk, - struct mptcp_sched_data *data); + int (*get_send)(struct mptcp_sock *msk); + int (*get_retrans)(struct mptcp_sock *msk); char name[MPTCP_SCHED_NAME_MAX]; struct module *owner; @@ -120,6 +116,19 @@ struct mptcp_sched_ops { void (*release)(struct mptcp_sock *msk); } ____cacheline_aligned_in_smp; +#define MPTCP_PM_NAME_MAX 16 +#define MPTCP_PM_MAX 128 +#define MPTCP_PM_BUF_MAX (MPTCP_PM_NAME_MAX * MPTCP_PM_MAX) + +struct mptcp_pm_ops { + char name[MPTCP_PM_NAME_MAX]; + struct module *owner; + struct list_head list; + + void (*init)(struct mptcp_sock *msk); + void (*release)(struct mptcp_sock *msk); +} ____cacheline_aligned_in_smp; + #ifdef CONFIG_MPTCP void mptcp_init(void); diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 3c88d5bc5eed..3da1a6f8d3f9 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -2,8 +2,6 @@ #ifndef _NDISC_H #define _NDISC_H -#include <net/ipv6_stubs.h> - /* * ICMP codes for neighbour discovery messages */ @@ -60,15 +58,6 @@ enum { #include <net/neighbour.h> -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(val, level, fmt, ...) \ -do { \ - if (val <= ND_DEBUG) \ - net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ -} while (0) - struct ctl_table; struct inet6_dev; struct net_device; @@ -368,14 +357,6 @@ static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev); } -static inline -struct neighbour *__ipv6_neigh_lookup_noref_stub(struct net_device *dev, - const void *pkey) -{ - return ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, - ndisc_hashfn, pkey, dev); -} - static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey) { struct neighbour *n; @@ -400,28 +381,20 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev, rcu_read_unlock(); } -static inline void __ipv6_confirm_neigh_stub(struct net_device *dev, - const void *pkey) -{ - struct neighbour *n; - - rcu_read_lock(); - n = __ipv6_neigh_lookup_noref_stub(dev, pkey); - neigh_confirm(n); - rcu_read_unlock(); -} - -/* uses ipv6_stub and is meant for use outside of IPv6 core */ static inline struct neighbour *ip_neigh_gw6(struct net_device *dev, const void *addr) { +#if IS_ENABLED(CONFIG_IPV6) struct neighbour *neigh; - neigh = __ipv6_neigh_lookup_noref_stub(dev, addr); + neigh = __ipv6_neigh_lookup_noref(dev, addr); if (unlikely(!neigh)) - neigh = __neigh_create(ipv6_stub->nd_tbl, addr, dev, false); + neigh = __neigh_create(&nd_tbl, addr, dev, false); return neigh; +#else + return ERR_PTR(-EAFNOSUPPORT); +#endif } int ndisc_init(void); @@ -443,6 +416,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); + void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9a832cab5b1d..8860cc2175fc 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -92,15 +92,17 @@ struct neigh_parms { static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); - p->data[index] = val; + WRITE_ONCE(p->data[index], val); } -#define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) +#define __NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) +#define NEIGH_VAR(p, attr) READ_ONCE(__NEIGH_VAR(p, attr)) +#define NEIGH_VAR_PTR(p, attr) (&(__NEIGH_VAR(p, attr))) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ -#define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val) +#define NEIGH_VAR_INIT(p, attr, val) (__NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) @@ -176,12 +178,17 @@ struct neigh_ops { }; struct pneigh_entry { - struct pneigh_entry *next; + struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; + union { + struct list_head free_node; + struct rcu_head rcu; + }; u32 flags; u8 protocol; + bool permanent; u32 key[]; }; @@ -231,11 +238,12 @@ struct neigh_table { atomic_t gc_entries; struct list_head gc_list; struct list_head managed_list; - rwlock_t lock; + spinlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; - struct pneigh_entry **phash_buckets; + struct mutex phash_lock; + struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) @@ -260,13 +268,15 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) +#define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 -#define NTF_EXT_MASK (NTF_EXT_MANAGED) +#define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) +#define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; @@ -370,13 +380,20 @@ struct net *neigh_parms_net(const struct neigh_parms *parms) unsigned long neigh_rand_reach_time(unsigned long base); +static inline void neigh_set_reach_time(struct neigh_parms *p) +{ + unsigned long base = NEIGH_VAR(p, BASE_REACHABLE_TIME); + + WRITE_ONCE(p->reachable_time, neigh_rand_reach_time(base)); +} + void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev, - int creat); -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); + const void *key, struct net_device *dev); +int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, + struct net_device *dev, u32 flags, u8 protocol, + bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); @@ -472,11 +489,15 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { - unsigned int seq, hh_alen; + unsigned int seq, hh_alen = HH_DATA_ALIGN(ETH_HLEN); + int err; + + err = skb_cow_head(skb, hh_alen); + if (err) + return err; do { seq = read_seqbegin(&hh->hh_lock); - hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index f467a66abc6b..80de5e98a66d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -37,6 +37,7 @@ #include <net/netns/smc.h> #include <net/netns/bpf.h> #include <net/netns/mctp.h> +#include <net/netns/vsock.h> #include <net/net_trackers.h> #include <linux/ns_common.h> #include <linux/idr.h> @@ -83,6 +84,9 @@ struct net { struct llist_node defer_free_list; struct llist_node cleanup_list; /* namespaces on death row */ + struct list_head ptype_all; + struct list_head ptype_specific; + #ifdef CONFIG_KEYS struct key_tag *key_domain; /* Key domain of operation tag */ #endif @@ -117,6 +121,7 @@ struct net { * it is critical that it is on a read_mostly cache line. */ u32 hash_mix; + bool is_dying; struct net_device *loopback_dev; /* The loopback */ @@ -193,6 +198,9 @@ struct net { /* Move to a better place when the config guard is removed. */ struct mutex rtnl_mutex; #endif +#if IS_ENABLED(CONFIG_VSOCKETS) + struct netns_vsock vsock; +#endif } __randomize_layout; #include <linux/seq_file_net.h> @@ -201,7 +209,7 @@ struct net { extern struct net init_net; #ifdef CONFIG_NET_NS -struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net); void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid); @@ -215,7 +223,7 @@ extern struct task_struct *cleanup_net_task; #else /* CONFIG_NET_NS */ #include <linux/sched.h> #include <linux/nsproxy.h> -static inline struct net *copy_net_ns(unsigned long flags, +static inline struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { if (flags & CLONE_NEWNET) @@ -256,13 +264,18 @@ void ipx_unregister_sysctl(void); #define ipx_unregister_sysctl() #endif +static inline struct net *to_net_ns(struct ns_common *ns) +{ + return container_of(ns, struct net, ns); +} + #ifdef CONFIG_NET_NS void __put_net(struct net *net); /* Try using get_net_track() instead */ static inline struct net *get_net(struct net *net) { - refcount_inc(&net->ns.count); + ns_ref_inc(net); return net; } @@ -273,7 +286,7 @@ static inline struct net *maybe_get_net(struct net *net) * exists. If the reference count is zero this * function fails and returns NULL. */ - if (!refcount_inc_not_zero(&net->ns.count)) + if (!ns_ref_get(net)) net = NULL; return net; } @@ -281,7 +294,7 @@ static inline struct net *maybe_get_net(struct net *net) /* Try using put_net_track() instead */ static inline void put_net(struct net *net) { - if (refcount_dec_and_test(&net->ns.count)) + if (ns_ref_put(net)) __put_net(net); } @@ -293,10 +306,10 @@ int net_eq(const struct net *net1, const struct net *net2) static inline int check_net(const struct net *net) { - return refcount_read(&net->ns.count) != 0; + return ns_ref_read(net) != 0; } -void net_drop_ns(void *); +void net_drop_ns(struct ns_common *); void net_passive_dec(struct net *net); #else @@ -472,8 +485,8 @@ struct pernet_operations { void (*exit)(struct net *net); void (*exit_batch)(struct list_head *net_exit_list); /* Following method is called with RTNL held. */ - void (*exit_batch_rtnl)(struct list_head *net_exit_list, - struct list_head *dev_kill_list); + void (*exit_rtnl)(struct net *net, + struct list_head *dev_kill_list); unsigned int * const id; const size_t size; }; diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h index 5c3f49b52fe9..3939b816b001 100644 --- a/include/net/net_shaper.h +++ b/include/net/net_shaper.h @@ -53,6 +53,7 @@ struct net_shaper { /* private: */ u32 leaves; /* accounted only for NODE scope */ + bool valid; struct rcu_head rcu; }; diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h new file mode 100644 index 000000000000..3d3aef80beac --- /dev/null +++ b/include/net/netdev_lock.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _NET_NETDEV_LOCK_H +#define _NET_NETDEV_LOCK_H + +#include <linux/lockdep.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> + +static inline bool netdev_trylock(struct net_device *dev) +{ + return mutex_trylock(&dev->lock); +} + +static inline void netdev_assert_locked(const struct net_device *dev) +{ + lockdep_assert_held(&dev->lock); +} + +static inline void +netdev_assert_locked_or_invisible(const struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_assert_locked(dev); +} + +static inline bool netdev_need_ops_lock(const struct net_device *dev) +{ + bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; + +#if IS_ENABLED(CONFIG_NET_SHAPER) + ret |= !!dev->netdev_ops->net_shaper_ops; +#endif + + return ret; +} + +static inline void netdev_lock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); +} + +static inline void netdev_unlock_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); +} + +static inline void netdev_lock_ops_to_full(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_lock(dev); +} + +static inline void netdev_unlock_full_to_ops(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_assert_locked(dev); + else + netdev_unlock(dev); +} + +static inline void netdev_ops_assert_locked(const struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + lockdep_assert_held(&dev->lock); + else + ASSERT_RTNL(); +} + +static inline void +netdev_ops_assert_locked_or_invisible(const struct net_device *dev) +{ + if (dev->reg_state == NETREG_REGISTERED || + dev->reg_state == NETREG_UNREGISTERING) + netdev_ops_assert_locked(dev); +} + +static inline void netdev_lock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_lock(dev); + else + rtnl_lock(); +} + +static inline void netdev_unlock_ops_compat(struct net_device *dev) +{ + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + rtnl_unlock(); +} + +static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) +{ + if (a == b) + return 0; + + /* Allow locking multiple devices only under rtnl_lock, + * the exact order doesn't matter. + * Note that upper devices don't lock their ops, so nesting + * mostly happens in batched device removal for now. + */ + return lockdep_rtnl_is_held() ? -1 : 1; +} + +#define netdev_lockdep_set_classes(dev) \ +{ \ + static struct lock_class_key qdisc_tx_busylock_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + static struct lock_class_key dev_addr_list_lock_key; \ + static struct lock_class_key dev_instance_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ + lockdep_set_class(&(dev)->addr_list_lock, \ + &dev_addr_list_lock_key); \ + lockdep_set_class(&(dev)->lock, \ + &dev_instance_lock_key); \ + lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ + lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ + &qdisc_xmit_lock_key); \ +} + +#define netdev_lock_dereference(p, dev) \ + rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock)) + +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr); + +#endif diff --git a/include/net/netdev_netlink.h b/include/net/netdev_netlink.h new file mode 100644 index 000000000000..075962dbe743 --- /dev/null +++ b/include/net/netdev_netlink.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_NETDEV_NETLINK_H +#define __NET_NETDEV_NETLINK_H + +#include <linux/list.h> + +struct netdev_nl_sock { + struct mutex lock; + struct list_head bindings; +}; + +#endif /* __NET_NETDEV_NETLINK_H */ diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index b02bb9f109d5..70c9fe9e83cc 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -14,6 +14,10 @@ struct netdev_config { u8 hds_config; }; +struct netdev_queue_config { + u32 rx_page_size; +}; + /* See the netdev.yaml spec for definition of each statistic */ struct netdev_queue_stats_rx { u64 bytes; @@ -23,6 +27,7 @@ struct netdev_queue_stats_rx { u64 hw_drops; u64 hw_drop_overruns; + u64 csum_complete; u64 csum_unnecessary; u64 csum_none; u64 csum_bad; @@ -84,9 +89,11 @@ struct netdev_queue_stats_tx { * for some of the events is not maintained, and reliable "total" cannot * be provided). * + * Ops are called under the instance lock if netdev_need_ops_lock() + * returns true, otherwise under rtnl_lock. * Device drivers can assume that when collecting total device stats, * the @get_base_stats and subsequent per-queue calls are performed - * "atomically" (without releasing the rtnl_lock). + * "atomically" (without releasing the relevant lock). * * Device drivers are encouraged to reset the per-queue statistics when * number of queues change. This is because the primary use case for @@ -102,6 +109,17 @@ struct netdev_stat_ops { struct netdev_queue_stats_tx *tx); }; +void netdev_stat_queue_sum(struct net_device *netdev, + int rx_start, int rx_end, + struct netdev_queue_stats_rx *rx_sum, + int tx_start, int tx_end, + struct netdev_queue_stats_tx *tx_sum); + +enum { + /* The queue checks and honours the page size qcfg parameter */ + QCFG_RX_PAGE_SIZE = 0x1, +}; + /** * struct netdev_queue_mgmt_ops - netdev ops for queue management * @@ -117,22 +135,65 @@ struct netdev_stat_ops { * * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped * queue's memory is written at the specified address. + * + * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used + * for this queue. Return NULL on error. + * + * @ndo_default_qcfg: (Optional) Populate queue config struct with defaults. + * Queue config structs are passed to this helper before + * the user-requested settings are applied. + * + * @ndo_validate_qcfg: (Optional) Check if queue config is supported. + * Called when configuration affecting a queue may be + * changing, either due to NIC-wide config, or config + * scoped to the queue at a specified index. + * When NIC-wide config is changed the callback will + * be invoked for all queues. + * + * @ndo_queue_create: Create a new RX queue on a virtual device that will + * be paired with a physical device's queue via leasing. + * Return the new queue id on success, negative error + * on failure. + * + * @supported_params: Bitmask of supported parameters, see QCFG_*. + * + * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while + * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only + * be called for an interface which is open. */ struct netdev_queue_mgmt_ops { - size_t ndo_queue_mem_size; - int (*ndo_queue_mem_alloc)(struct net_device *dev, - void *per_queue_mem, - int idx); - void (*ndo_queue_mem_free)(struct net_device *dev, - void *per_queue_mem); - int (*ndo_queue_start)(struct net_device *dev, - void *per_queue_mem, - int idx); - int (*ndo_queue_stop)(struct net_device *dev, - void *per_queue_mem, - int idx); + size_t ndo_queue_mem_size; + int (*ndo_queue_mem_alloc)(struct net_device *dev, + struct netdev_queue_config *qcfg, + void *per_queue_mem, + int idx); + void (*ndo_queue_mem_free)(struct net_device *dev, + void *per_queue_mem); + int (*ndo_queue_start)(struct net_device *dev, + struct netdev_queue_config *qcfg, + void *per_queue_mem, + int idx); + int (*ndo_queue_stop)(struct net_device *dev, + void *per_queue_mem, + int idx); + void (*ndo_default_qcfg)(struct net_device *dev, + struct netdev_queue_config *qcfg); + int (*ndo_validate_qcfg)(struct net_device *dev, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack); + struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, + int idx); + int (*ndo_queue_create)(struct net_device *dev, + struct netlink_ext_ack *extack); + + unsigned int supported_params; }; +void netdev_queue_config(struct net_device *dev, int rxq, + struct netdev_queue_config *qcfg); + +bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx); + /** * DOC: Lockless queue stopping / waking helpers. * @@ -275,28 +336,58 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, #define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_try_stop(txq, get_desc, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_try_stop(_txq, get_desc, start_thrs); \ }) +static inline void netif_subqueue_sent(const struct net_device *dev, + unsigned int idx, unsigned int bytes) +{ + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(dev, idx); + netdev_tx_sent_queue(txq, bytes); +} + +static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq) +{ + unsigned long trans_start = READ_ONCE(txq->trans_start); + + if (netif_xmit_stopped(txq) && + time_after(jiffies, trans_start + txq->dev->watchdog_timeo)) + return jiffies_to_msecs(jiffies - trans_start); + + return 0; +} + #define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_maybe_stop(_txq, get_desc, stop_thrs, start_thrs); \ }) #define netif_subqueue_completed_wake(dev, idx, pkts, bytes, \ get_desc, start_thrs) \ ({ \ - struct netdev_queue *txq; \ + struct netdev_queue *_txq; \ \ - txq = netdev_get_tx_queue(dev, idx); \ - netif_txq_completed_wake(txq, pkts, bytes, \ + _txq = netdev_get_tx_queue(dev, idx); \ + netif_txq_completed_wake(_txq, pkts, bytes, \ get_desc, start_thrs); \ }) -#endif +struct device *netdev_queue_get_dma_dev(struct net_device *dev, + unsigned int idx, + enum netdev_queue_type type); +bool netdev_can_create_queue(const struct net_device *dev, + struct netlink_ext_ack *extack); +bool netdev_can_lease_queue(const struct net_device *dev, + struct netlink_ext_ack *extack); +bool netdev_queue_busy(struct net_device *dev, unsigned int idx, + enum netdev_queue_type type, + struct netlink_ext_ack *extack); +#endif /* _LINUX_NET_QUEUES_H */ diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index 596836abf7bf..9415a94d333d 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -7,26 +7,38 @@ #include <linux/sysfs.h> #include <net/xdp.h> #include <net/page_pool/types.h> +#include <net/netdev_queues.h> +#include <net/rps-types.h> /* This structure contains an instance of an RX queue. */ struct netdev_rx_queue { struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_RPS struct rps_map __rcu *rps_map; - struct rps_dev_flow_table __rcu *rps_flow_table; + rps_tag_ptr rps_flow_table; #endif struct kobject kobj; + const struct attribute_group **groups; struct net_device *dev; netdevice_tracker dev_tracker; + /* All fields below are "ops protected", + * see comment about net_device::lock + */ #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif - /* NAPI instance for the queue - * Readers and writers must hold RTNL - */ struct napi_struct *napi; + struct netdev_queue_config qcfg; struct pp_memory_provider_params mp_params; + + /* If a queue is leased, then the lease pointer is always + * valid. From the physical device it points to the virtual + * queue, and from the virtual device it points to the + * physical queue. + */ + struct netdev_rx_queue *lease; + netdevice_tracker lease_tracker; } ____cacheline_aligned_in_smp; /* @@ -55,6 +67,18 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue) return index; } -int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); +enum netif_lease_dir { + NETIF_VIRT_TO_PHYS, + NETIF_PHYS_TO_VIRT, +}; -#endif +struct netdev_rx_queue * +__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, + enum netif_lease_dir dir); + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); +void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src); +void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, + struct netdev_rx_queue *rxq_src); +#endif /* _LINUX_NETDEV_RX_QUEUE_H */ diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 2c8c2b023848..b39417ad955e 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -13,15 +13,9 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; -#ifdef CONFIG_NF_CT_PROTO_DCCP -extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp; #endif -#ifdef CONFIG_NF_CT_PROTO_UDPLITE -extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite; -#endif #ifdef CONFIG_NF_CT_PROTO_GRE extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre; #endif diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h index c653fcb88354..09de2f2686b5 100644 --- a/include/net/netfilter/ipv4/nf_reject.h +++ b/include/net/netfilter/ipv4/nf_reject.h @@ -10,14 +10,6 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook); void nf_send_reset(struct net *net, struct sock *, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth); - struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h index d729344ba644..94ec0b9f2838 100644 --- a/include/net/netfilter/ipv6/nf_reject.h +++ b/include/net/netfilter/ipv6/nf_reject.h @@ -9,16 +9,6 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char cod unsigned int hooknum); void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen); - struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 3f02a45773e8..bc42dd0e10e6 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -16,9 +16,9 @@ #include <linux/bitops.h> #include <linux/compiler.h> +#include <net/netns/generic.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tcp.h> -#include <linux/netfilter/nf_conntrack_dccp.h> #include <linux/netfilter/nf_conntrack_sctp.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> @@ -31,7 +31,6 @@ struct nf_ct_udp { /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ - struct nf_ct_dccp dccp; struct ip_ct_sctp sctp; struct ip_ct_tcp tcp; struct nf_ct_udp udp; @@ -306,8 +305,19 @@ static inline bool nf_ct_is_expired(const struct nf_conn *ct) /* use after obtaining a reference count */ static inline bool nf_ct_should_gc(const struct nf_conn *ct) { - return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && - !nf_ct_is_dying(ct); + if (!nf_ct_is_confirmed(ct)) + return false; + + /* load ct->timeout after is_confirmed() test. + * Pairs with __nf_conntrack_confirm() which: + * 1. Increases ct->timeout value + * 2. Inserts ct into rcu hlist + * 3. Sets the confirmed bit + * 4. Unlocks the hlist lock + */ + smp_acquire__after_ctrl_dep(); + + return nf_ct_is_expired(ct) && !nf_ct_is_dying(ct); } #define NF_CT_DAY (86400 * HZ) diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 3384859a8921..8883575adcc1 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -83,6 +83,11 @@ void nf_conntrack_lock(spinlock_t *lock); extern spinlock_t nf_conntrack_expect_lock; +static inline void lockdep_nfct_expect_lock_held(void) +{ + lockdep_assert_held(&nf_conntrack_expect_lock); +} + /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 1b58b5b91ff6..cf0166520cf3 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -13,20 +13,20 @@ struct nf_conncount_list { u32 last_gc; /* jiffies at most recent gc */ struct list_head head; /* connections with the same filtering key */ unsigned int count; /* length of list */ + unsigned int last_gc_count; /* length of list at most recent gc */ }; struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen); void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data); -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key); -int nf_conncount_add(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); +int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb, + u16 l3num, struct nf_conncount_list *list); void nf_conncount_list_init(struct nf_conncount_list *list); diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 165e7a03b8e9..80f50fd0f7ad 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -22,10 +22,16 @@ struct nf_conntrack_expect { /* Hash member */ struct hlist_node hnode; + /* Network namespace */ + possible_net_t net; + /* We expect this tuple, with the following mask */ struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_mask mask; +#ifdef CONFIG_NF_CONNTRACK_ZONES + struct nf_conntrack_zone zone; +#endif /* Usage count. */ refcount_t use; @@ -39,8 +45,11 @@ struct nf_conntrack_expect { void (*expectfn)(struct nf_conn *new, struct nf_conntrack_expect *this); + /* Helper that created this expectation */ + struct nf_conntrack_helper __rcu *helper; + /* Helper to assign to new connection */ - struct nf_conntrack_helper *helper; + struct nf_conntrack_helper __rcu *assign_helper; /* The conntrack of the master connection */ struct nf_conn *master; @@ -62,7 +71,17 @@ struct nf_conntrack_expect { static inline struct net *nf_ct_exp_net(struct nf_conntrack_expect *exp) { - return nf_ct_net(exp->master); + return read_pnet(&exp->net); +} + +static inline bool nf_ct_exp_zone_equal_any(const struct nf_conntrack_expect *a, + const struct nf_conntrack_zone *b) +{ +#ifdef CONFIG_NF_CONNTRACK_ZONES + return a->zone.id == b->id; +#else + return true; +#endif } #define NF_CT_EXP_POLICY_NAME_LEN 16 diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 1f47bef51722..fde2427ceb8f 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -30,7 +30,7 @@ struct nf_conntrack_l4proto { /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); - /* convert protoinfo to nfnetink attributes */ + /* convert protoinfo to nfnetlink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy); @@ -107,21 +107,11 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); -int nf_conntrack_udplite_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state); int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); -int nf_conntrack_dccp_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state); int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, @@ -137,7 +127,6 @@ void nf_conntrack_generic_init_net(struct net *net); void nf_conntrack_tcp_init_net(struct net *net); void nf_conntrack_udp_init_net(struct net *net); void nf_conntrack_gre_init_net(struct net *net); -void nf_conntrack_dccp_init_net(struct net *net); void nf_conntrack_sctp_init_net(struct net *net); void nf_conntrack_icmp_init_net(struct net *net); void nf_conntrack_icmpv6_init_net(struct net *net); @@ -145,8 +134,6 @@ void nf_conntrack_icmpv6_init_net(struct net *net); /* Existing built-in generic protocol */ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; -#define MAX_NF_CT_PROTO IPPROTO_UDPLITE - const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); /* Generic netlink helpers */ @@ -223,13 +210,6 @@ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) } #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP -static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.dccp; -} -#endif - #ifdef CONFIG_NF_CT_PROTO_SCTP static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 9fdaba911de6..3a66d4abb6d6 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -14,6 +14,7 @@ struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; + struct rcu_head rcu; char data[]; }; diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h index f7dd950ff250..4d55b7325707 100644 --- a/include/net/netfilter/nf_conntrack_tuple.h +++ b/include/net/netfilter/nf_conntrack_tuple.h @@ -11,7 +11,7 @@ #ifndef _NF_CONNTRACK_TUPLE_H #define _NF_CONNTRACK_TUPLE_H -#include <linux/netfilter/x_tables.h> +#include <linux/netfilter.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/list_nulls.h> diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h index b175d271aec9..609bcf422a9b 100644 --- a/include/net/netfilter/nf_dup_netdev.h +++ b/include/net/netfilter/nf_dup_netdev.h @@ -3,10 +3,23 @@ #define _NF_DUP_NETDEV_H_ #include <net/netfilter/nf_tables.h> +#include <linux/netdevice.h> +#include <linux/sched.h> void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif); void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif); +#define NF_RECURSION_LIMIT 2 + +static inline u8 *nf_get_nf_dup_skb_recursion(void) +{ +#ifndef CONFIG_PREEMPT_RT + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +#else + return ¤t->net_xmit.nf_dup_skb_recursion; +#endif +} + struct nft_offload_ctx; struct nft_flow_rule; diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index d711642e78b5..7b23b245a5a8 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -107,6 +107,19 @@ enum flow_offload_xmit_type { #define NF_FLOW_TABLE_ENCAP_MAX 2 +struct flow_offload_tunnel { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + + u8 l3_proto; +}; + struct flow_offload_tuple { union { struct in_addr src_v4; @@ -130,22 +143,26 @@ struct flow_offload_tuple { __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; + /* All members above are keys for lookups, see flow_offload_hash(). */ struct { } __hash; - u8 dir:2, + u16 dir:2, xmit_type:3, encap_num:2, + needs_gso_segment:1, + tun_num:2, in_vlan_ingress:2; u16 mtu; union { struct { struct dst_entry *dst_cache; + u32 ifidx; u32 dst_cookie; }; struct { u32 ifidx; - u32 hw_ifidx; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; @@ -206,7 +223,9 @@ struct nf_flow_route { u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; u8 num_encaps:2, + num_tuns:2, ingress_vlans:2; } in; struct { @@ -214,6 +233,7 @@ struct nf_flow_route { u32 hw_ifindex; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; + u8 needs_gso_segment:1; } out; enum flow_offload_xmit_type xmit_type; } tuple[FLOW_OFFLOAD_DIR_MAX]; @@ -222,6 +242,12 @@ struct nf_flow_route { struct flow_offload *flow_offload_alloc(struct nf_conn *ct); void flow_offload_free(struct flow_offload *flow); +struct nft_flowtable; +struct nft_pktinfo; +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft); + static inline int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, flow_setup_cb_t *cb, void *cb_priv) @@ -370,7 +396,7 @@ static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) { - if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + if (!pskb_may_pull(skb, ETH_HLEN + PPPOE_SES_HLEN)) return false; *inner_proto = __nf_flow_pppoe_proto(skb); diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index e55eedc84ed7..00506792a06d 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -59,6 +59,9 @@ extern int sysctl_nf_log_all_netns; int nf_log_register(u_int8_t pf, struct nf_logger *logger); void nf_log_unregister(struct nf_logger *logger); +/* Check if any logger is registered for a given protocol family. */ +bool nf_log_is_registered(u_int8_t pf); + int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger); void nf_log_unset(struct net *net, const struct nf_logger *logger); diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 4aeffddb7586..3978c3174cdb 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -6,12 +6,15 @@ #include <linux/ipv6.h> #include <linux/jhash.h> #include <linux/netfilter.h> +#include <linux/rhashtable-types.h> #include <linux/skbuff.h> /* Each queued (to userspace) skbuff has one of these. */ struct nf_queue_entry { struct list_head list; + struct rhash_head hash_node; struct sk_buff *skb; + struct net_device *skb_dev; unsigned int id; unsigned int hook_index; /* index in hook_entries->hook[] */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) @@ -19,6 +22,7 @@ struct nf_queue_entry { struct net_device *physout; #endif struct nf_hook_state state; + bool nf_ct_is_unconfirmed; u16 size; /* sizeof(entry) + saved route keys */ /* extra space to store route keys */ diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h index 7c669792fb9c..f1db33bc6bf8 100644 --- a/include/net/netfilter/nf_reject.h +++ b/include/net/netfilter/nf_reject.h @@ -34,7 +34,6 @@ static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff, /* Protocols with partial checksums. */ case IPPROTO_UDPLITE: - case IPPROTO_DCCP: return false; } return true; diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 803d5f1601f9..9d844354c4d9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -6,7 +6,6 @@ #include <linux/list.h> #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> -#include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_tables.h> #include <linux/u64_stats_sync.h> #include <linux/rhashtable.h> @@ -32,7 +31,9 @@ struct nft_pktinfo { const struct nf_hook_state *state; u8 flags; u8 tprot; + __be16 ethertype; u16 fragoff; + u16 nhoff; u16 thoff; u16 inneroff; }; @@ -84,6 +85,8 @@ static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt) { pkt->flags = 0; pkt->tprot = 0; + pkt->ethertype = pkt->skb->protocol; + pkt->nhoff = 0; pkt->thoff = 0; pkt->fragoff = 0; } @@ -123,17 +126,6 @@ struct nft_regs { }; }; -struct nft_regs_track { - struct { - const struct nft_expr *selector; - const struct nft_expr *bitwise; - u8 num_reg; - } regs[NFT_REG32_NUM]; - - const struct nft_expr *cur; - const struct nft_expr *last; -}; - /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit @@ -188,6 +180,13 @@ static inline u64 nft_reg_load64(const u32 *sreg) return get_unaligned((u64 *)sreg); } +static inline bool nft_reg_overlap(u8 src, u8 dst, u32 len) +{ + unsigned int n = DIV_ROUND_UP(len, sizeof(u32)); + + return src != dst && src < dst + n && dst < src + n; +} + static inline void nft_data_copy(u32 *dst, const struct nft_data *src, unsigned int len) { @@ -317,11 +316,13 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv) * @NFT_ITER_UNSPEC: unspecified, to catch errors * @NFT_ITER_READ: read-only iteration over set elements * @NFT_ITER_UPDATE: iteration under mutex to update set element state + * @NFT_ITER_UPDATE_CLONE: clone set before iteration under mutex to update element */ enum nft_iter_type { NFT_ITER_UNSPEC, NFT_ITER_READ, NFT_ITER_UPDATE, + NFT_ITER_UPDATE_CLONE, }; struct nft_set; @@ -424,8 +425,6 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); -bool nft_expr_reduce_bitwise(struct nft_regs_track *track, - const struct nft_expr *expr); struct nft_set_ext; @@ -452,6 +451,7 @@ struct nft_set_ext; * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @gc_init: initialize garbage collection + * @abort_skip_removal: skip removal of elements from abort path * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster @@ -459,19 +459,13 @@ struct nft_set_ext; * control plane functions. */ struct nft_set_ops { - bool (*lookup)(const struct net *net, + const struct nft_set_ext * (*lookup)(const struct net *net, const struct nft_set *set, + const u32 *key); + const struct nft_set_ext * (*update)(struct nft_set *set, const u32 *key, - const struct nft_set_ext **ext); - bool (*update)(struct nft_set *set, - const u32 *key, - struct nft_elem_priv * - (*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *), const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext); + struct nft_regs *regs); bool (*delete)(const struct nft_set *set, const u32 *key); @@ -515,6 +509,7 @@ struct nft_set_ops { const struct nft_set *set); void (*gc_init)(const struct nft_set *set); + bool abort_skip_removal; unsigned int elemsize; }; @@ -562,6 +557,7 @@ struct nft_set_elem_expr { * @size: maximum set size * @field_len: length of each field in concatenation, bytes * @field_count: number of concatenated fields in element + * @in_update_walk: true during ->walk() in transaction phase * @use: number of rules references to this set * @nelems: number of elements * @ndeact: number of deactivated elements queued for removal @@ -596,6 +592,7 @@ struct nft_set { u32 size; u8 field_len[NFT_REG32_COUNT]; u8 field_count; + bool in_update_walk; u32 use; atomic_t nelems; u32 ndeact; @@ -875,6 +872,8 @@ struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set, u64 timeout, u64 expiration, gfp_t gfp); int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr_array[]); +void nft_set_elem_expr_destroy(const struct nft_ctx *ctx, + struct nft_set_elem_expr *elem_expr); void nft_set_elem_destroy(const struct nft_set *set, const struct nft_elem_priv *elem_priv, bool destroy_expr); @@ -940,7 +939,6 @@ struct nft_offload_ctx; * @destroy_clone: destruction clone function * @dump: function to dump parameters * @validate: validate expression, called during loop detection - * @reduce: reduce expression * @gc: garbage collection expression * @offload: hardware offload expression * @offload_action: function to report true/false to allocate one slot or not in the flow @@ -974,8 +972,6 @@ struct nft_expr_ops { bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr); - bool (*reduce)(struct nft_regs_track *track, - const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, @@ -1095,6 +1091,29 @@ struct nft_rule_blob { __attribute__((aligned(__alignof__(struct nft_rule_dp)))); }; +enum nft_chain_types { + NFT_CHAIN_T_DEFAULT = 0, + NFT_CHAIN_T_ROUTE, + NFT_CHAIN_T_NAT, + NFT_CHAIN_T_MAX +}; + +/** + * struct nft_chain_validate_state - validation state + * + * If a chain is encountered again during table validation it is + * possible to avoid revalidation provided the calling context is + * compatible. This structure stores relevant calling context of + * previous validations. + * + * @hook_mask: the hook numbers and locations the chain is linked to + * @depth: the deepest call chain level the chain is linked to + */ +struct nft_chain_validate_state { + u8 hook_mask[NFT_CHAIN_T_MAX]; + u8 depth; +}; + /** * struct nft_chain - nf_tables chain * @@ -1113,6 +1132,7 @@ struct nft_rule_blob { * @udlen: user data length * @udata: user data in the chain * @blob_next: rule blob pointer to the next in the chain + * @vstate: validation state */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; @@ -1132,9 +1152,10 @@ struct nft_chain { /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; + struct nft_chain_validate_state vstate; }; -int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); +int nft_chain_validate(const struct nft_ctx *ctx, struct nft_chain *chain); int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set, const struct nft_set_iter *iter, struct nft_elem_priv *elem_priv); @@ -1142,13 +1163,6 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); -enum nft_chain_types { - NFT_CHAIN_T_DEFAULT = 0, - NFT_CHAIN_T_ROUTE, - NFT_CHAIN_T_NAT, - NFT_CHAIN_T_MAX -}; - /** * struct nft_chain_type - nf_tables chain type info * @@ -1197,14 +1211,22 @@ struct nft_stats { struct u64_stats_sync syncp; }; +#define NFT_HOOK_REMOVE (1 << 0) + struct nft_hook { struct list_head list; - struct nf_hook_ops ops; + struct list_head ops_list; struct rcu_head rcu; char ifname[IFNAMSIZ]; u8 ifnamelen; + u8 flags; }; +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev); +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev); + /** * struct nft_base_chain - nf_tables base chain * @@ -1653,6 +1675,16 @@ struct nft_trans { }; /** + * struct nft_trans_hook - nf_tables hook update in transaction + * @list: used internally + * @hook: struct nft_hook with the device hook + */ +struct nft_trans_hook { + struct list_head list; + struct nft_hook *hook; +}; + +/** * struct nft_trans_binding - nf_tables object with binding support in transaction * @nft_trans: base structure, MUST be first member * @binding_list: list of objects with possible bindings @@ -1837,6 +1869,11 @@ struct nft_trans_gc { struct rcu_head rcu; }; +static inline int nft_trans_gc_space(const struct nft_trans_gc *trans) +{ + return NFT_TRANS_GC_BATCHCOUNT - trans->count; +} + static inline void nft_ctx_update(struct nft_ctx *ctx, const struct nft_trans *trans) { @@ -1913,7 +1950,6 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; u64 tstamp; - unsigned int base_seq; unsigned int gc_seq; u8 validate_state; struct work_struct destroy_work; @@ -1931,25 +1967,4 @@ static inline u64 nft_net_tstamp(const struct net *net) return nft_pernet(net)->tstamp; } -#define __NFT_REDUCE_READONLY 1UL -#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY - -static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) -{ - return expr->ops->reduce == NFT_REDUCE_READONLY; -} - -void nft_reg_track_update(struct nft_regs_track *track, - const struct nft_expr *expr, u8 dreg, u8 len); -void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); -void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); - -static inline bool nft_reg_track_cmp(struct nft_regs_track *track, - const struct nft_expr *expr, u8 dreg) -{ - return track->regs[dreg].selector && - track->regs[dreg].selector->ops == expr->ops && - track->regs[dreg].num_reg == 0; -} - #endif /* _NET_NF_TABLES_H */ diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 03b6165756fc..b8df5acbb723 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -73,7 +73,7 @@ struct nft_ct { struct nft_payload { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 dreg; }; @@ -94,34 +94,35 @@ extern const struct nft_set_type nft_set_pipapo_type; extern const struct nft_set_type nft_set_pipapo_avx2_type; #ifdef CONFIG_MITIGATION_RETPOLINE -bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -#else -static inline bool -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) -{ - return set->ops->lookup(net, set, key, ext); -} +const struct nft_set_ext * +nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); #endif +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); + /* called from nft_pipapo_avx2.c */ -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); /* called from nft_set_pipapo.c */ -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); void nft_counter_init_seqcount(void); @@ -181,4 +182,7 @@ void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); +struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs *regs); #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index fcf967286e37..e715405a73cb 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -12,16 +12,19 @@ static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) ip = ip_hdr(pkt->skb); pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = ip->protocol; + pkt->ethertype = pkt->skb->protocol; + pkt->nhoff = 0; pkt->thoff = ip_hdrlen(pkt->skb); pkt->fragoff = ntohs(ip->frag_off) & IP_OFFSET; } -static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) +static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, + int nhoff) { struct iphdr *iph, _iph; u32 len, thoff, skb_len; - iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), + iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb) + nhoff, sizeof(*iph), &_iph); if (!iph) return -1; @@ -31,7 +34,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; - skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + skb_len = pkt->skb->len - skb_network_offset(pkt->skb) - nhoff; if (skb_len < len) return -1; @@ -42,7 +45,9 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; - pkt->thoff = skb_network_offset(pkt->skb) + thoff; + pkt->ethertype = pkt->skb->protocol; + pkt->nhoff = nhoff; + pkt->thoff = skb_network_offset(pkt->skb) + nhoff + thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; @@ -50,7 +55,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { - if (__nft_set_pktinfo_ipv4_validate(pkt) < 0) + if (__nft_set_pktinfo_ipv4_validate(pkt, 0) < 0) nft_set_pktinfo_unspec(pkt); } @@ -78,6 +83,8 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) } pkt->flags = NFT_PKTINFO_L4PROTO; + pkt->ethertype = pkt->skb->protocol; + pkt->nhoff = 0; pkt->tprot = iph->protocol; pkt->thoff = thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index a0633eeaec97..d7b8c559b795 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -20,21 +20,23 @@ static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; + pkt->ethertype = pkt->skb->protocol; + pkt->nhoff = 0; pkt->thoff = thoff; pkt->fragoff = frag_off; } -static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) +static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, int nhoff) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; - unsigned int thoff = 0; + unsigned int thoff = nhoff; unsigned short frag_off; u32 pkt_len, skb_len; int protohdr; - ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), + ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb) + nhoff, sizeof(*ip6h), &_ip6h); if (!ip6h) return -1; @@ -42,8 +44,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) if (ip6h->version != 6) return -1; - pkt_len = ntohs(ip6h->payload_len); - skb_len = pkt->skb->len - skb_network_offset(pkt->skb); + pkt_len = ipv6_payload_len(pkt->skb, ip6h); + skb_len = pkt->skb->len - skb_network_offset(pkt->skb) - nhoff; if (pkt_len + sizeof(*ip6h) > skb_len) return -1; @@ -53,6 +55,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; + pkt->ethertype = pkt->skb->protocol; + pkt->nhoff = nhoff; pkt->thoff = thoff; pkt->fragoff = frag_off; @@ -64,7 +68,7 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { - if (__nft_set_pktinfo_ipv6_validate(pkt) < 0) + if (__nft_set_pktinfo_ipv6_validate(pkt, 0) < 0) nft_set_pktinfo_unspec(pkt); } @@ -86,7 +90,7 @@ static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) if (ip6h->version != 6) goto inhdr_error; - pkt_len = ntohs(ip6h->payload_len); + pkt_len = ipv6_payload_len(pkt->skb, ip6h); if (pkt_len + sizeof(*ip6h) > pkt->skb->len) { idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INTRUNCATEDPKTS); @@ -99,6 +103,8 @@ static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; + pkt->ethertype = pkt->skb->protocol; + pkt->nhoff = 0; pkt->thoff = thoff; pkt->fragoff = frag_off; diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 3568b6a2f5f0..14c427891ee6 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -67,6 +67,16 @@ struct nft_flow_rule { struct flow_rule *rule; }; +static inline struct flow_action_entry * +nft_flow_action_entry_next(struct nft_offload_ctx *ctx, + struct nft_flow_rule *flow) +{ + if (unlikely(ctx->num_actions >= flow->rule->action.num_entries)) + return NULL; + + return &flow->rule->action.entries[ctx->num_actions++]; +} + void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type); diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 38cae7113de4..e0422456f27b 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -2,6 +2,7 @@ #ifndef _NFT_FIB_H_ #define _NFT_FIB_H_ +#include <net/l3mdev.h> #include <net/netfilter/nf_tables.h> struct nft_fib { @@ -18,6 +19,35 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in) return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } +static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt) +{ + const struct net_device *indev = nft_in(pkt); + const struct sock *sk; + + switch (nft_hook(pkt)) { + case NF_INET_PRE_ROUTING: + case NF_INET_INGRESS: + case NF_INET_LOCAL_IN: + break; + default: + return false; + } + + sk = pkt->skb->sk; + if (sk && sk_fullsock(sk)) + return sk->sk_rx_dst_ifindex == indev->ifindex; + + return nft_fib_is_loopback(pkt->skb, indev); +} + +static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt, + const struct net_device *iif) +{ + const struct net_device *dev = iif ? iif : pkt->skb->dev; + + return l3mdev_master_ifindex_rcu(dev); +} + int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); @@ -36,6 +66,4 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct net_device *dev); -bool nft_fib_reduce(struct nft_regs_track *track, - const struct nft_expr *expr); #endif diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index d602263590fe..f74e63290603 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -43,9 +43,6 @@ void nft_meta_set_destroy(const struct nft_ctx *ctx, int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr); -bool nft_meta_get_reduce(struct nft_regs_track *track, - const struct nft_expr *expr); - struct nft_inner_tun_ctx; void nft_meta_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt, diff --git a/include/net/netlink.h b/include/net/netlink.h index e015ffbed819..546d10586576 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -68,6 +68,8 @@ * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes + * nlmsg_for_each_attr_type() loop over all attributes with the + * given type * * Misc: * nlmsg_report() report back to application? @@ -118,6 +120,7 @@ * nla_nest_start(skb, type) start a nested attribute * nla_nest_end(skb, nla) finalize a nested attribute * nla_nest_cancel(skb, nla) cancel nested attribute construction + * nla_put_empty_nest(skb, type) create an empty nest * * Attribute Length Calculations: * nla_attr_size(payload) length of attribute w/o padding @@ -320,7 +323,13 @@ enum nla_policy_validation { * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: + * NLA_U8, NLA_U16, + * NLA_U32, NLA_U64, + * NLA_S8, NLA_S16, + * NLA_S32, NLA_S64, + * NLA_MSECS, * NLA_BINARY Validation function called for the attribute. + * * All other Unused - but note that it's a union * * Example: @@ -611,6 +620,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh) } /** + * nlmsg_payload - message payload if the data fits in the len + * @nlh: netlink message header + * @len: struct length + * + * Returns: The netlink message payload/data if the length is sufficient, + * otherwise NULL. + */ +static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len) +{ + if (nlh->nlmsg_len < nlmsg_msg_size(len)) + return NULL; + + return nlmsg_data(nlh); +} + +/** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header @@ -944,6 +969,18 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) nlmsg_attrlen(nlh, hdrlen), rem) /** + * nlmsg_for_each_attr_type - iterate over a stream of attributes + * @pos: loop counter, set to the current attribute + * @type: required attribute type for @pos + * @nlh: netlink message header + * @hdrlen: length of the family specific header + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \ + nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ + if (nla_type(pos) == type) + +/** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in * @portid: netlink PORTID of requesting application @@ -2228,6 +2265,25 @@ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) } /** + * nla_nest_end_safe - Validate and finalize nesting of attributes + * @skb: socket buffer the attributes are stored in + * @start: container attribute + * + * Corrects the container attribute header to include all appended + * attributes. + * + * Returns: the total data length of the skb, or -EMSGSIZE if the + * nested attribute length exceeds U16_MAX. + */ +static inline int nla_nest_end_safe(struct sk_buff *skb, struct nlattr *start) +{ + if (skb_tail_pointer(skb) - (unsigned char *)start > U16_MAX) + return -EMSGSIZE; + + return nla_nest_end(skb, start); +} + +/** * nla_nest_cancel - Cancel nesting of attributes * @skb: socket buffer the message is stored in * @start: container attribute @@ -2241,6 +2297,20 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) } /** + * nla_put_empty_nest - Create an empty nest + * @skb: socket buffer the message is stored in + * @attrtype: attribute type of the container + * + * This function is a helper for creating empty nests. + * + * Returns: 0 when successful or -EMSGSIZE on failure. + */ +static inline int nla_put_empty_nest(struct sk_buff *skb, int attrtype) +{ + return nla_nest_start(skb, attrtype) ? 0 : -EMSGSIZE; +} + +/** * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected diff --git a/include/net/netmem.h b/include/net/netmem.h index 1b58faa4f20f..bccacd21b6c3 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -8,9 +8,54 @@ #ifndef _NET_NETMEM_H #define _NET_NETMEM_H +#include <linux/dma-mapping.h> #include <linux/mm.h> #include <net/net_debug.h> +/* These fields in struct page are used by the page_pool and net stack: + * + * struct { + * unsigned long pp_magic; + * struct page_pool *pp; + * unsigned long _pp_mapping_pad; + * unsigned long dma_addr; + * atomic_long_t pp_ref_count; + * }; + * + * We mirror the page_pool fields here so the page_pool can access these + * fields without worrying whether the underlying fields belong to a + * page or netmem_desc. + * + * CAUTION: Do not update the fields in netmem_desc without also + * updating the anonymous aliasing union in struct net_iov. + */ +struct netmem_desc { + unsigned long _flags; + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; + unsigned long dma_addr; + atomic_long_t pp_ref_count; +}; + +#define NETMEM_DESC_ASSERT_OFFSET(pg, desc) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct netmem_desc, desc)) +NETMEM_DESC_ASSERT_OFFSET(flags, _flags); +NETMEM_DESC_ASSERT_OFFSET(pp_magic, pp_magic); +NETMEM_DESC_ASSERT_OFFSET(pp, pp); +NETMEM_DESC_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad); +NETMEM_DESC_ASSERT_OFFSET(dma_addr, dma_addr); +NETMEM_DESC_ASSERT_OFFSET(pp_ref_count, pp_ref_count); +#undef NETMEM_DESC_ASSERT_OFFSET + +/* + * Since struct netmem_desc uses the space in struct page, the size + * should be checked, until struct netmem_desc has its own instance from + * slab, to avoid conflicting with other members within struct page. + */ +static_assert(sizeof(struct netmem_desc) <= offsetof(struct page, _refcount)); + /* net_iov */ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); @@ -20,39 +65,67 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); */ #define NET_IOV 0x01UL -struct net_iov { - unsigned long __unused_padding; - unsigned long pp_magic; - struct page_pool *pp; - struct dmabuf_genpool_chunk_owner *owner; - unsigned long dma_addr; - atomic_long_t pp_ref_count; +enum net_iov_type { + NET_IOV_DMABUF, + NET_IOV_IOURING, }; -/* These fields in struct page are used by the page_pool and net stack: +/* A memory descriptor representing abstract networking I/O vectors, + * generally for non-pages memory that doesn't have its corresponding + * struct page and needs to be explicitly allocated through slab. * - * struct { - * unsigned long pp_magic; - * struct page_pool *pp; - * unsigned long _pp_mapping_pad; - * unsigned long dma_addr; - * atomic_long_t pp_ref_count; - * }; + * net_iovs are allocated and used by networking code, and the size of + * the chunk is PAGE_SIZE. * - * We mirror the page_pool fields here so the page_pool can access these fields - * without worrying whether the underlying fields belong to a page or net_iov. + * This memory can be any form of non-struct paged memory. Examples + * include imported dmabuf memory and imported io_uring memory. See + * net_iov_type for all the supported types. * - * The non-net stack fields of struct page are private to the mm stack and must - * never be mirrored to net_iov. + * @pp_magic: pp field, similar to the one in struct page/struct + * netmem_desc. + * @pp: the pp this net_iov belongs to, if any. + * @dma_addr: the dma addrs of the net_iov. Needed for the network + * card to send/receive this net_iov. + * @pp_ref_count: the pp ref count of this net_iov, exactly the same + * usage as struct page/struct netmem_desc. + * @owner: the net_iov_area this net_iov belongs to, if any. + * @type: the type of the memory. Different types of net_iovs are + * supported. */ -#define NET_IOV_ASSERT_OFFSET(pg, iov) \ - static_assert(offsetof(struct page, pg) == \ - offsetof(struct net_iov, iov)) -NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); -NET_IOV_ASSERT_OFFSET(pp, pp); -NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); -NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); -#undef NET_IOV_ASSERT_OFFSET +struct net_iov { + struct netmem_desc desc; + enum net_iov_type type; + struct net_iov_area *owner; +}; + +struct net_iov_area { + /* Array of net_iovs for this area. */ + struct net_iov *niovs; + size_t num_niovs; + + /* Offset into the dma-buf where this chunk starts. */ + unsigned long base_virtual; +}; + +static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov) +{ + return niov->owner; +} + +static inline unsigned int net_iov_idx(const struct net_iov *niov) +{ + return niov - net_iov_owner(niov)->niovs; +} + +/* Initialize a niov: stamp the owning area, the memory provider type. + */ +static inline void net_iov_init(struct net_iov *niov, + struct net_iov_area *owner, + enum net_iov_type type) +{ + niov->owner = owner; + niov->type = type; +} /* netmem */ @@ -60,8 +133,7 @@ NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); * typedef netmem_ref - a nonexistent type marking a reference to generic * network memory. * - * A netmem_ref currently is always a reference to a struct page. This - * abstraction is introduced so support for new memory types can be added. + * A netmem_ref can be a struct page* or a struct net_iov* underneath. * * Use the supplied helpers to obtain the underlying memory pointer and fields. */ @@ -88,9 +160,6 @@ static inline struct page *__netmem_to_page(netmem_ref netmem) return (__force struct page *)netmem; } -/* This conversion fails (returns NULL) if the netmem_ref is not struct page - * backed. - */ static inline struct page *netmem_to_page(netmem_ref netmem) { if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) @@ -114,10 +183,9 @@ static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) return (__force netmem_ref)((unsigned long)niov | NET_IOV); } -static inline netmem_ref page_to_netmem(struct page *page) -{ - return (__force netmem_ref)page; -} +#define page_to_netmem(p) (_Generic((p), \ + const struct page * : (__force const netmem_ref)(p), \ + struct page * : (__force netmem_ref)(p))) /** * virt_to_netmem - convert virtual memory pointer to a netmem reference @@ -149,9 +217,59 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) return page_to_pfn(netmem_to_page(netmem)); } -static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) +/* XXX: How to extract netmem_desc from page must be changed, once + * netmem_desc no longer overlays on page and will be allocated through + * slab. + */ +#define __pp_page_to_nmdesc(p) (_Generic((p), \ + const struct page * : (const struct netmem_desc *)(p), \ + struct page * : (struct netmem_desc *)(p))) + +/* CAUTION: Check if the page is a pp page before calling this helper or + * know it's a pp page. + */ +#define pp_page_to_nmdesc(p) \ +({ \ + DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ + __pp_page_to_nmdesc(p); \ +}) + +/** + * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing + * @netmem + * @netmem: netmem reference to convert + * + * Unsafe version that can be used only when @netmem is always backed by + * system memory, performs faster and generates smaller object code (no + * check for the LSB, no WARN). When @netmem points to IOV, provokes + * undefined behaviour. + * + * Return: pointer to the &netmem_desc (garbage if @netmem is not backed + * by system memory). + */ +static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem) { - return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); + return (__force struct netmem_desc *)netmem; +} + +/* netmem_to_nmdesc - convert netmem_ref to struct netmem_desc * for + * access to common fields. + * @netmem: netmem reference to get netmem_desc. + * + * All the sub types of netmem_ref (netmem_desc, net_iov) have the same + * pp, pp_magic, dma_addr, and pp_ref_count fields via netmem_desc. + * + * Return: the pointer to struct netmem_desc * regardless of its + * underlying type. + */ +static inline struct netmem_desc *netmem_to_nmdesc(netmem_ref netmem) +{ + void *p = (void *)((__force unsigned long)netmem & ~NET_IOV); + + if (netmem_is_net_iov(netmem)) + return &((struct net_iov *)p)->desc; + + return __pp_page_to_nmdesc((struct page *)p); } /** @@ -167,17 +285,17 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) */ static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) { - return __netmem_to_page(netmem)->pp; + return __netmem_to_nmdesc(netmem)->pp; } static inline struct page_pool *netmem_get_pp(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp; + return netmem_to_nmdesc(netmem)->pp; } static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem) { - return &__netmem_clear_lsb(netmem)->pp_ref_count; + return &netmem_to_nmdesc(netmem)->pp_ref_count; } static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid) @@ -242,7 +360,57 @@ static inline bool netmem_is_pfmemalloc(netmem_ref netmem) static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->dma_addr; + return netmem_to_nmdesc(netmem)->dma_addr; +} + +#if defined(CONFIG_NET_DEVMEM) +static inline bool net_is_devmem_iov(const struct net_iov *niov) +{ + return niov->type == NET_IOV_DMABUF; +} +#else +static inline bool net_is_devmem_iov(const struct net_iov *niov) +{ + return false; +} +#endif + +void __get_netmem(netmem_ref netmem); +void __put_netmem(netmem_ref netmem); + +static __always_inline void get_netmem(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + __get_netmem(netmem); + else + get_page(netmem_to_page(netmem)); +} + +static __always_inline void put_netmem(netmem_ref netmem) +{ + if (netmem_is_net_iov(netmem)) + __put_netmem(netmem); + else + put_page(netmem_to_page(netmem)); +} + +#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \ + do { \ + if (!netmem_is_net_iov(NETMEM)) \ + dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \ + else \ + dma_unmap_addr_set(PTR, ADDR_NAME, 0); \ + } while (0) + +static inline void netmem_dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + if (!addr) + return; + + dma_unmap_page_attrs(dev, addr, size, dir, attrs); } #endif /* _NET_NETMEM_H */ diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index bae914815aa3..ab74b5ed0b01 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -7,9 +7,6 @@ #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/netfilter/nf_conntrack_tcp.h> -#ifdef CONFIG_NF_CT_PROTO_DCCP -#include <linux/netfilter/nf_conntrack_dccp.h> -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP #include <linux/netfilter/nf_conntrack_sctp.h> #endif @@ -50,13 +47,6 @@ struct nf_icmp_net { unsigned int timeout; }; -#ifdef CONFIG_NF_CT_PROTO_DCCP -struct nf_dccp_net { - u8 dccp_loose; - unsigned int dccp_timeout[CT_DCCP_MAX + 1]; -}; -#endif - #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net { unsigned int timeouts[SCTP_CONNTRACK_MAX]; @@ -82,9 +72,6 @@ struct nf_ip_net { struct nf_udp_net udp; struct nf_icmp_net icmp; struct nf_icmp_net icmpv6; -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct nf_dccp_net dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net sctp; #endif diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 9b36f0ff0c20..9ef3d70e5e9c 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -13,9 +13,11 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + int sysctl_txq_reselection; int sysctl_optmem_max; u8 sysctl_txrehash; u8 sysctl_tstamp_allow_data; + u8 sysctl_bypass_prot_mem; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 46452da35206..6e27c56514df 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed { }; #endif +struct udp_tunnel_gro { + struct sock __rcu *sk; + struct hlist_head list; +}; + struct netns_ipv4 { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. @@ -69,22 +74,35 @@ struct netns_ipv4 { /* TXRX readonly hotpath cache lines */ __cacheline_group_begin(netns_ipv4_read_txrx); - u8 sysctl_tcp_moderate_rcvbuf; + u8 sysctl_tcp_shrink_window; __cacheline_group_end(netns_ipv4_read_txrx); /* RX readonly hotpath cache line */ __cacheline_group_begin(netns_ipv4_read_rx); + u8 sysctl_tcp_moderate_rcvbuf; u8 sysctl_ip_early_demux; u8 sysctl_tcp_early_demux; u8 sysctl_tcp_l3mdev_accept; /* 3 bytes hole, try to pack */ int sysctl_tcp_reordering; int sysctl_tcp_rmem[3]; + int sysctl_tcp_rcvbuf_low_rtt; __cacheline_group_end(netns_ipv4_read_rx); + /* ICMP rate limiter hot cache line. */ + __cacheline_group_begin_aligned(icmp); + atomic_t icmp_global_credit; + u32 icmp_global_stamp; + __cacheline_group_end_aligned(icmp); + struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + /* Not in a pernet subsys because need to be available at GRO stage */ + struct udp_tunnel_gro udp_tunnel_gro[2]; +#endif + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -105,12 +123,14 @@ struct netns_ipv4 { #endif bool fib_has_custom_local_routes; bool fib_offload_disabled; - u8 sysctl_tcp_shrink_window; #ifdef CONFIG_IP_ROUTE_CLASSID atomic_t fib_num_tclassid_users; #endif struct hlist_head *fib_table_hash; struct sock *fibnl; + struct hlist_head *fib_info_hash; + unsigned int fib_info_hash_bits; + unsigned int fib_info_cnt; struct sock *mc_autojoin_sk; @@ -122,12 +142,12 @@ struct netns_ipv4 { u8 sysctl_icmp_echo_ignore_broadcasts; u8 sysctl_icmp_ignore_bogus_error_responses; u8 sysctl_icmp_errors_use_inbound_ifaddr; + u8 sysctl_icmp_errors_extension_mask; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; int sysctl_icmp_msgs_per_sec; int sysctl_icmp_msgs_burst; - atomic_t icmp_global_credit; - u32 icmp_global_stamp; + u32 ip_rt_min_pmtu; int ip_rt_mtu_expires; int ip_rt_min_advmss; @@ -135,6 +155,8 @@ struct netns_ipv4 { struct local_ports ip_local_ports; u8 sysctl_tcp_ecn; + u8 sysctl_tcp_ecn_option; + u8 sysctl_tcp_ecn_option_beacon; u8 sysctl_tcp_ecn_fallback; u8 sysctl_ip_default_ttl; @@ -144,6 +166,7 @@ struct netns_ipv4 { u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? */ u8 sysctl_ip_dynaddr; + u32 sysctl_ip_local_port_step_width; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_raw_l3mdev_accept; #endif @@ -181,6 +204,7 @@ struct netns_ipv4 { u8 sysctl_tcp_window_scaling; u8 sysctl_tcp_timestamps; int sysctl_tcp_rto_min_us; + int sysctl_tcp_rto_max_ms; u8 sysctl_tcp_recovery; u8 sysctl_tcp_thin_linear_timeouts; u8 sysctl_tcp_slow_start_after_idle; @@ -204,6 +228,7 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; unsigned int sysctl_tcp_child_ehash_entries; + int sysctl_tcp_comp_sack_rtt_percent; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; int sysctl_max_syn_backlog; @@ -237,6 +262,7 @@ struct netns_ipv4 { int sysctl_igmp_qrv; struct ping_group_range ping_group_range; + u16 ping_port_rover; atomic_t dev_addr_genid; @@ -249,11 +275,14 @@ struct netns_ipv4 { #ifdef CONFIG_IP_MROUTE #ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES - struct mr_table *mrt; + struct mr_table __rcu *mrt; #else struct list_head mr_tables; struct fib_rules_ops *mr_rules_ops; #endif + struct fib_notifier_ops *ipmr_notifier_ops; + atomic_t ipmr_seq; + struct mutex mfc_mutex; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; @@ -265,12 +294,10 @@ struct netns_ipv4 { struct fib_notifier_ops *notifier_ops; unsigned int fib_seq; /* writes protected by rtnl_mutex */ - struct fib_notifier_ops *ipmr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ - atomic_t rt_genid; siphash_key_t ip_id_key; struct hlist_head *inet_addr_lst; struct delayed_work addr_chk_work; }; + #endif diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 5f2cfd84570a..875916d60bfe 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -30,19 +30,23 @@ struct netns_sysctl_ipv6 { int ip6_rt_min_advmss; u32 multipath_hash_fields; u8 multipath_hash_policy; - u8 bindv6only; + + __cacheline_group_begin(sysctl_ipv6_flowlabel); u8 flowlabel_consistency; u8 auto_flowlabels; - int icmpv6_time; + u8 flowlabel_state_ranges; + __cacheline_group_end(sysctl_ipv6_flowlabel); + u8 icmpv6_echo_ignore_all; u8 icmpv6_echo_ignore_multicast; u8 icmpv6_echo_ignore_anycast; + int icmpv6_time; DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1); unsigned long *icmpv6_ratemask_ptr; u8 anycast_src_echo_reply; + u8 bindv6only; u8 ip_nonlocal_bind; u8 fwmark_reflect; - u8 flowlabel_state_ranges; int idgen_retries; int idgen_delay; int flowlabel_reflect; @@ -56,6 +60,7 @@ struct netns_sysctl_ipv6 { u8 skip_notify_on_dev_down; u8 fib_notify_on_flag_change; u8 icmpv6_error_anycast_as_unicast; + u8 icmpv6_errors_extension_mask; }; struct netns_ipv6 { @@ -72,6 +77,7 @@ struct netns_ipv6 { struct rt6_statistics *rt6_stats; struct timer_list ip6_fib_timer; struct hlist_head *fib_table_hash; + spinlock_t fib_table_hash_lock; struct fib6_table *fib6_main_tbl; struct list_head fib6_walkers; rwlock_t fib6_walker_lock; @@ -112,7 +118,8 @@ struct netns_ipv6 { struct seg6_pernet_data *seg6_data; struct fib_notifier_ops *notifier_ops; struct fib_notifier_ops *ip6mr_notifier_ops; - unsigned int ipmr_seq; /* protected by rtnl_mutex */ + atomic_t ipmr_seq; + int flowlabel_count; struct { struct hlist_head head; spinlock_t lock; diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h index 1db8f9aaddb4..89555f90b97b 100644 --- a/include/net/netns/mctp.h +++ b/include/net/netns/mctp.h @@ -6,19 +6,25 @@ #ifndef __NETNS_MCTP_H__ #define __NETNS_MCTP_H__ +#include <linux/hash.h> +#include <linux/hashtable.h> #include <linux/mutex.h> #include <linux/types.h> +#define MCTP_BINDS_BITS 7 + struct netns_mctp { /* Only updated under RTNL, entries freed via RCU */ struct list_head routes; - /* Bound sockets: list of sockets bound by type. - * This list is updated from non-atomic contexts (under bind_lock), - * and read (under rcu) in packet rx + /* Bound sockets: hash table of sockets, keyed by + * (type, src_eid, dest_eid). + * Specific src_eid/dest_eid entries also have an entry for + * MCTP_ADDR_ANY. This list is updated from non-atomic contexts + * (under bind_lock), and read (under rcu) in packet rx. */ struct mutex bind_lock; - struct hlist_head binds; + DECLARE_HASHTABLE(binds, MCTP_BINDS_BITS); /* tag allocations. This list is read and updated from atomic contexts, * but elements are free()ed after a RCU grace-period @@ -34,4 +40,10 @@ struct netns_mctp { struct list_head neighbours; }; +static inline u32 mctp_bind_hash(u8 type, u8 local_addr, u8 peer_addr) +{ + return hash_32(type | (u32)local_addr << 8 | (u32)peer_addr << 16, + MCTP_BINDS_BITS); +} + #endif /* __NETNS_MCTP_H__ */ diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h index 7e373664b1e7..dce05f8e6a33 100644 --- a/include/net/netns/mib.h +++ b/include/net/netns/mib.h @@ -28,11 +28,6 @@ struct netns_mib { DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics); #endif - DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics); -#if IS_ENABLED(CONFIG_IPV6) - DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6); -#endif - DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib, icmpmsg_statistics); #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h index 19ad2574b267..2073cbac2afb 100644 --- a/include/net/netns/mpls.h +++ b/include/net/netns/mpls.h @@ -16,6 +16,8 @@ struct netns_mpls { int default_ttl; size_t platform_labels; struct mpls_route __rcu * __rcu *platform_label; + struct mutex platform_mutex; + seqcount_mutex_t platform_label_seq; struct ctl_table_header *ctl; }; diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index cc8060c017d5..99dd166c5d07 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -3,6 +3,7 @@ #define _NETNS_NFTABLES_H_ struct netns_nftables { + unsigned int base_seq; u8 gencursor; }; diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index d25cd7a9c5ff..c0f97f36389e 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -75,8 +75,8 @@ struct netns_sctp { /* Whether Cookie Preservative is enabled(1) or not(0) */ int cookie_preserve_enable; - /* The namespace default hmac alg */ - char *sctp_hmac_alg; + /* Whether cookie authentication is enabled(1) or not(0) */ + int cookie_auth_enable; /* Valid.Cookie.Life - 60 seconds */ unsigned int valid_cookie_life; diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index fc752a50f91b..ed24c9f638ee 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -17,6 +17,9 @@ struct netns_smc { #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + struct smc_hs_ctrl __rcu *hs_ctrl; +#endif /* CONFIG_SMC_HS_CTRL_BPF */ unsigned int sysctl_autocorking_size; unsigned int sysctl_smcr_buf_type; int sysctl_smcr_testlink_time; @@ -24,5 +27,7 @@ struct netns_smc { int sysctl_rmem; int sysctl_max_links_per_lgr; int sysctl_max_conns_per_lgr; + unsigned int sysctl_smcr_max_send_wr; + unsigned int sysctl_smcr_max_recv_wr; }; #endif diff --git a/include/net/netns/vsock.h b/include/net/netns/vsock.h new file mode 100644 index 000000000000..7f84aad92f57 --- /dev/null +++ b/include/net/netns/vsock.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_NET_NAMESPACE_VSOCK_H +#define __NET_NET_NAMESPACE_VSOCK_H + +#include <linux/types.h> + +enum vsock_net_mode { + VSOCK_NET_MODE_GLOBAL, + VSOCK_NET_MODE_LOCAL, +}; + +struct netns_vsock { + struct ctl_table_header *sysctl_hdr; + + /* protected by the vsock_table_lock in af_vsock.c */ + u32 port; + + enum vsock_net_mode mode; + enum vsock_net_mode child_ns_mode; + + /* 0 = unlocked, 1 = locked to global, 2 = locked to local */ + int child_ns_mode_locked; + + int g2h_fallback; +}; +#endif /* __NET_NET_NAMESPACE_VSOCK_H */ diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 23dd647fe024..b73983a17e08 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -59,7 +59,7 @@ struct netns_xfrm { struct list_head inexact_bins; - struct sock *nlsk; + struct sock __rcu *nlsk; struct sock *nlsk_stash; u32 sysctl_aevent_etime; diff --git a/include/net/netrom.h b/include/net/netrom.h deleted file mode 100644 index f0565a5987d1..000000000000 --- a/include/net/netrom.h +++ /dev/null @@ -1,273 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Declarations of NET/ROM type objects. - * - * Jonathan Naylor G4KLX 9/4/95 - */ - -#ifndef _NETROM_H -#define _NETROM_H - -#include <linux/netrom.h> -#include <linux/list.h> -#include <linux/slab.h> -#include <net/sock.h> -#include <linux/refcount.h> -#include <linux/seq_file.h> -#include <net/ax25.h> - -#define NR_NETWORK_LEN 15 -#define NR_TRANSPORT_LEN 5 - -#define NR_PROTO_IP 0x0C - -#define NR_PROTOEXT 0x00 -#define NR_CONNREQ 0x01 -#define NR_CONNACK 0x02 -#define NR_DISCREQ 0x03 -#define NR_DISCACK 0x04 -#define NR_INFO 0x05 -#define NR_INFOACK 0x06 -#define NR_RESET 0x07 - -#define NR_CHOKE_FLAG 0x80 -#define NR_NAK_FLAG 0x40 -#define NR_MORE_FLAG 0x20 - -/* Define Link State constants. */ -enum { - NR_STATE_0, - NR_STATE_1, - NR_STATE_2, - NR_STATE_3 -}; - -#define NR_COND_ACK_PENDING 0x01 -#define NR_COND_REJECT 0x02 -#define NR_COND_PEER_RX_BUSY 0x04 -#define NR_COND_OWN_RX_BUSY 0x08 - -#define NR_DEFAULT_T1 120000 /* Outstanding frames - 120 seconds */ -#define NR_DEFAULT_T2 5000 /* Response delay - 5 seconds */ -#define NR_DEFAULT_N2 3 /* Number of Retries - 3 */ -#define NR_DEFAULT_T4 180000 /* Busy Delay - 180 seconds */ -#define NR_DEFAULT_IDLE 0 /* No Activity Timeout - none */ -#define NR_DEFAULT_WINDOW 4 /* Default Window Size - 4 */ -#define NR_DEFAULT_OBS 6 /* Default Obsolescence Count - 6 */ -#define NR_DEFAULT_QUAL 10 /* Default Neighbour Quality - 10 */ -#define NR_DEFAULT_TTL 16 /* Default Time To Live - 16 */ -#define NR_DEFAULT_ROUTING 1 /* Is routing enabled ? */ -#define NR_DEFAULT_FAILS 2 /* Link fails until route fails */ -#define NR_DEFAULT_RESET 0 /* Sent / accept reset cmds? */ - -#define NR_MODULUS 256 -#define NR_MAX_WINDOW_SIZE 127 /* Maximum Window Allowable - 127 */ -#define NR_MAX_PACKET_SIZE 236 /* Maximum Packet Length - 236 */ - -struct nr_sock { - struct sock sock; - ax25_address user_addr, source_addr, dest_addr; - struct net_device *device; - unsigned char my_index, my_id; - unsigned char your_index, your_id; - unsigned char state, condition, bpqext, window; - unsigned short vs, vr, va, vl; - unsigned char n2, n2count; - unsigned long t1, t2, t4, idle; - unsigned short fraglen; - struct timer_list t1timer; - struct timer_list t2timer; - struct timer_list t4timer; - struct timer_list idletimer; - struct sk_buff_head ack_queue; - struct sk_buff_head reseq_queue; - struct sk_buff_head frag_queue; -}; - -#define nr_sk(sk) ((struct nr_sock *)(sk)) - -struct nr_neigh { - struct hlist_node neigh_node; - ax25_address callsign; - ax25_digi *digipeat; - ax25_cb *ax25; - struct net_device *dev; - unsigned char quality; - unsigned char locked; - unsigned short count; - unsigned int number; - unsigned char failed; - refcount_t refcount; -}; - -struct nr_route { - unsigned char quality; - unsigned char obs_count; - struct nr_neigh *neighbour; -}; - -struct nr_node { - struct hlist_node node_node; - ax25_address callsign; - char mnemonic[7]; - unsigned char which; - unsigned char count; - struct nr_route routes[3]; - refcount_t refcount; - spinlock_t node_lock; -}; - -/********************************************************************* - * nr_node & nr_neigh lists, refcounting and locking - *********************************************************************/ - -#define nr_node_hold(__nr_node) \ - refcount_inc(&((__nr_node)->refcount)) - -static __inline__ void nr_node_put(struct nr_node *nr_node) -{ - if (refcount_dec_and_test(&nr_node->refcount)) { - kfree(nr_node); - } -} - -#define nr_neigh_hold(__nr_neigh) \ - refcount_inc(&((__nr_neigh)->refcount)) - -static __inline__ void nr_neigh_put(struct nr_neigh *nr_neigh) -{ - if (refcount_dec_and_test(&nr_neigh->refcount)) { - if (nr_neigh->ax25) - ax25_cb_put(nr_neigh->ax25); - kfree(nr_neigh->digipeat); - kfree(nr_neigh); - } -} - -/* nr_node_lock and nr_node_unlock also hold/put the node's refcounter. - */ -static __inline__ void nr_node_lock(struct nr_node *nr_node) -{ - nr_node_hold(nr_node); - spin_lock_bh(&nr_node->node_lock); -} - -static __inline__ void nr_node_unlock(struct nr_node *nr_node) -{ - spin_unlock_bh(&nr_node->node_lock); - nr_node_put(nr_node); -} - -#define nr_neigh_for_each(__nr_neigh, list) \ - hlist_for_each_entry(__nr_neigh, list, neigh_node) - -#define nr_neigh_for_each_safe(__nr_neigh, node2, list) \ - hlist_for_each_entry_safe(__nr_neigh, node2, list, neigh_node) - -#define nr_node_for_each(__nr_node, list) \ - hlist_for_each_entry(__nr_node, list, node_node) - -#define nr_node_for_each_safe(__nr_node, node2, list) \ - hlist_for_each_entry_safe(__nr_node, node2, list, node_node) - - -/*********************************************************************/ - -/* af_netrom.c */ -extern int sysctl_netrom_default_path_quality; -extern int sysctl_netrom_obsolescence_count_initialiser; -extern int sysctl_netrom_network_ttl_initialiser; -extern int sysctl_netrom_transport_timeout; -extern int sysctl_netrom_transport_maximum_tries; -extern int sysctl_netrom_transport_acknowledge_delay; -extern int sysctl_netrom_transport_busy_delay; -extern int sysctl_netrom_transport_requested_window_size; -extern int sysctl_netrom_transport_no_activity_timeout; -extern int sysctl_netrom_routing_control; -extern int sysctl_netrom_link_fails_count; -extern int sysctl_netrom_reset_circuit; - -int nr_rx_frame(struct sk_buff *, struct net_device *); -void nr_destroy_socket(struct sock *); - -/* nr_dev.c */ -int nr_rx_ip(struct sk_buff *, struct net_device *); -void nr_setup(struct net_device *); - -/* nr_in.c */ -int nr_process_rx_frame(struct sock *, struct sk_buff *); - -/* nr_loopback.c */ -void nr_loopback_init(void); -void nr_loopback_clear(void); -int nr_loopback_queue(struct sk_buff *); - -/* nr_out.c */ -void nr_output(struct sock *, struct sk_buff *); -void nr_send_nak_frame(struct sock *); -void nr_kick(struct sock *); -void nr_transmit_buffer(struct sock *, struct sk_buff *); -void nr_establish_data_link(struct sock *); -void nr_enquiry_response(struct sock *); -void nr_check_iframes_acked(struct sock *, unsigned short); - -/* nr_route.c */ -void nr_rt_device_down(struct net_device *); -struct net_device *nr_dev_first(void); -struct net_device *nr_dev_get(ax25_address *); -int nr_rt_ioctl(unsigned int, void __user *); -void nr_link_failed(ax25_cb *, int); -int nr_route_frame(struct sk_buff *, ax25_cb *); -extern const struct seq_operations nr_node_seqops; -extern const struct seq_operations nr_neigh_seqops; -void nr_rt_free(void); - -/* nr_subr.c */ -void nr_clear_queues(struct sock *); -void nr_frames_acked(struct sock *, unsigned short); -void nr_requeue_frames(struct sock *); -int nr_validate_nr(struct sock *, unsigned short); -int nr_in_rx_window(struct sock *, unsigned short); -void nr_write_internal(struct sock *, int); - -void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags); - -/* - * This routine is called when a Connect Acknowledge with the Choke Flag - * set is needed to refuse a connection. - */ -#define nr_transmit_refusal(skb, mine) \ -do { \ - __nr_transmit_reply((skb), (mine), NR_CONNACK | NR_CHOKE_FLAG); \ -} while (0) - -/* - * This routine is called when we don't have a circuit matching an incoming - * NET/ROM packet. This is an G8PZT Xrouter extension. - */ -#define nr_transmit_reset(skb, mine) \ -do { \ - __nr_transmit_reply((skb), (mine), NR_RESET); \ -} while (0) - -void nr_disconnect(struct sock *, int); - -/* nr_timer.c */ -void nr_init_timers(struct sock *sk); -void nr_start_heartbeat(struct sock *); -void nr_start_t1timer(struct sock *); -void nr_start_t2timer(struct sock *); -void nr_start_t4timer(struct sock *); -void nr_start_idletimer(struct sock *); -void nr_stop_heartbeat(struct sock *); -void nr_stop_t1timer(struct sock *); -void nr_stop_t2timer(struct sock *); -void nr_stop_t4timer(struct sock *); -void nr_stop_idletimer(struct sock *); -int nr_t1timer_running(struct sock *); - -/* sysctl_net_netrom.c */ -int nr_register_sysctl(void); -void nr_unregister_sysctl(void); - -#endif diff --git a/include/net/nexthop.h b/include/net/nexthop.h index d9fb44e8b321..572e69cda476 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -152,6 +152,8 @@ struct nexthop { u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; + bool dead; + spinlock_t lock; /* protect dead and f6i_list */ refcount_t refcnt; struct rcu_head rcu; diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index e180bdf2f82b..664d5058e66e 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -52,7 +52,7 @@ enum nci_state { #define NCI_RF_DISC_SELECT_TIMEOUT 5000 #define NCI_RF_DEACTIVATE_TIMEOUT 30000 #define NCI_CMD_TIMEOUT 5000 -#define NCI_DATA_TIMEOUT 700 +#define NCI_DATA_TIMEOUT 3000 struct nci_dev; diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 127e6c7d910d..c54df042db6b 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -219,6 +219,8 @@ static inline void nfc_free_device(struct nfc_dev *dev) int nfc_register_device(struct nfc_dev *dev); +void nfc_unregister_rfkill(struct nfc_dev *dev); +void nfc_remove_device(struct nfc_dev *dev); void nfc_unregister_device(struct nfc_dev *dev); /** diff --git a/include/net/nl802154.h b/include/net/nl802154.h index a994dea74596..442822746e92 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -191,14 +191,12 @@ enum nl802154_iftype { * @NL802154_CAP_ATTR_CHANNELS: a nested attribute for nl802154_channel_attr * @NL802154_CAP_ATTR_TX_POWERS: a nested attribute for * nl802154_wpan_phy_tx_power - * @NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL: minimum value for cca_ed_level - * @NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL: maximum value for cca_ed_level * @NL802154_CAP_ATTR_CCA_MODES: nl802154_cca_modes flags * @NL802154_CAP_ATTR_CCA_OPTS: nl802154_cca_opts flags * @NL802154_CAP_ATTR_MIN_MINBE: minimum of minbe value * @NL802154_CAP_ATTR_MAX_MINBE: maximum of minbe value * @NL802154_CAP_ATTR_MIN_MAXBE: minimum of maxbe value - * @NL802154_CAP_ATTR_MAX_MINBE: maximum of maxbe value + * @NL802154_CAP_ATTR_MAX_MAXBE: maximum of maxbe value * @NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS: minimum of csma backoff value * @NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS: maximum of csma backoffs value * @NL802154_CAP_ATTR_MIN_FRAME_RETRIES: minimum of frame retries value @@ -364,6 +362,7 @@ enum nl802154_cca_opts { NL802154_CCA_OPT_ENERGY_CARRIER_AND, NL802154_CCA_OPT_ENERGY_CARRIER_OR, + /* private: */ /* keep last */ __NL802154_CCA_OPT_ATTR_AFTER_LAST, NL802154_CCA_OPT_ATTR_MAX = __NL802154_CCA_OPT_ATTR_AFTER_LAST - 1 diff --git a/include/net/nsh.h b/include/net/nsh.h index 16a751093896..15a26c590815 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -247,10 +247,10 @@ struct nshhdr { #define NSH_M_TYPE1_LEN 24 /* NSH header maximum Length. */ -#define NSH_HDR_MAX_LEN 256 +#define NSH_HDR_MAX_LEN ((NSH_LEN_MASK >> NSH_LEN_SHIFT) * 4) /* NSH context headers maximum Length. */ -#define NSH_CTX_HDRS_MAX_LEN 248 +#define NSH_CTX_HDRS_MAX_LEN (NSH_HDR_MAX_LEN - NSH_BASE_HDR_LEN) static inline struct nshhdr *nsh_hdr(struct sk_buff *skb) { diff --git a/include/net/p8022.h b/include/net/p8022.h deleted file mode 100644 index a29e224ac498..000000000000 --- a/include/net/p8022.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_P8022_H -#define _NET_P8022_H - -struct net_device; -struct packet_type; -struct sk_buff; - -struct datalink_proto * -register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)); -void unregister_8022_client(struct datalink_proto *proto); -#endif diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 582a3d00cbe2..3247026e096a 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -153,6 +153,13 @@ static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, return page_pool_alloc_netmem(pool, offset, size, gfp); } +static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool) +{ + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + + return page_pool_alloc_netmems(pool, gfp); +} + static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) @@ -395,6 +402,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, page_pool_put_full_page(pool, page, true); } +static inline void page_pool_recycle_direct_netmem(struct page_pool *pool, + netmem_ref netmem) +{ + page_pool_put_full_netmem(pool, netmem, true); +} + #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) @@ -431,12 +444,7 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { - dma_addr_t ret = page->dma_addr; - - if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) - ret <<= PAGE_SHIFT; - - return ret; + return page_pool_get_dma_addr_netmem(page_to_netmem(page)); } static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, @@ -481,6 +489,11 @@ page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool, offset, dma_sync_size); } +static inline void page_pool_get(struct page_pool *pool) +{ + refcount_inc(&pool->user_cnt); +} + static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); @@ -492,4 +505,21 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) page_pool_update_nid(pool, new_nid); } +/** + * page_pool_is_unreadable() - will allocated buffers be unreadable for the CPU + * @pool: queried page pool + * + * Check if page pool will return buffers which are unreadable to the CPU / + * kernel. This will only be the case if user space bound a memory provider (mp) + * which returns unreadable memory to the queue served by the page pool. + * If %PP_FLAG_ALLOW_UNREADABLE_NETMEM was set but there is no mp bound + * this helper will return false. See also netif_rxq_has_unreadable_mp(). + * + * Return: true if memory allocated by the page pool may be unreadable + */ +static inline bool page_pool_is_unreadable(struct page_pool *pool) +{ + return !!pool->mp_ops; +} + #endif /* _NET_PAGE_POOL_HELPERS_H */ diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h new file mode 100644 index 000000000000..255ce4cfd975 --- /dev/null +++ b/include/net/page_pool/memory_provider.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H +#define _NET_PAGE_POOL_MEMORY_PROVIDER_H + +#include <net/netmem.h> +#include <net/page_pool/types.h> + +struct netdev_rx_queue; +struct netlink_ext_ack; +struct sk_buff; + +struct memory_provider_ops { + netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp); + bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem); + int (*init)(struct page_pool *pool); + void (*destroy)(struct page_pool *pool); + int (*nl_fill)(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq); + void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq); +}; + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr); +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); +void net_mp_niov_clear_page_pool(struct net_iov *niov); + +int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack); +void netif_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *old_p); + +/** + * net_mp_netmem_place_in_cache() - give a netmem to a page pool + * @pool: the page pool to place the netmem into + * @netmem: netmem to give + * + * Push an accounted netmem into the page pool's allocation cache. The caller + * must ensure that there is space in the cache. It should only be called off + * the mp_ops->alloc_netmems() path. + */ +static inline void net_mp_netmem_place_in_cache(struct page_pool *pool, + netmem_ref netmem) +{ + pool->alloc.cache[pool->alloc.count++] = netmem; +} + +#endif diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 7f405672b089..03da138722f5 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -6,6 +6,7 @@ #include <linux/dma-direction.h> #include <linux/ptr_ring.h> #include <linux/types.h> +#include <linux/xarray.h> #include <net/netmem.h> #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA @@ -33,6 +34,9 @@ #define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \ PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM) +/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */ +#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1) + /* * Fast allocation side cache array/stack * @@ -40,6 +44,8 @@ * use-case. The NAPI budget is 64 packets. After a NAPI poll the RX * ring is usually refilled and the max consumed elements will be 64, * thus a natural max size of objects needed in the cache. + * The refill watermark is set to 64 for 4KB pages, + * and scales to balance its size in bytes across page sizes. * * Keeping room for more objects, is due to XDP_DROP use-case. As * XDP_DROP allows the opportunity to recycle objects directly into @@ -47,8 +53,15 @@ * cache is already full (or partly full) then the XDP_DROP recycles * would have to take a slower code path. */ -#define PP_ALLOC_CACHE_SIZE 128 +#if PAGE_SIZE >= SZ_64K +#define PP_ALLOC_CACHE_REFILL 4 +#elif PAGE_SIZE >= SZ_16K +#define PP_ALLOC_CACHE_REFILL 16 +#else #define PP_ALLOC_CACHE_REFILL 64 +#endif + +#define PP_ALLOC_CACHE_SIZE (PP_ALLOC_CACHE_REFILL * 2) struct pp_alloc_cache { u32 count; netmem_ref cache[PP_ALLOC_CACHE_SIZE]; @@ -152,8 +165,12 @@ struct page_pool_stats { */ #define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long)) +struct memory_provider_ops; + struct pp_memory_provider_params { void *mp_priv; + const struct memory_provider_ops *mp_ops; + u32 rx_page_size; }; struct page_pool { @@ -216,6 +233,9 @@ struct page_pool { struct ptr_ring ring; void *mp_priv; + const struct memory_provider_ops *mp_ops; + + struct xarray dma_mapped; #ifdef CONFIG_PAGE_POOL_STATS /* recycle stats are per-cpu to avoid locking */ @@ -236,7 +256,7 @@ struct page_pool { /* User-facing fields, protected by page_pools_lock */ struct { struct hlist_node list; - u64 detach_time; + ktime_t detach_time; u32 id; } user; }; @@ -255,6 +275,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params, struct xdp_mem_info; #ifdef CONFIG_PAGE_POOL +void page_pool_enable_direct_recycling(struct page_pool *pool, + struct napi_struct *napi); void page_pool_disable_direct_recycling(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), diff --git a/include/net/pfcp.h b/include/net/pfcp.h index af14f970b80e..639553797d3e 100644 --- a/include/net/pfcp.h +++ b/include/net/pfcp.h @@ -45,7 +45,7 @@ struct pfcphdr_session { reserved:4; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved:4, - message_priprity:4; + message_priority:4; #else #error "Please fix <asm/byteorder>" #endif diff --git a/include/net/phy/realtek_phy.h b/include/net/phy/realtek_phy.h new file mode 100644 index 000000000000..d683bc1b0659 --- /dev/null +++ b/include/net/phy/realtek_phy.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _REALTEK_PHY_H +#define _REALTEK_PHY_H + +#define PHY_ID_RTL_DUMMY_SFP 0x001ccbff + +#endif /* _REALTEK_PHY_H */ diff --git a/include/net/pie.h b/include/net/pie.h index 01cbc66825a4..1f3db0c35514 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -104,7 +104,7 @@ static inline void pie_vars_init(struct pie_vars *vars) vars->dq_tstamp = DTIME_INVALID; vars->accu_prob = 0; vars->dq_count = DQCOUNT_INVALID; - vars->avg_dq_rate = 0; + WRITE_ONCE(vars->avg_dq_rate, 0); } static inline struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb) diff --git a/include/net/ping.h b/include/net/ping.h index bc7779262e60..bcbdb5a136e3 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -20,8 +20,7 @@ /* Compatibility glue so we can support IPv6 when it's compiled as a module */ struct pingv6_ops { - int (*ipv6_recv_error)(struct sock *sk, struct msghdr *msg, int len, - int *addr_len); + int (*ipv6_recv_error)(struct sock *sk, struct msghdr *msg, int len); void (*ip6_datagram_recv_common_ctl)(struct sock *sk, struct msghdr *msg, struct sk_buff *skb); @@ -54,18 +53,17 @@ struct pingfakehdr { }; int ping_get_port(struct sock *sk, unsigned short ident); -int ping_hash(struct sock *sk); void ping_unhash(struct sock *sk); int ping_init_sock(struct sock *sk); void ping_close(struct sock *sk, long timeout); -int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); void ping_err(struct sk_buff *skb, int offset, u32 info); int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *); int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len); + int flags); int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, void *user_icmph, size_t icmph_len); int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c64fd896b1f9..99ac747b7906 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -536,6 +536,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) case TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: + if (!skb_transport_header_was_set(skb)) + break; return skb_transport_header(skb); } diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index d7b7b6cd4aa1..18a419cd9d94 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -25,11 +25,6 @@ struct qdisc_walker { const struct Qdisc * : (const void *)&q->privdata, \ struct Qdisc * : (void *)&q->privdata) -static inline struct Qdisc *qdisc_from_priv(void *priv) -{ - return container_of(priv, struct Qdisc, privdata); -} - /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth @@ -48,7 +43,6 @@ static inline struct Qdisc *qdisc_from_priv(void *priv) */ typedef u64 psched_time_t; -typedef long psched_tdiff_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 @@ -114,19 +108,19 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); -static inline void qdisc_run(struct Qdisc *q) +static inline struct sk_buff *qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); - qdisc_run_end(q); + return qdisc_run_end(q); } + return NULL; } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -290,4 +284,52 @@ static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, return true; } +static inline void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) +{ + if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { + pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", + txt, qdisc->ops->id, qdisc->handle >> 16); + qdisc->flags |= TCQ_F_WARN_NONWC; + } +} + +static inline unsigned int qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + unsigned int len; + + skb = sch->ops->peek(sch); + if (unlikely(skb == NULL)) { + qdisc_warn_nonwc("qdisc_peek_len", sch); + return 0; + } + len = qdisc_pkt_len(skb); + + return len; +} + +static inline void qdisc_lock_init(struct Qdisc *sch, + const struct Qdisc_ops *ops) +{ + spin_lock_init(&sch->q.lock); + + /* Skip dynamic keys if nesting is not possible */ + if (ops->static_flags & TCQ_F_INGRESS || + ops == &noqueue_qdisc_ops) + return; + + lockdep_register_key(&sch->root_lock_key); + lockdep_set_class(&sch->q.lock, &sch->root_lock_key); +} + +static inline void qdisc_lock_uninit(struct Qdisc *sch, + const struct Qdisc_ops *ops) +{ + if (ops->static_flags & TCQ_F_INGRESS || + ops == &noqueue_qdisc_ops) + return; + + lockdep_unregister_key(&sch->root_lock_key); +} + #endif diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h index a6ab2f4f5e28..ad6d703ce6fe 100644 --- a/include/net/proto_memory.h +++ b/include/net/proto_memory.h @@ -31,10 +31,13 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) if (!sk->sk_prot->memory_pressure) return false; - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) + if (mem_cgroup_sk_enabled(sk) && + mem_cgroup_sk_under_memory_pressure(sk)) return true; + if (sk->sk_bypass_prot_mem) + return false; + return !!READ_ONCE(*sk->sk_prot->memory_pressure); } diff --git a/include/net/psp.h b/include/net/psp.h new file mode 100644 index 000000000000..33bb4d1dc46e --- /dev/null +++ b/include/net/psp.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_ALL_H +#define __NET_PSP_ALL_H + +#include <uapi/linux/psp.h> +#include <net/psp/functions.h> +#include <net/psp/types.h> + +/* Do not add any code here. Put it in the sub-headers instead. */ + +#endif /* __NET_PSP_ALL_H */ diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h new file mode 100644 index 000000000000..c5c23a54774e --- /dev/null +++ b/include/net/psp/functions.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_HELPERS_H +#define __NET_PSP_HELPERS_H + +#include <linux/skbuff.h> +#include <linux/rcupdate.h> +#include <linux/udp.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/psp/types.h> + +struct inet_timewait_sock; + +/* Driver-facing API */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops, + struct psp_dev_caps *psd_caps, void *priv_ptr); +void psp_dev_unregister(struct psp_dev *psd); +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport); +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv); + +/* Kernel-facing API */ +void psp_assoc_put(struct psp_assoc *pas); + +static inline void *psp_assoc_drv_data(struct psp_assoc *pas) +{ + return pas->drv_data; +} + +#if IS_ENABLED(CONFIG_INET_PSP) +unsigned int psp_key_size(u32 version); +void psp_sk_assoc_free(struct sock *sk); +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk); +void psp_twsk_assoc_free(struct inet_timewait_sock *tw); +void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb); + +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return rcu_dereference_check(sk->psp_assoc, lockdep_sock_is_held(sk)); +} + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) +{ + struct psp_assoc *pas; + + pas = psp_sk_assoc(sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; +} + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + struct psp_skb_ext *a, *b; + + a = skb_ext_find(one, SKB_EXT_PSP); + b = skb_ext_find(two, SKB_EXT_PSP); + + diffs |= (!!a) ^ (!!b); + if (!diffs && unlikely(a)) + diffs |= memcmp(a, b, sizeof(*a)); + return diffs; +} + +static inline bool +psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas) +{ + bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN); + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + u32 seq = TCP_SKB_CB(skb)->seq; + bool pure_fin; + + pure_fin = fin && end_seq - seq == 1; + + return seq == end_seq || (pure_fin && seq == pas->upgrade_seq); +} + +static inline bool +psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas) +{ + return pse && pas->rx.spi == pse->spi && + pas->generation == pse->generation && + pas->version == pse->version && + pas->dev_id == pse->dev_id; +} + +static inline enum skb_drop_reason +__psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP); + + if (!pas) + return pse ? SKB_DROP_REASON_PSP_INPUT : 0; + + if (likely(psp_pse_matches_pas(pse, pas))) { + if (unlikely(!pas->peer_tx)) + pas->peer_tx = 1; + + return 0; + } + + if (!pse) { + if (!pas->tx.spi || + (!pas->peer_tx && psp_is_allowed_nondata(skb, pas))) + return 0; + } + + return SKB_DROP_REASON_PSP_INPUT; +} + +static inline enum skb_drop_reason +psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk)); +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc)); +} + +static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk) +{ + struct psp_assoc *pas; + int state; + + state = READ_ONCE(sk->sk_state); + if (!sk_is_inet(sk) || state == TCP_NEW_SYN_RECV) + return NULL; + + pas = state == TCP_TIME_WAIT ? + rcu_dereference(inet_twsk(sk)->psp_assoc) : + rcu_dereference(sk->psp_assoc); + return pas; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + if (!skb->decrypted || !skb->sk) + return NULL; + + return psp_sk_get_assoc_rcu(skb->sk); +} + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + int psp_encap = sizeof(struct udphdr) + PSP_HDR_SIZE + PSP_TRL_SIZE; + bool has_psp = rcu_access_pointer(sk->psp_assoc); + + return has_psp ? psp_encap : 0; +} +#else +static inline void psp_sk_assoc_free(struct sock *sk) { } +static inline void +psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { } +static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { } +static inline void +psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { } + +static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk) +{ + return NULL; +} + +static inline void +psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { } + +static inline unsigned long +__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two, + unsigned long diffs) +{ + return diffs; +} + +static inline enum skb_drop_reason +psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +static inline enum skb_drop_reason +psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb) +{ + return 0; +} + +static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb) +{ + return NULL; +} + +static inline unsigned int psp_sk_overhead(const struct sock *sk) +{ + return 0; +} +#endif + +static inline unsigned long +psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two) +{ + return __psp_skb_coalesce_diff(one, two, 0); +} + +#endif /* __NET_PSP_HELPERS_H */ diff --git a/include/net/psp/types.h b/include/net/psp/types.h new file mode 100644 index 000000000000..25a9096d4e7d --- /dev/null +++ b/include/net/psp/types.h @@ -0,0 +1,216 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __NET_PSP_H +#define __NET_PSP_H + +#include <linux/mutex.h> +#include <linux/refcount.h> + +struct netlink_ext_ack; + +#define PSP_DEFAULT_UDP_PORT 1000 + +struct psphdr { + u8 nexthdr; + u8 hdrlen; + u8 crypt_offset; + u8 verfl; + __be32 spi; + __be64 iv; + __be64 vc[]; /* optional */ +}; + +#define PSP_ENCAP_HLEN (sizeof(struct udphdr) + sizeof(struct psphdr)) + +#define PSP_SPI_KEY_ID GENMASK(30, 0) +#define PSP_SPI_KEY_PHASE BIT(31) + +#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0) + +#define PSPHDR_VERFL_SAMPLE BIT(7) +#define PSPHDR_VERFL_DROP BIT(6) +#define PSPHDR_VERFL_VERSION GENMASK(5, 2) +#define PSPHDR_VERFL_VIRT BIT(1) +#define PSPHDR_VERFL_ONE BIT(0) + +#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8) + +/** + * struct psp_dev_config - PSP device configuration + * @versions: PSP versions enabled on the device + */ +struct psp_dev_config { + u32 versions; +}; + +/** + * struct psp_dev - PSP device struct + * @main_netdev: original netdevice of this PSP device + * @ops: driver callbacks + * @caps: device capabilities + * @drv_priv: driver priv pointer + * @lock: instance lock, protects all fields + * @refcnt: reference count for the instance + * @id: instance id + * @generation: current generation of the device key + * @config: current device configuration + * @active_assocs: list of registered associations + * @prev_assocs: associations which use old (but still usable) + * device key + * @stale_assocs: associations which use a rotated out key + * + * @stats: statistics maintained by the core + * @stats.rotations: See stats attr key-rotations + * @stats.stales: See stats attr stale-events + * + * @rcu: RCU head for freeing the structure + */ +struct psp_dev { + struct net_device *main_netdev; + + struct psp_dev_ops *ops; + struct psp_dev_caps *caps; + void *drv_priv; + + struct mutex lock; + refcount_t refcnt; + + u32 id; + + u8 generation; + + struct psp_dev_config config; + + struct list_head active_assocs; + struct list_head prev_assocs; + struct list_head stale_assocs; + + struct { + unsigned long rotations; + unsigned long stales; + } stats; + + struct rcu_head rcu; +}; + +#define PSP_GEN_VALID_MASK 0x7f + +/** + * struct psp_dev_caps - PSP device capabilities + */ +struct psp_dev_caps { + /** + * @versions: mask of supported PSP versions + * Set this field to 0 to indicate PSP is not supported at all. + */ + u32 versions; + + /** + * @assoc_drv_spc: size of driver-specific state in Tx assoc + * Determines the size of struct psp_assoc::drv_data + */ + u32 assoc_drv_spc; +}; + +#define PSP_MAX_KEY 32 + +#define PSP_HDR_SIZE 16 /* We don't support optional fields, yet */ +#define PSP_TRL_SIZE 16 /* AES-GCM/GMAC trailer size */ + +struct psp_skb_ext { + __be32 spi; + u16 dev_id; + u8 generation; + u8 version; +}; + +struct psp_key_parsed { + __be32 spi; + u8 key[PSP_MAX_KEY]; +}; + +struct psp_assoc { + struct psp_dev *psd; + + u16 dev_id; + u8 generation; + u8 version; + u8 peer_tx; + + u32 upgrade_seq; + + struct psp_key_parsed tx; + struct psp_key_parsed rx; + + refcount_t refcnt; + struct rcu_head rcu; + struct work_struct work; + struct list_head assocs_list; + + u8 drv_data[] __aligned(8); +}; + +struct psp_dev_stats { + union { + struct { + u64 rx_packets; + u64 rx_bytes; + u64 rx_auth_fail; + u64 rx_error; + u64 rx_bad; + u64 tx_packets; + u64 tx_bytes; + u64 tx_error; + }; + DECLARE_FLEX_ARRAY(u64, required); + }; +}; + +/** + * struct psp_dev_ops - netdev driver facing PSP callbacks + */ +struct psp_dev_ops { + /** + * @set_config: set configuration of a PSP device + * Driver can inspect @psd->config for the previous configuration. + * Core will update @psd->config with @config on success. + */ + int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf, + struct netlink_ext_ack *extack); + + /** + * @key_rotate: rotate the device key + */ + int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack); + + /** + * @rx_spi_alloc: allocate an Rx SPI+key pair + * Allocate an Rx SPI and resulting derived key. + * This key should remain valid until key rotation. + */ + int (*rx_spi_alloc)(struct psp_dev *psd, u32 version, + struct psp_key_parsed *assoc, + struct netlink_ext_ack *extack); + + /** + * @tx_key_add: add a Tx key to the device + * Install an association in the device. Core will allocate space + * for the driver to use at drv_data. + */ + int (*tx_key_add)(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack); + /** + * @tx_key_del: remove a Tx key from the device + * Remove an association from the device. + */ + void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas); + + /** + * @get_stats: get statistics from the device + * Stats required by the spec must be maintained and filled in. + * Stats must be filled in member-by-member, never memset the struct. + */ + void (*get_stats)(struct psp_dev *psd, struct psp_dev_stats *stats); +}; + +#endif /* __NET_PSP_H */ diff --git a/include/net/raw.h b/include/net/raw.h index 32a61481a253..66c0ffeada2e 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -81,6 +81,7 @@ struct raw_sock { struct inet_sock inet; struct icmp_filter filter; u32 ipmr_table; + struct numa_drop_counters drop_counters; }; #define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b07b1cd14e9f..5a9c826a7092 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -30,19 +30,14 @@ struct request_sock_ops { unsigned int obj_size; struct kmem_cache *slab; char *slab_name; - int (*rtx_syn_ack)(const struct sock *sk, - struct request_sock *req); void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, struct sk_buff *skb, enum sk_rst_reason reason); void (*destructor)(struct request_sock *req); - void (*syn_ack_timeout)(const struct request_sock *req); }; -int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); - struct saved_syn { u32 mac_hdrlen; u32 network_hdrlen; @@ -128,14 +123,7 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb, return sk; } -static inline void __reqsk_free(struct request_sock *req) -{ - req->rsk_ops->destructor(req); - if (req->rsk_listener) - sock_put(req->rsk_listener); - kfree(req->saved_syn); - kmem_cache_free(req->rsk_ops->slab, req); -} +void __reqsk_free(struct request_sock *req); static inline void reqsk_free(struct request_sock *req) { @@ -189,8 +177,8 @@ struct fastopen_queue { struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; + u8 synflood_warned; - u32 synflood_warned; atomic_t qlen; atomic_t young; @@ -201,8 +189,6 @@ struct request_sock_queue { */ }; -void reqsk_queue_alloc(struct request_sock_queue *queue); - void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset); diff --git a/include/net/rose.h b/include/net/rose.h index 23267b4efcfa..41bfcb224f0b 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -1,250 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * Declarations of Rose type objects. - * - * Jonathan Naylor G4KLX 25/8/96 - */ - #ifndef _ROSE_H -#define _ROSE_H - -#include <linux/rose.h> -#include <net/ax25.h> -#include <net/sock.h> - -#define ROSE_ADDR_LEN 5 - -#define ROSE_MIN_LEN 3 - -#define ROSE_CALL_REQ_ADDR_LEN_OFF 3 -#define ROSE_CALL_REQ_ADDR_LEN_VAL 0xAA /* each address is 10 digits */ -#define ROSE_CALL_REQ_DEST_ADDR_OFF 4 -#define ROSE_CALL_REQ_SRC_ADDR_OFF 9 -#define ROSE_CALL_REQ_FACILITIES_OFF 14 - -#define ROSE_GFI 0x10 -#define ROSE_Q_BIT 0x80 -#define ROSE_D_BIT 0x40 -#define ROSE_M_BIT 0x10 - -#define ROSE_CALL_REQUEST 0x0B -#define ROSE_CALL_ACCEPTED 0x0F -#define ROSE_CLEAR_REQUEST 0x13 -#define ROSE_CLEAR_CONFIRMATION 0x17 -#define ROSE_DATA 0x00 -#define ROSE_INTERRUPT 0x23 -#define ROSE_INTERRUPT_CONFIRMATION 0x27 -#define ROSE_RR 0x01 -#define ROSE_RNR 0x05 -#define ROSE_REJ 0x09 -#define ROSE_RESET_REQUEST 0x1B -#define ROSE_RESET_CONFIRMATION 0x1F -#define ROSE_REGISTRATION_REQUEST 0xF3 -#define ROSE_REGISTRATION_CONFIRMATION 0xF7 -#define ROSE_RESTART_REQUEST 0xFB -#define ROSE_RESTART_CONFIRMATION 0xFF -#define ROSE_DIAGNOSTIC 0xF1 -#define ROSE_ILLEGAL 0xFD - -/* Define Link State constants. */ - -enum { - ROSE_STATE_0, /* Ready */ - ROSE_STATE_1, /* Awaiting Call Accepted */ - ROSE_STATE_2, /* Awaiting Clear Confirmation */ - ROSE_STATE_3, /* Data Transfer */ - ROSE_STATE_4, /* Awaiting Reset Confirmation */ - ROSE_STATE_5 /* Deferred Call Acceptance */ -}; - -#define ROSE_DEFAULT_T0 180000 /* Default T10 T20 value */ -#define ROSE_DEFAULT_T1 200000 /* Default T11 T21 value */ -#define ROSE_DEFAULT_T2 180000 /* Default T12 T22 value */ -#define ROSE_DEFAULT_T3 180000 /* Default T13 T23 value */ -#define ROSE_DEFAULT_HB 5000 /* Default Holdback value */ -#define ROSE_DEFAULT_IDLE 0 /* No Activity Timeout - none */ -#define ROSE_DEFAULT_ROUTING 1 /* Default routing flag */ -#define ROSE_DEFAULT_FAIL_TIMEOUT 120000 /* Time until link considered usable */ -#define ROSE_DEFAULT_MAXVC 50 /* Maximum number of VCs per neighbour */ -#define ROSE_DEFAULT_WINDOW_SIZE 7 /* Default window size */ - -#define ROSE_MODULUS 8 -#define ROSE_MAX_PACKET_SIZE 251 /* Maximum packet size */ - -#define ROSE_COND_ACK_PENDING 0x01 -#define ROSE_COND_PEER_RX_BUSY 0x02 -#define ROSE_COND_OWN_RX_BUSY 0x04 - -#define FAC_NATIONAL 0x00 -#define FAC_CCITT 0x0F - -#define FAC_NATIONAL_RAND 0x7F -#define FAC_NATIONAL_FLAGS 0x3F -#define FAC_NATIONAL_DEST_DIGI 0xE9 -#define FAC_NATIONAL_SRC_DIGI 0xEB -#define FAC_NATIONAL_FAIL_CALL 0xED -#define FAC_NATIONAL_FAIL_ADD 0xEE -#define FAC_NATIONAL_DIGIS 0xEF - -#define FAC_CCITT_DEST_NSAP 0xC9 -#define FAC_CCITT_SRC_NSAP 0xCB - -struct rose_neigh { - struct rose_neigh *next; - ax25_address callsign; - ax25_digi *digipeat; - ax25_cb *ax25; - struct net_device *dev; - unsigned short count; - unsigned short use; - unsigned int number; - char restarted; - char dce_mode; - char loopback; - struct sk_buff_head queue; - struct timer_list t0timer; - struct timer_list ftimer; -}; - -struct rose_node { - struct rose_node *next; - rose_address address; - unsigned short mask; - unsigned char count; - char loopback; - struct rose_neigh *neighbour[3]; -}; - -struct rose_route { - struct rose_route *next; - unsigned int lci1, lci2; - rose_address src_addr, dest_addr; - ax25_address src_call, dest_call; - struct rose_neigh *neigh1, *neigh2; - unsigned int rand; -}; - -struct rose_sock { - struct sock sock; - rose_address source_addr, dest_addr; - ax25_address source_call, dest_call; - unsigned char source_ndigis, dest_ndigis; - ax25_address source_digis[ROSE_MAX_DIGIS]; - ax25_address dest_digis[ROSE_MAX_DIGIS]; - struct rose_neigh *neighbour; - struct net_device *device; - netdevice_tracker dev_tracker; - unsigned int lci, rand; - unsigned char state, condition, qbitincl, defer; - unsigned char cause, diagnostic; - unsigned short vs, vr, va, vl; - unsigned long t1, t2, t3, hb, idle; -#ifdef M_BIT - unsigned short fraglen; - struct sk_buff_head frag_queue; -#endif - struct sk_buff_head ack_queue; - struct rose_facilities_struct facilities; - struct timer_list timer; - struct timer_list idletimer; -}; - -#define rose_sk(sk) ((struct rose_sock *)(sk)) - -/* af_rose.c */ -extern ax25_address rose_callsign; -extern int sysctl_rose_restart_request_timeout; -extern int sysctl_rose_call_request_timeout; -extern int sysctl_rose_reset_request_timeout; -extern int sysctl_rose_clear_request_timeout; -extern int sysctl_rose_no_activity_timeout; -extern int sysctl_rose_ack_hold_back_timeout; -extern int sysctl_rose_routing_control; -extern int sysctl_rose_link_fail_timeout; -extern int sysctl_rose_maximum_vcs; -extern int sysctl_rose_window_size; - -int rosecmp(const rose_address *, const rose_address *); -int rosecmpm(const rose_address *, const rose_address *, unsigned short); -char *rose2asc(char *buf, const rose_address *); -struct sock *rose_find_socket(unsigned int, struct rose_neigh *); -void rose_kill_by_neigh(struct rose_neigh *); -unsigned int rose_new_lci(struct rose_neigh *); -int rose_rx_call_request(struct sk_buff *, struct net_device *, - struct rose_neigh *, unsigned int); -void rose_destroy_socket(struct sock *); - -/* rose_dev.c */ -void rose_setup(struct net_device *); - -/* rose_in.c */ -int rose_process_rx_frame(struct sock *, struct sk_buff *); - -/* rose_link.c */ -void rose_start_ftimer(struct rose_neigh *); -void rose_stop_ftimer(struct rose_neigh *); -void rose_stop_t0timer(struct rose_neigh *); -int rose_ftimer_running(struct rose_neigh *); -void rose_link_rx_restart(struct sk_buff *, struct rose_neigh *, - unsigned short); -void rose_transmit_clear_request(struct rose_neigh *, unsigned int, - unsigned char, unsigned char); -void rose_transmit_link(struct sk_buff *, struct rose_neigh *); - -/* rose_loopback.c */ -void rose_loopback_init(void); -void rose_loopback_clear(void); -int rose_loopback_queue(struct sk_buff *, struct rose_neigh *); - -/* rose_out.c */ -void rose_kick(struct sock *); -void rose_enquiry_response(struct sock *); - -/* rose_route.c */ -extern struct rose_neigh *rose_loopback_neigh; -extern const struct seq_operations rose_neigh_seqops; -extern const struct seq_operations rose_node_seqops; -extern struct seq_operations rose_route_seqops; - -void rose_add_loopback_neigh(void); -int __must_check rose_add_loopback_node(const rose_address *); -void rose_del_loopback_node(const rose_address *); -void rose_rt_device_down(struct net_device *); -void rose_link_device_down(struct net_device *); -struct net_device *rose_dev_first(void); -struct net_device *rose_dev_get(rose_address *); -struct rose_route *rose_route_free_lci(unsigned int, struct rose_neigh *); -struct rose_neigh *rose_get_neigh(rose_address *, unsigned char *, - unsigned char *, int); -int rose_rt_ioctl(unsigned int, void __user *); -void rose_link_failed(ax25_cb *, int); -int rose_route_frame(struct sk_buff *, ax25_cb *); -void rose_rt_free(void); - -/* rose_subr.c */ -void rose_clear_queues(struct sock *); -void rose_frames_acked(struct sock *, unsigned short); -void rose_requeue_frames(struct sock *); -int rose_validate_nr(struct sock *, unsigned short); -void rose_write_internal(struct sock *, int); -int rose_decode(struct sk_buff *, int *, int *, int *, int *, int *); -int rose_parse_facilities(unsigned char *, unsigned int, - struct rose_facilities_struct *); -void rose_disconnect(struct sock *, int, int, int); - -/* rose_timer.c */ -void rose_start_heartbeat(struct sock *); -void rose_start_t1timer(struct sock *); -void rose_start_t2timer(struct sock *); -void rose_start_t3timer(struct sock *); -void rose_start_hbtimer(struct sock *); -void rose_start_idletimer(struct sock *); -void rose_stop_heartbeat(struct sock *); -void rose_stop_timer(struct sock *); -void rose_stop_idletimer(struct sock *); +#define _ROSE_H -/* sysctl_net_rose.c */ -void rose_register_sysctl(void); -void rose_unregister_sysctl(void); +#define ROSE_ADDR_LEN 5 #endif diff --git a/include/net/route.h b/include/net/route.h index c605fd5ec0c0..f90106f383c5 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -153,7 +153,7 @@ static inline void inet_sk_init_flowi4(const struct inet_sock *inet, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, inet->inet_dport, - inet->inet_sport, sk->sk_uid); + inet->inet_sport, sk_uid(sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); } @@ -189,7 +189,7 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, { struct flowi4 fl4 = { .flowi4_oif = oif, - .flowi4_tos = inet_dscp_to_dsfield(dscp), + .flowi4_dscp = dscp, .flowi4_scope = scope, .daddr = daddr, .saddr = saddr, @@ -326,9 +326,12 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_test_bit(TRANSPARENT, sk)) flow_flags |= FLOWI_FLAG_ANYSRC; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) + flow_flags |= FLOWI_FLAG_ANY_SPORT; + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, - src, dport, sport, sk->sk_uid); + src, dport, sport, sk_uid(sk)); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, @@ -387,7 +390,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) const struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dst_dev_net_rcu(dst); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); rcu_read_unlock(); } diff --git a/include/net/rps-types.h b/include/net/rps-types.h new file mode 100644 index 000000000000..6b90a66866c1 --- /dev/null +++ b/include/net/rps-types.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_RPS_TYPES_H +#define _NET_RPS_TYPES_H + +/* Define a rps_tag_ptr: + * Low order 5 bits are used to store the ilog2(size) of an RPS table. + */ +typedef unsigned long rps_tag_ptr; + +static inline u8 rps_tag_to_log(rps_tag_ptr tag_ptr) +{ + return tag_ptr & 31U; +} + +static inline u32 rps_tag_to_mask(rps_tag_ptr tag_ptr) +{ + return (1U << rps_tag_to_log(tag_ptr)) - 1; +} + +static inline void *rps_tag_to_table(rps_tag_ptr tag_ptr) +{ + return (void *)(tag_ptr & ~31UL); +} +#endif /* _NET_RPS_TYPES_H */ diff --git a/include/net/rps.h b/include/net/rps.h index a93401d23d66..e33c6a2fa8bb 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -8,6 +8,7 @@ #include <net/hotdata.h> #ifdef CONFIG_RPS +#include <net/rps-types.h> extern struct static_key_false rps_needed; extern struct static_key_false rfs_needed; @@ -25,28 +26,20 @@ struct rps_map { /* * The rps_dev_flow structure contains the mapping of a flow to a CPU, the - * tail pointer for that CPU's input queue at the time of last enqueue, and - * a hardware filter index. + * tail pointer for that CPU's input queue at the time of last enqueue, a + * hardware filter index, and the hash of the flow if aRFS is enabled. */ struct rps_dev_flow { u16 cpu; u16 filter; unsigned int last_qtail; +#ifdef CONFIG_RFS_ACCEL + u32 hash; +#endif }; #define RPS_NO_FILTER 0xffff /* - * The rps_dev_flow_table structure contains a table of flow mappings. - */ -struct rps_dev_flow_table { - unsigned int mask; - struct rcu_head rcu; - struct rps_dev_flow flows[]; -}; -#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \ - ((_num) * sizeof(struct rps_dev_flow))) - -/* * The rps_sock_flow_table contains mappings of flows to the last CPU * on which they were processed by the application (set in recvmsg). * Each entry is a 32bit value. Upper part is the high-order bits @@ -57,68 +50,119 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { - u32 mask; - - u32 ents[] ____cacheline_aligned_in_smp; + u32 ent; }; -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) #define RPS_NO_CPU 0xffff -static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, - u32 hash) +static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash) { - unsigned int index = hash & table->mask; + unsigned int index = hash & rps_tag_to_mask(tag_ptr); u32 val = hash & ~net_hotdata.rps_cpu_mask; + struct rps_sock_flow_table *table; /* We only give a hint, preemption can change CPU under us */ val |= raw_smp_processor_id(); + table = rps_tag_to_table(tag_ptr); /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in get_rps_cpu(). */ - if (READ_ONCE(table->ents[index]) != val) - WRITE_ONCE(table->ents[index], val); + if (READ_ONCE(table[index].ent) != val) + WRITE_ONCE(table[index].ent, val); } -#endif /* CONFIG_RPS */ +static inline void _sock_rps_record_flow_hash(__u32 hash) +{ + rps_tag_ptr tag_ptr; -static inline void sock_rps_record_flow_hash(__u32 hash) + if (!hash) + return; + rcu_read_lock(); + tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); + if (tag_ptr) + rps_record_sock_flow(tag_ptr, hash); + rcu_read_unlock(); +} + +static inline void _sock_rps_record_flow(const struct sock *sk) { -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; + /* Reading sk->sk_rxhash might incur an expensive cache line + * miss. + * + * TCP_ESTABLISHED does cover almost all states where RFS + * might be useful, and is cheaper [1] than testing : + * IPv4: inet_sk(sk)->inet_daddr + * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) + * OR an additional socket flag + * [1] : sk_state and sk_prot are in the same cache line. + */ + if (sk->sk_state == TCP_ESTABLISHED) { + /* This READ_ONCE() is paired with the WRITE_ONCE() + * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). + */ + _sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); + } +} + +static inline void _sock_rps_delete_flow(const struct sock *sk) +{ + struct rps_sock_flow_table *table; + rps_tag_ptr tag_ptr; + u32 hash, index; + hash = READ_ONCE(sk->sk_rxhash); if (!hash) return; + rcu_read_lock(); - sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table); - if (sock_flow_table) - rps_record_sock_flow(sock_flow_table, hash); + tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table); + if (tag_ptr) { + index = hash & rps_tag_to_mask(tag_ptr); + table = rps_tag_to_table(tag_ptr); + if (READ_ONCE(table[index].ent) != RPS_NO_CPU) + WRITE_ONCE(table[index].ent, RPS_NO_CPU); + } rcu_read_unlock(); +} +#endif /* CONFIG_RPS */ + +static inline bool rfs_is_needed(void) +{ +#ifdef CONFIG_RPS + return static_branch_unlikely(&rfs_needed); +#else + return false; +#endif +} + +static inline void sock_rps_record_flow_hash(__u32 hash) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_record_flow_hash(hash); #endif } static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS - if (static_branch_unlikely(&rfs_needed)) { - /* Reading sk->sk_rxhash might incur an expensive cache line - * miss. - * - * TCP_ESTABLISHED does cover almost all states where RFS - * might be useful, and is cheaper [1] than testing : - * IPv4: inet_sk(sk)->inet_daddr - * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) - * OR an additional socket flag - * [1] : sk_state and sk_prot are in the same cache line. - */ - if (sk->sk_state == TCP_ESTABLISHED) { - /* This READ_ONCE() is paired with the WRITE_ONCE() - * from sock_rps_save_rxhash() and sock_rps_reset_rxhash(). - */ - sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash)); - } - } + if (!rfs_is_needed()) + return; + + _sock_rps_record_flow(sk); +#endif +} + +static inline void sock_rps_delete_flow(const struct sock *sk) +{ +#ifdef CONFIG_RPS + if (!rfs_is_needed()) + return; + + _sock_rps_delete_flow(sk); #endif } diff --git a/include/net/rstreason.h b/include/net/rstreason.h index 69cb2e52b7da..979ac87b5d99 100644 --- a/include/net/rstreason.h +++ b/include/net/rstreason.h @@ -36,7 +36,7 @@ /** * enum sk_rst_reason - the reasons of socket reset * - * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols. + * The reasons of sk reset, which are used in TCP/MPTCP protocols. * * There are three parts in order: * 1) skb drop reasons: relying on drop reasons for such as passive reset diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bc0069a8b6ea..ec65a8cebb99 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -70,6 +70,40 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) } /** + * struct rtnl_newlink_params - parameters of rtnl_link_ops::newlink() + * + * @src_net: Source netns of rtnetlink socket + * @link_net: Link netns by IFLA_LINK_NETNSID, NULL if not specified + * @peer_net: Peer netns + * @tb: IFLA_* attributes + * @data: IFLA_INFO_DATA attributes + */ +struct rtnl_newlink_params { + struct net *src_net; + struct net *link_net; + struct net *peer_net; + struct nlattr **tb; + struct nlattr **data; +}; + +/* Get effective link netns from newlink params. Generally, this is link_net + * and falls back to src_net. But for compatibility, a driver may * choose to + * use dev_net(dev) instead. + */ +static inline struct net *rtnl_newlink_link_net(struct rtnl_newlink_params *p) +{ + return p->link_net ? : p->src_net; +} + +/* Get peer netns from newlink params. Fallback to link netns if peer netns is + * not specified explicitly. + */ +static inline struct net *rtnl_newlink_peer_net(struct rtnl_newlink_params *p) +{ + return p->peer_net ? : rtnl_newlink_link_net(p); +} + +/** * struct rtnl_link_ops - rtnetlink link operations * * @list: Used internally, protected by link_ops_mutex and SRCU @@ -125,10 +159,8 @@ struct rtnl_link_ops { struct nlattr *data[], struct netlink_ext_ack *extack); - int (*newlink)(struct net *src_net, - struct net_device *dev, - struct nlattr *tb[], - struct nlattr *data[], + int (*newlink)(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack); int (*changelink)(struct net_device *dev, struct nlattr *tb[], diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d48c657191cd..11159a50d6a1 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -20,12 +20,15 @@ #include <net/rtnetlink.h> #include <net/flow_offload.h> #include <linux/xarray.h> +#include <net/dropreason-qdisc.h> struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; struct bpf_flow_keys; +struct Qdisc; +struct netdev_queue; struct qdisc_rate_table { struct tc_ratespec rate; @@ -41,13 +44,6 @@ enum qdisc_state_t { __QDISC_STATE_DRAINING, }; -enum qdisc_state2_t { - /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. - * Use qdisc_run_begin/end() or qdisc_is_running() instead. - */ - __QDISC_STATE2_RUNNING, -}; - #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) @@ -95,6 +91,8 @@ struct Qdisc { #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ +#define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */ + u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; @@ -110,20 +108,30 @@ struct Qdisc { int pad; refcount_t refcnt; - /* - * For performance sake on SMP, we put highly modified fields at the end - */ - struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; - struct qdisc_skb_head q; - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - int owner; - unsigned long state; - unsigned long state2; /* must be written under qdisc spinlock */ - struct Qdisc *next_sched; - struct sk_buff_head skb_bad_txq; - - spinlock_t busylock ____cacheline_aligned_in_smp; + /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */ + __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned; + struct sk_buff_head gso_skb; + struct Qdisc *next_sched; + struct sk_buff_head skb_bad_txq; + __cacheline_group_end(Qdisc_read_mostly); + + /* Fields dirtied in dequeue() fast path. */ + __cacheline_group_begin(Qdisc_write) ____cacheline_aligned; + struct qdisc_skb_head q; + unsigned long state; + struct gnet_stats_basic_sync bstats; + bool running; /* must be written under qdisc spinlock */ + + /* Note : we only change qstats.backlog in fast path. */ + struct gnet_stats_queue qstats; + + struct sk_buff *to_free; + __cacheline_group_end(Qdisc_write); + + + atomic_long_t defer_count ____cacheline_aligned_in_smp; + struct llist_head defer_list; + spinlock_t seqlock; struct rcu_head rcu; @@ -168,7 +176,7 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); - return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + return READ_ONCE(qdisc->running); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) @@ -211,11 +219,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) */ return spin_trylock(&qdisc->seqlock); } - return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + if (READ_ONCE(qdisc->running)) + return false; + WRITE_ONCE(qdisc->running, true); + return true; } -static inline void qdisc_run_end(struct Qdisc *qdisc) +static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc) { + struct sk_buff *to_free = NULL; + if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); @@ -228,9 +241,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); - } else { - __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + return NULL; } + + if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) { + to_free = qdisc->to_free; + if (to_free) + qdisc->to_free = NULL; + } + WRITE_ONCE(qdisc->running, false); + return to_free; } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) @@ -432,13 +452,16 @@ struct tcf_proto { }; struct qdisc_skb_cb { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; + unsigned int pkt_len; + u16 pkt_segs; + u16 tc_classid; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; + + u16 slave_dev_queue_mapping; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); @@ -687,8 +710,8 @@ void dev_qdisc_change_real_num_tx(struct net_device *dev, void dev_init_scheduler(struct net_device *dev); void dev_shutdown(struct net_device *dev); void dev_activate(struct net_device *dev); -void dev_deactivate(struct net_device *dev); -void dev_deactivate_many(struct list_head *head); +void dev_deactivate(struct net_device *dev, bool reset_needed); +void dev_deactivate_many(struct list_head *head, bool reset_needed); struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc); void qdisc_reset(struct Qdisc *qdisc); @@ -696,6 +719,34 @@ void qdisc_destroy(struct Qdisc *qdisc); void qdisc_put(struct Qdisc *qdisc); void qdisc_put_unlocked(struct Qdisc *qdisc); void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len); + +static inline void dev_reset_queue(struct net_device *dev, + struct netdev_queue *dev_queue, + void *_unused) +{ + struct Qdisc *qdisc; + bool nolock; + + qdisc = rtnl_dereference(dev_queue->qdisc_sleeping); + if (!qdisc) + return; + + nolock = qdisc->flags & TCQ_F_NOLOCK; + + if (nolock) + spin_lock_bh(&qdisc->seqlock); + spin_lock_bh(qdisc_lock(qdisc)); + + qdisc_reset(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + if (nolock) { + clear_bit(__QDISC_STATE_MISSED, &qdisc->state); + clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); + spin_unlock_bh(&qdisc->seqlock); + } +} + #ifdef CONFIG_NET_SCHED int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type, void *type_data); @@ -758,13 +809,23 @@ static inline bool skb_skip_tc_classify(struct sk_buff *skb) static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i) { struct Qdisc *qdisc; + bool nolock; for (; i < dev->num_tx_queues; i++) { qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc); if (qdisc) { + nolock = qdisc->flags & TCQ_F_NOLOCK; + + if (nolock) + spin_lock_bh(&qdisc->seqlock); spin_lock_bh(qdisc_lock(qdisc)); qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); + if (nolock) { + clear_bit(__QDISC_STATE_MISSED, &qdisc->state); + clear_bit(__QDISC_STATE_DRAINING, &qdisc->state); + spin_unlock_bh(&qdisc->seqlock); + } } } } @@ -803,6 +864,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev) return false; } +/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */ +static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq) +{ + struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc); + + return qdisc->enqueue == NULL; +} + /* Is the device using the noop qdisc on all queues? */ static inline bool qdisc_tx_is_noop(const struct net_device *dev) { @@ -821,6 +890,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) return qdisc_skb_cb(skb)->pkt_len; } +static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb) +{ + u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs; + + DEBUG_NET_WARN_ON_ONCE(pkt_segs != + (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1)); + return pkt_segs; +} + /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, @@ -862,9 +940,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { - _bstats_update(bstats, - qdisc_pkt_len(skb), - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); + _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb)); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, @@ -965,14 +1041,6 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, *backlog = qstats.backlog; } -static inline void qdisc_tree_flush_backlog(struct Qdisc *sch) -{ - __u32 qlen, backlog; - - qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); - qdisc_tree_reduce_backlog(sch, qlen, backlog); -} - static inline void qdisc_purge_queue(struct Qdisc *sch) { __u32 qlen, backlog; @@ -1031,6 +1099,26 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) return skb; } +static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->gso_skb); + if (skb) { + sch->q.qlen--; + qdisc_qstats_backlog_dec(sch, skb); + return skb; + } + if (direct) { + skb = __qdisc_dequeue_head(&sch->q); + if (skb) + qdisc_qstats_backlog_dec(sch, skb); + return skb; + } else { + return sch->dequeue(sch); + } +} + static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); @@ -1047,11 +1135,8 @@ struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; u32 drop_reason; - u16 zone; /* Only valid if post_ct = true */ + u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */ u16 mru; - u8 post_ct:1; - u8 post_ct_snat:1; - u8 post_ct_dnat:1; }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) @@ -1062,18 +1147,64 @@ static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) return cb; } +/* TC classifier accessors - use enum skb_drop_reason */ static inline enum skb_drop_reason tcf_get_drop_reason(const struct sk_buff *skb) { - return tc_skb_cb(skb)->drop_reason; + return (enum skb_drop_reason)tc_skb_cb(skb)->drop_reason; } static inline void tcf_set_drop_reason(const struct sk_buff *skb, enum skb_drop_reason reason) { + tc_skb_cb(skb)->drop_reason = (enum qdisc_drop_reason)reason; +} + +/* Qdisc accessors - use enum qdisc_drop_reason */ +static inline enum qdisc_drop_reason +tcf_get_qdisc_drop_reason(const struct sk_buff *skb) +{ + return tc_skb_cb(skb)->drop_reason; +} + +static inline void tcf_set_qdisc_drop_reason(const struct sk_buff *skb, + enum qdisc_drop_reason reason) +{ tc_skb_cb(skb)->drop_reason = reason; } +void __tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q, + struct netdev_queue *txq, struct net_device *dev); + +static inline void tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q, + struct netdev_queue *txq, + struct net_device *dev) +{ + if (unlikely(skb)) + __tcf_kfree_skb_list(skb, q, txq, dev); +} + +static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, + enum qdisc_drop_reason reason) +{ + struct Qdisc *root; + + DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); + DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); + + rcu_read_lock(); + root = qdisc_root_sleeping(q); + + if (root->flags & TCQ_F_DEQUEUE_DROPS) { + tcf_set_qdisc_drop_reason(skb, reason); + skb->next = root->to_free; + root->to_free = skb; + } else { + kfree_skb_reason(skb, (enum skb_drop_reason)reason); + } + rcu_read_unlock(); +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ @@ -1246,9 +1377,9 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free, - enum skb_drop_reason reason) + enum qdisc_drop_reason reason) { - tcf_set_drop_reason(skb, reason); + tcf_set_qdisc_drop_reason(skb, reason); return qdisc_drop(skb, sch, to_free); } @@ -1353,6 +1484,11 @@ void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp, struct tcf_block *block); +static inline bool mini_qdisc_pair_inited(struct mini_Qdisc_pair *miniqp) +{ + return !!miniqp->p_miniq; +} + void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx); int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb)); diff --git a/include/net/sch_priv.h b/include/net/sch_priv.h new file mode 100644 index 000000000000..4789f668ae87 --- /dev/null +++ b/include/net/sch_priv.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_SCHED_PRIV_H +#define __NET_SCHED_PRIV_H + +#include <net/sch_generic.h> + +struct mq_sched { + struct Qdisc **qdiscs; +}; + +int mq_init_common(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack, + const struct Qdisc_ops *qdisc_ops); +void mq_destroy_common(struct Qdisc *sch); +void mq_attach(struct Qdisc *sch); +void mq_dump_common(struct Qdisc *sch, struct sk_buff *skb); +struct netdev_queue *mq_select_queue(struct Qdisc *sch, + struct tcmsg *tcm); +struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl); +unsigned long mq_find(struct Qdisc *sch, u32 classid); +int mq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm); +int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d); +void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg); + +#endif diff --git a/include/net/scm.h b/include/net/scm.h index 22bb49589fde..c52519669349 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -69,7 +69,7 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { - scm->pid = get_pid(pid); + scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; @@ -78,7 +78,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); - scm->pid = NULL; + scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) @@ -102,123 +102,10 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, return __scm_send(sock, msg, scm); } -#ifdef CONFIG_SECURITY_NETWORK -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ - struct lsm_context ctx; - int err; - - if (test_bit(SOCK_PASSSEC, &sock->flags)) { - err = security_secid_to_secctx(scm->secid, &ctx); - - if (err >= 0) { - put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, - ctx.context); - security_release_secctx(&ctx); - } - } -} - -static inline bool scm_has_secdata(struct socket *sock) -{ - return test_bit(SOCK_PASSSEC, &sock->flags); -} -#else -static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) -{ } - -static inline bool scm_has_secdata(struct socket *sock) -{ - return false; -} -#endif /* CONFIG_SECURITY_NETWORK */ - -static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) -{ - struct file *pidfd_file = NULL; - int len, pidfd; - - /* put_cmsg() doesn't return an error if CMSG is truncated, - * that's why we need to opencode these checks here. - */ - if (msg->msg_flags & MSG_CMSG_COMPAT) - len = sizeof(struct compat_cmsghdr) + sizeof(int); - else - len = sizeof(struct cmsghdr) + sizeof(int); - - if (msg->msg_controllen < len) { - msg->msg_flags |= MSG_CTRUNC; - return; - } - - if (!scm->pid) - return; - - pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); - - if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { - if (pidfd_file) { - put_unused_fd(pidfd); - fput(pidfd_file); - } - - return; - } - - if (pidfd_file) - fd_install(pidfd, pidfd_file); -} - -static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!msg->msg_control) { - if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - scm->fp || scm_has_secdata(sock)) - msg->msg_flags |= MSG_CTRUNC; - scm_destroy(scm); - return false; - } - - if (test_bit(SOCK_PASSCRED, &sock->flags)) { - struct user_namespace *current_ns = current_user_ns(); - struct ucred ucreds = { - .pid = scm->creds.pid, - .uid = from_kuid_munged(current_ns, scm->creds.uid), - .gid = from_kgid_munged(current_ns, scm->creds.gid), - }; - put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); - } - - scm_passec(sock, msg, scm); - - if (scm->fp) - scm_detach_fds(msg, scm); - - return true; -} - -static inline void scm_recv(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - scm_destroy_cred(scm); -} - -static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm, int flags) -{ - if (!__scm_recv_common(sock, msg, scm, flags)) - return; - - if (test_bit(SOCK_PASSPIDFD, &sock->flags)) - scm_pidfd_recv(msg, scm); - - scm_destroy_cred(scm); -} +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags); static inline int scm_recv_one_fd(struct file *f, int __user *ufd, unsigned int flags) diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index d4b3b2dcd15b..6f2cd562b1de 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -22,16 +22,11 @@ struct sctp_endpoint; struct sctp_association; struct sctp_authkey; struct sctp_hmacalgo; -struct crypto_shash; -/* - * Define a generic struct that will hold all the info - * necessary for an HMAC transform - */ +/* Defines an HMAC algorithm supported by SCTP chunk authentication */ struct sctp_hmac { - __u16 hmac_id; /* one of the above ids */ - char *hmac_name; /* name for loading */ - __u16 hmac_len; /* length of the signature */ + __u16 hmac_id; /* one of SCTP_AUTH_HMAC_ID_* */ + __u16 hmac_len; /* length of the HMAC value in bytes */ }; /* This is generic structure that containst authentication bytes used @@ -77,10 +72,9 @@ struct sctp_shared_key *sctp_auth_get_shkey( int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp); -int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp); -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]); -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs); int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index f514a0aa849e..654d37ec0402 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -15,8 +15,6 @@ * Dinakaran Joseph * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> - * - * Rewritten to use libcrc32c by: * Vlad Yasevich <vladislav.yasevich@hp.com> */ @@ -25,42 +23,18 @@ #include <linux/types.h> #include <linux/sctp.h> -#include <linux/crc32c.h> -#include <linux/crc32.h> - -static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum) -{ - /* This uses the crypto implementation of crc32c, which is either - * implemented w/ hardware support or resolves to __crc32c_le(). - */ - return (__force __wsum)crc32c((__force __u32)sum, buff, len); -} - -static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - return (__force __wsum)__crc32c_le_combine((__force __u32)csum, - (__force __u32)csum2, len); -} - -static const struct skb_checksum_ops sctp_csum_ops = { - .update = sctp_csum_update, - .combine = sctp_csum_combine, -}; static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, unsigned int offset) { struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); __le32 old = sh->checksum; - __wsum new; + u32 new; sh->checksum = 0; - new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0, - &sctp_csum_ops); + new = ~skb_crc32c(skb, offset, skb->len - offset, ~0); sh->checksum = old; - - return cpu_to_le32((__force __u32)new); + return cpu_to_le32(new); } #endif /* __sctp_checksum_h__ */ diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 5859e0a16a58..ae3376ba0b99 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -296,9 +296,8 @@ enum { SCTP_MAX_GABS = 16 }; */ #define SCTP_DEFAULT_MINSEGMENT 512 /* MTU size ... if no mtu disc */ -#define SCTP_SECRET_SIZE 32 /* Number of octets in a 256 bits. */ - -#define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */ +#define SCTP_COOKIE_KEY_SIZE 32 /* size of cookie HMAC key */ +#define SCTP_COOKIE_MAC_SIZE 32 /* size of HMAC field in cookies */ #define SCTP_COOKIE_MULTIPLE 32 /* Pad out our cookie to make our hash * functions simpler to write. @@ -417,16 +416,12 @@ enum { SCTP_AUTH_HMAC_ID_RESERVED_0, SCTP_AUTH_HMAC_ID_SHA1, SCTP_AUTH_HMAC_ID_RESERVED_2, -#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE) SCTP_AUTH_HMAC_ID_SHA256, -#endif __SCTP_AUTH_HMAC_MAX }; #define SCTP_AUTH_HMAC_ID_MAX __SCTP_AUTH_HMAC_MAX - 1 #define SCTP_AUTH_NUM_HMACS __SCTP_AUTH_HMAC_MAX -#define SCTP_SHA1_SIG_SIZE 20 -#define SCTP_SHA256_SIG_SIZE 32 /* SCTP-AUTH, Section 3.2 * The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH chunks diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 84e6b9fd5610..58242b37b47a 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -85,7 +85,7 @@ void sctp_udp_sock_stop(struct net *net); /* * sctp/socket.c */ -int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, +int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); @@ -94,8 +94,7 @@ void sctp_data_ready(struct sock *sk); __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); -void sctp_copy_sock(struct sock *newsk, struct sock *sk, - struct sctp_association *asoc); + extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *); @@ -364,8 +363,6 @@ sctp_assoc_to_state(const struct sctp_association *asoc) /* Look up the association by its id. */ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id); -int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp); - /* A macro to walk a list of skbs. */ #define sctp_skb_for_each(pos, head, tmp) \ skb_queue_walk_safe(head, pos, tmp) @@ -636,7 +633,7 @@ static inline void sctp_transport_pl_reset(struct sctp_transport *t) } } else { if (t->pl.state != SCTP_PL_DISABLED) { - if (del_timer(&t->probe_timer)) + if (timer_delete(&t->probe_timer)) sctp_transport_put(t); t->pl.state = SCTP_PL_DISABLED; } diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index 64c42bd56bb2..3bfd261a53cc 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -161,7 +161,6 @@ const struct sctp_sm_table_entry *sctp_sm_lookup_event( enum sctp_event_type event_type, enum sctp_state state, union sctp_subtype event_subtype); -int sctp_chunk_iif(const struct sctp_chunk *); struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *, struct sctp_chunk *, gfp_t gfp); diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h index 8034bf5febbe..77806ef1cb70 100644 --- a/include/net/sctp/stream_sched.h +++ b/include/net/sctp/stream_sched.h @@ -52,10 +52,10 @@ void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch); void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch); int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp); -struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); +const struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); void sctp_sched_ops_register(enum sctp_sched_type sched, - struct sctp_sched_ops *sched_ops); + const struct sctp_sched_ops *sched_ops); void sctp_sched_ops_prio_init(void); void sctp_sched_ops_rr_init(void); void sctp_sched_ops_fc_init(void); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 31248cfdfb23..affee44bd38e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -32,6 +32,7 @@ #ifndef __sctp_structs_h__ #define __sctp_structs_h__ +#include <crypto/sha2.h> #include <linux/ktime.h> #include <linux/generic-radix-tree.h> #include <linux/rhashtable-types.h> @@ -51,9 +52,9 @@ * We should wean ourselves off this. */ union sctp_addr { + struct sockaddr_inet sa; /* Large enough for both address families */ struct sockaddr_in v4; struct sockaddr_in6 v6; - struct sockaddr sa; }; /* Forward declarations for data structures. */ @@ -68,7 +69,6 @@ struct sctp_outq; struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; -struct crypto_shash; struct sctp_stream; @@ -155,10 +155,6 @@ struct sctp_sock { /* PF_ family specific functions. */ struct sctp_pf *pf; - /* Access to HMAC transform. */ - struct crypto_shash *hmac; - char *sctp_hmac_alg; - /* What is our base endpointer? */ struct sctp_endpoint *ep; @@ -227,14 +223,11 @@ struct sctp_sock { frag_interleave:1, recvrcvinfo:1, recvnxtinfo:1, - data_ready_signalled:1; + data_ready_signalled:1, + cookie_auth_enable:1; atomic_t pd_mode; - /* Fields after this point will be skipped on copies, like on accept - * and peeloff operations - */ - /* Receive to here while partial delivery is in effect. */ struct sk_buff_head pd_lobby; @@ -335,7 +328,7 @@ struct sctp_cookie { /* The format of our cookie that we send to our peer. */ struct sctp_signed_cookie { - __u8 signature[SCTP_SECRET_SIZE]; + __u8 mac[SCTP_COOKIE_MAC_SIZE]; __u32 __pad; /* force sctp_cookie alignment to 64 bits */ struct sctp_cookie c; } __packed; @@ -500,9 +493,6 @@ struct sctp_pf { int (*bind_verify) (struct sctp_sock *, union sctp_addr *); int (*send_verify) (struct sctp_sock *, union sctp_addr *); int (*supported_addrs)(const struct sctp_sock *, __be16 *); - struct sock *(*create_accept_sk) (struct sock *sk, - struct sctp_association *asoc, - bool kern); int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr); void (*to_sk_saddr)(union sctp_addr *, struct sock *sk); void (*to_sk_daddr)(union sctp_addr *, struct sock *sk); @@ -775,6 +765,7 @@ struct sctp_transport { /* Reference counting. */ refcount_t refcnt; + __u32 dead:1, /* RTO-Pending : A flag used to track if one of the DATA * chunks sent to this address is currently being * used to compute a RTT. If this flag is 0, @@ -784,7 +775,7 @@ struct sctp_transport { * calculation completes (i.e. the DATA chunk * is SACK'd) clear this flag. */ - __u32 rto_pending:1, + rto_pending:1, /* * hb_sent : a flag that signals that we have a pending @@ -1078,7 +1069,7 @@ struct sctp_outq { struct list_head out_chunk_list; /* Stream scheduler being used */ - struct sctp_sched_ops *sched; + const struct sctp_sched_ops *sched; unsigned int out_qlen; /* Total length of queued data chunks. */ @@ -1306,33 +1297,15 @@ struct sctp_endpoint { /* This is really a list of struct sctp_association entries. */ struct list_head asocs; - /* Secret Key: A secret key used by this endpoint to compute - * the MAC. This SHOULD be a cryptographic quality - * random number with a sufficient length. - * Discussion in [RFC1750] can be helpful in - * selection of the key. - */ - __u8 secret_key[SCTP_SECRET_SIZE]; - - /* digest: This is a digest of the sctp cookie. This field is - * only used on the receive path when we try to validate - * that the cookie has not been tampered with. We put - * this here so we pre-allocate this once and can re-use - * on every receive. - */ - __u8 *digest; - + /* Cookie authentication key used by this endpoint */ + struct hmac_sha256_key cookie_auth_key; + /* sendbuf acct. policy. */ __u32 sndbuf_policy; /* rcvbuf acct. policy. */ __u32 rcvbuf_policy; - /* SCTP AUTH: array of the HMACs that will be allocated - * we need this per association so that we don't serialize - */ - struct crypto_shash **auth_hmacs; - /* SCTP-AUTH: hmacs for the endpoint encoded into parameter */ struct sctp_hmac_algo_param *auth_hmacs_list; @@ -2151,8 +2124,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *, const union sctp_addr *address, const gfp_t gfp, const int peer_state); -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); void sctp_assoc_control_transport(struct sctp_association *asoc, diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 21e7fa2a1813..6f996229167b 100644 --- a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -5,20 +5,47 @@ #include <linux/types.h> struct net; +extern struct net init_net; + +union tcp_seq_and_ts_off { + struct { + u32 seq; + u32 ts_off; + }; + u64 hash64; +}; u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport); -u32 secure_tcp_seq(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr); -u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport); -u32 secure_tcpv6_ts_off(const struct net *net, - const __be32 *saddr, const __be32 *daddr); -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport); -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport); +union tcp_seq_and_ts_off +secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr, + __be16 sport, __be16 dport); + +static inline u32 secure_tcp_seq(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) +{ + union tcp_seq_and_ts_off ts; + + ts = secure_tcp_seq_and_ts_off(&init_net, saddr, daddr, + sport, dport); + + return ts.seq; +} + +union tcp_seq_and_ts_off +secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr, + const __be32 *daddr, + __be16 sport, __be16 dport); + +static inline u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) +{ + union tcp_seq_and_ts_off ts; + + ts = secure_tcpv6_seq_and_ts_off(&init_net, saddr, daddr, + sport, dport); + return ts.seq; +} #endif /* _NET_SECURE_SEQ */ diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h index 24f733b3e3fe..e9f41725933e 100644 --- a/include/net/seg6_hmac.h +++ b/include/net/seg6_hmac.h @@ -9,6 +9,8 @@ #ifndef _NET_SEG6_HMAC_H #define _NET_SEG6_HMAC_H +#include <crypto/sha1.h> +#include <crypto/sha2.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <net/sock.h> @@ -19,7 +21,6 @@ #include <linux/seg6_hmac.h> #include <linux/rhashtable-types.h> -#define SEG6_HMAC_MAX_DIGESTSIZE 160 #define SEG6_HMAC_RING_SIZE 256 struct seg6_hmac_info { @@ -27,16 +28,15 @@ struct seg6_hmac_info { struct rcu_head rcu; u32 hmackeyid; + /* The raw key, kept only so it can be returned back to userspace */ char secret[SEG6_HMAC_SECRET_LEN]; u8 slen; u8 alg_id; -}; - -struct seg6_hmac_algo { - u8 alg_id; - char name[64]; - struct crypto_shash * __percpu *tfms; - struct shash_desc * __percpu *shashs; + /* The prepared key, which the calculations actually use */ + union { + struct hmac_sha1_key sha1; + struct hmac_sha256_key sha256; + } key; }; extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo, @@ -50,13 +50,9 @@ extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr, struct ipv6_sr_hdr *srh); extern bool seg6_hmac_validate_skb(struct sk_buff *skb); #ifdef CONFIG_IPV6_SEG6_HMAC -extern int seg6_hmac_init(void); -extern void seg6_hmac_exit(void); extern int seg6_hmac_net_init(struct net *net); extern void seg6_hmac_net_exit(struct net *net); #else -static inline int seg6_hmac_init(void) { return 0; } -static inline void seg6_hmac_exit(void) {} static inline int seg6_hmac_net_init(struct net *net) { return 0; } static inline void seg6_hmac_net_exit(struct net *net) {} #endif diff --git a/include/net/selftests.h b/include/net/selftests.h index e65e8d230d33..c36e07406ad4 100644 --- a/include/net/selftests.h +++ b/include/net/selftests.h @@ -3,9 +3,48 @@ #define _NET_SELFTESTS #include <linux/ethtool.h> +#include <linux/netdevice.h> + +struct net_packet_attrs { + const unsigned char *src; + const unsigned char *dst; + u32 ip_src; + u32 ip_dst; + bool tcp; + u16 sport; + u16 dport; + int timeout; + int size; + int max_size; + u8 id; + u16 queue_mapping; + bool bad_csum; +}; + +struct net_test_priv { + struct net_packet_attrs *packet; + struct packet_type pt; + struct completion comp; + int double_vlan; + int vlan_id; + int ok; +}; + +struct netsfhdr { + __be32 version; + __be64 magic; + u8 id; +} __packed; + +#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct netsfhdr)) +#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL +#define NET_LB_TIMEOUT msecs_to_jiffies(200) #if IS_ENABLED(CONFIG_NET_SELFTESTS) +struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr); void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf); int net_selftest_get_count(void); @@ -13,6 +52,12 @@ void net_selftest_get_strings(u8 *data); #else +static inline struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr) +{ + return NULL; +} + static inline void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { diff --git a/include/net/smc.h b/include/net/smc.h index db84e4e35080..bfdc4c41f019 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -15,8 +15,10 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/wait.h> -#include "linux/ism.h" +#include <linux/dibs.h> +struct tcp_sock; +struct inet_request_sock; struct sock; #define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */ @@ -27,62 +29,15 @@ struct smc_hashinfo { }; /* SMCD/ISM device driver interface */ -struct smcd_dmb { - u64 dmb_tok; - u64 rgid; - u32 dmb_len; - u32 sba_idx; - u32 vlan_valid; - u32 vlan_id; - void *cpu_addr; - dma_addr_t dma_addr; -}; - -#define ISM_EVENT_DMB 0 -#define ISM_EVENT_GID 1 -#define ISM_EVENT_SWR 2 - #define ISM_RESERVED_VLANID 0x1FFF -#define ISM_ERROR 0xFFFF - -struct smcd_dev; - struct smcd_gid { u64 gid; u64 gid_ext; }; -struct smcd_ops { - int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 vid_valid, u32 vid); - int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, - void *client); - int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, - bool sf, unsigned int offset, void *data, - unsigned int size); - int (*supports_v2)(void); - void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); - u16 (*get_chid)(struct smcd_dev *dev); - struct device* (*get_dev)(struct smcd_dev *dev); - - /* optional operations */ - int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); - int (*set_vlan_required)(struct smcd_dev *dev); - int (*reset_vlan_required)(struct smcd_dev *dev); - int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, - u32 trigger_irq, u32 event_code, u64 info); - int (*support_dmb_nocopy)(struct smcd_dev *dev); - int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); - int (*detach_dmb)(struct smcd_dev *dev, u64 token); -}; - struct smcd_dev { - const struct smcd_ops *ops; - void *priv; - void *client; + struct dibs_dev *dibs; struct list_head list; spinlock_t lock; struct smc_connection **conn; @@ -97,4 +52,55 @@ struct smcd_dev { u8 going_away : 1; }; +#define SMC_HS_CTRL_NAME_MAX 16 + +enum { + /* ops can be inherit from init_net */ + SMC_HS_CTRL_FLAG_INHERITABLE = 0x1, + + SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE, +}; + +struct smc_hs_ctrl { + /* private */ + + struct list_head list; + struct module *owner; + + /* public */ + + /* unique name */ + char name[SMC_HS_CTRL_NAME_MAX]; + int flags; + + /* Invoked before computing SMC option for SYN packets. + * We can control whether to set SMC options by returning various value. + * Return 0 to disable SMC, or return any other value to enable it. + */ + int (*syn_option)(struct tcp_sock *tp); + + /* Invoked before Set up SMC options for SYN-ACK packets + * We can control whether to respond SMC options by returning various + * value. Return 0 to disable SMC, or return any other value to enable + * it. + */ + int (*synack_option)(const struct tcp_sock *tp, + struct inet_request_sock *ireq); +}; + +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) +#define smc_call_hsbpf(init_val, tp, func, ...) ({ \ + typeof(init_val) __ret = (init_val); \ + struct smc_hs_ctrl *ctrl; \ + rcu_read_lock(); \ + ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl); \ + if (ctrl && ctrl->func) \ + __ret = ctrl->func(tp, ##__VA_ARGS__); \ + rcu_read_unlock(); \ + __ret; \ +}) +#else +#define smc_call_hsbpf(init_val, tp, ...) ({ (void)(tp); (init_val); }) +#endif /* CONFIG_SMC_HS_CTRL_BPF */ + #endif /* _SMC_H */ diff --git a/include/net/snmp.h b/include/net/snmp.h index 468a67836e2f..584e70742e9b 100644 --- a/include/net/snmp.h +++ b/include/net/snmp.h @@ -36,11 +36,6 @@ struct snmp_mib { .entry = _entry, \ } -#define SNMP_MIB_SENTINEL { \ - .name = NULL, \ - .entry = 0, \ -} - /* * We use unsigned longs for most mibs but u64 for ipstats. */ @@ -159,7 +154,7 @@ struct linux_tls_mib { #define __SNMP_ADD_STATS64(mib, field, addend) \ do { \ - __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[field] += addend; \ u64_stats_update_end(&ptr->syncp); \ @@ -176,8 +171,7 @@ struct linux_tls_mib { #define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1) #define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \ do { \ - __typeof__(*mib) *ptr; \ - ptr = raw_cpu_ptr((mib)); \ + TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \ u64_stats_update_begin(&ptr->syncp); \ ptr->mibs[basefield##PKTS]++; \ ptr->mibs[basefield##OCTETS] += addend; \ diff --git a/include/net/sock.h b/include/net/sock.h index 7ef728324e4e..dccd3738c368 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -81,8 +81,13 @@ * mini-semaphore synchronizes multiple users amongst themselves. */ typedef struct { - spinlock_t slock; - int owned; + union { + struct slock_owned { + int owned; + spinlock_t slock; + }; + long combined; + }; wait_queue_head_t wq; /* * We express the mutex-alike socket_lock semantics @@ -118,16 +123,17 @@ typedef __u64 __bitwise __addrpair; * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting + * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables - * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol + * @skc_portaddr_node: second hash linkage for UDP * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket * @skc_v6_daddr: IPV6 destination address * @skc_v6_rcv_saddr: IPV6 source address * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables - * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol + * @skc_nulls_node: main hash linkage for TCP * @skc_tx_queue_mapping: tx queue number for this connection * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags @@ -174,6 +180,7 @@ struct sock_common { unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; + unsigned char skc_bypass_prot_mem:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; @@ -249,6 +256,7 @@ struct sk_filter; * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy + * @psp_assoc: PSP association, if socket is PSP-secured * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags @@ -282,9 +290,11 @@ struct sk_filter; * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter + * @sk_drop_counters: optional pointer to numa_drop_counters * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_ino: inode number (zero if orphaned) * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting @@ -300,15 +310,19 @@ struct sk_filter; * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer + * @tcp_retransmit_timer: tcp retransmit timer + * @mptcp_retransmit_timer: mptcp retransmit timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_bpf_cb_flags: used in bpf_setsockopt() * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. * Sockets that can be used under memory reclaim should * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests + * @sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh. * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. @@ -332,12 +346,21 @@ struct sk_filter; * @sk_reuseport_cb: reuseport group container * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period + * @sk_freeptr: used for SLAB_TYPESAFE_BY_RCU managed sockets * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags + * @sk_scm_recv_flags: all flags used by scm_recv() + * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS + * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY + * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD + * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS + * @sk_scm_unused: unused flags for scm_recv() * @ns_tracker: tracker for netns reference * @sk_user_frags: xarray of pages the user is holding a reference on. + * @sk_owner: reference to the real owner of the socket that calls + * sock_lock_init_class_and_name(). */ struct sock { /* @@ -368,6 +391,7 @@ struct sock { #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt +#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot @@ -434,10 +458,15 @@ struct sock { __cacheline_group_begin(sock_read_rxtx); int sk_err; struct socket *sk_socket; +#ifdef CONFIG_MEMCG struct mem_cgroup *sk_memcg; +#endif #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif +#if IS_ENABLED(CONFIG_INET_PSP) + struct psp_assoc __rcu *psp_assoc; +#endif __cacheline_group_end(sock_read_rxtx); __cacheline_group_begin(sock_write_rxtx); @@ -450,7 +479,7 @@ struct sock { __cacheline_group_begin(sock_write_tx); int sk_write_pending; atomic_t sk_omem_alloc; - int sk_sndbuf; + int sk_err_soft; int sk_wmem_queued; refcount_t sk_wmem_alloc; @@ -460,21 +489,28 @@ struct sock { struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; - u32 sk_dst_pending_confirm; - u32 sk_pacing_status; /* see enum sk_pacing */ struct page_frag sk_frag; - struct timer_list sk_timer; - + union { + struct timer_list sk_timer; + struct timer_list tcp_retransmit_timer; + struct timer_list mptcp_retransmit_timer; + }; unsigned long sk_pacing_rate; /* bytes per second */ atomic_t sk_zckey; atomic_t sk_tskey; + unsigned long sk_tx_queue_mapping_jiffies; __cacheline_group_end(sock_write_tx); __cacheline_group_begin(sock_read_tx); + u32 sk_dst_pending_confirm; + u32 sk_pacing_status; /* see enum sk_pacing */ unsigned long sk_max_pacing_rate; long sk_sndtimeo; u32 sk_priority; u32 sk_mark; + kuid_t sk_uid; + u16 sk_protocol; + u16 sk_type; struct dst_entry __rcu *sk_dst_cache; netdev_features_t sk_route_caps; #ifdef CONFIG_SOCK_VALIDATE_XMIT @@ -487,6 +523,7 @@ struct sock { unsigned int sk_gso_max_size; gfp_t sk_allocation; u32 sk_txhash; + int sk_sndbuf; u8 sk_pacing_shift; bool sk_use_task_frag; __cacheline_group_end(sock_read_tx); @@ -500,15 +537,12 @@ struct sock { sk_no_check_tx : 1, sk_no_check_rx : 1; u8 sk_shutdown; - u16 sk_type; - u16 sk_protocol; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; - int sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; - kuid_t sk_uid; + u64 sk_ino; spinlock_t sk_peer_lock; int sk_bind_phc; struct pid *sk_peer_pid; @@ -520,11 +554,23 @@ struct sock { #endif int sk_disconnects; - u8 sk_txrehash; + union { + u8 sk_txrehash; + u8 sk_scm_recv_flags; + struct { + u8 sk_scm_credentials : 1, + sk_scm_security : 1, + sk_scm_pidfd : 1, + sk_scm_rights : 1, + sk_scm_unused : 4; + }; + }; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG)) + u8 sk_bpf_cb_flags; void *sk_user_data; #ifdef CONFIG_SECURITY @@ -541,9 +587,21 @@ struct sock { #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif - struct rcu_head sk_rcu; + struct numa_drop_counters *sk_drop_counters; + /* sockets using SLAB_TYPESAFE_BY_RCU can use sk_freeptr. + * By the time kfree() is called, sk_rcu can not be in + * use and can be mangled. + */ + union { + struct rcu_head sk_rcu; + freeptr_t sk_freeptr; + }; netns_tracker ns_tracker; struct xarray sk_user_frags; + +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) + struct module *sk_owner; +#endif }; struct sock_bh_locked { @@ -793,11 +851,9 @@ static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); - if (rc) { - /* paranoid for a while -acme */ - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (rc) __sock_put(sk); - } + return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) @@ -815,14 +871,25 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); - if (rc) { - /* paranoid for a while -acme */ - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (rc) __sock_put(sk); - } + return rc; } +static inline bool sk_nulls_replace_node_init_rcu(struct sock *old, + struct sock *new) +{ + if (sk_hashed(old)) { + hlist_nulls_replace_init_rcu(&old->sk_nulls_node, + &new->sk_nulls_node); + __sock_put(old); + return true; + } + + return false; +} + static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); @@ -954,6 +1021,7 @@ enum sock_flags { SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */ SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */ + SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @@ -1224,10 +1292,10 @@ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int addr_len); int (*connect)(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); @@ -1253,12 +1321,12 @@ struct proto { int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, - size_t len, int flags, int *addr_len); + size_t len, int flags); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, - struct sockaddr *addr, int addr_len); + struct sockaddr_unsized *addr, int addr_len); int (*bind_add)(struct sock *sk, - struct sockaddr *addr, int addr_len); + struct sockaddr_unsized *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); @@ -1284,10 +1352,6 @@ struct proto { unsigned int inuse_idx; #endif -#if IS_ENABLED(CONFIG_MPTCP) - int (*forward_alloc_get)(const struct sock *sk); -#endif - bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*sock_is_readable)(struct sock *sk); /* Memory pressure */ @@ -1317,19 +1381,17 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; + unsigned int freeptr_offset; unsigned int ipv6_pinfo_offset; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ - unsigned int __percpu *orphan_count; - struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; union { struct inet_hashinfo *hashinfo; - struct udp_table *udp_table; struct raw_hashinfo *raw_hash; struct smc_hashinfo *smc_hash; } h; @@ -1348,15 +1410,6 @@ int sock_load_diag_module(int family, int protocol); INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); -static inline int sk_forward_alloc_get(const struct sock *sk) -{ -#if IS_ENABLED(CONFIG_MPTCP) - if (sk->sk_prot->forward_alloc_get) - return sk->sk_prot->forward_alloc_get(sk); -#endif - return READ_ONCE(sk->sk_forward_alloc); -} - static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) @@ -1473,6 +1526,10 @@ static inline int __sk_prot_rehash(struct sock *sk) #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 +/** + * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time + */ +#define SOCK_CONNECT_BIND 16 struct socket_alloc { struct socket socket; @@ -1540,7 +1597,7 @@ __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size) { return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } @@ -1592,6 +1649,37 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) sk_mem_reclaim(sk); } +void __sk_charge(struct sock *sk, gfp_t gfp); + +#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ + __module_get(owner); + sk->sk_owner = owner; +} + +static inline void sk_owner_clear(struct sock *sk) +{ + sk->sk_owner = NULL; +} + +static inline void sk_owner_put(struct sock *sk) +{ + module_put(sk->sk_owner); +} +#else +static inline void sk_owner_set(struct sock *sk, struct module *owner) +{ +} + +static inline void sk_owner_clear(struct sock *sk) +{ +} + +static inline void sk_owner_put(struct sock *sk) +{ +} +#endif /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. @@ -1601,13 +1689,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ + sk_owner_set(sk, THIS_MODULE); \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ - sizeof((sk)->sk_lock)); \ + sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ - (skey), (sname)); \ + (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) @@ -1624,7 +1713,6 @@ static inline void lock_sock(struct sock *sk) lock_sock_nested(sk, 0); } -void __lock_sock(struct sock *sk); void __release_sock(struct sock *sk); void release_sock(struct sock *sk); @@ -1753,8 +1841,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); -struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); -void sk_free_unlock_clone(struct sock *sk); +struct sock *sk_clone(const struct sock *sk, const gfp_t priority, bool lock); + +static inline struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +{ + return sk_clone(sk, priority, true); +} struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); @@ -1806,6 +1898,8 @@ static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk, } void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); @@ -1823,12 +1917,14 @@ struct sockcm_cookie { u32 tsflags; u32 ts_opt_id; u32 priority; + u32 dmabuf_id; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { + .mark = READ_ONCE(sk->sk_mark), .tsflags = READ_ONCE(sk->sk_tsflags), .priority = READ_ONCE(sk->sk_priority), }; @@ -1843,8 +1939,8 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ -int sock_no_bind(struct socket *, struct sockaddr *, int); -int sock_no_connect(struct socket *, struct sockaddr *, int, int); +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len); +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); @@ -1934,7 +2030,15 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ - WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); + if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) { + WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); + return; + } + + /* Refresh sk_tx_queue_mapping_jiffies if too old. */ + if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ)) + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); } #define NO_QUEUE_MAPPING USHRT_MAX @@ -1947,19 +2051,7 @@ static inline void sk_tx_queue_clear(struct sock *sk) WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); } -static inline int sk_tx_queue_get(const struct sock *sk) -{ - if (sk) { - /* Paired with WRITE_ONCE() in sk_tx_queue_clear() - * and sk_tx_queue_set(). - */ - int val = READ_ONCE(sk->sk_tx_queue_mapping); - - if (val != NO_QUEUE_MAPPING) - return val; - } - return -1; -} +int sk_tx_queue_get(const struct sock *sk); static inline void __sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb, @@ -2009,7 +2101,14 @@ static inline int sk_rx_queue_get(const struct sock *sk) static inline void sk_set_socket(struct sock *sk, struct socket *sock) { - sk->sk_socket = sock; + WRITE_ONCE(sk->sk_socket, sock); + if (sock) { + WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid); + WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino); + } else { + /* Note: sk_uid is unchanged. */ + WRITE_ONCE(sk->sk_ino, 0); + } } static inline wait_queue_head_t *sk_sleep(struct sock *sk) @@ -2040,18 +2139,25 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); - sk->sk_uid = SOCK_INODE(parent)->i_uid; security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } -kuid_t sock_i_uid(struct sock *sk); -unsigned long __sock_i_ino(struct sock *sk); -unsigned long sock_i_ino(struct sock *sk); +static inline u64 sock_i_ino(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */ + return READ_ONCE(sk->sk_ino); +} + +static inline kuid_t sk_uid(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sockfs_setattr() */ + return READ_ONCE(sk->sk_uid); +} static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { - return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); + return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) @@ -2231,6 +2337,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro return 0; } +#define SK_WMEM_ALLOC_BIAS 1 /** * sk_wmem_alloc_get - returns write allocations * @sk: socket @@ -2239,7 +2346,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro */ static inline int sk_wmem_alloc_get(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) - 1; + return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS; } /** @@ -2395,12 +2502,23 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb)); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); -int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, - enum skb_drop_reason *reason); +enum skb_drop_reason +sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb); static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - return sock_queue_rcv_skb_reason(sk, skb, NULL); + enum skb_drop_reason drop_reason = sock_queue_rcv_skb_reason(sk, skb); + + switch (drop_reason) { + case SKB_DROP_REASON_SOCKET_RCVBUFF: + return -ENOMEM; + case SKB_DROP_REASON_PROTO_MEM: + return -ENOBUFS; + case 0: + return 0; + default: + return -EPERM; + } } int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); @@ -2524,12 +2642,16 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); +static inline bool __sock_writeable(const struct sock *sk, int wmem_alloc) +{ + return wmem_alloc < (READ_ONCE(sk->sk_sndbuf) >> 1); +} /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); + return __sock_writeable(sk, refcount_read(&sk->sk_wmem_alloc)); } static inline gfp_t gfp_any(void) @@ -2542,14 +2664,62 @@ static inline gfp_t gfp_memcg_charge(void) return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return sk->sk_memcg; +} + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk); +} + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + struct mem_cgroup *memcg = mem_cgroup_from_sk(sk); + +#ifdef CONFIG_MEMCG_V1 + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return !!memcg->tcpmem_pressure; +#endif /* CONFIG_MEMCG_V1 */ + + do { + if (time_before64(get_jiffies_64(), + mem_cgroup_get_socket_pressure(memcg))) { + memcg_memory_event(mem_cgroup_from_sk(sk), + MEMCG_SOCK_THROTTLED); + return true; + } + } while ((memcg = parent_mem_cgroup(memcg))); + + return false; +} +#else +static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk) +{ + return NULL; +} + +static inline bool mem_cgroup_sk_enabled(const struct sock *sk) +{ + return false; +} + +static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) +{ + return false; +} +#endif + static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_rcvtimeo; + return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_sndtimeo; + return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo); } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) @@ -2575,8 +2745,8 @@ struct sock_skb_cb { * using skb->cb[] would keep using it directly and utilize its * alignment guarantee. */ -#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ - sizeof(struct sock_skb_cb))) +#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \ + sizeof(struct sock_skb_cb)) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) @@ -2584,18 +2754,53 @@ struct sock_skb_cb { #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) +static inline void sk_drops_add(struct sock *sk, int segs) +{ + struct numa_drop_counters *ndc = sk->sk_drop_counters; + + if (ndc) + numa_drop_add(ndc, segs); + else + atomic_add(segs, &sk->sk_drops); +} + +static inline void sk_drops_inc(struct sock *sk) +{ + sk_drops_add(sk, 1); +} + +static inline int sk_drops_read(const struct sock *sk) +{ + const struct numa_drop_counters *ndc = sk->sk_drop_counters; + + if (ndc) { + DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops)); + return numa_drop_read(ndc); + } + return atomic_read(&sk->sk_drops); +} + +static inline void sk_drops_reset(struct sock *sk) +{ + struct numa_drop_counters *ndc = sk->sk_drop_counters; + + if (ndc) + numa_drop_reset(ndc); + atomic_set(&sk->sk_drops, 0); +} + static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? - atomic_read(&sk->sk_drops) : 0; + sk_drops_read(sk) : 0; } -static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) +static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); - atomic_add(segs, &sk->sk_drops); + sk_drops_add(sk, segs); } static inline ktime_t sock_read_timestamp(struct sock *sk) @@ -2631,6 +2836,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); +bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk); +int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, + struct timespec64 *ts); + static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { @@ -2665,13 +2874,13 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, { #define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP) | \ - (1UL << SOCK_RCVMARK) |\ - (1UL << SOCK_RCVPRIORITY)) + (1UL << SOCK_RCVMARK) | \ + (1UL << SOCK_RCVPRIORITY) | \ + (1UL << SOCK_TIMESTAMPING_ANY)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) - if (sk->sk_flags & FLAGS_RECV_CMSGS || - READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) + if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS) __sock_recv_cmsgs(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); @@ -2706,8 +2915,6 @@ static inline void _sock_tx_timestamp(struct sock *sk, *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } } - if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) - *tx_flags |= SKBTX_WIFI_STATUS; } static inline void sock_tx_timestamp(struct sock *sk, @@ -2745,9 +2952,14 @@ static inline bool sk_is_udp(const struct sock *sk) sk->sk_protocol == IPPROTO_UDP; } +static inline bool sk_is_unix(const struct sock *sk) +{ + return sk->sk_family == AF_UNIX; +} + static inline bool sk_is_stream_unix(const struct sock *sk) { - return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM; + return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM; } static inline bool sk_is_vsock(const struct sock *sk) @@ -2755,6 +2967,13 @@ static inline bool sk_is_vsock(const struct sock *sk) return sk->sk_family == AF_VSOCK; } +static inline bool sk_may_scm_recv(const struct sock *sk) +{ + return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) || + sk->sk_family == AF_NETLINK || + (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH); +} + /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat this skb from @@ -2794,26 +3013,10 @@ sk_is_refcounted(struct sock *sk) return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } -/* Checks if this SKB belongs to an HW offloaded socket - * and whether any SW fallbacks are required based on dev. - * Check decrypted mark in case skb_orphan() cleared socket. - */ -static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, - struct net_device *dev) +static inline bool +sk_requests_wifi_status(struct sock *sk) { -#ifdef CONFIG_SOCK_VALIDATE_XMIT - struct sock *sk = skb->sk; - - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { - skb = sk->sk_validate_xmit_skb(sk, dev, skb); - } else if (unlikely(skb_is_decrypted(skb))) { - pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); - kfree_skb(skb); - skb = NULL; - } -#endif - - return skb; + return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS); } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV @@ -2852,8 +3055,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo); */ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) -#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) -#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; @@ -2920,7 +3123,13 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); -void sock_enable_timestamps(struct sock *sk); +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); +#else +static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ +} +#endif void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); @@ -2930,7 +3139,7 @@ void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); -int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); +int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, @@ -2941,8 +3150,11 @@ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { - if (sk->sk_prot->sock_is_readable) - return sk->sk_prot->sock_is_readable(sk); + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->sock_is_readable) + return prot->sock_is_readable(sk); + return false; } #endif /* _SOCK_H */ diff --git a/include/net/strparser.h b/include/net/strparser.h index 0a83010b3a64..0ed73e364faa 100644 --- a/include/net/strparser.h +++ b/include/net/strparser.h @@ -114,8 +114,6 @@ static inline void strp_pause(struct strparser *strp) /* May be called without holding lock for attached socket */ void strp_unpause(struct strparser *strp); -/* Must be called with process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp); static inline void save_strp_stats(struct strparser *strp, struct strp_aggr_stats *agg_stats) diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 8346b0d29542..ee500706496b 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -15,6 +15,7 @@ #define SWITCHDEV_F_NO_RECURSE BIT(0) #define SWITCHDEV_F_SKIP_EOPNOTSUPP BIT(1) #define SWITCHDEV_F_DEFER BIT(2) +#define SWITCHDEV_F_NO_FOREIGN BIT(3) enum switchdev_attr_id { SWITCHDEV_ATTR_ID_UNDEFINED, diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h index e8dd77a96748..a5ce83f3eea4 100644 --- a/include/net/tc_act/tc_connmark.h +++ b/include/net/tc_act/tc_connmark.h @@ -7,6 +7,7 @@ struct tcf_connmark_parms { struct net *net; u16 zone; + int action; struct rcu_head rcu; }; diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 68269e4581b7..8d0c7a9f9345 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -8,6 +8,7 @@ struct tcf_csum_params { u32 update_flags; + int action; struct rcu_head rcu; }; @@ -18,15 +19,6 @@ struct tcf_csum { }; #define to_tcf_csum(a) ((struct tcf_csum *)a) -static inline bool is_tcf_csum(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_CSUM) - return true; -#endif - return false; -} - static inline u32 tcf_csum_update_flags(const struct tc_action *a) { u32 update_flags; diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 77f87c622a2e..8b90c86c0b0d 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -13,7 +13,7 @@ struct tcf_ct_params { struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; - + int action; u32 mark; u32 mark_mask; @@ -92,13 +92,4 @@ static inline void tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { } #endif -static inline bool is_tcf_ct(const struct tc_action *a) -{ -#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK) - if (a->ops && a->ops->id == TCA_ID_CT) - return true; -#endif - return false; -} - #endif /* __NET_TC_CT_H */ diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h index f071c1d70a25..7fe01ab236da 100644 --- a/include/net/tc_act/tc_ctinfo.h +++ b/include/net/tc_act/tc_ctinfo.h @@ -7,6 +7,7 @@ struct tcf_ctinfo_params { struct rcu_head rcu; struct net *net; + int action; u32 dscpmask; u32 dscpstatemask; u32 cpmarkmask; @@ -18,9 +19,9 @@ struct tcf_ctinfo_params { struct tcf_ctinfo { struct tc_action common; struct tcf_ctinfo_params __rcu *params; - u64 stats_dscp_set; - u64 stats_dscp_error; - u64 stats_cpmark_set; + atomic64_t stats_dscp_set; + atomic64_t stats_dscp_error; + atomic64_t stats_cpmark_set; }; enum { diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h index c8fa11ebb397..e0fded18e18c 100644 --- a/include/net/tc_act/tc_gate.h +++ b/include/net/tc_act/tc_gate.h @@ -32,6 +32,7 @@ struct tcf_gate_params { s32 tcfg_clockid; size_t num_entries; struct list_head entries; + struct rcu_head rcu; }; #define GATE_ACT_GATE_OPEN BIT(0) @@ -39,7 +40,7 @@ struct tcf_gate_params { struct tcf_gate { struct tc_action common; - struct tcf_gate_params param; + struct tcf_gate_params __rcu *param; u8 current_gate_status; ktime_t current_close_time; u32 current_entry_octets; @@ -51,56 +52,65 @@ struct tcf_gate { #define to_gate(a) ((struct tcf_gate *)a) -static inline bool is_tcf_gate(const struct tc_action *a) +static inline struct tcf_gate_params *tcf_gate_params_locked(const struct tc_action *a) { -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_GATE) - return true; -#endif - return false; + struct tcf_gate *gact = to_gate(a); + + return rcu_dereference_protected(gact->param, + lockdep_is_held(&gact->tcf_lock)); } static inline s32 tcf_gate_prio(const struct tc_action *a) { + struct tcf_gate_params *p; s32 tcfg_prio; - tcfg_prio = to_gate(a)->param.tcfg_priority; + p = tcf_gate_params_locked(a); + tcfg_prio = p->tcfg_priority; return tcfg_prio; } static inline u64 tcf_gate_basetime(const struct tc_action *a) { + struct tcf_gate_params *p; u64 tcfg_basetime; - tcfg_basetime = to_gate(a)->param.tcfg_basetime; + p = tcf_gate_params_locked(a); + tcfg_basetime = p->tcfg_basetime; return tcfg_basetime; } static inline u64 tcf_gate_cycletime(const struct tc_action *a) { + struct tcf_gate_params *p; u64 tcfg_cycletime; - tcfg_cycletime = to_gate(a)->param.tcfg_cycletime; + p = tcf_gate_params_locked(a); + tcfg_cycletime = p->tcfg_cycletime; return tcfg_cycletime; } static inline u64 tcf_gate_cycletimeext(const struct tc_action *a) { + struct tcf_gate_params *p; u64 tcfg_cycletimeext; - tcfg_cycletimeext = to_gate(a)->param.tcfg_cycletime_ext; + p = tcf_gate_params_locked(a); + tcfg_cycletimeext = p->tcfg_cycletime_ext; return tcfg_cycletimeext; } static inline u32 tcf_gate_num_entries(const struct tc_action *a) { + struct tcf_gate_params *p; u32 num_entries; - num_entries = to_gate(a)->param.num_entries; + p = tcf_gate_params_locked(a); + num_entries = p->num_entries; return num_entries; } @@ -114,7 +124,7 @@ static inline struct action_gate_entry u32 num_entries; int i = 0; - p = &to_gate(a)->param; + p = tcf_gate_params_locked(a); num_entries = p->num_entries; list_for_each_entry(entry, &p->entries, list) @@ -123,7 +133,7 @@ static inline struct action_gate_entry if (i != num_entries) return NULL; - oe = kcalloc(num_entries, sizeof(*oe), GFP_ATOMIC); + oe = kzalloc_objs(*oe, num_entries, GFP_ATOMIC); if (!oe) return NULL; diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index c7f24a2da1ca..24d4d5a62b3c 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -13,15 +13,13 @@ struct tcf_ife_params { u8 eth_src[ETH_ALEN]; u16 eth_type; u16 flags; - + struct list_head metalist; struct rcu_head rcu; }; struct tcf_ife_info { struct tc_action common; struct tcf_ife_params __rcu *params; - /* list of metaids allowed */ - struct list_head metalist; }; #define to_ife(a) ((struct tcf_ife_info *)a) diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h index 721de4f5733a..dd067bd4018d 100644 --- a/include/net/tc_act/tc_mpls.h +++ b/include/net/tc_act/tc_mpls.h @@ -10,6 +10,7 @@ struct tcf_mpls_params { int tcfm_action; u32 tcfm_label; + int action; /* tcf_action */ u8 tcfm_tc; u8 tcfm_ttl; u8 tcfm_bos; @@ -27,15 +28,6 @@ struct tcf_mpls { }; #define to_mpls(a) ((struct tcf_mpls *)a) -static inline bool is_tcf_mpls(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_MPLS) - return true; -#endif - return false; -} - static inline u32 tcf_mpls_action(const struct tc_action *a) { u32 tcfm_action; diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h index c869274ac529..ae35f4009445 100644 --- a/include/net/tc_act/tc_nat.h +++ b/include/net/tc_act/tc_nat.h @@ -6,6 +6,7 @@ #include <net/act_api.h> struct tcf_nat_parms { + int action; __be32 old_addr; __be32 new_addr; __be32 mask; diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 83fe39931781..cb7b82f2cbc7 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -14,7 +14,7 @@ struct tcf_pedit_key_ex { struct tcf_pedit_parms { struct tc_pedit_key *tcfp_keys; struct tcf_pedit_key_ex *tcfp_keys_ex; - u32 tcfp_off_max_hint; + int action; unsigned char tcfp_nkeys; unsigned char tcfp_flags; struct rcu_head rcu; diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index 283bde711a42..a89fc8e68b1e 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -5,10 +5,11 @@ #include <net/act_api.h> struct tcf_police_params { + int action; int tcfp_result; u32 tcfp_ewma_rate; - s64 tcfp_burst; u32 tcfp_mtu; + s64 tcfp_burst; s64 tcfp_mtu_ptoks; s64 tcfp_pkt_burst; struct psched_ratecfg rate; @@ -44,15 +45,6 @@ struct tc_police_compat { struct tc_ratespec peakrate; }; -static inline bool is_tcf_police(const struct tc_action *act) -{ -#ifdef CONFIG_NET_CLS_ACT - if (act->ops && act->ops->id == TCA_ID_POLICE) - return true; -#endif - return false; -} - static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act) { struct tcf_police *police = to_police(act); diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h index b5d76305e854..abd163ca1864 100644 --- a/include/net/tc_act/tc_sample.h +++ b/include/net/tc_act/tc_sample.h @@ -17,15 +17,6 @@ struct tcf_sample { }; #define to_sample(a) ((struct tcf_sample *)a) -static inline bool is_tcf_sample(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - return a->ops && a->ops->id == TCA_ID_SAMPLE; -#else - return false; -#endif -} - static inline __u32 tcf_sample_rate(const struct tc_action *a) { return to_sample(a)->rate; diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 9649600fb3dc..31b2cd0bebb5 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -12,6 +12,7 @@ #include <linux/tc_act/tc_skbedit.h> struct tcf_skbedit_params { + int action; u32 flags; u32 priority; u32 mark; diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h index 7c240d2fed4e..626704cd6241 100644 --- a/include/net/tc_act/tc_skbmod.h +++ b/include/net/tc_act/tc_skbmod.h @@ -12,6 +12,7 @@ struct tcf_skbmod_params { struct rcu_head rcu; u64 flags; /*up to 64 types of operations; extend if needed */ + int action; u8 eth_dst[ETH_ALEN]; u16 eth_type; u8 eth_src[ETH_ALEN]; diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index 879fe8cff581..0f1925f97520 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -14,6 +14,7 @@ struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; + int action; struct metadata_dst *tcft_enc_metadata; }; diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 904eddfc1826..beadee41669a 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -10,6 +10,7 @@ #include <linux/tc_act/tc_vlan.h> struct tcf_vlan_params { + int action; int tcfv_action; unsigned char tcfv_push_dst[ETH_ALEN]; unsigned char tcfv_push_src[ETH_ALEN]; @@ -26,15 +27,6 @@ struct tcf_vlan { }; #define to_vlan(a) ((struct tcf_vlan *)a) -static inline bool is_tcf_vlan(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_VLAN) - return true; -#endif - return false; -} - static inline u32 tcf_vlan_action(const struct tc_action *a) { u32 tcfv_action; diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h index ffe58a02537c..4ebb053bb0dd 100644 --- a/include/net/tc_wrapper.h +++ b/include/net/tc_wrapper.h @@ -12,7 +12,8 @@ #define TC_INDIRECT_SCOPE -extern struct static_key_false tc_skip_wrapper; +extern struct static_key_false tc_skip_wrapper_act; +extern struct static_key_false tc_skip_wrapper_cls; /* TC Actions */ #ifdef CONFIG_NET_CLS_ACT @@ -46,7 +47,7 @@ TC_INDIRECT_ACTION_DECLARE(tunnel_key_act); static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { - if (static_branch_likely(&tc_skip_wrapper)) + if (static_branch_likely(&tc_skip_wrapper_act)) goto skip; #if IS_BUILTIN(CONFIG_NET_ACT_GACT) @@ -153,7 +154,7 @@ TC_INDIRECT_FILTER_DECLARE(u32_classify); static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { - if (static_branch_likely(&tc_skip_wrapper)) + if (static_branch_likely(&tc_skip_wrapper_cls)) goto skip; #if IS_BUILTIN(CONFIG_NET_CLS_BPF) @@ -202,8 +203,44 @@ skip: static inline void tc_wrapper_init(void) { #ifdef CONFIG_X86 - if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) - static_branch_enable(&tc_skip_wrapper); + int cnt_cls = IS_BUILTIN(CONFIG_NET_CLS_BPF) + + IS_BUILTIN(CONFIG_NET_CLS_U32) + + IS_BUILTIN(CONFIG_NET_CLS_FLOWER) + + IS_BUILTIN(CONFIG_NET_CLS_FW) + + IS_BUILTIN(CONFIG_NET_CLS_MATCHALL) + + IS_BUILTIN(CONFIG_NET_CLS_BASIC) + + IS_BUILTIN(CONFIG_NET_CLS_CGROUP) + + IS_BUILTIN(CONFIG_NET_CLS_FLOW) + + IS_BUILTIN(CONFIG_NET_CLS_ROUTE4); + + int cnt_act = IS_BUILTIN(CONFIG_NET_ACT_GACT) + + IS_BUILTIN(CONFIG_NET_ACT_MIRRED) + + IS_BUILTIN(CONFIG_NET_ACT_PEDIT) + + IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT) + + IS_BUILTIN(CONFIG_NET_ACT_SKBMOD) + + IS_BUILTIN(CONFIG_NET_ACT_POLICE) + + IS_BUILTIN(CONFIG_NET_ACT_BPF) + + IS_BUILTIN(CONFIG_NET_ACT_CONNMARK) + + IS_BUILTIN(CONFIG_NET_ACT_CSUM) + + IS_BUILTIN(CONFIG_NET_ACT_CT) + + IS_BUILTIN(CONFIG_NET_ACT_CTINFO) + + IS_BUILTIN(CONFIG_NET_ACT_GATE) + + IS_BUILTIN(CONFIG_NET_ACT_MPLS) + + IS_BUILTIN(CONFIG_NET_ACT_NAT) + + IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY) + + IS_BUILTIN(CONFIG_NET_ACT_VLAN) + + IS_BUILTIN(CONFIG_NET_ACT_IFE) + + IS_BUILTIN(CONFIG_NET_ACT_SIMP) + + IS_BUILTIN(CONFIG_NET_ACT_SAMPLE); + + if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + return; + + if (cnt_cls > 1) + static_branch_enable(&tc_skip_wrapper_cls); + + if (cnt_act > 1) + static_branch_enable(&tc_skip_wrapper_act); #endif } diff --git a/include/net/tcp.h b/include/net/tcp.h index 2d08473a6dc0..98848db62894 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -26,6 +26,7 @@ #include <linux/kref.h> #include <linux/ktime.h> #include <linux/indirect_call_wrapper.h> +#include <linux/bits.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> @@ -42,6 +43,7 @@ #include <net/dst.h> #include <net/mptcp.h> #include <net/xfrm.h> +#include <net/secure_seq.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> @@ -53,7 +55,15 @@ extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); -DECLARE_PER_CPU(u32, tcp_tw_isn); +static inline void tcp_orphan_count_inc(void) +{ + this_cpu_inc(tcp_orphan_count); +} + +static inline void tcp_orphan_count_dec(void) +{ + this_cpu_dec(tcp_orphan_count); +} void tcp_time_wait(struct sock *sk, int state, int timeo); @@ -89,6 +99,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U +/* Default sending frequency of accurate ECN option per RTT */ +#define TCP_ACCECN_OPTION_BEACON 3 + /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 @@ -144,8 +157,9 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif -#define TCP_RTO_MAX ((unsigned)(120*HZ)) -#define TCP_RTO_MIN ((unsigned)(HZ/5)) +#define TCP_RTO_MAX_SEC 120 +#define TCP_RTO_MAX ((unsigned)(TCP_RTO_MAX_SEC * HZ)) +#define TCP_RTO_MIN ((unsigned)(HZ / 5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ @@ -201,6 +215,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ +#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt @@ -218,6 +234,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 +#define TCPOLEN_ACCECN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 @@ -231,6 +248,14 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX); #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 +#define TCPOLEN_ACCECN_PERFIELD 3 + +/* Maximum number of byte counters in AccECN option + size */ +#define TCP_ACCECN_NUMFIELDS 3 +#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ + TCPOLEN_ACCECN_PERFIELD * \ + TCP_ACCECN_NUMFIELDS) +#define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */ /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ @@ -265,7 +290,6 @@ extern long sysctl_tcp_mem[3]; #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ -extern atomic_long_t tcp_memory_allocated; DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); extern struct percpu_counter tcp_sockets_allocated; @@ -274,10 +298,13 @@ extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) + if (mem_cgroup_sk_enabled(sk) && + mem_cgroup_sk_under_memory_pressure(sk)) return true; + if (sk->sk_bypass_prot_mem) + return false; + return READ_ONCE(tcp_memory_pressure); } /* @@ -319,13 +346,21 @@ extern struct proto tcp_prot; #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) -void tcp_tasklet_init(void); +/* + * TCP splice context + */ +struct tcp_splice_state { + struct pipe_inode_info *pipe; + size_t len; + unsigned int flags; +}; + +void tcp_tsq_work_init(void); int tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); -int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); void tcp_remove_empty_skb(struct sock *sk); @@ -338,17 +373,34 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags); int tcp_wmem_schedule(struct sock *sk, int copy); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); + void tcp_release_cb(struct sock *sk); + +static inline bool tcp_release_cb_cond(struct sock *sk) +{ +#ifdef CONFIG_INET + if (likely(sk->sk_prot->release_cb == tcp_release_cb)) { + if (unlikely(smp_load_acquire(&sk->sk_tsq_flags) & TCP_DEFERRED_ALL)) + tcp_release_cb(sk); + return true; + } +#endif + return false; +} + void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, int *karg); enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); +void tcp_rcvbuf_grow(struct sock *sk, u32 newval); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); void tcp_twsk_purge(struct list_head *net_exit_list); +int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); @@ -372,31 +424,71 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) } } -#define TCP_ECN_OK 1 -#define TCP_ECN_QUEUE_CWR 2 -#define TCP_ECN_DEMAND_CWR 4 -#define TCP_ECN_SEEN 8 +#define TCP_ECN_MODE_RFC3168 BIT(0) +#define TCP_ECN_QUEUE_CWR BIT(1) +#define TCP_ECN_DEMAND_CWR BIT(2) +#define TCP_ECN_SEEN BIT(3) +#define TCP_ECN_MODE_ACCECN BIT(4) + +#define TCP_ECN_DISABLED 0 +#define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) +#define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + +static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) +{ + return tp->ecn_flags & TCP_ECN_MODE_ANY; +} + +static inline bool tcp_ecn_mode_rfc3168(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_RFC3168; +} + +static inline bool tcp_ecn_mode_accecn(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_ACCECN; +} + +static inline bool tcp_ecn_disabled(const struct tcp_sock *tp) +{ + return !tcp_ecn_mode_any(tp); +} + +static inline bool tcp_ecn_mode_pending(const struct tcp_sock *tp) +{ + return (tp->ecn_flags & TCP_ECN_MODE_PENDING) == TCP_ECN_MODE_PENDING; +} + +static inline void tcp_ecn_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->ecn_flags &= ~TCP_ECN_MODE_ANY; + tp->ecn_flags |= mode; +} enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, - TCP_TW_SYN = 3 + TCP_TW_SYN = 3, + TCP_TW_ACK_OOW = 4 }; enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th, - u32 *tw_isn); + u32 *tw_isn, + enum skb_drop_reason *drop_reason); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, - bool *lost_race); + bool *lost_race, enum skb_drop_reason *drop_reason); enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); +void tcp_update_pacing_rate(struct sock *sk); +void tcp_set_rto(struct sock *sk); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); @@ -416,14 +508,23 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); +void tcp_reset_keepalive_timer(struct sock *sk, unsigned long timeout); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, - int flags, int *addr_len); + int flags); int tcp_set_rcvlowat(struct sock *sk, int val); +void tcp_set_rcvbuf(struct sock *sk, int val); int tcp_set_window_clamp(struct sock *sk, int val); -void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping_internal *tss); + +static inline void +tcp_update_recv_tstamps(struct sk_buff *skb, + struct scm_timestamping_internal *tss) +{ + tss->ts[0] = skb->tstamp; + tss->ts[2] = skb_hwtstamps(skb)->hwtstamp; +} + void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss); void tcp_data_ready(struct sock *sk); @@ -450,7 +551,6 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, * TCP v4 functions exported for the inet6 API */ -void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); void tcp_ld_RTO_revert(struct sock *sk, u32 seq); @@ -463,14 +563,17 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, - bool *own_req); + bool *own_req, + void (*opt_child_init)(struct sock *newsk, + const struct sock *sk)); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); -int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int tcp_connect(struct sock *sk); enum tcp_synack_type { TCP_SYNACK_NORMAL, TCP_SYNACK_FASTOPEN, TCP_SYNACK_COOKIE, + TCP_SYNACK_RETRANS, }; struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, @@ -667,7 +770,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority, enum sk_rst_reason reason); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); -void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); @@ -681,7 +784,15 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_done_with_error(struct sock *sk, int err); void tcp_reset(struct sock *sk, struct sk_buff *skb); void tcp_fin(struct sock *sk); -void tcp_check_space(struct sock *sk); +void __tcp_check_space(struct sock *sk); +static inline void tcp_check_space(struct sock *sk) +{ + /* pairs with tcp_poll() */ + smp_mb(); + + if (sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + __tcp_check_space(sk); +} void tcp_sack_compress_send_ack(struct sock *sk); static inline void tcp_cleanup_skb(struct sk_buff *skb) @@ -739,6 +850,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); +void tcp_rate_check_app_limited(struct sock *sk); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, @@ -756,42 +868,27 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); -static inline void tcp_bound_rto(struct sock *sk) +static inline unsigned int tcp_rto_max(const struct sock *sk) { - if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) - inet_csk(sk)->icsk_rto = TCP_RTO_MAX; + return READ_ONCE(inet_csk(sk)->icsk_rto_max); } -static inline u32 __tcp_set_rto(const struct tcp_sock *tp) +static inline void tcp_bound_rto(struct sock *sk) { - return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); + inet_csk(sk)->icsk_rto = min(inet_csk(sk)->icsk_rto, tcp_rto_max(sk)); } -static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) -{ - /* mptcp hooks are only on the slow path */ - if (sk_is_mptcp((struct sock *)tp)) - return; - - tp->pred_flags = htonl((tp->tcp_header_len << 26) | - ntohl(TCP_FLAG_ACK) | - snd_wnd); -} - -static inline void tcp_fast_path_on(struct tcp_sock *tp) +static inline u32 __tcp_set_rto(const struct tcp_sock *tp) { - __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); + return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } -static inline void tcp_fast_path_check(struct sock *sk) +static inline unsigned long tcp_reqsk_timeout(struct request_sock *req) { - struct tcp_sock *tp = tcp_sk(sk); + u64 timeout = (u64)req->timeout << req->num_timeout; - if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && - tp->rcv_wnd && - atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && - !tp->urg_data) - tcp_fast_path_on(tp); + return (unsigned long)min_t(u64, timeout, + tcp_rto_max(req->rsk_listener)); } u32 tcp_delack_max(const struct sock *sk); @@ -800,7 +897,7 @@ u32 tcp_delack_max(const struct sock *sk); static inline u32 tcp_rto_min(const struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = inet_csk(sk)->icsk_rto_min; + u32 rto_min = READ_ONCE(inet_csk(sk)->icsk_rto_min); if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); @@ -836,6 +933,28 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp) return (u32) win; } +/* Compute the maximum receive window we ever advertised. + * Rcv_nxt can be after the window if our peer push more data + * than the offered window. + */ +static inline u32 tcp_max_receive_window(const struct tcp_sock *tp) +{ + s32 win = tp->rcv_mwnd_seq - tp->rcv_nxt; + + if (win < 0) + win = 0; + return (u32) win; +} + +/* Check if we need to update the maximum receive window sequence number */ +static inline void tcp_update_max_rcv_wnd_seq(struct tcp_sock *tp) +{ + u32 wre = tp->rcv_wup + tp->rcv_wnd; + + if (after(wre, tp->rcv_mwnd_seq)) + tp->rcv_mwnd_seq = wre; +} + /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. @@ -928,16 +1047,35 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq) #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) -#define TCPHDR_FIN 0x01 -#define TCPHDR_SYN 0x02 -#define TCPHDR_RST 0x04 -#define TCPHDR_PSH 0x08 -#define TCPHDR_ACK 0x10 -#define TCPHDR_URG 0x20 -#define TCPHDR_ECE 0x40 -#define TCPHDR_CWR 0x80 - +#define TCPHDR_FIN BIT(0) +#define TCPHDR_SYN BIT(1) +#define TCPHDR_RST BIT(2) +#define TCPHDR_PSH BIT(3) +#define TCPHDR_ACK BIT(4) +#define TCPHDR_URG BIT(5) +#define TCPHDR_ECE BIT(6) +#define TCPHDR_CWR BIT(7) +#define TCPHDR_AE BIT(8) +#define TCPHDR_FLAGS_MASK (TCPHDR_FIN | TCPHDR_SYN | TCPHDR_RST | \ + TCPHDR_PSH | TCPHDR_ACK | TCPHDR_URG | \ + TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) +#define tcp_flags_ntohs(th) (ntohs(*(__be16 *)&tcp_flag_word(th)) & \ + TCPHDR_FLAGS_MASK) + +#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) +#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR) + +#define TCP_ACCECN_CEP_ACE_MASK 0x7 +#define TCP_ACCECN_ACE_MAX_DELTA 6 + +/* To avoid/detect middlebox interference, not all counters start at 0. + * See draft-ietf-tcpm-accurate-ecn for the latest values. + */ +#define TCP_ACCECN_CEP_INIT_OFFSET 5 +#define TCP_ACCECN_E1B_INIT_OFFSET 1 +#define TCP_ACCECN_E0B_INIT_OFFSET 1 +#define TCP_ACCECN_CEB_INIT_OFFSET 0 /* State flags for sacked in struct tcp_skb_cb */ enum tcp_skb_cb_sacked_flags { @@ -962,23 +1100,28 @@ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { - /* Note : + /* Notes : + * tcp_tw_isn is used in input path only + * (isn chosen by tcp_timewait_state_process()) * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ + u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; - __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ + __u16 tcp_flags; /* TCP header flags (tcp[12-13])*/ __u8 sacked; /* State flags for SACK. */ __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ - __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ +#define TSTAMP_ACK_SK 0x1 +#define TSTAMP_ACK_BPF 0x2 + __u8 txstamp_ack:2, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ - unused:5; + unused:4; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { @@ -1035,9 +1178,7 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb) extern const struct inet_connection_sock_af_ops ipv6_specific; -INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); -void tcp_v6_early_demux(struct sk_buff *skb); #endif @@ -1124,10 +1265,18 @@ enum tcp_ca_ack_event_flags { #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ -#define TCP_CONG_NON_RESTRICTED 0x1 +#define TCP_CONG_NON_RESTRICTED BIT(0) /* Requires ECN/ECT set on all packets */ -#define TCP_CONG_NEEDS_ECN 0x2 -#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) +#define TCP_CONG_NEEDS_ECN BIT(1) +/* Require successfully negotiated AccECN capability */ +#define TCP_CONG_NEEDS_ACCECN BIT(2) +/* Use ECT(1) instead of ECT(0) while the CA is uninitialized */ +#define TCP_CONG_ECT_1_NEGOTIATION BIT(3) +/* Cannot fallback to RFC3168 during AccECN negotiation */ +#define TCP_CONG_NO_FALLBACK_RFC3168 BIT(4) +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN | \ + TCP_CONG_NEEDS_ACCECN | TCP_CONG_ECT_1_NEGOTIATION | \ + TCP_CONG_NO_FALLBACK_RFC3168) union tcp_cc_info; @@ -1167,33 +1316,45 @@ struct rate_sample { struct tcp_congestion_ops { /* fast path fields are put first to fill one cache line */ + /* A congestion control (CC) must provide one of either: + * + * (a) a cong_avoid function, if the CC wants to use the core TCP + * stack's default functionality to implement a "classic" + * (Reno/CUBIC-style) response to packet loss, RFC3168 ECN, + * idle periods, pacing rate computations, etc. + * + * (b) a cong_control function, if the CC wants custom behavior and + * complete control of all congestion control behaviors. + */ + /* (a) "classic" response: calculate new cwnd. + */ + void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); + /* (b) "custom" response: call when packets are delivered to update + * cwnd and pacing rate, after all the ca_state processing. + */ + void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); + /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); - /* do new cwnd calculation (required) */ - void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); - /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); + /* call when CA_EVENT_TX_START cwnd event occurs (optional) */ + void (*cwnd_event_tx_start)(struct sock *sk); + /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* override sysctl_tcp_min_tso_segs */ + /* override sysctl_tcp_min_tso_segs (optional) */ u32 (*min_tso_segs)(struct sock *sk); - /* call when packets are delivered to update cwnd and pacing rate, - * after all the ca_state processing. (optional) - */ - void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs); - - /* new value of cwnd after loss (required) */ u32 (*undo_cwnd)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ @@ -1259,10 +1420,36 @@ static inline bool tcp_ca_needs_ecn(const struct sock *sk) return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } +static inline bool tcp_ca_needs_accecn(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ACCECN; +} + +static inline bool tcp_ca_ect_1_negotiation(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & TCP_CONG_ECT_1_NEGOTIATION; +} + +static inline bool tcp_ca_no_fallback_rfc3168(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & TCP_CONG_NO_FALLBACK_RFC3168; +} + static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); + if (event == CA_EVENT_TX_START) { + if (icsk->icsk_ca_ops->cwnd_event_tx_start) + icsk->icsk_ca_ops->cwnd_event_tx_start(sk); + return; + } if (icsk->icsk_ca_ops->cwnd_event) icsk->icsk_ca_ops->cwnd_event(sk, event); } @@ -1270,13 +1457,6 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) /* From tcp_cong.c */ void tcp_set_ca_state(struct sock *sk, const u8 ca_state); -/* From tcp_rate.c */ -void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); -void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - struct rate_sample *rs); -void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - bool is_sack_reneg, struct rate_sample *rs); -void tcp_rate_check_app_limited(struct sock *sk); static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) { @@ -1334,7 +1514,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) { WARN_ON_ONCE((int)val <= 0); - tp->snd_cwnd = val; + WRITE_ONCE(tp->snd_cwnd, val); } static inline bool tcp_in_slow_start(const struct tcp_sock *tp) @@ -1440,10 +1620,12 @@ static inline unsigned long tcp_pacing_delay(const struct sock *sk) static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, - const unsigned long max_when) + bool pace_delay) { - inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), - max_when); + if (pace_delay) + when += tcp_pacing_delay(sk); + inet_csk_reset_xmit_timer(sk, what, when, + tcp_rto_max(sk)); } /* Something is really bad, we could not queue an additional packet, @@ -1472,7 +1654,7 @@ static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_base(sk), TCP_RTO_MAX); + tcp_probe0_base(sk), true); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) @@ -1500,11 +1682,16 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) __skb_checksum_complete(skb); } -bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, - enum skb_drop_reason *reason); +enum skb_drop_reason tcp_add_backlog(struct sock *sk, struct sk_buff *skb); +static inline enum skb_drop_reason +tcp_filter(struct sock *sk, struct sk_buff *skb) +{ + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + return sk_filter_trim_cap(sk, skb, __tcp_hdrlen(th)); +} -int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); @@ -1742,6 +1929,40 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, return true; } +static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +{ + u32 ace; + + /* mptcp hooks are only on the slow path */ + if (sk_is_mptcp((struct sock *)tp)) + return; + + ace = tcp_ecn_mode_accecn(tp) ? + ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) & + TCP_ACCECN_CEP_ACE_MASK) : 0; + + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + (ace << 22) | + ntohl(TCP_FLAG_ACK) | + snd_wnd); +} + +static inline void tcp_fast_path_on(struct tcp_sock *tp) +{ + __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); +} + +static inline void tcp_fast_path_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && + tp->rcv_wnd && + atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && + !tp->urg_data) + tcp_fast_path_on(tp); +} + bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); @@ -1755,14 +1976,8 @@ static inline void tcp_mib_init(struct net *net) } /* from STCP */ -static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) -{ - tp->lost_skb_hint = NULL; -} - static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { - tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } @@ -1803,13 +2018,6 @@ struct tcp6_pseudohdr { __be32 protocol; /* including padding */ }; -union tcp_md5sum_block { - struct tcp4_pseudohdr ip4; -#if IS_ENABLED(CONFIG_IPV6) - struct tcp6_pseudohdr ip6; -#endif -}; - /* * struct tcp_sigpool - per-CPU pool of ahash_requests * @scratch: per-CPU temporary area, that can be used between @@ -1844,8 +2052,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); void tcp_sigpool_end(struct tcp_sigpool *c); size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len); /* - functions */ -int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, - const struct sock *sk, const struct sk_buff *skb); +void tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen); @@ -1882,6 +2090,7 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) +void tcp_md5_destruct_sock(struct sock *sk); #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, @@ -1898,15 +2107,15 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk, } #define tcp_twsk_md5_key(twsk) NULL +static inline void tcp_md5_destruct_sock(struct sock *sk) +{ +} #endif -int tcp_md5_alloc_sigpool(void); -void tcp_md5_release_sigpool(void); -void tcp_md5_add_sigpool(void); -extern int tcp_md5_sigpool_id; - -int tcp_md5_hash_key(struct tcp_sigpool *hp, - const struct tcp_md5sig_key *key); +struct md5_ctx; +void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb, + unsigned int header_len); +void tcp_md5_hash_key(struct md5_ctx *ctx, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, @@ -1995,7 +2204,34 @@ enum tcp_chrono { __TCP_CHRONO_MAX, }; -void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); +static inline void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new) +{ + const u32 now = tcp_jiffies32; + enum tcp_chrono old = tp->chrono_type; + + /* Following WRITE_ONCE()s pair with READ_ONCE()s in + * tcp_get_info_chrono_stats(). + */ + if (old > TCP_CHRONO_UNSPEC) + WRITE_ONCE(tp->chrono_stat[old - 1], + tp->chrono_stat[old - 1] + now - tp->chrono_start); + WRITE_ONCE(tp->chrono_start, now); + WRITE_ONCE(tp->chrono_type, new); +} + +static inline void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* If there are multiple conditions worthy of tracking in a + * chronograph then the highest priority enum takes precedence + * over the other conditions. So that if something "more interesting" + * starts happening, stop the previous chrono and start a new one. + */ + if (type > tp->chrono_type) + tcp_chrono_set(tp, type); +} + void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); /* This helper is needed, because skb->tcp_tsorted_anchor uses @@ -2213,21 +2449,26 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); -struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb); struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, struct tcphdr *th); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); -INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); -INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)); #ifdef CONFIG_INET void tcp_gro_complete(struct sk_buff *skb); #else static inline void tcp_gro_complete(struct sk_buff *skb) { } #endif -void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); +static inline void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, + __be32 daddr) +{ + struct tcphdr *th = tcp_hdr(skb); + + th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); +} static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { @@ -2256,7 +2497,7 @@ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); - int (*calc_md5_hash)(char *location, + void (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); @@ -2284,7 +2525,7 @@ struct tcp_request_sock_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); - int (*calc_md5_hash) (char *location, + void (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); @@ -2307,8 +2548,9 @@ struct tcp_request_sock_ops { struct flowi *fl, struct request_sock *req, u32 tw_isn); - u32 (*init_seq)(const struct sk_buff *skb); - u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); + union tcp_seq_and_ts_off (*init_seq_and_ts_off)( + const struct net *net, + const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, @@ -2414,10 +2656,7 @@ void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd); extern bool tcp_rack_mark_lost(struct sock *sk); -extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); -extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* tcp_plb.c */ @@ -2563,7 +2802,7 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) */ static inline void tcp_listendrop(const struct sock *sk) { - atomic_inc(&((struct sock *)sk)->sk_drops); + sk_drops_inc((struct sock *)sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } @@ -2588,8 +2827,8 @@ struct tcp_ulp_ops { /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ - int (*get_info)(struct sock *sk, struct sk_buff *skb); - size_t (*get_info_size)(const struct sock *sk); + int (*get_info)(struct sock *sk, struct sk_buff *skb, bool net_admin); + size_t (*get_info_size)(const struct sock *sk, bool net_admin); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, const gfp_t priority); @@ -2606,8 +2845,8 @@ void tcp_update_ulp(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); #define MODULE_ALIAS_TCP_ULP(name) \ - __MODULE_INFO(alias, alias_userspace, name); \ - __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) + MODULE_INFO(alias, name); \ + MODULE_INFO(alias, "tcp-ulp-" name) #ifdef CONFIG_NET_SOCK_MSG struct sk_msg; @@ -2671,6 +2910,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_owned_by_me(sk); } @@ -2759,9 +2999,9 @@ extern struct static_key_false tcp_have_smc; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) -void clean_acked_data_enable(struct inet_connection_sock *icsk, +void clean_acked_data_enable(struct tcp_sock *tp, void (*cad)(struct sock *sk, u32 ack_seq)); -void clean_acked_data_disable(struct inet_connection_sock *icsk); +void clean_acked_data_disable(struct tcp_sock *tp); void clean_acked_data_flush(void); #endif @@ -2842,4 +3082,18 @@ enum skb_drop_reason tcp_inbound_hash(struct sock *sk, const void *saddr, const void *daddr, int family, int dif, int sdif); +static inline int tcp_recv_should_stop(struct sock *sk) +{ + return sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current); +} + +INDIRECT_CALLABLE_DECLARE(union tcp_seq_and_ts_off + tcp_v4_init_seq_and_ts_off(const struct net *net, + const struct sk_buff *skb)); +INDIRECT_CALLABLE_DECLARE(union tcp_seq_and_ts_off + tcp_v6_init_seq_and_ts_off(const struct net *net, + const struct sk_buff *skb)); #endif /* _TCP_H */ diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index df655ce6987d..1e9e27d6e06b 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -130,7 +130,6 @@ struct tcp_ao_info { u32 snd_sne; u32 rcv_sne; refcount_t refcnt; /* Protects twsk destruction */ - struct rcu_head rcu; }; #ifdef CONFIG_TCP_MD5SIG diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h new file mode 100644 index 000000000000..865d5c5a7718 --- /dev/null +++ b/include/net/tcp_ecn.h @@ -0,0 +1,675 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _TCP_ECN_H +#define _TCP_ECN_H + +#include <linux/tcp.h> +#include <linux/skbuff.h> +#include <linux/bitfield.h> + +#include <net/inet_connection_sock.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/inet_ecn.h> + +/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is + * attemped to be negotiated and requested for incoming connection + * and outgoing connection, respectively. + */ +enum tcp_ecn_mode { + TCP_ECN_IN_NOECN_OUT_NOECN = 0, + TCP_ECN_IN_ECN_OUT_ECN = 1, + TCP_ECN_IN_ECN_OUT_NOECN = 2, + TCP_ECN_IN_ACCECN_OUT_ACCECN = 3, + TCP_ECN_IN_ACCECN_OUT_ECN = 4, + TCP_ECN_IN_ACCECN_OUT_NOECN = 5, +}; + +/* AccECN option sending when AccECN has been successfully negotiated */ +enum tcp_accecn_option { + TCP_ACCECN_OPTION_DISABLED = 0, + TCP_ACCECN_OPTION_MINIMUM = 1, + TCP_ACCECN_OPTION_FULL = 2, + TCP_ACCECN_OPTION_PERSIST = 3, +}; + +/* Apply either ECT(0) or ECT(1) based on TCP_CONG_ECT_1_NEGOTIATION flag */ +static inline void INET_ECN_xmit_ect_1_negotiation(struct sock *sk) +{ + __INET_ECN_xmit(sk, tcp_ca_ect_1_negotiation(sk)); +} + +static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) +{ + /* Do not set CWR if in AccECN mode! */ + if (tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_QUEUE_CWR; +} + +static inline void tcp_ecn_accept_cwr(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; + + /* If the sender is telling us it has entered CWR, then its + * cwnd may be very low (even just 1 packet), so we should ACK + * immediately. + */ + if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } +} + +static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) +{ + tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; +} + +static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; +} + +static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; +} + +static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; +} + +static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) +{ + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; +} + +static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) +{ + tp->accecn_fail_mode |= mode; +} + +static inline u8 tcp_accecn_ace(const struct tcphdr *th) +{ + return (th->ae << 2) | (th->cwr << 1) | th->ece; +} + +/* Infer the ECT value our SYN arrived with from the echoed ACE field */ +static inline int tcp_accecn_extract_syn_ect(u8 ace) +{ + /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ + static const int ace_to_ecn[8] = { + INET_ECN_ECT_0, /* 0b000 (Undefined) */ + INET_ECN_ECT_1, /* 0b001 (Undefined) */ + INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */ + INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */ + INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */ + INET_ECN_ECT_1, /* 0b101 (Reserved) */ + INET_ECN_CE, /* 0b110 (CE is received) */ + INET_ECN_ECT_1 /* 0b111 (Undefined) */ + }; + + return ace_to_ecn[ace & 0x7]; +} + +/* Check ECN field transition to detect invalid transitions */ +static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) +{ + if (rcv == snt) + return true; + + /* Non-ECT altered to something or something became non-ECT */ + if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) + return false; + /* CE -> ECT(0/1)? */ + if (snt == INET_ECN_CE) + return false; + return true; +} + +static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, + u8 sent_ect) +{ + u8 ect = tcp_accecn_extract_syn_ect(ace); + struct tcp_sock *tp = tcp_sk(sk); + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) + return true; + + if (!tcp_ect_transition_valid(sent_ect, ect)) { + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); + return false; + } + + return true; +} + +static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, + u8 saw_opt) +{ + tp->saw_accecn_opt = saw_opt; + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); +} + +/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ +static inline void tcp_accecn_third_ack(struct sock *sk, + const struct sk_buff *skb, u8 sent_ect) +{ + u8 ace = tcp_accecn_ace(tcp_hdr(skb)); + struct tcp_sock *tp = tcp_sk(sk); + + switch (ace) { + case 0x0: + /* Invalid value */ + if (!TCP_SKB_CB(skb)->sacked) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV | + TCP_ACCECN_OPT_FAIL_RECV); + break; + case 0x7: + case 0x5: + case 0x1: + /* Unused but legal values */ + break; + default: + /* Validation only applies to first non-data packet */ + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && + !TCP_SKB_CB(skb)->sacked && + tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { + if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && + !tp->delivered_ce) + WRITE_ONCE(tp->delivered_ce, 1); + } + break; + } +} + +/* Demand the minimum # to send AccECN optnio */ +static inline void tcp_accecn_opt_demand_min(struct sock *sk, + u8 opt_demand_min) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 opt_demand; + + opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); + tp->accecn_opt_demand = opt_demand; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field number, given + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). + */ +static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return 1; + case INET_ECN_CE: + return 2; + case INET_ECN_ECT_0: + return 3; + } + return 0; +} + +/* Maps IP ECN field ECT/CE code point to AccECN option field value offset. + * Some fields do not start from zero, to detect zeroing by middleboxes. + */ +static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) +{ + switch (ecnfield & INET_ECN_MASK) { + case INET_ECN_NOT_ECT: + return 0; /* AccECN does not send counts of NOT_ECT */ + case INET_ECN_ECT_1: + return TCP_ACCECN_E1B_INIT_OFFSET; + case INET_ECN_CE: + return TCP_ACCECN_CEB_INIT_OFFSET; + case INET_ECN_ECT_0: + return TCP_ACCECN_E0B_INIT_OFFSET; + } + return 0; +} + +/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, + bool order) +{ + /* Based on Table 5 of the AccECN spec to map (option, order) to + * the corresponding ECN conuters (ECT-1, ECT-0, or CE). + */ + static const u8 optfield_lookup[2][3] = { + /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ + { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, + /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ + { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } + }; + + return optfield_lookup[order][option % 3]; +} + +/* Handles AccECN option ECT and CE 24-bit byte counters update into + * the u32 value in tcp_sock. As we're processing TCP options, it is + * safe to access from - 1. + */ +static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, + u32 init_offset) +{ + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & + 0xFFFFFFU; + u32 delta = (truncated - *cnt) & 0xFFFFFFU; + + /* If delta has the highest bit set (24th bit) indicating + * negative, sign extend to correct an estimation using + * sign_extend32(delta, 24 - 1) + */ + delta = sign_extend32(delta, 23); + *cnt += delta; + return (s32)delta; +} + +/* Updates Accurate ECN received counters from the received IP ECN field */ +static inline void tcp_ecn_received_counters(struct sock *sk, + const struct sk_buff *skb, u32 len) +{ + u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + u8 is_ce = INET_ECN_is_ce(ecnfield); + struct tcp_sock *tp = tcp_sk(sk); + bool ecn_edge; + + if (!INET_ECN_is_not_ect(ecnfield)) { + u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); + + /* As for accurate ECN, the TCP_ECN_SEEN flag is set by + * tcp_ecn_received_counters() when the ECN codepoint of + * received TCP data or ACK contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + tp->ecn_flags |= TCP_ECN_SEEN; + + /* ACE counter tracks *all* segments including pure ACKs */ + tp->received_ce += pcount; + tp->received_ce_pending = min(tp->received_ce_pending + pcount, + 0xfU); + + if (len > 0) { + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); + u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; + u32 bytes_mask = GENMASK_U32(31, 22); + + tp->received_ecn_bytes[ecnfield - 1] += len; + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, + minlen); + + /* Send AccECN option at least once per 2^22-byte + * increase in any ECN byte counter. + */ + if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & + bytes_mask) { + tcp_accecn_opt_demand_min(sk, 1); + } + } + } + + ecn_edge = tp->prev_ecnfield != ecnfield; + if (ecn_edge || is_ce) { + tp->prev_ecnfield = ecnfield; + /* Demand Accurate ECN change-triggered ACKs. Two ACK are + * demanded to indicate unambiguously the ecnfield value + * in the latter ACK. + */ + if (tcp_ecn_mode_accecn(tp)) { + if (ecn_edge) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + tp->accecn_opt_demand = 2; + } + } +} + +/* AccECN specification, 2.2: [...] A Data Receiver maintains four counters + * initialized at the start of the half-connection. [...] These byte counters + * reflect only the TCP payload length, excluding TCP header and TCP options. + */ +static inline void tcp_ecn_received_counters_payload(struct sock *sk, + const struct sk_buff *skb) +{ + const struct tcphdr *th = (const struct tcphdr *)skb->data; + + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); +} + +/* AccECN specification, 5.1: [...] a server can determine that it + * negotiated AccECN as [...] if the ACK contains an ACE field with + * the value 0b010 to 0b111 (decimal 2 to 7). + */ +static inline bool cookie_accecn_ok(const struct tcphdr *th) +{ + return tcp_accecn_ace(th) > 0x1; +} + +/* Used to form the ACE flags for SYN/ACK */ +static inline u16 tcp_accecn_reflector_flags(u8 ect) +{ + /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. + * Below is an excerpt from the 1st block of Table 2 of AccECN spec, + * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE + */ + static const u8 ecn_to_ace_flags[4] = { + 0b010, /* Not-ECT is received */ + 0b011, /* ECT(1) is received */ + 0b100, /* ECT(0) is received */ + 0b110 /* CE is received */ + }; + + return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); +} + +/* AccECN specification, 3.1.2: If a TCP server that implements AccECN + * receives a SYN with the three TCP header flags (AE, CWR and ECE) set + * to any combination other than 000, 011 or 111, it MUST negotiate the + * use of AccECN as if they had been set to 111. + */ +static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) +{ + u8 ace = tcp_accecn_ace(th); + + return ace && ace != 0x3; +} + +static inline void __tcp_accecn_init_bytes_counters(int *counter_array) +{ + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); + BUILD_BUG_ON(INET_ECN_CE != 0x3); + + counter_array[INET_ECN_ECT_1 - 1] = 0; + counter_array[INET_ECN_ECT_0 - 1] = 0; + counter_array[INET_ECN_CE - 1] = 0; +} + +static inline void tcp_accecn_init_counters(struct tcp_sock *tp) +{ + tp->received_ce = 0; + tp->received_ce_pending = 0; + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); + tp->accecn_opt_sent_w_dsack = 0; + tp->accecn_minlen = 0; + tp->accecn_opt_demand = 0; + tp->est_ecnfield = 0; +} + +/* Used for make_synack to form the ACE flags */ +static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) +{ + /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received + * from SYN. Below is an excerpt from Table 2 of the AccECN spec: + * +====================+====================================+ + * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK | + * | received on SYN | AE CWR ECE | + * +====================+====================================+ + * | Not-ECT | 0 1 0 | + * | ECT(1) | 0 1 1 | + * | ECT(0) | 1 0 0 | + * | CE | 1 1 0 | + * +====================+====================================+ + */ + th->ae = !!(ect & INET_ECN_ECT_0); + th->cwr = ect != INET_ECN_ECT_0; + th->ece = ect == INET_ECN_ECT_1; +} + +static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, + struct tcphdr *th) +{ + u32 wire_ace; + + /* The final packet of the 3WHS or anything like it must reflect + * the SYN/ACK ECT instead of putting CEP into ACE field, such + * case show up in tcp_flags. + */ + if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; + th->ece = !!(wire_ace & 0x1); + th->cwr = !!(wire_ace & 0x2); + th->ae = !!(wire_ace & 0x4); + tp->received_ce_pending = 0; + } +} + +static inline u8 tcp_accecn_option_init(const struct sk_buff *skb, + u8 opt_offset) +{ + u8 *ptr = skb_transport_header(skb) + opt_offset; + unsigned int optlen = ptr[1] - 2; + + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return TCP_ACCECN_OPT_FAIL_SEEN; + ptr += 2; + + /* Detect option zeroing: an AccECN connection "MAY check that the + * initial value of the EE0B field or the EE1B field is non-zero" + */ + if (optlen < TCPOLEN_ACCECN_PERFIELD) + return TCP_ACCECN_OPT_EMPTY_SEEN; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) + return TCP_ACCECN_OPT_COUNTER_SEEN; + ptr += TCPOLEN_ACCECN_PERFIELD * 2; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + + return TCP_ACCECN_OPT_COUNTER_SEEN; +} + +static inline void tcp_ecn_rcv_synack_accecn(struct sock *sk, + const struct sk_buff *skb, u8 dsf) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_rcv = dsf & INET_ECN_MASK; + /* Demand Accurate ECN option in response to the SYN on the SYN/ACK + * and the TCP server will try to send one more packet with an AccECN + * Option at a later point during the connection. + */ + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tp->accecn_opt_demand = 2; + } +} + +/* See Table 2 of the AccECN draft */ +static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, + const struct tcphdr *th, u8 ip_dsfield) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 ace = tcp_accecn_ace(th); + + switch (ace) { + case 0x0: + case 0x7: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | No ECN | 0 0 0 | Not ECN | + * | AccECN | Broken | 1 1 1 | Not ECN | + * +========+========+============+=============+ + */ + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + break; + case 0x1: + /* +========+========+============+=============+ + * | A | B | SYN/ACK | Feedback | + * | | | B->A | Mode of A | + * | | | AE CWR ECE | | + * +========+========+============+=============+ + * | AccECN | ECN | 0 0 1 | Classic ECN | + * | Nonce | AccECN | 0 0 1 | Classic ECN | + * | ECN | AccECN | 0 0 1 | Classic ECN | + * +========+========+============+=============+ + */ + if (tcp_ca_no_fallback_rfc3168(sk)) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + else + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + break; + case 0x5: + if (tcp_ecn_mode_pending(tp)) { + tcp_ecn_rcv_synack_accecn(sk, skb, ip_dsfield); + if (INET_ECN_is_ce(ip_dsfield)) { + tp->received_ce++; + tp->received_ce_pending++; + } + } + break; + default: + tcp_ecn_rcv_synack_accecn(sk, skb, ip_dsfield); + if (INET_ECN_is_ce(ip_dsfield) && + tcp_accecn_validate_syn_feedback(sk, ace, + tp->syn_ect_snt)) { + tp->received_ce++; + tp->received_ce_pending++; + } + break; + } +} + +static inline void tcp_ecn_rcv_syn(struct sock *sk, const struct tcphdr *th, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_ecn_mode_pending(tp)) { + if (!tcp_accecn_syn_requested(th)) { + /* Downgrade to classic ECN feedback */ + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } else { + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + tp->prev_ecnfield = tp->syn_ect_rcv; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + } + } + if (tcp_ecn_mode_rfc3168(tp) && + (!th->ece || !th->cwr || tcp_ca_no_fallback_rfc3168(sk))) + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); +} + +static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, + const struct tcphdr *th) +{ + if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) + return true; + return false; +} + +/* Packet ECN state for a SYN-ACK */ +static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; + if (tcp_ecn_disabled(tp)) + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; + else if (tcp_ca_needs_ecn(sk) || + tcp_bpf_ca_needs_ecn(sk)) + INET_ECN_xmit_ect_1_negotiation(sk); + + if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + TCP_SKB_CB(skb)->tcp_flags |= + tcp_accecn_reflector_flags(tp->syn_ect_rcv); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } +} + +/* Packet ECN state for a SYN. */ +static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn, use_accecn; + u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); + + use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN || + tcp_ca_needs_accecn(sk); + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; + + if (!use_ecn) { + const struct dst_entry *dst = __sk_dst_get(sk); + + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } + + tp->ecn_flags = 0; + + if (use_ecn) { + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit_ect_1_negotiation(sk); + + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; + if (use_accecn) { + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + } + } +} + +static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) +{ + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { + /* tp->ecn_flags are cleared at a later point in time when + * SYN ACK is ultimatively being received. + */ + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; + } +} + +static inline void +tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th, + enum tcp_synack_type synack_type) +{ + /* Accurate ECN shall retransmit SYN/ACK with ACE=0 if the + * previously retransmitted SYN/ACK also times out. + */ + if (!req->num_timeout || synack_type != TCP_SYNACK_RETRANS) { + if (tcp_rsk(req)->accecn_ok) + tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); + else if (inet_rsk(req)->ecn_ok) + th->ece = 1; + } else if (tcp_rsk(req)->accecn_ok) { + th->ae = 0; + th->cwr = 0; + th->ece = 0; + } +} + +static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) +{ + u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon); + const struct tcp_sock *tp = tcp_sk(sk); + + if (!ecn_beacon) + return false; + + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >= + (tp->srtt_us >> 3); +} + +#endif /* _LINUX_TCP_ECN_H */ diff --git a/include/net/tcx.h b/include/net/tcx.h index 5ce0ce9e0c02..23a61af13547 100644 --- a/include/net/tcx.h +++ b/include/net/tcx.h @@ -20,7 +20,6 @@ struct tcx_entry { struct tcx_link { struct bpf_link link; struct net_device *dev; - u32 location; }; static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h index 62b3e9f2aed4..0a85ac64a66d 100644 --- a/include/net/timewait_sock.h +++ b/include/net/timewait_sock.h @@ -15,13 +15,6 @@ struct timewait_sock_ops { struct kmem_cache *twsk_slab; char *twsk_slab_name; unsigned int twsk_obj_size; - void (*twsk_destructor)(struct sock *sk); }; -static inline void twsk_destructor(struct sock *sk) -{ - if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) - sk->sk_prot->twsk_prot->twsk_destructor(sk); -} - #endif /* _TIMEWAIT_SOCK_H */ diff --git a/include/net/tls.h b/include/net/tls.h index 857340338b69..ebd2550280ae 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -53,6 +53,8 @@ struct tls_rec; /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) +/* Minimum record size limit as per RFC8449 */ +#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6) #define TLS_HEADER_SIZE 5 #define TLS_NONCE_OFFSET TLS_HEADER_SIZE @@ -226,6 +228,7 @@ struct tls_context { u8 rx_conf:3; u8 zerocopy_sendfile:1; u8 rx_no_pad:1; + u16 tx_max_payload_len; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); @@ -451,25 +454,26 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) /* Log all TLS record header TCP sequences in [seq, seq+len] */ static inline void -tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len) +tls_offload_rx_resync_async_request_start(struct tls_offload_resync_async *resync_async, + __be32 seq, u16 len) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); - - atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) | + atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | ((u64)len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC); - rx_ctx->resync_async->loglen = 0; - rx_ctx->resync_async->rcd_delta = 0; + resync_async->loglen = 0; + resync_async->rcd_delta = 0; } static inline void -tls_offload_rx_resync_async_request_end(struct sock *sk, __be32 seq) +tls_offload_rx_resync_async_request_end(struct tls_offload_resync_async *resync_async, + __be32 seq) { - struct tls_context *tls_ctx = tls_get_ctx(sk); - struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); + atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | RESYNC_REQ); +} - atomic64_set(&rx_ctx->resync_async->req, - ((u64)ntohl(seq) << 32) | RESYNC_REQ); +static inline void +tls_offload_rx_resync_async_request_cancel(struct tls_offload_resync_async *resync_async) +{ + atomic64_set(&resync_async->req, 0); } static inline void diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index 1a97e3f32029..c0a421fe0c2a 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -8,7 +8,6 @@ /* IPv6 transport protocols */ extern struct proto rawv6_prot; extern struct proto udpv6_prot; -extern struct proto udplitev6_prot; extern struct proto tcpv6_prot; extern struct proto pingv6_prot; @@ -28,8 +27,6 @@ int rawv6_init(void); void rawv6_exit(void); int udpv6_init(void); void udpv6_exit(void); -int udplitev6_init(void); -void udplitev6_exit(void); int tcpv6_init(void); void tcpv6_exit(void); diff --git a/include/net/tso.h b/include/net/tso.h index e7e157ae0526..da82aabd1d48 100644 --- a/include/net/tso.h +++ b/include/net/tso.h @@ -3,6 +3,7 @@ #define _TSO_H #include <linux/skbuff.h> +#include <linux/dma-mapping.h> #include <net/ip.h> #define TSO_HEADER_SIZE 256 @@ -28,4 +29,103 @@ void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size); int tso_start(struct sk_buff *skb, struct tso_t *tso); +/** + * struct tso_dma_map - DMA mapping state for GSO payload + * @dev: device used for DMA mapping + * @skb: the GSO skb being mapped + * @hdr_len: per-segment header length + * @iova_state: DMA IOVA state (when IOMMU available) + * @iova_offset: global byte offset into IOVA range (IOVA path only) + * @total_len: total payload length + * @frag_idx: current region (-1 = linear, 0..nr_frags-1 = frag) + * @offset: byte offset within current region + * @linear_dma: DMA address of the linear payload + * @linear_len: length of the linear payload + * @nr_frags: number of frags successfully DMA-mapped + * @frags: per-frag DMA address and length + * + * DMA-maps the payload regions of a GSO skb (linear data + frags). + * Prefers the DMA IOVA API for a single contiguous mapping with one + * IOTLB sync; falls back to per-region dma_map_phys() otherwise. + */ +struct tso_dma_map { + struct device *dev; + const struct sk_buff *skb; + unsigned int hdr_len; + /* IOVA path */ + struct dma_iova_state iova_state; + size_t iova_offset; + size_t total_len; + /* Fallback path if IOVA path fails */ + int frag_idx; + unsigned int offset; + dma_addr_t linear_dma; + unsigned int linear_len; + unsigned int nr_frags; + struct { + dma_addr_t dma; + unsigned int len; + } frags[MAX_SKB_FRAGS]; +}; + +/** + * struct tso_dma_map_completion_state - Completion-time cleanup state + * @iova_state: DMA IOVA state (when IOMMU available) + * @total_len: total payload length of the IOVA mapping + * + * Drivers store this on their SW ring at xmit time via + * tso_dma_map_completion_save(), then call tso_dma_map_complete() at + * completion time. + */ +struct tso_dma_map_completion_state { + struct dma_iova_state iova_state; + size_t total_len; +}; + +int tso_dma_map_init(struct tso_dma_map *map, struct device *dev, + const struct sk_buff *skb, unsigned int hdr_len); +void tso_dma_map_cleanup(struct tso_dma_map *map); +unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len); +bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr, + unsigned int *chunk_len, unsigned int *mapping_len, + unsigned int seg_remaining); + +/** + * tso_dma_map_completion_save - save state needed for completion-time cleanup + * @map: the xmit-time DMA map + * @cstate: driver-owned storage that persists until completion + * + * Should be called at xmit time to update the completion state and later passed + * to tso_dma_map_complete(). + */ +static inline void +tso_dma_map_completion_save(const struct tso_dma_map *map, + struct tso_dma_map_completion_state *cstate) +{ + cstate->iova_state = map->iova_state; + cstate->total_len = map->total_len; +} + +/** + * tso_dma_map_complete - tear down mapping at completion time + * @dev: the device that owns the mapping + * @cstate: state saved by tso_dma_map_completion_save() + * + * Return: true if the IOVA path was used and the mapping has been + * destroyed; false if the fallback per-region path was used and the + * driver must unmap via its normal completion path. + */ +static inline bool +tso_dma_map_complete(struct device *dev, + struct tso_dma_map_completion_state *cstate) +{ + if (dma_use_iova(&cstate->iova_state)) { + dma_iova_destroy(dev, &cstate->iova_state, cstate->total_len, + DMA_TO_DEVICE, 0); + return true; + } + + return false; +} + #endif /* _TSO_H */ diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100d..8262e2b215b4 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -29,13 +29,12 @@ #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/indirect_call_wrapper.h> +#include <linux/math.h> /** - * struct udp_skb_cb - UDP(-Lite) private variables + * struct udp_skb_cb - UDP private variables * * @header: private variables used by IPv4/IPv6 - * @cscov: checksum coverage length (UDP-Lite only) - * @partial_cov: if set indicates partial csum coverage */ struct udp_skb_cb { union { @@ -44,8 +43,6 @@ struct udp_skb_cb { struct inet6_skb_parm h6; #endif } header; - __u16 cscov; - __u8 partial_cov; }; #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) @@ -104,7 +101,7 @@ struct udp_table { unsigned int log; }; extern struct udp_table udp_table; -void udp_table_init(struct udp_table *, const char *); + static inline struct udp_hslot *udp_hashslot(struct udp_table *table, const struct net *net, unsigned int num) @@ -205,7 +202,6 @@ static inline void udp_hash4_dec(struct udp_hslot *hslot2) extern struct proto udp_prot; -extern atomic_long_t udp_memory_allocated; DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); /* sysctl variables for udp */ @@ -216,13 +212,11 @@ extern int sysctl_udp_wmem_min; struct sk_buff; /* - * Generic checksumming routines for UDP(-Lite) v4 and v6 + * Generic checksumming routines for UDP v4 and v6 */ static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb) { - return (UDP_SKB_CB(skb)->cscov == skb->len ? - __skb_checksum_complete(skb) : - __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov)); + return __skb_checksum_complete(skb); } static inline int udp_lib_checksum_complete(struct sk_buff *skb) @@ -273,7 +267,6 @@ static inline void udp_csum_pull_header(struct sk_buff *skb) skb->csum = csum_partial(skb->data, sizeof(struct udphdr), skb->csum); skb_pull_rcsum(skb, sizeof(struct udphdr)); - UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr); } typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport, @@ -282,19 +275,37 @@ typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport, void udp_v6_early_demux(struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); +int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags)); + struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); -static inline void udp_lib_init_sock(struct sock *sk) +static inline int udp_lib_init_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); + sk->sk_drop_counters = &up->drop_counters; skb_queue_head_init(&up->reader_queue); + INIT_HLIST_NODE(&up->tunnel_list); up->forward_threshold = sk->sk_rcvbuf >> 2; set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); + + up->udp_prod_queue = kzalloc_objs(*up->udp_prod_queue, nr_node_ids); + if (!up->udp_prod_queue) + return -ENOMEM; + for (int i = 0; i < nr_node_ids; i++) + init_llist_head(&up->udp_prod_queue[i].ll_root); + return 0; } -/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ +static inline void udp_drops_inc(struct sock *sk) +{ + numa_drop_add(&udp_sk(sk)->drop_counters, 1); +} + +/* hash routines shared between UDPv4/6 */ static inline int udp_lib_hash(struct sock *sk) { BUG(); @@ -363,7 +374,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, */ hash ^= hash << 16; - return htons((((u64) hash * (max - min)) >> 32) + min); + return htons(reciprocal_scale(hash, max - min + 1) + min); } static inline int udp_rqueue_get(struct sock *sk) @@ -397,11 +408,13 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, return __skb_recv_udp(sk, flags, &off, err); } -int udp_v4_early_demux(struct sk_buff *skb); +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); +INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags)); void udp_splice_eof(struct socket *sock); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); @@ -409,8 +422,7 @@ int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, int *karg); -int udp_init_sock(struct sock *sk); -int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); @@ -425,9 +437,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, - __be16 sport, - __be32 daddr, __be16 dport, int dif, int sdif, - struct udp_table *tbl, struct sk_buff *skb); + __be16 sport, __be32 daddr, __be16 dport, + int dif, int sdif, struct sk_buff *skb); struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); struct sock *udp6_lib_lookup(const struct net *net, @@ -437,8 +448,7 @@ struct sock *udp6_lib_lookup(const struct net *net, struct sock *__udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, - int dif, int sdif, struct udp_table *tbl, - struct sk_buff *skb); + int dif, int sdif, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); @@ -510,38 +520,28 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, } /* - * SNMP statistics for UDP and UDP-Lite + * SNMP statistics for UDP */ -#define UDP_INC_STATS(net, field, is_udplite) do { \ - if (is_udplite) SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ - else SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) -#define __UDP_INC_STATS(net, field, is_udplite) do { \ - if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ - else __SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) - -#define __UDP6_INC_STATS(net, field, is_udplite) do { \ - if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);\ - else __SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ -} while(0) -#define UDP6_INC_STATS(net, field, __lite) do { \ - if (__lite) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field); \ - else SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ -} while(0) +#define __UDP_INC_STATS(net, field) \ + __SNMP_INC_STATS((net)->mib.udp_statistics, field) +#define UDP_INC_STATS(net, field) \ + SNMP_INC_STATS((net)->mib.udp_statistics, field) +#define __UDP6_INC_STATS(net, field) \ + __SNMP_INC_STATS((net)->mib.udp_stats_in6, field) +#define UDP6_INC_STATS(net, field) \ + SNMP_INC_STATS((net)->mib.udp_stats_in6, field) #if IS_ENABLED(CONFIG_IPV6) -#define __UDPX_MIB(sk, ipv4) \ -({ \ - ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ - sock_net(sk)->mib.udp_statistics) : \ - (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 : \ - sock_net(sk)->mib.udp_stats_in6); \ -}) +#define __UDPX_MIB(sk, ipv4) \ + ({ \ + ipv4 ? sock_net(sk)->mib.udp_statistics : \ + sock_net(sk)->mib.udp_stats_in6; \ + }) #else -#define __UDPX_MIB(sk, ipv4) \ -({ \ - IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ - sock_net(sk)->mib.udp_statistics; \ -}) +#define __UDPX_MIB(sk, ipv4) \ + ({ \ + sock_net(sk)->mib.udp_statistics; \ + }) #endif #define __UDPX_INC_STATS(sk, field) \ @@ -550,7 +550,6 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, #ifdef CONFIG_PROC_FS struct udp_seq_afinfo { sa_family_t family; - struct udp_table *udp_table; }; struct udp_iter_state { @@ -562,9 +561,6 @@ void *udp_seq_start(struct seq_file *seq, loff_t *pos); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void udp_seq_stop(struct seq_file *seq, void *v); -extern const struct seq_operations udp_seq_ops; -extern const struct seq_operations udp6_seq_ops; - int udp4_proc_init(void); void udp4_proc_exit(void); #endif /* CONFIG_PROC_FS */ @@ -586,6 +582,16 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, { netdev_features_t features = NETIF_F_SG; struct sk_buff *segs; + int drop_count; + + /* + * Segmentation in UDP receive path is only for UDP GRO, drop udp + * fragmentation offload (UFO) packets. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) { + drop_count = 1; + goto drop; + } /* Avoid csum recalculation by skb_segment unless userspace explicitly * asks for the final checksum values @@ -609,23 +615,22 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk, */ segs = __skb_gso_segment(skb, features, false); if (IS_ERR_OR_NULL(segs)) { - int segs_nr = skb_shinfo(skb)->gso_segs; - - atomic_add(segs_nr, &sk->sk_drops); - SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr); - kfree_skb(skb); - return NULL; + drop_count = skb_shinfo(skb)->gso_segs; + goto drop; } consume_skb(skb); return segs; + +drop: + sk_drops_add(sk, drop_count); + SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count); + kfree_skb(skb); + return NULL; } static inline void udp_post_segment_fix_csum(struct sk_buff *skb) { - /* UDP-lite can't land here - no GRO */ - WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov); - /* UDP packets generated with UDP_SEGMENT and traversing: * * UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx) @@ -639,7 +644,6 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb) * a valid csum after the segmentation. * Additionally fixup the UDP CB. */ - UDP_SKB_CB(skb)->cscov = skb->len; if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid) skb->csum_valid = 1; } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index a93dc51f6323..47c23d4a1740 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -7,9 +7,13 @@ #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> -#include <net/ipv6_stubs.h> #endif +#define UDP_TUNNEL_PARTIAL_FEATURES NETIF_F_GSO_ENCAP_ALL +#define UDP_TUNNEL_STRIPPED_GSO_TYPES ((UDP_TUNNEL_PARTIAL_FEATURES | \ + NETIF_F_GSO_PARTIAL) >> \ + NETIF_F_GSO_SHIFT) + struct udp_port_cfg { u8 family; @@ -47,7 +51,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { - return 0; + return -EPFNOSUPPORT; } #endif @@ -130,35 +134,47 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type); void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); -static inline void udp_tunnel_get_rx_info(struct net_device *dev) -{ - ASSERT_RTNL(); - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); -} - -static inline void udp_tunnel_drop_rx_info(struct net_device *dev) -{ - ASSERT_RTNL(); - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); -} - /* Transmit the skb using UDP encapsulation. */ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck); + bool xnet, bool nocheck, u16 ipcb_flags); + +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags); + +static inline bool udp_tunnel_handle_partial(struct sk_buff *skb) +{ + bool double_encap = !!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck); + /* + * If the skb went through partial segmentation, lower devices + * will not need to offload the related features - except for + * UDP_TUNNEL, that will be re-added by the later + * udp_tunnel_handle_offloads(). + */ + if (double_encap) + skb_shinfo(skb)->gso_type &= ~UDP_TUNNEL_STRIPPED_GSO_TYPES; + return double_encap; +} + +static inline void udp_tunnel_set_inner_protocol(struct sk_buff *skb, + bool double_encap, + __be16 inner_proto) +{ + /* + * The inner protocol has been set by the nested tunnel, don't + * overraid it. + */ + if (!double_encap) + skb_set_inner_protocol(skb, inner_proto); +} void udp_tunnel_sock_release(struct socket *sock); @@ -191,6 +207,21 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) } #endif +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add); +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add); +#else +static inline void udp_tunnel_update_gro_lookup(struct net *net, + struct sock *sk, bool add) {} +static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {} +#endif + +static inline void udp_tunnel_cleanup_gro(struct sock *sk) +{ + udp_tunnel_update_gro_rcv(sk, false); + udp_tunnel_update_gro_lookup(sock_net(sk), sk, false); +} + static inline void udp_tunnel_encap_enable(struct sock *sk) { if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) @@ -198,7 +229,7 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) #if IS_ENABLED(CONFIG_IPV6) if (READ_ONCE(sk->sk_family) == PF_INET6) - ipv6_stub->udpv6_encap_enable(); + udpv6_encap_enable(); #endif udp_encap_enable(); } @@ -206,19 +237,17 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) #define UDP_TUNNEL_NIC_MAX_TABLES 4 enum udp_tunnel_nic_info_flags { - /* Device callbacks may sleep */ - UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0), /* Device only supports offloads when it's open, all ports * will be removed before close and re-added after open. */ - UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1), + UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(0), /* Device supports only IPv4 tunnels */ - UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2), + UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(1), /* Device has hard-coded the IANA VXLAN port (4789) as VXLAN. * This port must not be counted towards n_entries of any table. * Driver will not receive any callback associated with port 4789. */ - UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(3), + UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(2), }; struct udp_tunnel_nic; @@ -309,6 +338,9 @@ struct udp_tunnel_nic_ops { size_t (*dump_size)(struct net_device *dev, unsigned int table); int (*dump_write)(struct net_device *dev, unsigned int table, struct sk_buff *skb); + void (*assert_locked)(struct net_device *dev); + void (*lock)(struct net_device *dev); + void (*unlock)(struct net_device *dev); }; #ifdef CONFIG_INET @@ -337,8 +369,28 @@ static inline void udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { - if (udp_tunnel_nic_ops) + if (udp_tunnel_nic_ops) { + udp_tunnel_nic_ops->assert_locked(dev); udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv); + } +} + +static inline void udp_tunnel_nic_assert_locked(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->assert_locked(dev); +} + +static inline void udp_tunnel_nic_lock(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->lock(dev); +} + +static inline void udp_tunnel_nic_unlock(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->unlock(dev); } static inline void @@ -380,17 +432,50 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) static inline size_t udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { + size_t ret; + if (!udp_tunnel_nic_ops) return 0; - return udp_tunnel_nic_ops->dump_size(dev, table); + + udp_tunnel_nic_ops->lock(dev); + ret = udp_tunnel_nic_ops->dump_size(dev, table); + udp_tunnel_nic_ops->unlock(dev); + + return ret; } static inline int udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { + int ret; + if (!udp_tunnel_nic_ops) return 0; - return udp_tunnel_nic_ops->dump_write(dev, table, skb); + + udp_tunnel_nic_ops->lock(dev); + ret = udp_tunnel_nic_ops->dump_write(dev, table, skb); + udp_tunnel_nic_ops->unlock(dev); + + return ret; } + +static inline void udp_tunnel_get_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + udp_tunnel_nic_assert_locked(dev); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); +} + +static inline void udp_tunnel_drop_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + udp_tunnel_nic_assert_locked(dev); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); +} + #endif diff --git a/include/net/udplite.h b/include/net/udplite.h deleted file mode 100644 index 786919d29f8d..000000000000 --- a/include/net/udplite.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Definitions for the UDP-Lite (RFC 3828) code. - */ -#ifndef _UDPLITE_H -#define _UDPLITE_H - -#include <net/ip6_checksum.h> -#include <net/udp.h> - -/* UDP-Lite socket options */ -#define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */ -#define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */ - -extern struct proto udplite_prot; -extern struct udp_table udplite_table; - -/* - * Checksum computation is all in software, hence simpler getfrag. - */ -static __inline__ int udplite_getfrag(void *from, char *to, int offset, - int len, int odd, struct sk_buff *skb) -{ - struct msghdr *msg = from; - return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT; -} - -/* - * Checksumming routines - */ -static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) -{ - u16 cscov; - - /* In UDPv4 a zero checksum means that the transmitter generated no - * checksum. UDP-Lite (like IPv6) mandates checksums, hence packets - * with a zero checksum field are illegal. */ - if (uh->check == 0) { - net_dbg_ratelimited("UDPLite: zeroed checksum field\n"); - return 1; - } - - cscov = ntohs(uh->len); - - if (cscov == 0) /* Indicates that full coverage is required. */ - ; - else if (cscov < 8 || cscov > skb->len) { - /* - * Coverage length violates RFC 3828: log and discard silently. - */ - net_dbg_ratelimited("UDPLite: bad csum coverage %d/%d\n", - cscov, skb->len); - return 1; - - } else if (cscov < skb->len) { - UDP_SKB_CB(skb)->partial_cov = 1; - UDP_SKB_CB(skb)->cscov = cscov; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - skb->csum_valid = 0; - } - - return 0; -} - -/* Fast-path computation of checksum. Socket may not be locked. */ -static inline __wsum udplite_csum(struct sk_buff *skb) -{ - const int off = skb_transport_offset(skb); - const struct sock *sk = skb->sk; - int len = skb->len - off; - - if (udp_test_bit(UDPLITE_SEND_CC, sk)) { - u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen); - - if (pcslen < len) { - if (pcslen > 0) - len = pcslen; - udp_hdr(skb)->len = htons(pcslen); - } - } - skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ - - return skb_checksum(skb, off, len, 0); -} - -void udplite4_register(void); -#endif /* _UDPLITE_H */ diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h index cf8cc140d68d..c3f4cc206198 100644 --- a/include/net/vsock_addr.h +++ b/include/net/vsock_addr.h @@ -16,7 +16,7 @@ bool vsock_addr_bound(const struct sockaddr_vm *addr); void vsock_addr_unbind(struct sockaddr_vm *addr); bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, const struct sockaddr_vm *other); -int vsock_addr_cast(const struct sockaddr *addr, size_t len, +int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len, struct sockaddr_vm **out_addr); #endif diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 2dd23ee2bacd..0ee50785f4f1 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -296,7 +296,7 @@ struct vxlan_dev { struct vxlan_rdst default_dst; /* default destination */ struct timer_list age_timer; - spinlock_t hash_lock[FDB_HASH_SIZE]; + spinlock_t hash_lock; unsigned int addrcnt; struct gro_cells gro_cells; @@ -304,9 +304,10 @@ struct vxlan_dev { struct vxlan_vni_group __rcu *vnigrp; - struct hlist_head fdb_head[FDB_HASH_SIZE]; + struct rhashtable fdb_hash_tbl; struct rhashtable mdb_tbl; + struct hlist_head fdb_list; struct hlist_head mdb_list; unsigned int mdb_seq; }; @@ -331,6 +332,7 @@ struct vxlan_dev { #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 +#define VXLAN_F_MC_ROUTE 0x100000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -352,7 +354,9 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ - VXLAN_F_LOCALBYPASS) + VXLAN_F_LOCALBYPASS | \ + VXLAN_F_MC_ROUTE | \ + 0) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/net/x25.h b/include/net/x25.h index 5e833cfc864e..414f3fd99345 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -203,7 +203,6 @@ void x25_send_frame(struct sk_buff *, struct x25_neigh *); int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void x25_establish_link(struct x25_neigh *); -void x25_terminate_link(struct x25_neigh *); /* x25_facilities.c */ int x25_parse_facilities(struct sk_buff *, struct x25_facilities *, diff --git a/include/net/xdp.h b/include/net/xdp.h index 4dafc5e021f1..aa742f413c35 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -76,6 +76,11 @@ enum xdp_buff_flags { XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ + /* frags have unreadable mem, this can't be true for real XDP packets, + * but drivers may use XDP helpers to construct Rx pkt state even when + * XDP program is not attached. + */ + XDP_FLAGS_FRAGS_UNREADABLE = BIT(2), }; struct xdp_buff { @@ -85,8 +90,20 @@ struct xdp_buff { void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; - u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ - u32 flags; /* supported values defined in xdp_buff_flags */ + + union { + struct { + /* frame size to deduce data_hard_end/tailroom */ + u32 frame_sz; + /* supported values defined in xdp_buff_flags */ + u32 flags; + }; + +#ifdef __LITTLE_ENDIAN + /* Used to micro-optimize xdp_init_buff(), don't use directly */ + u64 frame_sz_flags_init; +#endif + }; }; static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) @@ -104,23 +121,42 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } -static __always_inline bool -xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) +static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { - return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } -static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline void xdp_buff_set_frag_unreadable(struct xdp_buff *xdp) { - xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; + xdp->flags |= XDP_FLAGS_FRAGS_UNREADABLE; +} + +static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp) +{ + return xdp->flags; +} + +static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { - xdp->frame_sz = frame_sz; xdp->rxq = rxq; + +#ifdef __LITTLE_ENDIAN + /* + * Force the compilers to initialize ::flags and assign ::frame_sz with + * one write on 64-bit LE architectures as they're often unable to do + * it themselves. + */ + xdp->frame_sz_flags_init = frame_sz; +#else + xdp->frame_sz = frame_sz; xdp->flags = 0; +#endif } static __always_inline void @@ -249,6 +285,8 @@ static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, if (unlikely(netmem_is_pfmemalloc(netmem))) xdp_buff_set_frag_pfmemalloc(xdp); + if (unlikely(netmem_is_net_iov(netmem))) + xdp_buff_set_frag_unreadable(xdp); return true; } @@ -272,10 +310,10 @@ static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } -static __always_inline bool -xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) +static __always_inline u32 +xdp_frame_get_skb_flags(const struct xdp_frame *frame) { - return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + return frame->flags; } #define XDP_BULK_QUEUE_SIZE 16 @@ -312,9 +350,9 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) } static inline void -xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, - unsigned int size, unsigned int truesize, - bool pfmemalloc) +xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags, + unsigned int size, unsigned int truesize, + u32 xdp_flags) { struct skb_shared_info *sinfo = skb_shinfo(skb); @@ -328,7 +366,8 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, skb->len += size; skb->data_len += size; skb->truesize += truesize; - skb->pfmemalloc |= pfmemalloc; + skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); + skb->unreadable |= !!(xdp_flags & XDP_FLAGS_FRAGS_UNREADABLE); } /* Avoids inlining WARN macro in fast-path */ @@ -343,7 +382,6 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); -int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline @@ -617,8 +655,12 @@ struct xdp_metadata_ops { u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); +void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); +void xdp_features_set_redirect_target_locked(struct net_device *dev, + bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); +void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index bfe625b55d55..ebac60a3d8a1 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -14,7 +14,7 @@ #include <linux/mm.h> #include <net/sock.h> -#define XDP_UMEM_SG_FLAG (1 << 1) +#define XDP_UMEM_SG_FLAG BIT(3) struct net_device; struct xsk_queue; @@ -71,9 +71,6 @@ struct xdp_sock { */ u32 tx_budget_spent; - /* Protects generic receive. */ - spinlock_t rx_lock; - /* Statistics */ u64 rx_dropped; u64 rx_queue_full; @@ -87,6 +84,7 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + u32 max_tx_budget; /* Protects multiple processes in the control path */ struct mutex mutex; struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ @@ -110,11 +108,16 @@ struct xdp_sock { * indicates position where checksumming should start. * csum_offset indicates position where checksum should be stored. * + * void (*tmo_request_launch_time)(u64 launch_time, void *priv) + * Called when AF_XDP frame requested launch time HW offload support. + * launch_time indicates the PTP time at which the device can schedule the + * packet for transmission. */ struct xsk_tx_metadata_ops { void (*tmo_request_timestamp)(void *priv); u64 (*tmo_fill_timestamp)(void *priv); void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); + void (*tmo_request_launch_time)(u64 launch_time, void *priv); }; #ifdef CONFIG_XDP_SOCKETS @@ -122,6 +125,7 @@ struct xsk_tx_metadata_ops { int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(struct list_head *flush_list); +INDIRECT_CALLABLE_DECLARE(void xsk_destruct_skb(struct sk_buff *)); /** * xsk_tx_metadata_to_compl - Save enough relevant metadata information @@ -162,6 +166,11 @@ static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, if (!meta) return; + if (ops->tmo_request_launch_time) + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + ops->tmo_request_launch_time(meta->request.launch_time, + priv); + if (ops->tmo_request_timestamp) if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) ops->tmo_request_timestamp(priv); @@ -210,6 +219,12 @@ static inline void __xsk_map_flush(struct list_head *flush_list) { } +#ifdef CONFIG_MITIGATION_RETPOLINE +static inline void xsk_destruct_skb(struct sk_buff *skb) +{ +} +#endif + static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, struct xsk_tx_metadata_compl *compl) { diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 784cd34f5bba..46797645a0c2 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -12,6 +12,10 @@ #define XDP_UMEM_MIN_CHUNK_SHIFT 11 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) +#define NETDEV_XDP_ACT_XSK (NETDEV_XDP_ACT_BASIC | \ + NETDEV_XDP_ACT_REDIRECT | \ + NETDEV_XDP_ACT_XSK_ZEROCOPY) + struct xsk_cb_desc { void *src; u8 off; @@ -37,16 +41,42 @@ static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) return XDP_PACKET_HEADROOM + pool->headroom; } +static inline u32 xsk_pool_get_tailroom(bool mbuf) +{ + return mbuf ? SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : 0; +} + static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return pool->chunk_size; } -static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) +static inline u32 __xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) +{ + u32 frame_size = __xsk_pool_get_rx_frame_size(pool); + struct xdp_umem *umem = pool->umem; + bool mbuf; + + /* Reserve tailroom only for zero-copy pools that opted into + * multi-buffer. The reserved area is used for skb_shared_info, + * matching the XDP core's xdp_data_hard_end() layout. + */ + mbuf = pool->dev && (umem->flags & XDP_UMEM_SG_FLAG); + frame_size -= xsk_pool_get_tailroom(mbuf); + + return ALIGN_DOWN(frame_size, 128); +} + +static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) +{ + return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool); +} + static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { @@ -118,7 +148,7 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) goto out; list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { - list_del(&pos->list_node); + list_del_init(&pos->list_node); xp_free(pos); } @@ -153,18 +183,28 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) frag = list_first_entry_or_null(&xskb->pool->xskb_list, struct xdp_buff_xsk, list_node); if (frag) { - list_del(&frag->list_node); + list_del_init(&frag->list_node); ret = &frag->xdp; } return ret; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { - struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + list_del_init(&xskb->list_node); +} - list_del(&xskb->list_node); +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) +{ + struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); + struct xdp_buff_xsk *frag; + + frag = list_first_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, + list_node); + return &frag->xdp; } static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) @@ -196,9 +236,27 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +/** + * xsk_buff_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for + * details. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return xp_raw_get_ctx(pool, addr); +} + #define XDP_TXMD_FLAGS_VALID ( \ XDP_TXMD_FLAGS_TIMESTAMP | \ XDP_TXMD_FLAGS_CHECKSUM | \ + XDP_TXMD_FLAGS_LAUNCH_TIME | \ 0) static inline bool @@ -207,20 +265,27 @@ xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } -static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) { struct xsk_tx_metadata *meta; if (!pool->tx_metadata_len) return NULL; - meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; + meta = data - pool->tx_metadata_len; if (unlikely(!xsk_buff_valid_tx_metadata(meta))) return NULL; /* no way to signal the error to the user */ return meta; } +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr)); +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); @@ -298,6 +363,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) return 0; } +static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool) +{ + return 0; +} + static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { @@ -364,10 +434,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) return NULL; } -static inline void xsk_buff_del_tail(struct xdp_buff *tail) +static inline void xsk_buff_del_frag(struct xdp_buff *xdp) { } +static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first) +{ + return NULL; +} + static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) { return NULL; @@ -388,12 +463,25 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return (struct xdp_desc_ctx){ }; +} + static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) { return false; } -static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +static inline struct xsk_tx_metadata * +__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data) +{ + return NULL; +} + +static inline struct xsk_tx_metadata * +xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return NULL; } diff --git a/include/net/xfrm.h b/include/net/xfrm.h index ed4b83696c77..874409127e29 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -147,8 +147,19 @@ enum { }; struct xfrm_dev_offload { + /* The device for this offload. + * Device drivers should not use this directly, as that will prevent + * them from working with bonding device. Instead, the device passed + * to the add/delete callbacks should be used. + */ struct net_device *dev; netdevice_tracker dev_tracker; + /* This is a private pointer used by the bonding driver (and eventually + * should be moved there). Device drivers should not use it. + * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases, + * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock + * is held. + */ struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; @@ -236,7 +247,6 @@ struct xfrm_state { /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; - struct sock __rcu *encap_sk; /* NAT keepalive */ u32 nat_keepalive_interval; /* seconds */ @@ -431,7 +441,6 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); -void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { struct module *owner; @@ -464,6 +473,15 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load); +static inline void xfrm_unset_type_offload(struct xfrm_state *x) +{ + if (!x->type_offload) + return; + + module_put(x->type_offload->owner); + x->type_offload = NULL; +} /** * struct xfrm_mode_cbs - XFRM mode callbacks @@ -518,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family) static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { - if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || + if ((x->sel.family != AF_UNSPEC) || + (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) return &x->inner_mode; else @@ -696,6 +715,7 @@ struct xfrm_mgr { const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, + struct net *net, const struct xfrm_encap_tmpl *encap); bool (*is_alive)(const struct km_event *c); }; @@ -897,7 +917,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) xfrm_pol_put(pols[i]); } -void __xfrm_state_destroy(struct xfrm_state *, bool); +void __xfrm_state_destroy(struct xfrm_state *); static inline void __xfrm_state_put(struct xfrm_state *x) { @@ -907,13 +927,7 @@ static inline void __xfrm_state_put(struct xfrm_state *x) static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, false); -} - -static inline void xfrm_state_put_sync(struct xfrm_state *x) -{ - if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, true); + __xfrm_state_destroy(x); } static inline void xfrm_state_hold(struct xfrm_state *x) @@ -1143,19 +1157,19 @@ struct xfrm_offload { #define CRYPTO_INVALID_PROTOCOL 128 /* Used to keep whole l2 header for transport mode GRO */ - __u32 orig_mac_len; + __u16 orig_mac_len; __u8 proto; __u8 inner_ipproto; }; struct sec_path { - int len; - int olen; - int verified_cnt; - struct xfrm_state *xvec[XFRM_MAX_DEPTH]; struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH]; + + u8 len; + u8 olen; + u8 verified_cnt; }; struct sec_path *secpath_set(struct sk_buff *skb); @@ -1751,7 +1765,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid); @@ -1760,8 +1774,7 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, - struct netlink_ext_ack *extack); +int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); @@ -1773,6 +1786,15 @@ int xfrm_trans_queue(struct sk_buff *skb, struct sk_buff *)); int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); +int xfrm4_tunnel_check_size(struct sk_buff *skb); +#if IS_ENABLED(CONFIG_IPV6) +int xfrm6_tunnel_check_size(struct sk_buff *skb); +#else +static inline int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + return -EMSGSIZE; +} +#endif #if IS_ENABLED(CONFIG_NET_PKTGEN) int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); @@ -1870,18 +1892,22 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, - const struct xfrm_kmaddress *k, + const struct xfrm_kmaddress *k, struct net *net, const struct xfrm_encap_tmpl *encap); struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, + struct net *net, + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, - struct netlink_ext_ack *extack); + struct netlink_ext_ack *extack, + struct xfrm_user_offload *xuo); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 50779406bc2d..ccb3b350001f 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -53,6 +53,8 @@ struct xsk_buff_pool { refcount_t users; struct xdp_umem *umem; struct work_struct work; + /* Protects generic receive in shared and non-shared umem mode. */ + spinlock_t rx_lock; struct list_head free_list; struct list_head xskb_list; u32 heads_cnt; @@ -83,11 +85,11 @@ struct xsk_buff_pool { bool unaligned; bool tx_sw_csum; void *addrs; - /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: - * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when - * sockets share a single cq when the same netdev and queue id is shared. + /* Mutual exclusion of the completion ring in the SKB mode. + * Protect: NAPI TX thread and sendmsg error paths in the SKB + * destructor callback. */ - spinlock_t cq_lock; + spinlock_t cq_prod_lock; struct xdp_buff_xsk *free_heads[]; }; @@ -141,6 +143,14 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); + +struct xdp_desc_ctx { + dma_addr_t dma; + struct xsk_tx_metadata *meta; +}; + +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr); + static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) { return xskb->dma; @@ -164,13 +174,6 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_sync_single_for_device(pool->dev, dma, size, DMA_BIDIRECTIONAL); } -/* Masks for xdp_umem_page flags. - * The low 12-bits of the addr will be 0 since this is the page address, so we - * can use them for flags. - */ -#define XSK_NEXT_PG_CONTIG_SHIFT 0 -#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) - static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, u64 addr, u32 len) { @@ -230,8 +233,8 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, return orig_addr; offset = xskb->xdp.data - xskb->xdp.data_hard_start; - orig_addr -= offset; offset += pool->headroom; + orig_addr -= offset; return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } |
