summaryrefslogtreecommitdiff
path: root/include/net
diff options
context:
space:
mode:
Diffstat (limited to 'include/net')
-rw-r--r--include/net/9p/9p.h38
-rw-r--r--include/net/9p/client.h98
-rw-r--r--include/net/9p/transport.h15
-rw-r--r--include/net/Space.h5
-rw-r--r--include/net/act_api.h43
-rw-r--r--include/net/addrconf.h8
-rw-r--r--include/net/af_rxrpc.h56
-rw-r--r--include/net/af_unix.h83
-rw-r--r--include/net/af_vsock.h82
-rw-r--r--include/net/aligned_data.h22
-rw-r--r--include/net/atmclip.h53
-rw-r--r--include/net/ax25.h483
-rw-r--r--include/net/bluetooth/bluetooth.h71
-rw-r--r--include/net/bluetooth/hci.h490
-rw-r--r--include/net/bluetooth/hci_core.h322
-rw-r--r--include/net/bluetooth/hci_drv.h76
-rw-r--r--include/net/bluetooth/hci_mon.h2
-rw-r--r--include/net/bluetooth/hci_sync.h15
-rw-r--r--include/net/bluetooth/l2cap.h14
-rw-r--r--include/net/bluetooth/mgmt.h16
-rw-r--r--include/net/bond_3ad.h5
-rw-r--r--include/net/bond_options.h2
-rw-r--r--include/net/bonding.h35
-rw-r--r--include/net/busy_poll.h21
-rw-r--r--include/net/caif/caif_dev.h128
-rw-r--r--include/net/caif/caif_device.h55
-rw-r--r--include/net/caif/caif_layer.h277
-rw-r--r--include/net/caif/cfcnfg.h90
-rw-r--r--include/net/caif/cfctrl.h130
-rw-r--r--include/net/caif/cffrml.h21
-rw-r--r--include/net/caif/cfmuxl.h20
-rw-r--r--include/net/caif/cfpkt.h232
-rw-r--r--include/net/caif/cfserl.h13
-rw-r--r--include/net/caif/cfsrvl.h61
-rw-r--r--include/net/can.h28
-rw-r--r--include/net/cfg80211.h1209
-rw-r--r--include/net/checksum.h14
-rw-r--r--include/net/cls_cgroup.h2
-rw-r--r--include/net/codel_impl.h46
-rw-r--r--include/net/devlink.h127
-rw-r--r--include/net/dropreason-core.h126
-rw-r--r--include/net/dropreason-qdisc.h114
-rw-r--r--include/net/dropreason.h12
-rw-r--r--include/net/dsa.h57
-rw-r--r--include/net/dst.h64
-rw-r--r--include/net/dst_metadata.h18
-rw-r--r--include/net/fib_rules.h29
-rw-r--r--include/net/flow.h13
-rw-r--r--include/net/flow_offload.h34
-rw-r--r--include/net/fq_impl.h2
-rw-r--r--include/net/genetlink.h6
-rw-r--r--include/net/gro.h108
-rw-r--r--include/net/hotdata.h14
-rw-r--r--include/net/icmp.h10
-rw-r--r--include/net/ieee80211_radiotap.h20
-rw-r--r--include/net/inet6_connection_sock.h10
-rw-r--r--include/net/inet6_hashtables.h26
-rw-r--r--include/net/inet_common.h16
-rw-r--r--include/net/inet_connection_sock.h76
-rw-r--r--include/net/inet_dscp.h6
-rw-r--r--include/net/inet_ecn.h20
-rw-r--r--include/net/inet_frag.h24
-rw-r--r--include/net/inet_hashtables.h81
-rw-r--r--include/net/inet_sock.h33
-rw-r--r--include/net/inet_timewait_sock.h11
-rw-r--r--include/net/ioam6.h2
-rw-r--r--include/net/ip.h52
-rw-r--r--include/net/ip6_checksum.h2
-rw-r--r--include/net/ip6_fib.h57
-rw-r--r--include/net/ip6_route.h59
-rw-r--r--include/net/ip6_tunnel.h19
-rw-r--r--include/net/ip_fib.h9
-rw-r--r--include/net/ip_tunnels.h90
-rw-r--r--include/net/ip_vs.h444
-rw-r--r--include/net/ipcomp.h13
-rw-r--r--include/net/ipv6.h211
-rw-r--r--include/net/ipv6_frag.h14
-rw-r--r--include/net/ipv6_stubs.h102
-rw-r--r--include/net/iucv/iucv.h209
-rw-r--r--include/net/kcm.h1
-rw-r--r--include/net/l3mdev.h34
-rw-r--r--include/net/libeth/rx.h75
-rw-r--r--include/net/libeth/tx.h36
-rw-r--r--include/net/libeth/types.h106
-rw-r--r--include/net/libeth/xdp.h1870
-rw-r--r--include/net/libeth/xsk.h688
-rw-r--r--include/net/lwtunnel.h9
-rw-r--r--include/net/mac80211.h465
-rw-r--r--include/net/macsec.h7
-rw-r--r--include/net/mana/gdma.h174
-rw-r--r--include/net/mana/hw_channel.h11
-rw-r--r--include/net/mana/mana.h230
-rw-r--r--include/net/mana/shm_channel.h6
-rw-r--r--include/net/mctp.h63
-rw-r--r--include/net/mptcp.h33
-rw-r--r--include/net/ndisc.h40
-rw-r--r--include/net/neighbour.h47
-rw-r--r--include/net/net_namespace.h31
-rw-r--r--include/net/net_shaper.h1
-rw-r--r--include/net/netdev_lock.h138
-rw-r--r--include/net/netdev_netlink.h12
-rw-r--r--include/net/netdev_queues.h137
-rw-r--r--include/net/netdev_rx_queue.h36
-rw-r--r--include/net/netfilter/ipv4/nf_conntrack_ipv4.h6
-rw-r--r--include/net/netfilter/ipv4/nf_reject.h8
-rw-r--r--include/net/netfilter/ipv6/nf_reject.h10
-rw-r--r--include/net/netfilter/nf_conntrack.h18
-rw-r--r--include/net/netfilter/nf_conntrack_core.h5
-rw-r--r--include/net/netfilter/nf_conntrack_count.h16
-rw-r--r--include/net/netfilter/nf_conntrack_expect.h23
-rw-r--r--include/net/netfilter/nf_conntrack_l4proto.h22
-rw-r--r--include/net/netfilter/nf_conntrack_timeout.h1
-rw-r--r--include/net/netfilter/nf_conntrack_tuple.h2
-rw-r--r--include/net/netfilter/nf_dup_netdev.h13
-rw-r--r--include/net/netfilter/nf_flow_table.h32
-rw-r--r--include/net/netfilter/nf_log.h3
-rw-r--r--include/net/netfilter/nf_queue.h4
-rw-r--r--include/net/netfilter/nf_reject.h1
-rw-r--r--include/net/netfilter/nf_tables.h131
-rw-r--r--include/net/netfilter/nf_tables_core.h54
-rw-r--r--include/net/netfilter/nf_tables_ipv4.h17
-rw-r--r--include/net/netfilter/nf_tables_ipv6.h20
-rw-r--r--include/net/netfilter/nf_tables_offload.h10
-rw-r--r--include/net/netfilter/nft_fib.h32
-rw-r--r--include/net/netfilter/nft_meta.h3
-rw-r--r--include/net/netlink.h70
-rw-r--r--include/net/netmem.h252
-rw-r--r--include/net/netns/conntrack.h13
-rw-r--r--include/net/netns/core.h2
-rw-r--r--include/net/netns/ipv4.h43
-rw-r--r--include/net/netns/ipv6.h15
-rw-r--r--include/net/netns/mctp.h20
-rw-r--r--include/net/netns/mib.h5
-rw-r--r--include/net/netns/mpls.h2
-rw-r--r--include/net/netns/nftables.h1
-rw-r--r--include/net/netns/sctp.h4
-rw-r--r--include/net/netns/smc.h5
-rw-r--r--include/net/netns/vsock.h26
-rw-r--r--include/net/netns/xfrm.h2
-rw-r--r--include/net/netrom.h273
-rw-r--r--include/net/nexthop.h2
-rw-r--r--include/net/nfc/nci_core.h2
-rw-r--r--include/net/nfc/nfc.h2
-rw-r--r--include/net/nl802154.h5
-rw-r--r--include/net/nsh.h4
-rw-r--r--include/net/p8022.h16
-rw-r--r--include/net/page_pool/helpers.h42
-rw-r--r--include/net/page_pool/memory_provider.h47
-rw-r--r--include/net/page_pool/types.h26
-rw-r--r--include/net/pfcp.h2
-rw-r--r--include/net/phy/realtek_phy.h7
-rw-r--r--include/net/pie.h2
-rw-r--r--include/net/ping.h8
-rw-r--r--include/net/pkt_cls.h2
-rw-r--r--include/net/pkt_sched.h60
-rw-r--r--include/net/proto_memory.h7
-rw-r--r--include/net/psp.h12
-rw-r--r--include/net/psp/functions.h209
-rw-r--r--include/net/psp/types.h216
-rw-r--r--include/net/raw.h1
-rw-r--r--include/net/request_sock.h18
-rw-r--r--include/net/rose.h247
-rw-r--r--include/net/route.h11
-rw-r--r--include/net/rps-types.h24
-rw-r--r--include/net/rps.h138
-rw-r--r--include/net/rstreason.h2
-rw-r--r--include/net/rtnetlink.h40
-rw-r--r--include/net/sch_generic.h238
-rw-r--r--include/net/sch_priv.h27
-rw-r--r--include/net/scm.h125
-rw-r--r--include/net/sctp/auth.h18
-rw-r--r--include/net/sctp/checksum.h32
-rw-r--r--include/net/sctp/constants.h9
-rw-r--r--include/net/sctp/sctp.h9
-rw-r--r--include/net/sctp/sm.h1
-rw-r--r--include/net/sctp/stream_sched.h4
-rw-r--r--include/net/sctp/structs.h51
-rw-r--r--include/net/secure_seq.h49
-rw-r--r--include/net/seg6_hmac.h20
-rw-r--r--include/net/selftests.h45
-rw-r--r--include/net/smc.h104
-rw-r--r--include/net/snmp.h10
-rw-r--r--include/net/sock.h444
-rw-r--r--include/net/strparser.h2
-rw-r--r--include/net/switchdev.h1
-rw-r--r--include/net/tc_act/tc_connmark.h1
-rw-r--r--include/net/tc_act/tc_csum.h10
-rw-r--r--include/net/tc_act/tc_ct.h11
-rw-r--r--include/net/tc_act/tc_ctinfo.h7
-rw-r--r--include/net/tc_act/tc_gate.h38
-rw-r--r--include/net/tc_act/tc_ife.h4
-rw-r--r--include/net/tc_act/tc_mpls.h10
-rw-r--r--include/net/tc_act/tc_nat.h1
-rw-r--r--include/net/tc_act/tc_pedit.h2
-rw-r--r--include/net/tc_act/tc_police.h12
-rw-r--r--include/net/tc_act/tc_sample.h9
-rw-r--r--include/net/tc_act/tc_skbedit.h1
-rw-r--r--include/net/tc_act/tc_skbmod.h1
-rw-r--r--include/net/tc_act/tc_tunnel_key.h1
-rw-r--r--include/net/tc_act/tc_vlan.h10
-rw-r--r--include/net/tc_wrapper.h47
-rw-r--r--include/net/tcp.h520
-rw-r--r--include/net/tcp_ao.h1
-rw-r--r--include/net/tcp_ecn.h675
-rw-r--r--include/net/tcx.h1
-rw-r--r--include/net/timewait_sock.h7
-rw-r--r--include/net/tls.h28
-rw-r--r--include/net/transp_v6.h3
-rw-r--r--include/net/tso.h100
-rw-r--r--include/net/udp.h134
-rw-r--r--include/net/udp_tunnel.h155
-rw-r--r--include/net/udplite.h88
-rw-r--r--include/net/vsock_addr.h2
-rw-r--r--include/net/vxlan.h10
-rw-r--r--include/net/x25.h1
-rw-r--r--include/net/xdp.h74
-rw-r--r--include/net/xdp_sock.h23
-rw-r--r--include/net/xdp_sock_drv.h108
-rw-r--r--include/net/xfrm.h70
-rw-r--r--include/net/xsk_buff_pool.h27
220 files changed, 11675 insertions, 4993 deletions
diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h
index 60cad0d200a4..fd7a034b8278 100644
--- a/include/net/9p/9p.h
+++ b/include/net/9p/9p.h
@@ -24,6 +24,8 @@
* @P9_DEBUG_PKT: packet marshalling/unmarshalling
* @P9_DEBUG_FSC: FS-cache tracing
* @P9_DEBUG_VPKT: Verbose packet debugging (full packet dump)
+ * @P9_DEBUG_CACHE: cache operations tracing
+ * @P9_DEBUG_MMAP: memory-mapped I/O tracing
*
* These flags are passed at mount time to turn on various levels of
* verbosity and tracing which will be output to the system logs.
@@ -68,13 +70,39 @@ void _p9_debug(enum p9_debug_flags level, const char *func,
* @P9_RSYMLINK: make symlink response
* @P9_TMKNOD: create a special file object request
* @P9_RMKNOD: create a special file object response
+ * @P9_TLOPEN: open a file for I/O (9P2000.L)
+ * @P9_RLOPEN: response with qid and iounit (9P2000.L)
* @P9_TLCREATE: prepare a handle for I/O on an new file for 9P2000.L
* @P9_RLCREATE: response with file access information for 9P2000.L
* @P9_TRENAME: rename request
* @P9_RRENAME: rename response
- * @P9_TMKDIR: create a directory request
- * @P9_RMKDIR: create a directory response
- * @P9_TVERSION: version handshake request
+ * @P9_TREADLINK: read symbolic link target (9P2000.L)
+ * @P9_RREADLINK: response with symbolic link target (9P2000.L)
+ * @P9_TGETATTR: get file attributes request (9P2000.L)
+ * @P9_RGETATTR: get file attributes response (9P2000.L)
+ * @P9_TSETATTR: set file attributes request (9P2000.L)
+ * @P9_RSETATTR: set file attributes response (9P2000.L)
+ * @P9_TXATTRWALK: prepare to read/list extended attributes (9P2000.L)
+ * @P9_RXATTRWALK: response with extended attribute size (9P2000.L)
+ * @P9_TXATTRCREATE: prepare to set extended attribute (9P2000.L)
+ * @P9_RXATTRCREATE: set extended attribute response (9P2000.L)
+ * @P9_TREADDIR: read directory entries request (9P2000.L)
+ * @P9_RREADDIR: read directory entries response (9P2000.L)
+ * @P9_TFSYNC: flush cached file data to storage request (9P2000.L)
+ * @P9_RFSYNC: flush cached file data to storage response (9P2000.L)
+ * @P9_TLOCK: acquire or release a POSIX record lock (9P2000.L)
+ * @P9_RLOCK: POSIX record lock response (9P2000.L)
+ * @P9_TGETLOCK: test for existence of POSIX record lock (9P2000.L)
+ * @P9_RGETLOCK: POSIX record lock test response (9P2000.L)
+ * @P9_TLINK: create a hard link (9P2000.L)
+ * @P9_RLINK: hard link response (9P2000.L)
+ * @P9_TRENAMEAT: safely rename across directories (9P2000.L)
+ * @P9_RRENAMEAT: rename response (9P2000.L)
+ * @P9_TUNLINKAT: unlink a file or directory (9P2000.L)
+ * @P9_RUNLINKAT: unlink response (9P2000.L)
+ * @P9_TMKDIR: create a directory request (9P2000.L)
+ * @P9_RMKDIR: create a directory response (9P2000.L)
+ * @P9_TVERSION: negotiate protocol version and message size
* @P9_RVERSION: version handshake response
* @P9_TAUTH: request to establish authentication channel
* @P9_RAUTH: response with authentication information
@@ -194,6 +222,10 @@ enum p9_msg_t {
* @P9_ORCLOSE: remove the file when the file is closed
* @P9_OAPPEND: open the file and seek to the end
* @P9_OEXCL: only create a file, do not open it
+ * @P9L_MODE_MASK: mask for protocol mode bits (client-side only)
+ * @P9L_DIRECT: disable client-side caching for this file
+ * @P9L_NOWRITECACHE: disable write caching for this file
+ * @P9L_LOOSE: enable loose cache consistency
*
* 9P open modes differ slightly from Posix standard modes.
* In particular, there are extra modes which specify different
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 4f785098c67a..838a94218b59 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -16,6 +16,12 @@
/* Number of requests per row */
#define P9_ROW_MAXTAG 255
+/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ +
+ * room for write (16 extra) or read (11 extra) operands.
+ */
+
+#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ)
+
/** enum p9_proto_versions - 9P protocol versions
* @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u
* @p9_proto_2000u: 9P2000.u extension
@@ -127,6 +133,96 @@ struct p9_client {
};
/**
+ * struct p9_fd_opts - holds client options during parsing
+ * @msize: maximum data size negotiated by protocol
+ * @prot-Oversion: 9P protocol version to use
+ * @trans_mod: module API instantiated with this client
+ *
+ * These parsed options get transferred into client in
+ * apply_client_options()
+ */
+struct p9_client_opts {
+ unsigned int msize;
+ unsigned char proto_version;
+ struct p9_trans_module *trans_mod;
+};
+
+/**
+ * struct p9_fd_opts - per-transport options for fd transport
+ * @rfd: file descriptor for reading (trans=fd)
+ * @wfd: file descriptor for writing (trans=fd)
+ * @port: port to connect to (trans=tcp)
+ * @privport: port is privileged
+ */
+struct p9_fd_opts {
+ int rfd;
+ int wfd;
+ u16 port;
+ bool privport;
+};
+
+/**
+ * struct p9_rdma_opts - Collection of mount options for rdma transport
+ * @port: port of connection
+ * @privport: Whether a privileged port may be used
+ * @sq_depth: The requested depth of the SQ. This really doesn't need
+ * to be any deeper than the number of threads used in the client
+ * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
+ * @timeout: Time to wait in msecs for CM events
+ */
+struct p9_rdma_opts {
+ short port;
+ bool privport;
+ int sq_depth;
+ int rq_depth;
+ long timeout;
+};
+
+/**
+ * struct p9_session_opts - holds parsed options for v9fs_session_info
+ * @flags: session options of type &p9_session_flags
+ * @nodev: set to 1 to disable device mapping
+ * @debug: debug level
+ * @afid: authentication handle
+ * @cache: cache mode of type &p9_cache_bits
+ * @cachetag: the tag of the cache associated with this session
+ * @uname: string user name to mount hierarchy as
+ * @aname: mount specifier for remote hierarchy
+ * @dfltuid: default numeric userid to mount hierarchy as
+ * @dfltgid: default numeric groupid to mount hierarchy as
+ * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
+ * @session_lock_timeout: retry interval for blocking locks
+ *
+ * This strucure holds options which are parsed and will be transferred
+ * to the v9fs_session_info structure when mounted, and therefore largely
+ * duplicates struct v9fs_session_info.
+ */
+struct p9_session_opts {
+ unsigned int flags;
+ unsigned char nodev;
+ unsigned short debug;
+ unsigned int afid;
+ unsigned int cache;
+#ifdef CONFIG_9P_FSCACHE
+ char *cachetag;
+#endif
+ char *uname;
+ char *aname;
+ kuid_t dfltuid;
+ kgid_t dfltgid;
+ kuid_t uid;
+ long session_lock_timeout;
+};
+
+/* Used by mount API to store parsed mount options */
+struct v9fs_context {
+ struct p9_client_opts client_opts;
+ struct p9_fd_opts fd_opts;
+ struct p9_rdma_opts rdma_opts;
+ struct p9_session_opts session_opts;
+};
+
+/**
* struct p9_fid - file system entity handle
* @clnt: back pointer to instantiating &p9_client
* @fid: numeric identifier for this handle
@@ -183,7 +279,7 @@ int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid,
const char *name);
int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name,
struct p9_fid *newdirfid, const char *new_name);
-struct p9_client *p9_client_create(const char *dev_name, char *options);
+struct p9_client *p9_client_create(struct fs_context *fc);
void p9_client_destroy(struct p9_client *clnt);
void p9_client_disconnect(struct p9_client *clnt);
void p9_client_begin_disconnect(struct p9_client *clnt);
diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h
index 766ec07c9599..a912bbaa862f 100644
--- a/include/net/9p/transport.h
+++ b/include/net/9p/transport.h
@@ -14,6 +14,13 @@
#define P9_DEF_MIN_RESVPORT (665U)
#define P9_DEF_MAX_RESVPORT (1023U)
+#define P9_FD_PORT 564
+
+#define P9_RDMA_PORT 5640
+#define P9_RDMA_SQ_DEPTH 32
+#define P9_RDMA_RQ_DEPTH 32
+#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */
+
/**
* struct p9_trans_module - transport module interface
* @list: used to maintain a list of currently available transports
@@ -24,6 +31,9 @@
* we're less flexible when choosing the response message
* size in this case
* @def: set if this transport should be considered the default
+ * @supports_vmalloc: set if this transport can work with vmalloc'd buffers
+ * (non-physically contiguous memory). Transports requiring
+ * DMA should leave this as false.
* @create: member function to create a new connection on this transport
* @close: member function to discard a connection on this transport
* @request: member function to issue a request to the transport
@@ -43,10 +53,11 @@ struct p9_trans_module {
char *name; /* name of transport */
int maxsize; /* max message size of transport */
bool pooled_rbuffers;
- int def; /* this transport should be default */
+ bool def; /* this transport should be default */
+ bool supports_vmalloc; /* can work with vmalloc'd buffers */
struct module *owner;
int (*create)(struct p9_client *client,
- const char *devname, char *args);
+ struct fs_context *fc);
void (*close)(struct p9_client *client);
int (*request)(struct p9_client *client, struct p9_req_t *req);
int (*cancel)(struct p9_client *client, struct p9_req_t *req);
diff --git a/include/net/Space.h b/include/net/Space.h
index ef42629f4258..6a0b6674d930 100644
--- a/include/net/Space.h
+++ b/include/net/Space.h
@@ -3,10 +3,5 @@
* ethernet adaptor have the name "eth[0123...]".
*/
-struct net_device *ultra_probe(int unit);
-struct net_device *wd_probe(int unit);
struct net_device *ne_probe(int unit);
-struct net_device *smc_init(int unit);
struct net_device *cs89x0_probe(int unit);
-struct net_device *tc515_probe(int unit);
-struct net_device *lance_probe(int unit);
diff --git a/include/net/act_api.h b/include/net/act_api.h
index 404df8557f6a..fd2967ee08f7 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -33,7 +33,10 @@ struct tc_action {
struct tcf_t tcfa_tm;
struct gnet_stats_basic_sync tcfa_bstats;
struct gnet_stats_basic_sync tcfa_bstats_hw;
- struct gnet_stats_queue tcfa_qstats;
+
+ atomic_t tcfa_drops;
+ atomic_t tcfa_overlimits;
+
struct net_rate_estimator __rcu *tcfa_rate_est;
spinlock_t tcfa_lock;
struct gnet_stats_basic_sync __percpu *cpu_bstats;
@@ -42,6 +45,7 @@ struct tc_action {
struct tc_cookie __rcu *user_cookie;
struct tcf_chain __rcu *goto_chain;
u32 tcfa_flags;
+ struct rcu_head tcfa_rcu;
u8 hw_stats;
u8 used_hw_stats;
bool used_hw_stats_valid;
@@ -53,7 +57,6 @@ struct tc_action {
#define tcf_action common.tcfa_action
#define tcf_tm common.tcfa_tm
#define tcf_bstats common.tcfa_bstats
-#define tcf_qstats common.tcfa_qstats
#define tcf_rate_est common.tcfa_rate_est
#define tcf_lock common.tcfa_lock
@@ -68,6 +71,7 @@ struct tc_action {
#define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2))
#define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3))
#define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4))
+#define TCA_ACT_FLAGS_AT_INGRESS_OR_CLSACT (1U << (TCA_ACT_FLAGS_USER_BITS + 5))
/* Update lastuse only if needed, to avoid dirtying a cache line.
* We use a temp variable to avoid fetching jiffies twice.
@@ -76,19 +80,24 @@ static inline void tcf_lastuse_update(struct tcf_t *tm)
{
unsigned long now = jiffies;
- if (tm->lastuse != now)
- tm->lastuse = now;
- if (unlikely(!tm->firstuse))
- tm->firstuse = now;
+ if (READ_ONCE(tm->lastuse) != now)
+ WRITE_ONCE(tm->lastuse, now);
+ if (unlikely(!READ_ONCE(tm->firstuse)))
+ WRITE_ONCE(tm->firstuse, now);
}
static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm)
{
- dtm->install = jiffies_to_clock_t(jiffies - stm->install);
- dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse);
- dtm->firstuse = stm->firstuse ?
- jiffies_to_clock_t(jiffies - stm->firstuse) : 0;
- dtm->expires = jiffies_to_clock_t(stm->expires);
+ unsigned long firstuse, now = jiffies;
+
+ dtm->install = jiffies_to_clock_t(now - READ_ONCE(stm->install));
+ dtm->lastuse = jiffies_to_clock_t(now - READ_ONCE(stm->lastuse));
+
+ firstuse = READ_ONCE(stm->firstuse);
+ dtm->firstuse = firstuse ?
+ jiffies_to_clock_t(now - firstuse) : 0;
+
+ dtm->expires = jiffies_to_clock_t(READ_ONCE(stm->expires));
}
static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats)
@@ -152,7 +161,7 @@ int tc_action_net_init(struct net *net, struct tc_action_net *tn,
{
int err = 0;
- tn->idrinfo = kmalloc(sizeof(*tn->idrinfo), GFP_KERNEL);
+ tn->idrinfo = kmalloc_obj(*tn->idrinfo);
if (!tn->idrinfo)
return -ENOMEM;
tn->ops = ops;
@@ -170,14 +179,12 @@ static inline void tc_action_net_exit(struct list_head *net_list,
{
struct net *net;
- rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
struct tc_action_net *tn = net_generic(net, id);
tcf_idrinfo_destroy(tn->ops, tn->idrinfo);
kfree(tn->idrinfo);
}
- rtnl_unlock();
}
int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
@@ -238,9 +245,7 @@ static inline void tcf_action_inc_drop_qstats(struct tc_action *a)
qstats_drop_inc(this_cpu_ptr(a->cpu_qstats));
return;
}
- spin_lock(&a->tcfa_lock);
- qstats_drop_inc(&a->tcfa_qstats);
- spin_unlock(&a->tcfa_lock);
+ atomic_inc(&a->tcfa_drops);
}
static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a)
@@ -249,9 +254,7 @@ static inline void tcf_action_inc_overlimit_qstats(struct tc_action *a)
qstats_overlimit_inc(this_cpu_ptr(a->cpu_qstats));
return;
}
- spin_lock(&a->tcfa_lock);
- qstats_overlimit_inc(&a->tcfa_qstats);
- spin_unlock(&a->tcfa_lock);
+ atomic_inc(&a->tcfa_overlimits);
}
void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets,
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 9e5e95988b9e..9e96776945e5 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -8,7 +8,8 @@
#define MIN_VALID_LIFETIME (2*3600) /* 2 hours */
-#define TEMP_VALID_LIFETIME (7*86400) /* 1 week */
+/* TEMP_VALID_LIFETIME default value as specified in RFC 8981 3.8 */
+#define TEMP_VALID_LIFETIME (2*86400) /* 2 days */
#define TEMP_PREFERRED_LIFETIME (86400) /* 24 hours */
#define REGEN_MIN_ADVANCE (2) /* 2 seconds */
#define REGEN_MAX_RETRY (3)
@@ -347,6 +348,11 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev)
return rcu_dereference_rtnl(dev->ip6_ptr);
}
+static inline struct inet6_dev *in6_dev_rcu(const struct net_device *dev)
+{
+ return rcu_dereference(dev->ip6_ptr);
+}
+
static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev)
{
return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr);
diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
index 0754c463224a..0fb4c41c9bbf 100644
--- a/include/net/af_rxrpc.h
+++ b/include/net/af_rxrpc.h
@@ -16,6 +16,7 @@ struct sock;
struct socket;
struct rxrpc_call;
struct rxrpc_peer;
+struct krb5_buffer;
enum rxrpc_abort_reason;
enum rxrpc_interruptibility {
@@ -24,23 +25,33 @@ enum rxrpc_interruptibility {
RXRPC_UNINTERRUPTIBLE, /* Call should not be interruptible at all */
};
+enum rxrpc_oob_type {
+ RXRPC_OOB_CHALLENGE, /* Security challenge for a connection */
+};
+
/*
* Debug ID counter for tracing.
*/
extern atomic_t rxrpc_debug_id;
+/*
+ * Operations table for rxrpc to call out to a kernel application (e.g. kAFS).
+ */
+struct rxrpc_kernel_ops {
+ void (*notify_new_call)(struct sock *sk, struct rxrpc_call *call,
+ unsigned long user_call_ID);
+ void (*discard_new_call)(struct rxrpc_call *call, unsigned long user_call_ID);
+ void (*user_attach_call)(struct rxrpc_call *call, unsigned long user_call_ID);
+ void (*notify_oob)(struct sock *sk, struct sk_buff *oob);
+};
+
typedef void (*rxrpc_notify_rx_t)(struct sock *, struct rxrpc_call *,
unsigned long);
typedef void (*rxrpc_notify_end_tx_t)(struct sock *, struct rxrpc_call *,
unsigned long);
-typedef void (*rxrpc_notify_new_call_t)(struct sock *, struct rxrpc_call *,
- unsigned long);
-typedef void (*rxrpc_discard_new_call_t)(struct rxrpc_call *, unsigned long);
-typedef void (*rxrpc_user_attach_call_t)(struct rxrpc_call *, unsigned long);
-void rxrpc_kernel_new_call_notification(struct socket *,
- rxrpc_notify_new_call_t,
- rxrpc_discard_new_call_t);
+void rxrpc_kernel_set_notifications(struct socket *sock,
+ const struct rxrpc_kernel_ops *app_ops);
struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
struct rxrpc_peer *peer,
struct key *key,
@@ -69,17 +80,36 @@ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer);
struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call);
const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer);
const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer);
+unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data);
+unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer);
unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *);
-int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t,
- rxrpc_user_attach_call_t, unsigned long, gfp_t,
- unsigned int);
+int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx,
+ unsigned long user_call_ID, gfp_t gfp,
+ unsigned int debug_id);
void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64);
bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *);
-u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *);
-void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *,
- unsigned long);
int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val);
int rxrpc_sock_set_security_keyring(struct sock *, struct key *);
+int rxrpc_sock_set_manage_response(struct sock *sk, bool set);
+
+enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob,
+ struct rxrpc_peer **_peer,
+ unsigned long *_peer_appdata);
+struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock,
+ enum rxrpc_oob_type *_type);
+void rxrpc_kernel_free_oob(struct sk_buff *oob);
+void rxrpc_kernel_query_challenge(struct sk_buff *challenge,
+ struct rxrpc_peer **_peer,
+ unsigned long *_peer_appdata,
+ u16 *_service_id, u8 *_security_index);
+int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code,
+ int error, enum rxrpc_abort_reason why);
+int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge);
+u32 rxgk_kernel_query_challenge(struct sk_buff *challenge);
+int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge,
+ struct krb5_buffer *appdata);
+u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call,
+ u16 *_service_id, u32 *_enctype);
#endif /* _NET_RXRPC_H */
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 63129c79b8cb..34f53dde65ce 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -2,11 +2,15 @@
#ifndef __LINUX_NET_AFUNIX_H
#define __LINUX_NET_AFUNIX_H
-#include <linux/socket.h>
-#include <linux/un.h>
+#include <linux/atomic.h>
#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/path.h>
#include <linux/refcount.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
#include <net/sock.h>
+#include <uapi/linux/un.h>
#if IS_ENABLED(CONFIG_UNIX)
struct unix_sock *unix_get_socket(struct file *filp);
@@ -17,61 +21,17 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)
}
#endif
-extern unsigned int unix_tot_inflight;
-void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
-void unix_del_edges(struct scm_fp_list *fpl);
-void unix_update_edges(struct unix_sock *receiver);
-int unix_prepare_fpl(struct scm_fp_list *fpl);
-void unix_destroy_fpl(struct scm_fp_list *fpl);
-void unix_gc(void);
-void wait_for_unix_gc(struct scm_fp_list *fpl);
-
-struct unix_vertex {
- struct list_head edges;
- struct list_head entry;
- struct list_head scc_entry;
- unsigned long out_degree;
- unsigned long index;
- unsigned long scc_index;
-};
-
-struct unix_edge {
- struct unix_sock *predecessor;
- struct unix_sock *successor;
- struct list_head vertex_entry;
- struct list_head stack_entry;
-};
-
-struct sock *unix_peer_get(struct sock *sk);
-
-#define UNIX_HASH_MOD (256 - 1)
-#define UNIX_HASH_SIZE (256 * 2)
-#define UNIX_HASH_BITS 8
-
struct unix_address {
refcount_t refcnt;
int len;
struct sockaddr_un name[];
};
-struct unix_skb_parms {
- struct pid *pid; /* Skb credentials */
- kuid_t uid;
- kgid_t gid;
- struct scm_fp_list *fp; /* Passed files */
-#ifdef CONFIG_SECURITY_NETWORK
- u32 secid; /* Security ID */
-#endif
- u32 consumed;
-} __randomize_layout;
-
struct scm_stat {
atomic_t nr_fds;
unsigned long nr_unix_fds;
};
-#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb))
-
/* The AF_UNIX socket */
struct unix_sock {
/* WARNING: sk has to be the first member */
@@ -84,8 +44,11 @@ struct unix_sock {
struct unix_vertex *vertex;
spinlock_t lock;
struct socket_wq peer_wq;
+#define peer_wait peer_wq.wait
wait_queue_entry_t peer_wake;
struct scm_stat scm_stat;
+ int inq_len;
+ bool recvmsg_inq;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
struct sk_buff *oob_skb;
#endif
@@ -97,32 +60,4 @@ struct unix_sock {
#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock)
#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock)
-#define peer_wait peer_wq.wait
-
-long unix_inq_len(struct sock *sk);
-long unix_outq_len(struct sock *sk);
-
-int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
- int flags);
-int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
- int flags);
-#ifdef CONFIG_SYSCTL
-int unix_sysctl_register(struct net *net);
-void unix_sysctl_unregister(struct net *net);
-#else
-static inline int unix_sysctl_register(struct net *net) { return 0; }
-static inline void unix_sysctl_unregister(struct net *net) {}
-#endif
-
-#ifdef CONFIG_BPF_SYSCALL
-extern struct proto unix_dgram_proto;
-extern struct proto unix_stream_proto;
-
-int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
-int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
-void __init unix_bpf_build_proto(void);
-#else
-static inline void __init unix_bpf_build_proto(void)
-{}
-#endif
#endif
diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index 9e85424c8343..4e40063adab4 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -10,6 +10,7 @@
#include <linux/kernel.h>
#include <linux/workqueue.h>
+#include <net/netns/vsock.h>
#include <net/sock.h>
#include <uapi/linux/vm_sockets.h>
@@ -124,7 +125,7 @@ struct vsock_transport {
size_t len, int flags);
int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
struct msghdr *, size_t len);
- bool (*dgram_allow)(u32 cid, u32 port);
+ bool (*dgram_allow)(struct vsock_sock *vsk, u32 cid, u32 port);
/* STREAM. */
/* TODO: stream_bind() */
@@ -136,14 +137,14 @@ struct vsock_transport {
s64 (*stream_has_space)(struct vsock_sock *);
u64 (*stream_rcvhiwat)(struct vsock_sock *);
bool (*stream_is_active)(struct vsock_sock *);
- bool (*stream_allow)(u32 cid, u32 port);
+ bool (*stream_allow)(struct vsock_sock *vsk, u32 cid, u32 port);
/* SEQ_PACKET. */
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
int flags);
int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg,
size_t len);
- bool (*seqpacket_allow)(u32 remote_cid);
+ bool (*seqpacket_allow)(struct vsock_sock *vsk, u32 remote_cid);
u32 (*seqpacket_has_data)(struct vsock_sock *vsk);
/* Notification. */
@@ -178,6 +179,15 @@ struct vsock_transport {
/* Addressing. */
u32 (*get_local_cid)(void);
+ /* Check if this transport serves a specific remote CID.
+ * For H2G transports: return true if the CID belongs to a registered
+ * guest. If not implemented, all CIDs > VMADDR_CID_HOST go to H2G.
+ * For G2H transports: return true if the transport can reach arbitrary
+ * CIDs via the hypervisor (i.e. supports the fallback overlay). VMCI
+ * does not implement this as it only serves CIDs 0 and 2.
+ */
+ bool (*has_remote_cid)(struct vsock_sock *vsk, u32 remote_cid);
+
/* Read a single skb */
int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
@@ -216,11 +226,17 @@ void vsock_remove_connected(struct vsock_sock *vsk);
struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
struct sockaddr_vm *dst);
+struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr,
+ struct net *net);
+struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst,
+ struct net *net);
void vsock_remove_sock(struct vsock_sock *vsk);
void vsock_for_each_connected_socket(struct vsock_transport *transport,
void (*fn)(struct sock *sk));
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
bool vsock_find_cid(unsigned int cid);
+void vsock_linger(struct sock *sk);
/**** TAP ****/
@@ -242,8 +258,8 @@ int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags);
-#ifdef CONFIG_BPF_SYSCALL
extern struct proto vsock_proto;
+#ifdef CONFIG_BPF_SYSCALL
int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
void __init vsock_bpf_build_proto(void);
#else
@@ -255,4 +271,62 @@ static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
{
return t->msgzerocopy_allow && t->msgzerocopy_allow();
}
+
+static inline enum vsock_net_mode vsock_net_mode(struct net *net)
+{
+ if (!net)
+ return VSOCK_NET_MODE_GLOBAL;
+
+ return READ_ONCE(net->vsock.mode);
+}
+
+static inline bool vsock_net_mode_global(struct vsock_sock *vsk)
+{
+ return vsock_net_mode(sock_net(sk_vsock(vsk))) == VSOCK_NET_MODE_GLOBAL;
+}
+
+static inline bool vsock_net_set_child_mode(struct net *net,
+ enum vsock_net_mode mode)
+{
+ int new_locked = mode + 1;
+ int old_locked = 0; /* unlocked */
+
+ if (try_cmpxchg(&net->vsock.child_ns_mode_locked,
+ &old_locked, new_locked)) {
+ WRITE_ONCE(net->vsock.child_ns_mode, mode);
+ return true;
+ }
+
+ return old_locked == new_locked;
+}
+
+static inline enum vsock_net_mode vsock_net_child_mode(struct net *net)
+{
+ return READ_ONCE(net->vsock.child_ns_mode);
+}
+
+/* Return true if two namespaces pass the mode rules. Otherwise, return false.
+ *
+ * A NULL namespace is treated as VSOCK_NET_MODE_GLOBAL.
+ *
+ * Read more about modes in the comment header of net/vmw_vsock/af_vsock.c.
+ */
+static inline bool vsock_net_check_mode(struct net *ns0, struct net *ns1)
+{
+ enum vsock_net_mode mode0, mode1;
+
+ /* Any vsocks within the same network namespace are always reachable,
+ * regardless of the mode.
+ */
+ if (net_eq(ns0, ns1))
+ return true;
+
+ mode0 = vsock_net_mode(ns0);
+ mode1 = vsock_net_mode(ns1);
+
+ /* Different namespaces are only reachable if they are both
+ * global mode.
+ */
+ return mode0 == VSOCK_NET_MODE_GLOBAL && mode0 == mode1;
+}
#endif /* __AF_VSOCK_H__ */
diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h
new file mode 100644
index 000000000000..e1a1c8aedc79
--- /dev/null
+++ b/include/net/aligned_data.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_ALIGNED_DATA_H
+#define _NET_ALIGNED_DATA_H
+
+#include <linux/atomic.h>
+#include <linux/types.h>
+
+/* Structure holding cacheline aligned fields on SMP builds.
+ * Each field or group should have an ____cacheline_aligned_in_smp
+ * attribute to ensure no accidental false sharing can happen.
+ */
+struct net_aligned_data {
+ atomic64_t net_cookie ____cacheline_aligned_in_smp;
+#if defined(CONFIG_INET)
+ atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;
+ atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
+#endif
+};
+
+extern struct net_aligned_data net_aligned_data;
+
+#endif /* _NET_ALIGNED_DATA_H */
diff --git a/include/net/atmclip.h b/include/net/atmclip.h
deleted file mode 100644
index 70e350e0db3d..000000000000
--- a/include/net/atmclip.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* net/atm/atmarp.h - RFC1577 ATM ARP */
-
-/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
-
-
-#ifndef _ATMCLIP_H
-#define _ATMCLIP_H
-
-#include <linux/netdevice.h>
-#include <linux/atm.h>
-#include <linux/atmdev.h>
-#include <linux/atmarp.h>
-#include <linux/spinlock.h>
-#include <net/neighbour.h>
-
-
-#define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back))
-
-struct sk_buff;
-
-struct clip_vcc {
- struct atm_vcc *vcc; /* VCC descriptor */
- struct atmarp_entry *entry; /* ATMARP table entry, NULL if IP addr.
- isn't known yet */
- int xoff; /* 1 if send buffer is full */
- unsigned char encap; /* 0: NULL, 1: LLC/SNAP */
- unsigned long last_use; /* last send or receive operation */
- unsigned long idle_timeout; /* keep open idle for so many jiffies*/
- void (*old_push)(struct atm_vcc *vcc,struct sk_buff *skb);
- /* keep old push fn for chaining */
- void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb);
- /* keep old pop fn for chaining */
- struct clip_vcc *next; /* next VCC */
-};
-
-
-struct atmarp_entry {
- struct clip_vcc *vccs; /* active VCCs; NULL if resolution is
- pending */
- unsigned long expires; /* entry expiration time */
- struct neighbour *neigh; /* neighbour back-pointer */
-};
-
-#define PRIV(dev) ((struct clip_priv *) netdev_priv(dev))
-
-struct clip_priv {
- int number; /* for convenience ... */
- spinlock_t xoff_lock; /* ensures that pop is atomic (SMP) */
- struct net_device *next; /* next CLIP interface */
-};
-
-#endif
diff --git a/include/net/ax25.h b/include/net/ax25.h
index 4ee141aae0a2..6b2f518facdb 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -1,487 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Declarations of AX.25 type objects.
- *
- * Alan Cox (GW4PTS) 10/11/93
- */
#ifndef _AX25_H
-#define _AX25_H
+#define _AX25_H
#include <linux/ax25.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/refcount.h>
-#include <net/neighbour.h>
-#include <net/sock.h>
-#include <linux/seq_file.h>
-#define AX25_T1CLAMPLO 1
-#define AX25_T1CLAMPHI (30 * HZ)
-
-#define AX25_BPQ_HEADER_LEN 16
-#define AX25_KISS_HEADER_LEN 1
-
-#define AX25_HEADER_LEN 17
-#define AX25_ADDR_LEN 7
-#define AX25_DIGI_HEADER_LEN (AX25_MAX_DIGIS * AX25_ADDR_LEN)
-#define AX25_MAX_HEADER_LEN (AX25_HEADER_LEN + AX25_DIGI_HEADER_LEN)
-
-/* AX.25 Protocol IDs */
-#define AX25_P_ROSE 0x01
-#define AX25_P_VJCOMP 0x06 /* Compressed TCP/IP packet */
- /* Van Jacobsen (RFC 1144) */
-#define AX25_P_VJUNCOMP 0x07 /* Uncompressed TCP/IP packet */
- /* Van Jacobsen (RFC 1144) */
-#define AX25_P_SEGMENT 0x08 /* Segmentation fragment */
-#define AX25_P_TEXNET 0xc3 /* TEXTNET datagram protocol */
-#define AX25_P_LQ 0xc4 /* Link Quality Protocol */
-#define AX25_P_ATALK 0xca /* Appletalk */
-#define AX25_P_ATALK_ARP 0xcb /* Appletalk ARP */
-#define AX25_P_IP 0xcc /* ARPA Internet Protocol */
-#define AX25_P_ARP 0xcd /* ARPA Address Resolution */
-#define AX25_P_FLEXNET 0xce /* FlexNet */
-#define AX25_P_NETROM 0xcf /* NET/ROM */
-#define AX25_P_TEXT 0xF0 /* No layer 3 protocol impl. */
-
-/* AX.25 Segment control values */
-#define AX25_SEG_REM 0x7F
-#define AX25_SEG_FIRST 0x80
-
-#define AX25_CBIT 0x80 /* Command/Response bit */
-#define AX25_EBIT 0x01 /* HDLC Address Extension bit */
-#define AX25_HBIT 0x80 /* Has been repeated bit */
-
-#define AX25_SSSID_SPARE 0x60 /* Unused bits in SSID for standard AX.25 */
-#define AX25_ESSID_SPARE 0x20 /* Unused bits in SSID for extended AX.25 */
-#define AX25_DAMA_FLAG 0x20 /* Well, it is *NOT* unused! (dl1bke 951121 */
-
-#define AX25_COND_ACK_PENDING 0x01
-#define AX25_COND_REJECT 0x02
-#define AX25_COND_PEER_RX_BUSY 0x04
-#define AX25_COND_OWN_RX_BUSY 0x08
-#define AX25_COND_DAMA_MODE 0x10
-
-#ifndef _LINUX_NETDEVICE_H
-#include <linux/netdevice.h>
-#endif
-
-/* Upper sub-layer (LAPB) definitions */
-
-/* Control field templates */
-#define AX25_I 0x00 /* Information frames */
-#define AX25_S 0x01 /* Supervisory frames */
-#define AX25_RR 0x01 /* Receiver ready */
-#define AX25_RNR 0x05 /* Receiver not ready */
-#define AX25_REJ 0x09 /* Reject */
-#define AX25_U 0x03 /* Unnumbered frames */
-#define AX25_SABM 0x2f /* Set Asynchronous Balanced Mode */
-#define AX25_SABME 0x6f /* Set Asynchronous Balanced Mode Extended */
-#define AX25_DISC 0x43 /* Disconnect */
-#define AX25_DM 0x0f /* Disconnected mode */
-#define AX25_UA 0x63 /* Unnumbered acknowledge */
-#define AX25_FRMR 0x87 /* Frame reject */
-#define AX25_UI 0x03 /* Unnumbered information */
-#define AX25_XID 0xaf /* Exchange information */
-#define AX25_TEST 0xe3 /* Test */
-
-#define AX25_PF 0x10 /* Poll/final bit for standard AX.25 */
-#define AX25_EPF 0x01 /* Poll/final bit for extended AX.25 */
-
-#define AX25_ILLEGAL 0x100 /* Impossible to be a real frame type */
-
-#define AX25_POLLOFF 0
-#define AX25_POLLON 1
-
-/* AX25 L2 C-bit */
-#define AX25_COMMAND 1
-#define AX25_RESPONSE 2
-
-/* Define Link State constants. */
-
-enum {
- AX25_STATE_0, /* Listening */
- AX25_STATE_1, /* SABM sent */
- AX25_STATE_2, /* DISC sent */
- AX25_STATE_3, /* Established */
- AX25_STATE_4 /* Recovery */
-};
-
-#define AX25_MODULUS 8 /* Standard AX.25 modulus */
-#define AX25_EMODULUS 128 /* Extended AX.25 modulus */
-
-enum {
- AX25_PROTO_STD_SIMPLEX,
- AX25_PROTO_STD_DUPLEX,
-#ifdef CONFIG_AX25_DAMA_SLAVE
- AX25_PROTO_DAMA_SLAVE,
-#ifdef CONFIG_AX25_DAMA_MASTER
- AX25_PROTO_DAMA_MASTER,
-#define AX25_PROTO_MAX AX25_PROTO_DAMA_MASTER
-#endif
-#endif
- __AX25_PROTO_MAX,
- AX25_PROTO_MAX = __AX25_PROTO_MAX -1
-};
-
-enum {
- AX25_VALUES_IPDEFMODE, /* 0=DG 1=VC */
- AX25_VALUES_AXDEFMODE, /* 0=Normal 1=Extended Seq Nos */
- AX25_VALUES_BACKOFF, /* 0=None 1=Linear 2=Exponential */
- AX25_VALUES_CONMODE, /* Allow connected modes - 0=No 1=no "PID text" 2=all PIDs */
- AX25_VALUES_WINDOW, /* Default window size for standard AX.25 */
- AX25_VALUES_EWINDOW, /* Default window size for extended AX.25 */
- AX25_VALUES_T1, /* Default T1 timeout value */
- AX25_VALUES_T2, /* Default T2 timeout value */
- AX25_VALUES_T3, /* Default T3 timeout value */
- AX25_VALUES_IDLE, /* Connected mode idle timer */
- AX25_VALUES_N2, /* Default N2 value */
- AX25_VALUES_PACLEN, /* AX.25 MTU */
- AX25_VALUES_PROTOCOL, /* Std AX.25, DAMA Slave, DAMA Master */
-#ifdef CONFIG_AX25_DAMA_SLAVE
- AX25_VALUES_DS_TIMEOUT, /* DAMA Slave timeout */
-#endif
- AX25_MAX_VALUES /* THIS MUST REMAIN THE LAST ENTRY OF THIS LIST */
-};
-
-#define AX25_DEF_IPDEFMODE 0 /* Datagram */
-#define AX25_DEF_AXDEFMODE 0 /* Normal */
-#define AX25_DEF_BACKOFF 1 /* Linear backoff */
-#define AX25_DEF_CONMODE 2 /* Connected mode allowed */
-#define AX25_DEF_WINDOW 2 /* Window=2 */
-#define AX25_DEF_EWINDOW 32 /* Module-128 Window=32 */
-#define AX25_DEF_T1 10000 /* T1=10s */
-#define AX25_DEF_T2 3000 /* T2=3s */
-#define AX25_DEF_T3 300000 /* T3=300s */
-#define AX25_DEF_N2 10 /* N2=10 */
-#define AX25_DEF_IDLE 0 /* Idle=None */
-#define AX25_DEF_PACLEN 256 /* Paclen=256 */
-#define AX25_DEF_PROTOCOL AX25_PROTO_STD_SIMPLEX /* Standard AX.25 */
-#define AX25_DEF_DS_TIMEOUT 180000 /* DAMA timeout 3 minutes */
-
-typedef struct ax25_uid_assoc {
- struct hlist_node uid_node;
- refcount_t refcount;
- kuid_t uid;
- ax25_address call;
-} ax25_uid_assoc;
-
-#define ax25_uid_for_each(__ax25, list) \
- hlist_for_each_entry(__ax25, list, uid_node)
-
-#define ax25_uid_hold(ax25) \
- refcount_inc(&((ax25)->refcount))
-
-static inline void ax25_uid_put(ax25_uid_assoc *assoc)
-{
- if (refcount_dec_and_test(&assoc->refcount)) {
- kfree(assoc);
- }
-}
-
-typedef struct {
- ax25_address calls[AX25_MAX_DIGIS];
- unsigned char repeated[AX25_MAX_DIGIS];
- unsigned char ndigi;
- signed char lastrepeat;
-} ax25_digi;
-
-typedef struct ax25_route {
- struct ax25_route *next;
- ax25_address callsign;
- struct net_device *dev;
- ax25_digi *digipeat;
- char ip_mode;
-} ax25_route;
-
-void __ax25_put_route(ax25_route *ax25_rt);
-
-extern rwlock_t ax25_route_lock;
-
-static inline void ax25_route_lock_use(void)
-{
- read_lock(&ax25_route_lock);
-}
-
-static inline void ax25_route_lock_unuse(void)
-{
- read_unlock(&ax25_route_lock);
-}
-
-typedef struct {
- char slave; /* slave_mode? */
- struct timer_list slave_timer; /* timeout timer */
- unsigned short slave_timeout; /* when? */
-} ax25_dama_info;
-
-struct ctl_table;
-
-typedef struct ax25_dev {
- struct list_head list;
-
- struct net_device *dev;
- netdevice_tracker dev_tracker;
-
- struct net_device *forward;
- struct ctl_table_header *sysheader;
- int values[AX25_MAX_VALUES];
-#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER)
- ax25_dama_info dama;
-#endif
- refcount_t refcount;
- bool device_up;
- struct rcu_head rcu;
-} ax25_dev;
-
-typedef struct ax25_cb {
- struct hlist_node ax25_node;
- ax25_address source_addr, dest_addr;
- ax25_digi *digipeat;
- ax25_dev *ax25_dev;
- netdevice_tracker dev_tracker;
- unsigned char iamdigi;
- unsigned char state, modulus, pidincl;
- unsigned short vs, vr, va;
- unsigned char condition, backoff;
- unsigned char n2, n2count;
- struct timer_list t1timer, t2timer, t3timer, idletimer;
- unsigned long t1, t2, t3, idle, rtt;
- unsigned short paclen, fragno, fraglen;
- struct sk_buff_head write_queue;
- struct sk_buff_head reseq_queue;
- struct sk_buff_head ack_queue;
- struct sk_buff_head frag_queue;
- unsigned char window;
- struct timer_list timer, dtimer;
- struct sock *sk; /* Backlink to socket */
- refcount_t refcount;
-} ax25_cb;
-
-struct ax25_sock {
- struct sock sk;
- struct ax25_cb *cb;
-};
-
-#define ax25_sk(ptr) container_of_const(ptr, struct ax25_sock, sk)
-
-static inline struct ax25_cb *sk_to_ax25(const struct sock *sk)
-{
- return ax25_sk(sk)->cb;
-}
-
-#define ax25_for_each(__ax25, list) \
- hlist_for_each_entry(__ax25, list, ax25_node)
-
-#define ax25_cb_hold(__ax25) \
- refcount_inc(&((__ax25)->refcount))
-
-static __inline__ void ax25_cb_put(ax25_cb *ax25)
-{
- if (refcount_dec_and_test(&ax25->refcount)) {
- kfree(ax25->digipeat);
- kfree(ax25);
- }
-}
-
-static inline void ax25_dev_hold(ax25_dev *ax25_dev)
-{
- refcount_inc(&ax25_dev->refcount);
-}
-
-static inline void ax25_dev_put(ax25_dev *ax25_dev)
-{
- if (refcount_dec_and_test(&ax25_dev->refcount))
- kfree_rcu(ax25_dev, rcu);
-}
-static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev)
-{
- skb->dev = dev;
- skb_reset_mac_header(skb);
- skb->pkt_type = PACKET_HOST;
- return htons(ETH_P_AX25);
-}
-
-/* af_ax25.c */
-extern struct hlist_head ax25_list;
-extern spinlock_t ax25_list_lock;
-void ax25_cb_add(ax25_cb *);
-struct sock *ax25_find_listener(ax25_address *, int, struct net_device *, int);
-struct sock *ax25_get_socket(ax25_address *, ax25_address *, int);
-ax25_cb *ax25_find_cb(const ax25_address *, ax25_address *, ax25_digi *,
- struct net_device *);
-void ax25_send_to_raw(ax25_address *, struct sk_buff *, int);
-void ax25_destroy_socket(ax25_cb *);
-ax25_cb * __must_check ax25_create_cb(void);
-void ax25_fillin_cb(ax25_cb *, ax25_dev *);
-struct sock *ax25_make_new(struct sock *, struct ax25_dev *);
-
-/* ax25_addr.c */
-extern const ax25_address ax25_bcast;
-extern const ax25_address ax25_defaddr;
-extern const ax25_address null_ax25_address;
-char *ax2asc(char *buf, const ax25_address *);
-void asc2ax(ax25_address *addr, const char *callsign);
-int ax25cmp(const ax25_address *, const ax25_address *);
-int ax25digicmp(const ax25_digi *, const ax25_digi *);
-const unsigned char *ax25_addr_parse(const unsigned char *, int,
- ax25_address *, ax25_address *, ax25_digi *, int *, int *);
-int ax25_addr_build(unsigned char *, const ax25_address *,
- const ax25_address *, const ax25_digi *, int, int);
-int ax25_addr_size(const ax25_digi *);
-void ax25_digi_invert(const ax25_digi *, ax25_digi *);
-
-/* ax25_dev.c */
-extern spinlock_t ax25_dev_lock;
-
-#if IS_ENABLED(CONFIG_AX25)
-static inline ax25_dev *ax25_dev_ax25dev(const struct net_device *dev)
-{
- return rcu_dereference_rtnl(dev->ax25_ptr);
-}
-#endif
-
-ax25_dev *ax25_addr_ax25dev(ax25_address *);
-void ax25_dev_device_up(struct net_device *);
-void ax25_dev_device_down(struct net_device *);
-int ax25_fwd_ioctl(unsigned int, struct ax25_fwd_struct *);
-struct net_device *ax25_fwd_dev(struct net_device *);
-void ax25_dev_free(void);
-
-/* ax25_ds_in.c */
-int ax25_ds_frame_in(ax25_cb *, struct sk_buff *, int);
-
-/* ax25_ds_subr.c */
-void ax25_ds_nr_error_recovery(ax25_cb *);
-void ax25_ds_enquiry_response(ax25_cb *);
-void ax25_ds_establish_data_link(ax25_cb *);
-void ax25_dev_dama_off(ax25_dev *);
-void ax25_dama_on(ax25_cb *);
-void ax25_dama_off(ax25_cb *);
-
-/* ax25_ds_timer.c */
-void ax25_ds_setup_timer(ax25_dev *);
-void ax25_ds_set_timer(ax25_dev *);
-void ax25_ds_del_timer(ax25_dev *);
-void ax25_ds_timer(ax25_cb *);
-void ax25_ds_t1_timeout(ax25_cb *);
-void ax25_ds_heartbeat_expiry(ax25_cb *);
-void ax25_ds_t3timer_expiry(ax25_cb *);
-void ax25_ds_idletimer_expiry(ax25_cb *);
-
-/* ax25_iface.c */
-
-struct ax25_protocol {
- struct ax25_protocol *next;
- unsigned int pid;
- int (*func)(struct sk_buff *, ax25_cb *);
-};
-
-void ax25_register_pid(struct ax25_protocol *ap);
-void ax25_protocol_release(unsigned int);
-
-struct ax25_linkfail {
- struct hlist_node lf_node;
- void (*func)(ax25_cb *, int);
-};
-
-void ax25_linkfail_register(struct ax25_linkfail *lf);
-void ax25_linkfail_release(struct ax25_linkfail *lf);
-int __must_check ax25_listen_register(const ax25_address *,
- struct net_device *);
-void ax25_listen_release(const ax25_address *, struct net_device *);
-int(*ax25_protocol_function(unsigned int))(struct sk_buff *, ax25_cb *);
-int ax25_listen_mine(const ax25_address *, struct net_device *);
-void ax25_link_failed(ax25_cb *, int);
-int ax25_protocol_is_registered(unsigned int);
-
-/* ax25_in.c */
-int ax25_rx_iframe(ax25_cb *, struct sk_buff *);
-int ax25_kiss_rcv(struct sk_buff *, struct net_device *, struct packet_type *,
- struct net_device *);
-
-/* ax25_ip.c */
-netdev_tx_t ax25_ip_xmit(struct sk_buff *skb);
-extern const struct header_ops ax25_header_ops;
-
-/* ax25_out.c */
-ax25_cb *ax25_send_frame(struct sk_buff *, int, const ax25_address *,
- ax25_address *, ax25_digi *, struct net_device *);
-void ax25_output(ax25_cb *, int, struct sk_buff *);
-void ax25_kick(ax25_cb *);
-void ax25_transmit_buffer(ax25_cb *, struct sk_buff *, int);
-void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev);
-int ax25_check_iframes_acked(ax25_cb *, unsigned short);
-
-/* ax25_route.c */
-void ax25_rt_device_down(struct net_device *);
-int ax25_rt_ioctl(unsigned int, void __user *);
-extern const struct seq_operations ax25_rt_seqops;
-ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev);
-int ax25_rt_autobind(ax25_cb *, ax25_address *);
-struct sk_buff *ax25_rt_build_path(struct sk_buff *, ax25_address *,
- ax25_address *, ax25_digi *);
-void ax25_rt_free(void);
-
-/* ax25_std_in.c */
-int ax25_std_frame_in(ax25_cb *, struct sk_buff *, int);
-
-/* ax25_std_subr.c */
-void ax25_std_nr_error_recovery(ax25_cb *);
-void ax25_std_establish_data_link(ax25_cb *);
-void ax25_std_transmit_enquiry(ax25_cb *);
-void ax25_std_enquiry_response(ax25_cb *);
-void ax25_std_timeout_response(ax25_cb *);
-
-/* ax25_std_timer.c */
-void ax25_std_heartbeat_expiry(ax25_cb *);
-void ax25_std_t1timer_expiry(ax25_cb *);
-void ax25_std_t2timer_expiry(ax25_cb *);
-void ax25_std_t3timer_expiry(ax25_cb *);
-void ax25_std_idletimer_expiry(ax25_cb *);
-
-/* ax25_subr.c */
-void ax25_clear_queues(ax25_cb *);
-void ax25_frames_acked(ax25_cb *, unsigned short);
-void ax25_requeue_frames(ax25_cb *);
-int ax25_validate_nr(ax25_cb *, unsigned short);
-int ax25_decode(ax25_cb *, struct sk_buff *, int *, int *, int *);
-void ax25_send_control(ax25_cb *, int, int, int);
-void ax25_return_dm(struct net_device *, ax25_address *, ax25_address *,
- ax25_digi *);
-void ax25_calculate_t1(ax25_cb *);
-void ax25_calculate_rtt(ax25_cb *);
-void ax25_disconnect(ax25_cb *, int);
-
-/* ax25_timer.c */
-void ax25_setup_timers(ax25_cb *);
-void ax25_start_heartbeat(ax25_cb *);
-void ax25_start_t1timer(ax25_cb *);
-void ax25_start_t2timer(ax25_cb *);
-void ax25_start_t3timer(ax25_cb *);
-void ax25_start_idletimer(ax25_cb *);
-void ax25_stop_heartbeat(ax25_cb *);
-void ax25_stop_t1timer(ax25_cb *);
-void ax25_stop_t2timer(ax25_cb *);
-void ax25_stop_t3timer(ax25_cb *);
-void ax25_stop_idletimer(ax25_cb *);
-int ax25_t1timer_running(ax25_cb *);
-unsigned long ax25_display_timer(struct timer_list *);
-
-/* ax25_uid.c */
-extern int ax25_uid_policy;
-ax25_uid_assoc *ax25_findbyuid(kuid_t);
-int __must_check ax25_uid_ioctl(int, struct sockaddr_ax25 *);
-extern const struct seq_operations ax25_uid_seqops;
-void ax25_uid_free(void);
-
-/* sysctl_net_ax25.c */
-#ifdef CONFIG_SYSCTL
-int ax25_register_dev_sysctl(ax25_dev *ax25_dev);
-void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev);
-#else
-static inline int ax25_register_dev_sysctl(ax25_dev *ax25_dev) { return 0; }
-static inline void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev) {}
-#endif /* CONFIG_SYSCTL */
+#define AX25_ADDR_LEN 7
+#define AX25_P_IP 0xCC
#endif
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 435250c72d56..3faea66b1979 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -29,6 +29,7 @@
#include <linux/poll.h>
#include <net/sock.h>
#include <linux/seq_file.h>
+#include <linux/ethtool.h>
#define BT_SUBSYS_VERSION 2
#define BT_SUBSYS_REVISION 22
@@ -129,21 +130,30 @@ struct bt_voice {
#define BT_RCVMTU 13
#define BT_PHY 14
-#define BT_PHY_BR_1M_1SLOT 0x00000001
-#define BT_PHY_BR_1M_3SLOT 0x00000002
-#define BT_PHY_BR_1M_5SLOT 0x00000004
-#define BT_PHY_EDR_2M_1SLOT 0x00000008
-#define BT_PHY_EDR_2M_3SLOT 0x00000010
-#define BT_PHY_EDR_2M_5SLOT 0x00000020
-#define BT_PHY_EDR_3M_1SLOT 0x00000040
-#define BT_PHY_EDR_3M_3SLOT 0x00000080
-#define BT_PHY_EDR_3M_5SLOT 0x00000100
-#define BT_PHY_LE_1M_TX 0x00000200
-#define BT_PHY_LE_1M_RX 0x00000400
-#define BT_PHY_LE_2M_TX 0x00000800
-#define BT_PHY_LE_2M_RX 0x00001000
-#define BT_PHY_LE_CODED_TX 0x00002000
-#define BT_PHY_LE_CODED_RX 0x00004000
+#define BT_PHY_BR_1M_1SLOT BIT(0)
+#define BT_PHY_BR_1M_3SLOT BIT(1)
+#define BT_PHY_BR_1M_5SLOT BIT(2)
+#define BT_PHY_EDR_2M_1SLOT BIT(3)
+#define BT_PHY_EDR_2M_3SLOT BIT(4)
+#define BT_PHY_EDR_2M_5SLOT BIT(5)
+#define BT_PHY_EDR_3M_1SLOT BIT(6)
+#define BT_PHY_EDR_3M_3SLOT BIT(7)
+#define BT_PHY_EDR_3M_5SLOT BIT(8)
+#define BT_PHY_LE_1M_TX BIT(9)
+#define BT_PHY_LE_1M_RX BIT(10)
+#define BT_PHY_LE_2M_TX BIT(11)
+#define BT_PHY_LE_2M_RX BIT(12)
+#define BT_PHY_LE_CODED_TX BIT(13)
+#define BT_PHY_LE_CODED_RX BIT(14)
+
+#define BT_PHY_BREDR_MASK (BT_PHY_BR_1M_1SLOT | BT_PHY_BR_1M_3SLOT | \
+ BT_PHY_BR_1M_5SLOT | BT_PHY_EDR_2M_1SLOT | \
+ BT_PHY_EDR_2M_3SLOT | BT_PHY_EDR_2M_5SLOT | \
+ BT_PHY_EDR_3M_1SLOT | BT_PHY_EDR_3M_3SLOT | \
+ BT_PHY_EDR_3M_5SLOT)
+#define BT_PHY_LE_MASK (BT_PHY_LE_1M_TX | BT_PHY_LE_1M_RX | \
+ BT_PHY_LE_2M_TX | BT_PHY_LE_2M_RX | \
+ BT_PHY_LE_CODED_TX | BT_PHY_LE_CODED_RX)
#define BT_MODE 15
@@ -156,6 +166,7 @@ struct bt_voice {
#define BT_PKT_STATUS 16
#define BT_SCM_PKT_STATUS 0x03
+#define BT_SCM_ERROR 0x04
#define BT_ISO_QOS 17
@@ -171,7 +182,7 @@ struct bt_iso_io_qos {
__u32 interval;
__u16 latency;
__u16 sdu;
- __u8 phy;
+ __u8 phys;
__u8 rtn;
};
@@ -210,9 +221,9 @@ struct bt_iso_qos {
};
};
-#define BT_ISO_PHY_1M 0x01
-#define BT_ISO_PHY_2M 0x02
-#define BT_ISO_PHY_CODED 0x04
+#define BT_ISO_PHY_1M BIT(0)
+#define BT_ISO_PHY_2M BIT(1)
+#define BT_ISO_PHY_CODED BIT(2)
#define BT_ISO_PHY_ANY (BT_ISO_PHY_1M | BT_ISO_PHY_2M | \
BT_ISO_PHY_CODED)
@@ -242,6 +253,12 @@ struct bt_codecs {
#define BT_ISO_BASE 20
+/* Socket option value 21 reserved */
+
+#define BT_PKT_SEQNUM 22
+
+#define BT_SCM_PKT_SEQNUM 0x05
+
__printf(1, 2)
void bt_info(const char *fmt, ...);
__printf(1, 2)
@@ -264,7 +281,8 @@ void bt_err_ratelimited(const char *fmt, ...);
#define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__)
#if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG)
-#define BT_DBG(fmt, ...) bt_dbg(fmt "\n", ##__VA_ARGS__)
+#define BT_DBG(fmt, ...) \
+ bt_dbg("%s:%d: " fmt "\n", __func__, __LINE__, ##__VA_ARGS__)
#else
#define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__)
#endif
@@ -380,6 +398,7 @@ void baswap(bdaddr_t *dst, const bdaddr_t *src);
struct bt_sock {
struct sock sk;
struct list_head accept_q;
+ spinlock_t accept_q_lock; /* protects accept_q */
struct sock *parent;
unsigned long flags;
void (*skb_msg_name)(struct sk_buff *, void *, int *);
@@ -389,7 +408,8 @@ struct bt_sock {
enum {
BT_SK_DEFER_SETUP,
BT_SK_SUSPEND,
- BT_SK_PKT_STATUS
+ BT_SK_PKT_STATUS,
+ BT_SK_PKT_SEQNUM,
};
struct bt_sock_list {
@@ -447,6 +467,9 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
hci_req_complete_t *req_complete,
hci_req_complete_skb_t *req_complete_skb);
+int hci_ethtool_ts_info(unsigned int index, int sk_proto,
+ struct kernel_ethtool_ts_info *ts_info);
+
#define HCI_REQ_START BIT(0)
#define HCI_REQ_SKB BIT(1)
@@ -470,6 +493,7 @@ struct bt_skb_cb {
u8 pkt_type;
u8 force_active;
u16 expect;
+ u16 pkt_seqnum;
u8 incoming:1;
u8 pkt_status:2;
union {
@@ -483,6 +507,7 @@ struct bt_skb_cb {
#define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type
#define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status
+#define hci_skb_pkt_seqnum(skb) bt_cb((skb))->pkt_seqnum
#define hci_skb_expect(skb) bt_cb((skb))->expect
#define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode
#define hci_skb_event(skb) bt_cb((skb))->hci.req_event
@@ -633,7 +658,7 @@ static inline void sco_exit(void)
#if IS_ENABLED(CONFIG_BT_LE)
int iso_init(void);
int iso_exit(void);
-bool iso_enabled(void);
+bool iso_inited(void);
#else
static inline int iso_init(void)
{
@@ -645,7 +670,7 @@ static inline int iso_exit(void)
return 0;
}
-static inline bool iso_enabled(void)
+static inline bool iso_inited(void)
{
return false;
}
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 3ec915738112..572b1c620c5d 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -208,6 +208,13 @@ enum {
*/
HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED,
+ /* When this quirk is set consider Sync Flow Control as supported by
+ * the driver.
+ *
+ * This quirk must be set before hci_register_dev is called.
+ */
+ HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED,
+
/* When this quirk is set, the LE states reported through the
* HCI_LE_READ_SUPPORTED_STATES are invalid/broken.
*
@@ -354,6 +361,24 @@ enum {
* during the hdev->setup vendor callback.
*/
HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY,
+
+ /* When this quirk is set, the HCI_OP_READ_VOICE_SETTING command is
+ * skipped. This is required for a subset of the CSR controller clones
+ * which erroneously claim to support it.
+ *
+ * This quirk must be set before hci_register_dev is called.
+ */
+ HCI_QUIRK_BROKEN_READ_VOICE_SETTING,
+
+ /* When this quirk is set, the HCI_OP_READ_PAGE_SCAN_TYPE command is
+ * skipped. This is required for a subset of the CSR controller clones
+ * which erroneously claim to support it.
+ *
+ * This quirk must be set before hci_register_dev is called.
+ */
+ HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE,
+
+ __HCI_NUM_QUIRKS,
};
/* HCI device flags */
@@ -409,6 +434,7 @@ enum {
HCI_USER_CHANNEL,
HCI_EXT_CONFIGURED,
HCI_LE_ADV,
+ HCI_LE_ADV_0,
HCI_LE_PER_ADV,
HCI_LE_SCAN,
HCI_SSP_ENABLED,
@@ -432,6 +458,7 @@ enum {
HCI_WIDEBAND_SPEECH_ENABLED,
HCI_EVENT_FILTER_CONFIGURED,
HCI_PA_SYNC,
+ HCI_SCO_FLOWCTL,
HCI_DUT_MODE,
HCI_VENDOR_DIAG,
@@ -462,6 +489,7 @@ enum {
#define HCI_AUTO_OFF_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */
#define HCI_ACL_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */
#define HCI_LE_CONN_TIMEOUT msecs_to_jiffies(20000) /* 20 seconds */
+#define HCI_ISO_TX_TIMEOUT usecs_to_jiffies(0x7fffff) /* 8388607 usecs */
/* HCI data types */
#define HCI_COMMAND_PKT 0x01
@@ -470,6 +498,7 @@ enum {
#define HCI_EVENT_PKT 0x04
#define HCI_ISODATA_PKT 0x05
#define HCI_DIAG_PKT 0xf0
+#define HCI_DRV_PKT 0xf1
#define HCI_VENDOR_PKT 0xff
/* HCI packet types */
@@ -533,7 +562,9 @@ enum {
#define ESCO_LINK 0x02
/* Low Energy links do not have defined link type. Use invented one */
#define LE_LINK 0x80
-#define ISO_LINK 0x82
+#define CIS_LINK 0x82
+#define BIS_LINK 0x83
+#define PA_LINK 0x84
#define INVALID_LINK 0xff
/* LMP features */
@@ -616,10 +647,15 @@ enum {
#define HCI_LE_EXT_ADV 0x10
#define HCI_LE_PERIODIC_ADV 0x20
#define HCI_LE_CHAN_SEL_ALG2 0x40
+#define HCI_LE_PAST_SENDER 0x01
+#define HCI_LE_PAST_RECEIVER 0x02
#define HCI_LE_CIS_CENTRAL 0x10
#define HCI_LE_CIS_PERIPHERAL 0x20
#define HCI_LE_ISO_BROADCASTER 0x40
#define HCI_LE_ISO_SYNC_RECEIVER 0x80
+#define HCI_LE_LL_EXT_FEATURE 0x80
+#define HCI_LE_CS 0x40
+#define HCI_LE_CS_HOST 0x80
/* Connection modes */
#define HCI_CM_ACTIVE 0x0000
@@ -855,6 +891,11 @@ struct hci_cp_remote_name_req_cancel {
bdaddr_t bdaddr;
} __packed;
+struct hci_rp_remote_name_req_cancel {
+ __u8 status;
+ bdaddr_t bdaddr;
+} __packed;
+
#define HCI_OP_READ_REMOTE_FEATURES 0x041b
struct hci_cp_read_remote_features {
__le16 handle;
@@ -1427,8 +1468,12 @@ struct hci_rp_read_data_block_size {
} __packed;
#define HCI_OP_READ_LOCAL_CODECS 0x100b
-struct hci_std_codecs {
+struct hci_std_codecs_hdr {
__u8 num;
+} __packed;
+
+struct hci_std_codecs {
+ struct hci_std_codecs_hdr;
__u8 codec[];
} __packed;
@@ -1446,7 +1491,7 @@ struct hci_vnd_codecs {
struct hci_rp_read_local_supported_codecs {
__u8 status;
- struct hci_std_codecs std_codecs;
+ struct hci_std_codecs_hdr std_codecs;
struct hci_vnd_codecs vnd_codecs;
} __packed;
@@ -1463,8 +1508,12 @@ struct hci_std_codec_v2 {
__u8 transport;
} __packed;
-struct hci_std_codecs_v2 {
+struct hci_std_codecs_v2_hdr {
__u8 num;
+} __packed;
+
+struct hci_std_codecs_v2 {
+ struct hci_std_codecs_v2_hdr;
struct hci_std_codec_v2 codec[];
} __packed;
@@ -1481,7 +1530,7 @@ struct hci_vnd_codecs_v2 {
struct hci_rp_read_local_supported_codecs_v2 {
__u8 status;
- struct hci_std_codecs_v2 std_codecs;
+ struct hci_std_codecs_v2_hdr std_codecs;
struct hci_vnd_codecs_v2 vendor_codecs;
} __packed;
@@ -1528,6 +1577,11 @@ struct hci_rp_read_tx_power {
__s8 tx_power;
} __packed;
+#define HCI_OP_WRITE_SYNC_FLOWCTL 0x0c2f
+struct hci_cp_write_sync_flowctl {
+ __u8 enable;
+} __packed;
+
#define HCI_OP_READ_PAGE_SCAN_TYPE 0x0c46
struct hci_rp_read_page_scan_type {
__u8 status;
@@ -1839,6 +1893,15 @@ struct hci_cp_le_set_default_phy {
#define HCI_LE_SET_PHY_2M 0x02
#define HCI_LE_SET_PHY_CODED 0x04
+#define HCI_OP_LE_SET_PHY 0x2032
+struct hci_cp_le_set_phy {
+ __le16 handle;
+ __u8 all_phys;
+ __u8 tx_phys;
+ __u8 rx_phys;
+ __le16 phy_opts;
+} __packed;
+
#define HCI_OP_LE_SET_EXT_SCAN_PARAMS 0x2041
struct hci_cp_le_set_ext_scan_params {
__u8 own_addr_type;
@@ -1897,6 +1960,8 @@ struct hci_cp_le_pa_create_sync {
__u8 sync_cte_type;
} __packed;
+#define HCI_OP_LE_PA_CREATE_SYNC_CANCEL 0x2045
+
#define HCI_OP_LE_PA_TERM_SYNC 0x2046
struct hci_cp_le_pa_term_sync {
__le16 handle;
@@ -2025,6 +2090,44 @@ struct hci_cp_le_set_privacy_mode {
__u8 mode;
} __packed;
+#define HCI_OP_LE_PAST 0x205a
+struct hci_cp_le_past {
+ __le16 handle;
+ __le16 service_data;
+ __le16 sync_handle;
+} __packed;
+
+struct hci_rp_le_past {
+ __u8 status;
+ __le16 handle;
+} __packed;
+
+#define HCI_OP_LE_PAST_SET_INFO 0x205b
+struct hci_cp_le_past_set_info {
+ __le16 handle;
+ __le16 service_data;
+ __u8 adv_handle;
+} __packed;
+
+struct hci_rp_le_past_set_info {
+ __u8 status;
+ __le16 handle;
+} __packed;
+
+#define HCI_OP_LE_PAST_PARAMS 0x205c
+struct hci_cp_le_past_params {
+ __le16 handle;
+ __u8 mode;
+ __le16 skip;
+ __le16 sync_timeout;
+ __u8 cte_type;
+} __packed;
+
+struct hci_rp_le_past_params {
+ __u8 status;
+ __le16 handle;
+} __packed;
+
#define HCI_OP_LE_READ_BUFFER_SIZE_V2 0x2060
struct hci_rp_le_read_buffer_size_v2 {
__u8 status;
@@ -2052,8 +2155,8 @@ struct hci_cis_params {
__u8 cis_id;
__le16 c_sdu;
__le16 p_sdu;
- __u8 c_phy;
- __u8 p_phy;
+ __u8 c_phys;
+ __u8 p_phys;
__u8 c_rtn;
__u8 p_rtn;
} __packed;
@@ -2172,6 +2275,217 @@ struct hci_cp_le_set_host_feature {
__u8 bit_value;
} __packed;
+#define HCI_OP_LE_READ_ALL_LOCAL_FEATURES 0x2087
+struct hci_rp_le_read_all_local_features {
+ __u8 status;
+ __u8 page;
+ __u8 features[248];
+} __packed;
+
+#define HCI_OP_LE_READ_ALL_REMOTE_FEATURES 0x2088
+struct hci_cp_le_read_all_remote_features {
+ __le16 handle;
+ __u8 pages;
+} __packed;
+
+/* Channel Sounding Commands */
+#define HCI_OP_LE_CS_RD_LOCAL_SUPP_CAP 0x2089
+struct hci_rp_le_cs_rd_local_supp_cap {
+ __u8 status;
+ __u8 num_config_supported;
+ __le16 max_consecutive_procedures_supported;
+ __u8 num_antennas_supported;
+ __u8 max_antenna_paths_supported;
+ __u8 roles_supported;
+ __u8 modes_supported;
+ __u8 rtt_capability;
+ __u8 rtt_aa_only_n;
+ __u8 rtt_sounding_n;
+ __u8 rtt_random_payload_n;
+ __le16 nadm_sounding_capability;
+ __le16 nadm_random_capability;
+ __u8 cs_sync_phys_supported;
+ __le16 subfeatures_supported;
+ __le16 t_ip1_times_supported;
+ __le16 t_ip2_times_supported;
+ __le16 t_fcs_times_supported;
+ __le16 t_pm_times_supported;
+ __u8 t_sw_time_supported;
+ __u8 tx_snr_capability;
+} __packed;
+
+#define HCI_OP_LE_CS_RD_RMT_SUPP_CAP 0x208A
+struct hci_cp_le_cs_rd_local_supp_cap {
+ __le16 handle;
+} __packed;
+
+#define HCI_OP_LE_CS_WR_CACHED_RMT_SUPP_CAP 0x208B
+struct hci_cp_le_cs_wr_cached_rmt_supp_cap {
+ __le16 handle;
+ __u8 num_config_supported;
+ __le16 max_consecutive_procedures_supported;
+ __u8 num_antennas_supported;
+ __u8 max_antenna_paths_supported;
+ __u8 roles_supported;
+ __u8 modes_supported;
+ __u8 rtt_capability;
+ __u8 rtt_aa_only_n;
+ __u8 rtt_sounding_n;
+ __u8 rtt_random_payload_n;
+ __le16 nadm_sounding_capability;
+ __le16 nadm_random_capability;
+ __u8 cs_sync_phys_supported;
+ __le16 subfeatures_supported;
+ __le16 t_ip1_times_supported;
+ __le16 t_ip2_times_supported;
+ __le16 t_fcs_times_supported;
+ __le16 t_pm_times_supported;
+ __u8 t_sw_time_supported;
+ __u8 tx_snr_capability;
+} __packed;
+
+struct hci_rp_le_cs_wr_cached_rmt_supp_cap {
+ __u8 status;
+ __le16 handle;
+} __packed;
+
+#define HCI_OP_LE_CS_SEC_ENABLE 0x208C
+struct hci_cp_le_cs_sec_enable {
+ __le16 handle;
+} __packed;
+
+#define HCI_OP_LE_CS_SET_DEFAULT_SETTINGS 0x208D
+struct hci_cp_le_cs_set_default_settings {
+ __le16 handle;
+ __u8 role_enable;
+ __u8 cs_sync_ant_sel;
+ __s8 max_tx_power;
+} __packed;
+
+struct hci_rp_le_cs_set_default_settings {
+ __u8 status;
+ __le16 handle;
+} __packed;
+
+#define HCI_OP_LE_CS_RD_RMT_FAE_TABLE 0x208E
+struct hci_cp_le_cs_rd_rmt_fae_table {
+ __le16 handle;
+} __packed;
+
+#define HCI_OP_LE_CS_WR_CACHED_RMT_FAE_TABLE 0x208F
+struct hci_cp_le_cs_wr_rmt_cached_fae_table {
+ __le16 handle;
+ __u8 remote_fae_table[72];
+} __packed;
+
+struct hci_rp_le_cs_wr_rmt_cached_fae_table {
+ __u8 status;
+ __le16 handle;
+} __packed;
+
+#define HCI_OP_LE_CS_CREATE_CONFIG 0x2090
+struct hci_cp_le_cs_create_config {
+ __le16 handle;
+ __u8 config_id;
+ __u8 create_context;
+ __u8 main_mode_type;
+ __u8 sub_mode_type;
+ __u8 min_main_mode_steps;
+ __u8 max_main_mode_steps;
+ __u8 main_mode_repetition;
+ __u8 mode_0_steps;
+ __u8 role;
+ __u8 rtt_type;
+ __u8 cs_sync_phy;
+ __u8 channel_map[10];
+ __u8 channel_map_repetition;
+ __u8 channel_selection_type;
+ __u8 ch3c_shape;
+ __u8 ch3c_jump;
+ __u8 reserved;
+} __packed;
+
+#define HCI_OP_LE_CS_REMOVE_CONFIG 0x2091
+struct hci_cp_le_cs_remove_config {
+ __le16 handle;
+ __u8 config_id;
+} __packed;
+
+#define HCI_OP_LE_CS_SET_CH_CLASSIFICATION 0x2092
+struct hci_cp_le_cs_set_ch_classification {
+ __u8 ch_classification[10];
+} __packed;
+
+struct hci_rp_le_cs_set_ch_classification {
+ __u8 status;
+} __packed;
+
+#define HCI_OP_LE_CS_SET_PROC_PARAM 0x2093
+struct hci_cp_le_cs_set_proc_param {
+ __le16 handle;
+ __u8 config_id;
+ __le16 max_procedure_len;
+ __le16 min_procedure_interval;
+ __le16 max_procedure_interval;
+ __le16 max_procedure_count;
+ __u8 min_subevent_len[3];
+ __u8 max_subevent_len[3];
+ __u8 tone_antenna_config_selection;
+ __u8 phy;
+ __u8 tx_power_delta;
+ __u8 preferred_peer_antenna;
+ __u8 snr_control_initiator;
+ __u8 snr_control_reflector;
+} __packed;
+
+struct hci_rp_le_cs_set_proc_param {
+ __u8 status;
+ __le16 handle;
+} __packed;
+
+#define HCI_OP_LE_CS_SET_PROC_ENABLE 0x2094
+struct hci_cp_le_cs_set_proc_enable {
+ __le16 handle;
+ __u8 config_id;
+ __u8 enable;
+} __packed;
+
+#define HCI_OP_LE_CS_TEST 0x2095
+struct hci_cp_le_cs_test {
+ __u8 main_mode_type;
+ __u8 sub_mode_type;
+ __u8 main_mode_repetition;
+ __u8 mode_0_steps;
+ __u8 role;
+ __u8 rtt_type;
+ __u8 cs_sync_phy;
+ __u8 cs_sync_antenna_selection;
+ __u8 subevent_len[3];
+ __le16 subevent_interval;
+ __u8 max_num_subevents;
+ __u8 transmit_power_level;
+ __u8 t_ip1_time;
+ __u8 t_ip2_time;
+ __u8 t_fcs_time;
+ __u8 t_pm_time;
+ __u8 t_sw_time;
+ __u8 tone_antenna_config_selection;
+ __u8 reserved;
+ __u8 snr_control_initiator;
+ __u8 snr_control_reflector;
+ __le16 drbg_nonce;
+ __u8 channel_map_repetition;
+ __le16 override_config;
+ __u8 override_parameters_length;
+ __u8 override_parameters_data[];
+} __packed;
+
+struct hci_rp_le_cs_test {
+ __u8 status;
+} __packed;
+
+#define HCI_OP_LE_CS_TEST_END 0x2096
+
/* ---- HCI Events ---- */
struct hci_ev_status {
__u8 status;
@@ -2594,6 +2908,7 @@ struct hci_ev_le_conn_complete {
#define LE_EXT_ADV_DIRECT_IND 0x0004
#define LE_EXT_ADV_SCAN_RSP 0x0008
#define LE_EXT_ADV_LEGACY_PDU 0x0010
+#define LE_EXT_ADV_DATA_STATUS_MASK 0x0060
#define LE_EXT_ADV_EVT_TYPE_MASK 0x007f
#define ADDR_LE_DEV_PUBLIC 0x00
@@ -2739,6 +3054,11 @@ struct hci_ev_le_per_adv_report {
__u8 data[];
} __packed;
+#define HCI_EV_LE_PA_SYNC_LOST 0x10
+struct hci_ev_le_pa_sync_lost {
+ __le16 handle;
+} __packed;
+
#define LE_PA_DATA_COMPLETE 0x00
#define LE_PA_DATA_MORE_TO_COME 0x01
#define LE_PA_DATA_TRUNCATED 0x02
@@ -2751,6 +3071,20 @@ struct hci_evt_le_ext_adv_set_term {
__u8 num_evts;
} __packed;
+#define HCI_EV_LE_PAST_RECEIVED 0x18
+struct hci_ev_le_past_received {
+ __u8 status;
+ __le16 handle;
+ __le16 service_data;
+ __le16 sync_handle;
+ __u8 sid;
+ __u8 bdaddr_type;
+ bdaddr_t bdaddr;
+ __u8 phy;
+ __le16 interval;
+ __u8 clock_accuracy;
+} __packed;
+
#define HCI_EVT_LE_CIS_ESTABLISHED 0x19
struct hci_evt_le_cis_established {
__u8 status;
@@ -2796,8 +3130,8 @@ struct hci_evt_le_create_big_complete {
__le16 bis_handle[];
} __packed;
-#define HCI_EVT_LE_BIG_SYNC_ESTABILISHED 0x1d
-struct hci_evt_le_big_sync_estabilished {
+#define HCI_EVT_LE_BIG_SYNC_ESTABLISHED 0x1d
+struct hci_evt_le_big_sync_established {
__u8 status;
__u8 handle;
__u8 latency[3];
@@ -2811,6 +3145,12 @@ struct hci_evt_le_big_sync_estabilished {
__le16 bis[];
} __packed;
+#define HCI_EVT_LE_BIG_SYNC_LOST 0x1e
+struct hci_evt_le_big_sync_lost {
+ __u8 handle;
+ __u8 reason;
+} __packed;
+
#define HCI_EVT_LE_BIG_INFO_ADV_REPORT 0x22
struct hci_evt_le_big_info_adv_report {
__le16 sync_handle;
@@ -2828,6 +3168,138 @@ struct hci_evt_le_big_info_adv_report {
__u8 encryption;
} __packed;
+#define HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE 0x2b
+struct hci_evt_le_read_all_remote_features_complete {
+ __u8 status;
+ __le16 handle;
+ __u8 max_pages;
+ __u8 valid_pages;
+ __u8 features[248];
+} __packed;
+
+/* Channel Sounding Events */
+#define HCI_EVT_LE_CS_READ_RMT_SUPP_CAP_COMPLETE 0x2C
+struct hci_evt_le_cs_read_rmt_supp_cap_complete {
+ __u8 status;
+ __le16 handle;
+ __u8 num_configs_supp;
+ __le16 max_consec_proc_supp;
+ __u8 num_ant_supp;
+ __u8 max_ant_path_supp;
+ __u8 roles_supp;
+ __u8 modes_supp;
+ __u8 rtt_cap;
+ __u8 rtt_aa_only_n;
+ __u8 rtt_sounding_n;
+ __u8 rtt_rand_payload_n;
+ __le16 nadm_sounding_cap;
+ __le16 nadm_rand_cap;
+ __u8 cs_sync_phys_supp;
+ __le16 sub_feat_supp;
+ __le16 t_ip1_times_supp;
+ __le16 t_ip2_times_supp;
+ __le16 t_fcs_times_supp;
+ __le16 t_pm_times_supp;
+ __u8 t_sw_times_supp;
+ __u8 tx_snr_cap;
+} __packed;
+
+#define HCI_EVT_LE_CS_READ_RMT_FAE_TABLE_COMPLETE 0x2D
+struct hci_evt_le_cs_read_rmt_fae_table_complete {
+ __u8 status;
+ __le16 handle;
+ __u8 remote_fae_table[72];
+} __packed;
+
+#define HCI_EVT_LE_CS_SECURITY_ENABLE_COMPLETE 0x2E
+struct hci_evt_le_cs_security_enable_complete {
+ __u8 status;
+ __le16 handle;
+} __packed;
+
+#define HCI_EVT_LE_CS_CONFIG_COMPLETE 0x2F
+struct hci_evt_le_cs_config_complete {
+ __u8 status;
+ __le16 handle;
+ __u8 config_id;
+ __u8 action;
+ __u8 main_mode_type;
+ __u8 sub_mode_type;
+ __u8 min_main_mode_steps;
+ __u8 max_main_mode_steps;
+ __u8 main_mode_rep;
+ __u8 mode_0_steps;
+ __u8 role;
+ __u8 rtt_type;
+ __u8 cs_sync_phy;
+ __u8 channel_map[10];
+ __u8 channel_map_rep;
+ __u8 channel_sel_type;
+ __u8 ch3c_shape;
+ __u8 ch3c_jump;
+ __u8 reserved;
+ __u8 t_ip1_time;
+ __u8 t_ip2_time;
+ __u8 t_fcs_time;
+ __u8 t_pm_time;
+} __packed;
+
+#define HCI_EVT_LE_CS_PROCEDURE_ENABLE_COMPLETE 0x30
+struct hci_evt_le_cs_procedure_enable_complete {
+ __u8 status;
+ __le16 handle;
+ __u8 config_id;
+ __u8 state;
+ __u8 tone_ant_config_sel;
+ __s8 sel_tx_pwr;
+ __u8 sub_evt_len[3];
+ __u8 sub_evts_per_evt;
+ __le16 sub_evt_intrvl;
+ __le16 evt_intrvl;
+ __le16 proc_intrvl;
+ __le16 proc_counter;
+ __le16 max_proc_len;
+} __packed;
+
+#define HCI_EVT_LE_CS_SUBEVENT_RESULT 0x31
+struct hci_evt_le_cs_subevent_result {
+ __le16 handle;
+ __u8 config_id;
+ __le16 start_acl_conn_evt_counter;
+ __le16 proc_counter;
+ __le16 freq_comp;
+ __u8 ref_pwr_lvl;
+ __u8 proc_done_status;
+ __u8 subevt_done_status;
+ __u8 abort_reason;
+ __u8 num_ant_paths;
+ __u8 num_steps_reported;
+ __u8 step_mode[0]; /* depends on num_steps_reported */
+ __u8 step_channel[0]; /* depends on num_steps_reported */
+ __u8 step_data_length[0]; /* depends on num_steps_reported */
+ __u8 step_data[0]; /* depends on num_steps_reported */
+} __packed;
+
+#define HCI_EVT_LE_CS_SUBEVENT_RESULT_CONTINUE 0x32
+struct hci_evt_le_cs_subevent_result_continue {
+ __le16 handle;
+ __u8 config_id;
+ __u8 proc_done_status;
+ __u8 subevt_done_status;
+ __u8 abort_reason;
+ __u8 num_ant_paths;
+ __u8 num_steps_reported;
+ __u8 step_mode[0]; /* depends on num_steps_reported */
+ __u8 step_channel[0]; /* depends on num_steps_reported */
+ __u8 step_data_length[0]; /* depends on num_steps_reported */
+ __u8 step_data[0]; /* depends on num_steps_reported */
+} __packed;
+
+#define HCI_EVT_LE_CS_TEST_END_COMPLETE 0x33
+struct hci_evt_le_cs_test_end_complete {
+ __u8 status;
+} __packed;
+
#define HCI_EV_VENDOR 0xff
/* Internal events generated by Bluetooth stack */
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 6281063cbd8e..aa600fbf9a53 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -29,8 +29,11 @@
#include <linux/idr.h>
#include <linux/leds.h>
#include <linux/rculist.h>
+#include <linux/spinlock.h>
+#include <linux/srcu.h>
#include <net/bluetooth/hci.h>
+#include <net/bluetooth/hci_drv.h>
#include <net/bluetooth/hci_sync.h>
#include <net/bluetooth/hci_sock.h>
#include <net/bluetooth/coredump.h>
@@ -92,6 +95,7 @@ struct discovery_state {
u16 uuid_count;
u8 (*uuids)[16];
unsigned long name_resolve_timeout;
+ spinlock_t lock;
};
#define SUSPEND_NOTIFIER_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */
@@ -125,7 +129,9 @@ struct hci_conn_hash {
struct list_head list;
unsigned int acl_num;
unsigned int sco_num;
- unsigned int iso_num;
+ unsigned int cis_num;
+ unsigned int bis_num;
+ unsigned int pa_num;
unsigned int le_num;
unsigned int le_num_peripheral;
};
@@ -160,6 +166,7 @@ enum hci_conn_flags {
HCI_CONN_FLAG_REMOTE_WAKEUP = BIT(0),
HCI_CONN_FLAG_DEVICE_PRIVACY = BIT(1),
HCI_CONN_FLAG_ADDRESS_RESOLUTION = BIT(2),
+ HCI_CONN_FLAG_PAST = BIT(3),
};
typedef u8 hci_conn_flags_t;
@@ -238,9 +245,11 @@ struct adv_info {
bool enabled;
bool pending;
bool periodic;
+ bool periodic_enabled;
__u8 mesh;
__u8 instance;
__u8 handle;
+ __u8 sid;
__u32 flags;
__u16 timeout;
__u16 remaining_time;
@@ -261,6 +270,12 @@ struct adv_info {
struct delayed_work rpa_expired_cb;
};
+struct tx_queue {
+ struct sk_buff_head queue;
+ unsigned int extra;
+ unsigned int tracked;
+};
+
#define HCI_MAX_ADV_INSTANCES 5
#define HCI_DEFAULT_ADV_DURATION 2
@@ -339,6 +354,7 @@ struct adv_monitor {
struct hci_dev {
struct list_head list;
+ struct srcu_struct srcu;
struct mutex lock;
struct ida unset_handle_ida;
@@ -362,7 +378,7 @@ struct hci_dev {
__u8 minor_class;
__u8 max_page;
__u8 features[HCI_MAX_PAGES][8];
- __u8 le_features[8];
+ __u8 le_features[248];
__u8 le_accept_list_size;
__u8 le_resolv_list_size;
__u8 le_num_of_adv_sets;
@@ -454,7 +470,7 @@ struct hci_dev {
unsigned int auto_accept_delay;
- unsigned long quirks;
+ DECLARE_BITMAP(quirk_flags, __HCI_NUM_QUIRKS);
atomic_t cmd_cnt;
unsigned int acl_cnt;
@@ -473,6 +489,7 @@ struct hci_dev {
unsigned long acl_last_tx;
unsigned long le_last_tx;
+ unsigned long iso_last_tx;
__u8 le_tx_def_phys;
__u8 le_rx_def_phys;
@@ -539,6 +556,7 @@ struct hci_dev {
struct hci_conn_hash conn_hash;
struct list_head mesh_pending;
+ struct mutex mgmt_pending_lock;
struct list_head mgmt_pending;
struct list_head reject_list;
struct list_head accept_list;
@@ -607,6 +625,8 @@ struct hci_dev {
struct list_head monitored_devices;
bool advmon_pend_notify;
+ struct hci_drv *hci_drv;
+
#if IS_ENABLED(CONFIG_BT_LEDS)
struct led_trigger *power_led;
#endif
@@ -643,6 +663,10 @@ struct hci_dev {
u8 (*classify_pkt_type)(struct hci_dev *hdev, struct sk_buff *skb);
};
+#define hci_set_quirk(hdev, nr) set_bit((nr), (hdev)->quirk_flags)
+#define hci_clear_quirk(hdev, nr) clear_bit((nr), (hdev)->quirk_flags)
+#define hci_test_quirk(hdev, nr) test_bit((nr), (hdev)->quirk_flags)
+
#define HCI_PHY_HANDLE(handle) (handle & 0xff)
enum conn_reasons {
@@ -678,6 +702,7 @@ struct hci_conn {
__u8 attempt;
__u8 dev_class[3];
__u8 features[HCI_MAX_PAGES][8];
+ __u8 le_features[248];
__u16 pkt_type;
__u16 link_policy;
__u8 key_type;
@@ -705,6 +730,8 @@ struct hci_conn {
__u16 le_per_adv_data_offset;
__u8 le_adv_phy;
__u8 le_adv_sec_phy;
+ __u8 le_tx_def_phys;
+ __u8 le_rx_def_phys;
__u8 le_tx_phy;
__u8 le_rx_phy;
__s8 rssi;
@@ -726,13 +753,14 @@ struct hci_conn {
__u8 remote_cap;
__u8 remote_auth;
- __u8 remote_id;
unsigned int sent;
struct sk_buff_head data_q;
struct list_head chan_list;
+ struct tx_queue tx_q;
+
struct delayed_work disc_work;
struct delayed_work auto_accept_work;
struct delayed_work idle_work;
@@ -814,29 +842,30 @@ extern struct mutex hci_cb_list_lock;
#define hci_dev_test_and_clear_flag(hdev, nr) test_and_clear_bit((nr), (hdev)->dev_flags)
#define hci_dev_test_and_change_flag(hdev, nr) test_and_change_bit((nr), (hdev)->dev_flags)
-#define hci_dev_clear_volatile_flags(hdev) \
- do { \
- hci_dev_clear_flag(hdev, HCI_LE_SCAN); \
- hci_dev_clear_flag(hdev, HCI_LE_ADV); \
- hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\
- hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); \
- hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); \
+#define hci_dev_clear_volatile_flags(hdev) \
+ do { \
+ hci_dev_clear_flag((hdev), HCI_LE_SCAN); \
+ hci_dev_clear_flag((hdev), HCI_LE_ADV); \
+ hci_dev_clear_flag((hdev), HCI_LL_RPA_RESOLUTION); \
+ hci_dev_clear_flag((hdev), HCI_PERIODIC_INQ); \
+ hci_dev_clear_flag((hdev), HCI_QUALITY_REPORT); \
} while (0)
#define hci_dev_le_state_simultaneous(hdev) \
- (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) && \
- (hdev->le_states[4] & 0x08) && /* Central */ \
- (hdev->le_states[4] & 0x40) && /* Peripheral */ \
- (hdev->le_states[3] & 0x10)) /* Simultaneous */
+ (!hci_test_quirk((hdev), HCI_QUIRK_BROKEN_LE_STATES) && \
+ ((hdev)->le_states[4] & 0x08) && /* Central */ \
+ ((hdev)->le_states[4] & 0x40) && /* Peripheral */ \
+ ((hdev)->le_states[3] & 0x10)) /* Simultaneous */
/* ----- HCI interface to upper protocols ----- */
int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr);
int l2cap_disconn_ind(struct hci_conn *hcon);
-void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);
+int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb,
+ u16 flags);
#if IS_ENABLED(CONFIG_BT_BREDR)
int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
-void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb);
+int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb);
#else
static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
__u8 *flags)
@@ -844,23 +873,30 @@ static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
return 0;
}
-static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+static inline int sco_recv_scodata(struct hci_dev *hdev, u16 handle,
+ struct sk_buff *skb)
{
+ kfree_skb(skb);
+ return -ENOENT;
}
#endif
#if IS_ENABLED(CONFIG_BT_LE)
int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags);
-void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags);
+int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb,
+ u16 flags);
#else
static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
__u8 *flags)
{
return 0;
}
-static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb,
- u16 flags)
+
+static inline int iso_recv(struct hci_dev *hdev, u16 handle,
+ struct sk_buff *skb, u16 flags)
{
+ kfree_skb(skb);
+ return -ENOENT;
}
#endif
@@ -870,6 +906,7 @@ static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb,
static inline void discovery_init(struct hci_dev *hdev)
{
+ spin_lock_init(&hdev->discovery.lock);
hdev->discovery.state = DISCOVERY_STOPPED;
INIT_LIST_HEAD(&hdev->discovery.all);
INIT_LIST_HEAD(&hdev->discovery.unknown);
@@ -884,8 +921,11 @@ static inline void hci_discovery_filter_clear(struct hci_dev *hdev)
hdev->discovery.report_invalid_rssi = true;
hdev->discovery.rssi = HCI_RSSI_INVALID;
hdev->discovery.uuid_count = 0;
+
+ spin_lock(&hdev->discovery.lock);
kfree(hdev->discovery.uuids);
hdev->discovery.uuids = NULL;
+ spin_unlock(&hdev->discovery.lock);
}
bool hci_discovery_active(struct hci_dev *hdev);
@@ -988,8 +1028,14 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
case ESCO_LINK:
h->sco_num++;
break;
- case ISO_LINK:
- h->iso_num++;
+ case CIS_LINK:
+ h->cis_num++;
+ break;
+ case BIS_LINK:
+ h->bis_num++;
+ break;
+ case PA_LINK:
+ h->pa_num++;
break;
}
}
@@ -1014,8 +1060,14 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c)
case ESCO_LINK:
h->sco_num--;
break;
- case ISO_LINK:
- h->iso_num--;
+ case CIS_LINK:
+ h->cis_num--;
+ break;
+ case BIS_LINK:
+ h->bis_num--;
+ break;
+ case PA_LINK:
+ h->pa_num--;
break;
}
}
@@ -1031,8 +1083,12 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type)
case SCO_LINK:
case ESCO_LINK:
return h->sco_num;
- case ISO_LINK:
- return h->iso_num;
+ case CIS_LINK:
+ return h->cis_num;
+ case BIS_LINK:
+ return h->bis_num;
+ case PA_LINK:
+ return h->pa_num;
default:
return 0;
}
@@ -1042,7 +1098,15 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev)
{
struct hci_conn_hash *c = &hdev->conn_hash;
- return c->acl_num + c->sco_num + c->le_num + c->iso_num;
+ return c->acl_num + c->sco_num + c->le_num + c->cis_num + c->bis_num +
+ c->pa_num;
+}
+
+static inline unsigned int hci_iso_count(struct hci_dev *hdev)
+{
+ struct hci_conn_hash *c = &hdev->conn_hash;
+
+ return c->cis_num + c->bis_num;
}
static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn)
@@ -1092,7 +1156,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (bacmp(&c->dst, ba) || c->type != ISO_LINK)
+ if (bacmp(&c->dst, ba) || c->type != BIS_LINK)
continue;
if (c->iso_qos.bcast.bis == bis) {
@@ -1105,10 +1169,8 @@ static inline struct hci_conn *hci_conn_hash_lookup_bis(struct hci_dev *hdev,
return NULL;
}
-static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev,
- __u8 sid,
- bdaddr_t *dst,
- __u8 dst_type)
+static inline struct hci_conn *
+hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev)
{
struct hci_conn_hash *h = &hdev->conn_hash;
struct hci_conn *c;
@@ -1116,8 +1178,10 @@ static inline struct hci_conn *hci_conn_hash_lookup_sid(struct hci_dev *hdev,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != ISO_LINK || bacmp(&c->dst, dst) ||
- c->dst_type != dst_type || c->sid != sid)
+ if (c->type != PA_LINK)
+ continue;
+
+ if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags))
continue;
rcu_read_unlock();
@@ -1140,8 +1204,8 @@ hci_conn_hash_lookup_per_adv_bis(struct hci_dev *hdev,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (bacmp(&c->dst, ba) || c->type != ISO_LINK ||
- !test_bit(HCI_CONN_PER_ADV, &c->flags))
+ if (bacmp(&c->dst, ba) || c->type != BIS_LINK ||
+ !test_bit(HCI_CONN_PER_ADV, &c->flags))
continue;
if (c->iso_qos.bcast.big == big &&
@@ -1194,6 +1258,27 @@ static inline struct hci_conn *hci_conn_hash_lookup_ba(struct hci_dev *hdev,
return NULL;
}
+static inline struct hci_conn *hci_conn_hash_lookup_role(struct hci_dev *hdev,
+ __u8 type, __u8 role,
+ bdaddr_t *ba)
+{
+ struct hci_conn_hash *h = &hdev->conn_hash;
+ struct hci_conn *c;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(c, &h->list, list) {
+ if (c->type == type && c->role == role && !bacmp(&c->dst, ba)) {
+ rcu_read_unlock();
+ return c;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
static inline struct hci_conn *hci_conn_hash_lookup_le(struct hci_dev *hdev,
bdaddr_t *ba,
__u8 ba_type)
@@ -1230,7 +1315,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cis(struct hci_dev *hdev,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY))
+ if (c->type != CIS_LINK)
continue;
/* Match CIG ID if set */
@@ -1262,7 +1347,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_cig(struct hci_dev *hdev,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != ISO_LINK || !bacmp(&c->dst, BDADDR_ANY))
+ if (c->type != CIS_LINK)
continue;
if (handle == c->iso_qos.ucast.cig) {
@@ -1285,17 +1370,7 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != ISO_LINK)
- continue;
-
- /* An ISO_LINK hcon with BDADDR_ANY as destination
- * address is a Broadcast connection. A Broadcast
- * slave connection is associated with a PA train,
- * so the sync_handle can be used to differentiate
- * from unicast.
- */
- if (bacmp(&c->dst, BDADDR_ANY) &&
- c->sync_handle == HCI_SYNC_HANDLE_INVALID)
+ if (c->type != BIS_LINK)
continue;
if (handle == c->iso_qos.bcast.big) {
@@ -1319,7 +1394,7 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != ISO_LINK)
+ if (c->type != PA_LINK)
continue;
if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) {
@@ -1334,7 +1409,8 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev,
}
static inline struct hci_conn *
-hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state)
+hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state,
+ __u8 role)
{
struct hci_conn_hash *h = &hdev->conn_hash;
struct hci_conn *c;
@@ -1342,8 +1418,7 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state)
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK ||
- c->state != state)
+ if (c->type != BIS_LINK || c->state != state || c->role != role)
continue;
if (handle == c->iso_qos.bcast.big) {
@@ -1366,8 +1441,8 @@ hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big)
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != ISO_LINK ||
- !test_bit(HCI_CONN_PA_SYNC, &c->flags))
+ if (c->type != BIS_LINK ||
+ !test_bit(HCI_CONN_PA_SYNC, &c->flags))
continue;
if (c->iso_qos.bcast.big == big) {
@@ -1389,7 +1464,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != ISO_LINK)
+ if (c->type != PA_LINK)
continue;
/* Ignore the listen hcon, we are looking
@@ -1409,26 +1484,6 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
return NULL;
}
-static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev,
- __u8 type, __u16 state)
-{
- struct hci_conn_hash *h = &hdev->conn_hash;
- struct hci_conn *c;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type == type && c->state == state) {
- rcu_read_unlock();
- return c;
- }
- }
-
- rcu_read_unlock();
-
- return NULL;
-}
-
typedef void (*hci_conn_func_t)(struct hci_conn *conn, void *data);
static inline void hci_conn_hash_list_state(struct hci_dev *hdev,
hci_conn_func_t func, __u8 type,
@@ -1516,14 +1571,12 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle);
void hci_sco_setup(struct hci_conn *conn, __u8 status);
bool hci_iso_setup_path(struct hci_conn *conn);
int hci_le_create_cis_pending(struct hci_dev *hdev);
-int hci_pa_create_sync_pending(struct hci_dev *hdev);
-int hci_le_big_create_sync_pending(struct hci_dev *hdev);
int hci_conn_check_create_cis(struct hci_conn *conn);
struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
- u8 role, u16 handle);
+ u8 dst_type, u8 role, u16 handle);
struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
- bdaddr_t *dst, u8 role);
+ bdaddr_t *dst, u8 dst_type, u8 role);
void hci_conn_del(struct hci_conn *conn);
void hci_conn_hash_flush(struct hci_dev *hdev);
@@ -1547,20 +1600,24 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
__u16 setting, struct bt_codec *codec,
u16 timeout);
struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
- __u8 dst_type, struct bt_iso_qos *qos);
-struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, struct bt_iso_qos *qos,
+ u16 timeout);
+struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
struct bt_iso_qos *qos,
- __u8 base_len, __u8 *base);
+ __u8 base_len, __u8 *base, u16 timeout);
+int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type);
struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
- __u8 dst_type, struct bt_iso_qos *qos);
-struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
__u8 dst_type, struct bt_iso_qos *qos,
- __u8 data_len, __u8 *data);
+ u16 timeout);
+struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, __u8 sid,
+ struct bt_iso_qos *qos,
+ __u8 data_len, __u8 *data, u16 timeout);
struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
__u8 dst_type, __u8 sid, struct bt_iso_qos *qos);
-int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
- struct bt_iso_qos *qos,
- __u16 sync_handle, __u8 num_bis, __u8 bis[]);
+int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
+ struct bt_iso_qos *qos, __u16 sync_handle,
+ __u8 num_bis, __u8 bis[]);
int hci_conn_check_link_mode(struct hci_conn *conn);
int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level);
int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type,
@@ -1572,6 +1629,18 @@ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active);
void hci_conn_failed(struct hci_conn *conn, u8 status);
u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle);
+void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb);
+void hci_conn_tx_dequeue(struct hci_conn *conn);
+void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset,
+ const struct sockcm_cookie *sockc);
+
+static inline void hci_sockcm_init(struct sockcm_cookie *sockc, struct sock *sk)
+{
+ *sockc = (struct sockcm_cookie) {
+ .tsflags = READ_ONCE(sk->sk_tsflags),
+ };
+}
+
/*
* hci_conn_get() and hci_conn_put() are used to control the life-time of an
* "hci_conn" object. They do not guarantee that the hci_conn object is running,
@@ -1780,6 +1849,7 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
void hci_uuids_clear(struct hci_dev *hdev);
void hci_link_keys_clear(struct hci_dev *hdev);
+u8 *hci_conn_key_enc_size(struct hci_conn *conn);
struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr);
struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn,
bdaddr_t *bdaddr, u8 *val, u8 type,
@@ -1816,6 +1886,7 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr,
void hci_adv_instances_clear(struct hci_dev *hdev);
struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance);
+struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid);
struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance);
struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
u32 flags, u16 adv_data_len, u8 *adv_data,
@@ -1823,7 +1894,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
u16 timeout, u16 duration, s8 tx_power,
u32 min_interval, u32 max_interval,
u8 mesh_handle);
-struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance,
+struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid,
u32 flags, u8 data_len, u8 *data,
u32 min_interval, u32 max_interval);
int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance,
@@ -1858,6 +1929,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
#define lmp_hold_capable(dev) ((dev)->features[0][0] & LMP_HOLD)
#define lmp_sniff_capable(dev) ((dev)->features[0][0] & LMP_SNIFF)
#define lmp_park_capable(dev) ((dev)->features[0][1] & LMP_PARK)
+#define lmp_sco_capable(dev) ((dev)->features[0][1] & LMP_SCO)
#define lmp_inq_rssi_capable(dev) ((dev)->features[0][3] & LMP_RSSI_INQ)
#define lmp_esco_capable(dev) ((dev)->features[0][3] & LMP_ESCO)
#define lmp_bredr_capable(dev) (!((dev)->features[0][4] & LMP_NO_BREDR))
@@ -1900,6 +1972,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
!hci_dev_test_flag(dev, HCI_RPA_EXPIRED))
#define adv_rpa_valid(adv) (bacmp(&adv->random_addr, BDADDR_ANY) && \
!adv->rpa_expired)
+#define le_enabled(dev) (lmp_le_capable(dev) && \
+ hci_dev_test_flag(dev, HCI_LE_ENABLED))
#define scan_1m(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_1M) || \
((dev)->le_rx_def_phys & HCI_LE_SET_PHY_1M))
@@ -1910,36 +1984,41 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M))
#define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \
- !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \
- &(dev)->quirks))
+ !hci_test_quirk((dev), \
+ HCI_QUIRK_BROKEN_LE_CODED))
#define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \
((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED))
#define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY)
+#define ll_privacy_enabled(dev) (le_enabled(dev) && ll_privacy_capable(dev))
#define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \
- (hdev->commands[39] & 0x04))
+ ((dev)->commands[39] & 0x04))
#define read_key_size_capable(dev) \
((dev)->commands[20] & 0x10 && \
- !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks))
+ !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE))
+
+#define read_voice_setting_capable(dev) \
+ ((dev)->commands[9] & 0x04 && \
+ !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_VOICE_SETTING))
/* Use enhanced synchronous connection if command is supported and its quirk
* has not been set.
*/
#define enhanced_sync_conn_capable(dev) \
(((dev)->commands[29] & 0x08) && \
- !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks))
+ !hci_test_quirk((dev), HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN))
/* Use ext scanning if set ext scan param and ext scan enable is supported */
#define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \
((dev)->commands[37] & 0x40) && \
- !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks))
+ !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_SCAN))
/* Use ext create connection if command is supported */
#define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \
- !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks))
+ !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_CREATE_CONN))
/* Extended advertising support */
#define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV))
@@ -1954,25 +2033,54 @@ void hci_conn_del_sysfs(struct hci_conn *conn);
*/
#define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \
ext_adv_capable(dev)) && \
- !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \
- &(dev)->quirks))
+ !hci_test_quirk((dev), \
+ HCI_QUIRK_BROKEN_EXT_CREATE_CONN))
/* Periodic advertising support */
#define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV))
/* CIS Master/Slave and BIS support */
#define iso_capable(dev) (cis_capable(dev) || bis_capable(dev))
+#define iso_enabled(dev) (le_enabled(dev) && iso_capable(dev))
#define cis_capable(dev) \
(cis_central_capable(dev) || cis_peripheral_capable(dev))
+#define cis_enabled(dev) (le_enabled(dev) && cis_capable(dev))
#define cis_central_capable(dev) \
((dev)->le_features[3] & HCI_LE_CIS_CENTRAL)
+#define cis_central_enabled(dev) \
+ (le_enabled(dev) && cis_central_capable(dev))
#define cis_peripheral_capable(dev) \
((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL)
+#define cis_peripheral_enabled(dev) \
+ (le_enabled(dev) && cis_peripheral_capable(dev))
#define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER)
-#define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER)
+#define bis_enabled(dev) (le_enabled(dev) && bis_capable(dev))
+#define sync_recv_capable(dev) \
+ ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER)
+#define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev))
+#define past_sender_capable(dev) \
+ ((dev)->le_features[3] & HCI_LE_PAST_SENDER)
+#define past_receiver_capable(dev) \
+ ((dev)->le_features[3] & HCI_LE_PAST_RECEIVER)
+#define past_capable(dev) \
+ (past_sender_capable(dev) || past_receiver_capable(dev))
+#define past_sender_enabled(dev) \
+ (le_enabled(dev) && past_sender_capable(dev))
+#define past_receiver_enabled(dev) \
+ (le_enabled(dev) && past_receiver_capable(dev))
+#define past_enabled(dev) \
+ (past_sender_enabled(dev) || past_receiver_enabled(dev))
+#define ll_ext_feature_capable(dev) \
+ ((dev)->le_features[7] & HCI_LE_LL_EXT_FEATURE)
+
+/* Channel sounding support */
+#define le_cs_capable(dev) \
+ ((dev)->le_features[5] & HCI_LE_CS)
+#define le_cs_host_capable(dev) \
+ ((dev)->le_features[5] & HCI_LE_CS_HOST)
#define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \
- (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks)))
+ (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG)))
/* ----- HCI protocols ----- */
#define HCI_PROTO_DEFER 0x01
@@ -1988,7 +2096,9 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr,
case ESCO_LINK:
return sco_connect_ind(hdev, bdaddr, flags);
- case ISO_LINK:
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
return iso_connect_ind(hdev, bdaddr, flags);
default:
@@ -2232,6 +2342,7 @@ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode);
void *hci_recv_event_data(struct hci_dev *hdev, __u8 event);
u32 hci_conn_get_phy(struct hci_conn *conn);
+int hci_conn_set_phy(struct hci_conn *conn, u32 phys);
/* ----- HCI Sockets ----- */
void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb);
@@ -2354,8 +2465,6 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status);
void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
u8 status);
void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status);
-void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status);
-void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status);
void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
u8 addr_type, u8 *dev_class, s8 rssi, u32 flags,
u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len,
@@ -2381,13 +2490,12 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev,
u8 instance);
void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev,
u8 instance);
-void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle);
int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip);
void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
bdaddr_t *bdaddr, u8 addr_type);
int hci_abort_conn(struct hci_conn *conn, u8 reason);
-u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
+void hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
u16 to_multiplier);
void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
__u8 ltk[16], __u8 key_size);
diff --git a/include/net/bluetooth/hci_drv.h b/include/net/bluetooth/hci_drv.h
new file mode 100644
index 000000000000..3fd6fdbdb02e
--- /dev/null
+++ b/include/net/bluetooth/hci_drv.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025 Google Corporation
+ */
+
+#ifndef __HCI_DRV_H
+#define __HCI_DRV_H
+
+#include <linux/types.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci.h>
+
+struct hci_drv_cmd_hdr {
+ __le16 opcode;
+ __le16 len;
+} __packed;
+
+struct hci_drv_ev_hdr {
+ __le16 opcode;
+ __le16 len;
+} __packed;
+
+#define HCI_DRV_EV_CMD_STATUS 0x0000
+struct hci_drv_ev_cmd_status {
+ __le16 opcode;
+ __u8 status;
+} __packed;
+
+#define HCI_DRV_EV_CMD_COMPLETE 0x0001
+struct hci_drv_ev_cmd_complete {
+ __le16 opcode;
+ __u8 status;
+ __u8 data[];
+} __packed;
+
+#define HCI_DRV_STATUS_SUCCESS 0x00
+#define HCI_DRV_STATUS_UNSPECIFIED_ERROR 0x01
+#define HCI_DRV_STATUS_UNKNOWN_COMMAND 0x02
+#define HCI_DRV_STATUS_INVALID_PARAMETERS 0x03
+
+#define HCI_DRV_MAX_DRIVER_NAME_LENGTH 32
+
+/* Common commands that make sense on all drivers start from 0x0000 */
+#define HCI_DRV_OP_READ_INFO 0x0000
+#define HCI_DRV_READ_INFO_SIZE 0
+struct hci_drv_rp_read_info {
+ __u8 driver_name[HCI_DRV_MAX_DRIVER_NAME_LENGTH];
+ __le16 num_supported_commands;
+ __le16 supported_commands[] __counted_by_le(num_supported_commands);
+} __packed;
+
+/* Driver specific OGF (Opcode Group Field)
+ * Commands in this group may have different meanings across different drivers.
+ */
+#define HCI_DRV_OGF_DRIVER_SPECIFIC 0x01
+
+int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status);
+int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp,
+ size_t rp_len);
+int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *cmd_skb);
+
+struct hci_drv_handler {
+ int (*func)(struct hci_dev *hdev, void *data, u16 data_len);
+ size_t data_len;
+};
+
+struct hci_drv {
+ size_t common_handler_count;
+ const struct hci_drv_handler *common_handlers;
+
+ size_t specific_handler_count;
+ const struct hci_drv_handler *specific_handlers;
+};
+
+#endif /* __HCI_DRV_H */
diff --git a/include/net/bluetooth/hci_mon.h b/include/net/bluetooth/hci_mon.h
index 082f89531b88..bbd752494ef9 100644
--- a/include/net/bluetooth/hci_mon.h
+++ b/include/net/bluetooth/hci_mon.h
@@ -51,6 +51,8 @@ struct hci_mon_hdr {
#define HCI_MON_CTRL_EVENT 17
#define HCI_MON_ISO_TX_PKT 18
#define HCI_MON_ISO_RX_PKT 19
+#define HCI_MON_DRV_TX_PKT 20
+#define HCI_MON_DRV_RX_PKT 21
struct hci_mon_new_index {
__u8 type;
diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h
index 7e2cf0cca939..73e494b2591d 100644
--- a/include/net/bluetooth/hci_sync.h
+++ b/include/net/bluetooth/hci_sync.h
@@ -93,7 +93,7 @@ int hci_update_class_sync(struct hci_dev *hdev);
int hci_update_eir_sync(struct hci_dev *hdev);
int hci_update_class_sync(struct hci_dev *hdev);
-int hci_update_name_sync(struct hci_dev *hdev);
+int hci_update_name_sync(struct hci_dev *hdev, const u8 *name);
int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode);
int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
@@ -115,8 +115,8 @@ int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance);
int hci_enable_advertising_sync(struct hci_dev *hdev);
int hci_enable_advertising(struct hci_dev *hdev);
-int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len,
- u8 *data, u32 flags, u16 min_interval,
+int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid,
+ u8 data_len, u8 *data, u32 flags, u16 min_interval,
u16 max_interval, u16 sync_interval);
int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance);
@@ -185,3 +185,12 @@ int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn);
int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn);
int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn,
struct hci_conn_params *params);
+
+int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn);
+int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn);
+int hci_past_sync(struct hci_conn *conn, struct hci_conn *le);
+
+int hci_le_read_remote_features(struct hci_conn *conn);
+
+int hci_acl_change_pkt_type(struct hci_conn *conn, u16 pkt_type);
+int hci_le_set_phy(struct hci_conn *conn, u8 tx_phys, u8 rx_phys);
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 9189354c568f..e0a1f2293679 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -33,6 +33,7 @@
/* L2CAP defaults */
#define L2CAP_DEFAULT_MTU 672
#define L2CAP_DEFAULT_MIN_MTU 48
+#define L2CAP_SIG_MTU 48 /* BR/EDR signaling MTU */
#define L2CAP_DEFAULT_FLUSH_TO 0xFFFF
#define L2CAP_EFS_DEFAULT_FLUSH_TO 0xFFFFFFFF
#define L2CAP_DEFAULT_TX_WINDOW 63
@@ -284,9 +285,9 @@ struct l2cap_conn_rsp {
#define L2CAP_CR_LE_BAD_KEY_SIZE 0x0007
#define L2CAP_CR_LE_ENCRYPTION 0x0008
#define L2CAP_CR_LE_INVALID_SCID 0x0009
-#define L2CAP_CR_LE_SCID_IN_USE 0X000A
-#define L2CAP_CR_LE_UNACCEPT_PARAMS 0X000B
-#define L2CAP_CR_LE_INVALID_PARAMS 0X000C
+#define L2CAP_CR_LE_SCID_IN_USE 0x000A
+#define L2CAP_CR_LE_UNACCEPT_PARAMS 0x000B
+#define L2CAP_CR_LE_INVALID_PARAMS 0x000C
/* connect/create channel status */
#define L2CAP_CS_NO_INFO 0x0000
@@ -493,6 +494,8 @@ struct l2cap_ecred_reconf_req {
#define L2CAP_RECONF_SUCCESS 0x0000
#define L2CAP_RECONF_INVALID_MTU 0x0001
#define L2CAP_RECONF_INVALID_MPS 0x0002
+#define L2CAP_RECONF_INVALID_CID 0x0003
+#define L2CAP_RECONF_INVALID_PARAMS 0x0004
struct l2cap_ecred_reconf_rsp {
__le16 result;
@@ -655,8 +658,8 @@ struct l2cap_conn {
struct sk_buff *rx_skb;
__u32 rx_len;
+ struct ida tx_ida;
__u8 tx_ident;
- struct mutex ident_lock;
struct sk_buff_head pending_rx;
struct work_struct pending_rx_work;
@@ -955,7 +958,8 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason);
int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
bdaddr_t *dst, u8 dst_type, u16 timeout);
int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu);
-int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len);
+int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len,
+ const struct sockcm_cookie *sockc);
void l2cap_chan_busy(struct l2cap_chan *chan, int busy);
void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail);
int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator);
diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h
index affac861efdc..8234915854b6 100644
--- a/include/net/bluetooth/mgmt.h
+++ b/include/net/bluetooth/mgmt.h
@@ -53,10 +53,15 @@ struct mgmt_hdr {
} __packed;
struct mgmt_tlv {
- __le16 type;
- __u8 length;
+ /* New members MUST be added within the __struct_group() macro below. */
+ __struct_group(mgmt_tlv_hdr, __hdr, __packed,
+ __le16 type;
+ __u8 length;
+ );
__u8 value[];
} __packed;
+static_assert(offsetof(struct mgmt_tlv, value) == sizeof(struct mgmt_tlv_hdr),
+ "struct member likely outside of __struct_group()");
struct mgmt_addr_info {
bdaddr_t bdaddr;
@@ -113,6 +118,9 @@ struct mgmt_rp_read_index_list {
#define MGMT_SETTING_CIS_PERIPHERAL BIT(19)
#define MGMT_SETTING_ISO_BROADCASTER BIT(20)
#define MGMT_SETTING_ISO_SYNC_RECEIVER BIT(21)
+#define MGMT_SETTING_LL_PRIVACY BIT(22)
+#define MGMT_SETTING_PAST_SENDER BIT(23)
+#define MGMT_SETTING_PAST_RECEIVER BIT(24)
#define MGMT_OP_READ_INFO 0x0004
#define MGMT_READ_INFO_SIZE 0
@@ -774,7 +782,7 @@ struct mgmt_adv_pattern {
__u8 ad_type;
__u8 offset;
__u8 length;
- __u8 value[31];
+ __u8 value[HCI_MAX_AD_LENGTH];
} __packed;
#define MGMT_OP_ADD_ADV_PATTERNS_MONITOR 0x0052
@@ -847,7 +855,7 @@ struct mgmt_cp_set_mesh {
__le16 window;
__le16 period;
__u8 num_ad_types;
- __u8 ad_types[];
+ __u8 ad_types[] __counted_by(num_ad_types);
} __packed;
#define MGMT_SET_MESH_RECEIVER_SIZE 6
diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h
index 2053cd8e788a..05572c19e14b 100644
--- a/include/net/bond_3ad.h
+++ b/include/net/bond_3ad.h
@@ -26,6 +26,7 @@ enum {
BOND_AD_STABLE = 0,
BOND_AD_BANDWIDTH = 1,
BOND_AD_COUNT = 2,
+ BOND_AD_PRIO = 3,
};
/* rx machine states(43.4.11 in the 802.3ad standard) */
@@ -242,7 +243,7 @@ typedef struct port {
churn_state_t sm_churn_actor_state;
churn_state_t sm_churn_partner_state;
struct slave *slave; /* pointer to the bond slave that this port belongs to */
- struct aggregator *aggregator; /* pointer to an aggregator that this port related to */
+ struct aggregator __rcu *aggregator; /* pointer to an aggregator that this port related to */
struct port *next_port_in_aggregator; /* Next port on the linked list of the parent aggregator */
u32 transaction_id; /* continuous number for identification of Marker PDU's; */
struct lacpdu lacpdu; /* the lacpdu that will be sent for this port */
@@ -274,6 +275,7 @@ struct ad_slave_info {
struct port port; /* 802.3ad port structure */
struct bond_3ad_stats stats;
u16 id;
+ u16 port_priority;
};
static inline const char *bond_3ad_churn_desc(churn_state_t state)
@@ -307,6 +309,7 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond,
struct slave *slave);
int bond_3ad_set_carrier(struct bonding *bond);
void bond_3ad_update_lacp_rate(struct bonding *bond);
+void bond_3ad_update_lacp_active(struct bonding *bond);
void bond_3ad_update_ad_actor_settings(struct bonding *bond);
int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats);
size_t bond_3ad_stats_size(void);
diff --git a/include/net/bond_options.h b/include/net/bond_options.h
index 18687ccf0638..e6eedf23aea1 100644
--- a/include/net/bond_options.h
+++ b/include/net/bond_options.h
@@ -77,6 +77,8 @@ enum {
BOND_OPT_NS_TARGETS,
BOND_OPT_PRIO,
BOND_OPT_COUPLED_CONTROL,
+ BOND_OPT_BROADCAST_NEIGH,
+ BOND_OPT_ACTOR_PORT_PRIO,
BOND_OPT_LAST
};
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 8bb5f016969f..edd1942dcd73 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -69,9 +69,6 @@
#define bond_first_slave_rcu(bond) \
netdev_lower_get_first_private_rcu(bond->dev)
-#define bond_is_first_slave(bond, pos) (pos == bond_first_slave(bond))
-#define bond_is_last_slave(bond, pos) (pos == bond_last_slave(bond))
-
/**
* bond_for_each_slave - iterate over all slaves
* @bond: the bond holding this list
@@ -91,22 +88,22 @@
NETIF_F_GSO_ESP)
#ifdef CONFIG_NET_POLL_CONTROLLER
-extern atomic_t netpoll_block_tx;
+DECLARE_STATIC_KEY_FALSE(netpoll_block_tx);
static inline void block_netpoll_tx(void)
{
- atomic_inc(&netpoll_block_tx);
+ static_branch_inc(&netpoll_block_tx);
}
static inline void unblock_netpoll_tx(void)
{
- atomic_dec(&netpoll_block_tx);
+ static_branch_dec(&netpoll_block_tx);
}
static inline int is_netpoll_tx_blocked(struct net_device *dev)
{
- if (unlikely(netpoll_tx_running(dev)))
- return atomic_read(&netpoll_block_tx);
+ if (static_branch_unlikely(&netpoll_block_tx))
+ return netpoll_tx_running(dev);
return 0;
}
#else
@@ -115,6 +112,8 @@ static inline int is_netpoll_tx_blocked(struct net_device *dev)
#define is_netpoll_tx_blocked(dev) (0)
#endif
+DECLARE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled);
+
struct bond_params {
int mode;
int xmit_policy;
@@ -124,7 +123,6 @@ struct bond_params {
int arp_interval;
int arp_validate;
int arp_all_targets;
- int use_carrier;
int fail_over_mac;
int updelay;
int downdelay;
@@ -149,6 +147,7 @@ struct bond_params {
struct in6_addr ns_targets[BOND_MAX_NS_TARGETS];
#endif
int coupled_control;
+ int broadcast_neighbor;
/* 2 bytes of padding : see ether_addr_equal_64bits() */
u8 ad_actor_system[ETH_ALEN + 2];
@@ -252,6 +251,7 @@ struct bonding {
struct delayed_work ad_work;
struct delayed_work mcast_work;
struct delayed_work slave_arr_work;
+ struct delayed_work peer_notify_work;
#ifdef CONFIG_DEBUG_FS
/* debugging support via debugfs */
struct dentry *debug_dir;
@@ -519,13 +519,14 @@ static inline int bond_is_ip6_target_ok(struct in6_addr *addr)
static inline unsigned long slave_oldest_target_arp_rx(struct bonding *bond,
struct slave *slave)
{
+ unsigned long tmp, ret = READ_ONCE(slave->target_last_arp_rx[0]);
int i = 1;
- unsigned long ret = slave->target_last_arp_rx[0];
-
- for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++)
- if (time_before(slave->target_last_arp_rx[i], ret))
- ret = slave->target_last_arp_rx[i];
+ for (; (i < BOND_MAX_ARP_TARGETS) && bond->params.arp_targets[i]; i++) {
+ tmp = READ_ONCE(slave->target_last_arp_rx[i]);
+ if (time_before(tmp, ret))
+ ret = tmp;
+ }
return ret;
}
@@ -535,7 +536,7 @@ static inline unsigned long slave_last_rx(struct bonding *bond,
if (bond->params.arp_all_targets == BOND_ARP_TARGETS_ALL)
return slave_oldest_target_arp_rx(bond, slave);
- return slave->last_rx;
+ return READ_ONCE(slave->last_rx);
}
static inline void slave_update_last_tx(struct slave *slave)
@@ -695,6 +696,8 @@ void bond_debug_register(struct bonding *bond);
void bond_debug_unregister(struct bonding *bond);
void bond_debug_reregister(struct bonding *bond);
const char *bond_mode_name(int mode);
+bool __bond_xdp_check(int mode, int xmit_policy);
+bool bond_xdp_check(struct bonding *bond, int mode);
void bond_setup(struct net_device *bond_dev);
unsigned int bond_get_num_tx_queues(void);
int bond_netlink_init(void);
@@ -706,7 +709,9 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
int level);
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave);
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay);
+void bond_peer_notify_work_rearm(struct bonding *bond, unsigned long delay);
void bond_work_init_all(struct bonding *bond);
+void bond_work_cancel_all(struct bonding *bond);
#ifdef CONFIG_PROC_FS
void bond_create_proc_entry(struct bonding *bond);
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index c39a426ebf52..6e172d0f6ef5 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -24,6 +24,11 @@
*/
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))
+static inline bool napi_id_valid(unsigned int napi_id)
+{
+ return napi_id >= MIN_NAPI_ID;
+}
+
#define BUSY_POLL_BUDGET 8
#ifdef CONFIG_NET_RX_BUSY_POLL
@@ -114,7 +119,7 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int napi_id = READ_ONCE(sk->sk_napi_id);
- if (napi_id >= MIN_NAPI_ID)
+ if (napi_id_valid(napi_id))
napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk,
READ_ONCE(sk->sk_prefer_busy_poll),
READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET);
@@ -122,18 +127,24 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
}
/* used in the NIC receive handler to mark the skb */
-static inline void skb_mark_napi_id(struct sk_buff *skb,
- struct napi_struct *napi)
+static inline void __skb_mark_napi_id(struct sk_buff *skb,
+ const struct gro_node *gro)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
/* If the skb was already marked with a valid NAPI ID, avoid overwriting
* it.
*/
- if (skb->napi_id < MIN_NAPI_ID)
- skb->napi_id = napi->napi_id;
+ if (!napi_id_valid(skb->napi_id))
+ skb->napi_id = gro->cached_napi_id;
#endif
}
+static inline void skb_mark_napi_id(struct sk_buff *skb,
+ const struct napi_struct *napi)
+{
+ __skb_mark_napi_id(skb, &napi->gro);
+}
+
/* used in the protocol handler to propagate the napi_id to the socket */
static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
{
diff --git a/include/net/caif/caif_dev.h b/include/net/caif/caif_dev.h
deleted file mode 100644
index b655d8666f55..000000000000
--- a/include/net/caif/caif_dev.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CAIF_DEV_H_
-#define CAIF_DEV_H_
-
-#include <net/caif/caif_layer.h>
-#include <net/caif/cfcnfg.h>
-#include <net/caif/caif_device.h>
-#include <linux/caif/caif_socket.h>
-#include <linux/if.h>
-#include <linux/net.h>
-
-/**
- * struct caif_param - CAIF parameters.
- * @size: Length of data
- * @data: Binary Data Blob
- */
-struct caif_param {
- u16 size;
- u8 data[256];
-};
-
-/**
- * struct caif_connect_request - Request data for CAIF channel setup.
- * @protocol: Type of CAIF protocol to use (at, datagram etc)
- * @sockaddr: Socket address to connect.
- * @priority: Priority of the connection.
- * @link_selector: Link selector (high bandwidth or low latency)
- * @ifindex: kernel index of the interface.
- * @param: Connect Request parameters (CAIF_SO_REQ_PARAM).
- *
- * This struct is used when connecting a CAIF channel.
- * It contains all CAIF channel configuration options.
- */
-struct caif_connect_request {
- enum caif_protocol_type protocol;
- struct sockaddr_caif sockaddr;
- enum caif_channel_priority priority;
- enum caif_link_selector link_selector;
- int ifindex;
- struct caif_param param;
-};
-
-/**
- * caif_connect_client - Connect a client to CAIF Core Stack.
- * @config: Channel setup parameters, specifying what address
- * to connect on the Modem.
- * @client_layer: User implementation of client layer. This layer
- * MUST have receive and control callback functions
- * implemented.
- * @ifindex: Link layer interface index used for this connection.
- * @headroom: Head room needed by CAIF protocol.
- * @tailroom: Tail room needed by CAIF protocol.
- *
- * This function connects a CAIF channel. The Client must implement
- * the struct cflayer. This layer represents the Client layer and holds
- * receive functions and control callback functions. Control callback
- * function will receive information about connect/disconnect responses,
- * flow control etc (see enum caif_control).
- * E.g. CAIF Socket will call this function for each socket it connects
- * and have one client_layer instance for each socket.
- */
-int caif_connect_client(struct net *net,
- struct caif_connect_request *conn_req,
- struct cflayer *client_layer, int *ifindex,
- int *headroom, int *tailroom);
-
-/**
- * caif_disconnect_client - Disconnects a client from the CAIF stack.
- *
- * @client_layer: Client layer to be disconnected.
- */
-int caif_disconnect_client(struct net *net, struct cflayer *client_layer);
-
-
-/**
- * caif_client_register_refcnt - register ref-count functions provided by client.
- *
- * @adapt_layer: Client layer using CAIF Stack.
- * @hold: Function provided by client layer increasing ref-count
- * @put: Function provided by client layer decreasing ref-count
- *
- * Client of the CAIF Stack must register functions for reference counting.
- * These functions are called by the CAIF Stack for every upstream packet,
- * and must therefore be implemented efficiently.
- *
- * Client should call caif_free_client when reference count degrease to zero.
- */
-
-void caif_client_register_refcnt(struct cflayer *adapt_layer,
- void (*hold)(struct cflayer *lyr),
- void (*put)(struct cflayer *lyr));
-/**
- * caif_free_client - Free memory used to manage the client in the CAIF Stack.
- *
- * @client_layer: Client layer to be removed.
- *
- * This function must be called from client layer in order to free memory.
- * Caller must guarantee that no packets are in flight upstream when calling
- * this function.
- */
-void caif_free_client(struct cflayer *adap_layer);
-
-/**
- * struct caif_enroll_dev - Enroll a net-device as a CAIF Link layer
- * @dev: Network device to enroll.
- * @caifdev: Configuration information from CAIF Link Layer
- * @link_support: Link layer support layer
- * @head_room: Head room needed by link support layer
- * @layer: Lowest layer in CAIF stack
- * @rcv_fun: Receive function for CAIF stack.
- *
- * This function enroll a CAIF link layer into CAIF Stack and
- * expects the interface to be able to handle CAIF payload.
- * The link_support layer is used to add any Link Layer specific
- * framing.
- */
-int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
- struct cflayer *link_support, int head_room,
- struct cflayer **layer, int (**rcv_func)(
- struct sk_buff *, struct net_device *,
- struct packet_type *, struct net_device *));
-
-#endif /* CAIF_DEV_H_ */
diff --git a/include/net/caif/caif_device.h b/include/net/caif/caif_device.h
deleted file mode 100644
index 91d1fd5b44a4..000000000000
--- a/include/net/caif/caif_device.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CAIF_DEVICE_H_
-#define CAIF_DEVICE_H_
-#include <linux/kernel.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/caif/caif_socket.h>
-#include <net/caif/caif_device.h>
-
-/**
- * struct caif_dev_common - data shared between CAIF drivers and stack.
- * @flowctrl: Flow Control callback function. This function is
- * supplied by CAIF Core Stack and is used by CAIF
- * Link Layer to send flow-stop to CAIF Core.
- * The flow information will be distributed to all
- * clients of CAIF.
- *
- * @link_select: Profile of device, either high-bandwidth or
- * low-latency. This member is set by CAIF Link
- * Layer Device in order to indicate if this device
- * is a high bandwidth or low latency device.
- *
- * @use_frag: CAIF Frames may be fragmented.
- * Is set by CAIF Link Layer in order to indicate if the
- * interface receives fragmented frames that must be
- * assembled by CAIF Core Layer.
- *
- * @use_fcs: Indicate if Frame CheckSum (fcs) is used.
- * Is set if the physical interface is
- * using Frame Checksum on the CAIF Frames.
- *
- * @use_stx: Indicate STart of frame eXtension (stx) in use.
- * Is set if the CAIF Link Layer expects
- * CAIF Frames to start with the STX byte.
- *
- * This structure is shared between the CAIF drivers and the CAIF stack.
- * It is used by the device to register its behavior.
- * CAIF Core layer must set the member flowctrl in order to supply
- * CAIF Link Layer with the flow control function.
- *
- */
- struct caif_dev_common {
- void (*flowctrl)(struct net_device *net, int on);
- enum caif_link_selector link_select;
- int use_frag;
- int use_fcs;
- int use_stx;
-};
-
-#endif /* CAIF_DEVICE_H_ */
diff --git a/include/net/caif/caif_layer.h b/include/net/caif/caif_layer.h
deleted file mode 100644
index 053e7c6a6a66..000000000000
--- a/include/net/caif/caif_layer.h
+++ /dev/null
@@ -1,277 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CAIF_LAYER_H_
-#define CAIF_LAYER_H_
-
-#include <linux/list.h>
-
-struct cflayer;
-struct cfpkt;
-struct caif_payload_info;
-
-#define CAIF_LAYER_NAME_SZ 16
-
-/**
- * caif_assert() - Assert function for CAIF.
- * @assert: expression to evaluate.
- *
- * This function will print a error message and a do WARN_ON if the
- * assertion fails. Normally this will do a stack up at the current location.
- */
-#define caif_assert(assert) \
-do { \
- if (!(assert)) { \
- pr_err("caif:Assert detected:'%s'\n", #assert); \
- WARN_ON(!(assert)); \
- } \
-} while (0)
-
-/**
- * enum caif_ctrlcmd - CAIF Stack Control Signaling sent in layer.ctrlcmd().
- *
- * @CAIF_CTRLCMD_FLOW_OFF_IND: Flow Control is OFF, transmit function
- * should stop sending data
- *
- * @CAIF_CTRLCMD_FLOW_ON_IND: Flow Control is ON, transmit function
- * can start sending data
- *
- * @CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND: Remote end modem has decided to close
- * down channel
- *
- * @CAIF_CTRLCMD_INIT_RSP: Called initially when the layer below
- * has finished initialization
- *
- * @CAIF_CTRLCMD_DEINIT_RSP: Called when de-initialization is
- * complete
- *
- * @CAIF_CTRLCMD_INIT_FAIL_RSP: Called if initialization fails
- *
- * @_CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND: CAIF Link layer temporarily cannot
- * send more packets.
- * @_CAIF_CTRLCMD_PHYIF_FLOW_ON_IND: Called if CAIF Link layer is able
- * to send packets again.
- * @_CAIF_CTRLCMD_PHYIF_DOWN_IND: Called if CAIF Link layer is going
- * down.
- *
- * These commands are sent upwards in the CAIF stack to the CAIF Client.
- * They are used for signaling originating from the modem or CAIF Link Layer.
- * These are either responses (*_RSP) or events (*_IND).
- */
-enum caif_ctrlcmd {
- CAIF_CTRLCMD_FLOW_OFF_IND,
- CAIF_CTRLCMD_FLOW_ON_IND,
- CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
- CAIF_CTRLCMD_INIT_RSP,
- CAIF_CTRLCMD_DEINIT_RSP,
- CAIF_CTRLCMD_INIT_FAIL_RSP,
- _CAIF_CTRLCMD_PHYIF_FLOW_OFF_IND,
- _CAIF_CTRLCMD_PHYIF_FLOW_ON_IND,
- _CAIF_CTRLCMD_PHYIF_DOWN_IND,
-};
-
-/**
- * enum caif_modemcmd - Modem Control Signaling, sent from CAIF Client
- * to the CAIF Link Layer or modem.
- *
- * @CAIF_MODEMCMD_FLOW_ON_REQ: Flow Control is ON, transmit function
- * can start sending data.
- *
- * @CAIF_MODEMCMD_FLOW_OFF_REQ: Flow Control is OFF, transmit function
- * should stop sending data.
- *
- * @_CAIF_MODEMCMD_PHYIF_USEFULL: Notify physical layer that it is in use
- *
- * @_CAIF_MODEMCMD_PHYIF_USELESS: Notify physical layer that it is
- * no longer in use.
- *
- * These are requests sent 'downwards' in the stack.
- * Flow ON, OFF can be indicated to the modem.
- */
-enum caif_modemcmd {
- CAIF_MODEMCMD_FLOW_ON_REQ = 0,
- CAIF_MODEMCMD_FLOW_OFF_REQ = 1,
- _CAIF_MODEMCMD_PHYIF_USEFULL = 3,
- _CAIF_MODEMCMD_PHYIF_USELESS = 4
-};
-
-/**
- * enum caif_direction - CAIF Packet Direction.
- * Indicate if a packet is to be sent out or to be received in.
- * @CAIF_DIR_IN: Incoming packet received.
- * @CAIF_DIR_OUT: Outgoing packet to be transmitted.
- */
-enum caif_direction {
- CAIF_DIR_IN = 0,
- CAIF_DIR_OUT = 1
-};
-
-/**
- * struct cflayer - CAIF Stack layer.
- * Defines the framework for the CAIF Core Stack.
- * @up: Pointer up to the layer above.
- * @dn: Pointer down to the layer below.
- * @node: List node used when layer participate in a list.
- * @receive: Packet receive function.
- * @transmit: Packet transmit function.
- * @ctrlcmd: Used for control signalling upwards in the stack.
- * @modemcmd: Used for control signaling downwards in the stack.
- * @id: The identity of this layer
- * @name: Name of the layer.
- *
- * This structure defines the layered structure in CAIF.
- *
- * It defines CAIF layering structure, used by all CAIF Layers and the
- * layers interfacing CAIF.
- *
- * In order to integrate with CAIF an adaptation layer on top of the CAIF stack
- * and PHY layer below the CAIF stack
- * must be implemented. These layer must follow the design principles below.
- *
- * Principles for layering of protocol layers:
- * - All layers must use this structure. If embedding it, then place this
- * structure first in the layer specific structure.
- *
- * - Each layer should not depend on any others layer's private data.
- *
- * - In order to send data upwards do
- * layer->up->receive(layer->up, packet);
- *
- * - In order to send data downwards do
- * layer->dn->transmit(layer->dn, info, packet);
- */
-struct cflayer {
- struct cflayer *up;
- struct cflayer *dn;
- struct list_head node;
-
- /*
- * receive() - Receive Function (non-blocking).
- * Contract: Each layer must implement a receive function passing the
- * CAIF packets upwards in the stack.
- * Packet handling rules:
- * - The CAIF packet (cfpkt) ownership is passed to the
- * called receive function. This means that the
- * packet cannot be accessed after passing it to the
- * above layer using up->receive().
- *
- * - If parsing of the packet fails, the packet must be
- * destroyed and negative error code returned
- * from the function.
- * EXCEPTION: If the framing layer (cffrml) returns
- * -EILSEQ, the packet is not freed.
- *
- * - If parsing succeeds (and above layers return OK) then
- * the function must return a value >= 0.
- *
- * Returns result < 0 indicates an error, 0 or positive value
- * indicates success.
- *
- * @layr: Pointer to the current layer the receive function is
- * implemented for (this pointer).
- * @cfpkt: Pointer to CaifPacket to be handled.
- */
- int (*receive)(struct cflayer *layr, struct cfpkt *cfpkt);
-
- /*
- * transmit() - Transmit Function (non-blocking).
- * Contract: Each layer must implement a transmit function passing the
- * CAIF packet downwards in the stack.
- * Packet handling rules:
- * - The CAIF packet (cfpkt) ownership is passed to the
- * transmit function. This means that the packet
- * cannot be accessed after passing it to the below
- * layer using dn->transmit().
- *
- * - Upon error the packet ownership is still passed on,
- * so the packet shall be freed where error is detected.
- * Callers of the transmit function shall not free packets,
- * but errors shall be returned.
- *
- * - Return value less than zero means error, zero or
- * greater than zero means OK.
- *
- * Returns result < 0 indicates an error, 0 or positive value
- * indicates success.
- *
- * @layr: Pointer to the current layer the receive function
- * isimplemented for (this pointer).
- * @cfpkt: Pointer to CaifPacket to be handled.
- */
- int (*transmit) (struct cflayer *layr, struct cfpkt *cfpkt);
-
- /*
- * cttrlcmd() - Control Function upwards in CAIF Stack (non-blocking).
- * Used for signaling responses (CAIF_CTRLCMD_*_RSP)
- * and asynchronous events from the modem (CAIF_CTRLCMD_*_IND)
- *
- * @layr: Pointer to the current layer the receive function
- * is implemented for (this pointer).
- * @ctrl: Control Command.
- */
- void (*ctrlcmd) (struct cflayer *layr, enum caif_ctrlcmd ctrl,
- int phyid);
-
- /*
- * modemctrl() - Control Function used for controlling the modem.
- * Used to signal down-wards in the CAIF stack.
- * Returns 0 on success, < 0 upon failure.
- *
- * @layr: Pointer to the current layer the receive function
- * is implemented for (this pointer).
- * @ctrl: Control Command.
- */
- int (*modemcmd) (struct cflayer *layr, enum caif_modemcmd ctrl);
-
- unsigned int id;
- char name[CAIF_LAYER_NAME_SZ];
-};
-
-/**
- * layer_set_up() - Set the up pointer for a specified layer.
- * @layr: Layer where up pointer shall be set.
- * @above: Layer above.
- */
-#define layer_set_up(layr, above) ((layr)->up = (struct cflayer *)(above))
-
-/**
- * layer_set_dn() - Set the down pointer for a specified layer.
- * @layr: Layer where down pointer shall be set.
- * @below: Layer below.
- */
-#define layer_set_dn(layr, below) ((layr)->dn = (struct cflayer *)(below))
-
-/**
- * struct dev_info - Physical Device info information about physical layer.
- * @dev: Pointer to native physical device.
- * @id: Physical ID of the physical connection used by the
- * logical CAIF connection. Used by service layers to
- * identify their physical id to Caif MUX (CFMUXL)so
- * that the MUX can add the correct physical ID to the
- * packet.
- */
-struct dev_info {
- void *dev;
- unsigned int id;
-};
-
-/**
- * struct caif_payload_info - Payload information embedded in packet (sk_buff).
- *
- * @dev_info: Information about the receiving device.
- *
- * @hdr_len: Header length, used to align pay load on 32bit boundary.
- *
- * @channel_id: Channel ID of the logical CAIF connection.
- * Used by mux to insert channel id into the caif packet.
- */
-struct caif_payload_info {
- struct dev_info *dev_info;
- unsigned short hdr_len;
- unsigned short channel_id;
-};
-
-#endif /* CAIF_LAYER_H_ */
diff --git a/include/net/caif/cfcnfg.h b/include/net/caif/cfcnfg.h
deleted file mode 100644
index 8819ff4db35a..000000000000
--- a/include/net/caif/cfcnfg.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CFCNFG_H_
-#define CFCNFG_H_
-#include <linux/spinlock.h>
-#include <linux/netdevice.h>
-#include <net/caif/caif_layer.h>
-#include <net/caif/cfctrl.h>
-
-struct cfcnfg;
-
-/**
- * enum cfcnfg_phy_preference - Physical preference HW Abstraction
- *
- * @CFPHYPREF_UNSPECIFIED: Default physical interface
- *
- * @CFPHYPREF_LOW_LAT: Default physical interface for low-latency
- * traffic
- * @CFPHYPREF_HIGH_BW: Default physical interface for high-bandwidth
- * traffic
- * @CFPHYPREF_LOOP: TEST only Loopback interface simulating modem
- * responses.
- *
- */
-enum cfcnfg_phy_preference {
- CFPHYPREF_UNSPECIFIED,
- CFPHYPREF_LOW_LAT,
- CFPHYPREF_HIGH_BW,
- CFPHYPREF_LOOP
-};
-
-/**
- * cfcnfg_create() - Get the CAIF configuration object given network.
- * @net: Network for the CAIF configuration object.
- */
-struct cfcnfg *get_cfcnfg(struct net *net);
-
-/**
- * cfcnfg_create() - Create the CAIF configuration object.
- */
-struct cfcnfg *cfcnfg_create(void);
-
-/**
- * cfcnfg_remove() - Remove the CFCNFG object
- * @cfg: config object
- */
-void cfcnfg_remove(struct cfcnfg *cfg);
-
-/**
- * cfcnfg_add_phy_layer() - Adds a physical layer to the CAIF stack.
- * @cnfg: Pointer to a CAIF configuration object, created by
- * cfcnfg_create().
- * @dev: Pointer to link layer device
- * @phy_layer: Specify the physical layer. The transmit function
- * MUST be set in the structure.
- * @pref: The phy (link layer) preference.
- * @link_support: Protocol implementation for link layer specific protocol.
- * @fcs: Specify if checksum is used in CAIF Framing Layer.
- * @head_room: Head space needed by link specific protocol.
- */
-int
-cfcnfg_add_phy_layer(struct cfcnfg *cnfg,
- struct net_device *dev, struct cflayer *phy_layer,
- enum cfcnfg_phy_preference pref,
- struct cflayer *link_support,
- bool fcs, int head_room);
-
-/**
- * cfcnfg_del_phy_layer - Deletes an phy layer from the CAIF stack.
- *
- * @cnfg: Pointer to a CAIF configuration object, created by
- * cfcnfg_create().
- * @phy_layer: Adaptation layer to be removed.
- */
-int cfcnfg_del_phy_layer(struct cfcnfg *cnfg, struct cflayer *phy_layer);
-
-/**
- * cfcnfg_set_phy_state() - Set the state of the physical interface device.
- * @cnfg: Configuration object
- * @phy_layer: Physical Layer representation
- * @up: State of device
- */
-int cfcnfg_set_phy_state(struct cfcnfg *cnfg, struct cflayer *phy_layer,
- bool up);
-
-#endif /* CFCNFG_H_ */
diff --git a/include/net/caif/cfctrl.h b/include/net/caif/cfctrl.h
deleted file mode 100644
index 86d17315c8a1..000000000000
--- a/include/net/caif/cfctrl.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CFCTRL_H_
-#define CFCTRL_H_
-#include <net/caif/caif_layer.h>
-#include <net/caif/cfsrvl.h>
-
-/* CAIF Control packet commands */
-enum cfctrl_cmd {
- CFCTRL_CMD_LINK_SETUP = 0,
- CFCTRL_CMD_LINK_DESTROY = 1,
- CFCTRL_CMD_LINK_ERR = 2,
- CFCTRL_CMD_ENUM = 3,
- CFCTRL_CMD_SLEEP = 4,
- CFCTRL_CMD_WAKE = 5,
- CFCTRL_CMD_LINK_RECONF = 6,
- CFCTRL_CMD_START_REASON = 7,
- CFCTRL_CMD_RADIO_SET = 8,
- CFCTRL_CMD_MODEM_SET = 9,
- CFCTRL_CMD_MASK = 0xf
-};
-
-/* Channel types */
-enum cfctrl_srv {
- CFCTRL_SRV_DECM = 0,
- CFCTRL_SRV_VEI = 1,
- CFCTRL_SRV_VIDEO = 2,
- CFCTRL_SRV_DBG = 3,
- CFCTRL_SRV_DATAGRAM = 4,
- CFCTRL_SRV_RFM = 5,
- CFCTRL_SRV_UTIL = 6,
- CFCTRL_SRV_MASK = 0xf
-};
-
-#define CFCTRL_RSP_BIT 0x20
-#define CFCTRL_ERR_BIT 0x10
-
-struct cfctrl_rsp {
- void (*linksetup_rsp)(struct cflayer *layer, u8 linkid,
- enum cfctrl_srv serv, u8 phyid,
- struct cflayer *adapt_layer);
- void (*linkdestroy_rsp)(struct cflayer *layer, u8 linkid);
- void (*linkerror_ind)(void);
- void (*enum_rsp)(void);
- void (*sleep_rsp)(void);
- void (*wake_rsp)(void);
- void (*restart_rsp)(void);
- void (*radioset_rsp)(void);
- void (*reject_rsp)(struct cflayer *layer, u8 linkid,
- struct cflayer *client_layer);
-};
-
-/* Link Setup Parameters for CAIF-Links. */
-struct cfctrl_link_param {
- enum cfctrl_srv linktype;/* (T3,T0) Type of Channel */
- u8 priority; /* (P4,P0) Priority of the channel */
- u8 phyid; /* (U2-U0) Physical interface to connect */
- u8 endpoint; /* (E1,E0) Endpoint for data channels */
- u8 chtype; /* (H1,H0) Channel-Type, applies to
- * VEI, DEBUG */
- union {
- struct {
- u8 connid; /* (D7,D0) Video LinkId */
- } video;
-
- struct {
- u32 connid; /* (N31,Ngit0) Connection ID used
- * for Datagram */
- } datagram;
-
- struct {
- u32 connid; /* Connection ID used for RFM */
- char volume[20]; /* Volume to mount for RFM */
- } rfm; /* Configuration for RFM */
-
- struct {
- u16 fifosize_kb; /* Psock FIFO size in KB */
- u16 fifosize_bufs; /* Psock # signal buffers */
- char name[16]; /* Name of the PSOCK service */
- u8 params[255]; /* Link setup Parameters> */
- u16 paramlen; /* Length of Link Setup
- * Parameters */
- } utility; /* Configuration for Utility Links (Psock) */
- } u;
-};
-
-/* This structure is used internally in CFCTRL */
-struct cfctrl_request_info {
- int sequence_no;
- enum cfctrl_cmd cmd;
- u8 channel_id;
- struct cfctrl_link_param param;
- struct cflayer *client_layer;
- struct list_head list;
-};
-
-struct cfctrl {
- struct cfsrvl serv;
- struct cfctrl_rsp res;
- atomic_t req_seq_no;
- atomic_t rsp_seq_no;
- struct list_head list;
- /* Protects from simultaneous access to first_req list */
- spinlock_t info_list_lock;
-#ifndef CAIF_NO_LOOP
- u8 loop_linkid;
- int loop_linkused[256];
- /* Protects simultaneous access to loop_linkid and loop_linkused */
- spinlock_t loop_linkid_lock;
-#endif
-
-};
-
-void cfctrl_enum_req(struct cflayer *cfctrl, u8 physlinkid);
-int cfctrl_linkup_request(struct cflayer *cfctrl,
- struct cfctrl_link_param *param,
- struct cflayer *user_layer);
-int cfctrl_linkdown_req(struct cflayer *cfctrl, u8 linkid,
- struct cflayer *client);
-
-struct cflayer *cfctrl_create(void);
-struct cfctrl_rsp *cfctrl_get_respfuncs(struct cflayer *layer);
-int cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer);
-void cfctrl_remove(struct cflayer *layr);
-
-#endif /* CFCTRL_H_ */
diff --git a/include/net/caif/cffrml.h b/include/net/caif/cffrml.h
deleted file mode 100644
index 1ab8a80ede4d..000000000000
--- a/include/net/caif/cffrml.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CFFRML_H_
-#define CFFRML_H_
-#include <net/caif/caif_layer.h>
-#include <linux/netdevice.h>
-
-struct cffrml;
-struct cflayer *cffrml_create(u16 phyid, bool use_fcs);
-void cffrml_free(struct cflayer *layr);
-void cffrml_set_uplayer(struct cflayer *this, struct cflayer *up);
-void cffrml_set_dnlayer(struct cflayer *this, struct cflayer *dn);
-void cffrml_put(struct cflayer *layr);
-void cffrml_hold(struct cflayer *layr);
-int cffrml_refcnt_read(struct cflayer *layr);
-
-#endif /* CFFRML_H_ */
diff --git a/include/net/caif/cfmuxl.h b/include/net/caif/cfmuxl.h
deleted file mode 100644
index 92ccb2648309..000000000000
--- a/include/net/caif/cfmuxl.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CFMUXL_H_
-#define CFMUXL_H_
-#include <net/caif/caif_layer.h>
-
-struct cfsrvl;
-struct cffrml;
-
-struct cflayer *cfmuxl_create(void);
-int cfmuxl_set_uplayer(struct cflayer *layr, struct cflayer *up, u8 linkid);
-struct cflayer *cfmuxl_remove_dnlayer(struct cflayer *layr, u8 phyid);
-int cfmuxl_set_dnlayer(struct cflayer *layr, struct cflayer *up, u8 phyid);
-struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 linkid);
-
-#endif /* CFMUXL_H_ */
diff --git a/include/net/caif/cfpkt.h b/include/net/caif/cfpkt.h
deleted file mode 100644
index acf664227d96..000000000000
--- a/include/net/caif/cfpkt.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CFPKT_H_
-#define CFPKT_H_
-#include <net/caif/caif_layer.h>
-#include <linux/types.h>
-struct cfpkt;
-
-/* Create a CAIF packet.
- * len: Length of packet to be created
- * @return New packet.
- */
-struct cfpkt *cfpkt_create(u16 len);
-
-/*
- * Destroy a CAIF Packet.
- * pkt Packet to be destroyed.
- */
-void cfpkt_destroy(struct cfpkt *pkt);
-
-/*
- * Extract header from packet.
- *
- * pkt Packet to extract header data from.
- * data Pointer to copy the header data into.
- * len Length of head data to copy.
- * @return zero on success and error code upon failure
- */
-int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len);
-
-static inline u8 cfpkt_extr_head_u8(struct cfpkt *pkt)
-{
- u8 tmp;
-
- cfpkt_extr_head(pkt, &tmp, 1);
-
- return tmp;
-}
-
-static inline u16 cfpkt_extr_head_u16(struct cfpkt *pkt)
-{
- __le16 tmp;
-
- cfpkt_extr_head(pkt, &tmp, 2);
-
- return le16_to_cpu(tmp);
-}
-
-static inline u32 cfpkt_extr_head_u32(struct cfpkt *pkt)
-{
- __le32 tmp;
-
- cfpkt_extr_head(pkt, &tmp, 4);
-
- return le32_to_cpu(tmp);
-}
-
-/*
- * Peek header from packet.
- * Reads data from packet without changing packet.
- *
- * pkt Packet to extract header data from.
- * data Pointer to copy the header data into.
- * len Length of head data to copy.
- * @return zero on success and error code upon failure
- */
-int cfpkt_peek_head(struct cfpkt *pkt, void *data, u16 len);
-
-/*
- * Extract header from trailer (end of packet).
- *
- * pkt Packet to extract header data from.
- * data Pointer to copy the trailer data into.
- * len Length of header data to copy.
- * @return zero on success and error code upon failure
- */
-int cfpkt_extr_trail(struct cfpkt *pkt, void *data, u16 len);
-
-/*
- * Add header to packet.
- *
- *
- * pkt Packet to add header data to.
- * data Pointer to data to copy into the header.
- * len Length of header data to copy.
- * @return zero on success and error code upon failure
- */
-int cfpkt_add_head(struct cfpkt *pkt, const void *data, u16 len);
-
-/*
- * Add trailer to packet.
- *
- *
- * pkt Packet to add trailer data to.
- * data Pointer to data to copy into the trailer.
- * len Length of trailer data to copy.
- * @return zero on success and error code upon failure
- */
-int cfpkt_add_trail(struct cfpkt *pkt, const void *data, u16 len);
-
-/*
- * Pad trailer on packet.
- * Moves data pointer in packet, no content copied.
- *
- * pkt Packet in which to pad trailer.
- * len Length of padding to add.
- * @return zero on success and error code upon failure
- */
-int cfpkt_pad_trail(struct cfpkt *pkt, u16 len);
-
-/*
- * Add a single byte to packet body (tail).
- *
- * pkt Packet in which to add byte.
- * data Byte to add.
- * @return zero on success and error code upon failure
- */
-int cfpkt_addbdy(struct cfpkt *pkt, const u8 data);
-
-/*
- * Add a data to packet body (tail).
- *
- * pkt Packet in which to add data.
- * data Pointer to data to copy into the packet body.
- * len Length of data to add.
- * @return zero on success and error code upon failure
- */
-int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len);
-
-/*
- * Checks whether there are more data to process in packet.
- * pkt Packet to check.
- * @return true if more data are available in packet false otherwise
- */
-bool cfpkt_more(struct cfpkt *pkt);
-
-/*
- * Checks whether the packet is erroneous,
- * i.e. if it has been attempted to extract more data than available in packet
- * or writing more data than has been allocated in cfpkt_create().
- * pkt Packet to check.
- * @return true on error false otherwise
- */
-bool cfpkt_erroneous(struct cfpkt *pkt);
-
-/*
- * Get the packet length.
- * pkt Packet to get length from.
- * @return Number of bytes in packet.
- */
-u16 cfpkt_getlen(struct cfpkt *pkt);
-
-/*
- * Set the packet length, by adjusting the trailer pointer according to length.
- * pkt Packet to set length.
- * len Packet length.
- * @return Number of bytes in packet.
- */
-int cfpkt_setlen(struct cfpkt *pkt, u16 len);
-
-/*
- * cfpkt_append - Appends a packet's data to another packet.
- * dstpkt: Packet to append data into, WILL BE FREED BY THIS FUNCTION
- * addpkt: Packet to be appended and automatically released,
- * WILL BE FREED BY THIS FUNCTION.
- * expectlen: Packet's expected total length. This should be considered
- * as a hint.
- * NB: Input packets will be destroyed after appending and cannot be used
- * after calling this function.
- * @return The new appended packet.
- */
-struct cfpkt *cfpkt_append(struct cfpkt *dstpkt, struct cfpkt *addpkt,
- u16 expectlen);
-
-/*
- * cfpkt_split - Split a packet into two packets at the specified split point.
- * pkt: Packet to be split (will contain the first part of the data on exit)
- * pos: Position to split packet in two parts.
- * @return The new packet, containing the second part of the data.
- */
-struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos);
-
-/*
- * Iteration function, iterates the packet buffers from start to end.
- *
- * Checksum iteration function used to iterate buffers
- * (we may have packets consisting of a chain of buffers)
- * pkt: Packet to calculate checksum for
- * iter_func: Function pointer to iteration function
- * chks: Checksum calculated so far.
- * buf: Pointer to the buffer to checksum
- * len: Length of buf.
- * data: Initial checksum value.
- * @return Checksum of buffer.
- */
-
-int cfpkt_iterate(struct cfpkt *pkt,
- u16 (*iter_func)(u16 chks, void *buf, u16 len),
- u16 data);
-
-/* Map from a "native" packet (e.g. Linux Socket Buffer) to a CAIF packet.
- * dir - Direction indicating whether this packet is to be sent or received.
- * nativepkt - The native packet to be transformed to a CAIF packet
- * @return The mapped CAIF Packet CFPKT.
- */
-struct cfpkt *cfpkt_fromnative(enum caif_direction dir, void *nativepkt);
-
-/* Map from a CAIF packet to a "native" packet (e.g. Linux Socket Buffer).
- * pkt - The CAIF packet to be transformed into a "native" packet.
- * @return The native packet transformed from a CAIF packet.
- */
-void *cfpkt_tonative(struct cfpkt *pkt);
-
-/*
- * Returns packet information for a packet.
- * pkt Packet to get info from;
- * @return Packet information
- */
-struct caif_payload_info *cfpkt_info(struct cfpkt *pkt);
-
-/** cfpkt_set_prio - set priority for a CAIF packet.
- *
- * @pkt: The CAIF packet to be adjusted.
- * @prio: one of TC_PRIO_ constants.
- */
-void cfpkt_set_prio(struct cfpkt *pkt, int prio);
-
-#endif /* CFPKT_H_ */
diff --git a/include/net/caif/cfserl.h b/include/net/caif/cfserl.h
deleted file mode 100644
index 67cce8757175..000000000000
--- a/include/net/caif/cfserl.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CFSERL_H_
-#define CFSERL_H_
-#include <net/caif/caif_layer.h>
-
-struct cflayer *cfserl_create(int instance, bool use_stx);
-void cfserl_release(struct cflayer *layer);
-#endif
diff --git a/include/net/caif/cfsrvl.h b/include/net/caif/cfsrvl.h
deleted file mode 100644
index a000dc45f966..000000000000
--- a/include/net/caif/cfsrvl.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) ST-Ericsson AB 2010
- * Author: Sjur Brendeland
- */
-
-#ifndef CFSRVL_H_
-#define CFSRVL_H_
-#include <linux/list.h>
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/kref.h>
-#include <linux/rculist.h>
-
-struct cfsrvl {
- struct cflayer layer;
- bool open;
- bool phy_flow_on;
- bool modem_flow_on;
- bool supports_flowctrl;
- void (*release)(struct cflayer *layer);
- struct dev_info dev_info;
- void (*hold)(struct cflayer *lyr);
- void (*put)(struct cflayer *lyr);
- struct rcu_head rcu;
-};
-
-struct cflayer *cfvei_create(u8 linkid, struct dev_info *dev_info);
-struct cflayer *cfdgml_create(u8 linkid, struct dev_info *dev_info);
-struct cflayer *cfutill_create(u8 linkid, struct dev_info *dev_info);
-struct cflayer *cfvidl_create(u8 linkid, struct dev_info *dev_info);
-struct cflayer *cfrfml_create(u8 linkid, struct dev_info *dev_info,
- int mtu_size);
-struct cflayer *cfdbgl_create(u8 linkid, struct dev_info *dev_info);
-
-bool cfsrvl_phyid_match(struct cflayer *layer, int phyid);
-
-void cfsrvl_init(struct cfsrvl *service,
- u8 channel_id,
- struct dev_info *dev_info,
- bool supports_flowctrl);
-bool cfsrvl_ready(struct cfsrvl *service, int *err);
-
-static inline void cfsrvl_get(struct cflayer *layr)
-{
- struct cfsrvl *s = container_of(layr, struct cfsrvl, layer);
- if (layr == NULL || layr->up == NULL || s->hold == NULL)
- return;
-
- s->hold(layr->up);
-}
-
-static inline void cfsrvl_put(struct cflayer *layr)
-{
- struct cfsrvl *s = container_of(layr, struct cfsrvl, layer);
- if (layr == NULL || layr->up == NULL || s->hold == NULL)
- return;
-
- s->put(layr->up);
-}
-#endif /* CFSRVL_H_ */
diff --git a/include/net/can.h b/include/net/can.h
new file mode 100644
index 000000000000..6db9e826f0e0
--- /dev/null
+++ b/include/net/can.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * net/can.h
+ *
+ * Definitions for the CAN network socket buffer extensions
+ *
+ * Copyright (C) 2026 Oliver Hartkopp <socketcan@hartkopp.net>
+ *
+ */
+
+#ifndef _NET_CAN_H
+#define _NET_CAN_H
+
+/**
+ * struct can_skb_ext - skb extensions for CAN specific content
+ * @can_iif: ifindex of the first interface the CAN frame appeared on
+ * @can_framelen: cached echo CAN frame length for bql
+ * @can_gw_hops: can-gw CAN frame time-to-live counter
+ * @can_ext_flags: CAN skb extensions flags
+ */
+struct can_skb_ext {
+ int can_iif;
+ u16 can_framelen;
+ u8 can_gw_hops;
+ u8 can_ext_flags;
+};
+
+#endif /* _NET_CAN_H */
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 363d7dd2255a..9d3639ff9c28 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -7,7 +7,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*/
#include <linux/ethtool.h>
@@ -101,16 +101,6 @@ struct wiphy;
* @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted
* on this channel.
* @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel.
- * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted
- * on this channel.
- * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted
- * on this channel.
- * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted
- * on this channel.
- * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted
- * on this channel.
- * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted
- * on this channel.
* @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band,
* this flag indicates that a 320 MHz channel cannot use this
* channel as the control or any of the secondary channels.
@@ -127,6 +117,16 @@ struct wiphy;
* even if it is otherwise disabled.
* @IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP: Allow using this channel for AP operation
* with very low power (VLP), even if otherwise set to NO_IR.
+ * @IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY: Allow activity on a 20 MHz channel,
+ * even if otherwise set to NO_IR.
+ * @IEEE80211_CHAN_S1G_NO_PRIMARY: Prevents the channel for use as an S1G
+ * primary channel. Does not prevent the wider operating channel
+ * described by the chandef from being used. In order for a 2MHz primary
+ * to be used, both 1MHz subchannels shall not contain this flag.
+ * @IEEE80211_CHAN_NO_4MHZ: 4 MHz bandwidth is not permitted on this channel.
+ * @IEEE80211_CHAN_NO_8MHZ: 8 MHz bandwidth is not permitted on this channel.
+ * @IEEE80211_CHAN_NO_16MHZ: 16 MHz bandwidth is not permitted on this channel.
+ * @IEEE80211_CHAN_NO_UHR: UHR operation is not permitted on this channel.
*/
enum ieee80211_channel_flags {
IEEE80211_CHAN_DISABLED = BIT(0),
@@ -143,11 +143,8 @@ enum ieee80211_channel_flags {
IEEE80211_CHAN_NO_20MHZ = BIT(11),
IEEE80211_CHAN_NO_10MHZ = BIT(12),
IEEE80211_CHAN_NO_HE = BIT(13),
- IEEE80211_CHAN_1MHZ = BIT(14),
- IEEE80211_CHAN_2MHZ = BIT(15),
- IEEE80211_CHAN_4MHZ = BIT(16),
- IEEE80211_CHAN_8MHZ = BIT(17),
- IEEE80211_CHAN_16MHZ = BIT(18),
+ /* can use free bits here */
+ IEEE80211_CHAN_NO_UHR = BIT(18),
IEEE80211_CHAN_NO_320MHZ = BIT(19),
IEEE80211_CHAN_NO_EHT = BIT(20),
IEEE80211_CHAN_DFS_CONCURRENT = BIT(21),
@@ -155,6 +152,11 @@ enum ieee80211_channel_flags {
IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT = BIT(23),
IEEE80211_CHAN_CAN_MONITOR = BIT(24),
IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP = BIT(25),
+ IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY = BIT(26),
+ IEEE80211_CHAN_S1G_NO_PRIMARY = BIT(27),
+ IEEE80211_CHAN_NO_4MHZ = BIT(28),
+ IEEE80211_CHAN_NO_8MHZ = BIT(29),
+ IEEE80211_CHAN_NO_16MHZ = BIT(30),
};
#define IEEE80211_CHAN_NO_HT40 \
@@ -188,6 +190,8 @@ enum ieee80211_channel_flags {
* on this channel.
* @dfs_state_entered: timestamp (jiffies) when the dfs state was entered.
* @dfs_cac_ms: DFS CAC time in milliseconds, this is valid for DFS channels.
+ * @cac_start_time: timestamp (CLOCK_BOOTTIME, nanoseconds) when CAC was
+ * started on this channel. Zero when CAC is not in progress.
* @psd: power spectral density (in dBm)
*/
struct ieee80211_channel {
@@ -205,6 +209,7 @@ struct ieee80211_channel {
enum nl80211_dfs_state dfs_state;
unsigned long dfs_state_entered;
unsigned int dfs_cac_ms;
+ u64 cac_start_time;
s8 psd;
};
@@ -429,6 +434,18 @@ struct ieee80211_sta_eht_cap {
u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN];
};
+/**
+ * struct ieee80211_sta_uhr_cap - STA's UHR capabilities
+ * @has_uhr: true iff UHR is supported and data is valid
+ * @mac: fixed MAC capabilities
+ * @phy: fixed PHY capabilities
+ */
+struct ieee80211_sta_uhr_cap {
+ bool has_uhr;
+ struct ieee80211_uhr_cap_mac mac;
+ struct ieee80211_uhr_cap_phy phy;
+};
+
/* sparse defines __CHECKER__; see Documentation/dev-tools/sparse.rst */
#ifdef __CHECKER__
/*
@@ -454,6 +471,7 @@ struct ieee80211_sta_eht_cap {
* @he_6ghz_capa: HE 6 GHz capabilities, must be filled in for a
* 6 GHz band channel (and 0 may be valid value).
* @eht_cap: STA's EHT capabilities
+ * @uhr_cap: STA's UHR capabilities
* @vendor_elems: vendor element(s) to advertise
* @vendor_elems.data: vendor element(s) data
* @vendor_elems.len: vendor element(s) length
@@ -463,6 +481,7 @@ struct ieee80211_sband_iftype_data {
struct ieee80211_sta_he_cap he_cap;
struct ieee80211_he_6ghz_capa he_6ghz_capa;
struct ieee80211_sta_eht_cap eht_cap;
+ struct ieee80211_sta_uhr_cap uhr_cap;
struct {
const u8 *data;
unsigned int len;
@@ -557,7 +576,7 @@ struct ieee80211_sta_s1g_cap {
* @vht_cap: VHT capabilities in this band
* @s1g_cap: S1G capabilities in this band
* @edmg_cap: EDMG capabilities in this band
- * @s1g_cap: S1G capabilities in this band (S1B band only, of course)
+ * @s1g_cap: S1G capabilities in this band (S1G band only, of course)
* @n_iftype_data: number of iftype data entries
* @iftype_data: interface type data entries. Note that the bits in
* @types_mask inside this structure cannot overlap (i.e. only
@@ -630,7 +649,7 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband,
const struct ieee80211_sband_iftype_data *data;
int i;
- if (WARN_ON(iftype >= NL80211_IFTYPE_MAX))
+ if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
return NULL;
if (iftype == NL80211_IFTYPE_AP_VLAN)
@@ -685,7 +704,7 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband,
}
/**
- * ieee80211_get_eht_iftype_cap - return ETH capabilities for an sband's iftype
+ * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype
* @sband: the sband to search for the iftype on
* @iftype: enum nl80211_iftype
*
@@ -705,6 +724,26 @@ ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband,
}
/**
+ * ieee80211_get_uhr_iftype_cap - return UHR capabilities for an sband's iftype
+ * @sband: the sband to search for the iftype on
+ * @iftype: enum nl80211_iftype
+ *
+ * Return: pointer to the struct ieee80211_sta_uhr_cap, or NULL is none found
+ */
+static inline const struct ieee80211_sta_uhr_cap *
+ieee80211_get_uhr_iftype_cap(const struct ieee80211_supported_band *sband,
+ enum nl80211_iftype iftype)
+{
+ const struct ieee80211_sband_iftype_data *data =
+ ieee80211_get_sband_iftype_data(sband, iftype);
+
+ if (data && data->uhr_cap.has_uhr)
+ return &data->uhr_cap;
+
+ return NULL;
+}
+
+/**
* wiphy_read_of_freq_limits - read frequency limits from device tree
*
* @wiphy: the wireless device to get extra limits for
@@ -786,8 +825,7 @@ struct vif_params {
* @key: key material
* @key_len: length of key material
* @cipher: cipher suite selector
- * @seq: sequence counter (IV/PN) for TKIP and CCMP keys, only used
- * with the get_key() callback, must be in little endian,
+ * @seq: sequence counter (IV/PN), must be in little endian,
* length given by @seq_len.
* @seq_len: length of @seq.
* @vlan_id: vlan_id for VLAN group key (if nonzero)
@@ -818,6 +856,9 @@ struct key_params {
* @punctured: mask of the punctured 20 MHz subchannels, with
* bits turned on being disabled (punctured); numbered
* from lower to higher frequency (like in the spec)
+ * @s1g_primary_2mhz: Indicates if the control channel pointed to
+ * by 'chan' exists as a 1MHz primary subchannel within an
+ * S1G 2MHz primary channel.
*/
struct cfg80211_chan_def {
struct ieee80211_channel *chan;
@@ -827,6 +868,7 @@ struct cfg80211_chan_def {
struct ieee80211_edmg edmg;
u16 freq1_offset;
u16 punctured;
+ bool s1g_primary_2mhz;
};
/*
@@ -838,9 +880,12 @@ struct cfg80211_bitrate_mask {
u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN];
u16 vht_mcs[NL80211_VHT_NSS_MAX];
u16 he_mcs[NL80211_HE_NSS_MAX];
+ u16 eht_mcs[NL80211_EHT_NSS_MAX];
enum nl80211_txrate_gi gi;
enum nl80211_he_gi he_gi;
+ enum nl80211_eht_gi eht_gi;
enum nl80211_he_ltf he_ltf;
+ enum nl80211_eht_ltf eht_ltf;
} control[NUM_NL80211_BANDS];
};
@@ -968,7 +1013,8 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1,
chandef1->center_freq1 == chandef2->center_freq1 &&
chandef1->freq1_offset == chandef2->freq1_offset &&
chandef1->center_freq2 == chandef2->center_freq2 &&
- chandef1->punctured == chandef2->punctured);
+ chandef1->punctured == chandef2->punctured &&
+ chandef1->s1g_primary_2mhz == chandef2->s1g_primary_2mhz);
}
/**
@@ -985,6 +1031,18 @@ cfg80211_chandef_is_edmg(const struct cfg80211_chan_def *chandef)
}
/**
+ * cfg80211_chandef_is_s1g - check if chandef represents an S1G channel
+ * @chandef: the channel definition
+ *
+ * Return: %true if S1G.
+ */
+static inline bool
+cfg80211_chandef_is_s1g(const struct cfg80211_chan_def *chandef)
+{
+ return chandef->chan->band == NL80211_BAND_S1GHZ;
+}
+
+/**
* cfg80211_chandef_compatible - check if two channel definitions are compatible
* @chandef1: first channel definition
* @chandef2: second channel definition
@@ -996,6 +1054,7 @@ const struct cfg80211_chan_def *
cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1,
const struct cfg80211_chan_def *chandef2);
+
/**
* nl80211_chan_width_to_mhz - get the channel width in MHz
* @chan_width: the channel width from &enum nl80211_chan_width
@@ -1006,6 +1065,17 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1,
int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width);
/**
+ * cfg80211_chandef_get_width - return chandef width in MHz
+ * @c: chandef to return bandwidth for
+ * Return: channel width in MHz for the given chandef; note that it returns
+ * 80 for 80+80 configurations
+ */
+static inline int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
+{
+ return nl80211_chan_width_to_mhz(c->width);
+}
+
+/**
* cfg80211_chandef_valid - check if a channel definition is valid
* @chandef: the channel definition to check
* Return: %true if the channel definition is valid. %false otherwise.
@@ -1083,43 +1153,6 @@ int cfg80211_chandef_primary(const struct cfg80211_chan_def *chandef,
int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef);
/**
- * ieee80211_chanwidth_rate_flags - return rate flags for channel width
- * @width: the channel width of the channel
- *
- * In some channel types, not all rates may be used - for example CCK
- * rates may not be used in 5/10 MHz channels.
- *
- * Returns: rate flags which apply for this channel width
- */
-static inline enum ieee80211_rate_flags
-ieee80211_chanwidth_rate_flags(enum nl80211_chan_width width)
-{
- switch (width) {
- case NL80211_CHAN_WIDTH_5:
- return IEEE80211_RATE_SUPPORTS_5MHZ;
- case NL80211_CHAN_WIDTH_10:
- return IEEE80211_RATE_SUPPORTS_10MHZ;
- default:
- break;
- }
- return 0;
-}
-
-/**
- * ieee80211_chandef_rate_flags - returns rate flags for a channel
- * @chandef: channel definition for the channel
- *
- * See ieee80211_chanwidth_rate_flags().
- *
- * Returns: rate flags which apply for this channel
- */
-static inline enum ieee80211_rate_flags
-ieee80211_chandef_rate_flags(struct cfg80211_chan_def *chandef)
-{
- return ieee80211_chanwidth_rate_flags(chandef->width);
-}
-
-/**
* ieee80211_chandef_max_power - maximum transmission power for the chandef
*
* In some regulations, the transmit power may depend on the configured channel
@@ -1286,11 +1319,13 @@ struct cfg80211_crypto_settings {
* struct cfg80211_mbssid_config - AP settings for multi bssid
*
* @tx_wdev: pointer to the transmitted interface in the MBSSID set
+ * @tx_link_id: link ID of the transmitted profile in an MLD.
* @index: index of this AP in the multi bssid group.
* @ema: set to true if the beacons should be sent out in EMA mode.
*/
struct cfg80211_mbssid_config {
struct wireless_dev *tx_wdev;
+ u8 tx_link_id;
u8 index;
bool ema;
};
@@ -1445,6 +1480,23 @@ struct cfg80211_unsol_bcast_probe_resp {
};
/**
+ * struct cfg80211_s1g_short_beacon - S1G short beacon data.
+ *
+ * @update: Set to true if the feature configuration should be updated.
+ * @short_head: Short beacon head.
+ * @short_tail: Short beacon tail.
+ * @short_head_len: Short beacon head len.
+ * @short_tail_len: Short beacon tail len.
+ */
+struct cfg80211_s1g_short_beacon {
+ bool update;
+ const u8 *short_head;
+ const u8 *short_tail;
+ size_t short_head_len;
+ size_t short_tail_len;
+};
+
+/**
* struct cfg80211_ap_settings - AP configuration
*
* Used to configure an AP interface.
@@ -1473,6 +1525,7 @@ struct cfg80211_unsol_bcast_probe_resp {
* @he_cap: HE capabilities (or %NULL if HE isn't enabled)
* @eht_cap: EHT capabilities (or %NULL if EHT isn't enabled)
* @eht_oper: EHT operation IE (or %NULL if EHT isn't enabled)
+ * @uhr_oper: UHR operation (or %NULL if UHR isn't enabled)
* @ht_required: stations must support HT
* @vht_required: stations must support VHT
* @twt_responder: Enable Target Wait Time
@@ -1484,6 +1537,8 @@ struct cfg80211_unsol_bcast_probe_resp {
* @fils_discovery: FILS discovery transmission parameters
* @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
* @mbssid_config: AP settings for multiple bssid
+ * @s1g_long_beacon_period: S1G long beacon period
+ * @s1g_short_beacon: S1G short beacon data
*/
struct cfg80211_ap_settings {
struct cfg80211_chan_def chandef;
@@ -1510,6 +1565,7 @@ struct cfg80211_ap_settings {
const struct ieee80211_he_operation *he_oper;
const struct ieee80211_eht_cap_elem *eht_cap;
const struct ieee80211_eht_operation *eht_oper;
+ const struct ieee80211_uhr_operation *uhr_oper;
bool ht_required, vht_required, he_required, sae_h2e_required;
bool twt_responder;
u32 flags;
@@ -1517,6 +1573,8 @@ struct cfg80211_ap_settings {
struct cfg80211_fils_discovery fils_discovery;
struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
struct cfg80211_mbssid_config mbssid_config;
+ u8 s1g_long_beacon_period;
+ struct cfg80211_s1g_short_beacon s1g_short_beacon;
};
@@ -1528,11 +1586,13 @@ struct cfg80211_ap_settings {
* @beacon: beacon data
* @fils_discovery: FILS discovery transmission parameters
* @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
+ * @s1g_short_beacon: S1G short beacon data
*/
struct cfg80211_ap_update {
struct cfg80211_beacon_data beacon;
struct cfg80211_fils_discovery fils_discovery;
struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
+ struct cfg80211_s1g_short_beacon s1g_short_beacon;
};
/**
@@ -1547,6 +1607,7 @@ struct cfg80211_ap_update {
* @n_counter_offsets_beacon: number of csa counters the beacon (tail)
* @n_counter_offsets_presp: number of csa counters in the probe response
* @beacon_after: beacon data to be used on the new channel
+ * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
* @radar_required: whether radar detection is required on the new channel
* @block_tx: whether transmissions should be blocked while changing
* @count: number of beacons until switch
@@ -1561,6 +1622,7 @@ struct cfg80211_csa_settings {
unsigned int n_counter_offsets_beacon;
unsigned int n_counter_offsets_presp;
struct cfg80211_beacon_data beacon_after;
+ struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
bool radar_required;
bool block_tx;
u8 count;
@@ -1576,6 +1638,7 @@ struct cfg80211_csa_settings {
* @counter_offset_beacon: offsets of the counters within the beacon (tail)
* @counter_offset_presp: offsets of the counters within the probe response
* @beacon_next: beacon data to be used after the color change
+ * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters
* @count: number of beacons until the color change
* @color: the color used after the change
* @link_id: defines the link on which color change is expected during MLO.
@@ -1586,6 +1649,7 @@ struct cfg80211_color_change_settings {
u16 counter_offset_beacon;
u16 counter_offset_presp;
struct cfg80211_beacon_data beacon_next;
+ struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp;
u8 count;
u8 color;
u8 link_id;
@@ -1674,6 +1738,9 @@ struct sta_txpwr {
* @he_6ghz_capa: HE 6 GHz Band capabilities of station
* @eht_capa: EHT capabilities of station
* @eht_capa_len: the length of the EHT capabilities
+ * @s1g_capa: S1G capabilities of station
+ * @uhr_capa: UHR capabilities of the station
+ * @uhr_capa_len: the length of the UHR capabilities
*/
struct link_station_parameters {
const u8 *mld_mac;
@@ -1692,6 +1759,9 @@ struct link_station_parameters {
const struct ieee80211_he_6ghz_capa *he_6ghz_capa;
const struct ieee80211_eht_cap_elem *eht_capa;
u8 eht_capa_len;
+ const struct ieee80211_s1g_cap *s1g_capa;
+ const struct ieee80211_uhr_cap *uhr_capa;
+ u8 uhr_capa_len;
};
/**
@@ -1756,7 +1826,12 @@ struct cfg80211_ttlm_params {
* @supported_oper_classes_len: number of supported operating classes
* @support_p2p_ps: information if station supports P2P PS mechanism
* @airtime_weight: airtime scheduler weight for this station
+ * @eml_cap_present: Specifies if EML capabilities field (@eml_cap) is
+ * present/updated
+ * @eml_cap: EML capabilities of this station
* @link_sta_params: link related params.
+ * @epp_peer: EPP peer indication
+ * @nmi_mac: MAC address of the NMI station of the NAN peer
*/
struct station_parameters {
struct net_device *vlan;
@@ -1780,7 +1855,11 @@ struct station_parameters {
u8 supported_oper_classes_len;
int support_p2p_ps;
u16 airtime_weight;
+ bool eml_cap_present;
+ u16 eml_cap;
struct link_station_parameters link_sta_params;
+ bool epp_peer;
+ const u8 *nmi_mac;
};
/**
@@ -1820,6 +1899,8 @@ struct station_del_parameters {
* entry that is operating, has been marked authorized by userspace)
* @CFG80211_STA_MESH_PEER_KERNEL: peer on mesh interface (kernel managed)
* @CFG80211_STA_MESH_PEER_USER: peer on mesh interface (user managed)
+ * @CFG80211_STA_NAN_MGMT: NAN management interface station
+ * @CFG80211_STA_NAN_DATA: NAN data path station
*/
enum cfg80211_station_type {
CFG80211_STA_AP_CLIENT,
@@ -1831,6 +1912,8 @@ enum cfg80211_station_type {
CFG80211_STA_TDLS_PEER_ACTIVE,
CFG80211_STA_MESH_PEER_KERNEL,
CFG80211_STA_MESH_PEER_USER,
+ CFG80211_STA_NAN_MGMT,
+ CFG80211_STA_NAN_DATA,
};
/**
@@ -1866,6 +1949,11 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
* @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS
* @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information
* @RATE_INFO_FLAGS_S1G_MCS: MCS field filled with S1G MCS
+ * @RATE_INFO_FLAGS_UHR_MCS: UHR MCS information
+ * @RATE_INFO_FLAGS_UHR_ELR_MCS: UHR ELR MCS was used
+ * (set together with @RATE_INFO_FLAGS_UHR_MCS)
+ * @RATE_INFO_FLAGS_UHR_IM: UHR Interference Mitigation
+ * was used
*/
enum rate_info_flags {
RATE_INFO_FLAGS_MCS = BIT(0),
@@ -1877,6 +1965,9 @@ enum rate_info_flags {
RATE_INFO_FLAGS_EXTENDED_SC_DMG = BIT(6),
RATE_INFO_FLAGS_EHT_MCS = BIT(7),
RATE_INFO_FLAGS_S1G_MCS = BIT(8),
+ RATE_INFO_FLAGS_UHR_MCS = BIT(9),
+ RATE_INFO_FLAGS_UHR_ELR_MCS = BIT(10),
+ RATE_INFO_FLAGS_UHR_IM = BIT(11),
};
/**
@@ -1892,7 +1983,7 @@ enum rate_info_flags {
* @RATE_INFO_BW_160: 160 MHz bandwidth
* @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation
* @RATE_INFO_BW_320: 320 MHz bandwidth
- * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation
+ * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT/UHR RU allocation
* @RATE_INFO_BW_1: 1 MHz bandwidth
* @RATE_INFO_BW_2: 2 MHz bandwidth
* @RATE_INFO_BW_4: 4 MHz bandwidth
@@ -1923,7 +2014,7 @@ enum rate_info_bw {
*
* @flags: bitflag of flags from &enum rate_info_flags
* @legacy: bitrate in 100kbit/s for 802.11abg
- * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G rate
+ * @mcs: mcs index if struct describes an HT/VHT/HE/EHT/S1G/UHR rate
* @nss: number of streams (VHT & HE only)
* @bw: bandwidth (from &enum rate_info_bw)
* @he_gi: HE guard interval (from &enum nl80211_he_gi)
@@ -2034,6 +2125,99 @@ struct cfg80211_tid_stats {
#define IEEE80211_MAX_CHAINS 4
/**
+ * struct link_station_info - link station information
+ *
+ * Link station information filled by driver for get_station() and
+ * dump_station().
+ * @filled: bit flag of flags using the bits of &enum nl80211_sta_info to
+ * indicate the relevant values in this struct for them
+ * @connected_time: time(in secs) since a link of station is last connected
+ * @inactive_time: time since last activity for link station(tx/rx)
+ * in milliseconds
+ * @assoc_at: bootime (ns) of the last association of link of station
+ * @rx_bytes: bytes (size of MPDUs) received from this link of station
+ * @tx_bytes: bytes (size of MPDUs) transmitted to this link of station
+ * @signal: The signal strength, type depends on the wiphy's signal_type.
+ * For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
+ * @signal_avg: Average signal strength, type depends on the wiphy's
+ * signal_type. For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_
+ * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg
+ * @chain_signal: per-chain signal strength of last received packet in dBm
+ * @chain_signal_avg: per-chain signal strength average in dBm
+ * @txrate: current unicast bitrate from this link of station
+ * @rxrate: current unicast bitrate to this link of station
+ * @rx_packets: packets (MSDUs & MMPDUs) received from this link of station
+ * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this link of station
+ * @tx_retries: cumulative retry counts (MPDUs) for this link of station
+ * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK)
+ * @rx_dropped_misc: Dropped for un-specified reason.
+ * @bss_param: current BSS parameters
+ * @beacon_loss_count: Number of times beacon loss event has triggered.
+ * @expected_throughput: expected throughput in kbps (including 802.11 headers)
+ * towards this station.
+ * @rx_beacon: number of beacons received from this peer
+ * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received
+ * from this peer
+ * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer
+ * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer
+ * @airtime_weight: current airtime scheduling weight
+ * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last
+ * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs.
+ * Note that this doesn't use the @filled bit, but is used if non-NULL.
+ * @ack_signal: signal strength (in dBm) of the last ACK frame.
+ * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has
+ * been sent.
+ * @rx_mpdu_count: number of MPDUs received from this station
+ * @fcs_err_count: number of packets (MPDUs) received from this station with
+ * an FCS error. This counter should be incremented only when TA of the
+ * received packet with an FCS error matches the peer MAC address.
+ * @addr: For MLO STA connection, filled with address of the link of station.
+ */
+struct link_station_info {
+ u64 filled;
+ u32 connected_time;
+ u32 inactive_time;
+ u64 assoc_at;
+ u64 rx_bytes;
+ u64 tx_bytes;
+ s8 signal;
+ s8 signal_avg;
+
+ u8 chains;
+ s8 chain_signal[IEEE80211_MAX_CHAINS];
+ s8 chain_signal_avg[IEEE80211_MAX_CHAINS];
+
+ struct rate_info txrate;
+ struct rate_info rxrate;
+ u32 rx_packets;
+ u32 tx_packets;
+ u32 tx_retries;
+ u32 tx_failed;
+ u32 rx_dropped_misc;
+ struct sta_bss_parameters bss_param;
+
+ u32 beacon_loss_count;
+
+ u32 expected_throughput;
+
+ u64 tx_duration;
+ u64 rx_duration;
+ u64 rx_beacon;
+ u8 rx_beacon_signal_avg;
+
+ u16 airtime_weight;
+
+ s8 ack_signal;
+ s8 avg_ack_signal;
+ struct cfg80211_tid_stats *pertid;
+
+ u32 rx_mpdu_count;
+ u32 fcs_err_count;
+
+ u8 addr[ETH_ALEN] __aligned(2);
+};
+
+/**
* struct station_info - station information
*
* Station information filled by driver for get_station() and dump_station.
@@ -2045,9 +2229,6 @@ struct cfg80211_tid_stats {
* @assoc_at: bootime (ns) of the last association
* @rx_bytes: bytes (size of MPDUs) received from this station
* @tx_bytes: bytes (size of MPDUs) transmitted to this station
- * @llid: mesh local link id
- * @plid: mesh peer link id
- * @plink_state: mesh peer link state
* @signal: The signal strength, type depends on the wiphy's signal_type.
* For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_.
* @signal_avg: Average signal strength, type depends on the wiphy's signal_type.
@@ -2067,14 +2248,20 @@ struct cfg80211_tid_stats {
* This number should increase every time the list of stations
* changes, i.e. when a station is added or removed, so that
* userspace can tell whether it got a consistent snapshot.
+ * @beacon_loss_count: Number of times beacon loss event has triggered.
* @assoc_req_ies: IEs from (Re)Association Request.
* This is used only when in AP mode with drivers that do not use
* user space MLME/SME implementation. The information is provided for
* the cfg80211_new_sta() calls to notify user space of the IEs.
* @assoc_req_ies_len: Length of assoc_req_ies buffer in octets.
* @sta_flags: station flags mask & values
- * @beacon_loss_count: Number of times beacon loss event has triggered.
* @t_offset: Time offset of the station relative to this host.
+ * @llid: mesh local link id
+ * @plid: mesh peer link id
+ * @plink_state: mesh peer link state
+ * @connected_to_gate: true if mesh STA has a path to mesh gate
+ * @connected_to_as: true if mesh STA has a path to authentication server
+ * @airtime_link_metric: mesh airtime link metric.
* @local_pm: local mesh STA power save mode
* @peer_pm: peer mesh STA power save mode
* @nonpeer_pm: non-peer mesh STA power save mode
@@ -2083,7 +2270,6 @@ struct cfg80211_tid_stats {
* @rx_beacon: number of beacons received from this peer
* @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received
* from this peer
- * @connected_to_gate: true if mesh STA has a path to mesh gate
* @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer
* @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer
* @airtime_weight: current airtime scheduling weight
@@ -2097,8 +2283,6 @@ struct cfg80211_tid_stats {
* @fcs_err_count: number of packets (MPDUs) received from this station with
* an FCS error. This counter should be incremented only when TA of the
* received packet with an FCS error matches the peer MAC address.
- * @airtime_link_metric: mesh airtime link metric.
- * @connected_to_as: true if mesh STA has a path to authentication server
* @mlo_params_valid: Indicates @assoc_link_id and @mld_addr fields are filled
* by driver. Drivers use this only in cfg80211_new_sta() calls when AP
* MLD's MLME/SME is offload to driver. Drivers won't fill this
@@ -2117,6 +2301,11 @@ struct cfg80211_tid_stats {
* dump_station() callbacks. User space needs this information to determine
* the accepted and rejected affiliated links of the connected station.
* @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets.
+ * @valid_links: bitmap of valid links, or 0 for non-MLO. Drivers fill this
+ * information in cfg80211_new_sta(), cfg80211_del_sta_sinfo(),
+ * get_station() and dump_station() callbacks.
+ * @links: reference to Link sta entries for MLO STA, all link specific
+ * information is accessed through links[link_id].
*/
struct station_info {
u64 filled;
@@ -2125,9 +2314,6 @@ struct station_info {
u64 assoc_at;
u64 rx_bytes;
u64 tx_bytes;
- u16 llid;
- u16 plid;
- u8 plink_state;
s8 signal;
s8 signal_avg;
@@ -2147,41 +2333,46 @@ struct station_info {
int generation;
+ u32 beacon_loss_count;
+
const u8 *assoc_req_ies;
size_t assoc_req_ies_len;
- u32 beacon_loss_count;
s64 t_offset;
+ u16 llid;
+ u16 plid;
+ u8 plink_state;
+ u8 connected_to_gate;
+ u8 connected_to_as;
+ u32 airtime_link_metric;
enum nl80211_mesh_power_mode local_pm;
enum nl80211_mesh_power_mode peer_pm;
enum nl80211_mesh_power_mode nonpeer_pm;
u32 expected_throughput;
- u64 tx_duration;
- u64 rx_duration;
- u64 rx_beacon;
- u8 rx_beacon_signal_avg;
- u8 connected_to_gate;
+ u16 airtime_weight;
- struct cfg80211_tid_stats *pertid;
s8 ack_signal;
s8 avg_ack_signal;
+ struct cfg80211_tid_stats *pertid;
- u16 airtime_weight;
+ u64 tx_duration;
+ u64 rx_duration;
+ u64 rx_beacon;
+ u8 rx_beacon_signal_avg;
u32 rx_mpdu_count;
u32 fcs_err_count;
- u32 airtime_link_metric;
-
- u8 connected_to_as;
-
bool mlo_params_valid;
u8 assoc_link_id;
u8 mld_addr[ETH_ALEN] __aligned(2);
const u8 *assoc_resp_ies;
size_t assoc_resp_ies_len;
+
+ u16 valid_links;
+ struct link_station_info *links[IEEE80211_MLD_MAX_NUM_LINKS];
};
/**
@@ -2265,7 +2456,7 @@ static inline int cfg80211_get_station(struct net_device *dev,
* @MONITOR_FLAG_PLCPFAIL: pass frames with bad PLCP
* @MONITOR_FLAG_CONTROL: pass control frames
* @MONITOR_FLAG_OTHER_BSS: disable BSSID filtering
- * @MONITOR_FLAG_COOK_FRAMES: report frames after processing
+ * @MONITOR_FLAG_COOK_FRAMES: deprecated, will unconditionally be refused
* @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address
* @MONITOR_FLAG_SKIP_TX: do not pass locally transmitted frames
*/
@@ -2344,6 +2535,29 @@ struct mpath_info {
};
/**
+ * enum wiphy_bss_param_flags - bit positions for supported bss parameters.
+ *
+ * @WIPHY_BSS_PARAM_CTS_PROT: support changing CTS protection.
+ * @WIPHY_BSS_PARAM_SHORT_PREAMBLE: support changing short preamble usage.
+ * @WIPHY_BSS_PARAM_SHORT_SLOT_TIME: support changing short slot time usage.
+ * @WIPHY_BSS_PARAM_BASIC_RATES: support reconfiguring basic rates.
+ * @WIPHY_BSS_PARAM_AP_ISOLATE: support changing AP isolation.
+ * @WIPHY_BSS_PARAM_HT_OPMODE: support changing HT operating mode.
+ * @WIPHY_BSS_PARAM_P2P_CTWINDOW: support reconfiguring ctwindow.
+ * @WIPHY_BSS_PARAM_P2P_OPPPS: support changing P2P opportunistic power-save.
+ */
+enum wiphy_bss_param_flags {
+ WIPHY_BSS_PARAM_CTS_PROT = BIT(0),
+ WIPHY_BSS_PARAM_SHORT_PREAMBLE = BIT(1),
+ WIPHY_BSS_PARAM_SHORT_SLOT_TIME = BIT(2),
+ WIPHY_BSS_PARAM_BASIC_RATES = BIT(3),
+ WIPHY_BSS_PARAM_AP_ISOLATE = BIT(4),
+ WIPHY_BSS_PARAM_HT_OPMODE = BIT(5),
+ WIPHY_BSS_PARAM_P2P_CTWINDOW = BIT(6),
+ WIPHY_BSS_PARAM_P2P_OPPPS = BIT(7),
+};
+
+/**
* struct bss_parameters - BSS parameters
*
* Used to change BSS parameters (mainly for AP mode).
@@ -2662,15 +2876,16 @@ struct cfg80211_scan_6ghz_params {
* @wiphy: the wiphy this was for
* @scan_start: time (in jiffies) when the scan started
* @wdev: the wireless device to scan for
- * @info: (internal) information about completed scan
- * @notified: (internal) scan request was notified as done or aborted
* @no_cck: used to send probe requests at non CCK rate in 2GHz band
* @mac_addr: MAC address used with randomisation
* @mac_addr_mask: MAC address mask used with randomisation, bits that
* are 0 in the mask should be randomised, bits that are 1 should
* be taken from the @mac_addr
* @scan_6ghz: relevant for split scan request only,
- * true if this is the second scan request
+ * true if this is a 6 GHz scan request
+ * @first_part: %true if this is the first part of a split scan request or a
+ * scan that was not split. May be %true for a @scan_6ghz scan if no other
+ * channels were requested
* @n_6ghz_params: number of 6 GHz params
* @scan_6ghz_params: 6 GHz params
* @bssid: BSSID to scan for (most commonly, the wildcard BSSID)
@@ -2694,20 +2909,17 @@ struct cfg80211_scan_request {
u8 mac_addr[ETH_ALEN] __aligned(2);
u8 mac_addr_mask[ETH_ALEN] __aligned(2);
u8 bssid[ETH_ALEN] __aligned(2);
-
- /* internal */
struct wiphy *wiphy;
unsigned long scan_start;
- struct cfg80211_scan_info info;
- bool notified;
bool no_cck;
bool scan_6ghz;
+ bool first_part;
u32 n_6ghz_params;
struct cfg80211_scan_6ghz_params *scan_6ghz_params;
s8 tsf_report_link_id;
/* keep last */
- struct ieee80211_channel *channels[] __counted_by(n_channels);
+ struct ieee80211_channel *channels[];
};
static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask)
@@ -2947,6 +3159,7 @@ struct cfg80211_bss_ies {
* @nontrans_list: list of non-transmitted BSS, if this is a transmitted one
* (multi-BSSID support)
* @signal: signal strength value (type depends on the wiphy's signal_type)
+ * @ts_boottime: timestamp of the last BSS update in nanoseconds since boot
* @chains: bitmask for filled values in @chain_signal.
* @chain_signal: per-chain signal strength of last received BSS in dBm.
* @bssid_index: index in the multiple BSS set
@@ -2971,6 +3184,8 @@ struct cfg80211_bss {
s32 signal;
+ u64 ts_boottime;
+
u16 beacon_interval;
u16 capability;
@@ -3067,8 +3282,6 @@ struct cfg80211_auth_request {
* if this is %NULL for a link, that link is not requested
* @elems: extra elements for the per-STA profile for this link
* @elems_len: length of the elements
- * @disabled: If set this link should be included during association etc. but it
- * should not be used until enabled by the AP MLD.
* @error: per-link error code, must be <= 0. If there is an error, then the
* operation as a whole must fail.
*/
@@ -3076,11 +3289,23 @@ struct cfg80211_assoc_link {
struct cfg80211_bss *bss;
const u8 *elems;
size_t elems_len;
- bool disabled;
int error;
};
/**
+ * struct cfg80211_ml_reconf_req - MLO link reconfiguration request
+ * @add_links: data for links to add, see &struct cfg80211_assoc_link
+ * @rem_links: bitmap of links to remove
+ * @ext_mld_capa_ops: extended MLD capabilities and operations set by
+ * userspace for the ML reconfiguration action frame
+ */
+struct cfg80211_ml_reconf_req {
+ struct cfg80211_assoc_link add_links[IEEE80211_MLD_MAX_NUM_LINKS];
+ u16 rem_links;
+ u16 ext_mld_capa_ops;
+};
+
+/**
* enum cfg80211_assoc_req_flags - Over-ride default behaviour in association.
*
* @ASSOC_REQ_DISABLE_HT: Disable HT (802.11n)
@@ -3096,6 +3321,7 @@ struct cfg80211_assoc_link {
* Drivers shall disable MLO features for the current association if this
* flag is not set.
* @ASSOC_REQ_SPP_AMSDU: SPP A-MSDUs will be used on this connection (if any)
+ * @ASSOC_REQ_DISABLE_UHR: Disable UHR
*/
enum cfg80211_assoc_req_flags {
ASSOC_REQ_DISABLE_HT = BIT(0),
@@ -3106,6 +3332,7 @@ enum cfg80211_assoc_req_flags {
ASSOC_REQ_DISABLE_EHT = BIT(5),
CONNECT_REQ_MLO_SUPPORT = BIT(6),
ASSOC_REQ_SPP_AMSDU = BIT(7),
+ ASSOC_REQ_DISABLE_UHR = BIT(8),
};
/**
@@ -3130,10 +3357,10 @@ enum cfg80211_assoc_req_flags {
* included in the Current AP address field of the Reassociation Request
* frame.
* @flags: See &enum cfg80211_assoc_req_flags
- * @supported_selectors: supported selectors in IEEE 802.11 format
+ * @supported_selectors: supported BSS selectors in IEEE 802.11 format
* (or %NULL for no change).
* If %NULL, then support for SAE_H2E should be assumed.
- * @supported_selectors_len: Length of supported_selectors in octets.
+ * @supported_selectors_len: number of supported BSS selectors
* @ht_capa: HT Capabilities over-rides. Values set in ht_capa_mask
* will be used in ht_capa. Un-supported values will be ignored.
* @ht_capa_mask: The bits of ht_capa which are to be used.
@@ -3152,6 +3379,8 @@ enum cfg80211_assoc_req_flags {
* the link on which the association request should be sent
* @ap_mld_addr: AP MLD address in case of MLO association request,
* valid iff @link_id >= 0
+ * @ext_mld_capa_ops: extended MLD capabilities and operations set by
+ * userspace for the association
*/
struct cfg80211_assoc_request {
struct cfg80211_bss *bss;
@@ -3172,6 +3401,7 @@ struct cfg80211_assoc_request {
struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS];
const u8 *ap_mld_addr;
s8 link_id;
+ u16 ext_mld_capa_ops;
};
/**
@@ -3757,6 +3987,109 @@ struct cfg80211_qos_map {
};
/**
+ * DOC: Neighbor Awareness Networking (NAN)
+ *
+ * NAN uses two interface types:
+ *
+ * - %NL80211_IFTYPE_NAN: a non-netdev interface. This has two roles: (1) holds
+ * the configuration of all NAN activities (DE parameters, synchronisation
+ * parameters, local schedule, etc.), and (2) uses as the NAN Management
+ * Interface (NMI), which is used for NAN management communication.
+ *
+ * - %NL80211_IFTYPE_NAN_DATA: The NAN Data Interface (NDI), used for data
+ * communication with NAN peers.
+ *
+ * An NDI interface can only be started (IFF_UP) if the NMI one is running and
+ * NAN is started. Before NAN is stopped, all associated NDI interfaces
+ * must be stopped first.
+ *
+ * The local schedule specifies which channels the device is available on and
+ * when. Must be cancelled before NAN is stopped.
+ *
+ * NAN Stations
+ * ~~~~~~~~~~~~
+ *
+ * There are two types of stations corresponding to the two interface types:
+ *
+ * - NMI station: Represents the NAN peer. Peer-specific data such as the peer's
+ * schedule and the HT, VHT and HE capabilities belongs to the NMI station.
+ * Also used for Tx/Rx of NAN management frames to/from the peer.
+ * Added on the %NL80211_IFTYPE_NAN interface.
+ *
+ * - NDI station: Used for Tx/Rx of data frames (and non-NAN management frames)
+ * for a specific NDP established with the NAN peer. Added on the
+ * %NL80211_IFTYPE_NAN_DATA interface.
+ *
+ * A peer may reuse its NMI address as the NDI address. In that case, two
+ * separate stations should be added even though they share the same MAC
+ * address.
+ *
+ * HT, VHT and HE capabilities should not changes after it was set. It is the
+ * driver's responsibility to check that.
+ *
+ * An NDI station can only be added if the corresponding NMI station has already
+ * been configured with HT (and possibly VHT and HE) capabilities. It is the
+ * driver's responsibility to check that.
+ *
+ * All NDI stations must be removed before corresponding NMI station is removed.
+ * Therefore, removing a NMI station implies that the associated NDI station(s)
+ * (if any) will be removed first.
+ *
+ * NAN Dependencies
+ * ~~~~~~~~~~~~~~~~
+ *
+ * The following diagram shows the dependencies between NAN components.
+ * An arrow from A to B means A must be started/added before B, and B must be
+ * stopped/removed before A:
+ *
+ * +-------------+
+ * | NMI iface |---(local schedule)
+ * +------+------+
+ * / \
+ * v v
+ * +-----------+ +-------------+
+ * | NDI iface | | NMI sta |---(peer schedule)
+ * +-----+-----+ +------+------+
+ * \ /
+ * v v
+ * +----------+
+ * | NDI sta |
+ * +----------+
+ */
+
+/**
+ * struct cfg80211_nan_band_config - NAN band specific configuration
+ *
+ * @chan: Pointer to the IEEE 802.11 channel structure. The channel to be used
+ * for NAN operations on this band. For 2.4 GHz band, this is always
+ * channel 6. For 5 GHz band, the channel is either 44 or 149, according
+ * to the regulatory constraints. If chan pointer is NULL the entire band
+ * configuration entry is considered invalid and should not be used.
+ * @rssi_close: RSSI close threshold used for NAN state transition algorithm
+ * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State
+ * Transition" of Wi-Fi Aware Specification v4.0. If not
+ * specified (set to 0), default device value is used. The value should
+ * be greater than -60 dBm.
+ * @rssi_middle: RSSI middle threshold used for NAN state transition algorithm.
+ * as described in chapters 3.3.6 and 3.3.7 "NAN Device Role and State
+ * Transition" of Wi-Fi Aware Specification v4.0. If not
+ * specified (set to 0), default device value is used. The value should be
+ * greater than -75 dBm and less than rssi_close.
+ * @awake_dw_interval: Committed DW interval. Valid values range: 0-5. 0
+ * indicates no wakeup for DW and can't be used on 2.4GHz band, otherwise
+ * 2^(n-1).
+ * @disable_scan: If true, the device will not scan this band for cluster
+ * merge. Disabling scan on 2.4 GHz band is not allowed.
+ */
+struct cfg80211_nan_band_config {
+ struct ieee80211_channel *chan;
+ s8 rssi_close;
+ s8 rssi_middle;
+ u8 awake_dw_interval;
+ bool disable_scan;
+};
+
+/**
* struct cfg80211_nan_conf - NAN configuration
*
* This struct defines NAN configuration parameters
@@ -3765,10 +4098,129 @@ struct cfg80211_qos_map {
* @bands: operating bands, a bitmap of &enum nl80211_band values.
* For instance, for NL80211_BAND_2GHZ, bit 0 would be set
* (i.e. BIT(NL80211_BAND_2GHZ)).
+ * @cluster_id: cluster ID used for NAN synchronization. This is a MAC address
+ * that can take a value from 50-6F-9A-01-00-00 to 50-6F-9A-01-FF-FF.
+ * @scan_period: period (in seconds) between NAN scans.
+ * @scan_dwell_time: dwell time (in milliseconds) for NAN scans.
+ * @discovery_beacon_interval: interval (in TUs) for discovery beacons.
+ * @enable_dw_notification: flag to enable/disable discovery window
+ * notifications.
+ * @band_cfgs: array of band specific configurations, indexed by
+ * &enum nl80211_band values.
+ * @extra_nan_attrs: pointer to additional NAN attributes.
+ * @extra_nan_attrs_len: length of the additional NAN attributes.
+ * @vendor_elems: pointer to vendor-specific elements.
+ * @vendor_elems_len: length of the vendor-specific elements.
*/
struct cfg80211_nan_conf {
u8 master_pref;
u8 bands;
+ u8 cluster_id[ETH_ALEN] __aligned(2);
+ u16 scan_period;
+ u16 scan_dwell_time;
+ u8 discovery_beacon_interval;
+ bool enable_dw_notification;
+ struct cfg80211_nan_band_config band_cfgs[NUM_NL80211_BANDS];
+ const u8 *extra_nan_attrs;
+ u16 extra_nan_attrs_len;
+ const u8 *vendor_elems;
+ u16 vendor_elems_len;
+};
+
+#define CFG80211_NAN_SCHED_NUM_TIME_SLOTS 32
+
+/**
+ * struct cfg80211_nan_channel - NAN channel configuration
+ *
+ * This struct defines a NAN channel configuration
+ *
+ * @chandef: the channel definition
+ * @channel_entry: pointer to the Channel Entry blob as defined in Wi-Fi Aware
+ * (TM) 4.0 specification Table 100 (Channel Entry format for the NAN
+ * Availability attribute).
+ * @rx_nss: number of spatial streams supported on this channel
+ */
+struct cfg80211_nan_channel {
+ struct cfg80211_chan_def chandef;
+ const u8 *channel_entry;
+ u8 rx_nss;
+};
+
+/**
+ * struct cfg80211_nan_local_sched - NAN local schedule
+ *
+ * This struct defines NAN local schedule parameters
+ *
+ * @schedule: a mapping of time slots to chandef indexes in %nan_channels.
+ * An unscheduled slot will be set to %NL80211_NAN_SCHED_NOT_AVAIL_SLOT.
+ * @n_channels: number of channel definitions in %nan_channels.
+ * @nan_avail_blob: pointer to NAN Availability attribute blob.
+ * See %NL80211_ATTR_NAN_AVAIL_BLOB for more details.
+ * @nan_avail_blob_len: length of the @nan_avail_blob in bytes.
+ * @deferred: if true, the command containing this schedule configuration is a
+ * request from the device to perform an announced schedule update. This
+ * means that it needs to send the updated NAN availability to the peers,
+ * and do the actual switch on the right time (i.e. at the end of the slot
+ * after the slot in which the updated NAN Availability was sent).
+ * See %NL80211_ATTR_NAN_SCHED_DEFERRED for more details.
+ * If false, the schedule is applied immediately.
+ * @nan_channels: array of NAN channel definitions that can be scheduled.
+ */
+struct cfg80211_nan_local_sched {
+ u8 schedule[CFG80211_NAN_SCHED_NUM_TIME_SLOTS];
+ u8 n_channels;
+ const u8 *nan_avail_blob;
+ u16 nan_avail_blob_len;
+ bool deferred;
+ struct cfg80211_nan_channel nan_channels[] __counted_by(n_channels);
+};
+
+/**
+ * struct cfg80211_nan_peer_map - NAN peer schedule map
+ *
+ * This struct defines a single NAN peer schedule map
+ *
+ * @map_id: map ID of this schedule map
+ * @schedule: a mapping of time slots to chandef indexes in the schedule's
+ * @nan_channels. Each slot lasts 16TUs. An unscheduled slot will be
+ * set to %NL80211_NAN_SCHED_NOT_AVAIL_SLOT.
+ */
+struct cfg80211_nan_peer_map {
+ u8 map_id;
+ u8 schedule[CFG80211_NAN_SCHED_NUM_TIME_SLOTS];
+};
+
+#define CFG80211_NAN_MAX_PEER_MAPS 2
+#define CFG80211_NAN_INVALID_MAP_ID 0xff
+
+/**
+ * struct cfg80211_nan_peer_sched - NAN peer schedule
+ *
+ * This struct defines NAN peer schedule parameters for a peer.
+ *
+ * @peer_addr: MAC address of the peer (NMI address)
+ * @seq_id: sequence ID of the peer schedule.
+ * @committed_dw: committed DW as published by the peer.
+ * See %NL80211_ATTR_NAN_COMMITTED_DW
+ * @max_chan_switch: maximum channel switch time in microseconds as published
+ * by the peer. See %NL80211_ATTR_NAN_MAX_CHAN_SWITCH_TIME.
+ * @init_ulw: initial ULWs as published by the peer.
+ * @ulw_size: number of bytes in @init_ulw.
+ * @n_channels: number of channel definitions in @nan_channels.
+ * @nan_channels: array of NAN channel definitions for this schedule.
+ * @maps: array of peer schedule maps. Unused entries have
+ * map_id = %CFG80211_NAN_INVALID_MAP_ID.
+ */
+struct cfg80211_nan_peer_sched {
+ const u8 *peer_addr;
+ u8 seq_id;
+ u16 committed_dw;
+ u16 max_chan_switch;
+ const u8 *init_ulw;
+ u16 ulw_size;
+ u8 n_channels;
+ struct cfg80211_nan_channel *nan_channels;
+ struct cfg80211_nan_peer_map maps[CFG80211_NAN_MAX_PEER_MAPS];
};
/**
@@ -3777,10 +4229,17 @@ struct cfg80211_nan_conf {
*
* @CFG80211_NAN_CONF_CHANGED_PREF: master preference
* @CFG80211_NAN_CONF_CHANGED_BANDS: operating bands
+ * @CFG80211_NAN_CONF_CHANGED_CONFIG: changed additional configuration.
+ * When this flag is set, it indicates that some additional attribute(s)
+ * (other then master_pref and bands) have been changed. In this case,
+ * all the unchanged attributes will be properly configured to their
+ * previous values. The driver doesn't need to store any
+ * previous configuration besides master_pref and bands.
*/
enum cfg80211_nan_conf_changes {
CFG80211_NAN_CONF_CHANGED_PREF = BIT(0),
CFG80211_NAN_CONF_CHANGED_BANDS = BIT(1),
+ CFG80211_NAN_CONF_CHANGED_CONFIG = BIT(2),
};
/**
@@ -3957,6 +4416,7 @@ struct cfg80211_ftm_responder_stats {
* @num_bursts_exp: actual number of bursts exponent negotiated
* @burst_duration: actual burst duration negotiated
* @ftms_per_burst: actual FTMs per burst negotiated
+ * @burst_period: actual burst period negotiated in units of 100ms
* @lci_len: length of LCI information (if present)
* @civicloc_len: length of civic location information (if present)
* @lci: LCI data (may be %NULL)
@@ -3998,6 +4458,7 @@ struct cfg80211_pmsr_ftm_result {
u8 num_bursts_exp;
u8 burst_duration;
u8 ftms_per_burst;
+ u16 burst_period;
s32 rssi_avg;
s32 rssi_spread;
struct rate_info tx_rate, rx_rate;
@@ -4060,7 +4521,9 @@ struct cfg80211_pmsr_result {
* @burst_period: burst period to use
* @asap: indicates to use ASAP mode
* @num_bursts_exp: number of bursts exponent
- * @burst_duration: burst duration
+ * @burst_duration: burst duration. If @trigger_based or @non_trigger_based is
+ * set, this is the burst duration in milliseconds, and zero means the
+ * device should pick an appropriate value based on @ftms_per_burst.
* @ftms_per_burst: number of FTMs per burst
* @ftmr_retries: number of retries for FTM request
* @request_lci: request LCI information
@@ -4073,6 +4536,8 @@ struct cfg80211_pmsr_result {
* EDCA based ranging will be used.
* @lmr_feedback: negotiate for I2R LMR feedback. Only valid if either
* @trigger_based or @non_trigger_based is set.
+ * @rsta: Operate as the RSTA in the measurement. Only valid if @lmr_feedback
+ * and either @trigger_based or @non_trigger_based is set.
* @bss_color: the bss color of the responder. Optional. Set to zero to
* indicate the driver should set the BSS color. Only valid if
* @non_trigger_based or @trigger_based is set.
@@ -4088,7 +4553,8 @@ struct cfg80211_pmsr_ftm_request_peer {
request_civicloc:1,
trigger_based:1,
non_trigger_based:1,
- lmr_feedback:1;
+ lmr_feedback:1,
+ rsta:1;
u8 num_bursts_exp;
u8 burst_duration;
u8 ftms_per_burst;
@@ -4537,6 +5003,19 @@ struct mgmt_frame_regs {
* @nan_change_conf: changes NAN configuration. The changed parameters must
* be specified in @changes (using &enum cfg80211_nan_conf_changes);
* All other parameters must be ignored.
+ * @nan_set_local_sched: configure the local schedule for NAN. The schedule
+ * consists of an array of %cfg80211_nan_channel and the schedule itself,
+ * in which each entry maps each time slot to the channel on which the
+ * radio should operate on. If the chandef of a NAN channel is not
+ * changed, the channel entry must also remain unchanged. It is the
+ * driver's responsibility to verify this.
+ * @nan_set_peer_sched: configure the peer schedule for NAN. The schedule
+ * consists of an array of %cfg80211_nan_channel and the schedule itself,
+ * in which each entry maps each time slot to a channel on which the
+ * radio should operate on. In addition, it contains more peer's schedule
+ * information such as committed DW, etc. When updating an existing peer
+ * schedule, the full new schedule is provided - partial updates are not
+ * supported, and the new schedule completely replaces the previous one.
*
* @set_multicast_to_unicast: configure multicast to unicast conversion for BSS
*
@@ -4631,24 +5110,24 @@ struct cfg80211_ops {
struct wireless_dev *wdev,
unsigned int link_id);
- int (*add_key)(struct wiphy *wiphy, struct net_device *netdev,
+ int (*add_key)(struct wiphy *wiphy, struct wireless_dev *wdev,
int link_id, u8 key_index, bool pairwise,
const u8 *mac_addr, struct key_params *params);
- int (*get_key)(struct wiphy *wiphy, struct net_device *netdev,
+ int (*get_key)(struct wiphy *wiphy, struct wireless_dev *wdev,
int link_id, u8 key_index, bool pairwise,
const u8 *mac_addr, void *cookie,
void (*callback)(void *cookie, struct key_params*));
- int (*del_key)(struct wiphy *wiphy, struct net_device *netdev,
+ int (*del_key)(struct wiphy *wiphy, struct wireless_dev *wdev,
int link_id, u8 key_index, bool pairwise,
const u8 *mac_addr);
int (*set_default_key)(struct wiphy *wiphy,
struct net_device *netdev, int link_id,
u8 key_index, bool unicast, bool multicast);
int (*set_default_mgmt_key)(struct wiphy *wiphy,
- struct net_device *netdev, int link_id,
+ struct wireless_dev *wdev, int link_id,
u8 key_index);
int (*set_default_beacon_key)(struct wiphy *wiphy,
- struct net_device *netdev,
+ struct wireless_dev *wdev,
int link_id,
u8 key_index);
@@ -4660,17 +5139,17 @@ struct cfg80211_ops {
unsigned int link_id);
- int (*add_station)(struct wiphy *wiphy, struct net_device *dev,
+ int (*add_station)(struct wiphy *wiphy, struct wireless_dev *wdev,
const u8 *mac,
struct station_parameters *params);
- int (*del_station)(struct wiphy *wiphy, struct net_device *dev,
+ int (*del_station)(struct wiphy *wiphy, struct wireless_dev *wdev,
struct station_del_parameters *params);
- int (*change_station)(struct wiphy *wiphy, struct net_device *dev,
+ int (*change_station)(struct wiphy *wiphy, struct wireless_dev *wdev,
const u8 *mac,
struct station_parameters *params);
- int (*get_station)(struct wiphy *wiphy, struct net_device *dev,
+ int (*get_station)(struct wiphy *wiphy, struct wireless_dev *wdev,
const u8 *mac, struct station_info *sinfo);
- int (*dump_station)(struct wiphy *wiphy, struct net_device *dev,
+ int (*dump_station)(struct wiphy *wiphy, struct wireless_dev *wdev,
int idx, u8 *mac, struct station_info *sinfo);
int (*add_mpath)(struct wiphy *wiphy, struct net_device *dev,
@@ -4750,12 +5229,14 @@ struct cfg80211_ops {
int (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev,
int rate[NUM_NL80211_BANDS]);
- int (*set_wiphy_params)(struct wiphy *wiphy, u32 changed);
+ int (*set_wiphy_params)(struct wiphy *wiphy, int radio_idx,
+ u32 changed);
int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
+ int radio_idx,
enum nl80211_tx_power_setting type, int mbm);
int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev,
- unsigned int link_id, int *dbm);
+ int radio_idx, unsigned int link_id, int *dbm);
void (*rfkill_poll)(struct wiphy *wiphy);
@@ -4817,8 +5298,10 @@ struct cfg80211_ops {
struct wireless_dev *wdev,
struct mgmt_frame_regs *upd);
- int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant);
- int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant);
+ int (*set_antenna)(struct wiphy *wiphy, int radio_idx,
+ u32 tx_ant, u32 rx_ant);
+ int (*get_antenna)(struct wiphy *wiphy, int radio_idx,
+ u32 *tx_ant, u32 *rx_ant);
int (*sched_scan_start)(struct wiphy *wiphy,
struct net_device *dev,
@@ -4910,7 +5393,12 @@ struct cfg80211_ops {
struct wireless_dev *wdev,
struct cfg80211_nan_conf *conf,
u32 changes);
-
+ int (*nan_set_local_sched)(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ struct cfg80211_nan_local_sched *sched);
+ int (*nan_set_peer_sched)(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ struct cfg80211_nan_peer_sched *sched);
int (*set_multicast_to_unicast)(struct wiphy *wiphy,
struct net_device *dev,
const bool enabled);
@@ -4970,8 +5458,7 @@ struct cfg80211_ops {
struct cfg80211_ttlm_params *params);
u32 (*get_radio_mask)(struct wiphy *wiphy, struct net_device *dev);
int (*assoc_ml_reconf)(struct wiphy *wiphy, struct net_device *dev,
- struct cfg80211_assoc_link *add_links,
- u16 rem_links);
+ struct cfg80211_ml_reconf_req *req);
int (*set_epcs)(struct wiphy *wiphy, struct net_device *dev,
bool val);
};
@@ -5405,6 +5892,18 @@ cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type);
* not limited)
* @ftm.trigger_based: trigger based ranging measurement is supported
* @ftm.non_trigger_based: non trigger based ranging measurement is supported
+ * @ftm.support_6ghz: supports ranging in 6 GHz band
+ * @ftm.max_tx_ltf_rep: maximum number of TX LTF repetitions supported (0 means
+ * only one LTF, no repetitions)
+ * @ftm.max_rx_ltf_rep: maximum number of RX LTF repetitions supported (0 means
+ * only one LTF, no repetitions)
+ * @ftm.max_tx_sts: maximum number of TX STS supported (zero based)
+ * @ftm.max_rx_sts: maximum number of RX STS supported (zero based)
+ * @ftm.max_total_ltf_tx: maximum total number of LTFs that can be transmitted
+ * (0 means unknown)
+ * @ftm.max_total_ltf_rx: maximum total number of LTFs that can be received
+ * (0 means unknown)
+ * @ftm.support_rsta: supports operating as RSTA in PMSR FTM request
*/
struct cfg80211_pmsr_capabilities {
unsigned int max_peers;
@@ -5422,7 +5921,15 @@ struct cfg80211_pmsr_capabilities {
request_lci:1,
request_civicloc:1,
trigger_based:1,
- non_trigger_based:1;
+ non_trigger_based:1,
+ support_6ghz:1;
+ u8 max_tx_ltf_rep;
+ u8 max_rx_ltf_rep;
+ u8 max_tx_sts;
+ u8 max_rx_sts;
+ u8 max_total_ltf_tx;
+ u8 max_total_ltf_rx;
+ u8 support_rsta:1;
} ftm;
};
@@ -5442,6 +5949,22 @@ struct wiphy_iftype_akm_suites {
};
/**
+ * struct wiphy_radio_cfg - physical radio config of a wiphy
+ * This structure describes the configurations of a physical radio in a
+ * wiphy. It is used to denote per-radio attributes belonging to a wiphy.
+ *
+ * @rts_threshold: RTS threshold (dot11RTSThreshold);
+ * -1 (default) = RTS/CTS disabled
+ * @radio_debugfsdir: Pointer to debugfs directory containing the radio-
+ * specific parameters.
+ * NULL (default) = Debugfs directory not created
+ */
+struct wiphy_radio_cfg {
+ u32 rts_threshold;
+ struct dentry *radio_debugfsdir;
+};
+
+/**
* struct wiphy_radio_freq_range - wiphy frequency range
* @start_freq: start range edge frequency (kHz)
* @end_freq: end range edge frequency (kHz)
@@ -5477,6 +6000,53 @@ struct wiphy_radio {
u32 antenna_mask;
};
+/**
+ * enum wiphy_nan_flags - NAN capabilities
+ *
+ * @WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC: Device supports NAN configurable
+ * synchronization.
+ * @WIPHY_NAN_FLAGS_USERSPACE_DE: Device doesn't support DE offload.
+ */
+enum wiphy_nan_flags {
+ WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC = BIT(0),
+ WIPHY_NAN_FLAGS_USERSPACE_DE = BIT(1),
+};
+
+/**
+ * struct wiphy_nan_capa - NAN capabilities
+ *
+ * This structure describes the NAN capabilities of a wiphy.
+ *
+ * @flags: NAN capabilities flags, see &enum wiphy_nan_flags
+ * @op_mode: NAN operation mode, as defined in Wi-Fi Aware (TM) specification
+ * Table 81.
+ * @n_antennas: number of antennas supported by the device for Tx/Rx. Lower
+ * nibble indicates the number of TX antennas and upper nibble indicates the
+ * number of RX antennas. Value 0 indicates the information is not
+ * available.
+ * @max_channel_switch_time: maximum channel switch time in milliseconds.
+ * @dev_capabilities: NAN device capabilities as defined in Wi-Fi Aware (TM)
+ * specification Table 79 (Capabilities field).
+ * @phy: Band-agnostic capabilities for NAN data interfaces. Since NAN
+ * operates on multiple channels simultaneously, these capabilities apply
+ * across all bands. Valid only if NL80211_IFTYPE_NAN_DATA is supported.
+ * @phy.ht: HT capabilities (mandatory for NAN data)
+ * @phy.vht: VHT capabilities (optional)
+ * @phy.he: HE capabilities (optional)
+ */
+struct wiphy_nan_capa {
+ u32 flags;
+ u8 op_mode;
+ u8 n_antennas;
+ u16 max_channel_switch_time;
+ u8 dev_capabilities;
+ struct {
+ struct ieee80211_sta_ht_cap ht;
+ struct ieee80211_sta_vht_cap vht;
+ struct ieee80211_sta_he_cap he;
+ } phy;
+};
+
#define CFG80211_HW_TIMESTAMP_ALL_PEERS 0xffff
/**
@@ -5637,6 +6207,11 @@ struct wiphy_radio {
* and probe responses. This value should be set if the driver
* wishes to limit the number of csa counters. Default (0) means
* infinite.
+ * @bss_param_support: bitmask indicating which bss_parameters as defined in
+ * &struct bss_parameters the driver can actually handle in the
+ * .change_bss() callback. The bit positions are defined in &enum
+ * wiphy_bss_param_flags.
+ *
* @bss_select_support: bitmask indicating the BSS selection criteria supported
* by the driver in the .connect() callback. The bit position maps to the
* attribute indices defined in &enum nl80211_bss_select_attr.
@@ -5645,6 +6220,7 @@ struct wiphy_radio {
* bitmap of &enum nl80211_band values. For instance, for
* NL80211_BAND_2GHZ, bit 0 would be set
* (i.e. BIT(NL80211_BAND_2GHZ)).
+ * @nan_capa: NAN capabilities
*
* @txq_limit: configuration of internal TX queue frame limit
* @txq_memory_limit: configuration internal TX queue memory limit
@@ -5696,6 +6272,10 @@ struct wiphy_radio {
* supports enabling HW timestamping for all peers (i.e. no need to
* specify a mac address).
*
+ * @radio_cfg: configuration of radios belonging to a muli-radio wiphy. This
+ * struct contains a list of all radio specific attributes and should be
+ * used only for multi-radio wiphy.
+ *
* @radio: radios belonging to this wiphy
* @n_radio: number of radios
*/
@@ -5785,6 +6365,8 @@ struct wiphy {
void (*reg_notifier)(struct wiphy *wiphy,
struct regulatory_request *request);
+ struct wiphy_radio_cfg *radio_cfg;
+
/* fields below are read-only, assigned by cfg80211 */
const struct ieee80211_regdomain __rcu *regd;
@@ -5816,9 +6398,11 @@ struct wiphy {
u8 max_num_csa_counters;
+ u32 bss_param_support;
u32 bss_select_support;
u8 nan_supported_bands;
+ struct wiphy_nan_capa nan_capa;
u32 txq_limit;
u32 txq_memory_limit;
@@ -6138,6 +6722,11 @@ static inline void wiphy_delayed_work_init(struct wiphy_delayed_work *dwork,
* after wiphy_lock() was called. Therefore, wiphy_cancel_work() can
* use just cancel_work() instead of cancel_work_sync(), it requires
* being in a section protected by wiphy_lock().
+ *
+ * Note that these are scheduled with a timer where the accuracy
+ * becomes less the longer in the future the scheduled timer is. Use
+ * wiphy_hrtimer_work_queue() if the timer must be not be late by more
+ * than approximately 10 percent.
*/
void wiphy_delayed_work_queue(struct wiphy *wiphy,
struct wiphy_delayed_work *dwork,
@@ -6209,6 +6798,79 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy,
bool wiphy_delayed_work_pending(struct wiphy *wiphy,
struct wiphy_delayed_work *dwork);
+struct wiphy_hrtimer_work {
+ struct wiphy_work work;
+ struct wiphy *wiphy;
+ struct hrtimer timer;
+};
+
+enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t);
+
+static inline void wiphy_hrtimer_work_init(struct wiphy_hrtimer_work *hrwork,
+ wiphy_work_func_t func)
+{
+ hrtimer_setup(&hrwork->timer, wiphy_hrtimer_work_timer,
+ CLOCK_BOOTTIME, HRTIMER_MODE_REL);
+ wiphy_work_init(&hrwork->work, func);
+}
+
+/**
+ * wiphy_hrtimer_work_queue - queue hrtimer work for the wiphy
+ * @wiphy: the wiphy to queue for
+ * @hrwork: the high resolution timer worker
+ * @delay: the delay given as a ktime_t
+ *
+ * Please refer to wiphy_delayed_work_queue(). The difference is that
+ * the hrtimer work uses a high resolution timer for scheduling. This
+ * may be needed if timeouts might be scheduled further in the future
+ * and the accuracy of the normal timer is not sufficient.
+ *
+ * Expect a delay of a few milliseconds as the timer is scheduled
+ * with some slack and some more time may pass between queueing the
+ * work and its start.
+ */
+void wiphy_hrtimer_work_queue(struct wiphy *wiphy,
+ struct wiphy_hrtimer_work *hrwork,
+ ktime_t delay);
+
+/**
+ * wiphy_hrtimer_work_cancel - cancel previously queued hrtimer work
+ * @wiphy: the wiphy, for debug purposes
+ * @hrtimer: the hrtimer work to cancel
+ *
+ * Cancel the work *without* waiting for it, this assumes being
+ * called under the wiphy mutex acquired by wiphy_lock().
+ */
+void wiphy_hrtimer_work_cancel(struct wiphy *wiphy,
+ struct wiphy_hrtimer_work *hrtimer);
+
+/**
+ * wiphy_hrtimer_work_flush - flush previously queued hrtimer work
+ * @wiphy: the wiphy, for debug purposes
+ * @hrwork: the hrtimer work to flush
+ *
+ * Flush the work (i.e. run it if pending). This must be called
+ * under the wiphy mutex acquired by wiphy_lock().
+ */
+void wiphy_hrtimer_work_flush(struct wiphy *wiphy,
+ struct wiphy_hrtimer_work *hrwork);
+
+/**
+ * wiphy_hrtimer_work_pending - Find out whether a wiphy hrtimer
+ * work item is currently pending.
+ *
+ * @wiphy: the wiphy, for debug purposes
+ * @hrwork: the hrtimer work in question
+ *
+ * Return: true if timer is pending, false otherwise
+ *
+ * Please refer to the wiphy_delayed_work_pending() documentation as
+ * this is the equivalent function for hrtimer based delayed work
+ * items.
+ */
+bool wiphy_hrtimer_work_pending(struct wiphy *wiphy,
+ struct wiphy_hrtimer_work *hrwork);
+
/**
* enum ieee80211_ap_reg_power - regulatory power for an Access Point
*
@@ -6276,8 +6938,8 @@ enum ieee80211_ap_reg_power {
* the P2P Device.
* @ps: powersave mode is enabled
* @ps_timeout: dynamic powersave timeout
- * @ap_unexpected_nlportid: (private) netlink port ID of application
- * registered for unexpected class 3 frames (AP mode)
+ * @unexpected_nlportid: (private) netlink port ID of application
+ * registered for unexpected frames (AP mode or NAN_DATA mode)
* @conn: (private) cfg80211 software SME connection state machine data
* @connect_keys: (private) keys to set after connection is established
* @conn_bss_type: connecting/connected BSS type
@@ -6339,7 +7001,7 @@ struct wireless_dev {
bool ps;
int ps_timeout;
- u32 ap_unexpected_nlportid;
+ u32 unexpected_nlportid;
u32 owner_nlportid;
bool nl_owner_dead;
@@ -6397,6 +7059,12 @@ struct wireless_dev {
struct {
struct cfg80211_chan_def chandef;
} ocb;
+ struct {
+ u8 cluster_id[ETH_ALEN] __aligned(2);
+ u8 n_channels;
+ struct cfg80211_chan_def *chandefs;
+ bool sched_update_pending;
+ } nan;
} u;
struct {
@@ -6505,16 +7173,6 @@ ieee80211_channel_to_khz(const struct ieee80211_channel *chan)
}
/**
- * ieee80211_s1g_channel_width - get allowed channel width from @chan
- *
- * Only allowed for band NL80211_BAND_S1GHZ
- * @chan: channel
- * Return: The allowed channel width for this center_freq
- */
-enum nl80211_chan_width
-ieee80211_s1g_channel_width(const struct ieee80211_channel *chan);
-
-/**
* ieee80211_channel_to_freq_khz - convert channel number to frequency
* @chan: channel number
* @band: band, necessary due to channel number overlap
@@ -6593,6 +7251,19 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan)
}
/**
+ * ieee80211_radio_freq_range_valid - Check if the radio supports the
+ * specified frequency range
+ *
+ * @radio: wiphy radio
+ * @freq: the frequency (in KHz) to be queried
+ * @width: the bandwidth (in KHz) to be queried
+ *
+ * Return: whether or not the given frequency range is valid for the given radio
+ */
+bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio,
+ u32 freq, u32 width);
+
+/**
* cfg80211_radio_chandef_valid - Check if the radio supports the chandef
*
* @radio: wiphy radio
@@ -8465,6 +9136,17 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);
/**
+ * cfg80211_link_sinfo_alloc_tid_stats - allocate per-tid statistics.
+ *
+ * @link_sinfo: the link station information
+ * @gfp: allocation flags
+ *
+ * Return: 0 on success. Non-zero on error.
+ */
+int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo,
+ gfp_t gfp);
+
+/**
* cfg80211_sinfo_release_content - release contents of station info
* @sinfo: the station information
*
@@ -8475,40 +9157,47 @@ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);
static inline void cfg80211_sinfo_release_content(struct station_info *sinfo)
{
kfree(sinfo->pertid);
+
+ for (int link_id = 0; link_id < ARRAY_SIZE(sinfo->links); link_id++) {
+ if (sinfo->links[link_id]) {
+ kfree(sinfo->links[link_id]->pertid);
+ kfree(sinfo->links[link_id]);
+ }
+ }
}
/**
* cfg80211_new_sta - notify userspace about station
*
- * @dev: the netdev
+ * @wdev: the wireless device
* @mac_addr: the station's address
* @sinfo: the station information
* @gfp: allocation flags
*/
-void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,
+void cfg80211_new_sta(struct wireless_dev *wdev, const u8 *mac_addr,
struct station_info *sinfo, gfp_t gfp);
/**
* cfg80211_del_sta_sinfo - notify userspace about deletion of a station
- * @dev: the netdev
+ * @wdev: the wireless device
* @mac_addr: the station's address. For MLD station, MLD address is used.
* @sinfo: the station information/statistics
* @gfp: allocation flags
*/
-void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,
+void cfg80211_del_sta_sinfo(struct wireless_dev *wdev, const u8 *mac_addr,
struct station_info *sinfo, gfp_t gfp);
/**
* cfg80211_del_sta - notify userspace about deletion of a station
*
- * @dev: the netdev
+ * @wdev: the wireless device
* @mac_addr: the station's address. For MLD station, MLD address is used.
* @gfp: allocation flags
*/
-static inline void cfg80211_del_sta(struct net_device *dev,
+static inline void cfg80211_del_sta(struct wireless_dev *wdev,
const u8 *mac_addr, gfp_t gfp)
{
- cfg80211_del_sta_sinfo(dev, mac_addr, NULL, gfp);
+ cfg80211_del_sta_sinfo(wdev, mac_addr, NULL, gfp);
}
/**
@@ -8879,22 +9568,25 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index,
/**
* cfg80211_rx_spurious_frame - inform userspace about a spurious frame
* @dev: The device the frame matched to
+ * @link_id: the link the frame was received on, -1 if not applicable or unknown
* @addr: the transmitter address
* @gfp: context flags
*
- * This function is used in AP mode (only!) to inform userspace that
- * a spurious class 3 frame was received, to be able to deauth the
- * sender.
+ * This function is used in AP mode to inform userspace that a spurious
+ * class 3 frame was received, to be able to deauth the sender.
+ * It is also used in NAN_DATA mode to report frames from unknown peers
+ * (A2 not assigned to any active NDP), per Wi-Fi Aware (TM) 4.0 specification 6.2.5.
* Return: %true if the frame was passed to userspace (or this failed
* for a reason other than not having a subscription.)
*/
-bool cfg80211_rx_spurious_frame(struct net_device *dev,
- const u8 *addr, gfp_t gfp);
+bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr,
+ int link_id, gfp_t gfp);
/**
* cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame
* @dev: The device the frame matched to
* @addr: the transmitter address
+ * @link_id: the link the frame was received on, -1 if not applicable or unknown
* @gfp: context flags
*
* This function is used in AP mode (only!) to inform userspace that
@@ -8904,8 +9596,8 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev,
* Return: %true if the frame was passed to userspace (or this failed
* for a reason other than not having a subscription.)
*/
-bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev,
- const u8 *addr, gfp_t gfp);
+bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr,
+ int link_id, gfp_t gfp);
/**
* cfg80211_probe_status - notify userspace about probe status
@@ -9371,6 +10063,32 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
void (*iter)(const struct ieee80211_iface_combination *c,
void *data),
void *data);
+/**
+ * cfg80211_get_radio_idx_by_chan - get the radio index by the channel
+ *
+ * @wiphy: the wiphy
+ * @chan: channel for which the supported radio index is required
+ *
+ * Return: radio index on success or -EINVAL otherwise
+ */
+int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy,
+ const struct ieee80211_channel *chan);
+
+/**
+ * cfg80211_stop_link - stop AP/P2P_GO link if link_id is non-negative or stops
+ * all links on the interface.
+ *
+ * @wiphy: the wiphy
+ * @wdev: wireless device
+ * @link_id: valid link ID in case of MLO AP/P2P_GO Operation or else -1
+ * @gfp: context flags
+ *
+ * If link_id is set during MLO operation, stops only the specified AP/P2P_GO
+ * link and if link_id is set to -1 or last link is stopped, the entire
+ * interface is stopped as if AP was stopped, IBSS/mesh left, STA disconnected.
+ */
+void cfg80211_stop_link(struct wiphy *wiphy, struct wireless_dev *wdev,
+ int link_id, gfp_t gfp);
/**
* cfg80211_stop_iface - trigger interface disconnection
@@ -9384,8 +10102,11 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
*
* Note: This doesn't need any locks and is asynchronous.
*/
-void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
- gfp_t gfp);
+static inline void
+cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, gfp_t gfp)
+{
+ cfg80211_stop_link(wiphy, wdev, -1, gfp);
+}
/**
* cfg80211_shutdown_all_interfaces - shut down all interfaces for a wiphy
@@ -9501,6 +10222,18 @@ void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
enum nl80211_nan_func_term_reason reason,
u64 cookie, gfp_t gfp);
+/**
+ * cfg80211_nan_sched_update_done - notify deferred schedule update completion
+ * @wdev: the wireless device reporting the event
+ * @success: whether or not the schedule update was successful
+ * @gfp: allocation flags
+ *
+ * This function notifies user space that a deferred local NAN schedule update
+ * (requested with %NL80211_ATTR_NAN_SCHED_DEFERRED) has been completed.
+ */
+void cfg80211_nan_sched_update_done(struct wireless_dev *wdev, bool success,
+ gfp_t gfp);
+
/* ethtool helper */
void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info);
@@ -9719,6 +10452,36 @@ static inline int cfg80211_color_change_notify(struct net_device *dev,
}
/**
+ * cfg80211_6ghz_power_type - determine AP regulatory power type
+ * @control: control flags
+ * @client_flags: &enum ieee80211_channel_flags for station mode to enable
+ * SP to LPI fallback, zero otherwise.
+ *
+ * Return: regulatory power type from &enum ieee80211_ap_reg_power
+ */
+static inline enum ieee80211_ap_reg_power
+cfg80211_6ghz_power_type(u8 control, u32 client_flags)
+{
+ switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) {
+ case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
+ case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP:
+ case IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT:
+ case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD:
+ return IEEE80211_REG_LPI_AP;
+ case IEEE80211_6GHZ_CTRL_REG_SP_AP:
+ return IEEE80211_REG_SP_AP;
+ case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
+ return IEEE80211_REG_VLP_AP;
+ case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
+ if (client_flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT)
+ return IEEE80211_REG_LPI_AP;
+ return IEEE80211_REG_SP_AP;
+ default:
+ return IEEE80211_REG_UNSET_AP;
+ }
+}
+
+/**
* cfg80211_links_removed - Notify about removed STA MLD setup links.
* @dev: network device.
* @link_mask: BIT mask of removed STA MLD setup link IDs.
@@ -9735,6 +10498,11 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask);
* struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data
* @buf: MLO Reconfiguration Response frame (header + body)
* @len: length of the frame data
+ * @driver_initiated: Indicates whether the add links request is initiated by
+ * driver. This is set to true when the link reconfiguration request
+ * initiated by driver due to AP link recommendation requests
+ * (Ex: BTM (BSS Transition Management) request) handling offloaded to
+ * driver.
* @added_links: BIT mask of links successfully added to the association
* @links: per-link information indexed by link ID
* @links.bss: the BSS that MLO reconfiguration was requested for, ownership of
@@ -9747,9 +10515,11 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask);
struct cfg80211_mlo_reconf_done_data {
const u8 *buf;
size_t len;
+ bool driver_initiated;
u16 added_links;
struct {
struct cfg80211_bss *bss;
+ u8 *addr;
} links[IEEE80211_MLD_MAX_NUM_LINKS];
};
@@ -9781,6 +10551,62 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev);
*/
void cfg80211_epcs_changed(struct net_device *netdev, bool enabled);
+/**
+ * cfg80211_next_nan_dw_notif - Notify about the next NAN Discovery Window (DW)
+ * @wdev: Pointer to the wireless device structure
+ * @chan: DW channel (6, 44 or 149)
+ * @gfp: Memory allocation flags
+ */
+void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev,
+ struct ieee80211_channel *chan, gfp_t gfp);
+
+/**
+ * cfg80211_nan_cluster_joined - Notify about NAN cluster join
+ * @wdev: Pointer to the wireless device structure
+ * @cluster_id: Cluster ID of the NAN cluster that was joined or started
+ * @new_cluster: Indicates if this is a new cluster or an existing one
+ * @gfp: Memory allocation flags
+ *
+ * This function is used to notify user space when a NAN cluster has been
+ * joined, providing the cluster ID and a flag whether it is a new cluster.
+ */
+void cfg80211_nan_cluster_joined(struct wireless_dev *wdev,
+ const u8 *cluster_id, bool new_cluster,
+ gfp_t gfp);
+
+/**
+ * cfg80211_nan_ulw_update - Notify user space about ULW update
+ * @wdev: Pointer to the wireless device structure
+ * @ulw: Pointer to the ULW blob data
+ * @ulw_len: Length of the ULW blob in bytes
+ * @gfp: Memory allocation flags
+ *
+ * This function is used by drivers to notify user space when the device's
+ * ULW (Unaligned Schedule) blob has been updated. User space can use this
+ * blob to attach to frames sent to peers.
+ */
+void cfg80211_nan_ulw_update(struct wireless_dev *wdev,
+ const u8 *ulw, size_t ulw_len, gfp_t gfp);
+
+/**
+ * cfg80211_nan_channel_evac - Notify user space about NAN channel evacuation
+ * @wdev: Pointer to the wireless device structure
+ * @chandef: Pointer to the channel definition of the NAN channel that was
+ * evacuated
+ * @gfp: Memory allocation flags
+ *
+ * This function is used by drivers to notify user space when a NAN
+ * channel has been evacuated (i.e. ULWed) due to channel resource conflicts
+ * with other interfaces.
+ * This can happen when another interface sharing the channel resource with NAN
+ * needs to move to a different channel (e.g. due to channel switch or link
+ * switch). User space may reconfigure the local schedule to exclude the
+ * evacuated channel.
+ */
+void cfg80211_nan_channel_evac(struct wireless_dev *wdev,
+ const struct cfg80211_chan_def *chandef,
+ gfp_t gfp);
+
#ifdef CONFIG_CFG80211_DEBUGFS
/**
* wiphy_locked_debugfs_read - do a locked read in debugfs
@@ -9831,4 +10657,95 @@ ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file,
void *data);
#endif
+/**
+ * cfg80211_s1g_get_start_freq_khz - get S1G chandef start frequency
+ * @chandef: the chandef to use
+ *
+ * Return: the chandefs starting frequency in KHz
+ */
+static inline u32
+cfg80211_s1g_get_start_freq_khz(const struct cfg80211_chan_def *chandef)
+{
+ u32 bw_mhz = cfg80211_chandef_get_width(chandef);
+ u32 center_khz =
+ MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset;
+ return center_khz - bw_mhz * 500 + 500;
+}
+
+/**
+ * cfg80211_s1g_get_end_freq_khz - get S1G chandef end frequency
+ * @chandef: the chandef to use
+ *
+ * Return: the chandefs ending frequency in KHz
+ */
+static inline u32
+cfg80211_s1g_get_end_freq_khz(const struct cfg80211_chan_def *chandef)
+{
+ u32 bw_mhz = cfg80211_chandef_get_width(chandef);
+ u32 center_khz =
+ MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset;
+ return center_khz + bw_mhz * 500 - 500;
+}
+
+/**
+ * cfg80211_s1g_get_primary_sibling - retrieve the sibling 1MHz subchannel
+ * for an S1G chandef using a 2MHz primary channel.
+ * @wiphy: wiphy the channel belongs to
+ * @chandef: the chandef to use
+ *
+ * When chandef::s1g_primary_2mhz is set to true, we are operating on a 2MHz
+ * primary channel. The 1MHz subchannel designated by the primary channel
+ * location exists within chandef::chan, whilst the 'sibling' is denoted as
+ * being the other 1MHz subchannel that make up the 2MHz primary channel.
+ *
+ * Returns: the sibling 1MHz &struct ieee80211_channel, or %NULL on failure.
+ */
+static inline struct ieee80211_channel *
+cfg80211_s1g_get_primary_sibling(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef)
+{
+ int width_mhz = cfg80211_chandef_get_width(chandef);
+ u32 pri_1mhz_khz, sibling_1mhz_khz, op_low_1mhz_khz, pri_index;
+
+ if (!chandef->s1g_primary_2mhz || width_mhz < 2)
+ return NULL;
+
+ pri_1mhz_khz = ieee80211_channel_to_khz(chandef->chan);
+ op_low_1mhz_khz = cfg80211_s1g_get_start_freq_khz(chandef);
+
+ /*
+ * Compute the index of the primary 1 MHz subchannel within the
+ * operating channel, relative to the lowest 1 MHz center frequency.
+ * Flip the least significant bit to select the even/odd sibling,
+ * then translate that index back into a channel frequency.
+ */
+ pri_index = (pri_1mhz_khz - op_low_1mhz_khz) / 1000;
+ sibling_1mhz_khz = op_low_1mhz_khz + ((pri_index ^ 1) * 1000);
+
+ return ieee80211_get_channel_khz(wiphy, sibling_1mhz_khz);
+}
+
+
+/**
+ * cfg80211_incumbent_signal_notify - Notify userspace of incumbent signal detection
+ * @wiphy: the wiphy to use
+ * @chandef: channel definition in which the interference was detected
+ * @signal_interference_bitmap: bitmap indicating interference across 20 MHz segments
+ * @gfp: allocation context for message creation and multicast; pass GFP_ATOMIC
+ * if called from atomic context (e.g. firmware event handler), otherwise
+ * GFP_KERNEL
+ *
+ * Use this function to notify userspace when an incumbent signal is detected on
+ * the operating channel in the 6 GHz band. The notification includes the
+ * current channel definition and a bitmap representing interference across
+ * the operating bandwidth. Each bit in the bitmap corresponds to a 20 MHz
+ * segment, with the lowest bit representing the lowest frequency segment.
+ * Punctured sub-channels are included in the bitmap structure but are always
+ * set to zero since interference detection is not performed on them.
+ */
+void cfg80211_incumbent_signal_notify(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef,
+ u32 signal_interference_bitmap,
+ gfp_t gfp);
+
#endif /* __NET_CFG80211_H */
diff --git a/include/net/checksum.h b/include/net/checksum.h
index 243f972267b8..3cbab35de5ab 100644
--- a/include/net/checksum.h
+++ b/include/net/checksum.h
@@ -99,12 +99,6 @@ csum_block_add(__wsum csum, __wsum csum2, int offset)
}
static __always_inline __wsum
-csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len)
-{
- return csum_block_add(csum, csum2, offset);
-}
-
-static __always_inline __wsum
csum_block_sub(__wsum csum, __wsum csum2, int offset)
{
return csum_block_add(csum, ~csum2, offset);
@@ -115,12 +109,6 @@ static __always_inline __wsum csum_unfold(__sum16 n)
return (__force __wsum)n;
}
-static __always_inline
-__wsum csum_partial_ext(const void *buff, int len, __wsum sum)
-{
- return csum_partial(buff, len, sum);
-}
-
#define CSUM_MANGLED_0 ((__force __sum16)0xffff)
static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff)
@@ -164,7 +152,7 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
const __be32 *from, const __be32 *to,
bool pseudohdr);
void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
- __wsum diff, bool pseudohdr);
+ __wsum diff, bool pseudohdr, bool ipv6);
static __always_inline
void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb,
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 7e78e7d6f015..668aeee9b3f6 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -63,7 +63,7 @@ static inline u32 task_get_classid(const struct sk_buff *skb)
* calls by looking at the number of nested bh disable calls because
* softirqs always disables bh.
*/
- if (in_serving_softirq()) {
+ if (softirq_count()) {
struct sock *sk = skb_to_full_sk(skb);
/* If there is an sock_cgroup_classid we'll use that. */
diff --git a/include/net/codel_impl.h b/include/net/codel_impl.h
index 78a27ac73070..2c1f0ec309e9 100644
--- a/include/net/codel_impl.h
+++ b/include/net/codel_impl.h
@@ -120,10 +120,10 @@ static bool codel_should_drop(const struct sk_buff *skb,
}
skb_len = skb_len_func(skb);
- vars->ldelay = now - skb_time_func(skb);
+ WRITE_ONCE(vars->ldelay, now - skb_time_func(skb));
if (unlikely(skb_len > stats->maxpacket))
- stats->maxpacket = skb_len;
+ WRITE_ONCE(stats->maxpacket, skb_len);
if (codel_time_before(vars->ldelay, params->target) ||
*backlog <= params->mtu) {
@@ -158,7 +158,8 @@ static struct sk_buff *codel_dequeue(void *ctx,
bool drop;
if (!skb) {
- vars->dropping = false;
+ vars->first_above_time = 0;
+ WRITE_ONCE(vars->dropping, false);
return skb;
}
now = codel_get_time();
@@ -167,7 +168,7 @@ static struct sk_buff *codel_dequeue(void *ctx,
if (vars->dropping) {
if (!drop) {
/* sojourn time below target - leave dropping state */
- vars->dropping = false;
+ WRITE_ONCE(vars->dropping, false);
} else if (codel_time_after_eq(now, vars->drop_next)) {
/* It's time for the next drop. Drop the current
* packet and dequeue the next. The dequeue might
@@ -179,16 +180,18 @@ static struct sk_buff *codel_dequeue(void *ctx,
*/
while (vars->dropping &&
codel_time_after_eq(now, vars->drop_next)) {
- vars->count++; /* dont care of possible wrap
- * since there is no more divide
- */
+ /* dont care of possible wrap
+ * since there is no more divide.
+ */
+ WRITE_ONCE(vars->count, vars->count + 1);
codel_Newton_step(vars);
if (params->ecn && INET_ECN_set_ce(skb)) {
- stats->ecn_mark++;
- vars->drop_next =
+ WRITE_ONCE(stats->ecn_mark,
+ stats->ecn_mark + 1);
+ WRITE_ONCE(vars->drop_next,
codel_control_law(vars->drop_next,
params->interval,
- vars->rec_inv_sqrt);
+ vars->rec_inv_sqrt));
goto end;
}
stats->drop_len += skb_len_func(skb);
@@ -201,13 +204,13 @@ static struct sk_buff *codel_dequeue(void *ctx,
skb_time_func,
backlog, now)) {
/* leave dropping state */
- vars->dropping = false;
+ WRITE_ONCE(vars->dropping, false);
} else {
/* and schedule the next drop */
- vars->drop_next =
+ WRITE_ONCE(vars->drop_next,
codel_control_law(vars->drop_next,
params->interval,
- vars->rec_inv_sqrt);
+ vars->rec_inv_sqrt));
}
}
}
@@ -215,7 +218,7 @@ static struct sk_buff *codel_dequeue(void *ctx,
u32 delta;
if (params->ecn && INET_ECN_set_ce(skb)) {
- stats->ecn_mark++;
+ WRITE_ONCE(stats->ecn_mark, stats->ecn_mark + 1);
} else {
stats->drop_len += skb_len_func(skb);
drop_func(skb, ctx);
@@ -226,7 +229,7 @@ static struct sk_buff *codel_dequeue(void *ctx,
stats, skb_len_func,
skb_time_func, backlog, now);
}
- vars->dropping = true;
+ WRITE_ONCE(vars->dropping, true);
/* if min went above target close to when we last went below it
* assume that the drop rate that controlled the queue on the
* last cycle is a good starting point to control it now.
@@ -235,19 +238,20 @@ static struct sk_buff *codel_dequeue(void *ctx,
if (delta > 1 &&
codel_time_before(now - vars->drop_next,
16 * params->interval)) {
- vars->count = delta;
+ WRITE_ONCE(vars->count, delta);
/* we dont care if rec_inv_sqrt approximation
* is not very precise :
* Next Newton steps will correct it quadratically.
*/
codel_Newton_step(vars);
} else {
- vars->count = 1;
+ WRITE_ONCE(vars->count, 1);
vars->rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT;
}
- vars->lastcount = vars->count;
- vars->drop_next = codel_control_law(now, params->interval,
- vars->rec_inv_sqrt);
+ WRITE_ONCE(vars->lastcount, vars->count);
+ WRITE_ONCE(vars->drop_next,
+ codel_control_law(now, params->interval,
+ vars->rec_inv_sqrt));
}
end:
if (skb && codel_time_after(vars->ldelay, params->ce_threshold)) {
@@ -261,7 +265,7 @@ end:
params->ce_threshold_selector));
}
if (set_ce && INET_ECN_set_ce(skb))
- stats->ce_mark++;
+ WRITE_ONCE(stats->ce_mark, stats->ce_mark + 1);
}
return skb;
}
diff --git a/include/net/devlink.h b/include/net/devlink.h
index b8783126c1ed..bcd31de1f890 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -78,6 +78,9 @@ struct devlink_port_pci_sf_attrs {
* @flavour: flavour of the port
* @split: indicates if this is split port
* @splittable: indicates if the port can be split.
+ * @no_phys_port_name: skip automatic phys_port_name generation; for
+ * compatibility only, newly added driver/port instance
+ * should never set this.
* @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink.
* @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL
* @phys: physical port attributes
@@ -87,7 +90,8 @@ struct devlink_port_pci_sf_attrs {
*/
struct devlink_port_attrs {
u8 split:1,
- splittable:1;
+ splittable:1,
+ no_phys_port_name:1;
u32 lanes;
enum devlink_port_flavour flavour;
struct netdev_phys_item_id switch_id;
@@ -118,11 +122,14 @@ struct devlink_rate {
u32 tx_priority;
u32 tx_weight;
+
+ u32 tc_bw[DEVLINK_RATE_TCS_MAX];
};
struct devlink_port {
struct list_head list;
struct list_head region_list;
+ struct list_head resource_list;
struct devlink *devlink;
const struct devlink_port_ops *ops;
unsigned int index;
@@ -420,17 +427,19 @@ typedef u64 devlink_resource_occ_get_t(void *priv);
#define __DEVLINK_PARAM_MAX_STRING_VALUE 32
enum devlink_param_type {
- DEVLINK_PARAM_TYPE_U8,
- DEVLINK_PARAM_TYPE_U16,
- DEVLINK_PARAM_TYPE_U32,
- DEVLINK_PARAM_TYPE_STRING,
- DEVLINK_PARAM_TYPE_BOOL,
+ DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8,
+ DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16,
+ DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32,
+ DEVLINK_PARAM_TYPE_U64 = DEVLINK_VAR_ATTR_TYPE_U64,
+ DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING,
+ DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG,
};
union devlink_param_value {
u8 vu8;
u16 vu16;
u32 vu32;
+ u64 vu64;
char vstr[__DEVLINK_PARAM_MAX_STRING_VALUE];
bool vbool;
};
@@ -471,6 +480,10 @@ struct devlink_flash_notify {
* @set: set parameter value, used for runtime and permanent
* configuration modes
* @validate: validate input value is applicable (within value range, etc.)
+ * @get_default: get parameter default value, used for runtime and permanent
+ * configuration modes
+ * @reset_default: reset parameter to default value, used for runtime and permanent
+ * configuration modes
*
* This struct should be used by the driver to fill the data for
* a parameter it registers.
@@ -482,13 +495,20 @@ struct devlink_param {
enum devlink_param_type type;
unsigned long supported_cmodes;
int (*get)(struct devlink *devlink, u32 id,
- struct devlink_param_gset_ctx *ctx);
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack);
int (*set)(struct devlink *devlink, u32 id,
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack);
int (*validate)(struct devlink *devlink, u32 id,
union devlink_param_value val,
struct netlink_ext_ack *extack);
+ int (*get_default)(struct devlink *devlink, u32 id,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack);
+ int (*reset_default)(struct devlink *devlink, u32 id,
+ enum devlink_param_cmode cmode,
+ struct netlink_ext_ack *extack);
};
struct devlink_param_item {
@@ -500,6 +520,7 @@ struct devlink_param_item {
* until reload.
*/
bool driverinit_value_new_valid;
+ union devlink_param_value driverinit_default;
};
enum devlink_param_generic_id {
@@ -520,6 +541,11 @@ enum devlink_param_generic_id {
DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP,
DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE,
DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE,
+ DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC,
+ DEVLINK_PARAM_GENERIC_ID_CLOCK_ID,
+ DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS,
+ DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS,
+ DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF,
/* add new param generic ids above here*/
__DEVLINK_PARAM_GENERIC_ID_MAX,
@@ -578,6 +604,21 @@ enum devlink_param_generic_id {
#define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME "event_eq_size"
#define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE DEVLINK_PARAM_TYPE_U32
+#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME "enable_phc"
+#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE DEVLINK_PARAM_TYPE_BOOL
+
+#define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id"
+#define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64
+
+#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME "total_vfs"
+#define DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE DEVLINK_PARAM_TYPE_U32
+
+#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME "num_doorbells"
+#define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE DEVLINK_PARAM_TYPE_U32
+
+#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME "max_mac_per_vf"
+#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE DEVLINK_PARAM_TYPE_U32
+
#define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \
{ \
.id = DEVLINK_PARAM_GENERIC_ID_##_id, \
@@ -601,6 +642,37 @@ enum devlink_param_generic_id {
.validate = _validate, \
}
+#define DEVLINK_PARAM_GENERIC_WITH_DEFAULTS(_id, _cmodes, _get, _set, \
+ _validate, _get_default, \
+ _reset_default) \
+{ \
+ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \
+ .name = DEVLINK_PARAM_GENERIC_##_id##_NAME, \
+ .type = DEVLINK_PARAM_GENERIC_##_id##_TYPE, \
+ .generic = true, \
+ .supported_cmodes = _cmodes, \
+ .get = _get, \
+ .set = _set, \
+ .validate = _validate, \
+ .get_default = _get_default, \
+ .reset_default = _reset_default, \
+}
+
+#define DEVLINK_PARAM_DRIVER_WITH_DEFAULTS(_id, _name, _type, _cmodes, \
+ _get, _set, _validate, \
+ _get_default, _reset_default) \
+{ \
+ .id = _id, \
+ .name = _name, \
+ .type = _type, \
+ .supported_cmodes = _cmodes, \
+ .get = _get, \
+ .set = _set, \
+ .validate = _validate, \
+ .get_default = _get_default, \
+ .reset_default = _reset_default, \
+}
+
/* Identifier of board design */
#define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID "board.id"
/* Revision of board design */
@@ -730,6 +802,10 @@ enum devlink_health_reporter_state {
* if priv_ctx is NULL, run a full dump
* @diagnose: callback to diagnose the current status
* @test: callback to trigger a test event
+ * @default_graceful_period: default min time (in msec)
+ * between recovery attempts
+ * @default_burst_period: default time (in msec) for
+ * error recoveries before starting the grace period
*/
struct devlink_health_reporter_ops {
@@ -744,6 +820,8 @@ struct devlink_health_reporter_ops {
struct netlink_ext_ack *extack);
int (*test)(struct devlink_health_reporter *reporter,
struct netlink_ext_ack *extack);
+ u64 default_graceful_period;
+ u64 default_burst_period;
};
/**
@@ -1482,6 +1560,9 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
+ int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate,
+ void *priv, u32 *tc_bw,
+ struct netlink_ext_ack *extack);
int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv,
u64 tx_share, struct netlink_ext_ack *extack);
int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv,
@@ -1490,6 +1571,9 @@ struct devlink_ops {
u32 tx_priority, struct netlink_ext_ack *extack);
int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv,
u32 tx_weight, struct netlink_ext_ack *extack);
+ int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate,
+ void *priv, u32 *tc_bw,
+ struct netlink_ext_ack *extack);
int (*rate_node_new)(struct devlink_rate *rate_node, void **priv,
struct netlink_ext_ack *extack);
int (*rate_node_del)(struct devlink_rate *rate_node, void *priv,
@@ -1528,6 +1612,9 @@ struct devlink_ops {
void *devlink_priv(struct devlink *devlink);
struct devlink *priv_to_devlink(void *priv);
struct device *devlink_to_dev(const struct devlink *devlink);
+const char *devlink_bus_name(const struct devlink *devlink);
+const char *devlink_dev_name(const struct devlink *devlink);
+const char *devlink_dev_driver_name(const struct devlink *devlink);
/* Devlink instance explicit locking */
void devl_lock(struct devlink *devlink);
@@ -1561,6 +1648,13 @@ void devlink_register(struct devlink *devlink);
void devlink_unregister(struct devlink *devlink);
void devlink_free(struct devlink *devlink);
+struct devlink *devlink_shd_get(const char *id,
+ const struct devlink_ops *ops,
+ size_t priv_size,
+ const struct device_driver *driver);
+void devlink_shd_put(struct devlink *devlink);
+void *devlink_shd_get_priv(struct devlink *devlink);
+
/**
* struct devlink_port_ops - Port operations
* @port_split: Callback used to split the port into multiple ones.
@@ -1721,7 +1815,7 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port,
struct ib_device *ibdev);
void devlink_port_type_clear(struct devlink_port *devlink_port);
void devlink_port_attrs_set(struct devlink_port *devlink_port,
- struct devlink_port_attrs *devlink_port_attrs);
+ const struct devlink_port_attrs *attrs);
void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller,
u16 pf, bool external);
void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller,
@@ -1792,12 +1886,19 @@ int devl_resource_register(struct devlink *devlink,
u64 resource_size,
u64 resource_id,
u64 parent_resource_id,
- const struct devlink_resource_size_params *size_params);
+ const struct devlink_resource_size_params *params);
void devl_resources_unregister(struct devlink *devlink);
void devlink_resources_unregister(struct devlink *devlink);
int devl_resource_size_get(struct devlink *devlink,
u64 resource_id,
u64 *p_resource_size);
+int
+devl_port_resource_register(struct devlink_port *devlink_port,
+ const char *resource_name,
+ u64 resource_size, u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *params);
+void devl_port_resources_unregister(struct devlink_port *devlink_port);
int devl_dpipe_table_resource_set(struct devlink *devlink,
const char *table_name, u64 resource_id,
u64 resource_units);
@@ -1906,22 +2007,22 @@ void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
struct devlink_health_reporter *
devl_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv);
+ void *priv);
struct devlink_health_reporter *
devlink_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv);
+ void *priv);
struct devlink_health_reporter *
devl_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv);
+ void *priv);
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv);
+ void *priv);
void
devl_health_reporter_destroy(struct devlink_health_reporter *reporter);
diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h
index 32a34dfe8cc5..2f312d1f67d6 100644
--- a/include/net/dropreason-core.h
+++ b/include/net/dropreason-core.h
@@ -40,8 +40,12 @@
FN(TCP_OFOMERGE) \
FN(TCP_RFC7323_PAWS) \
FN(TCP_RFC7323_PAWS_ACK) \
+ FN(TCP_RFC7323_TW_PAWS) \
+ FN(TCP_RFC7323_TSECR) \
+ FN(TCP_LISTEN_OVERFLOW) \
FN(TCP_OLD_SEQUENCE) \
FN(TCP_INVALID_SEQUENCE) \
+ FN(TCP_INVALID_END_SEQUENCE) \
FN(TCP_INVALID_ACK_SEQUENCE) \
FN(TCP_RESET) \
FN(TCP_INVALID_SYN) \
@@ -59,21 +63,20 @@
FN(NEIGH_FAILED) \
FN(NEIGH_QUEUEFULL) \
FN(NEIGH_DEAD) \
+ FN(NEIGH_HH_FILLFAIL) \
FN(TC_EGRESS) \
FN(SECURITY_HOOK) \
FN(QDISC_DROP) \
- FN(QDISC_OVERLIMIT) \
- FN(QDISC_CONGESTED) \
- FN(CAKE_FLOOD) \
- FN(FQ_BAND_LIMIT) \
- FN(FQ_HORIZON_LIMIT) \
- FN(FQ_FLOW_LIMIT) \
+ FN(QDISC_BURST_DROP) \
FN(CPU_BACKLOG) \
+ FN(MACVLAN_BROADCAST_BACKLOG) \
+ FN(IPVLAN_MULTICAST_BACKLOG) \
FN(XDP) \
FN(TC_INGRESS) \
FN(UNHANDLED_PROTO) \
FN(SKB_CSUM) \
FN(SKB_GSO_SEG) \
+ FN(SKB_BAD_GSO) \
FN(SKB_UCOPY_FAULT) \
FN(DEV_HDR) \
FN(DEV_READY) \
@@ -96,6 +99,7 @@
FN(FRAG_TOO_FAR) \
FN(TCP_MINTTL) \
FN(IPV6_BAD_EXTHDR) \
+ FN(IPV6_TOO_MANY_EXTHDRS) \
FN(IPV6_NDISC_FRAG) \
FN(IPV6_NDISC_HOP_LIMIT) \
FN(IPV6_NDISC_BAD_CODE) \
@@ -117,6 +121,13 @@
FN(ARP_PVLAN_DISABLE) \
FN(MAC_IEEE_MAC_CONTROL) \
FN(BRIDGE_INGRESS_STP_STATE) \
+ FN(CAN_RX_INVALID_FRAME) \
+ FN(CANFD_RX_INVALID_FRAME) \
+ FN(CANXL_RX_INVALID_FRAME) \
+ FN(PFMEMALLOC) \
+ FN(PSP_INPUT) \
+ FN(PSP_OUTPUT) \
+ FN(RECURSION_LIMIT) \
FNe(MAX)
/**
@@ -281,11 +292,30 @@ enum skb_drop_reason {
* Corresponds to LINUX_MIB_PAWS_OLD_ACK.
*/
SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK,
+ /**
+ * @SKB_DROP_REASON_TCP_RFC7323_TW_PAWS: PAWS check, socket is in
+ * TIME_WAIT state.
+ * Corresponds to LINUX_MIB_PAWS_TW_REJECTED.
+ */
+ SKB_DROP_REASON_TCP_RFC7323_TW_PAWS,
+ /**
+ * @SKB_DROP_REASON_TCP_RFC7323_TSECR: PAWS check, invalid TSEcr.
+ * Corresponds to LINUX_MIB_TSECRREJECTED.
+ */
+ SKB_DROP_REASON_TCP_RFC7323_TSECR,
+ /** @SKB_DROP_REASON_TCP_LISTEN_OVERFLOW: listener queue full. */
+ SKB_DROP_REASON_TCP_LISTEN_OVERFLOW,
/** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */
SKB_DROP_REASON_TCP_OLD_SEQUENCE,
- /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */
+ /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field. */
SKB_DROP_REASON_TCP_INVALID_SEQUENCE,
/**
+ * @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE:
+ * Not acceptable END_SEQ field.
+ * Corresponds to LINUX_MIB_BEYOND_WINDOW.
+ */
+ SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE,
+ /**
* @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ
* field because ack sequence is not in the window between snd_una
* and snd_nxt
@@ -332,51 +362,40 @@ enum skb_drop_reason {
SKB_DROP_REASON_NEIGH_QUEUEFULL,
/** @SKB_DROP_REASON_NEIGH_DEAD: neigh entry is dead */
SKB_DROP_REASON_NEIGH_DEAD,
+ /** @SKB_DROP_REASON_NEIGH_HH_FILLFAIL: failed to fill the device hard header */
+ SKB_DROP_REASON_NEIGH_HH_FILLFAIL,
/** @SKB_DROP_REASON_TC_EGRESS: dropped in TC egress HOOK */
SKB_DROP_REASON_TC_EGRESS,
/** @SKB_DROP_REASON_SECURITY_HOOK: dropped due to security HOOK */
SKB_DROP_REASON_SECURITY_HOOK,
/**
- * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc when packet outputting (
- * failed to enqueue to current qdisc)
+ * @SKB_DROP_REASON_QDISC_DROP: dropped by qdisc during enqueue or
+ * dequeue. More specific drop reasons are available via the
+ * qdisc:qdisc_drop tracepoint, which also provides qdisc handle
+ * and name for identifying the source.
*/
SKB_DROP_REASON_QDISC_DROP,
/**
- * @SKB_DROP_REASON_QDISC_OVERLIMIT: dropped by qdisc when a qdisc
- * instance exceeds its total buffer size limit.
- */
- SKB_DROP_REASON_QDISC_OVERLIMIT,
- /**
- * @SKB_DROP_REASON_QDISC_CONGESTED: dropped by a qdisc AQM algorithm
- * due to congestion.
- */
- SKB_DROP_REASON_QDISC_CONGESTED,
- /**
- * @SKB_DROP_REASON_CAKE_FLOOD: dropped by the flood protection part of
- * CAKE qdisc AQM algorithm (BLUE).
- */
- SKB_DROP_REASON_CAKE_FLOOD,
- /**
- * @SKB_DROP_REASON_FQ_BAND_LIMIT: dropped by fq qdisc when per band
- * limit is reached.
- */
- SKB_DROP_REASON_FQ_BAND_LIMIT,
- /**
- * @SKB_DROP_REASON_FQ_HORIZON_LIMIT: dropped by fq qdisc when packet
- * timestamp is too far in the future.
- */
- SKB_DROP_REASON_FQ_HORIZON_LIMIT,
- /**
- * @SKB_DROP_REASON_FQ_FLOW_LIMIT: dropped by fq qdisc when a flow
- * exceeds its limits.
+ * @SKB_DROP_REASON_QDISC_BURST_DROP: dropped when net.core.qdisc_max_burst
+ * limit is hit.
*/
- SKB_DROP_REASON_FQ_FLOW_LIMIT,
+ SKB_DROP_REASON_QDISC_BURST_DROP,
/**
* @SKB_DROP_REASON_CPU_BACKLOG: failed to enqueue the skb to the per CPU
* backlog queue. This can be caused by backlog queue full (see
* netdev_max_backlog in net.rst) or RPS flow limit
*/
SKB_DROP_REASON_CPU_BACKLOG,
+ /**
+ * @SKB_DROP_REASON_MACVLAN_BROADCAST_BACKLOG: failed to enqueue the skb
+ * to macvlan broadcast queue.
+ */
+ SKB_DROP_REASON_MACVLAN_BROADCAST_BACKLOG,
+ /**
+ * @SKB_DROP_REASON_IPVLAN_MULTICAST_BACKLOG: failed to enqueue the skb
+ * to ipvlan multicast queue.
+ */
+ SKB_DROP_REASON_IPVLAN_MULTICAST_BACKLOG,
/** @SKB_DROP_REASON_XDP: dropped by XDP in input path */
SKB_DROP_REASON_XDP,
/** @SKB_DROP_REASON_TC_INGRESS: dropped in TC ingress HOOK */
@@ -387,6 +406,8 @@ enum skb_drop_reason {
SKB_DROP_REASON_SKB_CSUM,
/** @SKB_DROP_REASON_SKB_GSO_SEG: gso segmentation error */
SKB_DROP_REASON_SKB_GSO_SEG,
+ /** @SKB_DROP_REASON_SKB_BAD_GSO: malicious gso packet. */
+ SKB_DROP_REASON_SKB_BAD_GSO,
/**
* @SKB_DROP_REASON_SKB_UCOPY_FAULT: failed to copy data from user space,
* e.g., via zerocopy_sg_from_iter() or skb_orphan_frags_rx()
@@ -474,6 +495,11 @@ enum skb_drop_reason {
SKB_DROP_REASON_TCP_MINTTL,
/** @SKB_DROP_REASON_IPV6_BAD_EXTHDR: Bad IPv6 extension header. */
SKB_DROP_REASON_IPV6_BAD_EXTHDR,
+ /**
+ * @SKB_DROP_REASON_IPV6_TOO_MANY_EXTHDRS: Number of IPv6 extension
+ * headers in the packet exceeds IP6_MAX_EXT_HDRS_CNT.
+ */
+ SKB_DROP_REASON_IPV6_TOO_MANY_EXTHDRS,
/** @SKB_DROP_REASON_IPV6_NDISC_FRAG: invalid frag (suppress_frag_ndisc). */
SKB_DROP_REASON_IPV6_NDISC_FRAG,
/** @SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT: invalid hop limit. */
@@ -555,6 +581,32 @@ enum skb_drop_reason {
*/
SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE,
/**
+ * @SKB_DROP_REASON_CAN_RX_INVALID_FRAME: received
+ * non conform CAN frame (or device is unable to receive CAN frames)
+ */
+ SKB_DROP_REASON_CAN_RX_INVALID_FRAME,
+ /**
+ * @SKB_DROP_REASON_CANFD_RX_INVALID_FRAME: received
+ * non conform CAN-FD frame (or device is unable to receive CAN frames)
+ */
+ SKB_DROP_REASON_CANFD_RX_INVALID_FRAME,
+ /**
+ * @SKB_DROP_REASON_CANXL_RX_INVALID_FRAME: received
+ * non conform CAN-XL frame (or device is unable to receive CAN frames)
+ */
+ SKB_DROP_REASON_CANXL_RX_INVALID_FRAME,
+ /**
+ * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve
+ * reached a path or socket not eligible for use of memory reserves
+ */
+ SKB_DROP_REASON_PFMEMALLOC,
+ /** @SKB_DROP_REASON_PSP_INPUT: PSP input checks failed */
+ SKB_DROP_REASON_PSP_INPUT,
+ /** @SKB_DROP_REASON_PSP_OUTPUT: PSP output checks failed */
+ SKB_DROP_REASON_PSP_OUTPUT,
+ /** @SKB_DROP_REASON_RECURSION_LIMIT: Dead loop on virtual device. */
+ SKB_DROP_REASON_RECURSION_LIMIT,
+ /**
* @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which
* shouldn't be used as a real 'reason' - only for tracing code gen
*/
diff --git a/include/net/dropreason-qdisc.h b/include/net/dropreason-qdisc.h
new file mode 100644
index 000000000000..fb151cd31751
--- /dev/null
+++ b/include/net/dropreason-qdisc.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_DROPREASON_QDISC_H
+#define _LINUX_DROPREASON_QDISC_H
+#include <net/dropreason.h>
+
+#define DEFINE_QDISC_DROP_REASON(FN, FNe) \
+ FN(UNSPEC) \
+ FN(GENERIC) \
+ FN(OVERLIMIT) \
+ FN(CONGESTED) \
+ FN(MAXFLOWS) \
+ FN(FLOOD_PROTECTION) \
+ FN(BAND_LIMIT) \
+ FN(HORIZON_LIMIT) \
+ FN(FLOW_LIMIT) \
+ FN(L4S_STEP_NON_ECN) \
+ FNe(MAX)
+
+#undef FN
+#undef FNe
+#define FN(reason) QDISC_DROP_##reason,
+#define FNe(reason) QDISC_DROP_##reason
+
+/**
+ * enum qdisc_drop_reason - reason why a qdisc dropped a packet
+ *
+ * Qdisc-specific drop reasons for packet drops that occur within the
+ * traffic control (TC) queueing discipline layer. These reasons provide
+ * detailed diagnostics about why packets were dropped by various qdisc
+ * algorithms, enabling fine-grained monitoring and troubleshooting of
+ * queue behavior.
+ */
+enum qdisc_drop_reason {
+ /**
+ * @QDISC_DROP_UNSPEC: unspecified/invalid qdisc drop reason.
+ * Value 0 serves as analogous to SKB_NOT_DROPPED_YET for enum skb_drop_reason.
+ * Used for catching zero-initialized drop_reason fields.
+ */
+ QDISC_DROP_UNSPEC = 0,
+ /**
+ * @__QDISC_DROP_REASON: subsystem base value for qdisc drop reasons
+ */
+ __QDISC_DROP_REASON = SKB_DROP_REASON_SUBSYS_QDISC <<
+ SKB_DROP_REASON_SUBSYS_SHIFT,
+ /**
+ * @QDISC_DROP_GENERIC: generic/default qdisc drop, used when no
+ * more specific reason applies
+ */
+ QDISC_DROP_GENERIC,
+ /**
+ * @QDISC_DROP_OVERLIMIT: packet dropped because the qdisc queue
+ * length exceeded its configured limit (sch->limit). This typically
+ * indicates the queue is full and cannot accept more packets.
+ */
+ QDISC_DROP_OVERLIMIT,
+ /**
+ * @QDISC_DROP_CONGESTED: packet dropped due to active congestion
+ * control algorithms (e.g., CoDel, PIE, RED) detecting network
+ * congestion. The qdisc proactively dropped the packet to signal
+ * congestion to the sender and prevent bufferbloat.
+ */
+ QDISC_DROP_CONGESTED,
+ /**
+ * @QDISC_DROP_MAXFLOWS: packet dropped because the qdisc's flow
+ * tracking table is full and no free slots are available to allocate
+ * for a new flow. This indicates flow table exhaustion in flow-based
+ * qdiscs that maintain per-flow state (e.g., SFQ).
+ */
+ QDISC_DROP_MAXFLOWS,
+ /**
+ * @QDISC_DROP_FLOOD_PROTECTION: packet dropped by flood protection
+ * mechanism detecting unresponsive flows (potential DoS/flood).
+ * Used by qdiscs implementing probabilistic drop algorithms like
+ * BLUE (e.g., CAKE's Cobalt AQM).
+ */
+ QDISC_DROP_FLOOD_PROTECTION,
+ /**
+ * @QDISC_DROP_BAND_LIMIT: packet dropped because the priority band's
+ * limit was reached. Used by qdiscs with priority bands that have
+ * per-band packet limits (e.g., FQ).
+ */
+ QDISC_DROP_BAND_LIMIT,
+ /**
+ * @QDISC_DROP_HORIZON_LIMIT: packet dropped because its timestamp
+ * is too far in the future (beyond the configured horizon).
+ * Used by qdiscs with time-based scheduling (e.g., FQ).
+ */
+ QDISC_DROP_HORIZON_LIMIT,
+ /**
+ * @QDISC_DROP_FLOW_LIMIT: packet dropped because an individual flow
+ * exceeded its per-flow packet/depth limit. Used by FQ and SFQ qdiscs
+ * to enforce per-flow fairness and prevent a single flow from
+ * monopolizing queue resources.
+ */
+ QDISC_DROP_FLOW_LIMIT,
+ /**
+ * @QDISC_DROP_L4S_STEP_NON_ECN: DualPI2 qdisc dropped a non-ECN-capable
+ * packet because the L4S queue delay exceeded the step threshold.
+ * Since the packet cannot be ECN-marked, it must be dropped to signal
+ * congestion. See RFC 9332 for the DualQ Coupled AQM step mechanism.
+ */
+ QDISC_DROP_L4S_STEP_NON_ECN,
+ /**
+ * @QDISC_DROP_MAX: the maximum of qdisc drop reasons, which
+ * shouldn't be used as a real 'reason' - only for tracing code gen
+ */
+ QDISC_DROP_MAX,
+};
+
+#undef FN
+#undef FNe
+
+#endif
diff --git a/include/net/dropreason.h b/include/net/dropreason.h
index 56cb7be92244..1df60645fb27 100644
--- a/include/net/dropreason.h
+++ b/include/net/dropreason.h
@@ -18,17 +18,17 @@ enum skb_drop_reason_subsys {
SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE,
/**
- * @SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR: mac80211 drop reasons
- * for frames still going to monitor, see net/mac80211/drop.h
- */
- SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR,
-
- /**
* @SKB_DROP_REASON_SUBSYS_OPENVSWITCH: openvswitch drop reasons,
* see net/openvswitch/drop.h
*/
SKB_DROP_REASON_SUBSYS_OPENVSWITCH,
+ /**
+ * @SKB_DROP_REASON_SUBSYS_QDISC: TC qdisc drop reasons,
+ * see include/net/dropreason-qdisc.h
+ */
+ SKB_DROP_REASON_SUBSYS_QDISC,
+
/** @SKB_DROP_REASON_SUBSYS_NUM: number of subsystems defined */
SKB_DROP_REASON_SUBSYS_NUM
};
diff --git a/include/net/dsa.h b/include/net/dsa.h
index a0a9481c52c2..8b6d34e8a6f0 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -54,11 +54,16 @@ struct tc_action;
#define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26
#define DSA_TAG_PROTO_LAN937X_VALUE 27
#define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28
+#define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29
+#define DSA_TAG_PROTO_YT921X_VALUE 30
+#define DSA_TAG_PROTO_MXL_GSW1XX_VALUE 31
+#define DSA_TAG_PROTO_MXL862_VALUE 32
enum dsa_tag_protocol {
DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE,
DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE,
DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE,
+ DSA_TAG_PROTO_BRCM_LEGACY_FCS = DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE,
DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE,
DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE,
DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE,
@@ -85,6 +90,9 @@ enum dsa_tag_protocol {
DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE,
DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE,
DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE,
+ DSA_TAG_PROTO_YT921X = DSA_TAG_PROTO_YT921X_VALUE,
+ DSA_TAG_PROTO_MXL_GSW1XX = DSA_TAG_PROTO_MXL_GSW1XX_VALUE,
+ DSA_TAG_PROTO_MXL862 = DSA_TAG_PROTO_MXL862_VALUE,
};
struct dsa_switch;
@@ -210,12 +218,6 @@ struct dsa_mall_mirror_tc_entry {
bool ingress;
};
-/* TC port policer entry */
-struct dsa_mall_policer_tc_entry {
- u32 burst;
- u64 rate_bytes_per_sec;
-};
-
/* TC matchall entry */
struct dsa_mall_tc_entry {
struct list_head list;
@@ -223,7 +225,7 @@ struct dsa_mall_tc_entry {
enum dsa_port_mall_action_type type;
union {
struct dsa_mall_mirror_tc_entry mirror;
- struct dsa_mall_policer_tc_entry policer;
+ struct flow_action_police policer;
};
};
@@ -296,6 +298,7 @@ struct dsa_port {
struct devlink_port devlink_port;
struct phylink *pl;
struct phylink_config pl_config;
+ netdevice_tracker conduit_tracker;
struct dsa_lag *lag;
struct net_device *hsr_dev;
@@ -828,6 +831,22 @@ dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst,
return false;
}
+#define dsa_switch_for_each_bridge_member(_dp, _ds, _bdev) \
+ dsa_switch_for_each_user_port(_dp, _ds) \
+ if (dsa_port_offloads_bridge_dev(_dp, _bdev))
+
+static inline u32
+dsa_bridge_ports(struct dsa_switch *ds, const struct net_device *bdev)
+{
+ struct dsa_port *dp;
+ u32 mask = 0;
+
+ dsa_switch_for_each_bridge_member(dp, ds, bdev)
+ mask |= BIT(dp->index);
+
+ return mask;
+}
+
static inline bool dsa_port_tree_same(const struct dsa_port *a,
const struct dsa_port *b)
{
@@ -1103,7 +1122,7 @@ struct dsa_switch_ops {
void (*port_mirror_del)(struct dsa_switch *ds, int port,
struct dsa_mall_mirror_tc_entry *mirror);
int (*port_policer_add)(struct dsa_switch *ds, int port,
- struct dsa_mall_policer_tc_entry *policer);
+ const struct flow_action_police *policer);
void (*port_policer_del)(struct dsa_switch *ds, int port);
int (*port_setup_tc)(struct dsa_switch *ds, int port,
enum tc_setup_type type, void *type_data);
@@ -1131,9 +1150,10 @@ struct dsa_switch_ops {
* PTP functionality
*/
int (*port_hwtstamp_get)(struct dsa_switch *ds, int port,
- struct ifreq *ifr);
+ struct kernel_hwtstamp_config *config);
int (*port_hwtstamp_set)(struct dsa_switch *ds, int port,
- struct ifreq *ifr);
+ struct kernel_hwtstamp_config *config,
+ struct netlink_ext_ack *extack);
void (*port_txtstamp)(struct dsa_switch *ds, int port,
struct sk_buff *skb);
bool (*port_rxtstamp)(struct dsa_switch *ds, int port,
@@ -1244,7 +1264,8 @@ struct dsa_switch_ops {
dsa_devlink_param_get, dsa_devlink_param_set, NULL)
int dsa_devlink_param_get(struct devlink *dl, u32 id,
- struct devlink_param_gset_ctx *ctx);
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack);
int dsa_devlink_param_set(struct devlink *dl, u32 id,
struct devlink_param_gset_ctx *ctx,
struct netlink_ext_ack *extack);
@@ -1307,11 +1328,6 @@ static inline int dsa_devlink_port_to_port(struct devlink_port *port)
return port->index;
}
-struct dsa_switch_driver {
- struct list_head list;
- const struct dsa_switch_ops *ops;
-};
-
bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port,
const unsigned char *addr, u16 vid,
struct dsa_db db);
@@ -1319,6 +1335,15 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
const struct switchdev_obj_port_mdb *mdb,
struct dsa_db db);
+int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port,
+ struct net_device *hsr,
+ struct netlink_ext_ack *extack);
+int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port,
+ struct net_device *hsr,
+ struct netlink_ext_ack *extack);
+int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port,
+ struct net_device *hsr);
+
/* Keep inline for faster access in hot path */
static inline bool netdev_uses_dsa(const struct net_device *dev)
{
diff --git a/include/net/dst.h b/include/net/dst.h
index 78c78cdce0e9..307073eae7f8 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -24,7 +24,10 @@
struct sk_buff;
struct dst_entry {
- struct net_device *dev;
+ union {
+ struct net_device *dev;
+ struct net_device __rcu *dev_rcu;
+ };
struct dst_ops *ops;
unsigned long _metrics;
unsigned long expires;
@@ -216,6 +219,12 @@ static inline u32 dst_mtu(const struct dst_entry *dst)
return INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst);
}
+/* Variant of dst_mtu() for IPv4 users. */
+static inline u32 dst4_mtu(const struct dst_entry *dst)
+{
+ return INDIRECT_CALL_1(dst->ops->mtu, ipv4_mtu, dst);
+}
+
/* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */
static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric)
{
@@ -240,9 +249,9 @@ static inline void dst_hold(struct dst_entry *dst)
static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
{
- if (unlikely(time != dst->lastuse)) {
+ if (unlikely(time != READ_ONCE(dst->lastuse))) {
dst->__use++;
- dst->lastuse = time;
+ WRITE_ONCE(dst->lastuse, time);
}
}
@@ -431,13 +440,15 @@ static inline void dst_link_failure(struct sk_buff *skb)
static inline void dst_set_expires(struct dst_entry *dst, int timeout)
{
- unsigned long expires = jiffies + timeout;
+ unsigned long old, expires = jiffies + timeout;
if (expires == 0)
expires = 1;
- if (dst->expires == 0 || time_before(expires, dst->expires))
- dst->expires = expires;
+ old = READ_ONCE(dst->expires);
+
+ if (!old || time_before(expires, old))
+ WRITE_ONCE(dst->expires, expires);
}
static inline unsigned int dst_dev_overhead(struct dst_entry *dst,
@@ -456,7 +467,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *,
/* Output packet to network from transport. */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- return INDIRECT_CALL_INET(skb_dst(skb)->output,
+ return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output),
ip6_output, ip_output,
net, sk, skb);
}
@@ -466,7 +477,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *));
/* Input packet from network to transport. */
static inline int dst_input(struct sk_buff *skb)
{
- return INDIRECT_CALL_INET(skb_dst(skb)->input,
+ return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input),
ip6_input, ip_local_deliver, skb);
}
@@ -476,7 +487,7 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
u32));
static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
{
- if (dst->obsolete)
+ if (READ_ONCE(dst->obsolete))
dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check,
ipv4_dst_check, dst, cookie);
return dst;
@@ -561,6 +572,41 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}
+static inline struct net_device *dst_dev(const struct dst_entry *dst)
+{
+ return READ_ONCE(dst->dev);
+}
+
+static inline struct net_device *dst_dev_rcu(const struct dst_entry *dst)
+{
+ return rcu_dereference(dst->dev_rcu);
+}
+
+static inline struct net *dst_dev_net_rcu(const struct dst_entry *dst)
+{
+ return dev_net_rcu(dst_dev_rcu(dst));
+}
+
+static inline struct net_device *skb_dst_dev(const struct sk_buff *skb)
+{
+ return dst_dev(skb_dst(skb));
+}
+
+static inline struct net_device *skb_dst_dev_rcu(const struct sk_buff *skb)
+{
+ return dst_dev_rcu(skb_dst(skb));
+}
+
+static inline struct net *skb_dst_dev_net(const struct sk_buff *skb)
+{
+ return dev_net(skb_dst_dev(skb));
+}
+
+static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb)
+{
+ return dev_net_rcu(skb_dst_dev_rcu(skb));
+}
+
struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu, bool confirm_neigh);
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 84c15402931c..1fc2fb03ce3f 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -3,6 +3,7 @@
#define __NET_DST_METADATA_H 1
#include <linux/skbuff.h>
+#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/macsec.h>
#include <net/dst.h>
@@ -163,11 +164,8 @@ static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
if (!new_md)
return ERR_PTR(-ENOMEM);
- unsafe_memcpy(&new_md->u.tun_info, &md_dst->u.tun_info,
- sizeof(struct ip_tunnel_info) + md_size,
- /* metadata_dst_alloc() reserves room (md_size bytes) for
- * options right after the ip_tunnel_info struct.
- */);
+ memcpy(&new_md->u.tun_info, &md_dst->u.tun_info,
+ sizeof(struct ip_tunnel_info) + md_size);
#ifdef CONFIG_DST_CACHE
/* Unclone the dst cache if there is one */
if (new_md->u.tun_info.dst_cache.cache) {
@@ -223,9 +221,15 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
int md_size)
{
const struct iphdr *iph = ip_hdr(skb);
+ struct metadata_dst *tun_dst;
+
+ tun_dst = __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl,
+ 0, flags, tunnel_id, md_size);
- return __ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl,
- 0, flags, tunnel_id, md_size);
+ if (tun_dst && (iph->frag_off & htons(IP_DF)))
+ __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
+ tun_dst->u.tun_info.key.tun_flags);
+ return tun_dst;
}
static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *saddr,
diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 04383d90a1e3..6e68e359ad18 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -43,6 +43,10 @@ struct fib_rule {
struct fib_kuid_range uid_range;
struct fib_rule_port_range sport_range;
struct fib_rule_port_range dport_range;
+ u16 sport_mask;
+ u16 dport_mask;
+ u8 iif_is_l3_master;
+ u8 oif_is_l3_master;
struct rcu_head rcu;
};
@@ -146,6 +150,17 @@ static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a,
ntohs(port) <= a->end;
}
+static inline bool fib_rule_port_match(const struct fib_rule_port_range *range,
+ u16 port_mask, __be16 port)
+{
+ if ((range->start ^ ntohs(port)) & port_mask)
+ return false;
+ if (!port_mask && fib_rule_port_range_set(range) &&
+ !fib_rule_port_inrange(range, port))
+ return false;
+ return true;
+}
+
static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a)
{
return a->start != 0 && a->end != 0 && a->end < 0xffff &&
@@ -159,6 +174,12 @@ static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a,
a->end == b->end;
}
+static inline bool
+fib_rule_port_is_range(const struct fib_rule_port_range *range)
+{
+ return range->start != range->end;
+}
+
static inline bool fib_rule_requires_fldissect(struct fib_rule *rule)
{
return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto ||
@@ -178,10 +199,10 @@ int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
struct netlink_ext_ack *extack);
unsigned int fib_rules_seq_read(const struct net *net, int family);
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack);
-int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack);
+int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held);
+int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held);
INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule,
struct flowi *fl, int flags));
diff --git a/include/net/flow.h b/include/net/flow.h
index 335bbc52171c..ae9481c40063 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -12,6 +12,7 @@
#include <linux/atomic.h>
#include <linux/container_of.h>
#include <linux/uidgid.h>
+#include <net/inet_dscp.h>
struct flow_keys;
@@ -32,12 +33,14 @@ struct flowi_common {
int flowic_iif;
int flowic_l3mdev;
__u32 flowic_mark;
- __u8 flowic_tos;
+ dscp_t flowic_dscp;
__u8 flowic_scope;
__u8 flowic_proto;
__u8 flowic_flags;
#define FLOWI_FLAG_ANYSRC 0x01
#define FLOWI_FLAG_KNOWN_NH 0x02
+#define FLOWI_FLAG_L3MDEV_OIF 0x04
+#define FLOWI_FLAG_ANY_SPORT 0x08
__u32 flowic_secid;
kuid_t flowic_uid;
__u32 flowic_multipath_hash;
@@ -68,7 +71,7 @@ struct flowi4 {
#define flowi4_iif __fl_common.flowic_iif
#define flowi4_l3mdev __fl_common.flowic_l3mdev
#define flowi4_mark __fl_common.flowic_mark
-#define flowi4_tos __fl_common.flowic_tos
+#define flowi4_dscp __fl_common.flowic_dscp
#define flowi4_scope __fl_common.flowic_scope
#define flowi4_proto __fl_common.flowic_proto
#define flowi4_flags __fl_common.flowic_flags
@@ -101,7 +104,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif,
fl4->flowi4_iif = LOOPBACK_IFINDEX;
fl4->flowi4_l3mdev = 0;
fl4->flowi4_mark = mark;
- fl4->flowi4_tos = tos;
+ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos);
fl4->flowi4_scope = scope;
fl4->flowi4_proto = proto;
fl4->flowi4_flags = flags;
@@ -139,7 +142,7 @@ struct flowi6 {
#define flowi6_uid __fl_common.flowic_uid
struct in6_addr daddr;
struct in6_addr saddr;
- /* Note: flowi6_tos is encoded in flowlabel, too. */
+ /* Note: flowi6_dscp is encoded in flowlabel, too. */
__be32 flowlabel;
union flowi_uli uli;
#define fl6_sport uli.ports.sport
@@ -161,7 +164,7 @@ struct flowi {
#define flowi_iif u.__fl_common.flowic_iif
#define flowi_l3mdev u.__fl_common.flowic_l3mdev
#define flowi_mark u.__fl_common.flowic_mark
-#define flowi_tos u.__fl_common.flowic_tos
+#define flowi_dscp u.__fl_common.flowic_dscp
#define flowi_scope u.__fl_common.flowic_scope
#define flowi_proto u.__fl_common.flowic_proto
#define flowi_flags u.__fl_common.flowic_flags
diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 596ab9791e4d..70a02ee14308 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -231,6 +231,21 @@ struct flow_action_cookie *flow_action_cookie_create(void *data,
gfp_t gfp);
void flow_action_cookie_destroy(struct flow_action_cookie *cookie);
+struct flow_action_police {
+ u32 burst;
+ u64 rate_bytes_ps;
+ u64 peakrate_bytes_ps;
+ u32 avrate;
+ u16 overhead;
+ u64 burst_pkt;
+ u64 rate_pkt_ps;
+ u32 mtu;
+ struct {
+ enum flow_action_id act_id;
+ u32 extval;
+ } exceed, notexceed;
+};
+
struct flow_action_entry {
enum flow_action_id id;
u32 hw_index;
@@ -275,20 +290,7 @@ struct flow_action_entry {
u32 trunc_size;
bool truncate;
} sample;
- struct { /* FLOW_ACTION_POLICE */
- u32 burst;
- u64 rate_bytes_ps;
- u64 peakrate_bytes_ps;
- u32 avrate;
- u16 overhead;
- u64 burst_pkt;
- u64 rate_pkt_ps;
- u32 mtu;
- struct {
- enum flow_action_id act_id;
- u32 extval;
- } exceed, notexceed;
- } police;
+ struct flow_action_police police; /* FLOW_ACTION_POLICE */
struct { /* FLOW_ACTION_CT */
int action;
u16 zone;
@@ -526,7 +528,7 @@ static inline bool flow_rule_has_enc_control_flags(const u32 enc_ctrl_flags,
*
* Return: true if control flags are set, false otherwise.
*/
-static inline bool flow_rule_match_has_control_flags(struct flow_rule *rule,
+static inline bool flow_rule_match_has_control_flags(const struct flow_rule *rule,
struct netlink_ext_ack *extack)
{
struct flow_match_control match;
@@ -718,7 +720,7 @@ struct flow_offload_action {
struct flow_offload_action *offload_action_alloc(unsigned int num_actions);
static inline struct flow_rule *
-flow_cls_offload_flow_rule(struct flow_cls_offload *flow_cmd)
+flow_cls_offload_flow_rule(const struct flow_cls_offload *flow_cmd)
{
return flow_cmd->rule;
}
diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h
index 9467e33dfb36..8aca881937da 100644
--- a/include/net/fq_impl.h
+++ b/include/net/fq_impl.h
@@ -358,7 +358,7 @@ static int fq_init(struct fq *fq, int flows_cnt)
fq->limit = 8192;
fq->memory_limit = 16 << 20; /* 16 MBytes */
- fq->flows = kvcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL);
+ fq->flows = kvzalloc_objs(fq->flows[0], fq->flows_cnt);
if (!fq->flows)
return -ENOMEM;
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index a03d56765832..d70510ac31ab 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -62,7 +62,7 @@ struct genl_info;
* @small_ops: the small-struct operations supported by this family
* @n_small_ops: number of small-struct operations supported by this family
* @split_ops: the split do/dump form of operation definition
- * @n_split_ops: number of entries in @split_ops, not that with split do/dump
+ * @n_split_ops: number of entries in @split_ops, note that with split do/dump
* ops the number of entries is not the same as number of commands
* @sock_priv_size: the size of per-socket private memory
* @sock_priv_init: the per-socket private memory initializer
@@ -489,8 +489,10 @@ genlmsg_multicast_netns_filtered(const struct genl_family *family,
netlink_filter_fn filter,
void *filter_data)
{
- if (WARN_ON_ONCE(group >= family->n_mcgrps))
+ if (WARN_ON_ONCE(group >= family->n_mcgrps)) {
+ nlmsg_free(skb);
return -EINVAL;
+ }
group = family->mcgrp_offset + group;
return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group,
flags, filter, filter_data);
diff --git a/include/net/gro.h b/include/net/gro.h
index 7b548f91754b..2300b6da05b2 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -71,14 +71,11 @@ struct napi_gro_cb {
/* Free the skb? */
u8 free:2;
- /* Used in foo-over-udp, set in udp[46]_gro_receive */
- u8 is_ipv6:1;
-
/* Used in GRE, set in fou/gue_gro_receive */
u8 is_fou:1;
/* Used to determine if ipid_offset can be ignored */
- u8 ip_fixedid:1;
+ u8 ip_fixedid:2;
/* Number of gro_receive callbacks this packet already went through */
u8 recursion_counter:4;
@@ -408,9 +405,8 @@ INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *,
- struct sk_buff *));
-INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int));
+struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *);
+int udp6_gro_complete(struct sk_buff *, int);
#define indirect_call_gro_receive_inet(cb, f2, f1, head, skb) \
({ \
@@ -445,29 +441,25 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb,
}
static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2,
- struct sk_buff *p, bool outer)
+ struct sk_buff *p, bool inner)
{
const u32 id = ntohl(*(__be32 *)&iph->id);
const u32 id2 = ntohl(*(__be32 *)&iph2->id);
const u16 ipid_offset = (id >> 16) - (id2 >> 16);
const u16 count = NAPI_GRO_CB(p)->count;
- const u32 df = id & IP_DF;
- int flush;
/* All fields must match except length and checksum. */
- flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF));
-
- if (flush | (outer && df))
- return flush;
+ if ((iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | ((id ^ id2) & IP_DF))
+ return true;
/* When we receive our second frame we can make a decision on if we
* continue this flow as an atomic flow with a fixed ID or if we use
* an incrementing ID.
*/
- if (count == 1 && df && !ipid_offset)
- NAPI_GRO_CB(p)->ip_fixedid = true;
+ if (count == 1 && !ipid_offset)
+ NAPI_GRO_CB(p)->ip_fixedid |= 1 << inner;
- return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid);
+ return ipid_offset ^ (count * !(NAPI_GRO_CB(p)->ip_fixedid & (1 << inner)));
}
static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2)
@@ -482,7 +474,7 @@ static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr
static inline int __gro_receive_network_flush(const void *th, const void *th2,
struct sk_buff *p, const u16 diff,
- bool outer)
+ bool inner)
{
const void *nh = th - diff;
const void *nh2 = th2 - diff;
@@ -490,47 +482,70 @@ static inline int __gro_receive_network_flush(const void *th, const void *th2,
if (((struct iphdr *)nh)->version == 6)
return ipv6_gro_flush(nh, nh2);
else
- return inet_gro_flush(nh, nh2, p, outer);
+ return inet_gro_flush(nh, nh2, p, inner);
}
static inline int gro_receive_network_flush(const void *th, const void *th2,
struct sk_buff *p)
{
- const bool encap_mark = NAPI_GRO_CB(p)->encap_mark;
int off = skb_transport_offset(p);
int flush;
- flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark);
- if (encap_mark)
- flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false);
+ flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, false);
+ if (NAPI_GRO_CB(p)->encap_mark)
+ flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, true);
return flush;
}
int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
+void __gro_flush(struct gro_node *gro, bool flush_old);
+
+static inline void gro_flush(struct gro_node *gro, bool flush_old)
+{
+ if (!gro->bitmask)
+ return;
+
+ __gro_flush(gro, flush_old);
+}
+
+static inline void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+{
+ gro_flush(&napi->gro, flush_old);
+}
/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
-static inline void gro_normal_list(struct napi_struct *napi)
+static inline void gro_normal_list(struct gro_node *gro)
{
- if (!napi->rx_count)
+ if (!gro->rx_count)
return;
- netif_receive_skb_list_internal(&napi->rx_list);
- INIT_LIST_HEAD(&napi->rx_list);
- napi->rx_count = 0;
+ netif_receive_skb_list_internal(&gro->rx_list);
+ INIT_LIST_HEAD(&gro->rx_list);
+ gro->rx_count = 0;
+}
+
+static inline void gro_flush_normal(struct gro_node *gro, bool flush_old)
+{
+ gro_flush(gro, flush_old);
+ gro_normal_list(gro);
}
/* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
* pass the whole batch up to the stack.
*/
-static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
+static inline void gro_normal_one(struct gro_node *gro, struct sk_buff *skb,
+ int segs)
{
- list_add_tail(&skb->list, &napi->rx_list);
- napi->rx_count += segs;
- if (napi->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch))
- gro_normal_list(napi);
+ list_add_tail(&skb->list, &gro->rx_list);
+ gro->rx_count += segs;
+ if (gro->rx_count >= READ_ONCE(net_hotdata.gro_normal_batch))
+ gro_normal_list(gro);
}
+void gro_init(struct gro_node *gro);
+void gro_cleanup(struct gro_node *gro);
+
/* This function is the alternative of 'inet_iif' and 'inet_sdif'
* functions in case we can not rely on fields of IPCB.
*
@@ -577,4 +592,31 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *
struct packet_offload *gro_find_receive_by_type(__be16 type);
struct packet_offload *gro_find_complete_by_type(__be16 type);
+static inline struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
+{
+ unsigned int thlen, hlen, off;
+ struct tcphdr *th;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*th);
+ th = skb_gro_header(skb, hlen, off);
+ if (unlikely(!th))
+ return NULL;
+
+ thlen = th->doff * 4;
+ if (unlikely(thlen < sizeof(*th)))
+ return NULL;
+
+ hlen = off + thlen;
+ if (!skb_gro_may_pull(skb, hlen)) {
+ th = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!th))
+ return NULL;
+ }
+
+ skb_gro_pull(skb, thlen);
+
+ return th;
+}
+
#endif /* _NET_GRO_H */
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 30e9570beb2a..62534d1f3c70 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -2,9 +2,18 @@
#ifndef _NET_HOTDATA_H
#define _NET_HOTDATA_H
+#include <linux/llist.h>
#include <linux/types.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
+#ifdef CONFIG_RPS
+#include <net/rps-types.h>
+#endif
+
+struct skb_defer_node {
+ struct llist_head defer_list;
+ atomic_long_t defer_count;
+} ____cacheline_aligned_in_smp;
/* Read mostly data used in network fast paths. */
struct net_hotdata {
@@ -23,19 +32,20 @@ struct net_hotdata {
struct net_offload udpv6_offload;
#endif
struct list_head offload_base;
- struct list_head ptype_all;
struct kmem_cache *skbuff_cache;
struct kmem_cache *skbuff_fclone_cache;
struct kmem_cache *skb_small_head_cache;
#ifdef CONFIG_RPS
- struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+ rps_tag_ptr rps_sock_flow_table;
u32 rps_cpu_mask;
#endif
+ struct skb_defer_node __percpu *skb_defer_nodes;
int gro_normal_batch;
int netdev_budget;
int netdev_budget_usecs;
int tstamp_prequeue;
int max_backlog;
+ int qdisc_max_burst;
int dev_tx_weight;
int dev_rx_weight;
int sysctl_max_skb_frags;
diff --git a/include/net/icmp.h b/include/net/icmp.h
index caddf4a59ad1..935ee13d9ae9 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -37,10 +37,10 @@ struct sk_buff;
struct net;
void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
- const struct ip_options *opt);
+ const struct inet_skb_parm *parm);
static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
- __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt);
+ __icmp_send(skb_in, type, code, info, IPCB(skb_in));
}
#if IS_ENABLED(CONFIG_NF_NAT)
@@ -48,8 +48,10 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info);
#else
static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
- struct ip_options opts = { 0 };
- __icmp_send(skb_in, type, code, info, &opts);
+ struct inet_skb_parm parm;
+
+ memset(&parm, 0, sizeof(parm));
+ __icmp_send(skb_in, type, code, info, &parm);
}
#endif
diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h
index 813e163ce27c..c60867e7e43c 100644
--- a/include/net/ieee80211_radiotap.h
+++ b/include/net/ieee80211_radiotap.h
@@ -1,6 +1,6 @@
/*
* Copyright (c) 2017 Intel Deutschland GmbH
- * Copyright (c) 2018-2019, 2021-2022 Intel Corporation
+ * Copyright (c) 2018-2019, 2021-2022, 2025 Intel Corporation
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -202,6 +202,24 @@ enum ieee80211_radiotap_vht_coding {
IEEE80211_RADIOTAP_CODING_LDPC_USER3 = 0x08,
};
+enum ieee80211_radiotap_vht_bandwidth {
+ /* Note: more values are defined but can't really be used */
+ IEEE80211_RADIOTAP_VHT_BW_20 = 0,
+ IEEE80211_RADIOTAP_VHT_BW_40 = 1,
+ IEEE80211_RADIOTAP_VHT_BW_80 = 4,
+ IEEE80211_RADIOTAP_VHT_BW_160 = 11,
+};
+
+struct ieee80211_radiotap_vht {
+ __le16 known;
+ u8 flags;
+ u8 bandwidth;
+ u8 mcs_nss[4];
+ u8 coding;
+ u8 group_id;
+ __le16 partial_aid;
+} __packed;
+
/* for IEEE80211_RADIOTAP_TIMESTAMP */
enum ieee80211_radiotap_timestamp_unit_spos {
IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MASK = 0x000F,
diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
index 025bd8d3c769..b814e1acc512 100644
--- a/include/net/inet6_connection_sock.h
+++ b/include/net/inet6_connection_sock.h
@@ -18,12 +18,14 @@ struct sk_buff;
struct sock;
struct sockaddr;
-struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6,
- const struct request_sock *req, u8 proto);
+struct dst_entry *inet6_csk_route_socket(struct sock *sk,
+ struct flowi6 *fl6);
-void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
+struct dst_entry *inet6_csk_route_req(const struct sock *sk,
+ struct dst_entry *dst,
+ struct flowi6 *fl6,
+ const struct request_sock *req, u8 proto);
int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
-struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu);
#endif /* _INET6_CONNECTION_SOCK_H */
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 74dd90ff5f12..2cc5d416bbb5 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -24,6 +24,8 @@
struct inet_hashinfo;
+void inet6_init_ehash_secret(void);
+
static inline unsigned int __inet6_ehashfn(const u32 lhash,
const u16 lport,
const u32 fhash,
@@ -41,7 +43,6 @@ static inline unsigned int __inet6_ehashfn(const u32 lhash,
* The sockhash lock must be held as a reader here.
*/
struct sock *__inet6_lookup_established(const struct net *net,
- struct inet_hashinfo *hashinfo,
const struct in6_addr *saddr,
const __be16 sport,
const struct in6_addr *daddr,
@@ -65,7 +66,6 @@ struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk,
inet6_ehashfn_t *ehashfn);
struct sock *inet6_lookup_listener(const struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr,
const __be16 sport,
@@ -83,7 +83,6 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net,
inet6_ehashfn_t *ehashfn);
static inline struct sock *__inet6_lookup(const struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr,
const __be16 sport,
@@ -92,14 +91,14 @@ static inline struct sock *__inet6_lookup(const struct net *net,
const int dif, const int sdif,
bool *refcounted)
{
- struct sock *sk = __inet6_lookup_established(net, hashinfo, saddr,
- sport, daddr, hnum,
+ struct sock *sk = __inet6_lookup_established(net, saddr, sport,
+ daddr, hnum,
dif, sdif);
*refcounted = true;
if (sk)
return sk;
*refcounted = false;
- return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
+ return inet6_lookup_listener(net, skb, doff, saddr, sport,
daddr, hnum, dif, sdif);
}
@@ -143,14 +142,13 @@ struct sock *inet6_steal_sock(struct net *net, struct sk_buff *skb, int doff,
return reuse_sk;
}
-static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
- struct sk_buff *skb, int doff,
+static inline struct sock *__inet6_lookup_skb(struct sk_buff *skb, int doff,
const __be16 sport,
const __be16 dport,
int iif, int sdif,
bool *refcounted)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = skb_dst_dev_net_rcu(skb);
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct sock *sk;
@@ -161,20 +159,16 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
if (sk)
return sk;
- return __inet6_lookup(net, hashinfo, skb,
- doff, &ip6h->saddr, sport,
+ return __inet6_lookup(net, skb, doff, &ip6h->saddr, sport,
&ip6h->daddr, ntohs(dport),
iif, sdif, refcounted);
}
-struct sock *inet6_lookup(const struct net *net, struct inet_hashinfo *hashinfo,
- struct sk_buff *skb, int doff,
+struct sock *inet6_lookup(const struct net *net, struct sk_buff *skb, int doff,
const struct in6_addr *saddr, const __be16 sport,
const struct in6_addr *daddr, const __be16 dport,
const int dif);
-int inet6_hash(struct sock *sk);
-
static inline bool inet6_match(const struct net *net, const struct sock *sk,
const struct in6_addr *saddr,
const struct in6_addr *daddr,
@@ -183,7 +177,7 @@ static inline bool inet6_match(const struct net *net, const struct sock *sk,
{
if (!net_eq(sock_net(sk), net) ||
sk->sk_family != AF_INET6 ||
- sk->sk_portpair != ports ||
+ READ_ONCE(sk->sk_portpair) != ports ||
!ipv6_addr_equal(&sk->sk_v6_daddr, saddr) ||
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
return false;
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index c17a6585d0b0..3d747896be30 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -19,15 +19,14 @@ struct msghdr;
struct net;
struct page;
struct sock;
-struct sockaddr;
struct socket;
int inet_release(struct socket *sock);
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags);
-int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags, int is_sendmsg);
-int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags);
int inet_accept(struct socket *sock, struct socket *newsock,
struct proto_accept_arg *arg);
@@ -42,8 +41,8 @@ int inet_shutdown(struct socket *sock, int how);
int inet_listen(struct socket *sock, int backlog);
int __inet_listen_sk(struct sock *sk, int backlog);
void inet_sock_destruct(struct sock *sk);
-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
-int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len);
+int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
/* Don't allocate port at this moment, defer to connect. */
#define BIND_FORCE_ADDRESS_NO_PORT (1 << 0)
/* Grab and release socket lock. */
@@ -52,7 +51,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
#define BIND_FROM_BPF (1 << 2)
/* Skip CAP_NET_BIND_SERVICE check. */
#define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3)
-int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
u32 flags);
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
int peer);
@@ -60,8 +59,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int inet_ctl_sock_create(struct sock **sk, unsigned short family,
unsigned short type, unsigned char protocol,
struct net *net);
-int inet_recv_error(struct sock *sk, struct msghdr *msg, int len,
- int *addr_len);
+int inet_recv_error(struct sock *sk, struct msghdr *msg, int len);
struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb);
int inet_gro_complete(struct sk_buff *skb, int nhoff);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c7f42844c79a..433c2df23076 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -34,7 +34,7 @@ struct tcp_congestion_ops;
*/
struct inet_connection_sock_af_ops {
int (*queue_xmit)(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
- void (*send_check)(struct sock *sk, struct sk_buff *skb);
+ u16 net_header_len;
int (*rebuild_header)(struct sock *sk);
void (*sk_rx_dst_set)(struct sock *sk, const struct sk_buff *skb);
int (*conn_request)(struct sock *sk, struct sk_buff *skb);
@@ -42,14 +42,13 @@ struct inet_connection_sock_af_ops {
struct request_sock *req,
struct dst_entry *dst,
struct request_sock *req_unhash,
- bool *own_req);
- u16 net_header_len;
- u16 sockaddr_len;
+ bool *own_req,
+ void (*opt_child_init)(struct sock *newsk,
+ const struct sock *sk));
int (*setsockopt)(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen);
int (*getsockopt)(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
- void (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
void (*mtu_reduced)(struct sock *sk);
};
@@ -58,15 +57,15 @@ struct inet_connection_sock_af_ops {
* @icsk_accept_queue: FIFO of established children
* @icsk_bind_hash: Bind node
* @icsk_bind2_hash: Bind node in the bhash2 table
- * @icsk_timeout: Timeout
- * @icsk_retransmit_timer: Resend (no ack)
+ * @icsk_delack_timer: Delayed ACK timer
+ * @icsk_keepalive_timer: Keepalive timer
+ * @mptcp_tout_timer: mptcp timer
* @icsk_rto: Retransmit timeout
* @icsk_pmtu_cookie Last pmtu seen by socket
* @icsk_ca_ops Pluggable congestion control hook
* @icsk_af_ops Operations which are AF_INET{4,6} specific
* @icsk_ulp_ops Pluggable ULP control hook
* @icsk_ulp_data ULP private data
- * @icsk_clean_acked Clean acked data hook
* @icsk_ca_state: Congestion control state
* @icsk_retransmits: Number of unrecovered [RTO] timeouts
* @icsk_pending: Scheduled timer event
@@ -85,18 +84,20 @@ struct inet_connection_sock {
struct request_sock_queue icsk_accept_queue;
struct inet_bind_bucket *icsk_bind_hash;
struct inet_bind2_bucket *icsk_bind2_hash;
- unsigned long icsk_timeout;
- struct timer_list icsk_retransmit_timer;
- struct timer_list icsk_delack_timer;
+ struct timer_list icsk_delack_timer;
+ union {
+ struct timer_list icsk_keepalive_timer;
+ struct timer_list mptcp_tout_timer;
+ };
__u32 icsk_rto;
__u32 icsk_rto_min;
+ u32 icsk_rto_max;
__u32 icsk_delack_max;
__u32 icsk_pmtu_cookie;
const struct tcp_congestion_ops *icsk_ca_ops;
const struct inet_connection_sock_af_ops *icsk_af_ops;
const struct tcp_ulp_ops *icsk_ulp_ops;
void __rcu *icsk_ulp_data;
- void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
__u8 icsk_ca_state:5,
icsk_ca_initialized:1,
@@ -116,8 +117,8 @@ struct inet_connection_sock {
#define ATO_BITS 8
__u32 ato:ATO_BITS, /* Predicted tick of soft clock */
lrcv_flowlabel:20, /* last received ipv6 flowlabel */
- unused:4;
- unsigned long timeout; /* Currently scheduled timeout */
+ dst_quick_ack:1, /* cache dst RTAX_QUICKACK */
+ unused:3;
__u32 lrcvtime; /* timestamp of last received data packet */
__u16 last_seg_size; /* Size of last incoming segment */
__u16 rcv_mss; /* MSS used for delayed ACK decisions */
@@ -189,8 +190,16 @@ static inline void inet_csk_delack_init(struct sock *sk)
memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack));
}
-void inet_csk_delete_keepalive_timer(struct sock *sk);
-void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long timeout);
+static inline unsigned long tcp_timeout_expires(const struct sock *sk)
+{
+ return READ_ONCE(sk->tcp_retransmit_timer.expires);
+}
+
+static inline unsigned long
+icsk_delack_timeout(const struct inet_connection_sock *icsk)
+{
+ return READ_ONCE(icsk->icsk_delack_timer.expires);
+}
static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
{
@@ -199,7 +208,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what)
if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) {
smp_store_release(&icsk->icsk_pending, 0);
#ifdef INET_CSK_CLEAR_TIMERS
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer(sk, &sk->tcp_retransmit_timer);
#endif
} else if (what == ICSK_TIME_DACK) {
smp_store_release(&icsk->icsk_ack.pending, 0);
@@ -227,16 +236,15 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
when = max_when;
}
+ when += jiffies;
if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) {
smp_store_release(&icsk->icsk_pending, what);
- icsk->icsk_timeout = jiffies + when;
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
+ sk_reset_timer(sk, &sk->tcp_retransmit_timer, when);
} else if (what == ICSK_TIME_DACK) {
smp_store_release(&icsk->icsk_ack.pending,
icsk->icsk_ack.pending | ICSK_ACK_TIMER);
- icsk->icsk_ack.timeout = jiffies + when;
- sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, when);
} else {
pr_debug("inet_csk BUG: unknown timer value\n");
}
@@ -264,8 +272,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
struct request_sock *req,
struct sock *child);
-bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- unsigned long timeout);
+bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req);
struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
struct request_sock *req,
bool own_req);
@@ -288,22 +295,8 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req);
-static inline unsigned long
-reqsk_timeout(struct request_sock *req, unsigned long max_timeout)
-{
- u64 timeout = (u64)req->timeout << req->num_timeout;
-
- return (unsigned long)min_t(u64, timeout, max_timeout);
-}
-
-static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
-{
- /* The below has to be done to allow calling inet_csk_destroy_sock */
- sock_set_flag(sk, SOCK_DEAD);
- this_cpu_inc(*sk->sk_prot->orphan_count);
-}
-
void inet_csk_destroy_sock(struct sock *sk);
+void inet_csk_prepare_for_destroy_sock(struct sock *sk);
void inet_csk_prepare_forced_close(struct sock *sk);
/*
@@ -318,11 +311,10 @@ static inline __poll_t inet_csk_listen_poll(const struct sock *sk)
int inet_csk_listen_start(struct sock *sk);
void inet_csk_listen_stop(struct sock *sk);
-void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
-
/* update the fast reuse flag when adding a socket */
-void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
- struct sock *sk);
+void inet_csk_update_fastreuse(const struct sock *sk,
+ struct inet_bind_bucket *tb,
+ struct inet_bind2_bucket *tb2);
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h
index 72f250dffada..1aa9f04ed1ab 100644
--- a/include/net/inet_dscp.h
+++ b/include/net/inet_dscp.h
@@ -39,6 +39,12 @@ typedef u8 __bitwise dscp_t;
#define INET_DSCP_MASK 0xfc
+/* A few places in the IPv4 code need to ignore the three high order bits of
+ * DSCP because of backward compatibility (as these bits used to represent the
+ * IPv4 Precedence in RFC 791's TOS field and were ignored).
+ */
+#define INET_DSCP_LEGACY_TOS_MASK ((__force dscp_t)0x1c)
+
static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield)
{
return (__force dscp_t)(dsfield & INET_DSCP_MASK);
diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index ea32393464a2..827b87a95dab 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -51,11 +51,25 @@ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner)
return outer;
}
+/* Apply either ECT(0) or ECT(1) */
+static inline void __INET_ECN_xmit(struct sock *sk, bool use_ect_1)
+{
+ __u8 ect = use_ect_1 ? INET_ECN_ECT_1 : INET_ECN_ECT_0;
+
+ /* Mask the complete byte in case the connection alternates between
+ * ECT(0) and ECT(1).
+ */
+ inet_sk(sk)->tos &= ~INET_ECN_MASK;
+ inet_sk(sk)->tos |= ect;
+ if (inet6_sk(sk)) {
+ inet6_sk(sk)->tclass &= ~INET_ECN_MASK;
+ inet6_sk(sk)->tclass |= ect;
+ }
+}
+
static inline void INET_ECN_xmit(struct sock *sk)
{
- inet_sk(sk)->tos |= INET_ECN_ECT_0;
- if (inet6_sk(sk) != NULL)
- inet6_sk(sk)->tclass |= INET_ECN_ECT_0;
+ __INET_ECN_xmit(sk, false);
}
static inline void INET_ECN_dontxmit(struct sock *sk)
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 5af6eb14c5db..365925c9d262 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -123,31 +123,19 @@ void inet_frags_fini(struct inet_frags *);
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net);
-static inline void fqdir_pre_exit(struct fqdir *fqdir)
-{
- /* Prevent creation of new frags.
- * Pairs with READ_ONCE() in inet_frag_find().
- */
- WRITE_ONCE(fqdir->high_thresh, 0);
-
- /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire()
- * and ip6frag_expire_frag_queue().
- */
- WRITE_ONCE(fqdir->dead, true);
-}
+void fqdir_pre_exit(struct fqdir *fqdir);
void fqdir_exit(struct fqdir *fqdir);
-void inet_frag_kill(struct inet_frag_queue *q);
+void inet_frag_kill(struct inet_frag_queue *q, int *refs);
void inet_frag_destroy(struct inet_frag_queue *q);
struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
-/* Free all skbs in the queue; return the sum of their truesizes. */
-unsigned int inet_frag_rbtree_purge(struct rb_root *root,
- enum skb_drop_reason reason);
+void inet_frag_queue_flush(struct inet_frag_queue *q,
+ enum skb_drop_reason reason);
-static inline void inet_frag_put(struct inet_frag_queue *q)
+static inline void inet_frag_putn(struct inet_frag_queue *q, int refs)
{
- if (refcount_dec_and_test(&q->refcnt))
+ if (refs && refcount_sub_and_test(refs, &q->refcnt))
inet_frag_destroy(q);
}
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 5eea47f135a4..6e2fe186d0dc 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -89,6 +89,7 @@ struct inet_bind_bucket {
bool fast_ipv6_only;
struct hlist_node node;
struct hlist_head bhash2;
+ struct rcu_head rcu;
};
struct inet_bind2_bucket {
@@ -107,6 +108,8 @@ struct inet_bind2_bucket {
struct hlist_node bhash_node;
/* List of sockets hashed to this bucket */
struct hlist_head owners;
+ signed char fastreuse;
+ signed char fastreuseport;
};
static inline struct net *ib_net(const struct inet_bind_bucket *ib)
@@ -174,14 +177,9 @@ struct inet_hashinfo {
bool pernet;
} ____cacheline_aligned_in_smp;
-static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
+static inline struct inet_hashinfo *tcp_get_hashinfo(const struct sock *sk)
{
-#if IS_ENABLED(CONFIG_IP_DCCP)
- return sk->sk_prot->h.hashinfo ? :
- sock_net(sk)->ipv4.tcp_death_row.hashinfo;
-#else
return sock_net(sk)->ipv4.tcp_death_row.hashinfo;
-#endif
}
static inline struct inet_listen_hashbucket *
@@ -206,12 +204,6 @@ static inline spinlock_t *inet_ehash_lockp(
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo);
-static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h)
-{
- kfree(h->lhash2);
- h->lhash2 = NULL;
-}
-
static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
{
kvfree(hashinfo->ehash_locks);
@@ -226,8 +218,7 @@ struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
const unsigned short snum, int l3mdev);
-void inet_bind_bucket_destroy(struct kmem_cache *cachep,
- struct inet_bind_bucket *tb);
+void inet_bind_bucket_destroy(struct inet_bind_bucket *tb);
bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
const struct net *net, unsigned short port,
@@ -273,6 +264,20 @@ inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk,
return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
}
+static inline bool inet_use_hash2_on_bind(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ return false;
+
+ if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return true;
+ }
+#endif
+ return sk->sk_rcv_saddr != htonl(INADDR_ANY);
+}
+
struct inet_bind_hashbucket *
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port);
@@ -295,17 +300,14 @@ void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
unsigned long numentries, int scale,
unsigned long low_limit,
unsigned long high_limit);
-int inet_hashinfo2_init_mod(struct inet_hashinfo *h);
bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk);
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk,
bool *found_dup_sk);
-int __inet_hash(struct sock *sk, struct sock *osk);
int inet_hash(struct sock *sk);
void inet_unhash(struct sock *sk);
struct sock *__inet_lookup_listener(const struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, const __be16 sport,
const __be32 daddr,
@@ -313,12 +315,12 @@ struct sock *__inet_lookup_listener(const struct net *net,
const int dif, const int sdif);
static inline struct sock *inet_lookup_listener(struct net *net,
- struct inet_hashinfo *hashinfo,
- struct sk_buff *skb, int doff,
- __be32 saddr, __be16 sport,
- __be32 daddr, __be16 dport, int dif, int sdif)
+ struct sk_buff *skb, int doff,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, __be16 dport,
+ int dif, int sdif)
{
- return __inet_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
+ return __inet_lookup_listener(net, skb, doff, saddr, sport,
daddr, ntohs(dport), dif, sdif);
}
@@ -356,7 +358,7 @@ static inline bool inet_match(const struct net *net, const struct sock *sk,
int dif, int sdif)
{
if (!net_eq(sock_net(sk), net) ||
- sk->sk_portpair != ports ||
+ READ_ONCE(sk->sk_portpair) != ports ||
sk->sk_addrpair != cookie)
return false;
@@ -369,7 +371,6 @@ static inline bool inet_match(const struct net *net, const struct sock *sk,
* not check it for lookups anymore, thanks Alexey. -DaveM
*/
struct sock *__inet_lookup_established(const struct net *net,
- struct inet_hashinfo *hashinfo,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const u16 hnum,
const int dif, const int sdif);
@@ -395,18 +396,16 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net,
__be32 daddr, u16 hnum, const int dif,
inet_ehashfn_t *ehashfn);
-static inline struct sock *
- inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
- const __be32 saddr, const __be16 sport,
- const __be32 daddr, const __be16 dport,
- const int dif)
+static inline struct sock *inet_lookup_established(struct net *net,
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const __be16 dport,
+ const int dif)
{
- return __inet_lookup_established(net, hashinfo, saddr, sport, daddr,
+ return __inet_lookup_established(net, saddr, sport, daddr,
ntohs(dport), dif, 0);
}
static inline struct sock *__inet_lookup(struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
@@ -416,18 +415,17 @@ static inline struct sock *__inet_lookup(struct net *net,
u16 hnum = ntohs(dport);
struct sock *sk;
- sk = __inet_lookup_established(net, hashinfo, saddr, sport,
+ sk = __inet_lookup_established(net, saddr, sport,
daddr, hnum, dif, sdif);
*refcounted = true;
if (sk)
return sk;
*refcounted = false;
- return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
+ return __inet_lookup_listener(net, skb, doff, saddr,
sport, daddr, hnum, dif, sdif);
}
static inline struct sock *inet_lookup(struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, const __be16 sport,
const __be32 daddr, const __be16 dport,
@@ -436,7 +434,7 @@ static inline struct sock *inet_lookup(struct net *net,
struct sock *sk;
bool refcounted;
- sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
+ sk = __inet_lookup(net, skb, doff, saddr, sport, daddr,
dport, dif, 0, &refcounted);
if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
@@ -484,15 +482,14 @@ struct sock *inet_steal_sock(struct net *net, struct sk_buff *skb, int doff,
return reuse_sk;
}
-static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
- struct sk_buff *skb,
+static inline struct sock *__inet_lookup_skb(struct sk_buff *skb,
int doff,
const __be16 sport,
const __be16 dport,
const int sdif,
bool *refcounted)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = skb_dst_dev_net_rcu(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *sk;
@@ -503,8 +500,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
if (sk)
return sk;
- return __inet_lookup(net, hashinfo, skb,
- doff, iph->saddr, sport,
+ return __inet_lookup(net, skb, doff, iph->saddr, sport,
iph->daddr, dport, inet_iif(skb), sdif,
refcounted);
}
@@ -527,9 +523,12 @@ static inline void sk_rcv_saddr_set(struct sock *sk, __be32 addr)
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u64 port_offset,
+ u32 hash_port0,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16,
- struct inet_timewait_sock **));
+ struct inet_timewait_sock **,
+ bool rcu_lookup,
+ u32 hash));
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk);
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 1086256549fa..7cdcbed3e5cb 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -26,6 +26,8 @@
#include <net/tcp_states.h>
#include <net/l3mdev.h>
+#define IP_OPTIONS_DATA_FIXED_SIZE 40
+
/** struct ip_options - IP Options
*
* @faddr - Saved first hop address
@@ -58,12 +60,9 @@ struct ip_options {
struct ip_options_rcu {
struct rcu_head rcu;
- struct ip_options opt;
-};
-struct ip_options_data {
- struct ip_options_rcu opt;
- char data[40];
+ /* Must be last as it ends in a flexible-array member. */
+ struct ip_options opt;
};
struct inet_request_sock {
@@ -101,10 +100,7 @@ struct inet_request_sock {
};
};
-static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
-{
- return (struct inet_request_sock *)sk;
-}
+#define inet_rsk(ptr) container_of_const(ptr, struct inet_request_sock, req)
static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
{
@@ -163,6 +159,13 @@ static inline bool inet_sk_bound_dev_eq(const struct net *net,
#endif
}
+struct inet6_cork {
+ struct ipv6_txoptions *opt;
+ u8 hop_limit;
+ u8 tclass;
+ u8 dontfrag:1;
+};
+
struct inet_cork {
unsigned int flags;
__be32 addr;
@@ -183,6 +186,9 @@ struct inet_cork {
struct inet_cork_full {
struct inet_cork base;
struct flowi fl;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_cork base6;
+#endif
};
struct ip_mc_socklist;
@@ -214,6 +220,7 @@ struct inet_sock {
struct sock sk;
#if IS_ENABLED(CONFIG_IPV6)
struct ipv6_pinfo *pinet6;
+ struct ipv6_fl_socklist __rcu *ipv6_fl_list;
#endif
/* Socket demultiplex comparisons on incoming packets. */
#define inet_daddr sk.__sk_common.skc_daddr
@@ -354,14 +361,6 @@ static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
#define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk)
-static inline void __inet_sk_copy_descendant(struct sock *sk_to,
- const struct sock *sk_from,
- const int ancestor_size)
-{
- memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
- sk_from->sk_prot->obj_size - ancestor_size);
-}
-
int inet_sk_rebuild_header(struct sock *sk);
/**
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 67a313575780..63a644ff30de 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -70,7 +70,8 @@ struct inet_timewait_sock {
unsigned int tw_transparent : 1,
tw_flowlabel : 20,
tw_usec_ts : 1,
- tw_pad : 2, /* 2 bits hole */
+ tw_connect_bind : 1,
+ tw_pad : 1, /* 1 bit hole */
tw_tos : 8;
u32 tw_txhash;
u32 tw_priority;
@@ -81,6 +82,14 @@ struct inet_timewait_sock {
struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb;
struct inet_bind2_bucket *tw_tb2;
+#if IS_ENABLED(CONFIG_INET_PSP)
+ struct psp_assoc __rcu *psp_assoc;
+#endif
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ struct sk_buff* (*tw_validate_xmit_skb)(struct sock *sk,
+ struct net_device *dev,
+ struct sk_buff *skb);
+#endif
};
#define tw_tclass tw_tos
diff --git a/include/net/ioam6.h b/include/net/ioam6.h
index 2cbbee6e806a..a75912fe247e 100644
--- a/include/net/ioam6.h
+++ b/include/net/ioam6.h
@@ -60,6 +60,8 @@ void ioam6_fill_trace_data(struct sk_buff *skb,
struct ioam6_trace_hdr *trace,
bool is_input);
+u8 ioam6_trace_compute_nodelen(u32 trace_type);
+
int ioam6_init(void);
void ioam6_exit(void);
diff --git a/include/net/ip.h b/include/net/ip.h
index ba7b43447775..7f2fe1a8401b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -59,6 +59,7 @@ struct inet_skb_parm {
#define IPSKB_L3SLAVE BIT(7)
#define IPSKB_NOPOLICY BIT(8)
#define IPSKB_MULTIPATH BIT(9)
+#define IPSKB_MCROUTE BIT(10)
u16 frag_max_size;
};
@@ -92,14 +93,15 @@ static inline void ipcm_init(struct ipcm_cookie *ipcm)
static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
const struct inet_sock *inet)
{
- ipcm_init(ipcm);
+ *ipcm = (struct ipcm_cookie) {
+ .tos = READ_ONCE(inet->tos),
+ };
+
+ sockcm_init(&ipcm->sockc, &inet->sk);
- ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark);
- ipcm->sockc.priority = READ_ONCE(inet->sk.sk_priority);
- ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags);
ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
ipcm->addr = inet->inet_saddr;
- ipcm->protocol = inet->inet_num;
+ ipcm->protocol = READ_ONCE(inet->inet_num);
}
#define IPCB(skb) ((struct inet_skb_parm*)((skb)->cb))
@@ -166,6 +168,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
int ip_local_deliver(struct sk_buff *skb);
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto);
int ip_mr_input(struct sk_buff *skb);
+int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -257,16 +260,9 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet,
return RT_SCOPE_UNIVERSE;
}
-static inline __u8 get_rttos(struct ipcm_cookie* ipc, struct inet_sock *inet)
-{
- u8 dsfield = ipc->tos != -1 ? ipc->tos : READ_ONCE(inet->tos);
-
- return dsfield & INET_DSCP_MASK;
-}
-
/* datagram.c */
-int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
+int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
void ip4_datagram_release_cb(struct sock *sk);
@@ -330,11 +326,12 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o
}
#endif
-#define snmp_get_cpu_field64_batch(buff64, stats_list, mib_statistic, offset) \
+#define snmp_get_cpu_field64_batch_cnt(buff64, stats_list, cnt, \
+ mib_statistic, offset) \
{ \
int i, c; \
for_each_possible_cpu(c) { \
- for (i = 0; stats_list[i].name; i++) \
+ for (i = 0; i < cnt; i++) \
buff64[i] += snmp_get_cpu_field64( \
mib_statistic, \
c, stats_list[i].entry, \
@@ -342,11 +339,11 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o
} \
}
-#define snmp_get_cpu_field_batch(buff, stats_list, mib_statistic) \
+#define snmp_get_cpu_field_batch_cnt(buff, stats_list, cnt, mib_statistic) \
{ \
int i, c; \
for_each_possible_cpu(c) { \
- for (i = 0; stats_list[i].name; i++) \
+ for (i = 0; i < cnt; i++) \
buff[i] += snmp_get_cpu_field( \
mib_statistic, \
c, stats_list[i].entry); \
@@ -363,7 +360,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in
bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
#ifdef CONFIG_SYSCTL
-static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
+static inline bool inet_is_local_reserved_port(const struct net *net, unsigned short port)
{
if (!net->ipv4.sysctl_local_reserved_ports)
return false;
@@ -471,17 +468,19 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
bool forwarding)
{
const struct rtable *rt = dst_rtable(dst);
+ const struct net_device *dev;
unsigned int mtu, res;
struct net *net;
rcu_read_lock();
- net = dev_net_rcu(dst->dev);
+ dev = dst_dev_rcu(dst);
+ net = dev_net_rcu(dev);
if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
ip_mtu_locked(dst) ||
!forwarding) {
mtu = rt->rt_pmtu;
- if (mtu && time_before(jiffies, rt->dst.expires))
+ if (mtu && time_before(jiffies, READ_ONCE(rt->dst.expires)))
goto out;
}
@@ -490,7 +489,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
if (mtu)
goto out;
- mtu = READ_ONCE(dst->dev->mtu);
+ mtu = READ_ONCE(dev->mtu);
if (unlikely(ip_mtu_locked(dst))) {
if (rt->rt_uses_gateway && mtu > 576)
@@ -510,16 +509,17 @@ out:
static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
const struct sk_buff *skb)
{
+ const struct dst_entry *dst = skb_dst(skb);
unsigned int mtu;
if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) {
bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;
- return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding);
+ return ip_dst_mtu_maybe_forward(dst, forwarding);
}
- mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU);
- return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu);
+ mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU);
+ return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len,
@@ -804,7 +804,7 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
int ip_ra_control(struct sock *sk, unsigned char on,
void (*destructor)(struct sock *));
-int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len);
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len);
void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
u32 info, u8 *payload);
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport,
diff --git a/include/net/ip6_checksum.h b/include/net/ip6_checksum.h
index c8a96b888277..6677b3cc3972 100644
--- a/include/net/ip6_checksum.h
+++ b/include/net/ip6_checksum.h
@@ -82,6 +82,4 @@ static inline __sum16 udp_v6_check(int len,
void udp6_set_csum(bool nocheck, struct sk_buff *skb,
const struct in6_addr *saddr,
const struct in6_addr *daddr, int len);
-
-int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto);
#endif
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 7c87873ae211..9cd27e1b9b69 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -198,6 +198,7 @@ struct fib6_info {
fib6_destroying:1,
unused:4;
+ struct list_head purge_link;
struct rcu_head rcu;
struct nexthop *nh;
struct fib6_nh fib6_nh[];
@@ -485,11 +486,30 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
rcu_read_unlock();
}
+#if IS_ENABLED(CONFIG_IPV6)
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
struct fib6_config *cfg, gfp_t gfp_flags,
struct netlink_ext_ack *extack);
void fib6_nh_release(struct fib6_nh *fib6_nh);
void fib6_nh_release_dsts(struct fib6_nh *fib6_nh);
+#else
+static inline int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+ struct fib6_config *cfg, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+ return -EAFNOSUPPORT;
+}
+
+static inline void fib6_nh_release(struct fib6_nh *fib6_nh)
+{
+}
+
+static inline void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
+{
+}
+#endif
+
int call_fib6_entry_notifiers(struct net *net,
enum fib_event_type event_type,
@@ -501,17 +521,26 @@ int call_fib6_multipath_entry_notifiers(struct net *net,
unsigned int nsiblings,
struct netlink_ext_ack *extack);
int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt);
+#if IS_ENABLED(CONFIG_IPV6)
void fib6_rt_update(struct net *net, struct fib6_info *rt,
struct nl_info *info);
+#else
+static inline void fib6_rt_update(struct net *net, struct fib6_info *rt,
+ struct nl_info *info)
+{
+}
+#endif
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
unsigned int flags);
+void fib6_age_exceptions(struct fib6_info *rt, struct fib6_gc_args *gc_args,
+ unsigned long now);
void fib6_run_gc(unsigned long expires, struct net *net, bool force);
-
void fib6_gc_cleanup(void);
int fib6_init(void);
+#if IS_ENABLED(CONFIG_IPV6)
/* Add the route to the gc list if it is not already there
*
* The callers should hold f6i->fib6_table->tb6_lock.
@@ -544,6 +573,23 @@ static inline void fib6_remove_gc_list(struct fib6_info *f6i)
hlist_del_init(&f6i->gc_link);
}
+static inline void fib6_may_remove_gc_list(struct net *net,
+ struct fib6_info *f6i)
+{
+ struct fib6_gc_args gc_args;
+
+ if (hlist_unhashed(&f6i->gc_link))
+ return;
+
+ gc_args.timeout = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval);
+ gc_args.more = 0;
+
+ rcu_read_lock();
+ fib6_age_exceptions(f6i, &gc_args, jiffies);
+ rcu_read_unlock();
+}
+#endif
+
struct ipv6_route_iter {
struct seq_net_private p;
struct fib6_walker w;
@@ -568,8 +614,13 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb,
struct netlink_ext_ack *extack);
void fib6_update_sernum(struct net *net, struct fib6_info *rt);
+#if IS_ENABLED(CONFIG_IPV6)
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt);
-void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i);
+#else
+static inline void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
+{
+}
+#endif
void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val);
static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
@@ -579,7 +630,7 @@ static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric)
void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
bool offload, bool trap, bool offload_failed);
-#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
+#if IS_ENABLED(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
struct bpf_iter__ipv6_route {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct fib6_info *, rt);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 6dbdf60b342f..09ffe0f13ce7 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -77,7 +77,14 @@ static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
f6i->fib6_nh->fib_nh_gw_family;
}
+#if IS_ENABLED(CONFIG_IPV6)
void ip6_route_input(struct sk_buff *skb);
+#else
+static inline void ip6_route_input(struct sk_buff *skb)
+{
+}
+#endif
+
struct dst_entry *ip6_route_input_lookup(struct net *net,
struct net_device *dev,
struct flowi6 *fl6,
@@ -119,7 +126,15 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd,
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
struct netlink_ext_ack *extack);
int ip6_ins_rt(struct net *net, struct fib6_info *f6i);
+#if IS_ENABLED(CONFIG_IPV6)
int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify);
+#else
+static inline int ip6_del_rt(struct net *net, struct fib6_info *f6i,
+ bool skip_notify)
+{
+ return -EAFNOSUPPORT;
+}
+#endif
void rt6_flush_exceptions(struct fib6_info *f6i);
void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args,
@@ -229,16 +244,16 @@ static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
* Store a destination cache entry in a socket
*/
static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+ bool daddr_set,
+ bool saddr_set)
{
struct ipv6_pinfo *np = inet6_sk(sk);
np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
sk_setup_caps(sk, dst);
- np->daddr_cache = daddr;
+ np->daddr_cache = daddr_set;
#ifdef CONFIG_IPV6_SUBTREES
- np->saddr_cache = saddr;
+ np->saddr_cache = saddr_set;
#endif
}
@@ -252,19 +267,43 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
return rt->rt6i_flags & RTF_LOCAL;
}
+static inline bool __ipv6_anycast_destination(const struct rt6key *rt6i_dst,
+ u32 rt6i_flags,
+ const struct in6_addr *daddr)
+{
+ return rt6i_flags & RTF_ANYCAST ||
+ (rt6i_dst->plen < 127 &&
+ !(rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) &&
+ ipv6_addr_equal(&rt6i_dst->addr, daddr));
+}
+
static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
const struct in6_addr *daddr)
{
const struct rt6_info *rt = dst_rt6_info(dst);
- return rt->rt6i_flags & RTF_ANYCAST ||
- (rt->rt6i_dst.plen < 127 &&
- !(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) &&
- ipv6_addr_equal(&rt->rt6i_dst.addr, daddr));
+ return __ipv6_anycast_destination(&rt->rt6i_dst, rt->rt6i_flags, daddr);
}
+#if IS_ENABLED(CONFIG_IPV6)
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int (*output)(struct net *, struct sock *, struct sk_buff *));
+#else
+static inline int ip6_fragment(struct net *net, struct sock *sk,
+ struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *,
+ struct sk_buff *))
+{
+ kfree_skb(skb);
+ return -EAFNOSUPPORT;
+}
+#endif
+
+/* Variant of dst_mtu() for IPv6 users */
+static inline u32 dst6_mtu(const struct dst_entry *dst)
+{
+ return INDIRECT_CALL_1(dst->ops->mtu, ip6_mtu, dst);
+}
static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
{
@@ -274,7 +313,7 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
unsigned int mtu;
if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) {
- mtu = READ_ONCE(dst->dev->mtu);
+ mtu = READ_ONCE(dst_dev(dst)->mtu);
mtu -= lwtunnel_headroom(dst->lwtstate, mtu);
} else {
mtu = dst_mtu(dst);
@@ -337,7 +376,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst
mtu = IPV6_MIN_MTU;
rcu_read_lock();
- idev = __in6_dev_get(dst->dev);
+ idev = __in6_dev_get(dst_dev_rcu(dst));
if (idev)
mtu = READ_ONCE(idev->cnf.mtu6);
rcu_read_unlock();
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 399592405c72..b99805ee2fd1 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -152,19 +152,34 @@ int ip6_tnl_get_iflink(const struct net_device *dev);
int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu);
static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb,
- struct net_device *dev)
+ struct net_device *dev, u16 ip6cb_flags)
{
int pkt_len, err;
+ if (unlikely(dev_recursion_level() > IP_TUNNEL_RECURSION_LIMIT)) {
+ if (dev) {
+ net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
+ dev->name);
+ DEV_STATS_INC(dev, tx_errors);
+ }
+ kfree_skb_reason(skb, SKB_DROP_REASON_RECURSION_LIMIT);
+ return;
+ }
+
+ dev_xmit_recursion_inc();
+
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+ IP6CB(skb)->flags = ip6cb_flags;
pkt_len = skb->len - skb_inner_network_offset(skb);
- err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
+ err = ip6_local_out(skb_dst_dev_net(skb), sk, skb);
if (dev) {
if (unlikely(net_xmit_eval(err)))
pkt_len = -1;
iptunnel_xmit_stats(dev, pkt_len);
}
+
+ dev_xmit_recursion_dec();
}
#endif
#endif
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index a113c11ab56b..318593743b6e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -162,6 +162,8 @@ struct fib_info {
struct fib_nh fib_nh[] __counted_by(fib_nhs);
};
+int __net_init fib4_semantics_init(struct net *net);
+void __net_exit fib4_semantics_exit(struct net *net);
#ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rule;
@@ -438,7 +440,7 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net,
static inline bool fib_dscp_masked_match(dscp_t dscp, const struct flowi4 *fl4)
{
- return dscp == inet_dsfield_to_dscp(RT_TOS(fl4->flowi4_tos));
+ return dscp == (fl4->flowi4_dscp & INET_DSCP_LEGACY_TOS_MASK);
}
/* Exported by fib_frontend.c */
@@ -557,7 +559,7 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net,
siphash_aligned_key_t hash_key;
u32 mp_seed;
- mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed).mp_seed;
+ mp_seed = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed.mp_seed);
fib_multipath_hash_construct_key(&hash_key, mp_seed);
return flow_hash_from_keys_seed(keys, &hash_key);
@@ -572,7 +574,8 @@ static inline u32 fib_multipath_hash_from_keys(const struct net *net,
int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
struct netlink_ext_ack *extack);
-void fib_select_multipath(struct fib_result *res, int hash);
+void fib_select_multipath(struct fib_result *res, int hash,
+ const struct flowi4 *fl4);
void fib_select_path(struct net *net, struct fib_result *res,
struct flowi4 *fl4, const struct sk_buff *skb);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 1aa31bdb2b31..d708b66e55cd 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -11,12 +11,15 @@
#include <linux/bitops.h>
#include <net/dsfield.h>
+#include <net/flow.h>
#include <net/gro_cells.h>
+#include <net/inet_dscp.h>
#include <net/inet_ecn.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/lwtunnel.h>
#include <net/dst_cache.h>
+#include <net/netdev_lock.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -24,6 +27,13 @@
#include <net/ip6_route.h>
#endif
+/* Recursion limit for tunnel xmit to detect routing loops.
+ * Unlike XMIT_RECURSION_LIMIT (8) used in the no-qdisc path, tunnel
+ * recursion involves route lookups and full IP output, consuming much
+ * more stack per level, so a lower limit is needed.
+ */
+#define IP_TUNNEL_RECURSION_LIMIT 5
+
/* Keep error state on tunnel for 30 sec */
#define IPTUNNEL_ERR_TIMEO (30*HZ)
@@ -95,8 +105,8 @@ struct ip_tunnel_encap {
#define ip_tunnel_info_opts(info) \
_Generic(info, \
- const struct ip_tunnel_info * : ((const void *)((info) + 1)),\
- struct ip_tunnel_info * : ((void *)((info) + 1))\
+ const struct ip_tunnel_info * : ((const void *)(info)->options),\
+ struct ip_tunnel_info * : ((void *)(info)->options)\
)
struct ip_tunnel_info {
@@ -107,6 +117,7 @@ struct ip_tunnel_info {
#endif
u8 options_len;
u8 mode;
+ u8 options[] __aligned_largest __counted_by(options_len);
};
/* 6rd prefix/relay information */
@@ -361,7 +372,7 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4,
fl4->daddr = daddr;
fl4->saddr = saddr;
- fl4->flowi4_tos = tos;
+ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos);
fl4->flowi4_proto = proto;
fl4->fl4_gre_key = key;
fl4->flowi4_mark = mark;
@@ -369,17 +380,26 @@ static inline void ip_tunnel_init_flow(struct flowi4 *fl4,
fl4->flowi4_flags = flow_flags;
}
-int ip_tunnel_init(struct net_device *dev);
+int __ip_tunnel_init(struct net_device *dev);
+#define ip_tunnel_init(DEV) \
+({ \
+ struct net_device *__dev = (DEV); \
+ int __res = __ip_tunnel_init(__dev); \
+ \
+ if (!__res) \
+ netdev_lockdep_set_classes(__dev);\
+ __res; \
+})
+
void ip_tunnel_uninit(struct net_device *dev);
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
struct net *ip_tunnel_get_link_net(const struct net_device *dev);
int ip_tunnel_get_iflink(const struct net_device *dev);
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname);
-
-void ip_tunnel_delete_nets(struct list_head *list_net, unsigned int id,
- struct rtnl_link_ops *ops,
- struct list_head *dev_to_kill);
+void ip_tunnel_delete_net(struct net *net, unsigned int id,
+ struct rtnl_link_ops *ops,
+ struct list_head *dev_to_kill);
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, const u8 protocol);
@@ -406,8 +426,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
bool log_ecn_error);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
struct ip_tunnel_parm_kern *p, __u32 fwmark);
-int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
- struct ip_tunnel_parm_kern *p, __u32 fwmark);
+int ip_tunnel_newlink(struct net *net, struct net_device *dev,
+ struct nlattr *tb[], struct ip_tunnel_parm_kern *p,
+ __u32 fwmark);
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
@@ -602,12 +623,27 @@ static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, u8 proto,
- u8 tos, u8 ttl, __be16 df, bool xnet);
+ u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
gfp_t flags);
int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
int headroom, bool reply);
+static inline void ip_tunnel_adj_headroom(struct net_device *dev,
+ unsigned int headroom)
+{
+ /* we must cap headroom to some upperlimit, else pskb_expand_head
+ * will overflow header offsets in skb_headers_offset_update().
+ */
+ const unsigned int max_allowed = 512;
+
+ if (headroom > max_allowed)
+ headroom = max_allowed;
+
+ if (headroom > READ_ONCE(dev->needed_headroom))
+ WRITE_ONCE(dev->needed_headroom, headroom);
+}
+
int iptunnel_handle_offloads(struct sk_buff *skb, int gso_type_mask);
static inline int iptunnel_pull_offloads(struct sk_buff *skb)
@@ -629,13 +665,29 @@ static inline int iptunnel_pull_offloads(struct sk_buff *skb)
static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
{
if (pkt_len > 0) {
- struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- u64_stats_add(&tstats->tx_bytes, pkt_len);
- u64_stats_inc(&tstats->tx_packets);
- u64_stats_update_end(&tstats->syncp);
- put_cpu_ptr(tstats);
+ if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
+ struct pcpu_dstats *dstats = get_cpu_ptr(dev->dstats);
+
+ u64_stats_update_begin(&dstats->syncp);
+ u64_stats_add(&dstats->tx_bytes, pkt_len);
+ u64_stats_inc(&dstats->tx_packets);
+ u64_stats_update_end(&dstats->syncp);
+ put_cpu_ptr(dstats);
+ return;
+ }
+ if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
+ struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ u64_stats_add(&tstats->tx_bytes, pkt_len);
+ u64_stats_inc(&tstats->tx_packets);
+ u64_stats_update_end(&tstats->syncp);
+ put_cpu_ptr(tstats);
+ return;
+ }
+ pr_err_once("iptunnel_xmit_stats pcpu_stat_type=%d\n",
+ dev->pcpu_stat_type);
+ WARN_ON_ONCE(1);
return;
}
@@ -650,7 +702,7 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
static inline void ip_tunnel_info_opts_get(void *to,
const struct ip_tunnel_info *info)
{
- memcpy(to, info + 1, info->options_len);
+ memcpy(to, ip_tunnel_info_opts(info), info->options_len);
}
static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index ff406ef4fd4a..e517eaaa177b 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -11,6 +11,7 @@
#include <asm/types.h> /* for __uXX types */
#include <linux/list.h> /* for struct list_head */
+#include <linux/rculist_bl.h> /* for struct hlist_bl_head */
#include <linux/spinlock.h> /* for struct rwlock_t */
#include <linux/atomic.h> /* for struct atomic_t */
#include <linux/refcount.h> /* for struct refcount_t */
@@ -30,10 +31,23 @@
#endif
#include <net/net_namespace.h> /* Netw namespace */
#include <linux/sched/isolation.h>
+#include <linux/siphash.h>
#define IP_VS_HDR_INVERSE 1
#define IP_VS_HDR_ICMP 2
+/* conn_tab limits (as per Kconfig) */
+#define IP_VS_CONN_TAB_MIN_BITS 8
+#if BITS_PER_LONG > 32
+#define IP_VS_CONN_TAB_MAX_BITS 27
+#else
+#define IP_VS_CONN_TAB_MAX_BITS 20
+#endif
+
+/* svc_table limits */
+#define IP_VS_SVC_TAB_MIN_BITS 4
+#define IP_VS_SVC_TAB_MAX_BITS 20
+
/* Generic access of ipvs struct */
static inline struct netns_ipvs *net_ipvs(struct net* net)
{
@@ -43,8 +57,6 @@ static inline struct netns_ipvs *net_ipvs(struct net* net)
/* Connections' size value needed by ip_vs_ctl.c */
extern int ip_vs_conn_tab_size;
-extern struct mutex __ip_vs_mutex;
-
struct ip_vs_iphdr {
int hdr_flags; /* ipvs flags */
__u32 off; /* Where IP or IPv4 header starts */
@@ -265,6 +277,29 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
pr_err(msg, ##__VA_ARGS__); \
} while (0)
+struct ip_vs_aligned_lock {
+ spinlock_t l; /* Protect buckets */
+} ____cacheline_aligned_in_smp;
+
+/* For arrays per family */
+enum {
+ IP_VS_AF_INET,
+ IP_VS_AF_INET6,
+ IP_VS_AF_MAX
+};
+
+static inline int ip_vs_af_index(int af)
+{
+ return af == AF_INET6 ? IP_VS_AF_INET6 : IP_VS_AF_INET;
+}
+
+/* work_flags */
+enum {
+ IP_VS_WORK_SVC_RESIZE, /* Schedule svc_resize_work */
+ IP_VS_WORK_SVC_NORESIZE, /* Stopping svc_resize_work */
+ IP_VS_WORK_CONN_RESIZE, /* Schedule conn_resize_work */
+};
+
/* The port number of FTP service (in network order). */
#define FTPPORT cpu_to_be16(21)
#define FTPDATA cpu_to_be16(20)
@@ -456,6 +491,7 @@ struct ip_vs_est_kt_data {
DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */
unsigned long est_timer; /* estimation timer (jiffies) */
struct ip_vs_stats *calc_stats; /* Used for calculation */
+ int needed; /* task is needed */
int tick_len[IPVS_EST_NTICKS]; /* est count */
int id; /* ktid per netns */
int chain_max; /* max ests per tick chain */
@@ -466,6 +502,198 @@ struct ip_vs_est_kt_data {
int est_row; /* estimated row */
};
+/* IPVS resizable hash tables */
+struct ip_vs_rht {
+ struct hlist_bl_head *buckets;
+ struct ip_vs_rht __rcu *new_tbl; /* New/Same table */
+ seqcount_t *seqc; /* Protects moves */
+ struct ip_vs_aligned_lock *lock; /* Protect seqc */
+ int mask; /* Buckets mask */
+ int size; /* Buckets */
+ int seqc_mask; /* seqc mask */
+ int lock_mask; /* lock mask */
+ u32 table_id;
+ int u_thresh; /* upper threshold */
+ int l_thresh; /* lower threshold */
+ int lfactor; /* Load Factor (shift)*/
+ int bits; /* size = 1 << bits */
+ siphash_key_t hash_key;
+ struct rcu_head rcu_head;
+};
+
+/**
+ * ip_vs_rht_for_each_table() - Walk the hash tables
+ * @table: struct ip_vs_rht __rcu *table
+ * @t: current table, used as cursor, struct ip_vs_rht *var
+ * @p: previous table, temp struct ip_vs_rht *var
+ *
+ * Walk tables assuming others can not change the installed tables
+ */
+#define ip_vs_rht_for_each_table(table, t, p) \
+ for (p = NULL, t = rcu_dereference_protected(table, 1); \
+ t != p; \
+ p = t, t = rcu_dereference_protected(t->new_tbl, 1))
+
+/**
+ * ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock
+ * @table: struct ip_vs_rht __rcu *table
+ * @t: current table, used as cursor, struct ip_vs_rht *var
+ * @p: previous table, temp struct ip_vs_rht *var
+ *
+ * We usually search in one table and also in second table on resizing
+ */
+#define ip_vs_rht_for_each_table_rcu(table, t, p) \
+ for (p = NULL, t = rcu_dereference(table); \
+ t != p; \
+ p = t, t = rcu_dereference(t->new_tbl))
+
+/**
+ * ip_vs_rht_for_each_bucket() - Walk all table buckets
+ * @t: current table, used as cursor, struct ip_vs_rht *var
+ * @bucket: bucket index, used as cursor, u32 var
+ * @head: bucket address, used as cursor, struct hlist_bl_head *var
+ */
+#define ip_vs_rht_for_each_bucket(t, bucket, head) \
+ for (bucket = 0, head = (t)->buckets; \
+ bucket < t->size; bucket++, head++)
+
+/**
+ * ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved
+ * @t: current table, used as cursor, struct ip_vs_rht *var
+ * @bucket: index of current bucket or hash key
+ * @sc: temp seqcount_t *var
+ * @seq: temp unsigned int var for sequence count
+ * @retry: temp int var
+ */
+#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \
+ for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \
+ retry && ({ seq = read_seqcount_begin(sc); 1; }); \
+ retry = read_seqcount_retry(sc, seq))
+
+/**
+ * DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables
+ *
+ * Variables for ip_vs_rht_walk_buckets_rcu
+ */
+#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \
+ struct ip_vs_rht *_t, *_p; \
+ unsigned int _seq; \
+ seqcount_t *_sc; \
+ u32 _bucket; \
+ int _retry
+/**
+ * ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock
+ * @table: struct ip_vs_rht __rcu *table
+ * @head: bucket address, used as cursor, struct hlist_bl_head *var
+ *
+ * Can be used while others add/delete/move entries
+ * Not suitable if duplicates are not desired
+ * Possible cases for reader that uses cond_resched_rcu() in the loop:
+ * - new table can not be installed, no need to repeat
+ * - new table can be installed => check and repeat if new table is
+ * installed, needed for !PREEMPT_RCU
+ */
+#define ip_vs_rht_walk_buckets_rcu(table, head) \
+ ip_vs_rht_for_each_table_rcu(table, _t, _p) \
+ ip_vs_rht_for_each_bucket(_t, _bucket, head) \
+ ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \
+ _seq, _retry)
+
+/**
+ * DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables
+ *
+ * Variables for ip_vs_rht_walk_bucket_rcu
+ */
+#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \
+ unsigned int _seq; \
+ seqcount_t *_sc; \
+ int _retry
+/**
+ * ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock
+ * @t: current table, struct ip_vs_rht *var
+ * @bucket: index of current bucket or hash key
+ * @head: bucket address, used as cursor, struct hlist_bl_head *var
+ *
+ * Can be used while others add/delete/move entries
+ * Not suitable if duplicates are not desired
+ * Possible cases for reader that uses cond_resched_rcu() in the loop:
+ * - new table can not be installed, no need to repeat
+ * - new table can be installed => check and repeat if new table is
+ * installed, needed for !PREEMPT_RCU
+ */
+#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \
+ if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \
+ {} \
+ else \
+ ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry)
+
+/**
+ * DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables
+ *
+ * Variables for ip_vs_rht_walk_buckets_safe_rcu
+ */
+#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \
+ struct ip_vs_rht *_t, *_p; \
+ u32 _bucket
+/**
+ * ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock
+ * @table: struct ip_vs_rht __rcu *table
+ * @head: bucket address, used as cursor, struct hlist_bl_head *var
+ *
+ * Can be used while others add/delete entries but moving is disabled
+ * Using cond_resched_rcu() should be safe if tables do not change
+ */
+#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \
+ ip_vs_rht_for_each_table_rcu(table, _t, _p) \
+ ip_vs_rht_for_each_bucket(_t, _bucket, head)
+
+/**
+ * DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables
+ *
+ * Variables for ip_vs_rht_walk_buckets
+ */
+#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \
+ struct ip_vs_rht *_t, *_p; \
+ u32 _bucket
+
+/**
+ * ip_vs_rht_walk_buckets() - Walk all buckets
+ * @table: struct ip_vs_rht __rcu *table
+ * @head: bucket address, used as cursor, struct hlist_bl_head *var
+ *
+ * Use if others can not add/delete/move entries
+ */
+#define ip_vs_rht_walk_buckets(table, head) \
+ ip_vs_rht_for_each_table(table, _t, _p) \
+ ip_vs_rht_for_each_bucket(_t, _bucket, head)
+
+/* Entries can be in one of two tables, so we flip bit when new table is
+ * created and store it as highest bit in hash keys
+ */
+#define IP_VS_RHT_TABLE_ID_MASK BIT(31)
+
+/* Check if hash key is from this table */
+static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key)
+{
+ return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK);
+}
+
+/* Build per-table hash key from hash value */
+static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash)
+{
+ return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK);
+}
+
+void ip_vs_rht_free(struct ip_vs_rht *t);
+void ip_vs_rht_rcu_free(struct rcu_head *head);
+struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks);
+int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
+ int lfactor, int min_bits, int max_bits);
+void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor,
+ int min_bits, int max_bits);
+u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af,
+ const union nf_inet_addr *addr, u32 v1, u32 v2);
+
struct dst_entry;
struct iphdr;
struct ip_vs_conn;
@@ -559,50 +787,48 @@ struct ip_vs_conn_param {
__u8 pe_data_len;
};
+/* Hash node in conn_tab */
+struct ip_vs_conn_hnode {
+ struct hlist_bl_node node; /* node in conn_tab */
+ u32 hash_key; /* Key for the hash table */
+ u8 dir; /* 0=out->in, 1=in->out */
+} __packed;
+
/* IP_VS structure allocated for each dynamically scheduled connection */
struct ip_vs_conn {
- struct hlist_node c_list; /* hashed list heads */
- /* Protocol, addresses and port numbers */
+ /* Cacheline for hash table nodes - rarely modified */
+
+ struct ip_vs_conn_hnode hn0; /* Original direction */
+ u8 af; /* address family */
__be16 cport;
+ struct ip_vs_conn_hnode hn1; /* Reply direction */
+ u8 daf; /* Address family of the dest */
__be16 dport;
- __be16 vport;
- u16 af; /* address family */
- union nf_inet_addr caddr; /* client address */
- union nf_inet_addr vaddr; /* virtual address */
- union nf_inet_addr daddr; /* destination address */
+ struct ip_vs_dest *dest; /* real server */
+ atomic_t n_control; /* Number of controlled ones */
volatile __u32 flags; /* status flags */
- __u16 protocol; /* Which protocol (TCP/UDP) */
- __u16 daf; /* Address family of the dest */
- struct netns_ipvs *ipvs;
-
- /* counter and timer */
- refcount_t refcnt; /* reference count */
- struct timer_list timer; /* Expiration timer */
- volatile unsigned long timeout; /* timeout */
+ /* 44/64 */
- /* Flags and state transition */
- spinlock_t lock; /* lock for state transition */
+ struct ip_vs_conn *control; /* Master control connection */
+ const struct ip_vs_pe *pe;
+ char *pe_data;
+ __u8 pe_data_len;
volatile __u16 state; /* state info */
volatile __u16 old_state; /* old state, to be used for
* state transition triggered
* synchronization
*/
- __u32 fwmark; /* Fire wall mark from skb */
- unsigned long sync_endtime; /* jiffies + sent_retries */
+ /* 2-byte hole */
+ /* 64/96 */
- /* Control members */
- struct ip_vs_conn *control; /* Master control connection */
- atomic_t n_control; /* Number of controlled ones */
- struct ip_vs_dest *dest; /* real server */
- atomic_t in_pkts; /* incoming packet counter */
+ union nf_inet_addr caddr; /* client address */
+ union nf_inet_addr vaddr; /* virtual address */
+ /* 96/128 */
- /* Packet transmitter for different forwarding methods. If it
- * mangles the packet, it must return NF_DROP or better NF_STOLEN,
- * otherwise this must be changed to a sk_buff **.
- * NF_ACCEPT can be returned when destination is local.
- */
- int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
+ union nf_inet_addr daddr; /* destination address */
+ __u32 fwmark; /* Fire wall mark from skb */
+ __be16 vport;
+ __u16 protocol; /* Which protocol (TCP/UDP) */
/* Note: we can group the following members into a structure,
* in order to save more space, and the following members are
@@ -610,14 +836,31 @@ struct ip_vs_conn {
*/
struct ip_vs_app *app; /* bound ip_vs_app object */
void *app_data; /* Application private data */
+ /* 128/168 */
struct_group(sync_conn_opt,
struct ip_vs_seq in_seq; /* incoming seq. struct */
struct ip_vs_seq out_seq; /* outgoing seq. struct */
);
+ /* 152/192 */
- const struct ip_vs_pe *pe;
- char *pe_data;
- __u8 pe_data_len;
+ struct timer_list timer; /* Expiration timer */
+ volatile unsigned long timeout; /* timeout */
+ spinlock_t lock; /* lock for state transition */
+ refcount_t refcnt; /* reference count */
+ atomic_t in_pkts; /* incoming packet counter */
+ /* 64-bit: 4-byte gap */
+
+ /* 188/256 */
+ unsigned long sync_endtime; /* jiffies + sent_retries */
+ struct netns_ipvs *ipvs;
+
+ /* Packet transmitter for different forwarding methods. If it
+ * mangles the packet, it must return NF_DROP or better NF_STOLEN,
+ * otherwise this must be changed to a sk_buff **.
+ * NF_ACCEPT can be returned when destination is local.
+ */
+ int (*packet_xmit)(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph);
struct rcu_head rcu_head;
};
@@ -673,15 +916,15 @@ struct ip_vs_dest_user_kern {
* forwarding entries.
*/
struct ip_vs_service {
- struct hlist_node s_list; /* for normal service table */
- struct hlist_node f_list; /* for fwmark-based service table */
- atomic_t refcnt; /* reference counter */
-
+ struct hlist_bl_node s_list; /* node in service table */
+ u32 hash_key; /* Key for the hash table */
u16 af; /* address family */
__u16 protocol; /* which protocol (TCP/UDP) */
+
union nf_inet_addr addr; /* IP address for virtual service */
- __be16 port; /* port number for the service */
__u32 fwmark; /* firewall mark of the service */
+ atomic_t refcnt; /* reference counter */
+ __be16 port; /* port number for the service */
unsigned int flags; /* service status flags */
unsigned int timeout; /* persistent timeout in ticks */
__be32 netmask; /* grouping granularity, mask/plen */
@@ -791,8 +1034,8 @@ struct ip_vs_pe {
int (*fill_param)(struct ip_vs_conn_param *p, struct sk_buff *skb);
bool (*ct_match)(const struct ip_vs_conn_param *p,
struct ip_vs_conn *ct);
- u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval,
- bool inverse);
+ u32 (*hashkey_raw)(const struct ip_vs_conn_param *p,
+ struct ip_vs_rht *t, bool inverse);
int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf);
/* create connections for real-server outgoing packets */
struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc,
@@ -931,21 +1174,28 @@ struct netns_ipvs {
#endif
/* ip_vs_conn */
atomic_t conn_count; /* connection counter */
+ atomic_t no_cport_conns[IP_VS_AF_MAX];
+ struct delayed_work conn_resize_work;/* resize conn_tab */
/* ip_vs_ctl */
struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */
- int num_services; /* no of virtual services */
- int num_services6; /* IPv6 virtual services */
-
/* Trash for destinations */
struct list_head dest_trash;
spinlock_t dest_trash_lock;
struct timer_list dest_trash_timer; /* expiration timer */
+ struct mutex service_mutex; /* service reconfig */
+ struct rw_semaphore svc_resize_sem; /* svc_table resizing */
+ struct rw_semaphore svc_replace_sem; /* svc_table replace */
+ struct delayed_work svc_resize_work; /* resize svc_table */
+ atomic_t svc_table_changes;/* ++ on table changes */
/* Service counters */
- atomic_t ftpsvc_counter;
- atomic_t nullsvc_counter;
- atomic_t conn_out_counter;
+ atomic_t num_services[IP_VS_AF_MAX]; /* Services */
+ atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */
+ atomic_t nonfwm_services[IP_VS_AF_MAX];/* Services */
+ atomic_t ftpsvc_counter[IP_VS_AF_MAX]; /* FTPPORT */
+ atomic_t nullsvc_counter[IP_VS_AF_MAX];/* Zero port */
+ atomic_t conn_out_counter[IP_VS_AF_MAX];/* out conn */
#ifdef CONFIG_SYSCTL
/* delayed work for expiring no dest connections */
@@ -956,6 +1206,7 @@ struct netns_ipvs {
int drop_counter;
int old_secure_tcp;
atomic_t dropentry;
+ s8 dropentry_counters[8];
/* locks in ctl.c */
spinlock_t dropentry_lock; /* drop entry handling */
spinlock_t droppacket_lock; /* drop packet handling */
@@ -1002,6 +1253,8 @@ struct netns_ipvs {
int sysctl_est_nice; /* kthread nice */
int est_stopped; /* stop tasks */
#endif
+ int sysctl_conn_lfactor;
+ int sysctl_svc_lfactor;
/* ip_vs_lblc */
int sysctl_lblc_expiration;
@@ -1011,6 +1264,7 @@ struct netns_ipvs {
int sysctl_lblcr_expiration;
struct ctl_table_header *lblcr_ctl_header;
struct ctl_table *lblcr_ctl_table;
+ unsigned long work_flags; /* IP_VS_WORK_* flags */
/* ip_vs_est */
struct delayed_work est_reload_work;/* Reload kthread tasks */
struct mutex est_mutex; /* protect kthread tasks */
@@ -1041,6 +1295,10 @@ struct netns_ipvs {
*/
unsigned int mixed_address_family_dests;
unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */
+
+ struct ip_vs_rht __rcu *svc_table; /* Services */
+ struct ip_vs_rht __rcu *conn_tab; /* Connections */
+ atomic_t conn_tab_changes;/* ++ on new table */
};
#define DEFAULT_SYNC_THRESHOLD 3
@@ -1155,7 +1413,7 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs)
return ipvs->sysctl_run_estimation;
}
-static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs)
{
if (ipvs->est_cpulist_valid)
return ipvs->sysctl_est_cpulist;
@@ -1163,6 +1421,14 @@ static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
return housekeeping_cpumask(HK_TYPE_KTHREAD);
}
+static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
+{
+ if (ipvs->est_cpulist_valid)
+ return ipvs->sysctl_est_cpulist;
+ else
+ return NULL;
+}
+
static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
{
return ipvs->sysctl_est_nice;
@@ -1265,11 +1531,16 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs)
return 1;
}
-static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs)
+static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs)
{
return housekeeping_cpumask(HK_TYPE_KTHREAD);
}
+static inline const struct cpumask *sysctl_est_preferred_cpulist(struct netns_ipvs *ipvs)
+{
+ return NULL;
+}
+
static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
{
return IPVS_EST_NICE;
@@ -1277,6 +1548,36 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
#endif
+/* Get load factor to map conn_count/u_thresh to t->size */
+static inline int sysctl_conn_lfactor(struct netns_ipvs *ipvs)
+{
+ return READ_ONCE(ipvs->sysctl_conn_lfactor);
+}
+
+/* Get load factor to map num_services/u_thresh to t->size
+ * Smaller value decreases u_thresh to reduce collisions but increases
+ * the table size
+ * Returns factor where:
+ * - <0: u_thresh = size >> -factor, eg. lfactor -2 = 25% load
+ * - >=0: u_thresh = size << factor, eg. lfactor 1 = 200% load
+ */
+static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs)
+{
+ return READ_ONCE(ipvs->sysctl_svc_lfactor);
+}
+
+static inline bool sysctl_est_cpulist_empty(struct netns_ipvs *ipvs)
+{
+ guard(rcu)();
+ return cpumask_empty(__sysctl_est_cpulist(ipvs));
+}
+
+static inline unsigned int sysctl_est_cpulist_weight(struct netns_ipvs *ipvs)
+{
+ guard(rcu)();
+ return cpumask_weight(__sysctl_est_cpulist(ipvs));
+}
+
/* IPVS core functions
* (from ip_vs_core.c)
*/
@@ -1350,6 +1651,23 @@ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp)
}
void ip_vs_conn_put(struct ip_vs_conn *cp);
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport);
+int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
+ int lfactor);
+struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets,
+ int lfactor);
+
+static inline struct ip_vs_conn *
+ip_vs_hn0_to_conn(struct ip_vs_conn_hnode *hn)
+{
+ return container_of(hn, struct ip_vs_conn, hn0);
+}
+
+static inline struct ip_vs_conn *
+ip_vs_hn_to_conn(struct ip_vs_conn_hnode *hn)
+{
+ return hn->dir ? container_of(hn, struct ip_vs_conn, hn1) :
+ container_of(hn, struct ip_vs_conn, hn0);
+}
struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
const union nf_inet_addr *daddr,
@@ -1506,8 +1824,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler);
int ip_vs_bind_scheduler(struct ip_vs_service *svc,
struct ip_vs_scheduler *scheduler);
-void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
- struct ip_vs_scheduler *sched);
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc);
struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
struct ip_vs_conn *
@@ -1580,18 +1897,26 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats);
void ip_vs_zero_estimator(struct ip_vs_stats *stats);
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats);
-void ip_vs_est_reload_start(struct netns_ipvs *ipvs);
+void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart);
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
struct ip_vs_est_kt_data *kd);
void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd);
+static inline void ip_vs_stop_estimator_tot_stats(struct netns_ipvs *ipvs)
+{
+#ifdef CONFIG_SYSCTL
+ ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
+ ipvs->tot_stats->s.est.ktid = -2;
+#endif
+}
+
static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs)
{
#ifdef CONFIG_SYSCTL
/* Stop tasks while cpulist is empty or if disabled with flag */
ipvs->est_stopped = !sysctl_run_estimation(ipvs) ||
(ipvs->est_cpulist_valid &&
- cpumask_empty(sysctl_est_cpulist(ipvs)));
+ sysctl_est_cpulist_empty(ipvs));
#endif
}
@@ -1607,7 +1932,7 @@ static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs)
static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs)
{
unsigned int limit = IPVS_EST_CPU_KTHREADS *
- cpumask_weight(sysctl_est_cpulist(ipvs));
+ sysctl_est_cpulist_weight(ipvs);
return max(1U, limit);
}
@@ -1703,6 +2028,13 @@ static inline char ip_vs_fwd_tag(struct ip_vs_conn *cp)
return fwd;
}
+/* Check if connection uses double hashing */
+static inline bool ip_vs_conn_use_hash2(struct ip_vs_conn *cp)
+{
+ return IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ &&
+ !(cp->flags & IP_VS_CONN_F_TEMPLATE);
+}
+
void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int dir);
diff --git a/include/net/ipcomp.h b/include/net/ipcomp.h
index 8660a2a6d1fc..51401f01e2a5 100644
--- a/include/net/ipcomp.h
+++ b/include/net/ipcomp.h
@@ -3,20 +3,9 @@
#define _NET_IPCOMP_H
#include <linux/skbuff.h>
-#include <linux/types.h>
-
-#define IPCOMP_SCRATCH_SIZE 65400
-
-struct crypto_comp;
-struct ip_comp_hdr;
-
-struct ipcomp_data {
- u16 threshold;
- struct crypto_comp * __percpu *tfms;
-};
struct ip_comp_hdr;
-struct sk_buff;
+struct netlink_ext_ack;
struct xfrm_state;
int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index f5c43ad1565e..1dec81faff28 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -25,8 +25,6 @@ struct ip_tunnel_info;
#define SIN6_LEN_RFC2133 24
-#define IPV6_MAXPLEN 65535
-
/*
* NextHeader field of IPv6 header
*/
@@ -92,6 +90,9 @@ struct ip_tunnel_info;
#define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */
#define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */
+/* Hard limit on traversed IPv6 extension headers */
+#define IP6_MAX_EXT_HDRS_CNT 12
+
/*
* Addr type
*
@@ -151,17 +152,6 @@ struct frag_hdr {
__be32 identification;
};
-/*
- * Jumbo payload option, as described in RFC 2675 2.
- */
-struct hop_jumbo_hdr {
- u8 nexthdr;
- u8 hdrlen;
- u8 tlv_type; /* IPV6_TLV_JUMBO, 0xC2 */
- u8 tlv_len; /* 4 */
- __be32 jumbo_payload_len;
-};
-
#define IP6_MF 0x0001
#define IP6_OFFSET 0xFFF8
@@ -246,17 +236,20 @@ extern int sysctl_mld_qrv;
#define _DEVADD(net, statname, mod, idev, field, val) \
({ \
struct inet6_dev *_idev = (idev); \
+ unsigned long _field = (field); \
+ unsigned long _val = (val); \
if (likely(_idev != NULL)) \
- mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \
- mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\
+ mod##SNMP_ADD_STATS((_idev)->stats.statname, _field, _val); \
+ mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, _field, _val);\
})
#define _DEVUPD(net, statname, mod, idev, field, val) \
({ \
struct inet6_dev *_idev = (idev); \
+ unsigned long _val = (val); \
if (likely(_idev != NULL)) \
- mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \
- mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
+ mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, _val); \
+ mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, _val);\
})
/* MIBs */
@@ -363,15 +356,6 @@ struct ipcm6_cookie {
struct ipv6_txoptions *opt;
};
-static inline void ipcm6_init(struct ipcm6_cookie *ipc6)
-{
- *ipc6 = (struct ipcm6_cookie) {
- .hlimit = -1,
- .tclass = -1,
- .dontfrag = -1,
- };
-}
-
static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6,
const struct sock *sk)
{
@@ -380,6 +364,8 @@ static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6,
.tclass = inet6_sk(sk)->tclass,
.dontfrag = inet6_test_bit(DONTFRAG, sk),
};
+
+ sockcm_init(&ipc6->sockc, sk);
}
static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np)
@@ -468,72 +454,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
struct ipv6_txoptions *opt);
-/* This helper is specialized for BIG TCP needs.
- * It assumes the hop_jumbo_hdr will immediately follow the IPV6 header.
- * It assumes headers are already in skb->head.
- * Returns: 0, or IPPROTO_TCP if a BIG TCP packet is there.
- */
-static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb)
-{
- const struct hop_jumbo_hdr *jhdr;
- const struct ipv6hdr *nhdr;
-
- if (likely(skb->len <= GRO_LEGACY_MAX_SIZE))
- return 0;
-
- if (skb->protocol != htons(ETH_P_IPV6))
- return 0;
-
- if (skb_network_offset(skb) +
- sizeof(struct ipv6hdr) +
- sizeof(struct hop_jumbo_hdr) > skb_headlen(skb))
- return 0;
-
- nhdr = ipv6_hdr(skb);
-
- if (nhdr->nexthdr != NEXTHDR_HOP)
- return 0;
-
- jhdr = (const struct hop_jumbo_hdr *) (nhdr + 1);
- if (jhdr->tlv_type != IPV6_TLV_JUMBO || jhdr->hdrlen != 0 ||
- jhdr->nexthdr != IPPROTO_TCP)
- return 0;
- return jhdr->nexthdr;
-}
-
-/* Return 0 if HBH header is successfully removed
- * Or if HBH removal is unnecessary (packet is not big TCP)
- * Return error to indicate dropping the packet
- */
-static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb)
-{
- const int hophdr_len = sizeof(struct hop_jumbo_hdr);
- int nexthdr = ipv6_has_hopopt_jumbo(skb);
- struct ipv6hdr *h6;
-
- if (!nexthdr)
- return 0;
-
- if (skb_cow_head(skb, 0))
- return -1;
-
- /* Remove the HBH header.
- * Layout: [Ethernet header][IPv6 header][HBH][L4 Header]
- */
- memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb),
- skb_network_header(skb) - skb_mac_header(skb) +
- sizeof(struct ipv6hdr));
-
- __skb_pull(skb, hophdr_len);
- skb->network_header += hophdr_len;
- skb->mac_header += hophdr_len;
-
- h6 = ipv6_hdr(skb);
- h6->nexthdr = nexthdr;
-
- return 0;
-}
-
static inline bool ipv6_accept_ra(const struct inet6_dev *idev)
{
s32 accept_ra = READ_ONCE(idev->cnf.accept_ra);
@@ -935,10 +855,10 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow,
#if IS_ENABLED(CONFIG_IPV6)
-static inline bool ipv6_can_nonlocal_bind(struct net *net,
- struct inet_sock *inet)
+static inline bool ipv6_can_nonlocal_bind(const struct net *net,
+ const struct inet_sock *inet)
{
- return net->ipv6.sysctl.ip_nonlocal_bind ||
+ return READ_ONCE(net->ipv6.sysctl.ip_nonlocal_bind) ||
test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) ||
test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags);
}
@@ -953,10 +873,12 @@ static inline bool ipv6_can_nonlocal_bind(struct net *net,
#define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT
-static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
+static inline __be32 ip6_make_flowlabel(const struct net *net,
+ struct sk_buff *skb,
__be32 flowlabel, bool autolabel,
struct flowi6 *fl6)
{
+ u8 auto_flowlabels;
u32 hash;
/* @flowlabel may include more than a flow label, eg, the traffic class.
@@ -964,10 +886,12 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
*/
flowlabel &= IPV6_FLOWLABEL_MASK;
- if (flowlabel ||
- net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF ||
- (!autolabel &&
- net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED))
+ if (flowlabel)
+ return flowlabel;
+
+ auto_flowlabels = READ_ONCE(net->ipv6.sysctl.auto_flowlabels);
+ if (auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF ||
+ (!autolabel && auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED))
return flowlabel;
hash = skb_get_hash_flowi6(skb, fl6);
@@ -980,15 +904,15 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
- if (net->ipv6.sysctl.flowlabel_state_ranges)
+ if (READ_ONCE(net->ipv6.sysctl.flowlabel_state_ranges))
flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG;
return flowlabel;
}
-static inline int ip6_default_np_autolabel(struct net *net)
+static inline int ip6_default_np_autolabel(const struct net *net)
{
- switch (net->ipv6.sysctl.auto_flowlabels) {
+ switch (READ_ONCE(net->ipv6.sysctl.auto_flowlabels)) {
case IP6_AUTO_FLOW_LABEL_OFF:
case IP6_AUTO_FLOW_LABEL_OPTIN:
default:
@@ -999,13 +923,13 @@ static inline int ip6_default_np_autolabel(struct net *net)
}
}
#else
-static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb,
+static inline __be32 ip6_make_flowlabel(const struct net *net, struct sk_buff *skb,
__be32 flowlabel, bool autolabel,
struct flowi6 *fl6)
{
return flowlabel;
}
-static inline int ip6_default_np_autolabel(struct net *net)
+static inline int ip6_default_np_autolabel(const struct net *net)
{
return 0;
}
@@ -1014,11 +938,11 @@ static inline int ip6_default_np_autolabel(struct net *net)
#if IS_ENABLED(CONFIG_IPV6)
static inline int ip6_multipath_hash_policy(const struct net *net)
{
- return net->ipv6.sysctl.multipath_hash_policy;
+ return READ_ONCE(net->ipv6.sysctl.multipath_hash_policy);
}
static inline u32 ip6_multipath_hash_fields(const struct net *net)
{
- return net->ipv6.sysctl.multipath_hash_fields;
+ return READ_ONCE(net->ipv6.sysctl.multipath_hash_fields);
}
#else
static inline int ip6_multipath_hash_policy(const struct net *net)
@@ -1107,8 +1031,7 @@ void ip6_flush_pending_frames(struct sock *sk);
int ip6_send_skb(struct sk_buff *skb);
struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue,
- struct inet_cork_full *cork,
- struct inet6_cork *v6_cork);
+ struct inet_cork_full *cork);
struct sk_buff *ip6_make_skb(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
@@ -1119,14 +1042,23 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
{
- return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork,
- &inet6_sk(sk)->cork);
+ return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
}
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
struct flowi6 *fl6);
+#if IS_ENABLED(CONFIG_IPV6)
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
const struct in6_addr *final_dst);
+#else
+static inline struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk,
+ struct flowi6 *fl6,
+ const struct in6_addr *final_dst)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+#endif
+
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
const struct in6_addr *final_dst,
bool connected);
@@ -1151,11 +1083,11 @@ int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
* Extension header (options) processing
*/
-void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
- u8 *proto, struct in6_addr **daddr_p,
- struct in6_addr *saddr);
-void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
- u8 *proto);
+u8 ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
+ u8 proto, struct in6_addr **daddr_p,
+ struct in6_addr *saddr);
+u8 ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
+ u8 proto);
int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp,
__be16 *frag_offp);
@@ -1174,9 +1106,19 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target,
int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type);
-struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
- const struct ipv6_txoptions *opt,
- struct in6_addr *orig);
+struct in6_addr *__fl6_update_dst(struct flowi6 *fl6,
+ const struct ipv6_txoptions *opt,
+ struct in6_addr *orig);
+
+static inline struct in6_addr *
+fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt,
+ struct in6_addr *orig)
+{
+ if (likely(!opt))
+ return NULL;
+
+ return __fl6_update_dst(fl6, opt, orig);
+}
/*
* socket options (ipv6_sockglue.c)
@@ -1192,18 +1134,16 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
int ipv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
-int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr,
+int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr,
int addr_len);
-int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len);
-int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr,
+int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len);
+int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *addr,
int addr_len);
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr);
void ip6_datagram_release_cb(struct sock *sk);
-int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
- int *addr_len);
-int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
- int *addr_len);
+int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len);
+int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len);
void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port,
u32 info, u8 *payload);
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info);
@@ -1212,8 +1152,10 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu);
void inet6_cleanup_sock(struct sock *sk);
void inet6_sock_destruct(struct sock *sk);
int inet6_release(struct socket *sock);
-int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
-int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int __inet6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
+ u32 flags);
+int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len);
+int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
int peer);
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
@@ -1252,8 +1194,6 @@ int tcp6_proc_init(struct net *net);
void tcp6_proc_exit(struct net *net);
int udp6_proc_init(struct net *net);
void udp6_proc_exit(struct net *net);
-int udplite6_proc_init(void);
-void udplite6_proc_exit(void);
int ipv6_misc_proc_init(void);
void ipv6_misc_proc_exit(void);
int snmp6_register_dev(struct inet6_dev *idev);
@@ -1284,12 +1224,15 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
static inline int ip6_sock_set_v6only(struct sock *sk)
{
- if (inet_sk(sk)->inet_num)
- return -EINVAL;
+ int ret = 0;
+
lock_sock(sk);
- sk->sk_ipv6only = true;
+ if (inet_sk(sk)->inet_num)
+ ret = -EINVAL;
+ else
+ sk->sk_ipv6only = true;
release_sock(sk);
- return 0;
+ return ret;
}
static inline void ip6_sock_set_recverr(struct sock *sk)
diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
index 7321ffe3a108..41d9fc6965f9 100644
--- a/include/net/ipv6_frag.h
+++ b/include/net/ipv6_frag.h
@@ -66,18 +66,22 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
{
struct net_device *dev = NULL;
struct sk_buff *head;
+ int refs = 1;
rcu_read_lock();
- /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
- if (READ_ONCE(fq->q.fqdir->dead))
- goto out_rcu_unlock;
spin_lock(&fq->q.lock);
if (fq->q.flags & INET_FRAG_COMPLETE)
goto out;
fq->q.flags |= INET_FRAG_DROP;
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, &refs);
+
+ /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
+ if (READ_ONCE(fq->q.fqdir->dead)) {
+ inet_frag_queue_flush(&fq->q, 0);
+ goto out;
+ }
dev = dev_get_by_index_rcu(net, fq->iif);
if (!dev)
@@ -109,7 +113,7 @@ out:
spin_unlock(&fq->q.lock);
out_rcu_unlock:
rcu_read_unlock();
- inet_frag_put(&fq->q);
+ inet_frag_putn(&fq->q, refs);
}
/* Check if the upper layer header is truncated in the first fragment. */
diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h
deleted file mode 100644
index 8a3465c8c2c5..000000000000
--- a/include/net/ipv6_stubs.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _IPV6_STUBS_H
-#define _IPV6_STUBS_H
-
-#include <linux/in6.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/neighbour.h>
-#include <net/sock.h>
-#include <net/ipv6.h>
-
-/* structs from net/ip6_fib.h */
-struct fib6_info;
-struct fib6_nh;
-struct fib6_config;
-struct fib6_result;
-
-/* This is ugly, ideally these symbols should be built
- * into the core kernel.
- */
-struct ipv6_stub {
- int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex,
- const struct in6_addr *addr);
- int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex,
- const struct in6_addr *addr);
- struct dst_entry *(*ipv6_dst_lookup_flow)(struct net *net,
- const struct sock *sk,
- struct flowi6 *fl6,
- const struct in6_addr *final_dst);
- int (*ipv6_route_input)(struct sk_buff *skb);
-
- struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
- int (*fib6_lookup)(struct net *net, int oif, struct flowi6 *fl6,
- struct fib6_result *res, int flags);
- int (*fib6_table_lookup)(struct net *net, struct fib6_table *table,
- int oif, struct flowi6 *fl6,
- struct fib6_result *res, int flags);
- void (*fib6_select_path)(const struct net *net, struct fib6_result *res,
- struct flowi6 *fl6, int oif, bool oif_match,
- const struct sk_buff *skb, int strict);
- u32 (*ip6_mtu_from_fib6)(const struct fib6_result *res,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr);
-
- int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh,
- struct fib6_config *cfg, gfp_t gfp_flags,
- struct netlink_ext_ack *extack);
- void (*fib6_nh_release)(struct fib6_nh *fib6_nh);
- void (*fib6_nh_release_dsts)(struct fib6_nh *fib6_nh);
- void (*fib6_update_sernum)(struct net *net, struct fib6_info *rt);
- int (*ip6_del_rt)(struct net *net, struct fib6_info *rt, bool skip_notify);
- void (*fib6_rt_update)(struct net *net, struct fib6_info *rt,
- struct nl_info *info);
-
- void (*udpv6_encap_enable)(void);
- void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
- const struct in6_addr *solicited_addr,
- bool router, bool solicited, bool override, bool inc_opt);
-#if IS_ENABLED(CONFIG_XFRM)
- void (*xfrm6_local_rxpmtu)(struct sk_buff *skb, u32 mtu);
- int (*xfrm6_udp_encap_rcv)(struct sock *sk, struct sk_buff *skb);
- struct sk_buff *(*xfrm6_gro_udp_encap_rcv)(struct sock *sk,
- struct list_head *head,
- struct sk_buff *skb);
- int (*xfrm6_rcv_encap)(struct sk_buff *skb, int nexthdr, __be32 spi,
- int encap_type);
-#endif
- struct neigh_table *nd_tbl;
-
- int (*ipv6_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb,
- int (*output)(struct net *, struct sock *, struct sk_buff *));
- struct net_device *(*ipv6_dev_find)(struct net *net, const struct in6_addr *addr,
- struct net_device *dev);
- int (*ip6_xmit)(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
- __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority);
-};
-extern const struct ipv6_stub *ipv6_stub __read_mostly;
-
-/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */
-struct ipv6_bpf_stub {
- int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len,
- u32 flags);
- struct sock *(*udp6_lib_lookup)(const struct net *net,
- const struct in6_addr *saddr, __be16 sport,
- const struct in6_addr *daddr, __be16 dport,
- int dif, int sdif, struct udp_table *tbl,
- struct sk_buff *skb);
- int (*ipv6_setsockopt)(struct sock *sk, int level, int optname,
- sockptr_t optval, unsigned int optlen);
- int (*ipv6_getsockopt)(struct sock *sk, int level, int optname,
- sockptr_t optval, sockptr_t optlen);
- int (*ipv6_dev_get_saddr)(struct net *net,
- const struct net_device *dst_dev,
- const struct in6_addr *daddr,
- unsigned int prefs,
- struct in6_addr *saddr);
-};
-extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
-
-#endif
diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h
index 9804fa5d9c67..18f511da9a09 100644
--- a/include/net/iucv/iucv.h
+++ b/include/net/iucv/iucv.h
@@ -195,35 +195,15 @@ struct iucv_handler {
struct list_head paths;
};
-/**
- * iucv_register:
- * @handler: address of iucv handler structure
- * @smp: != 0 indicates that the handler can deal with out of order messages
- *
- * Registers a driver with IUCV.
- *
- * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid
- * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus.
- */
int iucv_register(struct iucv_handler *handler, int smp);
+void iucv_unregister(struct iucv_handler *handler, int smp);
/**
- * iucv_unregister
- * @handler: address of iucv handler structure
- * @smp: != 0 indicates that the handler can deal with out of order messages
- *
- * Unregister driver from IUCV.
- */
-void iucv_unregister(struct iucv_handler *handle, int smp);
-
-/**
- * iucv_path_alloc
+ * iucv_path_alloc - Allocate a new path structure for use with iucv_connect.
* @msglim: initial message limit
* @flags: initial flags
* @gfp: kmalloc allocation flag
*
- * Allocate a new path structure for use with iucv_connect.
- *
* Returns: NULL if the memory allocation failed or a pointer to the
* path structure.
*/
@@ -231,7 +211,7 @@ static inline struct iucv_path *iucv_path_alloc(u16 msglim, u8 flags, gfp_t gfp)
{
struct iucv_path *path;
- path = kzalloc(sizeof(struct iucv_path), gfp);
+ path = kzalloc_obj(struct iucv_path, gfp);
if (path) {
path->msglim = msglim;
path->flags = flags;
@@ -240,229 +220,48 @@ static inline struct iucv_path *iucv_path_alloc(u16 msglim, u8 flags, gfp_t gfp)
}
/**
- * iucv_path_free
+ * iucv_path_free - Frees a path structure.
* @path: address of iucv path structure
- *
- * Frees a path structure.
*/
static inline void iucv_path_free(struct iucv_path *path)
{
kfree(path);
}
-/**
- * iucv_path_accept
- * @path: address of iucv path structure
- * @handler: address of iucv handler structure
- * @userdata: 16 bytes of data reflected to the communication partner
- * @private: private data passed to interrupt handlers for this path
- *
- * This function is issued after the user received a connection pending
- * external interrupt and now wishes to complete the IUCV communication path.
- *
- * Returns: the result of the CP IUCV call.
- */
int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler,
u8 *userdata, void *private);
-/**
- * iucv_path_connect
- * @path: address of iucv path structure
- * @handler: address of iucv handler structure
- * @userid: 8-byte user identification
- * @system: 8-byte target system identification
- * @userdata: 16 bytes of data reflected to the communication partner
- * @private: private data passed to interrupt handlers for this path
- *
- * This function establishes an IUCV path. Although the connect may complete
- * successfully, you are not able to use the path until you receive an IUCV
- * Connection Complete external interrupt.
- *
- * Returns: the result of the CP IUCV call.
- */
int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler,
u8 *userid, u8 *system, u8 *userdata,
void *private);
-/**
- * iucv_path_quiesce:
- * @path: address of iucv path structure
- * @userdata: 16 bytes of data reflected to the communication partner
- *
- * This function temporarily suspends incoming messages on an IUCV path.
- * You can later reactivate the path by invoking the iucv_resume function.
- *
- * Returns: the result from the CP IUCV call.
- */
int iucv_path_quiesce(struct iucv_path *path, u8 *userdata);
-/**
- * iucv_path_resume:
- * @path: address of iucv path structure
- * @userdata: 16 bytes of data reflected to the communication partner
- *
- * This function resumes incoming messages on an IUCV path that has
- * been stopped with iucv_path_quiesce.
- *
- * Returns: the result from the CP IUCV call.
- */
int iucv_path_resume(struct iucv_path *path, u8 *userdata);
-/**
- * iucv_path_sever
- * @path: address of iucv path structure
- * @userdata: 16 bytes of data reflected to the communication partner
- *
- * This function terminates an IUCV path.
- *
- * Returns: the result from the CP IUCV call.
- */
int iucv_path_sever(struct iucv_path *path, u8 *userdata);
-/**
- * iucv_message_purge
- * @path: address of iucv path structure
- * @msg: address of iucv msg structure
- * @srccls: source class of message
- *
- * Cancels a message you have sent.
- *
- * Returns: the result from the CP IUCV call.
- */
int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg,
u32 srccls);
-/**
- * iucv_message_receive
- * @path: address of iucv path structure
- * @msg: address of iucv msg structure
- * @flags: flags that affect how the message is received (IUCV_IPBUFLST)
- * @buffer: address of data buffer or address of struct iucv_array
- * @size: length of data buffer
- * @residual:
- *
- * This function receives messages that are being sent to you over
- * established paths. This function will deal with RMDATA messages
- * embedded in struct iucv_message as well.
- *
- * Locking: local_bh_enable/local_bh_disable
- *
- * Returns: the result from the CP IUCV call.
- */
int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
u8 flags, void *buffer, size_t size, size_t *residual);
-/**
- * __iucv_message_receive
- * @path: address of iucv path structure
- * @msg: address of iucv msg structure
- * @flags: flags that affect how the message is received (IUCV_IPBUFLST)
- * @buffer: address of data buffer or address of struct iucv_array
- * @size: length of data buffer
- * @residual:
- *
- * This function receives messages that are being sent to you over
- * established paths. This function will deal with RMDATA messages
- * embedded in struct iucv_message as well.
- *
- * Locking: no locking.
- *
- * Returns: the result from the CP IUCV call.
- */
int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
u8 flags, void *buffer, size_t size,
size_t *residual);
-/**
- * iucv_message_reject
- * @path: address of iucv path structure
- * @msg: address of iucv msg structure
- *
- * The reject function refuses a specified message. Between the time you
- * are notified of a message and the time that you complete the message,
- * the message may be rejected.
- *
- * Returns: the result from the CP IUCV call.
- */
int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg);
-/**
- * iucv_message_reply
- * @path: address of iucv path structure
- * @msg: address of iucv msg structure
- * @flags: how the reply is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
- * @reply: address of data buffer or address of struct iucv_array
- * @size: length of reply data buffer
- *
- * This function responds to the two-way messages that you receive. You
- * must identify completely the message to which you wish to reply. ie,
- * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into
- * the parameter list.
- *
- * Returns: the result from the CP IUCV call.
- */
int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg,
u8 flags, void *reply, size_t size);
-/**
- * iucv_message_send
- * @path: address of iucv path structure
- * @msg: address of iucv msg structure
- * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
- * @srccls: source class of message
- * @buffer: address of data buffer or address of struct iucv_array
- * @size: length of send buffer
- *
- * This function transmits data to another application. Data to be
- * transmitted is in a buffer and this is a one-way message and the
- * receiver will not reply to the message.
- *
- * Locking: local_bh_enable/local_bh_disable
- *
- * Returns: the result from the CP IUCV call.
- */
int iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
u8 flags, u32 srccls, void *buffer, size_t size);
-/**
- * __iucv_message_send
- * @path: address of iucv path structure
- * @msg: address of iucv msg structure
- * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
- * @srccls: source class of message
- * @buffer: address of data buffer or address of struct iucv_array
- * @size: length of send buffer
- *
- * This function transmits data to another application. Data to be
- * transmitted is in a buffer and this is a one-way message and the
- * receiver will not reply to the message.
- *
- * Locking: no locking.
- *
- * Returns: the result from the CP IUCV call.
- */
int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
u8 flags, u32 srccls, void *buffer, size_t size);
-/**
- * iucv_message_send2way
- * @path: address of iucv path structure
- * @msg: address of iucv msg structure
- * @flags: how the message is sent and the reply is received
- * (IUCV_IPRMDATA, IUCV_IPBUFLST, IUCV_IPPRTY, IUCV_ANSLST)
- * @srccls: source class of message
- * @buffer: address of data buffer or address of struct iucv_array
- * @size: length of send buffer
- * @ansbuf: address of answer buffer or address of struct iucv_array
- * @asize: size of reply buffer
- *
- * This function transmits data to another application. Data to be
- * transmitted is in a buffer. The receiver of the send is expected to
- * reply to the message and a buffer is provided into which IUCV moves
- * the reply to this message.
- *
- * Returns: the result from the CP IUCV call.
- */
int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg,
u8 flags, u32 srccls, void *buffer, size_t size,
void *answer, size_t asize, size_t *residual);
diff --git a/include/net/kcm.h b/include/net/kcm.h
index 441e993be634..d9c35e71ecea 100644
--- a/include/net/kcm.h
+++ b/include/net/kcm.h
@@ -71,7 +71,6 @@ struct kcm_sock {
struct list_head wait_psock_list;
struct sk_buff *seq_skb;
struct mutex tx_mutex;
- u32 tx_stopped : 1;
/* Don't use bit fields here, these are set under different locks */
bool tx_wait;
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index f7fe796e8429..710e98665eb3 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -59,6 +59,20 @@ int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type, struct net *net,
int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
struct fib_lookup_arg *arg);
+static inline
+bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex)
+{
+ return !(fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF) &&
+ fl->flowi_l3mdev == iifindex;
+}
+
+static inline
+bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex)
+{
+ return fl->flowi_flags & FLOWI_FLAG_L3MDEV_OIF &&
+ fl->flowi_l3mdev == oifindex;
+}
+
void l3mdev_update_flow(struct net *net, struct flowi *fl);
int l3mdev_master_ifindex_rcu(const struct net_device *dev);
@@ -193,18 +207,19 @@ struct sk_buff *l3mdev_ip6_rcv(struct sk_buff *skb)
static inline
struct sk_buff *l3mdev_l3_out(struct sock *sk, struct sk_buff *skb, u16 proto)
{
- struct net_device *dev = skb_dst(skb)->dev;
+ struct net_device *dev;
+ rcu_read_lock();
+ dev = skb_dst_dev_rcu(skb);
if (netif_is_l3_slave(dev)) {
struct net_device *master;
- rcu_read_lock();
master = netdev_master_upper_dev_get_rcu(dev);
if (master && master->l3mdev_ops->l3mdev_l3_out)
skb = master->l3mdev_ops->l3mdev_l3_out(master, sk,
skb, proto);
- rcu_read_unlock();
}
+ rcu_read_unlock();
return skb;
}
@@ -327,6 +342,19 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
{
return 1;
}
+
+static inline
+bool l3mdev_fib_rule_iif_match(const struct flowi *fl, int iifindex)
+{
+ return false;
+}
+
+static inline
+bool l3mdev_fib_rule_oif_match(const struct flowi *fl, int oifindex)
+{
+ return false;
+}
+
static inline
void l3mdev_update_flow(struct net *net, struct flowi *fl)
{
diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h
index 43574bd6612f..5d991404845e 100644
--- a/include/net/libeth/rx.h
+++ b/include/net/libeth/rx.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2024 Intel Corporation */
+/* Copyright (C) 2024-2025 Intel Corporation */
#ifndef __LIBETH_RX_H
#define __LIBETH_RX_H
@@ -13,8 +13,10 @@
/* Space reserved in front of each frame */
#define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN)
+#define LIBETH_XDP_HEADROOM (ALIGN(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \
+ NET_IP_ALIGN)
/* Maximum headroom for worst-case calculations */
-#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM
+#define LIBETH_MAX_HEADROOM LIBETH_XDP_HEADROOM
/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */
#define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN)
/* Maximum supported L2-L4 header length */
@@ -31,7 +33,7 @@
/**
* struct libeth_fqe - structure representing an Rx buffer (fill queue element)
- * @page: page holding the buffer
+ * @netmem: network memory reference holding the buffer
* @offset: offset from the page start (to the headroom)
* @truesize: total space occupied by the buffer (w/ headroom and tailroom)
*
@@ -40,7 +42,7 @@
* former, @offset is always 0 and @truesize is always ```PAGE_SIZE```.
*/
struct libeth_fqe {
- struct page *page;
+ netmem_ref netmem;
u32 offset;
u32 truesize;
} __aligned_largest;
@@ -66,6 +68,7 @@ enum libeth_fqe_type {
* @count: number of descriptors/buffers the queue has
* @type: type of the buffers this queue has
* @hsplit: flag whether header split is enabled
+ * @xdp: flag indicating whether XDP is enabled
* @buf_len: HW-writeable length per each buffer
* @nid: ID of the closest NUMA node with memory
*/
@@ -81,6 +84,7 @@ struct libeth_fq {
/* Cold fields */
enum libeth_fqe_type type:2;
bool hsplit:1;
+ bool xdp:1;
u32 buf_len;
int nid;
@@ -102,15 +106,16 @@ static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i)
struct libeth_fqe *buf = &fq->fqes[i];
buf->truesize = fq->truesize;
- buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize);
- if (unlikely(!buf->page))
+ buf->netmem = page_pool_dev_alloc_netmem(fq->pp, &buf->offset,
+ &buf->truesize);
+ if (unlikely(!buf->netmem))
return DMA_MAPPING_ERROR;
- return page_pool_get_dma_addr(buf->page) + buf->offset +
+ return page_pool_get_dma_addr_netmem(buf->netmem) + buf->offset +
fq->pp->p.offset;
}
-void libeth_rx_recycle_slow(struct page *page);
+void libeth_rx_recycle_slow(netmem_ref netmem);
/**
* libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA
@@ -126,18 +131,19 @@ void libeth_rx_recycle_slow(struct page *page);
static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe,
u32 len)
{
- struct page *page = fqe->page;
+ netmem_ref netmem = fqe->netmem;
/* Very rare, but possible case. The most common reason:
* the last fragment contained FCS only, which was then
* stripped by the HW.
*/
if (unlikely(!len)) {
- libeth_rx_recycle_slow(page);
+ libeth_rx_recycle_slow(netmem);
return false;
}
- page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len);
+ page_pool_dma_sync_netmem_for_cpu(netmem_get_pp(netmem), netmem,
+ fqe->offset, len);
return true;
}
@@ -198,6 +204,53 @@ struct libeth_rx_pt {
enum xdp_rss_hash_type hash_type:16;
};
+/**
+ * struct libeth_rx_csum - checksum offload bits decoded from the Rx descriptor
+ * @l3l4p: detectable L3 and L4 integrity check is processed by the hardware
+ * @ipe: IP checksum error
+ * @eipe: external (outermost) IP header (only for tunels)
+ * @eudpe: external (outermost) UDP checksum error (only for tunels)
+ * @ipv6exadd: IPv6 header with extension headers
+ * @l4e: L4 integrity error
+ * @pprs: set for packets that skip checksum calculation in the HW pre parser
+ * @nat: the packet is a UDP tunneled packet
+ * @raw_csum_valid: set if raw checksum is valid
+ * @pad: padding to naturally align raw_csum field
+ * @raw_csum: raw checksum
+ */
+struct libeth_rx_csum {
+ u32 l3l4p:1;
+ u32 ipe:1;
+ u32 eipe:1;
+ u32 eudpe:1;
+ u32 ipv6exadd:1;
+ u32 l4e:1;
+ u32 pprs:1;
+ u32 nat:1;
+
+ u32 raw_csum_valid:1;
+ u32 pad:7;
+ u32 raw_csum:16;
+};
+
+/**
+ * struct libeth_rqe_info - receive queue element info
+ * @len: packet length
+ * @ptype: packet type based on types programmed into the device
+ * @eop: whether it's the last fragment of the packet
+ * @rxe: MAC errors: CRC, Alignment, Oversize, Undersizes, Length error
+ * @vlan: C-VLAN or S-VLAN tag depending on the VLAN offload configuration
+ */
+struct libeth_rqe_info {
+ u32 len;
+
+ u32 ptype:14;
+ u32 eop:1;
+ u32 rxe:1;
+
+ u32 vlan:16;
+};
+
void libeth_rx_pt_gen_hash_type(struct libeth_rx_pt *pt);
/**
diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h
index 35614f9523f6..c3db5c6f1641 100644
--- a/include/net/libeth/tx.h
+++ b/include/net/libeth/tx.h
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2024 Intel Corporation */
+/* Copyright (C) 2024-2025 Intel Corporation */
#ifndef __LIBETH_TX_H
#define __LIBETH_TX_H
@@ -12,11 +12,17 @@
/**
* enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion
- * @LIBETH_SQE_EMPTY: unused/empty, no action required
+ * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX/XSk frame, no action required
* @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required
* @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree()
* @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA
* @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats
+ * @__LIBETH_SQE_XDP_START: separator between skb and XDP types
+ * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats
+ * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats
+ * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA
+ * @LIBETH_SQE_XSK_TX: &libeth_xdp_buff on XSk queue, xsk_buff_free(), stats
+ * @LIBETH_SQE_XSK_TX_FRAG: &libeth_xdp_buff frag on XSk queue, xsk_buff_free()
*/
enum libeth_sqe_type {
LIBETH_SQE_EMPTY = 0U,
@@ -24,6 +30,13 @@ enum libeth_sqe_type {
LIBETH_SQE_SLAB,
LIBETH_SQE_FRAG,
LIBETH_SQE_SKB,
+
+ __LIBETH_SQE_XDP_START,
+ LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START,
+ LIBETH_SQE_XDP_XMIT,
+ LIBETH_SQE_XDP_XMIT_FRAG,
+ LIBETH_SQE_XSK_TX,
+ LIBETH_SQE_XSK_TX_FRAG,
};
/**
@@ -32,6 +45,9 @@ enum libeth_sqe_type {
* @rs_idx: index of the last buffer from the batch this one was sent in
* @raw: slab buffer to free via kfree()
* @skb: &sk_buff to consume
+ * @sinfo: skb shared info of an XDP_TX frame
+ * @xdpf: XDP frame from ::ndo_xdp_xmit()
+ * @xsk: XSk Rx frame from XDP_TX action
* @dma: DMA address to unmap
* @len: length of the mapped region to unmap
* @nr_frags: number of frags in the frame this buffer belongs to
@@ -46,6 +62,9 @@ struct libeth_sqe {
union {
void *raw;
struct sk_buff *skb;
+ struct skb_shared_info *sinfo;
+ struct xdp_frame *xdpf;
+ struct libeth_xdp_buff *xsk;
};
DEFINE_DMA_UNMAP_ADDR(dma);
@@ -71,7 +90,10 @@ struct libeth_sqe {
/**
* struct libeth_cq_pp - completion queue poll params
* @dev: &device to perform DMA unmapping
+ * @bq: XDP frame bulk to combine return operations
* @ss: onstack NAPI stats to fill
+ * @xss: onstack XDPSQ NAPI stats to fill
+ * @xdp_tx: number of XDP-not-XSk frames processed
* @napi: whether it's called from the NAPI context
*
* libeth uses this structure to access objects needed for performing full
@@ -80,7 +102,13 @@ struct libeth_sqe {
*/
struct libeth_cq_pp {
struct device *dev;
- struct libeth_sq_napi_stats *ss;
+ struct xdp_frame_bulk *bq;
+
+ union {
+ struct libeth_sq_napi_stats *ss;
+ struct libeth_xdpsq_napi_stats *xss;
+ };
+ u32 xdp_tx;
bool napi;
};
@@ -126,4 +154,6 @@ static inline void libeth_tx_complete(struct libeth_sqe *sqe,
sqe->type = LIBETH_SQE_EMPTY;
}
+void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp);
+
#endif /* __LIBETH_TX_H */
diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h
index 603825e45133..cf1d78a9dc38 100644
--- a/include/net/libeth/types.h
+++ b/include/net/libeth/types.h
@@ -1,10 +1,32 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (C) 2024 Intel Corporation */
+/* Copyright (C) 2024-2025 Intel Corporation */
#ifndef __LIBETH_TYPES_H
#define __LIBETH_TYPES_H
-#include <linux/types.h>
+#include <linux/workqueue.h>
+
+/* Stats */
+
+/**
+ * struct libeth_rq_napi_stats - "hot" counters to update in Rx polling loop
+ * @packets: received frames counter
+ * @bytes: sum of bytes of received frames above
+ * @fragments: sum of fragments of received S/G frames
+ * @hsplit: number of frames the device performed the header split for
+ * @raw: alias to access all the fields as an array
+ */
+struct libeth_rq_napi_stats {
+ union {
+ struct {
+ u32 packets;
+ u32 bytes;
+ u32 fragments;
+ u32 hsplit;
+ };
+ DECLARE_FLEX_ARRAY(u32, raw);
+ };
+};
/**
* struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop
@@ -22,4 +44,84 @@ struct libeth_sq_napi_stats {
};
};
+/**
+ * struct libeth_xdpsq_napi_stats - "hot" counters to update in XDP Tx
+ * completion loop
+ * @packets: completed frames counter
+ * @bytes: sum of bytes of completed frames above
+ * @fragments: sum of fragments of completed S/G frames
+ * @raw: alias to access all the fields as an array
+ */
+struct libeth_xdpsq_napi_stats {
+ union {
+ struct {
+ u32 packets;
+ u32 bytes;
+ u32 fragments;
+ };
+ DECLARE_FLEX_ARRAY(u32, raw);
+ };
+};
+
+/* XDP */
+
+/*
+ * The following structures should be embedded into driver's queue structure
+ * and passed to the libeth_xdp helpers, never used directly.
+ */
+
+/* XDPSQ sharing */
+
+/**
+ * struct libeth_xdpsq_lock - locking primitive for sharing XDPSQs
+ * @lock: spinlock for locking the queue
+ * @share: whether this particular queue is shared
+ */
+struct libeth_xdpsq_lock {
+ spinlock_t lock;
+ bool share;
+};
+
+/* XDPSQ clean-up timers */
+
+/**
+ * struct libeth_xdpsq_timer - timer for cleaning up XDPSQs w/o interrupts
+ * @xdpsq: queue this timer belongs to
+ * @lock: lock for the queue
+ * @dwork: work performing cleanups
+ *
+ * XDPSQs not using interrupts but lazy cleaning, i.e. only when there's no
+ * space for sending the current queued frame/bulk, must fire up timers to
+ * make sure there are no stale buffers to free.
+ */
+struct libeth_xdpsq_timer {
+ void *xdpsq;
+ struct libeth_xdpsq_lock *lock;
+
+ struct delayed_work dwork;
+};
+
+/* Rx polling path */
+
+/**
+ * struct libeth_xdp_buff_stash - struct for stashing &xdp_buff onto a queue
+ * @data: pointer to the start of the frame, xdp_buff.data
+ * @headroom: frame headroom, xdp_buff.data - xdp_buff.data_hard_start
+ * @len: frame linear space length, xdp_buff.data_end - xdp_buff.data
+ * @frame_sz: truesize occupied by the frame, xdp_buff.frame_sz
+ * @flags: xdp_buff.flags
+ *
+ * &xdp_buff is 56 bytes long on x64, &libeth_xdp_buff is 64 bytes. This
+ * structure carries only necessary fields to save/restore a partially built
+ * frame on the queue structure to finish it during the next NAPI poll.
+ */
+struct libeth_xdp_buff_stash {
+ void *data;
+ u16 headroom;
+ u16 len;
+
+ u32 frame_sz:24;
+ u32 flags:8;
+} __aligned_largest;
+
#endif /* __LIBETH_TYPES_H */
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
new file mode 100644
index 000000000000..898723ab62e8
--- /dev/null
+++ b/include/net/libeth/xdp.h
@@ -0,0 +1,1870 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef __LIBETH_XDP_H
+#define __LIBETH_XDP_H
+
+#include <linux/bpf_trace.h>
+#include <linux/unroll.h>
+
+#include <net/libeth/rx.h>
+#include <net/libeth/tx.h>
+#include <net/xsk_buff_pool.h>
+
+/*
+ * Defined as bits to be able to use them as a mask on Rx.
+ * Also used as internal return values on Tx.
+ */
+enum {
+ LIBETH_XDP_PASS = 0U,
+ LIBETH_XDP_DROP = BIT(0),
+ LIBETH_XDP_ABORTED = BIT(1),
+ LIBETH_XDP_TX = BIT(2),
+ LIBETH_XDP_REDIRECT = BIT(3),
+};
+
+/*
+ * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to,
+ * pick maximum pointer-compatible alignment.
+ */
+#define __LIBETH_XDP_BUFF_ALIGN \
+ (IS_ALIGNED(sizeof(struct xdp_buff_xsk), 16) ? 16 : \
+ IS_ALIGNED(sizeof(struct xdp_buff_xsk), 8) ? 8 : \
+ sizeof(long))
+
+/**
+ * struct libeth_xdp_buff - libeth extension over &xdp_buff
+ * @base: main &xdp_buff
+ * @data: shortcut for @base.data
+ * @desc: RQ descriptor containing metadata for this buffer
+ * @priv: driver-private scratchspace
+ *
+ * The main reason for this is to have a pointer to the descriptor to be able
+ * to quickly get frame metadata from xdpmo and driver buff-to-xdp callbacks
+ * (as well as bigger alignment).
+ * Pointer/layout-compatible with &xdp_buff and &xdp_buff_xsk.
+ */
+struct libeth_xdp_buff {
+ union {
+ struct xdp_buff base;
+ void *data;
+ };
+
+ const void *desc;
+ unsigned long priv[]
+ __aligned(__LIBETH_XDP_BUFF_ALIGN);
+} __aligned(__LIBETH_XDP_BUFF_ALIGN);
+static_assert(offsetof(struct libeth_xdp_buff, data) ==
+ offsetof(struct xdp_buff_xsk, xdp.data));
+static_assert(offsetof(struct libeth_xdp_buff, desc) ==
+ offsetof(struct xdp_buff_xsk, cb));
+static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk),
+ __alignof(struct libeth_xdp_buff)));
+
+/**
+ * __LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack
+ * @name: name of the variable to declare
+ * @...: sizeof() of the driver-private data
+ */
+#define __LIBETH_XDP_ONSTACK_BUFF(name, ...) \
+ ___LIBETH_XDP_ONSTACK_BUFF(name, ##__VA_ARGS__)
+/**
+ * LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack
+ * @name: name of the variable to declare
+ * @...: type or variable name of the driver-private data
+ */
+#define LIBETH_XDP_ONSTACK_BUFF(name, ...) \
+ __LIBETH_XDP_ONSTACK_BUFF(name, __libeth_xdp_priv_sz(__VA_ARGS__))
+
+#define ___LIBETH_XDP_ONSTACK_BUFF(name, ...) \
+ __DEFINE_FLEX(struct libeth_xdp_buff, name, priv, \
+ LIBETH_XDP_PRIV_SZ(__VA_ARGS__ + 0), \
+ __uninitialized); \
+ LIBETH_XDP_ASSERT_PRIV_SZ(__VA_ARGS__ + 0)
+
+#define __libeth_xdp_priv_sz(...) \
+ CONCATENATE(__libeth_xdp_psz, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__)
+
+#define __libeth_xdp_psz0(...)
+#define __libeth_xdp_psz1(...) sizeof(__VA_ARGS__)
+
+#define LIBETH_XDP_PRIV_SZ(sz) \
+ (ALIGN(sz, __alignof(struct libeth_xdp_buff)) / sizeof(long))
+
+/* Performs XSK_CHECK_PRIV_TYPE() */
+#define LIBETH_XDP_ASSERT_PRIV_SZ(sz) \
+ static_assert(offsetofend(struct xdp_buff_xsk, cb) >= \
+ struct_size_t(struct libeth_xdp_buff, priv, \
+ LIBETH_XDP_PRIV_SZ(sz)))
+
+/* XDPSQ sharing */
+
+DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share);
+
+/**
+ * libeth_xdpsq_num - calculate optimal number of XDPSQs for this device + sys
+ * @rxq: current number of active Rx queues
+ * @txq: current number of active Tx queues
+ * @max: maximum number of Tx queues
+ *
+ * Each RQ must have its own XDPSQ for XSk pairs, each CPU must have own XDPSQ
+ * for lockless sending (``XDP_TX``, .ndo_xdp_xmit()). Cap the maximum of these
+ * two with the number of SQs the device can have (minus used ones).
+ *
+ * Return: number of XDP Tx queues the device needs to use.
+ */
+static inline u32 libeth_xdpsq_num(u32 rxq, u32 txq, u32 max)
+{
+ return min(max(nr_cpu_ids, rxq), max - txq);
+}
+
+/**
+ * libeth_xdpsq_shared - whether XDPSQs can be shared between several CPUs
+ * @num: number of active XDPSQs
+ *
+ * Return: true if there's no 1:1 XDPSQ/CPU association, false otherwise.
+ */
+static inline bool libeth_xdpsq_shared(u32 num)
+{
+ return num < nr_cpu_ids;
+}
+
+/**
+ * libeth_xdpsq_id - get XDPSQ index corresponding to this CPU
+ * @num: number of active XDPSQs
+ *
+ * Helper for libeth_xdp routines, do not use in drivers directly.
+ *
+ * Return: XDPSQ index needs to be used on this CPU.
+ */
+static inline u32 libeth_xdpsq_id(u32 num)
+{
+ u32 ret = raw_smp_processor_id();
+
+ if (static_branch_unlikely(&libeth_xdpsq_share) &&
+ libeth_xdpsq_shared(num))
+ ret %= num;
+
+ return ret;
+}
+
+void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock,
+ const struct net_device *dev);
+void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock,
+ const struct net_device *dev);
+
+/**
+ * libeth_xdpsq_get - initialize &libeth_xdpsq_lock
+ * @lock: lock to initialize
+ * @dev: netdev which this lock belongs to
+ * @share: whether XDPSQs can be shared
+ *
+ * Tracks the current XDPSQ association and enables the static lock
+ * if needed.
+ */
+static inline void libeth_xdpsq_get(struct libeth_xdpsq_lock *lock,
+ const struct net_device *dev,
+ bool share)
+{
+ if (unlikely(share))
+ __libeth_xdpsq_get(lock, dev);
+}
+
+/**
+ * libeth_xdpsq_put - deinitialize &libeth_xdpsq_lock
+ * @lock: lock to deinitialize
+ * @dev: netdev which this lock belongs to
+ *
+ * Tracks the current XDPSQ association and disables the static lock
+ * if needed.
+ */
+static inline void libeth_xdpsq_put(struct libeth_xdpsq_lock *lock,
+ const struct net_device *dev)
+{
+ if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share)
+ __libeth_xdpsq_put(lock, dev);
+}
+
+void __libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock);
+void __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock);
+
+/**
+ * libeth_xdpsq_lock - grab &libeth_xdpsq_lock if needed
+ * @lock: lock to take
+ *
+ * Touches the underlying spinlock only if the static key is enabled
+ * and the queue itself is marked as shareable.
+ */
+static inline void libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock)
+{
+ if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share)
+ __libeth_xdpsq_lock(lock);
+}
+
+/**
+ * libeth_xdpsq_unlock - free &libeth_xdpsq_lock if needed
+ * @lock: lock to free
+ *
+ * Touches the underlying spinlock only if the static key is enabled
+ * and the queue itself is marked as shareable.
+ */
+static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock)
+{
+ if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share)
+ __libeth_xdpsq_unlock(lock);
+}
+
+/* XDPSQ clean-up timers */
+
+void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq,
+ struct libeth_xdpsq_lock *lock,
+ void (*poll)(struct work_struct *work));
+
+/**
+ * libeth_xdpsq_deinit_timer - deinitialize &libeth_xdpsq_timer
+ * @timer: timer to deinitialize
+ *
+ * Flush and disable the underlying workqueue.
+ */
+static inline void libeth_xdpsq_deinit_timer(struct libeth_xdpsq_timer *timer)
+{
+ cancel_delayed_work_sync(&timer->dwork);
+}
+
+/**
+ * libeth_xdpsq_queue_timer - run &libeth_xdpsq_timer
+ * @timer: timer to queue
+ *
+ * Should be called after the queue was filled and the transmission was run
+ * to complete the pending buffers if no further sending will be done in a
+ * second (-> lazy cleaning won't happen).
+ * If the timer was already run, it will be requeued back to one second
+ * timeout again.
+ */
+static inline void libeth_xdpsq_queue_timer(struct libeth_xdpsq_timer *timer)
+{
+ mod_delayed_work_on(raw_smp_processor_id(), system_bh_highpri_wq,
+ &timer->dwork, HZ);
+}
+
+/**
+ * libeth_xdpsq_run_timer - wrapper to run a queue clean-up on a timer event
+ * @work: workqueue belonging to the corresponding timer
+ * @poll: driver-specific completion queue poll function
+ *
+ * Run the polling function on the locked queue and requeue the timer if
+ * there's more work to do.
+ * Designed to be used via LIBETH_XDP_DEFINE_TIMER() below.
+ */
+static __always_inline void
+libeth_xdpsq_run_timer(struct work_struct *work,
+ u32 (*poll)(void *xdpsq, u32 budget))
+{
+ struct libeth_xdpsq_timer *timer = container_of(work, typeof(*timer),
+ dwork.work);
+
+ libeth_xdpsq_lock(timer->lock);
+
+ if (poll(timer->xdpsq, U32_MAX))
+ libeth_xdpsq_queue_timer(timer);
+
+ libeth_xdpsq_unlock(timer->lock);
+}
+
+/* Common Tx bits */
+
+/**
+ * enum - libeth_xdp internal Tx flags
+ * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue
+ * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled
+ * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent
+ * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit()
+ * @LIBETH_XDP_TX_XSK: whether the function is called for ``XDP_TX`` for XSk
+ */
+enum {
+ LIBETH_XDP_TX_BULK = DEV_MAP_BULK_SIZE,
+ LIBETH_XDP_TX_BATCH = 8,
+
+ LIBETH_XDP_TX_DROP = BIT(0),
+ LIBETH_XDP_TX_NDO = BIT(1),
+ LIBETH_XDP_TX_XSK = BIT(2),
+};
+
+/**
+ * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags
+ * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length
+ * @LIBETH_XDP_TX_CSUM: for XSk xmit, enable checksum offload
+ * @LIBETH_XDP_TX_XSKMD: for XSk xmit, mask of the metadata bits
+ * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame
+ * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame
+ * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags
+ * @LIBETH_XDP_TX_FLAGS: only for ``XDP_TX``, [31:16] of ::len_fl is flags
+ */
+enum {
+ LIBETH_XDP_TX_LEN = GENMASK(15, 0),
+
+ LIBETH_XDP_TX_CSUM = XDP_TXMD_FLAGS_CHECKSUM,
+ LIBETH_XDP_TX_XSKMD = LIBETH_XDP_TX_LEN,
+
+ LIBETH_XDP_TX_FIRST = BIT(16),
+ LIBETH_XDP_TX_LAST = BIT(17),
+ LIBETH_XDP_TX_MULTI = BIT(18),
+
+ LIBETH_XDP_TX_FLAGS = GENMASK(31, 16),
+};
+
+/**
+ * struct libeth_xdp_tx_frame - represents one XDP Tx element
+ * @data: frame start pointer for ``XDP_TX``
+ * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed
+ * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info
+ * @frag: one (non-head) frag for ``XDP_TX``
+ * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit()
+ * @dma: DMA address of the non-head frag for .ndo_xdp_xmit()
+ * @xsk: ``XDP_TX`` for XSk, XDP buffer for any frag
+ * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit()
+ * @flags: Tx flags for the above
+ * @opts: combined @len + @flags for the above for speed
+ * @desc: XSk xmit descriptor for direct casting
+ */
+struct libeth_xdp_tx_frame {
+ union {
+ /* ``XDP_TX`` */
+ struct {
+ void *data;
+ u32 len_fl;
+ u32 soff;
+ };
+
+ /* ``XDP_TX`` frag */
+ skb_frag_t frag;
+
+ /* .ndo_xdp_xmit(), XSk ``XDP_TX`` */
+ struct {
+ union {
+ struct xdp_frame *xdpf;
+ dma_addr_t dma;
+
+ struct libeth_xdp_buff *xsk;
+ };
+ union {
+ struct {
+ u32 len;
+ u32 flags;
+ };
+ aligned_u64 opts;
+ };
+ };
+
+ /* XSk xmit */
+ struct xdp_desc desc;
+ };
+} __aligned(sizeof(struct xdp_desc));
+static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) ==
+ offsetof(struct libeth_xdp_tx_frame, len_fl));
+static_assert(sizeof(struct libeth_xdp_tx_frame) == sizeof(struct xdp_desc));
+
+/**
+ * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending
+ * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit()
+ * @dev: &net_device which the frames are transmitted on
+ * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure
+ * @act_mask: Rx only, mask of all the XDP prog verdicts for that NAPI session
+ * @count: current number of frames in @bulk
+ * @bulk: array of queued frames for bulk Tx
+ *
+ * All XDP Tx operations except XSk xmit queue each frame to the bulk first
+ * and flush it when @count reaches the array end. Bulk is always placed on
+ * the stack for performance. One bulk element contains all the data necessary
+ * for sending a frame and then freeing it on completion.
+ * For XSk xmit, Tx descriptor array from &xsk_buff_pool is casted directly
+ * to &libeth_xdp_tx_frame as they are compatible and the bulk structure is
+ * not used.
+ */
+struct libeth_xdp_tx_bulk {
+ const struct bpf_prog *prog;
+ struct net_device *dev;
+ void *xdpsq;
+
+ u32 act_mask;
+ u32 count;
+ struct libeth_xdp_tx_frame bulk[LIBETH_XDP_TX_BULK];
+} __aligned(sizeof(struct libeth_xdp_tx_frame));
+
+/**
+ * LIBETH_XDP_ONSTACK_BULK - declare &libeth_xdp_tx_bulk on the stack
+ * @bq: name of the variable to declare
+ *
+ * Helper to declare a bulk on the stack with a compiler hint that it should
+ * not be initialized automatically (with `CONFIG_INIT_STACK_ALL_*`) for
+ * performance reasons.
+ */
+#define LIBETH_XDP_ONSTACK_BULK(bq) \
+ struct libeth_xdp_tx_bulk bq __uninitialized
+
+/**
+ * struct libeth_xdpsq - abstraction for an XDPSQ
+ * @pool: XSk buffer pool for XSk ``XDP_TX`` and xmit
+ * @sqes: array of Tx buffers from the actual queue struct
+ * @descs: opaque pointer to the HW descriptor array
+ * @ntu: pointer to the next free descriptor index
+ * @count: number of descriptors on that queue
+ * @pending: pointer to the number of sent-not-completed descs on that queue
+ * @xdp_tx: pointer to the above, but only for non-XSk-xmit frames
+ * @lock: corresponding XDPSQ lock
+ *
+ * Abstraction for driver-independent implementation of Tx. Placed on the stack
+ * and filled by the driver before the transmission, so that the generic
+ * functions can access and modify driver-specific resources.
+ */
+struct libeth_xdpsq {
+ struct xsk_buff_pool *pool;
+ struct libeth_sqe *sqes;
+ void *descs;
+
+ u32 *ntu;
+ u32 count;
+
+ u32 *pending;
+ u32 *xdp_tx;
+ struct libeth_xdpsq_lock *lock;
+};
+
+/**
+ * struct libeth_xdp_tx_desc - abstraction for an XDP Tx descriptor
+ * @addr: DMA address of the frame
+ * @len: length of the frame
+ * @flags: XDP Tx flags
+ * @opts: combined @len + @flags for speed
+ *
+ * Filled by the generic functions and then passed to driver-specific functions
+ * to fill a HW Tx descriptor, always placed on the [function] stack.
+ */
+struct libeth_xdp_tx_desc {
+ dma_addr_t addr;
+ union {
+ struct {
+ u32 len;
+ u32 flags;
+ };
+ aligned_u64 opts;
+ };
+} __aligned_largest;
+
+/**
+ * libeth_xdp_ptr_to_priv - convert pointer to a libeth_xdp u64 priv
+ * @ptr: pointer to convert
+ *
+ * The main sending function passes private data as the largest scalar, u64.
+ * Use this helper when you want to pass a pointer there.
+ */
+#define libeth_xdp_ptr_to_priv(ptr) ({ \
+ typecheck_pointer(ptr); \
+ ((u64)(uintptr_t)(ptr)); \
+})
+/**
+ * libeth_xdp_priv_to_ptr - convert libeth_xdp u64 priv to a pointer
+ * @priv: private data to convert
+ *
+ * The main sending function passes private data as the largest scalar, u64.
+ * Use this helper when your callback takes this u64 and you want to convert
+ * it back to a pointer.
+ */
+#define libeth_xdp_priv_to_ptr(priv) ({ \
+ static_assert(__same_type(priv, u64)); \
+ ((const void *)(uintptr_t)(priv)); \
+})
+
+/*
+ * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len
+ * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead.
+ */
+#ifdef __LITTLE_ENDIAN
+#define __LIBETH_WORD_ACCESS 1
+#endif
+#ifdef __LIBETH_WORD_ACCESS
+#define __libeth_xdp_tx_len(flen, ...) \
+ .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0)))
+#else
+#define __libeth_xdp_tx_len(flen, ...) \
+ .len = (flen), .flags = (__VA_ARGS__ + 0)
+#endif
+
+/**
+ * libeth_xdp_tx_xmit_bulk - main XDP Tx function
+ * @bulk: array of frames to send
+ * @xdpsq: pointer to the driver-specific XDPSQ struct
+ * @n: number of frames to send
+ * @unroll: whether to unroll the queue filling loop for speed
+ * @priv: driver-specific private data
+ * @prep: callback for cleaning the queue and filling abstract &libeth_xdpsq
+ * @fill: internal callback for filling &libeth_sqe and &libeth_xdp_tx_desc
+ * @xmit: callback for filling a HW descriptor with the frame info
+ *
+ * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for
+ * all types of frames: ``XDP_TX``, .ndo_xdp_xmit(), XSk ``XDP_TX``, and XSk
+ * xmit.
+ * @prep must lock the queue as this function releases it at the end. @unroll
+ * greatly increases the object code size, but also greatly increases XSk xmit
+ * performance; for other types of frames, it's not enabled.
+ * The compilers inline all those onstack abstractions to direct data accesses.
+ *
+ * Return: number of frames actually placed on the queue, <= @n. The function
+ * can't fail, but can send less frames if there's no enough free descriptors
+ * available. The actual free space is returned by @prep from the driver.
+ */
+static __always_inline __nocfi_generic u32
+libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq,
+ u32 n, bool unroll, u64 priv,
+ u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
+ struct libeth_xdp_tx_desc
+ (*fill)(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv),
+ void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv))
+{
+ struct libeth_xdpsq sq __uninitialized;
+ u32 this, batched, off = 0;
+ u32 ntu, i = 0;
+
+ n = min(n, prep(xdpsq, &sq));
+ if (unlikely(!n))
+ goto unlock;
+
+ ntu = *sq.ntu;
+
+ this = sq.count - ntu;
+ if (likely(this > n))
+ this = n;
+
+again:
+ if (!unroll)
+ goto linear;
+
+ batched = ALIGN_DOWN(this, LIBETH_XDP_TX_BATCH);
+
+ for ( ; i < off + batched; i += LIBETH_XDP_TX_BATCH) {
+ u32 base = ntu + i - off;
+
+ unrolled_count(LIBETH_XDP_TX_BATCH)
+ for (u32 j = 0; j < LIBETH_XDP_TX_BATCH; j++)
+ xmit(fill(bulk[i + j], base + j, &sq, priv),
+ base + j, &sq, priv);
+ }
+
+ if (batched < this) {
+linear:
+ for ( ; i < off + this; i++)
+ xmit(fill(bulk[i], ntu + i - off, &sq, priv),
+ ntu + i - off, &sq, priv);
+ }
+
+ ntu += this;
+ if (likely(ntu < sq.count))
+ goto out;
+
+ ntu = 0;
+
+ if (i < n) {
+ this = n - i;
+ off = i;
+
+ goto again;
+ }
+
+out:
+ *sq.ntu = ntu;
+ *sq.pending += n;
+ if (sq.xdp_tx)
+ *sq.xdp_tx += n;
+
+unlock:
+ libeth_xdpsq_unlock(sq.lock);
+
+ return n;
+}
+
+/* ``XDP_TX`` bulking */
+
+void libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp);
+
+/**
+ * libeth_xdp_tx_queue_head - internal helper for queueing one ``XDP_TX`` head
+ * @bq: XDP Tx bulk to queue the head frag to
+ * @xdp: XDP buffer with the head to queue
+ *
+ * Return: false if it's the only frag of the frame, true if it's an S/G frame.
+ */
+static inline bool libeth_xdp_tx_queue_head(struct libeth_xdp_tx_bulk *bq,
+ const struct libeth_xdp_buff *xdp)
+{
+ const struct xdp_buff *base = &xdp->base;
+
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .data = xdp->data,
+ .len_fl = (base->data_end - xdp->data) | LIBETH_XDP_TX_FIRST,
+ .soff = xdp_data_hard_end(base) - xdp->data,
+ };
+
+ if (!xdp_buff_has_frags(base))
+ return false;
+
+ bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_MULTI;
+
+ return true;
+}
+
+/**
+ * libeth_xdp_tx_queue_frag - internal helper for queueing one ``XDP_TX`` frag
+ * @bq: XDP Tx bulk to queue the frag to
+ * @frag: frag to queue
+ */
+static inline void libeth_xdp_tx_queue_frag(struct libeth_xdp_tx_bulk *bq,
+ const skb_frag_t *frag)
+{
+ bq->bulk[bq->count++].frag = *frag;
+}
+
+/**
+ * libeth_xdp_tx_queue_bulk - internal helper for queueing one ``XDP_TX`` frame
+ * @bq: XDP Tx bulk to queue the frame to
+ * @xdp: XDP buffer to queue
+ * @flush_bulk: driver callback to flush the bulk to the HW queue
+ *
+ * Return: true on success, false on flush error.
+ */
+static __always_inline bool
+libeth_xdp_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags))
+{
+ const struct skb_shared_info *sinfo;
+ bool ret = true;
+ u32 nr_frags;
+
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, 0))) {
+ libeth_xdp_return_buff_slow(xdp);
+ return false;
+ }
+
+ if (!libeth_xdp_tx_queue_head(bq, xdp))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(&xdp->base);
+ nr_frags = sinfo->nr_frags;
+
+ for (u32 i = 0; i < nr_frags; i++) {
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, 0))) {
+ ret = false;
+ break;
+ }
+
+ libeth_xdp_tx_queue_frag(bq, &sinfo->frags[i]);
+ }
+
+out:
+ bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_LAST;
+ xdp->data = NULL;
+
+ return ret;
+}
+
+/**
+ * libeth_xdp_tx_fill_stats - fill &libeth_sqe with ``XDP_TX`` frame stats
+ * @sqe: SQ element to fill
+ * @desc: libeth_xdp Tx descriptor
+ * @sinfo: &skb_shared_info for this frame
+ *
+ * Internal helper for filling an SQE with the frame stats, do not use in
+ * drivers. Fills the number of frags and bytes for this frame.
+ */
+#define libeth_xdp_tx_fill_stats(sqe, desc, sinfo) \
+ __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, __UNIQUE_ID(sqe_), \
+ __UNIQUE_ID(desc_), __UNIQUE_ID(sinfo_))
+
+#define __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, ue, ud, us) do { \
+ const struct libeth_xdp_tx_desc *ud = (desc); \
+ const struct skb_shared_info *us; \
+ struct libeth_sqe *ue = (sqe); \
+ \
+ ue->nr_frags = 1; \
+ ue->bytes = ud->len; \
+ \
+ if (ud->flags & LIBETH_XDP_TX_MULTI) { \
+ us = (sinfo); \
+ ue->nr_frags += us->nr_frags; \
+ ue->bytes += us->xdp_frags_size; \
+ } \
+} while (0)
+
+/**
+ * libeth_xdp_tx_fill_buf - internal helper to fill one ``XDP_TX`` &libeth_sqe
+ * @frm: XDP Tx frame from the bulk
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: private data
+ *
+ * Return: XDP Tx descriptor with the synced DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+libeth_xdp_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct libeth_xdp_tx_desc desc;
+ struct skb_shared_info *sinfo;
+ skb_frag_t *frag = &frm.frag;
+ struct libeth_sqe *sqe;
+ netmem_ref netmem;
+
+ if (frm.len_fl & LIBETH_XDP_TX_FIRST) {
+ sinfo = frm.data + frm.soff;
+ skb_frag_fill_netmem_desc(frag, virt_to_netmem(frm.data),
+ offset_in_page(frm.data),
+ frm.len_fl);
+ } else {
+ sinfo = NULL;
+ }
+
+ netmem = skb_frag_netmem(frag);
+ desc = (typeof(desc)){
+ .addr = page_pool_get_dma_addr_netmem(netmem) +
+ skb_frag_off(frag),
+ .len = skb_frag_size(frag) & LIBETH_XDP_TX_LEN,
+ .flags = skb_frag_size(frag) & LIBETH_XDP_TX_FLAGS,
+ };
+
+ dma_sync_single_for_device(__netmem_get_pp(netmem)->p.dev, desc.addr,
+ desc.len, DMA_BIDIRECTIONAL);
+
+ if (!sinfo)
+ return desc;
+
+ sqe = &sq->sqes[i];
+ sqe->type = LIBETH_SQE_XDP_TX;
+ sqe->sinfo = sinfo;
+ libeth_xdp_tx_fill_stats(sqe, &desc, sinfo);
+
+ return desc;
+}
+
+void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent,
+ u32 flags);
+
+/**
+ * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk
+ * @bq: bulk to flush
+ * @flags: XDP TX flags (.ndo_xdp_xmit(), XSk etc.)
+ * @prep: driver-specific callback to prepare the queue for sending
+ * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Internal abstraction to create bulk flush functions for drivers. Used for
+ * everything except XSk xmit.
+ *
+ * Return: true if anything was sent, false otherwise.
+ */
+static __always_inline bool
+__libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags,
+ u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
+ struct libeth_xdp_tx_desc
+ (*fill)(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv),
+ void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i,
+ const struct libeth_xdpsq *sq,
+ u64 priv))
+{
+ u32 sent, drops;
+ int err = 0;
+
+ sent = libeth_xdp_tx_xmit_bulk(bq->bulk, bq->xdpsq,
+ min(bq->count, LIBETH_XDP_TX_BULK),
+ false, 0, prep, fill, xmit);
+ drops = bq->count - sent;
+
+ if (unlikely(drops)) {
+ libeth_xdp_tx_exception(bq, sent, flags);
+ err = -ENXIO;
+ } else {
+ bq->count = 0;
+ }
+
+ trace_xdp_bulk_tx(bq->dev, sent, drops, err);
+
+ return likely(sent);
+}
+
+/**
+ * libeth_xdp_tx_flush_bulk - wrapper to define flush of one ``XDP_TX`` bulk
+ * @bq: bulk to flush
+ * @flags: Tx flags, see above
+ * @prep: driver callback to prepare the queue
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Use via LIBETH_XDP_DEFINE_FLUSH_TX() to define an ``XDP_TX`` driver
+ * callback.
+ */
+#define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit) \
+ __libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf, \
+ xmit)
+
+/* .ndo_xdp_xmit() implementation */
+
+/**
+ * libeth_xdp_xmit_init_bulk - internal helper to initialize bulk for XDP xmit
+ * @bq: bulk to initialize
+ * @dev: target &net_device
+ * @xdpsqs: array of driver-specific XDPSQ structs
+ * @num: number of active XDPSQs (the above array length)
+ */
+#define libeth_xdp_xmit_init_bulk(bq, dev, xdpsqs, num) \
+ __libeth_xdp_xmit_init_bulk(bq, dev, (xdpsqs)[libeth_xdpsq_id(num)])
+
+static inline void __libeth_xdp_xmit_init_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct net_device *dev,
+ void *xdpsq)
+{
+ bq->dev = dev;
+ bq->xdpsq = xdpsq;
+ bq->count = 0;
+}
+
+/**
+ * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame
+ * @xf: pointer to the XDP frame
+ *
+ * There's no place in &libeth_xdp_tx_frame to store DMA address for an
+ * &xdp_frame head. The headroom is used then, the address is placed right
+ * after the frame struct, naturally aligned.
+ *
+ * Return: pointer to the DMA address to use.
+ */
+#define libeth_xdp_xmit_frame_dma(xf) \
+ _Generic((xf), \
+ const struct xdp_frame *: \
+ (const dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf), \
+ struct xdp_frame *: \
+ (dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf) \
+ )
+
+static inline void *__libeth_xdp_xmit_frame_dma(const struct xdp_frame *xdpf)
+{
+ void *addr = (void *)(xdpf + 1);
+
+ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
+ __alignof(*xdpf) < sizeof(dma_addr_t))
+ addr = PTR_ALIGN(addr, sizeof(dma_addr_t));
+
+ return addr;
+}
+
+/**
+ * libeth_xdp_xmit_queue_head - internal helper for queueing one XDP xmit head
+ * @bq: XDP Tx bulk to queue the head frag to
+ * @xdpf: XDP frame with the head to queue
+ * @dev: device to perform DMA mapping
+ *
+ * Return: ``LIBETH_XDP_DROP`` on DMA mapping error,
+ * ``LIBETH_XDP_PASS`` if it's the only frag in the frame,
+ * ``LIBETH_XDP_TX`` if it's an S/G frame.
+ */
+static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq,
+ struct xdp_frame *xdpf,
+ struct device *dev)
+{
+ dma_addr_t dma;
+
+ dma = dma_map_single(dev, xdpf->data, xdpf->len, DMA_TO_DEVICE);
+ if (dma_mapping_error(dev, dma))
+ return LIBETH_XDP_DROP;
+
+ *libeth_xdp_xmit_frame_dma(xdpf) = dma;
+
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .xdpf = xdpf,
+ __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST),
+ };
+
+ if (!xdp_frame_has_frags(xdpf))
+ return LIBETH_XDP_PASS;
+
+ bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI;
+
+ return LIBETH_XDP_TX;
+}
+
+/**
+ * libeth_xdp_xmit_queue_frag - internal helper for queueing one XDP xmit frag
+ * @bq: XDP Tx bulk to queue the frag to
+ * @frag: frag to queue
+ * @dev: device to perform DMA mapping
+ *
+ * Return: true on success, false on DMA mapping error.
+ */
+static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq,
+ const skb_frag_t *frag,
+ struct device *dev)
+{
+ dma_addr_t dma;
+
+ dma = skb_frag_dma_map(dev, frag);
+ if (dma_mapping_error(dev, dma))
+ return false;
+
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .dma = dma,
+ __libeth_xdp_tx_len(skb_frag_size(frag)),
+ };
+
+ return true;
+}
+
+/**
+ * libeth_xdp_xmit_queue_bulk - internal helper for queueing one XDP xmit frame
+ * @bq: XDP Tx bulk to queue the frame to
+ * @xdpf: XDP frame to queue
+ * @flush_bulk: driver callback to flush the bulk to the HW queue
+ *
+ * Return: ``LIBETH_XDP_TX`` on success,
+ * ``LIBETH_XDP_DROP`` if the frame should be dropped by the stack,
+ * ``LIBETH_XDP_ABORTED`` if the frame will be dropped by libeth_xdp.
+ */
+static __always_inline u32
+libeth_xdp_xmit_queue_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct xdp_frame *xdpf,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags))
+{
+ u32 head, nr_frags, i, ret = LIBETH_XDP_TX;
+ struct device *dev = bq->dev->dev.parent;
+ const struct skb_shared_info *sinfo;
+
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO)))
+ return LIBETH_XDP_DROP;
+
+ head = libeth_xdp_xmit_queue_head(bq, xdpf, dev);
+ if (head == LIBETH_XDP_PASS)
+ goto out;
+ else if (head == LIBETH_XDP_DROP)
+ return LIBETH_XDP_DROP;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ nr_frags = sinfo->nr_frags;
+
+ for (i = 0; i < nr_frags; i++) {
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO)))
+ break;
+
+ if (!libeth_xdp_xmit_queue_frag(bq, &sinfo->frags[i], dev))
+ break;
+ }
+
+ if (unlikely(i < nr_frags))
+ ret = LIBETH_XDP_ABORTED;
+
+out:
+ bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST;
+
+ return ret;
+}
+
+/**
+ * libeth_xdp_xmit_fill_buf - internal helper to fill one XDP xmit &libeth_sqe
+ * @frm: XDP Tx frame from the bulk
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: private data
+ *
+ * Return: XDP Tx descriptor with the mapped DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct libeth_xdp_tx_desc desc;
+ struct libeth_sqe *sqe;
+ struct xdp_frame *xdpf;
+
+ if (frm.flags & LIBETH_XDP_TX_FIRST) {
+ xdpf = frm.xdpf;
+ desc.addr = *libeth_xdp_xmit_frame_dma(xdpf);
+ } else {
+ xdpf = NULL;
+ desc.addr = frm.dma;
+ }
+ desc.opts = frm.opts;
+
+ sqe = &sq->sqes[i];
+ dma_unmap_addr_set(sqe, dma, desc.addr);
+ dma_unmap_len_set(sqe, len, desc.len);
+
+ if (!xdpf) {
+ sqe->type = LIBETH_SQE_XDP_XMIT_FRAG;
+ return desc;
+ }
+
+ sqe->type = LIBETH_SQE_XDP_XMIT;
+ sqe->xdpf = xdpf;
+ libeth_xdp_tx_fill_stats(sqe, &desc,
+ xdp_get_shared_info_from_frame(xdpf));
+
+ return desc;
+}
+
+/**
+ * libeth_xdp_xmit_flush_bulk - wrapper to define flush of one XDP xmit bulk
+ * @bq: bulk to flush
+ * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk()
+ * @prep: driver callback to prepare the queue
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Use via LIBETH_XDP_DEFINE_FLUSH_XMIT() to define an XDP xmit driver
+ * callback.
+ */
+#define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit) \
+ __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep, \
+ libeth_xdp_xmit_fill_buf, xmit)
+
+u32 libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq,
+ u32 count, const struct net_device *dev);
+
+/**
+ * __libeth_xdp_xmit_do_bulk - internal function to implement .ndo_xdp_xmit()
+ * @bq: XDP Tx bulk to queue frames to
+ * @frames: XDP frames passed by the stack
+ * @n: number of frames
+ * @flags: flags passed by the stack
+ * @flush_bulk: driver callback to flush an XDP xmit bulk
+ * @finalize: driver callback to finalize sending XDP Tx frames on the queue
+ *
+ * Perform common checks, map the frags and queue them to the bulk, then flush
+ * the bulk to the XDPSQ. If requested by the stack, finalize the queue.
+ *
+ * Return: number of frames send or -errno on error.
+ */
+static __always_inline int
+__libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct xdp_frame **frames, u32 n, u32 flags,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags),
+ void (*finalize)(void *xdpsq, bool sent, bool flush))
+{
+ u32 nxmit = 0;
+
+ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
+ return -EINVAL;
+
+ for (u32 i = 0; likely(i < n); i++) {
+ u32 ret;
+
+ ret = libeth_xdp_xmit_queue_bulk(bq, frames[i], flush_bulk);
+ if (unlikely(ret != LIBETH_XDP_TX)) {
+ nxmit += ret == LIBETH_XDP_ABORTED;
+ break;
+ }
+
+ nxmit++;
+ }
+
+ if (bq->count) {
+ flush_bulk(bq, LIBETH_XDP_TX_NDO);
+ if (unlikely(bq->count))
+ nxmit -= libeth_xdp_xmit_return_bulk(bq->bulk,
+ bq->count,
+ bq->dev);
+ }
+
+ finalize(bq->xdpsq, nxmit, flags & XDP_XMIT_FLUSH);
+
+ return nxmit;
+}
+
+/**
+ * libeth_xdp_xmit_do_bulk - implement full .ndo_xdp_xmit() in driver
+ * @dev: target &net_device
+ * @n: number of frames to send
+ * @fr: XDP frames to send
+ * @f: flags passed by the stack
+ * @xqs: array of XDPSQs driver structs
+ * @nqs: number of active XDPSQs, the above array length
+ * @fl: driver callback to flush an XDP xmit bulk
+ * @fin: driver cabback to finalize the queue
+ *
+ * If the driver has active XDPSQs, perform common checks and send the frames.
+ * Finalize the queue, if requested.
+ *
+ * Return: number of frames sent or -errno on error.
+ */
+#define libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin) \
+ _libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin, \
+ __UNIQUE_ID(bq_), __UNIQUE_ID(ret_), \
+ __UNIQUE_ID(nqs_))
+
+#define _libeth_xdp_xmit_do_bulk(d, n, fr, f, xqs, nqs, fl, fin, ub, ur, un) \
+({ \
+ u32 un = (nqs); \
+ int ur; \
+ \
+ if (likely(un)) { \
+ LIBETH_XDP_ONSTACK_BULK(ub); \
+ \
+ libeth_xdp_xmit_init_bulk(&ub, d, xqs, un); \
+ ur = __libeth_xdp_xmit_do_bulk(&ub, fr, n, f, fl, fin); \
+ } else { \
+ ur = -ENXIO; \
+ } \
+ \
+ ur; \
+})
+
+/* Rx polling path */
+
+/**
+ * libeth_xdp_tx_init_bulk - initialize an XDP Tx bulk for Rx NAPI poll
+ * @bq: bulk to initialize
+ * @prog: RCU pointer to the XDP program (can be %NULL)
+ * @dev: target &net_device
+ * @xdpsqs: array of driver XDPSQ structs
+ * @num: number of active XDPSQs, the above array length
+ *
+ * Should be called on an onstack XDP Tx bulk before the NAPI polling loop.
+ * Initializes all the needed fields to run libeth_xdp functions. If @num == 0,
+ * assumes XDP is not enabled.
+ * Do not use for XSk, it has its own optimized helper.
+ */
+#define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num) \
+ __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false, \
+ __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_))
+
+#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, xsk, ub, un) do { \
+ typeof(bq) ub = (bq); \
+ u32 un = (num); \
+ \
+ rcu_read_lock(); \
+ \
+ if (un || (xsk)) { \
+ ub->prog = rcu_dereference(pr); \
+ ub->dev = (d); \
+ ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)]; \
+ } else { \
+ ub->prog = NULL; \
+ } \
+ \
+ ub->act_mask = 0; \
+ ub->count = 0; \
+} while (0)
+
+void libeth_xdp_load_stash(struct libeth_xdp_buff *dst,
+ const struct libeth_xdp_buff_stash *src);
+void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst,
+ const struct libeth_xdp_buff *src);
+void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash);
+
+/**
+ * libeth_xdp_init_buff - initialize a &libeth_xdp_buff for Rx NAPI poll
+ * @dst: onstack buffer to initialize
+ * @src: XDP buffer stash placed on the queue
+ * @rxq: registered &xdp_rxq_info corresponding to this queue
+ *
+ * Should be called before the main NAPI polling loop. Loads the content of
+ * the previously saved stash or initializes the buffer from scratch.
+ * Do not use for XSk.
+ */
+static inline void
+libeth_xdp_init_buff(struct libeth_xdp_buff *dst,
+ const struct libeth_xdp_buff_stash *src,
+ struct xdp_rxq_info *rxq)
+{
+ if (likely(!src->data))
+ dst->data = NULL;
+ else
+ libeth_xdp_load_stash(dst, src);
+
+ dst->base.rxq = rxq;
+}
+
+/**
+ * libeth_xdp_save_buff - save a partially built buffer on a queue
+ * @dst: XDP buffer stash placed on the queue
+ * @src: onstack buffer to save
+ *
+ * Should be called after the main NAPI polling loop. If the loop exited before
+ * the buffer was finished, saves its content on the queue, so that it can be
+ * completed during the next poll. Otherwise, clears the stash.
+ */
+static inline void libeth_xdp_save_buff(struct libeth_xdp_buff_stash *dst,
+ const struct libeth_xdp_buff *src)
+{
+ if (likely(!src->data))
+ dst->data = NULL;
+ else
+ libeth_xdp_save_stash(dst, src);
+}
+
+/**
+ * libeth_xdp_return_stash - free an XDP buffer stash from a queue
+ * @stash: stash to free
+ *
+ * If the queue is about to be destroyed, but it still has an incompleted
+ * buffer stash, this helper should be called to free it.
+ */
+static inline void libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash)
+{
+ if (stash->data)
+ __libeth_xdp_return_stash(stash);
+}
+
+static inline void libeth_xdp_return_va(const void *data, bool napi)
+{
+ netmem_ref netmem = virt_to_netmem(data);
+
+ page_pool_put_full_netmem(__netmem_get_pp(netmem), netmem, napi);
+}
+
+static inline void libeth_xdp_return_frags(const struct skb_shared_info *sinfo,
+ bool napi)
+{
+ for (u32 i = 0; i < sinfo->nr_frags; i++) {
+ netmem_ref netmem = skb_frag_netmem(&sinfo->frags[i]);
+
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi);
+ }
+}
+
+/**
+ * libeth_xdp_return_buff - free/recycle &libeth_xdp_buff
+ * @xdp: buffer to free
+ *
+ * Hotpath helper to free &libeth_xdp_buff. Comparing to xdp_return_buff(),
+ * it's faster as it gets inlined and always assumes order-0 pages and safe
+ * direct recycling. Zeroes @xdp->data to avoid UAFs.
+ */
+#define libeth_xdp_return_buff(xdp) __libeth_xdp_return_buff(xdp, true)
+
+static inline void __libeth_xdp_return_buff(struct libeth_xdp_buff *xdp,
+ bool napi)
+{
+ if (!xdp_buff_has_frags(&xdp->base))
+ goto out;
+
+ libeth_xdp_return_frags(xdp_get_shared_info_from_buff(&xdp->base),
+ napi);
+
+out:
+ libeth_xdp_return_va(xdp->data, napi);
+ xdp->data = NULL;
+}
+
+bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp,
+ const struct libeth_fqe *fqe,
+ u32 len);
+
+/**
+ * libeth_xdp_prepare_buff - fill &libeth_xdp_buff with head FQE data
+ * @xdp: XDP buffer to attach the head to
+ * @fqe: FQE containing the head buffer
+ * @len: buffer len passed from HW
+ *
+ * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer
+ * head with the Rx buffer data: data pointer, length, headroom, and
+ * truesize/tailroom. Zeroes the flags.
+ */
+static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
+ const struct libeth_fqe *fqe,
+ u32 len)
+{
+ const struct page *page = __netmem_to_page(fqe->netmem);
+
+ xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset,
+ pp_page_to_nmdesc(page)->pp->p.offset, len, true);
+ xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq);
+}
+
+/**
+ * libeth_xdp_process_buff - attach Rx buffer to &libeth_xdp_buff
+ * @xdp: XDP buffer to attach the Rx buffer to
+ * @fqe: Rx buffer to process
+ * @len: received data length from the descriptor
+ *
+ * If the XDP buffer is empty, attaches the Rx buffer as head and initializes
+ * the required fields. Otherwise, attaches the buffer as a frag.
+ * Already performs DMA sync-for-CPU and frame start prefetch
+ * (for head buffers only).
+ *
+ * Return: true on success, false if the descriptor must be skipped (empty or
+ * no space for a new frag).
+ */
+static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp,
+ const struct libeth_fqe *fqe,
+ u32 len)
+{
+ if (!libeth_rx_sync_for_cpu(fqe, len))
+ return false;
+
+ if (xdp->data)
+ return libeth_xdp_buff_add_frag(xdp, fqe, len);
+
+ libeth_xdp_prepare_buff(xdp, fqe, len);
+
+ prefetch(xdp->data);
+
+ return true;
+}
+
+/**
+ * libeth_xdp_buff_stats_frags - update onstack RQ stats with XDP frags info
+ * @ss: onstack stats to update
+ * @xdp: buffer to account
+ *
+ * Internal helper used by __libeth_xdp_run_pass(), do not call directly.
+ * Adds buffer's frags count and total len to the onstack stats.
+ */
+static inline void
+libeth_xdp_buff_stats_frags(struct libeth_rq_napi_stats *ss,
+ const struct libeth_xdp_buff *xdp)
+{
+ const struct skb_shared_info *sinfo;
+
+ sinfo = xdp_get_shared_info_from_buff(&xdp->base);
+ ss->bytes += sinfo->xdp_frags_size;
+ ss->fragments += sinfo->nr_frags + 1;
+}
+
+u32 libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp,
+ enum xdp_action act, int ret);
+
+/**
+ * __libeth_xdp_run_prog - run XDP program on an XDP buffer
+ * @xdp: XDP buffer to run the prog on
+ * @bq: buffer bulk for ``XDP_TX`` queueing
+ *
+ * Internal inline abstraction to run XDP program. Handles ``XDP_DROP``
+ * and ``XDP_REDIRECT`` only, the rest is processed levels up.
+ * Reports an XDP prog exception on errors.
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+static __always_inline u32
+__libeth_xdp_run_prog(struct libeth_xdp_buff *xdp,
+ const struct libeth_xdp_tx_bulk *bq)
+{
+ enum xdp_action act;
+
+ act = bpf_prog_run_xdp(bq->prog, &xdp->base);
+ if (unlikely(act < XDP_DROP || act > XDP_REDIRECT))
+ goto out;
+
+ switch (act) {
+ case XDP_PASS:
+ return LIBETH_XDP_PASS;
+ case XDP_DROP:
+ libeth_xdp_return_buff(xdp);
+
+ return LIBETH_XDP_DROP;
+ case XDP_TX:
+ return LIBETH_XDP_TX;
+ case XDP_REDIRECT:
+ if (unlikely(xdp_do_redirect(bq->dev, &xdp->base, bq->prog)))
+ break;
+
+ xdp->data = NULL;
+
+ return LIBETH_XDP_REDIRECT;
+ default:
+ break;
+ }
+
+out:
+ return libeth_xdp_prog_exception(bq, xdp, act, 0);
+}
+
+/**
+ * __libeth_xdp_run_flush - run XDP program and handle ``XDP_TX`` verdict
+ * @xdp: XDP buffer to run the prog on
+ * @bq: buffer bulk for ``XDP_TX`` queueing
+ * @run: internal callback for running XDP program
+ * @queue: internal callback for queuing ``XDP_TX`` frame
+ * @flush_bulk: driver callback for flushing a bulk
+ *
+ * Internal inline abstraction to run XDP program and additionally handle
+ * ``XDP_TX`` verdict. Used by both XDP and XSk, hence @run and @queue.
+ * Do not use directly.
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+static __always_inline u32
+__libeth_xdp_run_flush(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq,
+ u32 (*run)(struct libeth_xdp_buff *xdp,
+ const struct libeth_xdp_tx_bulk *bq),
+ bool (*queue)(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp,
+ bool (*flush_bulk)
+ (struct libeth_xdp_tx_bulk *bq,
+ u32 flags)),
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags))
+{
+ u32 act;
+
+ act = run(xdp, bq);
+ if (act == LIBETH_XDP_TX && unlikely(!queue(bq, xdp, flush_bulk)))
+ act = LIBETH_XDP_DROP;
+
+ bq->act_mask |= act;
+
+ return act;
+}
+
+/**
+ * libeth_xdp_run_prog - run XDP program (non-XSk path) and handle all verdicts
+ * @xdp: XDP buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers
+ * @fl: driver ``XDP_TX`` bulk flush callback
+ *
+ * Run the attached XDP program and handle all possible verdicts. XSk has its
+ * own version.
+ * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}().
+ *
+ * Return: true if the buffer should be passed up the stack, false if the poll
+ * should go to the next buffer.
+ */
+#define libeth_xdp_run_prog(xdp, bq, fl) \
+ (__libeth_xdp_run_flush(xdp, bq, __libeth_xdp_run_prog, \
+ libeth_xdp_tx_queue_bulk, \
+ fl) == LIBETH_XDP_PASS)
+
+/**
+ * __libeth_xdp_run_pass - helper to run XDP program and handle the result
+ * @xdp: XDP buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @rs: onstack libeth RQ stats
+ * @md: metadata that should be filled to the XDP buffer
+ * @prep: callback for filling the metadata
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Inline abstraction that does the following (non-XSk path):
+ * 1) adds frame size and frag number (if needed) to the onstack stats;
+ * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff
+ * 3) runs XDP program if present;
+ * 4) handles all possible verdicts;
+ * 5) on ``XDP_PASS`, builds an skb from the buffer;
+ * 6) populates it with the descriptor metadata;
+ * 7) passes it up the stack.
+ *
+ * In most cases, number 2 means just writing the pointer to the HW descriptor
+ * to the XDP buffer. If so, please use LIBETH_XDP_DEFINE_RUN{,_PASS}()
+ * wrappers to build a driver function.
+ */
+static __always_inline void
+__libeth_xdp_run_pass(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi,
+ struct libeth_rq_napi_stats *rs, const void *md,
+ void (*prep)(struct libeth_xdp_buff *xdp,
+ const void *md),
+ bool (*run)(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq),
+ bool (*populate)(struct sk_buff *skb,
+ const struct libeth_xdp_buff *xdp,
+ struct libeth_rq_napi_stats *rs))
+{
+ struct sk_buff *skb;
+
+ rs->bytes += xdp->base.data_end - xdp->data;
+ rs->packets++;
+
+ if (xdp_buff_has_frags(&xdp->base))
+ libeth_xdp_buff_stats_frags(rs, xdp);
+
+ if (prep && (!__builtin_constant_p(!!md) || md))
+ prep(xdp, md);
+
+ if (!bq || !run || !bq->prog)
+ goto build;
+
+ if (!run(xdp, bq))
+ return;
+
+build:
+ skb = xdp_build_skb_from_buff(&xdp->base);
+ if (unlikely(!skb)) {
+ libeth_xdp_return_buff_slow(xdp);
+ return;
+ }
+
+ xdp->data = NULL;
+
+ if (unlikely(!populate(skb, xdp, rs))) {
+ napi_consume_skb(skb, true);
+ return;
+ }
+
+ napi_gro_receive(napi, skb);
+}
+
+static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp,
+ const void *desc)
+{
+ xdp->desc = desc;
+}
+
+/**
+ * libeth_xdp_run_pass - helper to run XDP program and handle the result
+ * @xdp: XDP buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @ss: onstack libeth RQ stats
+ * @desc: pointer to the HW descriptor for that frame
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Wrapper around the underscored version when "fill the descriptor metadata"
+ * means just writing the pointer to the HW descriptor as @xdp->desc.
+ */
+#define libeth_xdp_run_pass(xdp, bq, napi, ss, desc, run, populate) \
+ __libeth_xdp_run_pass(xdp, bq, napi, ss, desc, libeth_xdp_prep_desc, \
+ run, populate)
+
+/**
+ * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop (non-XSk)
+ * @bq: ``XDP_TX`` frame bulk
+ * @flush: driver callback to flush the bulk
+ * @finalize: driver callback to start sending the frames and run the timer
+ *
+ * Flush the bulk if there are frames left to send, kick the queue and flush
+ * the XDP maps.
+ */
+#define libeth_xdp_finalize_rx(bq, flush, finalize) \
+ __libeth_xdp_finalize_rx(bq, 0, flush, finalize)
+
+static __always_inline void
+__libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags),
+ void (*finalize)(void *xdpsq, bool sent, bool flush))
+{
+ if (bq->act_mask & LIBETH_XDP_TX) {
+ if (bq->count)
+ flush_bulk(bq, flags | LIBETH_XDP_TX_DROP);
+ finalize(bq->xdpsq, true, true);
+ }
+ if (bq->act_mask & LIBETH_XDP_REDIRECT)
+ xdp_do_flush();
+
+ rcu_read_unlock();
+}
+
+/*
+ * Helpers to reduce boilerplate code in drivers.
+ *
+ * Typical driver Rx flow would be (excl. bulk and buff init, frag attach):
+ *
+ * LIBETH_XDP_DEFINE_START();
+ * LIBETH_XDP_DEFINE_FLUSH_TX(static driver_xdp_flush_tx, driver_xdp_tx_prep,
+ * driver_xdp_xmit);
+ * LIBETH_XDP_DEFINE_RUN(static driver_xdp_run, driver_xdp_run_prog,
+ * driver_xdp_flush_tx, driver_populate_skb);
+ * LIBETH_XDP_DEFINE_FINALIZE(static driver_xdp_finalize_rx,
+ * driver_xdp_flush_tx, driver_xdp_finalize_sq);
+ * LIBETH_XDP_DEFINE_END();
+ *
+ * This will build a set of 4 static functions. The compiler is free to decide
+ * whether to inline them.
+ * Then, in the NAPI polling function:
+ *
+ * while (packets < budget) {
+ * // ...
+ * driver_xdp_run(xdp, &bq, napi, &rs, desc);
+ * }
+ * driver_xdp_finalize_rx(&bq);
+ */
+
+#define LIBETH_XDP_DEFINE_START() \
+ __diag_push(); \
+ __diag_ignore(GCC, 8, "-Wold-style-declaration", \
+ "Allow specifying \'static\' after the return type")
+
+/**
+ * LIBETH_XDP_DEFINE_TIMER - define a driver XDPSQ cleanup timer callback
+ * @name: name of the function to define
+ * @poll: Tx polling/completion function
+ */
+#define LIBETH_XDP_DEFINE_TIMER(name, poll) \
+void name(struct work_struct *work) \
+{ \
+ libeth_xdpsq_run_timer(work, poll); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_FLUSH_TX - define a driver ``XDP_TX`` bulk flush function
+ * @name: name of the function to define
+ * @prep: driver callback to clean an XDPSQ
+ * @xmit: driver callback to write a HW Tx descriptor
+ */
+#define LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit) \
+ __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xdp)
+
+#define __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, pfx) \
+bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \
+{ \
+ return libeth_##pfx##_tx_flush_bulk(bq, flags, prep, xmit); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_FLUSH_XMIT - define a driver XDP xmit bulk flush function
+ * @name: name of the function to define
+ * @prep: driver callback to clean an XDPSQ
+ * @xmit: driver callback to write a HW Tx descriptor
+ */
+#define LIBETH_XDP_DEFINE_FLUSH_XMIT(name, prep, xmit) \
+bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \
+{ \
+ return libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_RUN_PROG - define a driver XDP program run function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an ``XDP_TX`` bulk
+ */
+#define LIBETH_XDP_DEFINE_RUN_PROG(name, flush) \
+ bool __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xdp)
+
+#define __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, pfx) \
+name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq) \
+{ \
+ return libeth_##pfx##_run_prog(xdp, bq, flush); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_RUN_PASS - define a driver buffer process + pass function
+ * @name: name of the function to define
+ * @run: driver callback to run XDP program (above)
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate) \
+ void __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xdp)
+
+#define __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, pfx) \
+name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq, \
+ struct napi_struct *napi, struct libeth_rq_napi_stats *ss, \
+ const void *desc) \
+{ \
+ return libeth_##pfx##_run_pass(xdp, bq, napi, ss, desc, run, \
+ populate); \
+}
+
+/**
+ * LIBETH_XDP_DEFINE_RUN - define a driver buffer process, run + pass function
+ * @name: name of the function to define
+ * @run: name of the XDP prog run function to define
+ * @flush: driver callback to flush an ``XDP_TX`` bulk
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XDP_DEFINE_RUN(name, run, flush, populate) \
+ __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XDP)
+
+#define __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, pfx) \
+ LIBETH_##pfx##_DEFINE_RUN_PROG(static run, flush); \
+ LIBETH_##pfx##_DEFINE_RUN_PASS(name, run, populate)
+
+/**
+ * LIBETH_XDP_DEFINE_FINALIZE - define a driver Rx NAPI poll finalize function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an ``XDP_TX`` bulk
+ * @finalize: driver callback to finalize an XDPSQ and run the timer
+ */
+#define LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize) \
+ __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xdp)
+
+#define __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, pfx) \
+void name(struct libeth_xdp_tx_bulk *bq) \
+{ \
+ libeth_##pfx##_finalize_rx(bq, flush, finalize); \
+}
+
+#define LIBETH_XDP_DEFINE_END() __diag_pop()
+
+/* XMO */
+
+/**
+ * libeth_xdp_buff_to_rq - get RQ pointer from an XDP buffer pointer
+ * @xdp: &libeth_xdp_buff corresponding to the queue
+ * @type: typeof() of the driver Rx queue structure
+ * @member: name of &xdp_rxq_info inside @type
+ *
+ * Often times, pointer to the RQ is needed when reading/filling metadata from
+ * HW descriptors. The helper can be used to quickly jump from an XDP buffer
+ * to the queue corresponding to its &xdp_rxq_info without introducing
+ * additional fields (&libeth_xdp_buff is precisely 1 cacheline long on x64).
+ */
+#define libeth_xdp_buff_to_rq(xdp, type, member) \
+ container_of_const((xdp)->base.rxq, type, member)
+
+/**
+ * libeth_xdpmo_rx_hash - convert &libeth_rx_pt to an XDP RSS hash metadata
+ * @hash: pointer to the variable to write the hash to
+ * @rss_type: pointer to the variable to write the hash type to
+ * @val: hash value from the HW descriptor
+ * @pt: libeth parsed packet type
+ *
+ * Handle zeroed/non-available hash and convert libeth parsed packet type to
+ * the corresponding XDP RSS hash type. To be called at the end of
+ * xdp_metadata_ops idpf_xdpmo::xmo_rx_hash() implementation.
+ * Note that if the driver doesn't use a constant packet type lookup table but
+ * generates it at runtime, it must call libeth_rx_pt_gen_hash_type(pt) to
+ * generate XDP RSS hash type for each packet type.
+ *
+ * Return: 0 on success, -ENODATA when the hash is not available.
+ */
+static inline int libeth_xdpmo_rx_hash(u32 *hash,
+ enum xdp_rss_hash_type *rss_type,
+ u32 val, struct libeth_rx_pt pt)
+{
+ if (unlikely(!val))
+ return -ENODATA;
+
+ *hash = val;
+ *rss_type = pt.hash_type;
+
+ return 0;
+}
+
+/* Tx buffer completion */
+
+void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo,
+ struct xdp_frame_bulk *bq, bool frags);
+void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp);
+
+/**
+ * __libeth_xdp_complete_tx - complete sent XDPSQE
+ * @sqe: SQ element / Tx buffer to complete
+ * @cp: Tx polling/completion params
+ * @bulk: internal callback to bulk-free ``XDP_TX`` buffers
+ * @xsk: internal callback to free XSk ``XDP_TX`` buffers
+ *
+ * Use the non-underscored version in drivers instead. This one is shared
+ * internally with libeth_tx_complete_any().
+ * Complete an XDPSQE of any type of XDP frame. This includes DMA unmapping
+ * when needed, buffer freeing, stats update, and SQE invalidation.
+ */
+static __always_inline void
+__libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp,
+ typeof(libeth_xdp_return_buff_bulk) bulk,
+ typeof(libeth_xsk_buff_free_slow) xsk)
+{
+ enum libeth_sqe_type type = sqe->type;
+
+ switch (type) {
+ case LIBETH_SQE_EMPTY:
+ return;
+ case LIBETH_SQE_XDP_XMIT:
+ case LIBETH_SQE_XDP_XMIT_FRAG:
+ dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma),
+ dma_unmap_len(sqe, len), DMA_TO_DEVICE);
+ break;
+ default:
+ break;
+ }
+
+ switch (type) {
+ case LIBETH_SQE_XDP_TX:
+ bulk(sqe->sinfo, cp->bq, sqe->nr_frags != 1);
+ break;
+ case LIBETH_SQE_XDP_XMIT:
+ xdp_return_frame_bulk(sqe->xdpf, cp->bq);
+ break;
+ case LIBETH_SQE_XSK_TX:
+ case LIBETH_SQE_XSK_TX_FRAG:
+ xsk(sqe->xsk);
+ break;
+ default:
+ break;
+ }
+
+ switch (type) {
+ case LIBETH_SQE_XDP_TX:
+ case LIBETH_SQE_XDP_XMIT:
+ case LIBETH_SQE_XSK_TX:
+ cp->xdp_tx -= sqe->nr_frags;
+
+ cp->xss->packets++;
+ cp->xss->bytes += sqe->bytes;
+ break;
+ default:
+ break;
+ }
+
+ sqe->type = LIBETH_SQE_EMPTY;
+}
+
+static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe,
+ struct libeth_cq_pp *cp)
+{
+ __libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk,
+ libeth_xsk_buff_free_slow);
+}
+
+/* Misc */
+
+u32 libeth_xdp_queue_threshold(u32 count);
+
+void __libeth_xdp_set_features(struct net_device *dev,
+ const struct xdp_metadata_ops *xmo,
+ u32 zc_segs,
+ const struct xsk_tx_metadata_ops *tmo);
+void libeth_xdp_set_redirect(struct net_device *dev, bool enable);
+
+/**
+ * libeth_xdp_set_features - set XDP features for netdev
+ * @dev: &net_device to configure
+ * @...: optional params, see __libeth_xdp_set_features()
+ *
+ * Set all the features libeth_xdp supports, including .ndo_xdp_xmit(). That
+ * said, it should be used only when XDPSQs are always available regardless
+ * of whether an XDP prog is attached to @dev.
+ */
+#define libeth_xdp_set_features(dev, ...) \
+ CONCATENATE(__libeth_xdp_feat, \
+ COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__)
+
+#define __libeth_xdp_feat0(dev) \
+ __libeth_xdp_set_features(dev, NULL, 0, NULL)
+#define __libeth_xdp_feat1(dev, xmo) \
+ __libeth_xdp_set_features(dev, xmo, 0, NULL)
+#define __libeth_xdp_feat2(dev, xmo, zc_segs) \
+ __libeth_xdp_set_features(dev, xmo, zc_segs, NULL)
+#define __libeth_xdp_feat3(dev, xmo, zc_segs, tmo) \
+ __libeth_xdp_set_features(dev, xmo, zc_segs, tmo)
+
+/**
+ * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir
+ * @dev: target &net_device
+ * @...: optional params, see __libeth_xdp_set_features()
+ *
+ * Enable everything except the .ndo_xdp_xmit() feature, use when XDPSQs are
+ * not available right after netdev registration.
+ */
+#define libeth_xdp_set_features_noredir(dev, ...) \
+ __libeth_xdp_set_features_noredir(dev, __UNIQUE_ID(dev_), \
+ ##__VA_ARGS__)
+
+#define __libeth_xdp_set_features_noredir(dev, ud, ...) do { \
+ struct net_device *ud = (dev); \
+ \
+ libeth_xdp_set_features(ud, ##__VA_ARGS__); \
+ libeth_xdp_set_redirect(ud, false); \
+} while (0)
+
+#define libeth_xsktmo ((const void *)GOLDEN_RATIO_PRIME)
+
+#endif /* __LIBETH_XDP_H */
diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h
new file mode 100644
index 000000000000..82b5d21aae87
--- /dev/null
+++ b/include/net/libeth/xsk.h
@@ -0,0 +1,688 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2025 Intel Corporation */
+
+#ifndef __LIBETH_XSK_H
+#define __LIBETH_XSK_H
+
+#include <net/libeth/xdp.h>
+#include <net/xdp_sock_drv.h>
+
+/* ``XDP_TXMD_FLAGS_VALID`` is defined only under ``CONFIG_XDP_SOCKETS`` */
+#ifdef XDP_TXMD_FLAGS_VALID
+static_assert(XDP_TXMD_FLAGS_VALID <= LIBETH_XDP_TX_XSKMD);
+#endif
+
+/* ``XDP_TX`` bulking */
+
+/**
+ * libeth_xsk_tx_queue_head - internal helper for queueing XSk ``XDP_TX`` head
+ * @bq: XDP Tx bulk to queue the head frag to
+ * @xdp: XSk buffer with the head to queue
+ *
+ * Return: false if it's the only frag of the frame, true if it's an S/G frame.
+ */
+static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp)
+{
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .xsk = xdp,
+ __libeth_xdp_tx_len(xdp->base.data_end - xdp->data,
+ LIBETH_XDP_TX_FIRST),
+ };
+
+ if (likely(!xdp_buff_has_frags(&xdp->base)))
+ return false;
+
+ bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI;
+
+ return true;
+}
+
+/**
+ * libeth_xsk_tx_queue_frag - internal helper for queueing XSk ``XDP_TX`` frag
+ * @bq: XDP Tx bulk to queue the frag to
+ * @frag: XSk frag to queue
+ */
+static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *frag)
+{
+ bq->bulk[bq->count++] = (typeof(*bq->bulk)){
+ .xsk = frag,
+ __libeth_xdp_tx_len(frag->base.data_end - frag->data),
+ };
+}
+
+/**
+ * libeth_xsk_tx_queue_bulk - internal helper for queueing XSk ``XDP_TX`` frame
+ * @bq: XDP Tx bulk to queue the frame to
+ * @xdp: XSk buffer to queue
+ * @flush_bulk: driver callback to flush the bulk to the HW queue
+ *
+ * Return: true on success, false on flush error.
+ */
+static __always_inline bool
+libeth_xsk_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq,
+ struct libeth_xdp_buff *xdp,
+ bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq,
+ u32 flags))
+{
+ bool ret = true;
+
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) {
+ libeth_xsk_buff_free_slow(xdp);
+ return false;
+ }
+
+ if (!libeth_xsk_tx_queue_head(bq, xdp))
+ goto out;
+
+ for (const struct libeth_xdp_buff *head = xdp; ; ) {
+ xdp = container_of(xsk_buff_get_frag(&head->base),
+ typeof(*xdp), base);
+ if (!xdp)
+ break;
+
+ if (unlikely(bq->count == LIBETH_XDP_TX_BULK) &&
+ unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) {
+ ret = false;
+ break;
+ }
+
+ libeth_xsk_tx_queue_frag(bq, xdp);
+ }
+
+out:
+ bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST;
+
+ return ret;
+}
+
+/**
+ * libeth_xsk_tx_fill_buf - internal helper to fill XSk ``XDP_TX`` &libeth_sqe
+ * @frm: XDP Tx frame from the bulk
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: private data
+ *
+ * Return: XDP Tx descriptor with the synced DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct libeth_xdp_buff *xdp = frm.xsk;
+ struct libeth_xdp_tx_desc desc = {
+ .addr = xsk_buff_xdp_get_dma(&xdp->base),
+ .opts = frm.opts,
+ };
+ struct libeth_sqe *sqe;
+
+ xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len);
+
+ sqe = &sq->sqes[i];
+ sqe->xsk = xdp;
+
+ if (!(desc.flags & LIBETH_XDP_TX_FIRST)) {
+ sqe->type = LIBETH_SQE_XSK_TX_FRAG;
+ return desc;
+ }
+
+ sqe->type = LIBETH_SQE_XSK_TX;
+ libeth_xdp_tx_fill_stats(sqe, &desc,
+ xdp_get_shared_info_from_buff(&xdp->base));
+
+ return desc;
+}
+
+/**
+ * libeth_xsk_tx_flush_bulk - wrapper to define flush of XSk ``XDP_TX`` bulk
+ * @bq: bulk to flush
+ * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk()
+ * @prep: driver callback to prepare the queue
+ * @xmit: driver callback to fill a HW descriptor
+ *
+ * Use via LIBETH_XSK_DEFINE_FLUSH_TX() to define an XSk ``XDP_TX`` driver
+ * callback.
+ */
+#define libeth_xsk_tx_flush_bulk(bq, flags, prep, xmit) \
+ __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep, \
+ libeth_xsk_tx_fill_buf, xmit)
+
+/* XSk TMO */
+
+/**
+ * libeth_xsktmo_req_csum - XSk Tx metadata op to request checksum offload
+ * @csum_start: unused
+ * @csum_offset: unused
+ * @priv: &libeth_xdp_tx_desc from the filling helper
+ *
+ * Generic implementation of ::tmo_request_checksum. Works only when HW doesn't
+ * require filling checksum offsets and other parameters beside the checksum
+ * request bit.
+ * Consider using within @libeth_xsktmo unless the driver requires HW-specific
+ * callbacks.
+ */
+static inline void libeth_xsktmo_req_csum(u16 csum_start, u16 csum_offset,
+ void *priv)
+{
+ ((struct libeth_xdp_tx_desc *)priv)->flags |= LIBETH_XDP_TX_CSUM;
+}
+
+/* Only to inline the callbacks below, use @libeth_xsktmo in drivers instead */
+static const struct xsk_tx_metadata_ops __libeth_xsktmo = {
+ .tmo_request_checksum = libeth_xsktmo_req_csum,
+};
+
+/**
+ * __libeth_xsk_xmit_fill_buf_md - internal helper to prepare XSk xmit w/meta
+ * @xdesc: &xdp_desc from the XSk buffer pool
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: XSk Tx metadata ops
+ *
+ * Same as __libeth_xsk_xmit_fill_buf(), but requests metadata pointer and
+ * fills additional fields in &libeth_xdp_tx_desc to ask for metadata offload.
+ *
+ * Return: XDP Tx descriptor with the DMA, metadata request bits, and other
+ * info to pass to the driver callback.
+ */
+static __always_inline struct libeth_xdp_tx_desc
+__libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc,
+ const struct libeth_xdpsq *sq,
+ u64 priv)
+{
+ const struct xsk_tx_metadata_ops *tmo = libeth_xdp_priv_to_ptr(priv);
+ struct libeth_xdp_tx_desc desc;
+ struct xdp_desc_ctx ctx;
+
+ ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr);
+ desc = (typeof(desc)){
+ .addr = ctx.dma,
+ __libeth_xdp_tx_len(xdesc->len),
+ };
+
+ BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo));
+ tmo = tmo == libeth_xsktmo ? &__libeth_xsktmo : tmo;
+
+ xsk_tx_metadata_request(ctx.meta, tmo, &desc);
+
+ return desc;
+}
+
+/* XSk xmit implementation */
+
+/**
+ * __libeth_xsk_xmit_fill_buf - internal helper to prepare XSk xmit w/o meta
+ * @xdesc: &xdp_desc from the XSk buffer pool
+ * @sq: XDPSQ abstraction for the queue
+ *
+ * Return: XDP Tx descriptor with the DMA and other info to pass to
+ * the driver callback.
+ */
+static inline struct libeth_xdp_tx_desc
+__libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc,
+ const struct libeth_xdpsq *sq)
+{
+ return (struct libeth_xdp_tx_desc){
+ .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr),
+ __libeth_xdp_tx_len(xdesc->len),
+ };
+}
+
+/**
+ * libeth_xsk_xmit_fill_buf - internal helper to prepare an XSk xmit
+ * @frm: &xdp_desc from the XSk buffer pool
+ * @i: index on the HW queue
+ * @sq: XDPSQ abstraction for the queue
+ * @priv: XSk Tx metadata ops
+ *
+ * Depending on the metadata ops presence (determined at compile time), calls
+ * the quickest helper to build a libeth XDP Tx descriptor.
+ *
+ * Return: XDP Tx descriptor with the synced DMA, metadata request bits,
+ * and other info to pass to the driver callback.
+ */
+static __always_inline struct libeth_xdp_tx_desc
+libeth_xsk_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct libeth_xdp_tx_desc desc;
+
+ if (priv)
+ desc = __libeth_xsk_xmit_fill_buf_md(&frm.desc, sq, priv);
+ else
+ desc = __libeth_xsk_xmit_fill_buf(&frm.desc, sq);
+
+ desc.flags |= xsk_is_eop_desc(&frm.desc) ? LIBETH_XDP_TX_LAST : 0;
+
+ xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len);
+
+ return desc;
+}
+
+/**
+ * libeth_xsk_xmit_do_bulk - send XSk xmit frames
+ * @pool: XSk buffer pool containing the frames to send
+ * @xdpsq: opaque pointer to driver's XDPSQ struct
+ * @budget: maximum number of frames can be sent
+ * @tmo: optional XSk Tx metadata ops
+ * @prep: driver callback to build a &libeth_xdpsq
+ * @xmit: driver callback to put frames to a HW queue
+ * @finalize: driver callback to start a transmission
+ *
+ * Implements generic XSk xmit. Always turns on XSk Tx wakeup as it's assumed
+ * lazy cleaning is used and interrupts are disabled for the queue.
+ * HW descriptor filling is unrolled by ``LIBETH_XDP_TX_BATCH`` to optimize
+ * writes.
+ * Note that unlike other XDP Tx ops, the queue must be locked and cleaned
+ * prior to calling this function to already know available @budget.
+ * @prepare must only build a &libeth_xdpsq and return ``U32_MAX``.
+ *
+ * Return: false if @budget was exhausted, true otherwise.
+ */
+static __always_inline bool
+libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget,
+ const struct xsk_tx_metadata_ops *tmo,
+ u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq),
+ void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv),
+ void (*finalize)(void *xdpsq, bool sent, bool flush))
+{
+ const struct libeth_xdp_tx_frame *bulk;
+ bool wake;
+ u32 n;
+
+ wake = xsk_uses_need_wakeup(pool);
+ if (wake)
+ xsk_clear_tx_need_wakeup(pool);
+
+ n = xsk_tx_peek_release_desc_batch(pool, budget);
+ bulk = container_of(&pool->tx_descs[0], typeof(*bulk), desc);
+
+ libeth_xdp_tx_xmit_bulk(bulk, xdpsq, n, true,
+ libeth_xdp_ptr_to_priv(tmo), prep,
+ libeth_xsk_xmit_fill_buf, xmit);
+ finalize(xdpsq, n, true);
+
+ if (wake)
+ xsk_set_tx_need_wakeup(pool);
+
+ return n < budget;
+}
+
+/* Rx polling path */
+
+/**
+ * libeth_xsk_tx_init_bulk - initialize XDP Tx bulk for an XSk Rx NAPI poll
+ * @bq: bulk to initialize
+ * @prog: RCU pointer to the XDP program (never %NULL)
+ * @dev: target &net_device
+ * @xdpsqs: array of driver XDPSQ structs
+ * @num: number of active XDPSQs, the above array length
+ *
+ * Should be called on an onstack XDP Tx bulk before the XSk NAPI polling loop.
+ * Initializes all the needed fields to run libeth_xdp functions.
+ * Never checks if @prog is %NULL or @num == 0 as XDP must always be enabled
+ * when hitting this path.
+ */
+#define libeth_xsk_tx_init_bulk(bq, prog, dev, xdpsqs, num) \
+ __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, true, \
+ __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_))
+
+struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head,
+ struct libeth_xdp_buff *xdp);
+
+/**
+ * libeth_xsk_process_buff - attach XSk Rx buffer to &libeth_xdp_buff
+ * @head: head XSk buffer to attach the XSk buffer to (or %NULL)
+ * @xdp: XSk buffer to process
+ * @len: received data length from the descriptor
+ *
+ * If @head == %NULL, treats the XSk buffer as head and initializes
+ * the required fields. Otherwise, attaches the buffer as a frag.
+ * Already performs DMA sync-for-CPU and frame start prefetch
+ * (for head buffers only).
+ *
+ * Return: head XSk buffer on success or if the descriptor must be skipped
+ * (empty), %NULL if there is no space for a new frag.
+ */
+static inline struct libeth_xdp_buff *
+libeth_xsk_process_buff(struct libeth_xdp_buff *head,
+ struct libeth_xdp_buff *xdp, u32 len)
+{
+ if (unlikely(!len)) {
+ libeth_xsk_buff_free_slow(xdp);
+ return head;
+ }
+
+ xsk_buff_set_size(&xdp->base, len);
+ xsk_buff_dma_sync_for_cpu(&xdp->base);
+
+ if (head)
+ return libeth_xsk_buff_add_frag(head, xdp);
+
+ prefetch(xdp->data);
+
+ return xdp;
+}
+
+void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs,
+ const struct libeth_xdp_buff *xdp);
+
+u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp,
+ const struct libeth_xdp_tx_bulk *bq,
+ enum xdp_action act, int ret);
+
+/**
+ * __libeth_xsk_run_prog - run XDP program on XSk buffer
+ * @xdp: XSk buffer to run the prog on
+ * @bq: buffer bulk for ``XDP_TX`` queueing
+ *
+ * Internal inline abstraction to run XDP program on XSk Rx path. Handles
+ * only the most common ``XDP_REDIRECT`` inline, the rest is processed
+ * externally.
+ * Reports an XDP prog exception on errors.
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+static __always_inline u32
+__libeth_xsk_run_prog(struct libeth_xdp_buff *xdp,
+ const struct libeth_xdp_tx_bulk *bq)
+{
+ enum xdp_action act;
+ int ret = 0;
+
+ act = bpf_prog_run_xdp(bq->prog, &xdp->base);
+ if (unlikely(act != XDP_REDIRECT))
+rest:
+ return __libeth_xsk_run_prog_slow(xdp, bq, act, ret);
+
+ ret = xdp_do_redirect(bq->dev, &xdp->base, bq->prog);
+ if (unlikely(ret))
+ goto rest;
+
+ return LIBETH_XDP_REDIRECT;
+}
+
+/**
+ * libeth_xsk_run_prog - run XDP program on XSk path and handle all verdicts
+ * @xdp: XSk buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers
+ * @fl: driver ``XDP_TX`` bulk flush callback
+ *
+ * Run the attached XDP program and handle all possible verdicts.
+ * Prefer using it via LIBETH_XSK_DEFINE_RUN{,_PASS,_PROG}().
+ *
+ * Return: libeth_xdp prog verdict depending on the prog's verdict.
+ */
+#define libeth_xsk_run_prog(xdp, bq, fl) \
+ __libeth_xdp_run_flush(xdp, bq, __libeth_xsk_run_prog, \
+ libeth_xsk_tx_queue_bulk, fl)
+
+/**
+ * __libeth_xsk_run_pass - helper to run XDP program and handle the result
+ * @xdp: XSk buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @rs: onstack libeth RQ stats
+ * @md: metadata that should be filled to the XSk buffer
+ * @prep: callback for filling the metadata
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Inline abstraction, XSk's counterpart of __libeth_xdp_run_pass(), see its
+ * doc for details.
+ *
+ * Return: false if the polling loop must be exited due to lack of free
+ * buffers, true otherwise.
+ */
+static __always_inline bool
+__libeth_xsk_run_pass(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi,
+ struct libeth_rq_napi_stats *rs, const void *md,
+ void (*prep)(struct libeth_xdp_buff *xdp,
+ const void *md),
+ u32 (*run)(struct libeth_xdp_buff *xdp,
+ struct libeth_xdp_tx_bulk *bq),
+ bool (*populate)(struct sk_buff *skb,
+ const struct libeth_xdp_buff *xdp,
+ struct libeth_rq_napi_stats *rs))
+{
+ struct sk_buff *skb;
+ u32 act;
+
+ rs->bytes += xdp->base.data_end - xdp->data;
+ rs->packets++;
+
+ if (unlikely(xdp_buff_has_frags(&xdp->base)))
+ libeth_xsk_buff_stats_frags(rs, xdp);
+
+ if (prep && (!__builtin_constant_p(!!md) || md))
+ prep(xdp, md);
+
+ act = run(xdp, bq);
+ if (likely(act == LIBETH_XDP_REDIRECT))
+ return true;
+
+ if (act != LIBETH_XDP_PASS)
+ return act != LIBETH_XDP_ABORTED;
+
+ skb = xdp_build_skb_from_zc(&xdp->base);
+ if (unlikely(!skb)) {
+ libeth_xsk_buff_free_slow(xdp);
+ return true;
+ }
+
+ if (unlikely(!populate(skb, xdp, rs))) {
+ napi_consume_skb(skb, true);
+ return true;
+ }
+
+ napi_gro_receive(napi, skb);
+
+ return true;
+}
+
+/**
+ * libeth_xsk_run_pass - helper to run XDP program and handle the result
+ * @xdp: XSk buffer to process
+ * @bq: XDP Tx bulk to queue ``XDP_TX`` frames
+ * @napi: NAPI to build an skb and pass it up the stack
+ * @rs: onstack libeth RQ stats
+ * @desc: pointer to the HW descriptor for that frame
+ * @run: driver wrapper to run XDP program
+ * @populate: driver callback to populate an skb with the HW descriptor data
+ *
+ * Wrapper around the underscored version when "fill the descriptor metadata"
+ * means just writing the pointer to the HW descriptor as @xdp->desc.
+ */
+#define libeth_xsk_run_pass(xdp, bq, napi, rs, desc, run, populate) \
+ __libeth_xsk_run_pass(xdp, bq, napi, rs, desc, libeth_xdp_prep_desc, \
+ run, populate)
+
+/**
+ * libeth_xsk_finalize_rx - finalize XDPSQ after an XSk NAPI polling loop
+ * @bq: ``XDP_TX`` frame bulk
+ * @flush: driver callback to flush the bulk
+ * @finalize: driver callback to start sending the frames and run the timer
+ *
+ * Flush the bulk if there are frames left to send, kick the queue and flush
+ * the XDP maps.
+ */
+#define libeth_xsk_finalize_rx(bq, flush, finalize) \
+ __libeth_xdp_finalize_rx(bq, LIBETH_XDP_TX_XSK, flush, finalize)
+
+/*
+ * Helpers to reduce boilerplate code in drivers.
+ *
+ * Typical driver XSk Rx flow would be (excl. bulk and buff init, frag attach):
+ *
+ * LIBETH_XDP_DEFINE_START();
+ * LIBETH_XSK_DEFINE_FLUSH_TX(static driver_xsk_flush_tx, driver_xsk_tx_prep,
+ * driver_xdp_xmit);
+ * LIBETH_XSK_DEFINE_RUN(static driver_xsk_run, driver_xsk_run_prog,
+ * driver_xsk_flush_tx, driver_populate_skb);
+ * LIBETH_XSK_DEFINE_FINALIZE(static driver_xsk_finalize_rx,
+ * driver_xsk_flush_tx, driver_xdp_finalize_sq);
+ * LIBETH_XDP_DEFINE_END();
+ *
+ * This will build a set of 4 static functions. The compiler is free to decide
+ * whether to inline them.
+ * Then, in the NAPI polling function:
+ *
+ * while (packets < budget) {
+ * // ...
+ * if (!driver_xsk_run(xdp, &bq, napi, &rs, desc))
+ * break;
+ * }
+ * driver_xsk_finalize_rx(&bq);
+ */
+
+/**
+ * LIBETH_XSK_DEFINE_FLUSH_TX - define a driver XSk ``XDP_TX`` flush function
+ * @name: name of the function to define
+ * @prep: driver callback to clean an XDPSQ
+ * @xmit: driver callback to write a HW Tx descriptor
+ */
+#define LIBETH_XSK_DEFINE_FLUSH_TX(name, prep, xmit) \
+ __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xsk)
+
+/**
+ * LIBETH_XSK_DEFINE_RUN_PROG - define a driver XDP program run function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an XSk ``XDP_TX`` bulk
+ */
+#define LIBETH_XSK_DEFINE_RUN_PROG(name, flush) \
+ u32 __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xsk)
+
+/**
+ * LIBETH_XSK_DEFINE_RUN_PASS - define a driver buffer process + pass function
+ * @name: name of the function to define
+ * @run: driver callback to run XDP program (above)
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XSK_DEFINE_RUN_PASS(name, run, populate) \
+ bool __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xsk)
+
+/**
+ * LIBETH_XSK_DEFINE_RUN - define a driver buffer process, run + pass function
+ * @name: name of the function to define
+ * @run: name of the XDP prog run function to define
+ * @flush: driver callback to flush an XSk ``XDP_TX`` bulk
+ * @populate: driver callback to fill an skb with HW descriptor info
+ */
+#define LIBETH_XSK_DEFINE_RUN(name, run, flush, populate) \
+ __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XSK)
+
+/**
+ * LIBETH_XSK_DEFINE_FINALIZE - define a driver XSk NAPI poll finalize function
+ * @name: name of the function to define
+ * @flush: driver callback to flush an XSk ``XDP_TX`` bulk
+ * @finalize: driver callback to finalize an XDPSQ and run the timer
+ */
+#define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize) \
+ __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk)
+
+/* Refilling */
+
+/**
+ * struct libeth_xskfq - structure representing an XSk buffer (fill) queue
+ * @fp: hotpath part of the structure
+ * @pool: &xsk_buff_pool for buffer management
+ * @fqes: array of XSk buffer pointers
+ * @descs: opaque pointer to the HW descriptor array
+ * @ntu: index of the next buffer to poll
+ * @count: number of descriptors/buffers the queue has
+ * @pending: current number of XSkFQEs to refill
+ * @thresh: threshold below which the queue is refilled
+ * @buf_len: HW-writeable length per each buffer
+ * @truesize: step between consecutive buffers, 0 if none exists
+ * @nid: ID of the closest NUMA node with memory
+ */
+struct libeth_xskfq {
+ struct_group_tagged(libeth_xskfq_fp, fp,
+ struct xsk_buff_pool *pool;
+ struct libeth_xdp_buff **fqes;
+ void *descs;
+
+ u32 ntu;
+ u32 count;
+ );
+
+ /* Cold fields */
+ u32 pending;
+ u32 thresh;
+
+ u32 buf_len;
+ u32 truesize;
+
+ int nid;
+};
+
+int libeth_xskfq_create(struct libeth_xskfq *fq);
+void libeth_xskfq_destroy(struct libeth_xskfq *fq);
+
+/**
+ * libeth_xsk_buff_xdp_get_dma - get DMA address of XSk &libeth_xdp_buff
+ * @xdp: buffer to get the DMA addr for
+ */
+#define libeth_xsk_buff_xdp_get_dma(xdp) \
+ xsk_buff_xdp_get_dma(&(xdp)->base)
+
+/**
+ * libeth_xskfqe_alloc - allocate @n XSk Rx buffers
+ * @fq: hotpath part of the XSkFQ, usually onstack
+ * @n: number of buffers to allocate
+ * @fill: driver callback to write DMA addresses to HW descriptors
+ *
+ * Note that @fq->ntu gets updated, but ::pending must be recalculated
+ * by the caller.
+ *
+ * Return: number of buffers refilled.
+ */
+static __always_inline u32
+libeth_xskfqe_alloc(struct libeth_xskfq_fp *fq, u32 n,
+ void (*fill)(const struct libeth_xskfq_fp *fq, u32 i))
+{
+ u32 this, ret, done = 0;
+ struct xdp_buff **xskb;
+
+ this = fq->count - fq->ntu;
+ if (likely(this > n))
+ this = n;
+
+again:
+ xskb = (typeof(xskb))&fq->fqes[fq->ntu];
+ ret = xsk_buff_alloc_batch(fq->pool, xskb, this);
+
+ for (u32 i = 0, ntu = fq->ntu; likely(i < ret); i++)
+ fill(fq, ntu + i);
+
+ done += ret;
+ fq->ntu += ret;
+
+ if (likely(fq->ntu < fq->count) || unlikely(ret < this))
+ goto out;
+
+ fq->ntu = 0;
+
+ if (this < n) {
+ this = n - this;
+ goto again;
+ }
+
+out:
+ return done;
+}
+
+/* .ndo_xsk_wakeup */
+
+void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi);
+void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid);
+
+/* Pool setup */
+
+int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable);
+
+#endif /* __LIBETH_XSK_H */
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 53bd2d02a4f0..26232f603e33 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -138,12 +138,12 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
static inline void lwtunnel_set_redirect(struct dst_entry *dst)
{
if (lwtunnel_output_redirect(dst->lwtstate)) {
- dst->lwtstate->orig_output = dst->output;
- dst->output = lwtunnel_output;
+ dst->lwtstate->orig_output = READ_ONCE(dst->output);
+ WRITE_ONCE(dst->output, lwtunnel_output);
}
if (lwtunnel_input_redirect(dst->lwtstate)) {
- dst->lwtstate->orig_input = dst->input;
- dst->input = lwtunnel_input;
+ dst->lwtstate->orig_input = READ_ONCE(dst->input);
+ WRITE_ONCE(dst->input, lwtunnel_input);
}
}
#else
@@ -206,6 +206,7 @@ static inline int lwtunnel_valid_encap_type(u16 encap_type,
NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel");
return -EOPNOTSUPP;
}
+
static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
struct netlink_ext_ack *extack)
{
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c3ed2fcff8b7..40cb20d9309c 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -7,7 +7,7 @@
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2024 Intel Corporation
+ * Copyright (C) 2018 - 2026 Intel Corporation
*/
#ifndef MAC80211_H
@@ -365,6 +365,7 @@ struct ieee80211_vif_chanctx_switch {
* @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed.
* @BSS_CHANGED_MLD_TTLM: negotiated TID to link mapping was changed
* @BSS_CHANGED_TPE: transmit power envelope changed
+ * @BSS_CHANGED_NAN_LOCAL_SCHED: NAN local schedule changed (NAN mode only)
*/
enum ieee80211_bss_change {
BSS_CHANGED_ASSOC = 1<<0,
@@ -402,6 +403,7 @@ enum ieee80211_bss_change {
BSS_CHANGED_MLD_VALID_LINKS = BIT_ULL(33),
BSS_CHANGED_MLD_TTLM = BIT_ULL(34),
BSS_CHANGED_TPE = BIT_ULL(35),
+ BSS_CHANGED_NAN_LOCAL_SCHED = BIT_ULL(36),
/* when adding here, make sure to change ieee80211_reconfig */
};
@@ -682,6 +684,9 @@ struct ieee80211_parsed_tpe {
* responder functionality.
* @ftmr_params: configurable lci/civic parameter when enabling FTM responder.
* @nontransmitted: this BSS is a nontransmitted BSS profile
+ * @tx_bss_conf: Pointer to the BSS configuration of transmitting interface
+ * if MBSSID is enabled. This pointer is RCU-protected due to CSA finish
+ * and BSS color change flows accessing it.
* @transmitter_bssid: the address of transmitter AP
* @bssid_index: index inside the multiple BSSID set
* @bssid_indicator: 2^bssid_indicator is the maximum number of APs in set
@@ -702,6 +707,8 @@ struct ieee80211_parsed_tpe {
* @tpe: transmit power envelope information
* @pwr_reduction: power constraint of BSS.
* @eht_support: does this BSS support EHT
+ * @epcs_support: does this BSS support EPCS
+ * @uhr_support: does this BSS support UHR
* @csa_active: marks whether a channel switch is going on.
* @mu_mimo_owner: indicates interface owns MU-MIMO capability
* @chanctx_conf: The channel context this interface is assigned to, or %NULL
@@ -740,6 +747,7 @@ struct ieee80211_parsed_tpe {
* @eht_80mhz_full_bw_ul_mumimo: in AP-mode, does this BSS support the
* reception of an EHT TB PPDU on an RU that spans the entire PPDU
* bandwidth
+ * @eht_disable_mcs15: disable EHT-MCS 15 reception capability.
* @bss_param_ch_cnt: in BSS-mode, the BSS params change count. This
* information is the latest known value. It can come from this link's
* beacon or from a beacon sent by another link.
@@ -753,6 +761,8 @@ struct ieee80211_parsed_tpe {
* be updated to 1, even if bss_param_ch_cnt didn't change. This allows
* the link to know that it heard the latest value from its own beacon
* (as opposed to hearing its value from another link's beacon).
+ * @s1g_long_beacon_period: number of beacon intervals between each long
+ * beacon transmission.
*/
struct ieee80211_bss_conf {
struct ieee80211_vif *vif;
@@ -803,6 +813,7 @@ struct ieee80211_bss_conf {
struct ieee80211_ftm_responder_params *ftmr_params;
/* Multiple BSSID data */
bool nontransmitted;
+ struct ieee80211_bss_conf __rcu *tx_bss_conf;
u8 transmitter_bssid[ETH_ALEN];
u8 bssid_index;
u8 bssid_indicator;
@@ -823,6 +834,8 @@ struct ieee80211_bss_conf {
u8 pwr_reduction;
bool eht_support;
+ bool epcs_support;
+ bool uhr_support;
bool csa_active;
@@ -847,8 +860,80 @@ struct ieee80211_bss_conf {
bool eht_su_beamformee;
bool eht_mu_beamformer;
bool eht_80mhz_full_bw_ul_mumimo;
+ bool eht_disable_mcs15;
+
u8 bss_param_ch_cnt;
u8 bss_param_ch_cnt_link_id;
+
+ u8 s1g_long_beacon_period;
+};
+
+#define IEEE80211_NAN_MAX_CHANNELS 3
+
+/**
+ * struct ieee80211_nan_channel - NAN channel information
+ *
+ * @chanreq: channel request for this NAN channel. Even though this chanreq::ap
+ * is irrelevant for NAN, still store it for convenience - some functions
+ * require it as an argument.
+ * @needed_rx_chains: number of RX chains needed for this NAN channel
+ * @chanctx_conf: chanctx_conf assigned to this NAN channel.
+ * If a local channel is being ULWed (because we needed this chanctx for
+ * something else), the local NAN channel that used this chanctx,
+ * will have this pointer set to %NULL.
+ * A peer NAN channel should never have this pointer set to %NULL.
+ * @channel_entry: the Channel Entry blob as defined in Wi-Fi Aware
+ * (TM) 4.0 specification Table 100 (Channel Entry format for the NAN
+ * Availability attribute).
+ */
+struct ieee80211_nan_channel {
+ struct ieee80211_chan_req chanreq;
+ u8 needed_rx_chains;
+ struct ieee80211_chanctx_conf *chanctx_conf;
+ u8 channel_entry[6];
+};
+
+/**
+ * struct ieee80211_nan_peer_map - NAN peer schedule map
+ *
+ * This stores a single map from a peer's schedule. Each peer can have
+ * multiple maps.
+ *
+ * @map_id: the map ID from the peer schedule, %CFG80211_NAN_INVALID_MAP_ID
+ * if unused
+ * @slots: mapping of time slots to channel configurations in the schedule's
+ * channels array
+ */
+struct ieee80211_nan_peer_map {
+ u8 map_id;
+ struct ieee80211_nan_channel *slots[CFG80211_NAN_SCHED_NUM_TIME_SLOTS];
+};
+
+/**
+ * struct ieee80211_nan_peer_sched - NAN peer schedule
+ *
+ * This stores the complete schedule from a peer. Contains peer-level
+ * parameters and an array of schedule maps.
+ *
+ * @seq_id: the sequence ID from the peer schedule
+ * @committed_dw: committed DW as published by the peer
+ * @max_chan_switch: maximum channel switch time in microseconds
+ * @init_ulw: initial ULWs as published by the peer (copied)
+ * @ulw_size: number of bytes in @init_ulw
+ * @maps: array of peer schedule maps. Invalid slots have map_id set to
+ * %CFG80211_NAN_INVALID_MAP_ID.
+ * @n_channels: number of valid channel entries in @channels
+ * @channels: flexible array of negotiated peer channels for this schedule
+ */
+struct ieee80211_nan_peer_sched {
+ u8 seq_id;
+ u16 committed_dw;
+ u16 max_chan_switch;
+ const u8 *init_ulw;
+ u16 ulw_size;
+ struct ieee80211_nan_peer_map maps[CFG80211_NAN_MAX_PEER_MAPS];
+ u8 n_channels;
+ struct ieee80211_nan_channel channels[] __counted_by(n_channels);
};
/**
@@ -1517,6 +1602,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info)
* known the frame shouldn't be reported.
* @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by
* hardware or driver)
+ * @RX_FLAG_RADIOTAP_VHT: VHT radiotap data is present
*/
enum mac80211_rx_flags {
RX_FLAG_MMIC_ERROR = BIT(0),
@@ -1552,6 +1638,7 @@ enum mac80211_rx_flags {
RX_FLAG_RADIOTAP_LSIG = BIT(28),
RX_FLAG_NO_PSDU = BIT(29),
RX_FLAG_8023 = BIT(30),
+ RX_FLAG_RADIOTAP_VHT = BIT(31),
};
/**
@@ -1584,6 +1671,7 @@ enum mac80211_rx_encoding {
RX_ENC_VHT,
RX_ENC_HE,
RX_ENC_EHT,
+ RX_ENC_UHR,
};
/**
@@ -1617,7 +1705,7 @@ enum mac80211_rx_encoding {
* @antenna: antenna used
* @rate_idx: index of data rate into band's supported rates or MCS index if
* HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT)
- * @nss: number of streams (VHT, HE and EHT only)
+ * @nss: number of streams (VHT, HE, EHT and UHR only)
* @flag: %RX_FLAG_\*
* @encoding: &enum mac80211_rx_encoding
* @bw: &enum rate_info_bw
@@ -1628,6 +1716,11 @@ enum mac80211_rx_encoding {
* @eht: EHT specific rate information
* @eht.ru: EHT RU, from &enum nl80211_eht_ru_alloc
* @eht.gi: EHT GI, from &enum nl80211_eht_gi
+ * @uhr: UHR specific rate information
+ * @uhr.ru: UHR RU, from &enum nl80211_eht_ru_alloc
+ * @uhr.gi: UHR GI, from &enum nl80211_eht_gi
+ * @uhr.elr: UHR ELR MCS was used
+ * @uhr.im: UHR interference mitigation was used
* @rx_flags: internal RX flags for mac80211
* @ampdu_reference: A-MPDU reference number, must be a different value for
* each A-MPDU but the same for each subframe within one A-MPDU
@@ -1659,6 +1752,12 @@ struct ieee80211_rx_status {
u8 ru:4;
u8 gi:2;
} eht;
+ struct {
+ u8 ru:4;
+ u8 gi:2;
+ u8 elr:1;
+ u8 im:1;
+ } uhr;
};
u8 rate_idx;
u8 nss;
@@ -1888,6 +1987,59 @@ enum ieee80211_offload_flags {
IEEE80211_OFFLOAD_DECAP_ENABLED = BIT(2),
};
+#define IEEE80211_NAN_AVAIL_BLOB_MAX_LEN 54
+
+/**
+ * struct ieee80211_eml_params - EHT Operating mode notification parameters
+ *
+ * EML Operating mode notification parameters received in the Operating mode
+ * notification frame. This struct is used as a container to pass the info to
+ * the underlay driver.
+ *
+ * @link_id: the link ID where the Operating mode notification frame has been
+ * received.
+ * @control: EML control field defined in P802.11be section 9.4.1.76.
+ * @link_bitmap: eMLSR/eMLMR enabled links defined in P802.11be
+ * section 9.4.1.76.
+ * @emlmr_mcs_map_count: eMLMR number of valid mcs_map_bw fields according to
+ * P802.11be section 9.4.1.76 (valid if eMLMR mode control bit is set).
+ * @emlmr_mcs_map_bw: eMLMR supported MCS and NSS set subfileds defined in
+ * P802.11be section 9.4.1.76 (valid if eMLMR mode control bit is set).
+ */
+struct ieee80211_eml_params {
+ u8 link_id;
+ u8 control;
+ u16 link_bitmap;
+ u8 emlmr_mcs_map_count;
+ u8 emlmr_mcs_map_bw[9];
+};
+
+/**
+ * struct ieee80211_nan_sched_cfg - NAN schedule configuration
+ * @channels: array of NAN channels. A channel entry is in use if
+ * channels[i].chanreq.oper.chan is not NULL.
+ * @schedule: NAN local schedule - mapping of each 16TU time slot to
+ * the NAN channel on which the radio will operate. NULL if unscheduled.
+ * @avail_blob: NAN Availability attribute blob.
+ * @avail_blob_len: length of the @avail_blob in bytes.
+ * @deferred: indicates that the driver should notify peers before applying the
+ * new NAN schedule, and apply the new schedule the second NAN Slot
+ * boundary after it notified the peers, as defined in Wi-Fi Aware (TM) 4.0
+ * specification, section 5.2.2.
+ * The driver must call ieee80211_nan_sched_update_done() after the
+ * schedule has been applied.
+ * If a HW restart happened while a deferred schedule update was pending,
+ * mac80211 will reconfigure the deferred schedule (and wait for the driver
+ * to notify that the schedule has been applied).
+ */
+struct ieee80211_nan_sched_cfg {
+ struct ieee80211_nan_channel channels[IEEE80211_NAN_MAX_CHANNELS];
+ struct ieee80211_nan_channel *schedule[CFG80211_NAN_SCHED_NUM_TIME_SLOTS];
+ u8 avail_blob[IEEE80211_NAN_AVAIL_BLOB_MAX_LEN];
+ u16 avail_blob_len;
+ bool deferred;
+};
+
/**
* struct ieee80211_vif_cfg - interface configuration
* @assoc: association status
@@ -1916,6 +2068,7 @@ enum ieee80211_offload_flags {
* your driver/device needs to do.
* @ap_addr: AP MLD address, or BSSID for non-MLO connections
* (station mode only)
+ * @nan_sched: NAN schedule parameters. &struct ieee80211_nan_sched_cfg
*/
struct ieee80211_vif_cfg {
/* association related data */
@@ -1934,6 +2087,8 @@ struct ieee80211_vif_cfg {
bool s1g;
bool idle;
u8 ap_addr[ETH_ALEN] __aligned(2);
+ /* Protected by the wiphy mutex */
+ struct ieee80211_nan_sched_cfg nan_sched;
};
#define IEEE80211_TTLM_NUM_TIDS 8
@@ -2020,9 +2175,9 @@ enum ieee80211_neg_ttlm_res {
* @drv_priv: data area for driver use, will always be aligned to
* sizeof(void \*).
* @txq: the multicast data TX queue
+ * @txq_mgmt: the mgmt frame TX queue, currently only exists for NAN devices
* @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see
* &enum ieee80211_offload_flags.
- * @mbssid_tx_vif: Pointer to the transmitting interface if MBSSID is enabled.
*/
struct ieee80211_vif {
enum nl80211_iftype type;
@@ -2039,6 +2194,7 @@ struct ieee80211_vif {
u8 hw_queue[IEEE80211_NUM_ACS];
struct ieee80211_txq *txq;
+ struct ieee80211_txq *txq_mgmt;
netdev_features_t netdev_features;
u32 driver_flags;
@@ -2051,8 +2207,6 @@ struct ieee80211_vif {
bool probe_req_reg;
bool rx_mcast_action_reg;
- struct ieee80211_vif *mbssid_tx_vif;
-
/* must be last */
u8 drv_priv[] __aligned(sizeof(void *));
};
@@ -2423,12 +2577,18 @@ struct ieee80211_sta_aggregates {
* @he_cap: HE capabilities of this STA
* @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities
* @eht_cap: EHT capabilities of this STA
+ * @uhr_cap: UHR capabilities of this STA
+ * @s1g_cap: S1G capabilities of this STA
* @agg: per-link data for multi-link aggregation
- * @bandwidth: current bandwidth the station can receive with
+ * @bandwidth: current bandwidth the station can receive with.
+ * This is the minimum between the peer's capabilities and our own
+ * operating channel width; Invalid for NAN since that is operating on
+ * multiple channels.
* @rx_nss: in HT/VHT, the maximum number of spatial streams the
* station can receive at the moment, changed by operating mode
* notifications and capabilities. The value is only valid after
- * the station moves to associated state.
+ * the station moves to associated state. Invalid for NAN since it
+ * operates on multiple configurations of rx_nss.
* @txpwr: the station tx power configuration
*
*/
@@ -2445,6 +2605,8 @@ struct ieee80211_link_sta {
struct ieee80211_sta_he_cap he_cap;
struct ieee80211_he_6ghz_capa he_6ghz_capa;
struct ieee80211_sta_eht_cap eht_cap;
+ struct ieee80211_sta_uhr_cap uhr_cap;
+ struct ieee80211_sta_s1g_cap s1g_cap;
struct ieee80211_sta_aggregates agg;
@@ -2487,6 +2649,7 @@ struct ieee80211_link_sta {
* @max_amsdu_subframes: indicates the maximal number of MSDUs in a single
* A-MSDU. Taken from the Extended Capabilities element. 0 means
* unlimited.
+ * @eml_cap: EML capabilities of this MLO station
* @cur: currently valid data as aggregated from the active links
* For non MLO STA it will point to the deflink data. For MLO STA
* ieee80211_sta_recalc_aggregates() must be called to update it.
@@ -2506,6 +2669,9 @@ struct ieee80211_link_sta {
* by the AP.
* @valid_links: bitmap of valid links, or 0 for non-MLO
* @spp_amsdu: indicates whether the STA uses SPP A-MSDU or not.
+ * @epp_peer: indicates that the peer is an EPP peer.
+ * @nmi: For NDI stations, pointer to the NMI station of the peer.
+ * @nan_sched: NAN peer schedule for this station. Valid only for NMI stations.
*/
struct ieee80211_sta {
u8 addr[ETH_ALEN] __aligned(2);
@@ -2521,6 +2687,7 @@ struct ieee80211_sta {
bool mlo;
bool spp_amsdu;
u8 max_amsdu_subframes;
+ u16 eml_cap;
struct ieee80211_sta_aggregates *cur;
@@ -2529,9 +2696,15 @@ struct ieee80211_sta {
struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1];
u16 valid_links;
+ bool epp_peer;
struct ieee80211_link_sta deflink;
struct ieee80211_link_sta __rcu *link[IEEE80211_MLD_MAX_NUM_LINKS];
+ struct ieee80211_sta __rcu *nmi;
+
+ /* should only be accessed with the wiphy mutex held */
+ struct ieee80211_nan_peer_sched *nan_sched;
+
/* must be last */
u8 drv_priv[] __aligned(sizeof(void *));
};
@@ -2765,6 +2938,8 @@ struct ieee80211_txq {
* station has a unique address, i.e. each station entry can be identified
* by just its MAC address; this prevents, for example, the same station
* from connecting to two virtual AP interfaces at the same time.
+ * Note that this doesn't apply for NAN, in which the peer's NMI address
+ * can be equal to its NDI address.
*
* @IEEE80211_HW_SUPPORTS_REORDERING_BUFFER: Hardware (or driver) manages the
* reordering buffer internally, guaranteeing mac80211 receives frames in
@@ -2843,14 +3018,20 @@ struct ieee80211_txq {
*
* @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT
* and connecting with a lower bandwidth instead
- * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in
- * EHT in 5 GHz and connecting with a lower bandwidth instead
*
* @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so
* no need to stop queues. This really should be set by a driver that
* implements MLO, so operation can continue on other links when one
* link is switching.
*
+ * @IEEE80211_HW_STRICT: strictly enforce certain things mandated by the spec
+ * but otherwise ignored/worked around for interoperability. This is a
+ * HW flag so drivers can opt in according to their own control, e.g. in
+ * testing.
+ *
+ * @IEEE80211_HW_SUPPORTS_NDP_BLOCKACK: HW can transmit/receive S1G NDP
+ * BlockAck frames.
+ *
* @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays
*/
enum ieee80211_hw_flags {
@@ -2909,8 +3090,9 @@ enum ieee80211_hw_flags {
IEEE80211_HW_DETECTS_COLOR_COLLISION,
IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX,
IEEE80211_HW_DISALLOW_PUNCTURING,
- IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ,
IEEE80211_HW_HANDLES_QUIET_CSA,
+ IEEE80211_HW_STRICT,
+ IEEE80211_HW_SUPPORTS_NDP_BLOCKACK,
/* keep last, obviously */
NUM_IEEE80211_HW_FLAGS
@@ -3176,6 +3358,10 @@ ieee80211_get_tx_rate(const struct ieee80211_hw *hw,
{
if (WARN_ON_ONCE(c->control.rates[0].idx < 0))
return NULL;
+
+ if (c->band >= NUM_NL80211_BANDS)
+ return NULL;
+
return &hw->wiphy->bands[c->band]->bitrates[c->control.rates[0].idx];
}
@@ -3823,7 +4009,7 @@ enum ieee80211_reconfig_type {
* @was_assoc: set if this call is due to deauth/disassoc
* while just having been associated
* @link_id: the link id on which the frame will be TX'ed.
- * Only used with the mgd_prepare_tx() method.
+ * 0 for a non-MLO connection.
*/
struct ieee80211_prep_tx_info {
u16 duration;
@@ -4120,6 +4306,15 @@ struct ieee80211_prep_tx_info {
* Statistics that the driver doesn't fill will be filled by mac80211.
* The callback can sleep.
*
+ * @link_sta_statistics: Get link statistics for this station. For example with
+ * beacon filtering, the statistics kept by mac80211 might not be
+ * accurate, so let the driver pre-fill the statistics. The driver can
+ * fill most of the values (indicating which by setting the filled
+ * bitmap), but not all of them make sense - see the source for which
+ * ones are possible.
+ * Statistics that the driver doesn't fill will be filled by mac80211.
+ * The callback can sleep.
+ *
* @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max),
* bursting) for a hardware TX queue.
* Returns a negative error code on failure.
@@ -4289,6 +4484,8 @@ struct ieee80211_prep_tx_info {
* @mgd_complete_tx: Notify the driver that the response frame for a previously
* transmitted frame announced with @mgd_prepare_tx was received, the data
* is filled similarly to @mgd_prepare_tx though the duration is not used.
+ * Note that this isn't always called for each mgd_prepare_tx() call, for
+ * example for SAE the 'confirm' messages can be on the air in any order.
*
* @mgd_protect_tdls_discover: Protect a TDLS discovery session. After sending
* a TDLS discovery-request, we expect a reply to arrive on the AP's
@@ -4409,6 +4606,12 @@ struct ieee80211_prep_tx_info {
* @del_nan_func: Remove a NAN function. The driver must call
* ieee80211_nan_func_terminated() with
* NL80211_NAN_FUNC_TERM_REASON_USER_REQUEST reason code upon removal.
+ * @nan_peer_sched_changed: Notifies the driver that the peer NAN schedule
+ * has changed. The new schedule is available via sta->nan_sched.
+ * Note that the channel_entry blob might not match the actual chandef
+ * since the bandwidth of the chandef is the minimum of the local and peer
+ * bandwidth. It is the driver responsibility to remove the peer schedule
+ * when the NMI station is removed.
* @can_aggregate_in_amsdu: Called in order to determine if HW supports
* aggregating two specific frames in the same A-MSDU. The relation
* between the skbs should be symmetric and transitive. Note that while
@@ -4453,6 +4656,8 @@ struct ieee80211_prep_tx_info {
* new links bitmaps may be 0 if going from/to a non-MLO situation.
* The @old array contains pointers to the old bss_conf structures
* that were already removed, in case they're needed.
+ * Note that removal of link should always succeed, so the return value
+ * will be ignored in a removal only case.
* This callback can sleep.
* @change_sta_links: Change the valid links of a station, similar to
* @change_vif_links. This callback can sleep.
@@ -4476,6 +4681,9 @@ struct ieee80211_prep_tx_info {
* interface with the specified type would be added, and thus drivers that
* implement this callback need to handle such cases. The type is the full
* &enum nl80211_iftype.
+ * @set_eml_op_mode: Configure eMLSR/eMLMR operation mode in the underlay
+ * driver according to the parameter received in the EML Operating mode
+ * notification frame.
*/
struct ieee80211_ops {
void (*tx)(struct ieee80211_hw *hw,
@@ -4495,7 +4703,7 @@ struct ieee80211_ops {
enum nl80211_iftype new_type, bool p2p);
void (*remove_interface)(struct ieee80211_hw *hw,
struct ieee80211_vif *vif);
- int (*config)(struct ieee80211_hw *hw, u32 changed);
+ int (*config)(struct ieee80211_hw *hw, int radio_idx, u32 changed);
void (*bss_info_changed)(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
struct ieee80211_bss_conf *info,
@@ -4558,8 +4766,10 @@ struct ieee80211_ops {
void (*get_key_seq)(struct ieee80211_hw *hw,
struct ieee80211_key_conf *key,
struct ieee80211_key_seq *seq);
- int (*set_frag_threshold)(struct ieee80211_hw *hw, u32 value);
- int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value);
+ int (*set_frag_threshold)(struct ieee80211_hw *hw, int radio_idx,
+ u32 value);
+ int (*set_rts_threshold)(struct ieee80211_hw *hw, int radio_idx,
+ u32 value);
int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
struct ieee80211_sta *sta);
int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
@@ -4614,6 +4824,10 @@ struct ieee80211_ops {
s64 offset);
void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif);
int (*tx_last_beacon)(struct ieee80211_hw *hw);
+ void (*link_sta_statistics)(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_link_sta *link_sta,
+ struct link_station_info *link_sinfo);
/**
* @ampdu_action:
@@ -4652,7 +4866,8 @@ struct ieee80211_ops {
int (*get_survey)(struct ieee80211_hw *hw, int idx,
struct survey_info *survey);
void (*rfkill_poll)(struct ieee80211_hw *hw);
- void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class);
+ void (*set_coverage_class)(struct ieee80211_hw *hw, int radio_idx,
+ s16 coverage_class);
#ifdef CONFIG_NL80211_TESTMODE
int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
void *data, int len);
@@ -4667,8 +4882,10 @@ struct ieee80211_ops {
void (*channel_switch)(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
struct ieee80211_channel_switch *ch_switch);
- int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant);
- int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant);
+ int (*set_antenna)(struct ieee80211_hw *hw, int radio_idx,
+ u32 tx_ant, u32 rx_ant);
+ int (*get_antenna)(struct ieee80211_hw *hw, int radio_idx,
+ u32 *tx_ant, u32 *rx_ant);
int (*remain_on_channel)(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
@@ -4800,6 +5017,8 @@ struct ieee80211_ops {
void (*del_nan_func)(struct ieee80211_hw *hw,
struct ieee80211_vif *vif,
u8 instance_id);
+ int (*nan_peer_sched_changed)(struct ieee80211_hw *hw,
+ struct ieee80211_sta *sta);
bool (*can_aggregate_in_amsdu)(struct ieee80211_hw *hw,
struct sk_buff *head,
struct sk_buff *skb);
@@ -4862,6 +5081,10 @@ struct ieee80211_ops {
struct ieee80211_neg_ttlm *ttlm);
void (*prep_add_interface)(struct ieee80211_hw *hw,
enum nl80211_iftype type);
+ int (*set_eml_op_mode)(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *sta,
+ struct ieee80211_eml_params *eml_params);
};
/**
@@ -5340,22 +5563,6 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif,
int max_rates);
/**
- * ieee80211_sta_set_expected_throughput - set the expected tpt for a station
- *
- * Call this function to notify mac80211 about a change in expected throughput
- * to a station. A driver for a device that does rate control in firmware can
- * call this function when the expected throughput estimate towards a station
- * changes. The information is used to tune the CoDel AQM applied to traffic
- * going towards that station (which can otherwise be too aggressive and cause
- * slow stations to starve).
- *
- * @pubsta: the station to set throughput for.
- * @thr: the current expected throughput in kbps.
- */
-void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta,
- u32 thr);
-
-/**
* ieee80211_tx_rate_update - transmit rate update callback
*
* Drivers should call this functions with a non-NULL pub sta
@@ -6011,21 +6218,12 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
int tid, struct ieee80211_key_seq *seq);
/**
- * ieee80211_remove_key - remove the given key
- * @keyconf: the parameter passed with the set key
- *
- * Context: Must be called with the wiphy mutex held.
- *
- * Remove the given key. If the key was uploaded to the hardware at the
- * time this function is called, it is not deleted in the hardware but
- * instead assumed to have been removed already.
- */
-void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
-
-/**
* ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN
* @vif: the virtual interface to add the key on
- * @keyconf: new key data
+ * @idx: the keyidx of the key
+ * @key_data: the key data
+ * @key_len: the key data. Might be bigger than the actual key length,
+ * but not smaller (for the driver convinence)
* @link_id: the link id of the key or -1 for non-MLO
*
* When GTK rekeying was done while the system was suspended, (a) new
@@ -6048,13 +6246,11 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
* for the new key for each TID to set up sequence counters properly.
*
* IMPORTANT: If this replaces a key that is present in the hardware,
- * then it will attempt to remove it during this call. In many cases
- * this isn't what you want, so call ieee80211_remove_key() first for
- * the key that's being replaced.
+ * then it will attempt to remove it during this call.
*/
struct ieee80211_key_conf *
ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
- struct ieee80211_key_conf *keyconf,
+ u8 idx, u8 *key_data, u8 key_len,
int link_id);
/**
@@ -6255,6 +6451,30 @@ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw,
struct ieee80211_vif *vif),
void *data);
+struct ieee80211_vif *
+__ieee80211_iterate_interfaces(struct ieee80211_hw *hw,
+ struct ieee80211_vif *prev,
+ u32 iter_flags);
+
+/**
+ * for_each_interface - iterate interfaces under wiphy mutex
+ * @vif: the iterator variable
+ * @hw: the HW to iterate for
+ * @flags: the iteration flags, see &enum ieee80211_interface_iteration_flags
+ */
+#define for_each_interface(vif, hw, flags) \
+ for (vif = __ieee80211_iterate_interfaces(hw, NULL, flags); \
+ vif; \
+ vif = __ieee80211_iterate_interfaces(hw, vif, flags))
+
+/**
+ * for_each_active_interface - iterate active interfaces under wiphy mutex
+ * @vif: the iterator variable
+ * @hw: the HW to iterate for
+ */
+#define for_each_active_interface(vif, hw) \
+ for_each_interface(vif, hw, IEEE80211_IFACE_ITER_ACTIVE)
+
/**
* ieee80211_iterate_active_interfaces_mtx - iterate active interfaces
*
@@ -6267,12 +6487,18 @@ void ieee80211_iterate_active_interfaces_atomic(struct ieee80211_hw *hw,
* @iterator: the iterator function to call, cannot sleep
* @data: first argument of the iterator function
*/
-void ieee80211_iterate_active_interfaces_mtx(struct ieee80211_hw *hw,
- u32 iter_flags,
- void (*iterator)(void *data,
- u8 *mac,
- struct ieee80211_vif *vif),
- void *data);
+static inline void
+ieee80211_iterate_active_interfaces_mtx(struct ieee80211_hw *hw,
+ u32 iter_flags,
+ void (*iterator)(void *data, u8 *mac,
+ struct ieee80211_vif *vif),
+ void *data)
+{
+ struct ieee80211_vif *vif;
+
+ for_each_interface(vif, hw, iter_flags | IEEE80211_IFACE_ITER_ACTIVE)
+ iterator(data, vif->addr, vif);
+}
/**
* ieee80211_iterate_stations_atomic - iterate stations
@@ -6291,6 +6517,20 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw,
struct ieee80211_sta *sta),
void *data);
+struct ieee80211_sta *
+__ieee80211_iterate_stations(struct ieee80211_hw *hw,
+ struct ieee80211_sta *prev);
+
+/**
+ * for_each_station - iterate stations under wiphy mutex
+ * @sta: the iterator variable
+ * @hw: the HW to iterate for
+ */
+#define for_each_station(sta, hw) \
+ for (sta = __ieee80211_iterate_stations(hw, NULL); \
+ sta; \
+ sta = __ieee80211_iterate_stations(hw, sta))
+
/**
* ieee80211_iterate_stations_mtx - iterate stations
*
@@ -6303,10 +6543,17 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw,
* @iterator: the iterator function to call
* @data: first argument of the iterator function
*/
-void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw,
- void (*iterator)(void *data,
- struct ieee80211_sta *sta),
- void *data);
+static inline void
+ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw,
+ void (*iterator)(void *data,
+ struct ieee80211_sta *sta),
+ void *data)
+{
+ struct ieee80211_sta *sta;
+
+ for_each_station(sta, hw)
+ iterator(data, sta);
+}
/**
* ieee80211_queue_work - add work onto the mac80211 workqueue
@@ -6665,6 +6912,31 @@ void ieee80211_iter_chan_contexts_atomic(
void *iter_data);
/**
+ * ieee80211_iter_chan_contexts_mtx - iterate channel contexts
+ * @hw: pointer obtained from ieee80211_alloc_hw().
+ * @iter: iterator function
+ * @iter_data: data passed to iterator function
+ *
+ * Iterate all active channel contexts. This function can only be used while
+ * holding the wiphy mutex.
+ *
+ * The iterator will not find a context that's being added (during
+ * the driver callback to add it) but will find it while it's being
+ * removed.
+ *
+ * Note that during hardware restart, all contexts that existed
+ * before the restart are considered already present so will be
+ * found while iterating, whether they've been re-added already
+ * or not.
+ */
+void ieee80211_iter_chan_contexts_mtx(
+ struct ieee80211_hw *hw,
+ void (*iter)(struct ieee80211_hw *hw,
+ struct ieee80211_chanctx_conf *chanctx_conf,
+ void *data),
+ void *iter_data);
+
+/**
* ieee80211_ap_probereq_get - retrieve a Probe Request template
* @hw: pointer obtained from ieee80211_alloc_hw().
* @vif: &struct ieee80211_vif pointer from the add_interface callback.
@@ -7181,7 +7453,7 @@ ieee80211_get_he_6ghz_capa_vif(const struct ieee80211_supported_band *sband,
}
/**
- * ieee80211_get_eht_iftype_cap_vif - return ETH capabilities for sband/vif
+ * ieee80211_get_eht_iftype_cap_vif - return EHT capabilities for sband/vif
* @sband: the sband to search for the iftype on
* @vif: the vif to get the iftype from
*
@@ -7195,6 +7467,20 @@ ieee80211_get_eht_iftype_cap_vif(const struct ieee80211_supported_band *sband,
}
/**
+ * ieee80211_get_uhr_iftype_cap_vif - return UHR capabilities for sband/vif
+ * @sband: the sband to search for the iftype on
+ * @vif: the vif to get the iftype from
+ *
+ * Return: pointer to the struct ieee80211_sta_uhr_cap, or %NULL is none found
+ */
+static inline const struct ieee80211_sta_uhr_cap *
+ieee80211_get_uhr_iftype_cap_vif(const struct ieee80211_supported_band *sband,
+ struct ieee80211_vif *vif)
+{
+ return ieee80211_get_uhr_iftype_cap(sband, ieee80211_vif_type_p2p(vif));
+}
+
+/**
* ieee80211_update_mu_groups - set the VHT MU-MIMO groud data
*
* @vif: the specified virtual interface
@@ -7220,13 +7506,32 @@ void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif);
* ieee80211_ave_rssi - report the average RSSI for the specified interface
*
* @vif: the specified virtual interface
+ * @link_id: the link ID for MLO, or -1 for non-MLO
*
* Note: This function assumes that the given vif is valid.
*
* Return: The average RSSI value for the requested interface, or 0 if not
* applicable.
*/
-int ieee80211_ave_rssi(struct ieee80211_vif *vif);
+int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id);
+
+/**
+ * ieee80211_calculate_rx_timestamp - calculate timestamp in frame
+ * @hw: pointer as obtained from ieee80211_alloc_hw()
+ * @status: RX status
+ * @mpdu_len: total MPDU length (including FCS)
+ * @mpdu_offset: offset into MPDU to calculate timestamp at
+ *
+ * This function calculates the RX timestamp at the given MPDU offset, taking
+ * into account what the RX timestamp was. An offset of 0 will just normalize
+ * the timestamp to TSF at beginning of MPDU reception.
+ *
+ * Returns: the calculated timestamp
+ */
+u64 ieee80211_calculate_rx_timestamp(struct ieee80211_hw *hw,
+ struct ieee80211_rx_status *status,
+ unsigned int mpdu_len,
+ unsigned int mpdu_offset);
/**
* ieee80211_report_wowlan_wakeup - report WoWLAN wakeup
@@ -7248,7 +7553,9 @@ void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif,
* @band: the band to transmit on
* @sta: optional pointer to get the station to send the frame to
*
- * Return: %true if the skb was prepared, %false otherwise
+ * Return: %true if the skb was prepared, %false otherwise.
+ * On failure, the skb is freed by this function; callers must not
+ * free it again.
*
* Note: must be called under RCU lock
*/
@@ -7572,6 +7879,17 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif,
gfp_t gfp);
/**
+ * ieee80211_nan_sched_update_done - notify that NAN schedule update is done
+ *
+ * This function is called by the driver to notify mac80211 that the NAN
+ * schedule update has been applied.
+ * Must be called with wiphy mutex held. May sleep.
+ *
+ * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ */
+void ieee80211_nan_sched_update_done(struct ieee80211_vif *vif);
+
+/**
* ieee80211_calc_rx_airtime - calculate estimated transmission airtime for RX.
*
* This function calculates the estimated airtime usage of a frame based on the
@@ -7607,19 +7925,22 @@ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
* ieee80211_get_fils_discovery_tmpl - Get FILS discovery template.
* @hw: pointer obtained from ieee80211_alloc_hw().
* @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: valid link_id during MLO or 0 for non-MLO.
*
* The driver is responsible for freeing the returned skb.
*
* Return: FILS discovery template. %NULL on error.
*/
struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw,
- struct ieee80211_vif *vif);
+ struct ieee80211_vif *vif,
+ unsigned int link_id);
/**
* ieee80211_get_unsol_bcast_probe_resp_tmpl - Get unsolicited broadcast
* probe response template.
* @hw: pointer obtained from ieee80211_alloc_hw().
* @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: valid link_id during MLO or 0 for non-MLO.
*
* The driver is responsible for freeing the returned skb.
*
@@ -7627,7 +7948,8 @@ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw,
*/
struct sk_buff *
ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
- struct ieee80211_vif *vif);
+ struct ieee80211_vif *vif,
+ unsigned int link_id);
/**
* ieee80211_obss_color_collision_notify - notify userland about a BSS color
@@ -7797,4 +8119,17 @@ int ieee80211_emulate_switch_vif_chanctx(struct ieee80211_hw *hw,
int n_vifs,
enum ieee80211_chanctx_switch_mode mode);
+/**
+ * ieee80211_vif_nan_started - Return whether a NAN vif is started
+ * @vif: the vif
+ * Return: %true iff the vif is a NAN interface and NAN is started
+ */
+bool ieee80211_vif_nan_started(struct ieee80211_vif *vif);
+
+/**
+ * ieee80211_encrypt_tx_skb - Encrypt the transmit skb
+ * @skb: the skb
+ * Return: 0 if success and non-zero on error
+ */
+int ieee80211_encrypt_tx_skb(struct sk_buff *skb);
#endif /* MAC80211_H */
diff --git a/include/net/macsec.h b/include/net/macsec.h
index bc7de5b53e54..d962093ee923 100644
--- a/include/net/macsec.h
+++ b/include/net/macsec.h
@@ -9,6 +9,7 @@
#include <linux/u64_stats_sync.h>
#include <linux/if_vlan.h>
+#include <linux/workqueue.h>
#include <uapi/linux/if_link.h>
#include <uapi/linux/if_macsec.h>
@@ -123,6 +124,7 @@ struct macsec_dev_stats {
* @key: key structure
* @ssci: short secure channel identifier
* @stats: per-SA stats
+ * @destroy_work: deferred work to free the SA in process context after RCU grace period
*/
struct macsec_rx_sa {
struct macsec_key key;
@@ -136,7 +138,7 @@ struct macsec_rx_sa {
bool active;
struct macsec_rx_sa_stats __percpu *stats;
struct macsec_rx_sc *sc;
- struct rcu_head rcu;
+ struct rcu_work destroy_work;
};
struct pcpu_rx_sc_stats {
@@ -174,6 +176,7 @@ struct macsec_rx_sc {
* @key: key structure
* @ssci: short secure channel identifier
* @stats: per-SA stats
+ * @destroy_work: deferred work to free the SA in process context after RCU grace period
*/
struct macsec_tx_sa {
struct macsec_key key;
@@ -186,7 +189,7 @@ struct macsec_tx_sa {
refcount_t refcnt;
bool active;
struct macsec_tx_sa_stats __percpu *stats;
- struct rcu_head rcu;
+ struct rcu_work destroy_work;
};
/**
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 62e9d7673862..6d836060976a 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -10,6 +10,7 @@
#include "shm_channel.h"
#define GDMA_STATUS_MORE_ENTRIES 0x00000105
+#define GDMA_STATUS_CMD_UNSUPPORTED 0xffffffff
/* Structures labeled with "HW DATA" are exchanged with the hardware. All of
* them are naturally aligned and hence don't need __packed.
@@ -34,6 +35,8 @@ enum gdma_request_type {
GDMA_CREATE_MR = 31,
GDMA_DESTROY_MR = 32,
GDMA_QUERY_HWC_TIMEOUT = 84, /* 0x54 */
+ GDMA_ALLOC_DM = 96, /* 0x60 */
+ GDMA_DESTROY_DM = 97, /* 0x61 */
};
#define GDMA_RESOURCE_DOORBELL_PAGE 27
@@ -58,8 +61,10 @@ enum gdma_eqe_type {
GDMA_EQE_HWC_INIT_EQ_ID_DB = 129,
GDMA_EQE_HWC_INIT_DATA = 130,
GDMA_EQE_HWC_INIT_DONE = 131,
- GDMA_EQE_HWC_SOC_RECONFIG = 132,
+ GDMA_EQE_HWC_FPGA_RECONFIG = 132,
GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133,
+ GDMA_EQE_HWC_SOC_SERVICE = 134,
+ GDMA_EQE_HWC_RESET_REQUEST = 135,
GDMA_EQE_RNIC_QP_FATAL = 176,
};
@@ -70,6 +75,18 @@ enum {
GDMA_DEVICE_MANA_IB = 3,
};
+enum gdma_service_type {
+ GDMA_SERVICE_TYPE_NONE = 0,
+ GDMA_SERVICE_TYPE_RDMA_SUSPEND = 1,
+ GDMA_SERVICE_TYPE_RDMA_RESUME = 2,
+};
+
+struct mana_service_work {
+ struct work_struct work;
+ struct gdma_dev *gdma_dev;
+ enum gdma_service_type event;
+};
+
struct gdma_resource {
/* Protect the bitmap */
spinlock_t lock;
@@ -152,6 +169,7 @@ struct gdma_general_req {
#define GDMA_MESSAGE_V1 1
#define GDMA_MESSAGE_V2 2
#define GDMA_MESSAGE_V3 3
+#define GDMA_MESSAGE_V4 4
struct gdma_general_resp {
struct gdma_resp_hdr hdr;
@@ -197,6 +215,12 @@ enum gdma_page_type {
#define GDMA_INVALID_DMA_REGION 0
+struct mana_serv_work {
+ struct work_struct serv_work;
+ struct pci_dev *pdev;
+ enum gdma_eqe_type type;
+};
+
struct gdma_mem_info {
struct device *dev;
@@ -223,6 +247,8 @@ struct gdma_dev {
void *driver_data;
struct auxiliary_device *adev;
+ bool is_suspended;
+ bool rdma_teardown;
};
/* MANA_PAGE_SIZE is the DMA unit */
@@ -364,6 +390,11 @@ struct gdma_irq_context {
char name[MANA_IRQ_NAME_SZ];
};
+enum gdma_context_flags {
+ GC_PROBE_SUCCEEDED = 0,
+ GC_IN_SERVICE = 1,
+};
+
struct gdma_context {
struct device *dev;
struct dentry *mana_pci_debugfs;
@@ -372,7 +403,7 @@ struct gdma_context {
unsigned int max_num_queues;
unsigned int max_num_msix;
unsigned int num_msix_usable;
- struct gdma_irq_context *irq_contexts;
+ struct xarray irq_contexts;
/* L2 MTU */
u16 adapter_mtu;
@@ -387,12 +418,15 @@ struct gdma_context {
u32 test_event_eq_id;
bool is_pf;
+
phys_addr_t bar0_pa;
void __iomem *bar0_va;
+ resource_size_t bar0_size;
void __iomem *shm_base;
void __iomem *db_page_base;
phys_addr_t phys_db_page_base;
- u32 db_page_size;
+ u64 db_page_off;
+ u64 db_page_size;
int numa_node;
/* Shared memory chanenl (used to bootstrap HWC) */
@@ -406,6 +440,12 @@ struct gdma_context {
/* Azure RDMA adapter */
struct gdma_dev mana_ib;
+
+ u64 pf_cap_flags1;
+
+ struct workqueue_struct *service_wq;
+
+ unsigned long flags;
};
static inline bool mana_gd_is_mana(struct gdma_dev *gd)
@@ -441,6 +481,8 @@ int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe);
void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit);
+int mana_schedule_serv_work(struct gdma_context *gc, enum gdma_eqe_type type);
+
struct gdma_wqe {
u32 reserved :24;
u32 last_vbytes :8;
@@ -462,6 +504,8 @@ struct gdma_wqe {
#define INLINE_OOB_SMALL_SIZE 8
#define INLINE_OOB_LARGE_SIZE 24
+#define MANA_MAX_TX_WQE_SGL_ENTRIES 30
+
#define MAX_TX_WQE_SIZE 512
#define MAX_RX_WQE_SIZE 256
@@ -552,17 +596,53 @@ enum {
*/
#define GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX BIT(2)
#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3)
+#define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4)
#define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5)
/* Driver can handle holes (zeros) in the device list */
#define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
+/* Driver supports dynamic MSI-X vector allocation */
+#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13)
+
+/* Driver can self reset on EQE notification */
+#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14)
+
+/* Driver can self reset on FPGA Reconfig EQE notification */
+#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17)
+
+/* Driver detects stalled send queues and recovers them */
+#define GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY BIT(18)
+
+#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
+
+/* Driver supports linearizing the skb when num_sge exceeds hardware limit */
+#define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20)
+
+/* Driver can send HWC periodically to query stats */
+#define GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY BIT(21)
+
+/* Driver can handle hardware recovery events during probe */
+#define GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY BIT(22)
+
+/* Driver supports self recovery on Hardware Channel timeouts */
+#define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY BIT(25)
+
#define GDMA_DRV_CAP_FLAGS1 \
(GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \
GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \
- GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP)
+ GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
+ GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \
+ GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \
+ GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \
+ GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \
+ GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \
+ GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \
+ GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY | \
+ GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY | \
+ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY)
#define GDMA_DRV_CAP_FLAGS2 0
@@ -706,26 +786,13 @@ struct gdma_query_hwc_timeout_resp {
u32 reserved;
};
-enum atb_page_size {
- ATB_PAGE_SIZE_4K,
- ATB_PAGE_SIZE_8K,
- ATB_PAGE_SIZE_16K,
- ATB_PAGE_SIZE_32K,
- ATB_PAGE_SIZE_64K,
- ATB_PAGE_SIZE_128K,
- ATB_PAGE_SIZE_256K,
- ATB_PAGE_SIZE_512K,
- ATB_PAGE_SIZE_1M,
- ATB_PAGE_SIZE_2M,
- ATB_PAGE_SIZE_MAX,
-};
-
enum gdma_mr_access_flags {
GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0),
GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1),
GDMA_ACCESS_FLAG_REMOTE_READ = BIT_ULL(2),
GDMA_ACCESS_FLAG_REMOTE_WRITE = BIT_ULL(3),
GDMA_ACCESS_FLAG_REMOTE_ATOMIC = BIT_ULL(4),
+ GDMA_ACCESS_FLAG_BIND_MW = BIT_ULL(5),
};
/* GDMA_CREATE_DMA_REGION */
@@ -778,6 +845,7 @@ struct gdma_destroy_dma_region_req {
enum gdma_pd_flags {
GDMA_PD_FLAG_INVALID = 0,
+ GDMA_PD_FLAG_ALLOW_GPA_MR = 1,
};
struct gdma_create_pd_req {
@@ -803,11 +871,24 @@ struct gdma_destory_pd_resp {
};/* HW DATA */
enum gdma_mr_type {
+ /*
+ * Guest Physical Address - MRs of this type allow access
+ * to any DMA-mapped memory using bus-logical address
+ */
+ GDMA_MR_TYPE_GPA = 1,
/* Guest Virtual Address - MRs of this type allow access
* to memory mapped by PTEs associated with this MR using a virtual
* address that is set up in the MST
*/
GDMA_MR_TYPE_GVA = 2,
+ /* Guest zero-based address MRs */
+ GDMA_MR_TYPE_ZBVA = 4,
+ /* Device address MRs */
+ GDMA_MR_TYPE_DM = 5,
+ /* Memory Window type 1 */
+ GDMA_MR_TYPE_MW1 = 6,
+ /* Memory Window type 2 */
+ GDMA_MR_TYPE_MW2 = 7,
};
struct gdma_create_mr_params {
@@ -819,6 +900,16 @@ struct gdma_create_mr_params {
u64 virtual_address;
enum gdma_mr_access_flags access_flags;
} gva;
+ struct {
+ u64 dma_region_handle;
+ enum gdma_mr_access_flags access_flags;
+ } zbva;
+ struct {
+ u64 dm_handle;
+ u64 offset;
+ u64 length;
+ enum gdma_mr_access_flags access_flags;
+ } da;
};
};
@@ -833,10 +924,23 @@ struct gdma_create_mr_request {
u64 dma_region_handle;
u64 virtual_address;
enum gdma_mr_access_flags access_flags;
- } gva;
-
- };
+ } __packed gva;
+ struct {
+ u64 dma_region_handle;
+ enum gdma_mr_access_flags access_flags;
+ } __packed zbva;
+ struct {
+ u64 dm_handle;
+ u64 offset;
+ enum gdma_mr_access_flags access_flags;
+ } __packed da;
+ } __packed;
u32 reserved_2;
+ union {
+ struct {
+ u64 length;
+ } da_ext;
+ };
};/* HW DATA */
struct gdma_create_mr_response {
@@ -855,6 +959,27 @@ struct gdma_destroy_mr_response {
struct gdma_resp_hdr hdr;
};/* HW DATA */
+struct gdma_alloc_dm_req {
+ struct gdma_req_hdr hdr;
+ u64 length;
+ u32 alignment;
+ u32 flags;
+}; /* HW Data */
+
+struct gdma_alloc_dm_resp {
+ struct gdma_resp_hdr hdr;
+ u64 dm_handle;
+}; /* HW Data */
+
+struct gdma_destroy_dm_req {
+ struct gdma_req_hdr hdr;
+ u64 dm_handle;
+}; /* HW Data */
+
+struct gdma_destroy_dm_resp {
+ struct gdma_resp_hdr hdr;
+}; /* HW Data */
+
int mana_gd_verify_vf_version(struct pci_dev *pdev);
int mana_gd_register_device(struct gdma_dev *gd);
@@ -886,4 +1011,11 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle);
void mana_register_debugfs(void);
void mana_unregister_debugfs(void);
+int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event);
+
+int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state);
+int mana_gd_resume(struct pci_dev *pdev);
+
+bool mana_need_log(struct gdma_context *gc, int err);
+
#endif /* _GDMA_H */
diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h
index 158b125692c2..16feb39616c1 100644
--- a/include/net/mana/hw_channel.h
+++ b/include/net/mana/hw_channel.h
@@ -24,6 +24,8 @@
#define HWC_INIT_DATA_PF_DEST_CQ_ID 11
#define HWC_DATA_CFG_HWC_TIMEOUT 1
+#define HWC_DATA_HW_LINK_CONNECT 2
+#define HWC_DATA_HW_LINK_DISCONNECT 3
#define HW_CHANNEL_WAIT_RESOURCE_TIMEOUT_MS 30000
@@ -49,6 +51,15 @@ union hwc_init_type_data {
};
}; /* HW DATA */
+union hwc_init_soc_service_type {
+ u32 as_uint32;
+
+ struct {
+ u32 value : 28;
+ u32 type : 4;
+ };
+}; /* HW DATA */
+
struct hwc_rx_oob {
u32 type : 6;
u32 eom : 1;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 0d00b24eacaf..8f721cd4e4a7 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -5,6 +5,7 @@
#define _MANA_H
#include <net/xdp.h>
+#include <net/net_shaper.h>
#include "gdma.h"
#include "hw_channel.h"
@@ -60,16 +61,23 @@ enum TRI_STATE {
#define MAX_PORTS_IN_MANA_DEV 256
+/* Maximum number of packets per coalesced CQE */
+#define MANA_RXCOMP_OOB_NUM_PPI 4
+
/* Update this count whenever the respective structures are changed */
-#define MANA_STATS_RX_COUNT 5
+#define MANA_STATS_RX_COUNT (6 + MANA_RXCOMP_OOB_NUM_PPI - 1)
#define MANA_STATS_TX_COUNT 11
+#define MANA_RX_FRAG_ALIGNMENT 64
+
struct mana_stats_rx {
u64 packets;
u64 bytes;
u64 xdp_drop;
u64 xdp_tx;
u64 xdp_redirect;
+ u64 pkt_len0_err;
+ u64 coalesced_cqe[MANA_RXCOMP_OOB_NUM_PPI - 1];
struct u64_stats_sync syncp;
};
@@ -224,8 +232,6 @@ struct mana_rxcomp_perpkt_info {
u32 pkt_hash;
}; /* HW DATA */
-#define MANA_RXCOMP_OOB_NUM_PPI 4
-
/* Receive completion OOB */
struct mana_rxcomp_oob {
struct mana_cqe_header cqe_hdr;
@@ -327,6 +333,7 @@ struct mana_rxq {
u32 datasize;
u32 alloc_size;
u32 headroom;
+ u32 frag_count;
mana_handle_t rxobj;
@@ -371,6 +378,13 @@ struct mana_tx_qp {
struct mana_ethtool_stats {
u64 stop_queue;
u64 wake_queue;
+ u64 tx_cqe_err;
+ u64 tx_cqe_unknown_type;
+ u64 tx_linear_pkt_cnt;
+ u64 rx_cqe_unknown_type;
+};
+
+struct mana_ethtool_hc_stats {
u64 hc_rx_discards_no_wqe;
u64 hc_rx_err_vport_disabled;
u64 hc_rx_bytes;
@@ -398,26 +412,92 @@ struct mana_ethtool_stats {
u64 hc_tx_mcast_pkts;
u64 hc_tx_mcast_bytes;
u64 hc_tx_err_gdma;
- u64 tx_cqe_err;
- u64 tx_cqe_unknown_type;
- u64 rx_coalesced_err;
- u64 rx_cqe_unknown_type;
+};
+
+struct mana_ethtool_phy_stats {
+ /* Drop Counters */
+ u64 rx_pkt_drop_phy;
+ u64 tx_pkt_drop_phy;
+
+ /* Per TC traffic Counters */
+ u64 rx_pkt_tc0_phy;
+ u64 tx_pkt_tc0_phy;
+ u64 rx_pkt_tc1_phy;
+ u64 tx_pkt_tc1_phy;
+ u64 rx_pkt_tc2_phy;
+ u64 tx_pkt_tc2_phy;
+ u64 rx_pkt_tc3_phy;
+ u64 tx_pkt_tc3_phy;
+ u64 rx_pkt_tc4_phy;
+ u64 tx_pkt_tc4_phy;
+ u64 rx_pkt_tc5_phy;
+ u64 tx_pkt_tc5_phy;
+ u64 rx_pkt_tc6_phy;
+ u64 tx_pkt_tc6_phy;
+ u64 rx_pkt_tc7_phy;
+ u64 tx_pkt_tc7_phy;
+
+ u64 rx_byte_tc0_phy;
+ u64 tx_byte_tc0_phy;
+ u64 rx_byte_tc1_phy;
+ u64 tx_byte_tc1_phy;
+ u64 rx_byte_tc2_phy;
+ u64 tx_byte_tc2_phy;
+ u64 rx_byte_tc3_phy;
+ u64 tx_byte_tc3_phy;
+ u64 rx_byte_tc4_phy;
+ u64 tx_byte_tc4_phy;
+ u64 rx_byte_tc5_phy;
+ u64 tx_byte_tc5_phy;
+ u64 rx_byte_tc6_phy;
+ u64 tx_byte_tc6_phy;
+ u64 rx_byte_tc7_phy;
+ u64 tx_byte_tc7_phy;
+
+ /* Per TC pause Counters */
+ u64 rx_pause_tc0_phy;
+ u64 tx_pause_tc0_phy;
+ u64 rx_pause_tc1_phy;
+ u64 tx_pause_tc1_phy;
+ u64 rx_pause_tc2_phy;
+ u64 tx_pause_tc2_phy;
+ u64 rx_pause_tc3_phy;
+ u64 tx_pause_tc3_phy;
+ u64 rx_pause_tc4_phy;
+ u64 tx_pause_tc4_phy;
+ u64 rx_pause_tc5_phy;
+ u64 tx_pause_tc5_phy;
+ u64 rx_pause_tc6_phy;
+ u64 tx_pause_tc6_phy;
+ u64 rx_pause_tc7_phy;
+ u64 tx_pause_tc7_phy;
};
struct mana_context {
struct gdma_dev *gdma_dev;
u16 num_ports;
+ u8 bm_hostmode;
+ struct mana_ethtool_hc_stats hc_stats;
struct mana_eq *eqs;
struct dentry *mana_eqs_debugfs;
+ struct workqueue_struct *per_port_queue_reset_wq;
+ /* Workqueue for querying hardware stats */
+ struct delayed_work gf_stats_work;
+ bool hwc_timeout_occurred;
struct net_device *ports[MAX_PORTS_IN_MANA_DEV];
+
+ /* Link state change work */
+ struct work_struct link_change_work;
+ u32 link_event;
};
struct mana_port_context {
struct mana_context *ac;
struct net_device *ndev;
+ struct work_struct queue_reset_work;
u8 mac_addr[ETH_ALEN];
@@ -449,6 +529,7 @@ struct mana_port_context {
u32 rxbpre_datasize;
u32 rxbpre_alloc_size;
u32 rxbpre_headroom;
+ u32 rxbpre_frag_count;
struct bpf_prog *bpf_prog;
@@ -466,13 +547,25 @@ struct mana_port_context {
struct mutex vport_mutex;
int vport_use_count;
+ /* Net shaper handle*/
+ struct net_shaper_handle handle;
+
u16 port_idx;
+ /* Currently configured speed (mbps) */
+ u32 speed;
+ /* Maximum speed supported by the SKU (mbps) */
+ u32 max_speed;
bool port_is_up;
bool port_st_save; /* Saved port state */
+ u8 cqe_coalescing_enable;
+ u32 cqe_coalescing_timeout_ns;
+
struct mana_ethtool_stats eth_stats;
+ struct mana_ethtool_phy_stats phy_stats;
+
/* Debugfs */
struct dentry *mana_port_debugfs;
};
@@ -480,6 +573,7 @@ struct mana_port_context {
netdev_tx_t mana_start_xmit(struct sk_buff *skb, struct net_device *ndev);
int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx,
bool update_hash, bool update_tab);
+int mana_disable_vport_rx(struct mana_port_context *apc);
int mana_alloc_queues(struct net_device *ndev);
int mana_attach(struct net_device *ndev);
@@ -488,6 +582,9 @@ int mana_detach(struct net_device *ndev, bool from_close);
int mana_probe(struct gdma_dev *gd, bool resuming);
void mana_remove(struct gdma_dev *gd, bool suspending);
+int mana_rdma_probe(struct gdma_dev *gd);
+void mana_rdma_remove(struct gdma_dev *gd);
+
void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev);
int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames,
u32 flags);
@@ -496,9 +593,14 @@ u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq,
struct bpf_prog *mana_xdp_get(struct mana_port_context *apc);
void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog);
int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf);
-void mana_query_gf_stats(struct mana_port_context *apc);
+int mana_query_gf_stats(struct mana_context *ac);
+int mana_query_link_cfg(struct mana_port_context *apc);
+int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed,
+ int enable_clamping);
+void mana_query_phy_stats(struct mana_port_context *apc);
int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues);
void mana_pre_dealloc_rxbufs(struct mana_port_context *apc);
+void mana_unmap_skb(struct sk_buff *skb, struct mana_port_context *apc);
extern const struct ethtool_ops mana_ethtool_ops;
extern struct dentry *mana_debugfs_root;
@@ -523,6 +625,9 @@ enum mana_command_code {
MANA_FENCE_RQ = 0x20006,
MANA_CONFIG_VPORT_RX = 0x20007,
MANA_QUERY_VPORT_CONFIG = 0x20008,
+ MANA_QUERY_LINK_CONFIG = 0x2000A,
+ MANA_SET_BW_CLAMP = 0x2000B,
+ MANA_QUERY_PHY_STAT = 0x2000c,
/* Privileged commands for the PF mode */
MANA_REGISTER_FILTER = 0x28000,
@@ -531,6 +636,35 @@ enum mana_command_code {
MANA_DEREGISTER_HW_PORT = 0x28004,
};
+/* Query Link Configuration*/
+struct mana_query_link_config_req {
+ struct gdma_req_hdr hdr;
+ mana_handle_t vport;
+}; /* HW DATA */
+
+struct mana_query_link_config_resp {
+ struct gdma_resp_hdr hdr;
+ u32 qos_speed_mbps;
+ u8 qos_unconfigured;
+ u8 reserved1[3];
+ u32 link_speed_mbps;
+ u8 reserved2[4];
+}; /* HW DATA */
+
+/* Set Bandwidth Clamp*/
+struct mana_set_bw_clamp_req {
+ struct gdma_req_hdr hdr;
+ mana_handle_t vport;
+ enum TRI_STATE enable_clamping;
+ u32 link_speed_mbps;
+}; /* HW DATA */
+
+struct mana_set_bw_clamp_resp {
+ struct gdma_resp_hdr hdr;
+ u8 qos_unconfigured;
+ u8 reserved[7];
+}; /* HW DATA */
+
/* Query Device Configuration */
struct mana_query_device_cfg_req {
struct gdma_req_hdr hdr;
@@ -557,7 +691,8 @@ struct mana_query_device_cfg_resp {
u64 pf_cap_flags4;
u16 max_num_vports;
- u16 reserved;
+ u8 bm_hostmode; /* response v3: Bare Metal Host Mode */
+ u8 reserved;
u32 max_num_eqs;
/* response v2: */
@@ -684,6 +819,74 @@ struct mana_query_gf_stat_resp {
u64 tx_err_gdma;
}; /* HW DATA */
+/* Query phy stats */
+struct mana_query_phy_stat_req {
+ struct gdma_req_hdr hdr;
+ u64 req_stats;
+}; /* HW DATA */
+
+struct mana_query_phy_stat_resp {
+ struct gdma_resp_hdr hdr;
+ u64 reported_stats;
+
+ /* Aggregate Drop Counters */
+ u64 rx_pkt_drop_phy;
+ u64 tx_pkt_drop_phy;
+
+ /* Per TC(Traffic class) traffic Counters */
+ u64 rx_pkt_tc0_phy;
+ u64 tx_pkt_tc0_phy;
+ u64 rx_pkt_tc1_phy;
+ u64 tx_pkt_tc1_phy;
+ u64 rx_pkt_tc2_phy;
+ u64 tx_pkt_tc2_phy;
+ u64 rx_pkt_tc3_phy;
+ u64 tx_pkt_tc3_phy;
+ u64 rx_pkt_tc4_phy;
+ u64 tx_pkt_tc4_phy;
+ u64 rx_pkt_tc5_phy;
+ u64 tx_pkt_tc5_phy;
+ u64 rx_pkt_tc6_phy;
+ u64 tx_pkt_tc6_phy;
+ u64 rx_pkt_tc7_phy;
+ u64 tx_pkt_tc7_phy;
+
+ u64 rx_byte_tc0_phy;
+ u64 tx_byte_tc0_phy;
+ u64 rx_byte_tc1_phy;
+ u64 tx_byte_tc1_phy;
+ u64 rx_byte_tc2_phy;
+ u64 tx_byte_tc2_phy;
+ u64 rx_byte_tc3_phy;
+ u64 tx_byte_tc3_phy;
+ u64 rx_byte_tc4_phy;
+ u64 tx_byte_tc4_phy;
+ u64 rx_byte_tc5_phy;
+ u64 tx_byte_tc5_phy;
+ u64 rx_byte_tc6_phy;
+ u64 tx_byte_tc6_phy;
+ u64 rx_byte_tc7_phy;
+ u64 tx_byte_tc7_phy;
+
+ /* Per TC(Traffic Class) pause Counters */
+ u64 rx_pause_tc0_phy;
+ u64 tx_pause_tc0_phy;
+ u64 rx_pause_tc1_phy;
+ u64 tx_pause_tc1_phy;
+ u64 rx_pause_tc2_phy;
+ u64 tx_pause_tc2_phy;
+ u64 rx_pause_tc3_phy;
+ u64 tx_pause_tc3_phy;
+ u64 rx_pause_tc4_phy;
+ u64 tx_pause_tc4_phy;
+ u64 rx_pause_tc5_phy;
+ u64 tx_pause_tc5_phy;
+ u64 rx_pause_tc6_phy;
+ u64 tx_pause_tc6_phy;
+ u64 rx_pause_tc7_phy;
+ u64 tx_pause_tc7_phy;
+}; /* HW DATA */
+
/* Configure vPort Rx Steering */
struct mana_cfg_rx_steer_req_v2 {
struct gdma_req_hdr hdr;
@@ -705,6 +908,10 @@ struct mana_cfg_rx_steer_req_v2 {
struct mana_cfg_rx_steer_resp {
struct gdma_resp_hdr hdr;
+
+ /* V2 */
+ u32 cqe_coalescing_timeout_ns;
+ u32 reserved1;
}; /* HW DATA */
/* Register HW vPort */
@@ -801,6 +1008,7 @@ struct mana_deregister_filter_resp {
#define STATISTICS_FLAGS_TX_ERRORS_GDMA_ERROR 0x0000000004000000
#define MANA_MAX_NUM_QUEUES 64
+#define MANA_DEF_NUM_QUEUES 16
#define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1)
@@ -827,5 +1035,7 @@ int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
u32 doorbell_pg_id);
void mana_uncfg_vport(struct mana_port_context *apc);
-struct net_device *mana_get_primary_netdev_rcu(struct mana_context *ac, u32 port_index);
+struct net_device *mana_get_primary_netdev(struct mana_context *ac,
+ u32 port_index,
+ netdevice_tracker *tracker);
#endif /* _MANA_H */
diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h
index 5199b41497ff..dbabcfb95daf 100644
--- a/include/net/mana/shm_channel.h
+++ b/include/net/mana/shm_channel.h
@@ -4,6 +4,12 @@
#ifndef _SHM_CHANNEL_H
#define _SHM_CHANNEL_H
+#define SMC_APERTURE_BITS 256
+#define SMC_BASIC_UNIT (sizeof(u32))
+#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8))
+#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1)
+#define SMC_APERTURE_SIZE (SMC_APERTURE_BITS / 8)
+
struct shm_channel {
struct device *dev;
void __iomem *base;
diff --git a/include/net/mctp.h b/include/net/mctp.h
index 1ecbff7116f6..d8bf9074110d 100644
--- a/include/net/mctp.h
+++ b/include/net/mctp.h
@@ -26,6 +26,9 @@ struct mctp_hdr {
#define MCTP_VER_MIN 1
#define MCTP_VER_MAX 1
+/* Definitions for ver field */
+#define MCTP_HDR_VER_MASK GENMASK(3, 0)
+
/* Definitions for flags_seq_tag field */
#define MCTP_HDR_FLAG_SOM BIT(7)
#define MCTP_HDR_FLAG_EOM BIT(6)
@@ -69,7 +72,10 @@ struct mctp_sock {
/* bind() params */
unsigned int bind_net;
- mctp_eid_t bind_addr;
+ mctp_eid_t bind_local_addr;
+ mctp_eid_t bind_peer_addr;
+ unsigned int bind_peer_net;
+ bool bind_peer_set;
__u8 bind_type;
/* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */
@@ -183,8 +189,8 @@ struct mctp_sk_key {
struct mctp_skb_cb {
unsigned int magic;
unsigned int net;
- int ifindex; /* extended/direct addressing if set */
- mctp_eid_t src;
+ /* fields below provide extended addressing for ingress to recvmsg() */
+ int ifindex;
unsigned char halen;
unsigned char haddr[MAX_ADDR_LEN];
};
@@ -212,7 +218,7 @@ static inline struct mctp_skb_cb *mctp_cb(struct sk_buff *skb)
BUILD_BUG_ON(sizeof(struct mctp_skb_cb) > sizeof(skb->cb));
WARN_ON(cb->magic != 0x4d435450);
- return (void *)(skb->cb);
+ return cb;
}
/* If CONFIG_MCTP_FLOWS, we may add one of these as a SKB extension,
@@ -222,6 +228,8 @@ struct mctp_flow {
struct mctp_sk_key *key;
};
+struct mctp_dst;
+
/* Route definition.
*
* These are held in the pernet->mctp.routes list, with RCU protection for
@@ -229,16 +237,25 @@ struct mctp_flow {
* dropped on NETDEV_UNREGISTER events.
*
* Updates to the route table are performed under rtnl; all reads under RCU,
- * so routes cannot be referenced over a RCU grace period. Specifically: A
- * caller cannot block between mctp_route_lookup and mctp_route_release()
+ * so routes cannot be referenced over a RCU grace period.
*/
struct mctp_route {
mctp_eid_t min, max;
unsigned char type;
+
unsigned int mtu;
- struct mctp_dev *dev;
- int (*output)(struct mctp_route *route,
+
+ enum {
+ MCTP_ROUTE_DIRECT,
+ MCTP_ROUTE_GATEWAY,
+ } dst_type;
+ union {
+ struct mctp_dev *dev;
+ struct mctp_fq_addr gateway;
+ };
+
+ int (*output)(struct mctp_dst *dst,
struct sk_buff *skb);
struct list_head list;
@@ -246,12 +263,36 @@ struct mctp_route {
struct rcu_head rcu;
};
+/* Route lookup result: dst. Represents the results of a routing decision,
+ * but is only held over the individual routing operation.
+ *
+ * Will typically be stored on the caller stack, and must be released after
+ * usage.
+ */
+struct mctp_dst {
+ struct mctp_dev *dev;
+ unsigned int mtu;
+ mctp_eid_t nexthop;
+ mctp_eid_t saddr;
+
+ /* set for direct addressing */
+ unsigned char halen;
+ unsigned char haddr[MAX_ADDR_LEN];
+
+ int (*output)(struct mctp_dst *dst, struct sk_buff *skb);
+};
+
+int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex,
+ unsigned char halen, const unsigned char *haddr);
+
/* route interfaces */
-struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet,
- mctp_eid_t daddr);
+int mctp_route_lookup(struct net *net, unsigned int dnet,
+ mctp_eid_t daddr, struct mctp_dst *dst);
+
+void mctp_dst_release(struct mctp_dst *dst);
/* always takes ownership of skb */
-int mctp_local_output(struct sock *sk, struct mctp_route *rt,
+int mctp_local_output(struct sock *sk, struct mctp_dst *dst,
struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag);
void mctp_key_unref(struct mctp_sk_key *key);
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 814b5f2e3ed5..ee70f597a4de 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -14,6 +14,7 @@
struct mptcp_info;
struct mptcp_sock;
+struct mptcp_pm_addr_entry;
struct seq_file;
/* MPTCP sk_buff extension data */
@@ -26,7 +27,9 @@ struct mptcp_ext {
u32 subflow_seq;
u16 data_len;
__sum16 csum;
- u8 use_map:1,
+
+ struct_group(flags,
+ u8 use_map:1,
dsn64:1,
data_fin:1,
use_ack:1,
@@ -34,9 +37,10 @@ struct mptcp_ext {
mpc_map:1,
frozen:1,
reset_transient:1;
- u8 reset_reason:4,
+ u8 reset_reason:4,
csum_reqd:1,
infinite_map:1;
+ ); /* end of flags group */
};
#define MPTCPOPT_HMAC_LEN 20
@@ -100,17 +104,9 @@ struct mptcp_out_options {
#define MPTCP_SCHED_MAX 128
#define MPTCP_SCHED_BUF_MAX (MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX)
-#define MPTCP_SUBFLOWS_MAX 8
-
-struct mptcp_sched_data {
- bool reinject;
- u8 subflows;
- struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
-};
-
struct mptcp_sched_ops {
- int (*get_subflow)(struct mptcp_sock *msk,
- struct mptcp_sched_data *data);
+ int (*get_send)(struct mptcp_sock *msk);
+ int (*get_retrans)(struct mptcp_sock *msk);
char name[MPTCP_SCHED_NAME_MAX];
struct module *owner;
@@ -120,6 +116,19 @@ struct mptcp_sched_ops {
void (*release)(struct mptcp_sock *msk);
} ____cacheline_aligned_in_smp;
+#define MPTCP_PM_NAME_MAX 16
+#define MPTCP_PM_MAX 128
+#define MPTCP_PM_BUF_MAX (MPTCP_PM_NAME_MAX * MPTCP_PM_MAX)
+
+struct mptcp_pm_ops {
+ char name[MPTCP_PM_NAME_MAX];
+ struct module *owner;
+ struct list_head list;
+
+ void (*init)(struct mptcp_sock *msk);
+ void (*release)(struct mptcp_sock *msk);
+} ____cacheline_aligned_in_smp;
+
#ifdef CONFIG_MPTCP
void mptcp_init(void);
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 3c88d5bc5eed..3da1a6f8d3f9 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -2,8 +2,6 @@
#ifndef _NDISC_H
#define _NDISC_H
-#include <net/ipv6_stubs.h>
-
/*
* ICMP codes for neighbour discovery messages
*/
@@ -60,15 +58,6 @@ enum {
#include <net/neighbour.h>
-/* Set to 3 to get tracing... */
-#define ND_DEBUG 1
-
-#define ND_PRINTK(val, level, fmt, ...) \
-do { \
- if (val <= ND_DEBUG) \
- net_##level##_ratelimited(fmt, ##__VA_ARGS__); \
-} while (0)
-
struct ctl_table;
struct inet6_dev;
struct net_device;
@@ -368,14 +357,6 @@ static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev
return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev);
}
-static inline
-struct neighbour *__ipv6_neigh_lookup_noref_stub(struct net_device *dev,
- const void *pkey)
-{
- return ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
- ndisc_hashfn, pkey, dev);
-}
-
static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey)
{
struct neighbour *n;
@@ -400,28 +381,20 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev,
rcu_read_unlock();
}
-static inline void __ipv6_confirm_neigh_stub(struct net_device *dev,
- const void *pkey)
-{
- struct neighbour *n;
-
- rcu_read_lock();
- n = __ipv6_neigh_lookup_noref_stub(dev, pkey);
- neigh_confirm(n);
- rcu_read_unlock();
-}
-
-/* uses ipv6_stub and is meant for use outside of IPv6 core */
static inline struct neighbour *ip_neigh_gw6(struct net_device *dev,
const void *addr)
{
+#if IS_ENABLED(CONFIG_IPV6)
struct neighbour *neigh;
- neigh = __ipv6_neigh_lookup_noref_stub(dev, addr);
+ neigh = __ipv6_neigh_lookup_noref(dev, addr);
if (unlikely(!neigh))
- neigh = __neigh_create(ipv6_stub->nd_tbl, addr, dev, false);
+ neigh = __neigh_create(&nd_tbl, addr, dev, false);
return neigh;
+#else
+ return ERR_PTR(-EAFNOSUPPORT);
+#endif
}
int ndisc_init(void);
@@ -443,6 +416,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
void ndisc_send_rs(struct net_device *dev,
const struct in6_addr *saddr, const struct in6_addr *daddr);
+
void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
const struct in6_addr *solicited_addr,
bool router, bool solicited, bool override, bool inc_opt);
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 9a832cab5b1d..8860cc2175fc 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -92,15 +92,17 @@ struct neigh_parms {
static inline void neigh_var_set(struct neigh_parms *p, int index, int val)
{
set_bit(index, p->data_state);
- p->data[index] = val;
+ WRITE_ONCE(p->data[index], val);
}
-#define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr])
+#define __NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr])
+#define NEIGH_VAR(p, attr) READ_ONCE(__NEIGH_VAR(p, attr))
+#define NEIGH_VAR_PTR(p, attr) (&(__NEIGH_VAR(p, attr)))
/* In ndo_neigh_setup, NEIGH_VAR_INIT should be used.
* In other cases, NEIGH_VAR_SET should be used.
*/
-#define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val)
+#define NEIGH_VAR_INIT(p, attr, val) (__NEIGH_VAR(p, attr) = val)
#define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val)
static inline void neigh_parms_data_state_setall(struct neigh_parms *p)
@@ -176,12 +178,17 @@ struct neigh_ops {
};
struct pneigh_entry {
- struct pneigh_entry *next;
+ struct pneigh_entry __rcu *next;
possible_net_t net;
struct net_device *dev;
netdevice_tracker dev_tracker;
+ union {
+ struct list_head free_node;
+ struct rcu_head rcu;
+ };
u32 flags;
u8 protocol;
+ bool permanent;
u32 key[];
};
@@ -231,11 +238,12 @@ struct neigh_table {
atomic_t gc_entries;
struct list_head gc_list;
struct list_head managed_list;
- rwlock_t lock;
+ spinlock_t lock;
unsigned long last_rand;
struct neigh_statistics __percpu *stats;
struct neigh_hash_table __rcu *nht;
- struct pneigh_entry **phash_buckets;
+ struct mutex phash_lock;
+ struct pneigh_entry __rcu **phash_buckets;
};
static inline int neigh_parms_family(struct neigh_parms *p)
@@ -260,13 +268,15 @@ static inline void *neighbour_priv(const struct neighbour *n)
#define NEIGH_UPDATE_F_EXT_LEARNED BIT(5)
#define NEIGH_UPDATE_F_ISROUTER BIT(6)
#define NEIGH_UPDATE_F_ADMIN BIT(7)
+#define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8)
/* In-kernel representation for NDA_FLAGS_EXT flags: */
#define NTF_OLD_MASK 0xff
#define NTF_EXT_SHIFT 8
-#define NTF_EXT_MASK (NTF_EXT_MANAGED)
+#define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED)
#define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT)
+#define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT)
extern const struct nla_policy nda_policy[];
@@ -370,13 +380,20 @@ struct net *neigh_parms_net(const struct neigh_parms *parms)
unsigned long neigh_rand_reach_time(unsigned long base);
+static inline void neigh_set_reach_time(struct neigh_parms *p)
+{
+ unsigned long base = NEIGH_VAR(p, BASE_REACHABLE_TIME);
+
+ WRITE_ONCE(p->reachable_time, neigh_rand_reach_time(base));
+}
+
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb);
struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net,
- const void *key, struct net_device *dev,
- int creat);
-struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net,
- const void *key, struct net_device *dev);
+ const void *key, struct net_device *dev);
+int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key,
+ struct net_device *dev, u32 flags, u8 protocol,
+ bool permanent);
int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key,
struct net_device *dev);
@@ -472,11 +489,15 @@ static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb)
{
- unsigned int seq, hh_alen;
+ unsigned int seq, hh_alen = HH_DATA_ALIGN(ETH_HLEN);
+ int err;
+
+ err = skb_cow_head(skb, hh_alen);
+ if (err)
+ return err;
do {
seq = read_seqbegin(&hh->hh_lock);
- hh_alen = HH_DATA_ALIGN(ETH_HLEN);
memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN);
} while (read_seqretry(&hh->hh_lock, seq));
return 0;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f467a66abc6b..80de5e98a66d 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -37,6 +37,7 @@
#include <net/netns/smc.h>
#include <net/netns/bpf.h>
#include <net/netns/mctp.h>
+#include <net/netns/vsock.h>
#include <net/net_trackers.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
@@ -83,6 +84,9 @@ struct net {
struct llist_node defer_free_list;
struct llist_node cleanup_list; /* namespaces on death row */
+ struct list_head ptype_all;
+ struct list_head ptype_specific;
+
#ifdef CONFIG_KEYS
struct key_tag *key_domain; /* Key domain of operation tag */
#endif
@@ -117,6 +121,7 @@ struct net {
* it is critical that it is on a read_mostly cache line.
*/
u32 hash_mix;
+ bool is_dying;
struct net_device *loopback_dev; /* The loopback */
@@ -193,6 +198,9 @@ struct net {
/* Move to a better place when the config guard is removed. */
struct mutex rtnl_mutex;
#endif
+#if IS_ENABLED(CONFIG_VSOCKETS)
+ struct netns_vsock vsock;
+#endif
} __randomize_layout;
#include <linux/seq_file_net.h>
@@ -201,7 +209,7 @@ struct net {
extern struct net init_net;
#ifdef CONFIG_NET_NS
-struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns,
+struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns,
struct net *old_net);
void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);
@@ -215,7 +223,7 @@ extern struct task_struct *cleanup_net_task;
#else /* CONFIG_NET_NS */
#include <linux/sched.h>
#include <linux/nsproxy.h>
-static inline struct net *copy_net_ns(unsigned long flags,
+static inline struct net *copy_net_ns(u64 flags,
struct user_namespace *user_ns, struct net *old_net)
{
if (flags & CLONE_NEWNET)
@@ -256,13 +264,18 @@ void ipx_unregister_sysctl(void);
#define ipx_unregister_sysctl()
#endif
+static inline struct net *to_net_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct net, ns);
+}
+
#ifdef CONFIG_NET_NS
void __put_net(struct net *net);
/* Try using get_net_track() instead */
static inline struct net *get_net(struct net *net)
{
- refcount_inc(&net->ns.count);
+ ns_ref_inc(net);
return net;
}
@@ -273,7 +286,7 @@ static inline struct net *maybe_get_net(struct net *net)
* exists. If the reference count is zero this
* function fails and returns NULL.
*/
- if (!refcount_inc_not_zero(&net->ns.count))
+ if (!ns_ref_get(net))
net = NULL;
return net;
}
@@ -281,7 +294,7 @@ static inline struct net *maybe_get_net(struct net *net)
/* Try using put_net_track() instead */
static inline void put_net(struct net *net)
{
- if (refcount_dec_and_test(&net->ns.count))
+ if (ns_ref_put(net))
__put_net(net);
}
@@ -293,10 +306,10 @@ int net_eq(const struct net *net1, const struct net *net2)
static inline int check_net(const struct net *net)
{
- return refcount_read(&net->ns.count) != 0;
+ return ns_ref_read(net) != 0;
}
-void net_drop_ns(void *);
+void net_drop_ns(struct ns_common *);
void net_passive_dec(struct net *net);
#else
@@ -472,8 +485,8 @@ struct pernet_operations {
void (*exit)(struct net *net);
void (*exit_batch)(struct list_head *net_exit_list);
/* Following method is called with RTNL held. */
- void (*exit_batch_rtnl)(struct list_head *net_exit_list,
- struct list_head *dev_kill_list);
+ void (*exit_rtnl)(struct net *net,
+ struct list_head *dev_kill_list);
unsigned int * const id;
const size_t size;
};
diff --git a/include/net/net_shaper.h b/include/net/net_shaper.h
index 5c3f49b52fe9..3939b816b001 100644
--- a/include/net/net_shaper.h
+++ b/include/net/net_shaper.h
@@ -53,6 +53,7 @@ struct net_shaper {
/* private: */
u32 leaves; /* accounted only for NODE scope */
+ bool valid;
struct rcu_head rcu;
};
diff --git a/include/net/netdev_lock.h b/include/net/netdev_lock.h
new file mode 100644
index 000000000000..3d3aef80beac
--- /dev/null
+++ b/include/net/netdev_lock.h
@@ -0,0 +1,138 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _NET_NETDEV_LOCK_H
+#define _NET_NETDEV_LOCK_H
+
+#include <linux/lockdep.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+
+static inline bool netdev_trylock(struct net_device *dev)
+{
+ return mutex_trylock(&dev->lock);
+}
+
+static inline void netdev_assert_locked(const struct net_device *dev)
+{
+ lockdep_assert_held(&dev->lock);
+}
+
+static inline void
+netdev_assert_locked_or_invisible(const struct net_device *dev)
+{
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING)
+ netdev_assert_locked(dev);
+}
+
+static inline bool netdev_need_ops_lock(const struct net_device *dev)
+{
+ bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops;
+
+#if IS_ENABLED(CONFIG_NET_SHAPER)
+ ret |= !!dev->netdev_ops->net_shaper_ops;
+#endif
+
+ return ret;
+}
+
+static inline void netdev_lock_ops(struct net_device *dev)
+{
+ if (netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+}
+
+static inline void netdev_unlock_ops(struct net_device *dev)
+{
+ if (netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+}
+
+static inline void netdev_lock_ops_to_full(struct net_device *dev)
+{
+ if (netdev_need_ops_lock(dev))
+ netdev_assert_locked(dev);
+ else
+ netdev_lock(dev);
+}
+
+static inline void netdev_unlock_full_to_ops(struct net_device *dev)
+{
+ if (netdev_need_ops_lock(dev))
+ netdev_assert_locked(dev);
+ else
+ netdev_unlock(dev);
+}
+
+static inline void netdev_ops_assert_locked(const struct net_device *dev)
+{
+ if (netdev_need_ops_lock(dev))
+ lockdep_assert_held(&dev->lock);
+ else
+ ASSERT_RTNL();
+}
+
+static inline void
+netdev_ops_assert_locked_or_invisible(const struct net_device *dev)
+{
+ if (dev->reg_state == NETREG_REGISTERED ||
+ dev->reg_state == NETREG_UNREGISTERING)
+ netdev_ops_assert_locked(dev);
+}
+
+static inline void netdev_lock_ops_compat(struct net_device *dev)
+{
+ if (netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ else
+ rtnl_lock();
+}
+
+static inline void netdev_unlock_ops_compat(struct net_device *dev)
+{
+ if (netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+ else
+ rtnl_unlock();
+}
+
+static inline int netdev_lock_cmp_fn(const struct lockdep_map *a,
+ const struct lockdep_map *b)
+{
+ if (a == b)
+ return 0;
+
+ /* Allow locking multiple devices only under rtnl_lock,
+ * the exact order doesn't matter.
+ * Note that upper devices don't lock their ops, so nesting
+ * mostly happens in batched device removal for now.
+ */
+ return lockdep_rtnl_is_held() ? -1 : 1;
+}
+
+#define netdev_lockdep_set_classes(dev) \
+{ \
+ static struct lock_class_key qdisc_tx_busylock_key; \
+ static struct lock_class_key qdisc_xmit_lock_key; \
+ static struct lock_class_key dev_addr_list_lock_key; \
+ static struct lock_class_key dev_instance_lock_key; \
+ unsigned int i; \
+ \
+ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \
+ lockdep_set_class(&(dev)->addr_list_lock, \
+ &dev_addr_list_lock_key); \
+ lockdep_set_class(&(dev)->lock, \
+ &dev_instance_lock_key); \
+ lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \
+ for (i = 0; i < (dev)->num_tx_queues; i++) \
+ lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \
+ &qdisc_xmit_lock_key); \
+}
+
+#define netdev_lock_dereference(p, dev) \
+ rcu_dereference_protected(p, lockdep_is_held(&(dev)->lock))
+
+int netdev_debug_event(struct notifier_block *nb, unsigned long event,
+ void *ptr);
+
+#endif
diff --git a/include/net/netdev_netlink.h b/include/net/netdev_netlink.h
new file mode 100644
index 000000000000..075962dbe743
--- /dev/null
+++ b/include/net/netdev_netlink.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_NETDEV_NETLINK_H
+#define __NET_NETDEV_NETLINK_H
+
+#include <linux/list.h>
+
+struct netdev_nl_sock {
+ struct mutex lock;
+ struct list_head bindings;
+};
+
+#endif /* __NET_NETDEV_NETLINK_H */
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index b02bb9f109d5..70c9fe9e83cc 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -14,6 +14,10 @@ struct netdev_config {
u8 hds_config;
};
+struct netdev_queue_config {
+ u32 rx_page_size;
+};
+
/* See the netdev.yaml spec for definition of each statistic */
struct netdev_queue_stats_rx {
u64 bytes;
@@ -23,6 +27,7 @@ struct netdev_queue_stats_rx {
u64 hw_drops;
u64 hw_drop_overruns;
+ u64 csum_complete;
u64 csum_unnecessary;
u64 csum_none;
u64 csum_bad;
@@ -84,9 +89,11 @@ struct netdev_queue_stats_tx {
* for some of the events is not maintained, and reliable "total" cannot
* be provided).
*
+ * Ops are called under the instance lock if netdev_need_ops_lock()
+ * returns true, otherwise under rtnl_lock.
* Device drivers can assume that when collecting total device stats,
* the @get_base_stats and subsequent per-queue calls are performed
- * "atomically" (without releasing the rtnl_lock).
+ * "atomically" (without releasing the relevant lock).
*
* Device drivers are encouraged to reset the per-queue statistics when
* number of queues change. This is because the primary use case for
@@ -102,6 +109,17 @@ struct netdev_stat_ops {
struct netdev_queue_stats_tx *tx);
};
+void netdev_stat_queue_sum(struct net_device *netdev,
+ int rx_start, int rx_end,
+ struct netdev_queue_stats_rx *rx_sum,
+ int tx_start, int tx_end,
+ struct netdev_queue_stats_tx *tx_sum);
+
+enum {
+ /* The queue checks and honours the page size qcfg parameter */
+ QCFG_RX_PAGE_SIZE = 0x1,
+};
+
/**
* struct netdev_queue_mgmt_ops - netdev ops for queue management
*
@@ -117,22 +135,65 @@ struct netdev_stat_ops {
*
* @ndo_queue_stop: Stop the RX queue at the specified index. The stopped
* queue's memory is written at the specified address.
+ *
+ * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used
+ * for this queue. Return NULL on error.
+ *
+ * @ndo_default_qcfg: (Optional) Populate queue config struct with defaults.
+ * Queue config structs are passed to this helper before
+ * the user-requested settings are applied.
+ *
+ * @ndo_validate_qcfg: (Optional) Check if queue config is supported.
+ * Called when configuration affecting a queue may be
+ * changing, either due to NIC-wide config, or config
+ * scoped to the queue at a specified index.
+ * When NIC-wide config is changed the callback will
+ * be invoked for all queues.
+ *
+ * @ndo_queue_create: Create a new RX queue on a virtual device that will
+ * be paired with a physical device's queue via leasing.
+ * Return the new queue id on success, negative error
+ * on failure.
+ *
+ * @supported_params: Bitmask of supported parameters, see QCFG_*.
+ *
+ * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
+ * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only
+ * be called for an interface which is open.
*/
struct netdev_queue_mgmt_ops {
- size_t ndo_queue_mem_size;
- int (*ndo_queue_mem_alloc)(struct net_device *dev,
- void *per_queue_mem,
- int idx);
- void (*ndo_queue_mem_free)(struct net_device *dev,
- void *per_queue_mem);
- int (*ndo_queue_start)(struct net_device *dev,
- void *per_queue_mem,
- int idx);
- int (*ndo_queue_stop)(struct net_device *dev,
- void *per_queue_mem,
- int idx);
+ size_t ndo_queue_mem_size;
+ int (*ndo_queue_mem_alloc)(struct net_device *dev,
+ struct netdev_queue_config *qcfg,
+ void *per_queue_mem,
+ int idx);
+ void (*ndo_queue_mem_free)(struct net_device *dev,
+ void *per_queue_mem);
+ int (*ndo_queue_start)(struct net_device *dev,
+ struct netdev_queue_config *qcfg,
+ void *per_queue_mem,
+ int idx);
+ int (*ndo_queue_stop)(struct net_device *dev,
+ void *per_queue_mem,
+ int idx);
+ void (*ndo_default_qcfg)(struct net_device *dev,
+ struct netdev_queue_config *qcfg);
+ int (*ndo_validate_qcfg)(struct net_device *dev,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack);
+ struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev,
+ int idx);
+ int (*ndo_queue_create)(struct net_device *dev,
+ struct netlink_ext_ack *extack);
+
+ unsigned int supported_params;
};
+void netdev_queue_config(struct net_device *dev, int rxq,
+ struct netdev_queue_config *qcfg);
+
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx);
+
/**
* DOC: Lockless queue stopping / waking helpers.
*
@@ -275,28 +336,58 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue,
#define netif_subqueue_try_stop(dev, idx, get_desc, start_thrs) \
({ \
- struct netdev_queue *txq; \
+ struct netdev_queue *_txq; \
\
- txq = netdev_get_tx_queue(dev, idx); \
- netif_txq_try_stop(txq, get_desc, start_thrs); \
+ _txq = netdev_get_tx_queue(dev, idx); \
+ netif_txq_try_stop(_txq, get_desc, start_thrs); \
})
+static inline void netif_subqueue_sent(const struct net_device *dev,
+ unsigned int idx, unsigned int bytes)
+{
+ struct netdev_queue *txq;
+
+ txq = netdev_get_tx_queue(dev, idx);
+ netdev_tx_sent_queue(txq, bytes);
+}
+
+static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq)
+{
+ unsigned long trans_start = READ_ONCE(txq->trans_start);
+
+ if (netif_xmit_stopped(txq) &&
+ time_after(jiffies, trans_start + txq->dev->watchdog_timeo))
+ return jiffies_to_msecs(jiffies - trans_start);
+
+ return 0;
+}
+
#define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \
({ \
- struct netdev_queue *txq; \
+ struct netdev_queue *_txq; \
\
- txq = netdev_get_tx_queue(dev, idx); \
- netif_txq_maybe_stop(txq, get_desc, stop_thrs, start_thrs); \
+ _txq = netdev_get_tx_queue(dev, idx); \
+ netif_txq_maybe_stop(_txq, get_desc, stop_thrs, start_thrs); \
})
#define netif_subqueue_completed_wake(dev, idx, pkts, bytes, \
get_desc, start_thrs) \
({ \
- struct netdev_queue *txq; \
+ struct netdev_queue *_txq; \
\
- txq = netdev_get_tx_queue(dev, idx); \
- netif_txq_completed_wake(txq, pkts, bytes, \
+ _txq = netdev_get_tx_queue(dev, idx); \
+ netif_txq_completed_wake(_txq, pkts, bytes, \
get_desc, start_thrs); \
})
-#endif
+struct device *netdev_queue_get_dma_dev(struct net_device *dev,
+ unsigned int idx,
+ enum netdev_queue_type type);
+bool netdev_can_create_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
+bool netdev_can_lease_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
+bool netdev_queue_busy(struct net_device *dev, unsigned int idx,
+ enum netdev_queue_type type,
+ struct netlink_ext_ack *extack);
+#endif /* _LINUX_NET_QUEUES_H */
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index 596836abf7bf..9415a94d333d 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -7,26 +7,38 @@
#include <linux/sysfs.h>
#include <net/xdp.h>
#include <net/page_pool/types.h>
+#include <net/netdev_queues.h>
+#include <net/rps-types.h>
/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
struct xdp_rxq_info xdp_rxq;
#ifdef CONFIG_RPS
struct rps_map __rcu *rps_map;
- struct rps_dev_flow_table __rcu *rps_flow_table;
+ rps_tag_ptr rps_flow_table;
#endif
struct kobject kobj;
+ const struct attribute_group **groups;
struct net_device *dev;
netdevice_tracker dev_tracker;
+ /* All fields below are "ops protected",
+ * see comment about net_device::lock
+ */
#ifdef CONFIG_XDP_SOCKETS
struct xsk_buff_pool *pool;
#endif
- /* NAPI instance for the queue
- * Readers and writers must hold RTNL
- */
struct napi_struct *napi;
+ struct netdev_queue_config qcfg;
struct pp_memory_provider_params mp_params;
+
+ /* If a queue is leased, then the lease pointer is always
+ * valid. From the physical device it points to the virtual
+ * queue, and from the virtual device it points to the
+ * physical queue.
+ */
+ struct netdev_rx_queue *lease;
+ netdevice_tracker lease_tracker;
} ____cacheline_aligned_in_smp;
/*
@@ -55,6 +67,18 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue)
return index;
}
-int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
+enum netif_lease_dir {
+ NETIF_VIRT_TO_PHYS,
+ NETIF_PHYS_TO_VIRT,
+};
-#endif
+struct netdev_rx_queue *
+__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq,
+ enum netif_lease_dir dir);
+
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
+void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src);
+void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src);
+#endif /* _LINUX_NETDEV_RX_QUEUE_H */
diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
index 2c8c2b023848..b39417ad955e 100644
--- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h
@@ -13,15 +13,9 @@
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp;
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp;
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp;
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp;
#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
-extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite;
-#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre;
#endif
diff --git a/include/net/netfilter/ipv4/nf_reject.h b/include/net/netfilter/ipv4/nf_reject.h
index c653fcb88354..09de2f2686b5 100644
--- a/include/net/netfilter/ipv4/nf_reject.h
+++ b/include/net/netfilter/ipv4/nf_reject.h
@@ -10,14 +10,6 @@
void nf_send_unreach(struct sk_buff *skb_in, int code, int hook);
void nf_send_reset(struct net *net, struct sock *, struct sk_buff *oldskb,
int hook);
-const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
- struct tcphdr *_oth, int hook);
-struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- __u8 protocol, int ttl);
-void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
- const struct tcphdr *oth);
-
struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
struct sk_buff *oldskb,
const struct net_device *dev,
diff --git a/include/net/netfilter/ipv6/nf_reject.h b/include/net/netfilter/ipv6/nf_reject.h
index d729344ba644..94ec0b9f2838 100644
--- a/include/net/netfilter/ipv6/nf_reject.h
+++ b/include/net/netfilter/ipv6/nf_reject.h
@@ -9,16 +9,6 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char cod
unsigned int hooknum);
void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
int hook);
-const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
- struct tcphdr *otcph,
- unsigned int *otcplen, int hook);
-struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- __u8 protocol, int hoplimit);
-void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- const struct tcphdr *oth, unsigned int otcplen);
-
struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net,
struct sk_buff *oldskb,
const struct net_device *dev,
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3f02a45773e8..bc42dd0e10e6 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -16,9 +16,9 @@
#include <linux/bitops.h>
#include <linux/compiler.h>
+#include <net/netns/generic.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
-#include <linux/netfilter/nf_conntrack_dccp.h>
#include <linux/netfilter/nf_conntrack_sctp.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>
@@ -31,7 +31,6 @@ struct nf_ct_udp {
/* per conntrack: protocol private data */
union nf_conntrack_proto {
/* insert conntrack proto private data here */
- struct nf_ct_dccp dccp;
struct ip_ct_sctp sctp;
struct ip_ct_tcp tcp;
struct nf_ct_udp udp;
@@ -306,8 +305,19 @@ static inline bool nf_ct_is_expired(const struct nf_conn *ct)
/* use after obtaining a reference count */
static inline bool nf_ct_should_gc(const struct nf_conn *ct)
{
- return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) &&
- !nf_ct_is_dying(ct);
+ if (!nf_ct_is_confirmed(ct))
+ return false;
+
+ /* load ct->timeout after is_confirmed() test.
+ * Pairs with __nf_conntrack_confirm() which:
+ * 1. Increases ct->timeout value
+ * 2. Inserts ct into rcu hlist
+ * 3. Sets the confirmed bit
+ * 4. Unlocks the hlist lock
+ */
+ smp_acquire__after_ctrl_dep();
+
+ return nf_ct_is_expired(ct) && !nf_ct_is_dying(ct);
}
#define NF_CT_DAY (86400 * HZ)
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 3384859a8921..8883575adcc1 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -83,6 +83,11 @@ void nf_conntrack_lock(spinlock_t *lock);
extern spinlock_t nf_conntrack_expect_lock;
+static inline void lockdep_nfct_expect_lock_held(void)
+{
+ lockdep_assert_held(&nf_conntrack_expect_lock);
+}
+
/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout)
diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h
index 1b58b5b91ff6..cf0166520cf3 100644
--- a/include/net/netfilter/nf_conntrack_count.h
+++ b/include/net/netfilter/nf_conntrack_count.h
@@ -13,20 +13,20 @@ struct nf_conncount_list {
u32 last_gc; /* jiffies at most recent gc */
struct list_head head; /* connections with the same filtering key */
unsigned int count; /* length of list */
+ unsigned int last_gc_count; /* length of list at most recent gc */
};
struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen);
void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data);
-unsigned int nf_conncount_count(struct net *net,
- struct nf_conncount_data *data,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone);
+unsigned int nf_conncount_count_skb(struct net *net,
+ const struct sk_buff *skb,
+ u16 l3num,
+ struct nf_conncount_data *data,
+ const u32 *key);
-int nf_conncount_add(struct net *net, struct nf_conncount_list *list,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone);
+int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb,
+ u16 l3num, struct nf_conncount_list *list);
void nf_conncount_list_init(struct nf_conncount_list *list);
diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h
index 165e7a03b8e9..80f50fd0f7ad 100644
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -22,10 +22,16 @@ struct nf_conntrack_expect {
/* Hash member */
struct hlist_node hnode;
+ /* Network namespace */
+ possible_net_t net;
+
/* We expect this tuple, with the following mask */
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_mask mask;
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ struct nf_conntrack_zone zone;
+#endif
/* Usage count. */
refcount_t use;
@@ -39,8 +45,11 @@ struct nf_conntrack_expect {
void (*expectfn)(struct nf_conn *new,
struct nf_conntrack_expect *this);
+ /* Helper that created this expectation */
+ struct nf_conntrack_helper __rcu *helper;
+
/* Helper to assign to new connection */
- struct nf_conntrack_helper *helper;
+ struct nf_conntrack_helper __rcu *assign_helper;
/* The conntrack of the master connection */
struct nf_conn *master;
@@ -62,7 +71,17 @@ struct nf_conntrack_expect {
static inline struct net *nf_ct_exp_net(struct nf_conntrack_expect *exp)
{
- return nf_ct_net(exp->master);
+ return read_pnet(&exp->net);
+}
+
+static inline bool nf_ct_exp_zone_equal_any(const struct nf_conntrack_expect *a,
+ const struct nf_conntrack_zone *b)
+{
+#ifdef CONFIG_NF_CONNTRACK_ZONES
+ return a->zone.id == b->id;
+#else
+ return true;
+#endif
}
#define NF_CT_EXP_POLICY_NAME_LEN 16
diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h
index 1f47bef51722..fde2427ceb8f 100644
--- a/include/net/netfilter/nf_conntrack_l4proto.h
+++ b/include/net/netfilter/nf_conntrack_l4proto.h
@@ -30,7 +30,7 @@ struct nf_conntrack_l4proto {
/* called by gc worker if table is full */
bool (*can_early_drop)(const struct nf_conn *ct);
- /* convert protoinfo to nfnetink attributes */
+ /* convert protoinfo to nfnetlink attributes */
int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla,
struct nf_conn *ct, bool destroy);
@@ -107,21 +107,11 @@ int nf_conntrack_udp_packet(struct nf_conn *ct,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
const struct nf_hook_state *state);
-int nf_conntrack_udplite_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state);
int nf_conntrack_tcp_packet(struct nf_conn *ct,
struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
const struct nf_hook_state *state);
-int nf_conntrack_dccp_packet(struct nf_conn *ct,
- struct sk_buff *skb,
- unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- const struct nf_hook_state *state);
int nf_conntrack_sctp_packet(struct nf_conn *ct,
struct sk_buff *skb,
unsigned int dataoff,
@@ -137,7 +127,6 @@ void nf_conntrack_generic_init_net(struct net *net);
void nf_conntrack_tcp_init_net(struct net *net);
void nf_conntrack_udp_init_net(struct net *net);
void nf_conntrack_gre_init_net(struct net *net);
-void nf_conntrack_dccp_init_net(struct net *net);
void nf_conntrack_sctp_init_net(struct net *net);
void nf_conntrack_icmp_init_net(struct net *net);
void nf_conntrack_icmpv6_init_net(struct net *net);
@@ -145,8 +134,6 @@ void nf_conntrack_icmpv6_init_net(struct net *net);
/* Existing built-in generic protocol */
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic;
-#define MAX_NF_CT_PROTO IPPROTO_UDPLITE
-
const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto);
/* Generic netlink helpers */
@@ -223,13 +210,6 @@ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct)
}
#endif
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net)
-{
- return &net->ct.nf_ct_proto.dccp;
-}
-#endif
-
#ifdef CONFIG_NF_CT_PROTO_SCTP
static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net)
{
diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index 9fdaba911de6..3a66d4abb6d6 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -14,6 +14,7 @@
struct nf_ct_timeout {
__u16 l3num;
const struct nf_conntrack_l4proto *l4proto;
+ struct rcu_head rcu;
char data[];
};
diff --git a/include/net/netfilter/nf_conntrack_tuple.h b/include/net/netfilter/nf_conntrack_tuple.h
index f7dd950ff250..4d55b7325707 100644
--- a/include/net/netfilter/nf_conntrack_tuple.h
+++ b/include/net/netfilter/nf_conntrack_tuple.h
@@ -11,7 +11,7 @@
#ifndef _NF_CONNTRACK_TUPLE_H
#define _NF_CONNTRACK_TUPLE_H
-#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter.h>
#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <linux/list_nulls.h>
diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h
index b175d271aec9..609bcf422a9b 100644
--- a/include/net/netfilter/nf_dup_netdev.h
+++ b/include/net/netfilter/nf_dup_netdev.h
@@ -3,10 +3,23 @@
#define _NF_DUP_NETDEV_H_
#include <net/netfilter/nf_tables.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif);
void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif);
+#define NF_RECURSION_LIMIT 2
+
+static inline u8 *nf_get_nf_dup_skb_recursion(void)
+{
+#ifndef CONFIG_PREEMPT_RT
+ return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
+#else
+ return &current->net_xmit.nf_dup_skb_recursion;
+#endif
+}
+
struct nft_offload_ctx;
struct nft_flow_rule;
diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
index d711642e78b5..7b23b245a5a8 100644
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -107,6 +107,19 @@ enum flow_offload_xmit_type {
#define NF_FLOW_TABLE_ENCAP_MAX 2
+struct flow_offload_tunnel {
+ union {
+ struct in_addr src_v4;
+ struct in6_addr src_v6;
+ };
+ union {
+ struct in_addr dst_v4;
+ struct in6_addr dst_v6;
+ };
+
+ u8 l3_proto;
+};
+
struct flow_offload_tuple {
union {
struct in_addr src_v4;
@@ -130,22 +143,26 @@ struct flow_offload_tuple {
__be16 proto;
} encap[NF_FLOW_TABLE_ENCAP_MAX];
+ struct flow_offload_tunnel tun;
+
/* All members above are keys for lookups, see flow_offload_hash(). */
struct { } __hash;
- u8 dir:2,
+ u16 dir:2,
xmit_type:3,
encap_num:2,
+ needs_gso_segment:1,
+ tun_num:2,
in_vlan_ingress:2;
u16 mtu;
union {
struct {
struct dst_entry *dst_cache;
+ u32 ifidx;
u32 dst_cookie;
};
struct {
u32 ifidx;
- u32 hw_ifidx;
u8 h_source[ETH_ALEN];
u8 h_dest[ETH_ALEN];
} out;
@@ -206,7 +223,9 @@ struct nf_flow_route {
u16 id;
__be16 proto;
} encap[NF_FLOW_TABLE_ENCAP_MAX];
+ struct flow_offload_tunnel tun;
u8 num_encaps:2,
+ num_tuns:2,
ingress_vlans:2;
} in;
struct {
@@ -214,6 +233,7 @@ struct nf_flow_route {
u32 hw_ifindex;
u8 h_source[ETH_ALEN];
u8 h_dest[ETH_ALEN];
+ u8 needs_gso_segment:1;
} out;
enum flow_offload_xmit_type xmit_type;
} tuple[FLOW_OFFLOAD_DIR_MAX];
@@ -222,6 +242,12 @@ struct nf_flow_route {
struct flow_offload *flow_offload_alloc(struct nf_conn *ct);
void flow_offload_free(struct flow_offload *flow);
+struct nft_flowtable;
+struct nft_pktinfo;
+int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct,
+ struct nf_flow_route *route, enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft);
+
static inline int
nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
flow_setup_cb_t *cb, void *cb_priv)
@@ -370,7 +396,7 @@ static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb)
static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto)
{
- if (!pskb_may_pull(skb, PPPOE_SES_HLEN))
+ if (!pskb_may_pull(skb, ETH_HLEN + PPPOE_SES_HLEN))
return false;
*inner_proto = __nf_flow_pppoe_proto(skb);
diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index e55eedc84ed7..00506792a06d 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -59,6 +59,9 @@ extern int sysctl_nf_log_all_netns;
int nf_log_register(u_int8_t pf, struct nf_logger *logger);
void nf_log_unregister(struct nf_logger *logger);
+/* Check if any logger is registered for a given protocol family. */
+bool nf_log_is_registered(u_int8_t pf);
+
int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger);
void nf_log_unset(struct net *net, const struct nf_logger *logger);
diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 4aeffddb7586..3978c3174cdb 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -6,12 +6,15 @@
#include <linux/ipv6.h>
#include <linux/jhash.h>
#include <linux/netfilter.h>
+#include <linux/rhashtable-types.h>
#include <linux/skbuff.h>
/* Each queued (to userspace) skbuff has one of these. */
struct nf_queue_entry {
struct list_head list;
+ struct rhash_head hash_node;
struct sk_buff *skb;
+ struct net_device *skb_dev;
unsigned int id;
unsigned int hook_index; /* index in hook_entries->hook[] */
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
@@ -19,6 +22,7 @@ struct nf_queue_entry {
struct net_device *physout;
#endif
struct nf_hook_state state;
+ bool nf_ct_is_unconfirmed;
u16 size; /* sizeof(entry) + saved route keys */
/* extra space to store route keys */
diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h
index 7c669792fb9c..f1db33bc6bf8 100644
--- a/include/net/netfilter/nf_reject.h
+++ b/include/net/netfilter/nf_reject.h
@@ -34,7 +34,6 @@ static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff,
/* Protocols with partial checksums. */
case IPPROTO_UDPLITE:
- case IPPROTO_DCCP:
return false;
}
return true;
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 803d5f1601f9..9d844354c4d9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -6,7 +6,6 @@
#include <linux/list.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/nf_tables.h>
#include <linux/u64_stats_sync.h>
#include <linux/rhashtable.h>
@@ -32,7 +31,9 @@ struct nft_pktinfo {
const struct nf_hook_state *state;
u8 flags;
u8 tprot;
+ __be16 ethertype;
u16 fragoff;
+ u16 nhoff;
u16 thoff;
u16 inneroff;
};
@@ -84,6 +85,8 @@ static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt)
{
pkt->flags = 0;
pkt->tprot = 0;
+ pkt->ethertype = pkt->skb->protocol;
+ pkt->nhoff = 0;
pkt->thoff = 0;
pkt->fragoff = 0;
}
@@ -123,17 +126,6 @@ struct nft_regs {
};
};
-struct nft_regs_track {
- struct {
- const struct nft_expr *selector;
- const struct nft_expr *bitwise;
- u8 num_reg;
- } regs[NFT_REG32_NUM];
-
- const struct nft_expr *cur;
- const struct nft_expr *last;
-};
-
/* Store/load an u8, u16 or u64 integer to/from the u32 data register.
*
* Note, when using concatenations, register allocation happens at 32-bit
@@ -188,6 +180,13 @@ static inline u64 nft_reg_load64(const u32 *sreg)
return get_unaligned((u64 *)sreg);
}
+static inline bool nft_reg_overlap(u8 src, u8 dst, u32 len)
+{
+ unsigned int n = DIV_ROUND_UP(len, sizeof(u32));
+
+ return src != dst && src < dst + n && dst < src + n;
+}
+
static inline void nft_data_copy(u32 *dst, const struct nft_data *src,
unsigned int len)
{
@@ -317,11 +316,13 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv)
* @NFT_ITER_UNSPEC: unspecified, to catch errors
* @NFT_ITER_READ: read-only iteration over set elements
* @NFT_ITER_UPDATE: iteration under mutex to update set element state
+ * @NFT_ITER_UPDATE_CLONE: clone set before iteration under mutex to update element
*/
enum nft_iter_type {
NFT_ITER_UNSPEC,
NFT_ITER_READ,
NFT_ITER_UPDATE,
+ NFT_ITER_UPDATE_CLONE,
};
struct nft_set;
@@ -424,8 +425,6 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp);
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
const struct nft_expr *expr, bool reset);
-bool nft_expr_reduce_bitwise(struct nft_regs_track *track,
- const struct nft_expr *expr);
struct nft_set_ext;
@@ -452,6 +451,7 @@ struct nft_set_ext;
* @init: initialize private data of new set instance
* @destroy: destroy private data of set instance
* @gc_init: initialize garbage collection
+ * @abort_skip_removal: skip removal of elements from abort path
* @elemsize: element private size
*
* Operations lookup, update and delete have simpler interfaces, are faster
@@ -459,19 +459,13 @@ struct nft_set_ext;
* control plane functions.
*/
struct nft_set_ops {
- bool (*lookup)(const struct net *net,
+ const struct nft_set_ext * (*lookup)(const struct net *net,
const struct nft_set *set,
+ const u32 *key);
+ const struct nft_set_ext * (*update)(struct nft_set *set,
const u32 *key,
- const struct nft_set_ext **ext);
- bool (*update)(struct nft_set *set,
- const u32 *key,
- struct nft_elem_priv *
- (*new)(struct nft_set *,
- const struct nft_expr *,
- struct nft_regs *),
const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_set_ext **ext);
+ struct nft_regs *regs);
bool (*delete)(const struct nft_set *set,
const u32 *key);
@@ -515,6 +509,7 @@ struct nft_set_ops {
const struct nft_set *set);
void (*gc_init)(const struct nft_set *set);
+ bool abort_skip_removal;
unsigned int elemsize;
};
@@ -562,6 +557,7 @@ struct nft_set_elem_expr {
* @size: maximum set size
* @field_len: length of each field in concatenation, bytes
* @field_count: number of concatenated fields in element
+ * @in_update_walk: true during ->walk() in transaction phase
* @use: number of rules references to this set
* @nelems: number of elements
* @ndeact: number of deactivated elements queued for removal
@@ -596,6 +592,7 @@ struct nft_set {
u32 size;
u8 field_len[NFT_REG32_COUNT];
u8 field_count;
+ bool in_update_walk;
u32 use;
atomic_t nelems;
u32 ndeact;
@@ -875,6 +872,8 @@ struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
u64 timeout, u64 expiration, gfp_t gfp);
int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
struct nft_expr *expr_array[]);
+void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_set_elem_expr *elem_expr);
void nft_set_elem_destroy(const struct nft_set *set,
const struct nft_elem_priv *elem_priv,
bool destroy_expr);
@@ -940,7 +939,6 @@ struct nft_offload_ctx;
* @destroy_clone: destruction clone function
* @dump: function to dump parameters
* @validate: validate expression, called during loop detection
- * @reduce: reduce expression
* @gc: garbage collection expression
* @offload: hardware offload expression
* @offload_action: function to report true/false to allocate one slot or not in the flow
@@ -974,8 +972,6 @@ struct nft_expr_ops {
bool reset);
int (*validate)(const struct nft_ctx *ctx,
const struct nft_expr *expr);
- bool (*reduce)(struct nft_regs_track *track,
- const struct nft_expr *expr);
bool (*gc)(struct net *net,
const struct nft_expr *expr);
int (*offload)(struct nft_offload_ctx *ctx,
@@ -1095,6 +1091,29 @@ struct nft_rule_blob {
__attribute__((aligned(__alignof__(struct nft_rule_dp))));
};
+enum nft_chain_types {
+ NFT_CHAIN_T_DEFAULT = 0,
+ NFT_CHAIN_T_ROUTE,
+ NFT_CHAIN_T_NAT,
+ NFT_CHAIN_T_MAX
+};
+
+/**
+ * struct nft_chain_validate_state - validation state
+ *
+ * If a chain is encountered again during table validation it is
+ * possible to avoid revalidation provided the calling context is
+ * compatible. This structure stores relevant calling context of
+ * previous validations.
+ *
+ * @hook_mask: the hook numbers and locations the chain is linked to
+ * @depth: the deepest call chain level the chain is linked to
+ */
+struct nft_chain_validate_state {
+ u8 hook_mask[NFT_CHAIN_T_MAX];
+ u8 depth;
+};
+
/**
* struct nft_chain - nf_tables chain
*
@@ -1113,6 +1132,7 @@ struct nft_rule_blob {
* @udlen: user data length
* @udata: user data in the chain
* @blob_next: rule blob pointer to the next in the chain
+ * @vstate: validation state
*/
struct nft_chain {
struct nft_rule_blob __rcu *blob_gen_0;
@@ -1132,9 +1152,10 @@ struct nft_chain {
/* Only used during control plane commit phase: */
struct nft_rule_blob *blob_next;
+ struct nft_chain_validate_state vstate;
};
-int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
+int nft_chain_validate(const struct nft_ctx *ctx, struct nft_chain *chain);
int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
const struct nft_set_iter *iter,
struct nft_elem_priv *elem_priv);
@@ -1142,13 +1163,6 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
-enum nft_chain_types {
- NFT_CHAIN_T_DEFAULT = 0,
- NFT_CHAIN_T_ROUTE,
- NFT_CHAIN_T_NAT,
- NFT_CHAIN_T_MAX
-};
-
/**
* struct nft_chain_type - nf_tables chain type info
*
@@ -1197,14 +1211,22 @@ struct nft_stats {
struct u64_stats_sync syncp;
};
+#define NFT_HOOK_REMOVE (1 << 0)
+
struct nft_hook {
struct list_head list;
- struct nf_hook_ops ops;
+ struct list_head ops_list;
struct rcu_head rcu;
char ifname[IFNAMSIZ];
u8 ifnamelen;
+ u8 flags;
};
+struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook,
+ const struct net_device *dev);
+struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook,
+ const struct net_device *dev);
+
/**
* struct nft_base_chain - nf_tables base chain
*
@@ -1653,6 +1675,16 @@ struct nft_trans {
};
/**
+ * struct nft_trans_hook - nf_tables hook update in transaction
+ * @list: used internally
+ * @hook: struct nft_hook with the device hook
+ */
+struct nft_trans_hook {
+ struct list_head list;
+ struct nft_hook *hook;
+};
+
+/**
* struct nft_trans_binding - nf_tables object with binding support in transaction
* @nft_trans: base structure, MUST be first member
* @binding_list: list of objects with possible bindings
@@ -1837,6 +1869,11 @@ struct nft_trans_gc {
struct rcu_head rcu;
};
+static inline int nft_trans_gc_space(const struct nft_trans_gc *trans)
+{
+ return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
static inline void nft_ctx_update(struct nft_ctx *ctx,
const struct nft_trans *trans)
{
@@ -1913,7 +1950,6 @@ struct nftables_pernet {
struct mutex commit_mutex;
u64 table_handle;
u64 tstamp;
- unsigned int base_seq;
unsigned int gc_seq;
u8 validate_state;
struct work_struct destroy_work;
@@ -1931,25 +1967,4 @@ static inline u64 nft_net_tstamp(const struct net *net)
return nft_pernet(net)->tstamp;
}
-#define __NFT_REDUCE_READONLY 1UL
-#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY
-
-static inline bool nft_reduce_is_readonly(const struct nft_expr *expr)
-{
- return expr->ops->reduce == NFT_REDUCE_READONLY;
-}
-
-void nft_reg_track_update(struct nft_regs_track *track,
- const struct nft_expr *expr, u8 dreg, u8 len);
-void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len);
-void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg);
-
-static inline bool nft_reg_track_cmp(struct nft_regs_track *track,
- const struct nft_expr *expr, u8 dreg)
-{
- return track->regs[dreg].selector &&
- track->regs[dreg].selector->ops == expr->ops &&
- track->regs[dreg].num_reg == 0;
-}
-
#endif /* _NET_NF_TABLES_H */
diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h
index 03b6165756fc..b8df5acbb723 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -73,7 +73,7 @@ struct nft_ct {
struct nft_payload {
enum nft_payload_bases base:8;
- u8 offset;
+ u16 offset;
u8 len;
u8 dreg;
};
@@ -94,34 +94,35 @@ extern const struct nft_set_type nft_set_pipapo_type;
extern const struct nft_set_type nft_set_pipapo_avx2_type;
#ifdef CONFIG_MITIGATION_RETPOLINE
-bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_hash_lookup_fast(const struct net *net,
- const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-bool nft_set_do_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
-#else
-static inline bool
-nft_set_do_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
-{
- return set->ops->lookup(net, set, key, ext);
-}
+const struct nft_set_ext *
+nft_rhash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+const struct nft_set_ext *
+nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+const struct nft_set_ext *
+nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+const struct nft_set_ext *
+nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+const struct nft_set_ext *
+nft_hash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
#endif
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
+
/* called from nft_pipapo_avx2.c */
-bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
+const struct nft_set_ext *
+nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
/* called from nft_set_pipapo.c */
-bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext);
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key);
void nft_counter_init_seqcount(void);
@@ -181,4 +182,7 @@ void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt);
void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt);
+struct nft_elem_priv *nft_dynset_new(struct nft_set *set,
+ const struct nft_expr *expr,
+ struct nft_regs *regs);
#endif /* _NET_NF_TABLES_CORE_H */
diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
index fcf967286e37..e715405a73cb 100644
--- a/include/net/netfilter/nf_tables_ipv4.h
+++ b/include/net/netfilter/nf_tables_ipv4.h
@@ -12,16 +12,19 @@ static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt)
ip = ip_hdr(pkt->skb);
pkt->flags = NFT_PKTINFO_L4PROTO;
pkt->tprot = ip->protocol;
+ pkt->ethertype = pkt->skb->protocol;
+ pkt->nhoff = 0;
pkt->thoff = ip_hdrlen(pkt->skb);
pkt->fragoff = ntohs(ip->frag_off) & IP_OFFSET;
}
-static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt)
+static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt,
+ int nhoff)
{
struct iphdr *iph, _iph;
u32 len, thoff, skb_len;
- iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb),
+ iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb) + nhoff,
sizeof(*iph), &_iph);
if (!iph)
return -1;
@@ -31,7 +34,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt)
len = iph_totlen(pkt->skb, iph);
thoff = iph->ihl * 4;
- skb_len = pkt->skb->len - skb_network_offset(pkt->skb);
+ skb_len = pkt->skb->len - skb_network_offset(pkt->skb) - nhoff;
if (skb_len < len)
return -1;
@@ -42,7 +45,9 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt)
pkt->flags = NFT_PKTINFO_L4PROTO;
pkt->tprot = iph->protocol;
- pkt->thoff = skb_network_offset(pkt->skb) + thoff;
+ pkt->ethertype = pkt->skb->protocol;
+ pkt->nhoff = nhoff;
+ pkt->thoff = skb_network_offset(pkt->skb) + nhoff + thoff;
pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET;
return 0;
@@ -50,7 +55,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt)
static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt)
{
- if (__nft_set_pktinfo_ipv4_validate(pkt) < 0)
+ if (__nft_set_pktinfo_ipv4_validate(pkt, 0) < 0)
nft_set_pktinfo_unspec(pkt);
}
@@ -78,6 +83,8 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt)
}
pkt->flags = NFT_PKTINFO_L4PROTO;
+ pkt->ethertype = pkt->skb->protocol;
+ pkt->nhoff = 0;
pkt->tprot = iph->protocol;
pkt->thoff = thoff;
pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET;
diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h
index a0633eeaec97..d7b8c559b795 100644
--- a/include/net/netfilter/nf_tables_ipv6.h
+++ b/include/net/netfilter/nf_tables_ipv6.h
@@ -20,21 +20,23 @@ static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt)
pkt->flags = NFT_PKTINFO_L4PROTO;
pkt->tprot = protohdr;
+ pkt->ethertype = pkt->skb->protocol;
+ pkt->nhoff = 0;
pkt->thoff = thoff;
pkt->fragoff = frag_off;
}
-static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt)
+static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, int nhoff)
{
#if IS_ENABLED(CONFIG_IPV6)
unsigned int flags = IP6_FH_F_AUTH;
struct ipv6hdr *ip6h, _ip6h;
- unsigned int thoff = 0;
+ unsigned int thoff = nhoff;
unsigned short frag_off;
u32 pkt_len, skb_len;
int protohdr;
- ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb),
+ ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb) + nhoff,
sizeof(*ip6h), &_ip6h);
if (!ip6h)
return -1;
@@ -42,8 +44,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt)
if (ip6h->version != 6)
return -1;
- pkt_len = ntohs(ip6h->payload_len);
- skb_len = pkt->skb->len - skb_network_offset(pkt->skb);
+ pkt_len = ipv6_payload_len(pkt->skb, ip6h);
+ skb_len = pkt->skb->len - skb_network_offset(pkt->skb) - nhoff;
if (pkt_len + sizeof(*ip6h) > skb_len)
return -1;
@@ -53,6 +55,8 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt)
pkt->flags = NFT_PKTINFO_L4PROTO;
pkt->tprot = protohdr;
+ pkt->ethertype = pkt->skb->protocol;
+ pkt->nhoff = nhoff;
pkt->thoff = thoff;
pkt->fragoff = frag_off;
@@ -64,7 +68,7 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt)
static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt)
{
- if (__nft_set_pktinfo_ipv6_validate(pkt) < 0)
+ if (__nft_set_pktinfo_ipv6_validate(pkt, 0) < 0)
nft_set_pktinfo_unspec(pkt);
}
@@ -86,7 +90,7 @@ static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt)
if (ip6h->version != 6)
goto inhdr_error;
- pkt_len = ntohs(ip6h->payload_len);
+ pkt_len = ipv6_payload_len(pkt->skb, ip6h);
if (pkt_len + sizeof(*ip6h) > pkt->skb->len) {
idev = __in6_dev_get(nft_in(pkt));
__IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INTRUNCATEDPKTS);
@@ -99,6 +103,8 @@ static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt)
pkt->flags = NFT_PKTINFO_L4PROTO;
pkt->tprot = protohdr;
+ pkt->ethertype = pkt->skb->protocol;
+ pkt->nhoff = 0;
pkt->thoff = thoff;
pkt->fragoff = frag_off;
diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h
index 3568b6a2f5f0..14c427891ee6 100644
--- a/include/net/netfilter/nf_tables_offload.h
+++ b/include/net/netfilter/nf_tables_offload.h
@@ -67,6 +67,16 @@ struct nft_flow_rule {
struct flow_rule *rule;
};
+static inline struct flow_action_entry *
+nft_flow_action_entry_next(struct nft_offload_ctx *ctx,
+ struct nft_flow_rule *flow)
+{
+ if (unlikely(ctx->num_actions >= flow->rule->action.num_entries))
+ return NULL;
+
+ return &flow->rule->action.entries[ctx->num_actions++];
+}
+
void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow,
enum flow_dissector_key_id addr_type);
diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h
index 38cae7113de4..e0422456f27b 100644
--- a/include/net/netfilter/nft_fib.h
+++ b/include/net/netfilter/nft_fib.h
@@ -2,6 +2,7 @@
#ifndef _NFT_FIB_H_
#define _NFT_FIB_H_
+#include <net/l3mdev.h>
#include <net/netfilter/nf_tables.h>
struct nft_fib {
@@ -18,6 +19,35 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in)
return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
}
+static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt)
+{
+ const struct net_device *indev = nft_in(pkt);
+ const struct sock *sk;
+
+ switch (nft_hook(pkt)) {
+ case NF_INET_PRE_ROUTING:
+ case NF_INET_INGRESS:
+ case NF_INET_LOCAL_IN:
+ break;
+ default:
+ return false;
+ }
+
+ sk = pkt->skb->sk;
+ if (sk && sk_fullsock(sk))
+ return sk->sk_rx_dst_ifindex == indev->ifindex;
+
+ return nft_fib_is_loopback(pkt->skb, indev);
+}
+
+static inline int nft_fib_l3mdev_master_ifindex_rcu(const struct nft_pktinfo *pkt,
+ const struct net_device *iif)
+{
+ const struct net_device *dev = iif ? iif : pkt->skb->dev;
+
+ return l3mdev_master_ifindex_rcu(dev);
+}
+
int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset);
int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
const struct nlattr * const tb[]);
@@ -36,6 +66,4 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
void nft_fib_store_result(void *reg, const struct nft_fib *priv,
const struct net_device *dev);
-bool nft_fib_reduce(struct nft_regs_track *track,
- const struct nft_expr *expr);
#endif
diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h
index d602263590fe..f74e63290603 100644
--- a/include/net/netfilter/nft_meta.h
+++ b/include/net/netfilter/nft_meta.h
@@ -43,9 +43,6 @@ void nft_meta_set_destroy(const struct nft_ctx *ctx,
int nft_meta_set_validate(const struct nft_ctx *ctx,
const struct nft_expr *expr);
-bool nft_meta_get_reduce(struct nft_regs_track *track,
- const struct nft_expr *expr);
-
struct nft_inner_tun_ctx;
void nft_meta_inner_eval(const struct nft_expr *expr,
struct nft_regs *regs, const struct nft_pktinfo *pkt,
diff --git a/include/net/netlink.h b/include/net/netlink.h
index e015ffbed819..546d10586576 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -68,6 +68,8 @@
* nlmsg_for_each_msg() loop over all messages
* nlmsg_validate() validate netlink message incl. attrs
* nlmsg_for_each_attr() loop over all attributes
+ * nlmsg_for_each_attr_type() loop over all attributes with the
+ * given type
*
* Misc:
* nlmsg_report() report back to application?
@@ -118,6 +120,7 @@
* nla_nest_start(skb, type) start a nested attribute
* nla_nest_end(skb, nla) finalize a nested attribute
* nla_nest_cancel(skb, nla) cancel nested attribute construction
+ * nla_put_empty_nest(skb, type) create an empty nest
*
* Attribute Length Calculations:
* nla_attr_size(payload) length of attribute w/o padding
@@ -320,7 +323,13 @@ enum nla_policy_validation {
* All other Unused - but note that it's a union
*
* Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN:
+ * NLA_U8, NLA_U16,
+ * NLA_U32, NLA_U64,
+ * NLA_S8, NLA_S16,
+ * NLA_S32, NLA_S64,
+ * NLA_MSECS,
* NLA_BINARY Validation function called for the attribute.
+ *
* All other Unused - but note that it's a union
*
* Example:
@@ -611,6 +620,22 @@ static inline int nlmsg_len(const struct nlmsghdr *nlh)
}
/**
+ * nlmsg_payload - message payload if the data fits in the len
+ * @nlh: netlink message header
+ * @len: struct length
+ *
+ * Returns: The netlink message payload/data if the length is sufficient,
+ * otherwise NULL.
+ */
+static inline void *nlmsg_payload(const struct nlmsghdr *nlh, size_t len)
+{
+ if (nlh->nlmsg_len < nlmsg_msg_size(len))
+ return NULL;
+
+ return nlmsg_data(nlh);
+}
+
+/**
* nlmsg_attrdata - head of attributes data
* @nlh: netlink message header
* @hdrlen: length of family specific header
@@ -944,6 +969,18 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh)
nlmsg_attrlen(nlh, hdrlen), rem)
/**
+ * nlmsg_for_each_attr_type - iterate over a stream of attributes
+ * @pos: loop counter, set to the current attribute
+ * @type: required attribute type for @pos
+ * @nlh: netlink message header
+ * @hdrlen: length of the family specific header
+ * @rem: initialized to len, holds bytes currently remaining in stream
+ */
+#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \
+ nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
+ if (nla_type(pos) == type)
+
+/**
* nlmsg_put - Add a new netlink message to an skb
* @skb: socket buffer to store message in
* @portid: netlink PORTID of requesting application
@@ -2228,6 +2265,25 @@ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start)
}
/**
+ * nla_nest_end_safe - Validate and finalize nesting of attributes
+ * @skb: socket buffer the attributes are stored in
+ * @start: container attribute
+ *
+ * Corrects the container attribute header to include all appended
+ * attributes.
+ *
+ * Returns: the total data length of the skb, or -EMSGSIZE if the
+ * nested attribute length exceeds U16_MAX.
+ */
+static inline int nla_nest_end_safe(struct sk_buff *skb, struct nlattr *start)
+{
+ if (skb_tail_pointer(skb) - (unsigned char *)start > U16_MAX)
+ return -EMSGSIZE;
+
+ return nla_nest_end(skb, start);
+}
+
+/**
* nla_nest_cancel - Cancel nesting of attributes
* @skb: socket buffer the message is stored in
* @start: container attribute
@@ -2241,6 +2297,20 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
}
/**
+ * nla_put_empty_nest - Create an empty nest
+ * @skb: socket buffer the message is stored in
+ * @attrtype: attribute type of the container
+ *
+ * This function is a helper for creating empty nests.
+ *
+ * Returns: 0 when successful or -EMSGSIZE on failure.
+ */
+static inline int nla_put_empty_nest(struct sk_buff *skb, int attrtype)
+{
+ return nla_nest_start(skb, attrtype) ? 0 : -EMSGSIZE;
+}
+
+/**
* __nla_validate_nested - Validate a stream of nested attributes
* @start: container attribute
* @maxtype: maximum attribute type to be expected
diff --git a/include/net/netmem.h b/include/net/netmem.h
index 1b58faa4f20f..bccacd21b6c3 100644
--- a/include/net/netmem.h
+++ b/include/net/netmem.h
@@ -8,9 +8,54 @@
#ifndef _NET_NETMEM_H
#define _NET_NETMEM_H
+#include <linux/dma-mapping.h>
#include <linux/mm.h>
#include <net/net_debug.h>
+/* These fields in struct page are used by the page_pool and net stack:
+ *
+ * struct {
+ * unsigned long pp_magic;
+ * struct page_pool *pp;
+ * unsigned long _pp_mapping_pad;
+ * unsigned long dma_addr;
+ * atomic_long_t pp_ref_count;
+ * };
+ *
+ * We mirror the page_pool fields here so the page_pool can access these
+ * fields without worrying whether the underlying fields belong to a
+ * page or netmem_desc.
+ *
+ * CAUTION: Do not update the fields in netmem_desc without also
+ * updating the anonymous aliasing union in struct net_iov.
+ */
+struct netmem_desc {
+ unsigned long _flags;
+ unsigned long pp_magic;
+ struct page_pool *pp;
+ unsigned long _pp_mapping_pad;
+ unsigned long dma_addr;
+ atomic_long_t pp_ref_count;
+};
+
+#define NETMEM_DESC_ASSERT_OFFSET(pg, desc) \
+ static_assert(offsetof(struct page, pg) == \
+ offsetof(struct netmem_desc, desc))
+NETMEM_DESC_ASSERT_OFFSET(flags, _flags);
+NETMEM_DESC_ASSERT_OFFSET(pp_magic, pp_magic);
+NETMEM_DESC_ASSERT_OFFSET(pp, pp);
+NETMEM_DESC_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad);
+NETMEM_DESC_ASSERT_OFFSET(dma_addr, dma_addr);
+NETMEM_DESC_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
+#undef NETMEM_DESC_ASSERT_OFFSET
+
+/*
+ * Since struct netmem_desc uses the space in struct page, the size
+ * should be checked, until struct netmem_desc has its own instance from
+ * slab, to avoid conflicting with other members within struct page.
+ */
+static_assert(sizeof(struct netmem_desc) <= offsetof(struct page, _refcount));
+
/* net_iov */
DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
@@ -20,39 +65,67 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);
*/
#define NET_IOV 0x01UL
-struct net_iov {
- unsigned long __unused_padding;
- unsigned long pp_magic;
- struct page_pool *pp;
- struct dmabuf_genpool_chunk_owner *owner;
- unsigned long dma_addr;
- atomic_long_t pp_ref_count;
+enum net_iov_type {
+ NET_IOV_DMABUF,
+ NET_IOV_IOURING,
};
-/* These fields in struct page are used by the page_pool and net stack:
+/* A memory descriptor representing abstract networking I/O vectors,
+ * generally for non-pages memory that doesn't have its corresponding
+ * struct page and needs to be explicitly allocated through slab.
*
- * struct {
- * unsigned long pp_magic;
- * struct page_pool *pp;
- * unsigned long _pp_mapping_pad;
- * unsigned long dma_addr;
- * atomic_long_t pp_ref_count;
- * };
+ * net_iovs are allocated and used by networking code, and the size of
+ * the chunk is PAGE_SIZE.
*
- * We mirror the page_pool fields here so the page_pool can access these fields
- * without worrying whether the underlying fields belong to a page or net_iov.
+ * This memory can be any form of non-struct paged memory. Examples
+ * include imported dmabuf memory and imported io_uring memory. See
+ * net_iov_type for all the supported types.
*
- * The non-net stack fields of struct page are private to the mm stack and must
- * never be mirrored to net_iov.
+ * @pp_magic: pp field, similar to the one in struct page/struct
+ * netmem_desc.
+ * @pp: the pp this net_iov belongs to, if any.
+ * @dma_addr: the dma addrs of the net_iov. Needed for the network
+ * card to send/receive this net_iov.
+ * @pp_ref_count: the pp ref count of this net_iov, exactly the same
+ * usage as struct page/struct netmem_desc.
+ * @owner: the net_iov_area this net_iov belongs to, if any.
+ * @type: the type of the memory. Different types of net_iovs are
+ * supported.
*/
-#define NET_IOV_ASSERT_OFFSET(pg, iov) \
- static_assert(offsetof(struct page, pg) == \
- offsetof(struct net_iov, iov))
-NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic);
-NET_IOV_ASSERT_OFFSET(pp, pp);
-NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
-NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
-#undef NET_IOV_ASSERT_OFFSET
+struct net_iov {
+ struct netmem_desc desc;
+ enum net_iov_type type;
+ struct net_iov_area *owner;
+};
+
+struct net_iov_area {
+ /* Array of net_iovs for this area. */
+ struct net_iov *niovs;
+ size_t num_niovs;
+
+ /* Offset into the dma-buf where this chunk starts. */
+ unsigned long base_virtual;
+};
+
+static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov)
+{
+ return niov->owner;
+}
+
+static inline unsigned int net_iov_idx(const struct net_iov *niov)
+{
+ return niov - net_iov_owner(niov)->niovs;
+}
+
+/* Initialize a niov: stamp the owning area, the memory provider type.
+ */
+static inline void net_iov_init(struct net_iov *niov,
+ struct net_iov_area *owner,
+ enum net_iov_type type)
+{
+ niov->owner = owner;
+ niov->type = type;
+}
/* netmem */
@@ -60,8 +133,7 @@ NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
* typedef netmem_ref - a nonexistent type marking a reference to generic
* network memory.
*
- * A netmem_ref currently is always a reference to a struct page. This
- * abstraction is introduced so support for new memory types can be added.
+ * A netmem_ref can be a struct page* or a struct net_iov* underneath.
*
* Use the supplied helpers to obtain the underlying memory pointer and fields.
*/
@@ -88,9 +160,6 @@ static inline struct page *__netmem_to_page(netmem_ref netmem)
return (__force struct page *)netmem;
}
-/* This conversion fails (returns NULL) if the netmem_ref is not struct page
- * backed.
- */
static inline struct page *netmem_to_page(netmem_ref netmem)
{
if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
@@ -114,10 +183,9 @@ static inline netmem_ref net_iov_to_netmem(struct net_iov *niov)
return (__force netmem_ref)((unsigned long)niov | NET_IOV);
}
-static inline netmem_ref page_to_netmem(struct page *page)
-{
- return (__force netmem_ref)page;
-}
+#define page_to_netmem(p) (_Generic((p), \
+ const struct page * : (__force const netmem_ref)(p), \
+ struct page * : (__force netmem_ref)(p)))
/**
* virt_to_netmem - convert virtual memory pointer to a netmem reference
@@ -149,9 +217,59 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem)
return page_to_pfn(netmem_to_page(netmem));
}
-static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
+/* XXX: How to extract netmem_desc from page must be changed, once
+ * netmem_desc no longer overlays on page and will be allocated through
+ * slab.
+ */
+#define __pp_page_to_nmdesc(p) (_Generic((p), \
+ const struct page * : (const struct netmem_desc *)(p), \
+ struct page * : (struct netmem_desc *)(p)))
+
+/* CAUTION: Check if the page is a pp page before calling this helper or
+ * know it's a pp page.
+ */
+#define pp_page_to_nmdesc(p) \
+({ \
+ DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \
+ __pp_page_to_nmdesc(p); \
+})
+
+/**
+ * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing
+ * @netmem
+ * @netmem: netmem reference to convert
+ *
+ * Unsafe version that can be used only when @netmem is always backed by
+ * system memory, performs faster and generates smaller object code (no
+ * check for the LSB, no WARN). When @netmem points to IOV, provokes
+ * undefined behaviour.
+ *
+ * Return: pointer to the &netmem_desc (garbage if @netmem is not backed
+ * by system memory).
+ */
+static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem)
{
- return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV);
+ return (__force struct netmem_desc *)netmem;
+}
+
+/* netmem_to_nmdesc - convert netmem_ref to struct netmem_desc * for
+ * access to common fields.
+ * @netmem: netmem reference to get netmem_desc.
+ *
+ * All the sub types of netmem_ref (netmem_desc, net_iov) have the same
+ * pp, pp_magic, dma_addr, and pp_ref_count fields via netmem_desc.
+ *
+ * Return: the pointer to struct netmem_desc * regardless of its
+ * underlying type.
+ */
+static inline struct netmem_desc *netmem_to_nmdesc(netmem_ref netmem)
+{
+ void *p = (void *)((__force unsigned long)netmem & ~NET_IOV);
+
+ if (netmem_is_net_iov(netmem))
+ return &((struct net_iov *)p)->desc;
+
+ return __pp_page_to_nmdesc((struct page *)p);
}
/**
@@ -167,17 +285,17 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
*/
static inline struct page_pool *__netmem_get_pp(netmem_ref netmem)
{
- return __netmem_to_page(netmem)->pp;
+ return __netmem_to_nmdesc(netmem)->pp;
}
static inline struct page_pool *netmem_get_pp(netmem_ref netmem)
{
- return __netmem_clear_lsb(netmem)->pp;
+ return netmem_to_nmdesc(netmem)->pp;
}
static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem)
{
- return &__netmem_clear_lsb(netmem)->pp_ref_count;
+ return &netmem_to_nmdesc(netmem)->pp_ref_count;
}
static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid)
@@ -242,7 +360,57 @@ static inline bool netmem_is_pfmemalloc(netmem_ref netmem)
static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
{
- return __netmem_clear_lsb(netmem)->dma_addr;
+ return netmem_to_nmdesc(netmem)->dma_addr;
+}
+
+#if defined(CONFIG_NET_DEVMEM)
+static inline bool net_is_devmem_iov(const struct net_iov *niov)
+{
+ return niov->type == NET_IOV_DMABUF;
+}
+#else
+static inline bool net_is_devmem_iov(const struct net_iov *niov)
+{
+ return false;
+}
+#endif
+
+void __get_netmem(netmem_ref netmem);
+void __put_netmem(netmem_ref netmem);
+
+static __always_inline void get_netmem(netmem_ref netmem)
+{
+ if (netmem_is_net_iov(netmem))
+ __get_netmem(netmem);
+ else
+ get_page(netmem_to_page(netmem));
+}
+
+static __always_inline void put_netmem(netmem_ref netmem)
+{
+ if (netmem_is_net_iov(netmem))
+ __put_netmem(netmem);
+ else
+ put_page(netmem_to_page(netmem));
+}
+
+#define netmem_dma_unmap_addr_set(NETMEM, PTR, ADDR_NAME, VAL) \
+ do { \
+ if (!netmem_is_net_iov(NETMEM)) \
+ dma_unmap_addr_set(PTR, ADDR_NAME, VAL); \
+ else \
+ dma_unmap_addr_set(PTR, ADDR_NAME, 0); \
+ } while (0)
+
+static inline void netmem_dma_unmap_page_attrs(struct device *dev,
+ dma_addr_t addr, size_t size,
+ enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ if (!addr)
+ return;
+
+ dma_unmap_page_attrs(dev, addr, size, dir, attrs);
}
#endif /* _NET_NETMEM_H */
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..ab74b5ed0b01 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -7,9 +7,6 @@
#include <linux/atomic.h>
#include <linux/workqueue.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-#include <linux/netfilter/nf_conntrack_dccp.h>
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
#include <linux/netfilter/nf_conntrack_sctp.h>
#endif
@@ -50,13 +47,6 @@ struct nf_icmp_net {
unsigned int timeout;
};
-#ifdef CONFIG_NF_CT_PROTO_DCCP
-struct nf_dccp_net {
- u8 dccp_loose;
- unsigned int dccp_timeout[CT_DCCP_MAX + 1];
-};
-#endif
-
#ifdef CONFIG_NF_CT_PROTO_SCTP
struct nf_sctp_net {
unsigned int timeouts[SCTP_CONNTRACK_MAX];
@@ -82,9 +72,6 @@ struct nf_ip_net {
struct nf_udp_net udp;
struct nf_icmp_net icmp;
struct nf_icmp_net icmpv6;
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- struct nf_dccp_net dccp;
-#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
struct nf_sctp_net sctp;
#endif
diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 9b36f0ff0c20..9ef3d70e5e9c 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -13,9 +13,11 @@ struct netns_core {
struct ctl_table_header *sysctl_hdr;
int sysctl_somaxconn;
+ int sysctl_txq_reselection;
int sysctl_optmem_max;
u8 sysctl_txrehash;
u8 sysctl_tstamp_allow_data;
+ u8 sysctl_bypass_prot_mem;
#ifdef CONFIG_PROC_FS
struct prot_inuse __percpu *prot_inuse;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 46452da35206..6e27c56514df 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -47,6 +47,11 @@ struct sysctl_fib_multipath_hash_seed {
};
#endif
+struct udp_tunnel_gro {
+ struct sock __rcu *sk;
+ struct hlist_head list;
+};
+
struct netns_ipv4 {
/* Cacheline organization can be found documented in
* Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst.
@@ -69,22 +74,35 @@ struct netns_ipv4 {
/* TXRX readonly hotpath cache lines */
__cacheline_group_begin(netns_ipv4_read_txrx);
- u8 sysctl_tcp_moderate_rcvbuf;
+ u8 sysctl_tcp_shrink_window;
__cacheline_group_end(netns_ipv4_read_txrx);
/* RX readonly hotpath cache line */
__cacheline_group_begin(netns_ipv4_read_rx);
+ u8 sysctl_tcp_moderate_rcvbuf;
u8 sysctl_ip_early_demux;
u8 sysctl_tcp_early_demux;
u8 sysctl_tcp_l3mdev_accept;
/* 3 bytes hole, try to pack */
int sysctl_tcp_reordering;
int sysctl_tcp_rmem[3];
+ int sysctl_tcp_rcvbuf_low_rtt;
__cacheline_group_end(netns_ipv4_read_rx);
+ /* ICMP rate limiter hot cache line. */
+ __cacheline_group_begin_aligned(icmp);
+ atomic_t icmp_global_credit;
+ u32 icmp_global_stamp;
+ __cacheline_group_end_aligned(icmp);
+
struct inet_timewait_death_row tcp_death_row;
struct udp_table *udp_table;
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+ /* Not in a pernet subsys because need to be available at GRO stage */
+ struct udp_tunnel_gro udp_tunnel_gro[2];
+#endif
+
#ifdef CONFIG_SYSCTL
struct ctl_table_header *forw_hdr;
struct ctl_table_header *frags_hdr;
@@ -105,12 +123,14 @@ struct netns_ipv4 {
#endif
bool fib_has_custom_local_routes;
bool fib_offload_disabled;
- u8 sysctl_tcp_shrink_window;
#ifdef CONFIG_IP_ROUTE_CLASSID
atomic_t fib_num_tclassid_users;
#endif
struct hlist_head *fib_table_hash;
struct sock *fibnl;
+ struct hlist_head *fib_info_hash;
+ unsigned int fib_info_hash_bits;
+ unsigned int fib_info_cnt;
struct sock *mc_autojoin_sk;
@@ -122,12 +142,12 @@ struct netns_ipv4 {
u8 sysctl_icmp_echo_ignore_broadcasts;
u8 sysctl_icmp_ignore_bogus_error_responses;
u8 sysctl_icmp_errors_use_inbound_ifaddr;
+ u8 sysctl_icmp_errors_extension_mask;
int sysctl_icmp_ratelimit;
int sysctl_icmp_ratemask;
int sysctl_icmp_msgs_per_sec;
int sysctl_icmp_msgs_burst;
- atomic_t icmp_global_credit;
- u32 icmp_global_stamp;
+
u32 ip_rt_min_pmtu;
int ip_rt_mtu_expires;
int ip_rt_min_advmss;
@@ -135,6 +155,8 @@ struct netns_ipv4 {
struct local_ports ip_local_ports;
u8 sysctl_tcp_ecn;
+ u8 sysctl_tcp_ecn_option;
+ u8 sysctl_tcp_ecn_option_beacon;
u8 sysctl_tcp_ecn_fallback;
u8 sysctl_ip_default_ttl;
@@ -144,6 +166,7 @@ struct netns_ipv4 {
u8 sysctl_ip_autobind_reuse;
/* Shall we try to damage output packets if routing dev changes? */
u8 sysctl_ip_dynaddr;
+ u32 sysctl_ip_local_port_step_width;
#ifdef CONFIG_NET_L3_MASTER_DEV
u8 sysctl_raw_l3mdev_accept;
#endif
@@ -181,6 +204,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_window_scaling;
u8 sysctl_tcp_timestamps;
int sysctl_tcp_rto_min_us;
+ int sysctl_tcp_rto_max_ms;
u8 sysctl_tcp_recovery;
u8 sysctl_tcp_thin_linear_timeouts;
u8 sysctl_tcp_slow_start_after_idle;
@@ -204,6 +228,7 @@ struct netns_ipv4 {
int sysctl_tcp_pacing_ss_ratio;
int sysctl_tcp_pacing_ca_ratio;
unsigned int sysctl_tcp_child_ehash_entries;
+ int sysctl_tcp_comp_sack_rtt_percent;
unsigned long sysctl_tcp_comp_sack_delay_ns;
unsigned long sysctl_tcp_comp_sack_slack_ns;
int sysctl_max_syn_backlog;
@@ -237,6 +262,7 @@ struct netns_ipv4 {
int sysctl_igmp_qrv;
struct ping_group_range ping_group_range;
+ u16 ping_port_rover;
atomic_t dev_addr_genid;
@@ -249,11 +275,14 @@ struct netns_ipv4 {
#ifdef CONFIG_IP_MROUTE
#ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES
- struct mr_table *mrt;
+ struct mr_table __rcu *mrt;
#else
struct list_head mr_tables;
struct fib_rules_ops *mr_rules_ops;
#endif
+ struct fib_notifier_ops *ipmr_notifier_ops;
+ atomic_t ipmr_seq;
+ struct mutex mfc_mutex;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed;
@@ -265,12 +294,10 @@ struct netns_ipv4 {
struct fib_notifier_ops *notifier_ops;
unsigned int fib_seq; /* writes protected by rtnl_mutex */
- struct fib_notifier_ops *ipmr_notifier_ops;
- unsigned int ipmr_seq; /* protected by rtnl_mutex */
-
atomic_t rt_genid;
siphash_key_t ip_id_key;
struct hlist_head *inet_addr_lst;
struct delayed_work addr_chk_work;
};
+
#endif
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 5f2cfd84570a..875916d60bfe 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -30,19 +30,23 @@ struct netns_sysctl_ipv6 {
int ip6_rt_min_advmss;
u32 multipath_hash_fields;
u8 multipath_hash_policy;
- u8 bindv6only;
+
+ __cacheline_group_begin(sysctl_ipv6_flowlabel);
u8 flowlabel_consistency;
u8 auto_flowlabels;
- int icmpv6_time;
+ u8 flowlabel_state_ranges;
+ __cacheline_group_end(sysctl_ipv6_flowlabel);
+
u8 icmpv6_echo_ignore_all;
u8 icmpv6_echo_ignore_multicast;
u8 icmpv6_echo_ignore_anycast;
+ int icmpv6_time;
DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1);
unsigned long *icmpv6_ratemask_ptr;
u8 anycast_src_echo_reply;
+ u8 bindv6only;
u8 ip_nonlocal_bind;
u8 fwmark_reflect;
- u8 flowlabel_state_ranges;
int idgen_retries;
int idgen_delay;
int flowlabel_reflect;
@@ -56,6 +60,7 @@ struct netns_sysctl_ipv6 {
u8 skip_notify_on_dev_down;
u8 fib_notify_on_flag_change;
u8 icmpv6_error_anycast_as_unicast;
+ u8 icmpv6_errors_extension_mask;
};
struct netns_ipv6 {
@@ -72,6 +77,7 @@ struct netns_ipv6 {
struct rt6_statistics *rt6_stats;
struct timer_list ip6_fib_timer;
struct hlist_head *fib_table_hash;
+ spinlock_t fib_table_hash_lock;
struct fib6_table *fib6_main_tbl;
struct list_head fib6_walkers;
rwlock_t fib6_walker_lock;
@@ -112,7 +118,8 @@ struct netns_ipv6 {
struct seg6_pernet_data *seg6_data;
struct fib_notifier_ops *notifier_ops;
struct fib_notifier_ops *ip6mr_notifier_ops;
- unsigned int ipmr_seq; /* protected by rtnl_mutex */
+ atomic_t ipmr_seq;
+ int flowlabel_count;
struct {
struct hlist_head head;
spinlock_t lock;
diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h
index 1db8f9aaddb4..89555f90b97b 100644
--- a/include/net/netns/mctp.h
+++ b/include/net/netns/mctp.h
@@ -6,19 +6,25 @@
#ifndef __NETNS_MCTP_H__
#define __NETNS_MCTP_H__
+#include <linux/hash.h>
+#include <linux/hashtable.h>
#include <linux/mutex.h>
#include <linux/types.h>
+#define MCTP_BINDS_BITS 7
+
struct netns_mctp {
/* Only updated under RTNL, entries freed via RCU */
struct list_head routes;
- /* Bound sockets: list of sockets bound by type.
- * This list is updated from non-atomic contexts (under bind_lock),
- * and read (under rcu) in packet rx
+ /* Bound sockets: hash table of sockets, keyed by
+ * (type, src_eid, dest_eid).
+ * Specific src_eid/dest_eid entries also have an entry for
+ * MCTP_ADDR_ANY. This list is updated from non-atomic contexts
+ * (under bind_lock), and read (under rcu) in packet rx.
*/
struct mutex bind_lock;
- struct hlist_head binds;
+ DECLARE_HASHTABLE(binds, MCTP_BINDS_BITS);
/* tag allocations. This list is read and updated from atomic contexts,
* but elements are free()ed after a RCU grace-period
@@ -34,4 +40,10 @@ struct netns_mctp {
struct list_head neighbours;
};
+static inline u32 mctp_bind_hash(u8 type, u8 local_addr, u8 peer_addr)
+{
+ return hash_32(type | (u32)local_addr << 8 | (u32)peer_addr << 16,
+ MCTP_BINDS_BITS);
+}
+
#endif /* __NETNS_MCTP_H__ */
diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h
index 7e373664b1e7..dce05f8e6a33 100644
--- a/include/net/netns/mib.h
+++ b/include/net/netns/mib.h
@@ -28,11 +28,6 @@ struct netns_mib {
DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics);
#endif
- DEFINE_SNMP_STAT(struct udp_mib, udplite_statistics);
-#if IS_ENABLED(CONFIG_IPV6)
- DEFINE_SNMP_STAT(struct udp_mib, udplite_stats_in6);
-#endif
-
DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib, icmpmsg_statistics);
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h
index 19ad2574b267..2073cbac2afb 100644
--- a/include/net/netns/mpls.h
+++ b/include/net/netns/mpls.h
@@ -16,6 +16,8 @@ struct netns_mpls {
int default_ttl;
size_t platform_labels;
struct mpls_route __rcu * __rcu *platform_label;
+ struct mutex platform_mutex;
+ seqcount_mutex_t platform_label_seq;
struct ctl_table_header *ctl;
};
diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h
index cc8060c017d5..99dd166c5d07 100644
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -3,6 +3,7 @@
#define _NETNS_NFTABLES_H_
struct netns_nftables {
+ unsigned int base_seq;
u8 gencursor;
};
diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index d25cd7a9c5ff..c0f97f36389e 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -75,8 +75,8 @@ struct netns_sctp {
/* Whether Cookie Preservative is enabled(1) or not(0) */
int cookie_preserve_enable;
- /* The namespace default hmac alg */
- char *sctp_hmac_alg;
+ /* Whether cookie authentication is enabled(1) or not(0) */
+ int cookie_auth_enable;
/* Valid.Cookie.Life - 60 seconds */
unsigned int valid_cookie_life;
diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h
index fc752a50f91b..ed24c9f638ee 100644
--- a/include/net/netns/smc.h
+++ b/include/net/netns/smc.h
@@ -17,6 +17,9 @@ struct netns_smc {
#ifdef CONFIG_SYSCTL
struct ctl_table_header *smc_hdr;
#endif
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+ struct smc_hs_ctrl __rcu *hs_ctrl;
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
unsigned int sysctl_autocorking_size;
unsigned int sysctl_smcr_buf_type;
int sysctl_smcr_testlink_time;
@@ -24,5 +27,7 @@ struct netns_smc {
int sysctl_rmem;
int sysctl_max_links_per_lgr;
int sysctl_max_conns_per_lgr;
+ unsigned int sysctl_smcr_max_send_wr;
+ unsigned int sysctl_smcr_max_recv_wr;
};
#endif
diff --git a/include/net/netns/vsock.h b/include/net/netns/vsock.h
new file mode 100644
index 000000000000..7f84aad92f57
--- /dev/null
+++ b/include/net/netns/vsock.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_NET_NAMESPACE_VSOCK_H
+#define __NET_NET_NAMESPACE_VSOCK_H
+
+#include <linux/types.h>
+
+enum vsock_net_mode {
+ VSOCK_NET_MODE_GLOBAL,
+ VSOCK_NET_MODE_LOCAL,
+};
+
+struct netns_vsock {
+ struct ctl_table_header *sysctl_hdr;
+
+ /* protected by the vsock_table_lock in af_vsock.c */
+ u32 port;
+
+ enum vsock_net_mode mode;
+ enum vsock_net_mode child_ns_mode;
+
+ /* 0 = unlocked, 1 = locked to global, 2 = locked to local */
+ int child_ns_mode_locked;
+
+ int g2h_fallback;
+};
+#endif /* __NET_NET_NAMESPACE_VSOCK_H */
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 23dd647fe024..b73983a17e08 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -59,7 +59,7 @@ struct netns_xfrm {
struct list_head inexact_bins;
- struct sock *nlsk;
+ struct sock __rcu *nlsk;
struct sock *nlsk_stash;
u32 sysctl_aevent_etime;
diff --git a/include/net/netrom.h b/include/net/netrom.h
deleted file mode 100644
index f0565a5987d1..000000000000
--- a/include/net/netrom.h
+++ /dev/null
@@ -1,273 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Declarations of NET/ROM type objects.
- *
- * Jonathan Naylor G4KLX 9/4/95
- */
-
-#ifndef _NETROM_H
-#define _NETROM_H
-
-#include <linux/netrom.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <net/sock.h>
-#include <linux/refcount.h>
-#include <linux/seq_file.h>
-#include <net/ax25.h>
-
-#define NR_NETWORK_LEN 15
-#define NR_TRANSPORT_LEN 5
-
-#define NR_PROTO_IP 0x0C
-
-#define NR_PROTOEXT 0x00
-#define NR_CONNREQ 0x01
-#define NR_CONNACK 0x02
-#define NR_DISCREQ 0x03
-#define NR_DISCACK 0x04
-#define NR_INFO 0x05
-#define NR_INFOACK 0x06
-#define NR_RESET 0x07
-
-#define NR_CHOKE_FLAG 0x80
-#define NR_NAK_FLAG 0x40
-#define NR_MORE_FLAG 0x20
-
-/* Define Link State constants. */
-enum {
- NR_STATE_0,
- NR_STATE_1,
- NR_STATE_2,
- NR_STATE_3
-};
-
-#define NR_COND_ACK_PENDING 0x01
-#define NR_COND_REJECT 0x02
-#define NR_COND_PEER_RX_BUSY 0x04
-#define NR_COND_OWN_RX_BUSY 0x08
-
-#define NR_DEFAULT_T1 120000 /* Outstanding frames - 120 seconds */
-#define NR_DEFAULT_T2 5000 /* Response delay - 5 seconds */
-#define NR_DEFAULT_N2 3 /* Number of Retries - 3 */
-#define NR_DEFAULT_T4 180000 /* Busy Delay - 180 seconds */
-#define NR_DEFAULT_IDLE 0 /* No Activity Timeout - none */
-#define NR_DEFAULT_WINDOW 4 /* Default Window Size - 4 */
-#define NR_DEFAULT_OBS 6 /* Default Obsolescence Count - 6 */
-#define NR_DEFAULT_QUAL 10 /* Default Neighbour Quality - 10 */
-#define NR_DEFAULT_TTL 16 /* Default Time To Live - 16 */
-#define NR_DEFAULT_ROUTING 1 /* Is routing enabled ? */
-#define NR_DEFAULT_FAILS 2 /* Link fails until route fails */
-#define NR_DEFAULT_RESET 0 /* Sent / accept reset cmds? */
-
-#define NR_MODULUS 256
-#define NR_MAX_WINDOW_SIZE 127 /* Maximum Window Allowable - 127 */
-#define NR_MAX_PACKET_SIZE 236 /* Maximum Packet Length - 236 */
-
-struct nr_sock {
- struct sock sock;
- ax25_address user_addr, source_addr, dest_addr;
- struct net_device *device;
- unsigned char my_index, my_id;
- unsigned char your_index, your_id;
- unsigned char state, condition, bpqext, window;
- unsigned short vs, vr, va, vl;
- unsigned char n2, n2count;
- unsigned long t1, t2, t4, idle;
- unsigned short fraglen;
- struct timer_list t1timer;
- struct timer_list t2timer;
- struct timer_list t4timer;
- struct timer_list idletimer;
- struct sk_buff_head ack_queue;
- struct sk_buff_head reseq_queue;
- struct sk_buff_head frag_queue;
-};
-
-#define nr_sk(sk) ((struct nr_sock *)(sk))
-
-struct nr_neigh {
- struct hlist_node neigh_node;
- ax25_address callsign;
- ax25_digi *digipeat;
- ax25_cb *ax25;
- struct net_device *dev;
- unsigned char quality;
- unsigned char locked;
- unsigned short count;
- unsigned int number;
- unsigned char failed;
- refcount_t refcount;
-};
-
-struct nr_route {
- unsigned char quality;
- unsigned char obs_count;
- struct nr_neigh *neighbour;
-};
-
-struct nr_node {
- struct hlist_node node_node;
- ax25_address callsign;
- char mnemonic[7];
- unsigned char which;
- unsigned char count;
- struct nr_route routes[3];
- refcount_t refcount;
- spinlock_t node_lock;
-};
-
-/*********************************************************************
- * nr_node & nr_neigh lists, refcounting and locking
- *********************************************************************/
-
-#define nr_node_hold(__nr_node) \
- refcount_inc(&((__nr_node)->refcount))
-
-static __inline__ void nr_node_put(struct nr_node *nr_node)
-{
- if (refcount_dec_and_test(&nr_node->refcount)) {
- kfree(nr_node);
- }
-}
-
-#define nr_neigh_hold(__nr_neigh) \
- refcount_inc(&((__nr_neigh)->refcount))
-
-static __inline__ void nr_neigh_put(struct nr_neigh *nr_neigh)
-{
- if (refcount_dec_and_test(&nr_neigh->refcount)) {
- if (nr_neigh->ax25)
- ax25_cb_put(nr_neigh->ax25);
- kfree(nr_neigh->digipeat);
- kfree(nr_neigh);
- }
-}
-
-/* nr_node_lock and nr_node_unlock also hold/put the node's refcounter.
- */
-static __inline__ void nr_node_lock(struct nr_node *nr_node)
-{
- nr_node_hold(nr_node);
- spin_lock_bh(&nr_node->node_lock);
-}
-
-static __inline__ void nr_node_unlock(struct nr_node *nr_node)
-{
- spin_unlock_bh(&nr_node->node_lock);
- nr_node_put(nr_node);
-}
-
-#define nr_neigh_for_each(__nr_neigh, list) \
- hlist_for_each_entry(__nr_neigh, list, neigh_node)
-
-#define nr_neigh_for_each_safe(__nr_neigh, node2, list) \
- hlist_for_each_entry_safe(__nr_neigh, node2, list, neigh_node)
-
-#define nr_node_for_each(__nr_node, list) \
- hlist_for_each_entry(__nr_node, list, node_node)
-
-#define nr_node_for_each_safe(__nr_node, node2, list) \
- hlist_for_each_entry_safe(__nr_node, node2, list, node_node)
-
-
-/*********************************************************************/
-
-/* af_netrom.c */
-extern int sysctl_netrom_default_path_quality;
-extern int sysctl_netrom_obsolescence_count_initialiser;
-extern int sysctl_netrom_network_ttl_initialiser;
-extern int sysctl_netrom_transport_timeout;
-extern int sysctl_netrom_transport_maximum_tries;
-extern int sysctl_netrom_transport_acknowledge_delay;
-extern int sysctl_netrom_transport_busy_delay;
-extern int sysctl_netrom_transport_requested_window_size;
-extern int sysctl_netrom_transport_no_activity_timeout;
-extern int sysctl_netrom_routing_control;
-extern int sysctl_netrom_link_fails_count;
-extern int sysctl_netrom_reset_circuit;
-
-int nr_rx_frame(struct sk_buff *, struct net_device *);
-void nr_destroy_socket(struct sock *);
-
-/* nr_dev.c */
-int nr_rx_ip(struct sk_buff *, struct net_device *);
-void nr_setup(struct net_device *);
-
-/* nr_in.c */
-int nr_process_rx_frame(struct sock *, struct sk_buff *);
-
-/* nr_loopback.c */
-void nr_loopback_init(void);
-void nr_loopback_clear(void);
-int nr_loopback_queue(struct sk_buff *);
-
-/* nr_out.c */
-void nr_output(struct sock *, struct sk_buff *);
-void nr_send_nak_frame(struct sock *);
-void nr_kick(struct sock *);
-void nr_transmit_buffer(struct sock *, struct sk_buff *);
-void nr_establish_data_link(struct sock *);
-void nr_enquiry_response(struct sock *);
-void nr_check_iframes_acked(struct sock *, unsigned short);
-
-/* nr_route.c */
-void nr_rt_device_down(struct net_device *);
-struct net_device *nr_dev_first(void);
-struct net_device *nr_dev_get(ax25_address *);
-int nr_rt_ioctl(unsigned int, void __user *);
-void nr_link_failed(ax25_cb *, int);
-int nr_route_frame(struct sk_buff *, ax25_cb *);
-extern const struct seq_operations nr_node_seqops;
-extern const struct seq_operations nr_neigh_seqops;
-void nr_rt_free(void);
-
-/* nr_subr.c */
-void nr_clear_queues(struct sock *);
-void nr_frames_acked(struct sock *, unsigned short);
-void nr_requeue_frames(struct sock *);
-int nr_validate_nr(struct sock *, unsigned short);
-int nr_in_rx_window(struct sock *, unsigned short);
-void nr_write_internal(struct sock *, int);
-
-void __nr_transmit_reply(struct sk_buff *skb, int mine, unsigned char cmdflags);
-
-/*
- * This routine is called when a Connect Acknowledge with the Choke Flag
- * set is needed to refuse a connection.
- */
-#define nr_transmit_refusal(skb, mine) \
-do { \
- __nr_transmit_reply((skb), (mine), NR_CONNACK | NR_CHOKE_FLAG); \
-} while (0)
-
-/*
- * This routine is called when we don't have a circuit matching an incoming
- * NET/ROM packet. This is an G8PZT Xrouter extension.
- */
-#define nr_transmit_reset(skb, mine) \
-do { \
- __nr_transmit_reply((skb), (mine), NR_RESET); \
-} while (0)
-
-void nr_disconnect(struct sock *, int);
-
-/* nr_timer.c */
-void nr_init_timers(struct sock *sk);
-void nr_start_heartbeat(struct sock *);
-void nr_start_t1timer(struct sock *);
-void nr_start_t2timer(struct sock *);
-void nr_start_t4timer(struct sock *);
-void nr_start_idletimer(struct sock *);
-void nr_stop_heartbeat(struct sock *);
-void nr_stop_t1timer(struct sock *);
-void nr_stop_t2timer(struct sock *);
-void nr_stop_t4timer(struct sock *);
-void nr_stop_idletimer(struct sock *);
-int nr_t1timer_running(struct sock *);
-
-/* sysctl_net_netrom.c */
-int nr_register_sysctl(void);
-void nr_unregister_sysctl(void);
-
-#endif
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index d9fb44e8b321..572e69cda476 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -152,6 +152,8 @@ struct nexthop {
u8 protocol; /* app managing this nh */
u8 nh_flags;
bool is_group;
+ bool dead;
+ spinlock_t lock; /* protect dead and f6i_list */
refcount_t refcnt;
struct rcu_head rcu;
diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h
index e180bdf2f82b..664d5058e66e 100644
--- a/include/net/nfc/nci_core.h
+++ b/include/net/nfc/nci_core.h
@@ -52,7 +52,7 @@ enum nci_state {
#define NCI_RF_DISC_SELECT_TIMEOUT 5000
#define NCI_RF_DEACTIVATE_TIMEOUT 30000
#define NCI_CMD_TIMEOUT 5000
-#define NCI_DATA_TIMEOUT 700
+#define NCI_DATA_TIMEOUT 3000
struct nci_dev;
diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h
index 127e6c7d910d..c54df042db6b 100644
--- a/include/net/nfc/nfc.h
+++ b/include/net/nfc/nfc.h
@@ -219,6 +219,8 @@ static inline void nfc_free_device(struct nfc_dev *dev)
int nfc_register_device(struct nfc_dev *dev);
+void nfc_unregister_rfkill(struct nfc_dev *dev);
+void nfc_remove_device(struct nfc_dev *dev);
void nfc_unregister_device(struct nfc_dev *dev);
/**
diff --git a/include/net/nl802154.h b/include/net/nl802154.h
index a994dea74596..442822746e92 100644
--- a/include/net/nl802154.h
+++ b/include/net/nl802154.h
@@ -191,14 +191,12 @@ enum nl802154_iftype {
* @NL802154_CAP_ATTR_CHANNELS: a nested attribute for nl802154_channel_attr
* @NL802154_CAP_ATTR_TX_POWERS: a nested attribute for
* nl802154_wpan_phy_tx_power
- * @NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL: minimum value for cca_ed_level
- * @NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL: maximum value for cca_ed_level
* @NL802154_CAP_ATTR_CCA_MODES: nl802154_cca_modes flags
* @NL802154_CAP_ATTR_CCA_OPTS: nl802154_cca_opts flags
* @NL802154_CAP_ATTR_MIN_MINBE: minimum of minbe value
* @NL802154_CAP_ATTR_MAX_MINBE: maximum of minbe value
* @NL802154_CAP_ATTR_MIN_MAXBE: minimum of maxbe value
- * @NL802154_CAP_ATTR_MAX_MINBE: maximum of maxbe value
+ * @NL802154_CAP_ATTR_MAX_MAXBE: maximum of maxbe value
* @NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS: minimum of csma backoff value
* @NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS: maximum of csma backoffs value
* @NL802154_CAP_ATTR_MIN_FRAME_RETRIES: minimum of frame retries value
@@ -364,6 +362,7 @@ enum nl802154_cca_opts {
NL802154_CCA_OPT_ENERGY_CARRIER_AND,
NL802154_CCA_OPT_ENERGY_CARRIER_OR,
+ /* private: */
/* keep last */
__NL802154_CCA_OPT_ATTR_AFTER_LAST,
NL802154_CCA_OPT_ATTR_MAX = __NL802154_CCA_OPT_ATTR_AFTER_LAST - 1
diff --git a/include/net/nsh.h b/include/net/nsh.h
index 16a751093896..15a26c590815 100644
--- a/include/net/nsh.h
+++ b/include/net/nsh.h
@@ -247,10 +247,10 @@ struct nshhdr {
#define NSH_M_TYPE1_LEN 24
/* NSH header maximum Length. */
-#define NSH_HDR_MAX_LEN 256
+#define NSH_HDR_MAX_LEN ((NSH_LEN_MASK >> NSH_LEN_SHIFT) * 4)
/* NSH context headers maximum Length. */
-#define NSH_CTX_HDRS_MAX_LEN 248
+#define NSH_CTX_HDRS_MAX_LEN (NSH_HDR_MAX_LEN - NSH_BASE_HDR_LEN)
static inline struct nshhdr *nsh_hdr(struct sk_buff *skb)
{
diff --git a/include/net/p8022.h b/include/net/p8022.h
deleted file mode 100644
index a29e224ac498..000000000000
--- a/include/net/p8022.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NET_P8022_H
-#define _NET_P8022_H
-
-struct net_device;
-struct packet_type;
-struct sk_buff;
-
-struct datalink_proto *
-register_8022_client(unsigned char type,
- int (*func)(struct sk_buff *skb,
- struct net_device *dev,
- struct packet_type *pt,
- struct net_device *orig_dev));
-void unregister_8022_client(struct datalink_proto *proto);
-#endif
diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 582a3d00cbe2..3247026e096a 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -153,6 +153,13 @@ static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool,
return page_pool_alloc_netmem(pool, offset, size, gfp);
}
+static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool)
+{
+ gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+
+ return page_pool_alloc_netmems(pool, gfp);
+}
+
static inline struct page *page_pool_alloc(struct page_pool *pool,
unsigned int *offset,
unsigned int *size, gfp_t gfp)
@@ -395,6 +402,12 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
page_pool_put_full_page(pool, page, true);
}
+static inline void page_pool_recycle_direct_netmem(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ page_pool_put_full_netmem(pool, netmem, true);
+}
+
#define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \
(sizeof(dma_addr_t) > sizeof(unsigned long))
@@ -431,12 +444,7 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem)
*/
static inline dma_addr_t page_pool_get_dma_addr(const struct page *page)
{
- dma_addr_t ret = page->dma_addr;
-
- if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA)
- ret <<= PAGE_SHIFT;
-
- return ret;
+ return page_pool_get_dma_addr_netmem(page_to_netmem(page));
}
static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool,
@@ -481,6 +489,11 @@ page_pool_dma_sync_netmem_for_cpu(const struct page_pool *pool,
offset, dma_sync_size);
}
+static inline void page_pool_get(struct page_pool *pool)
+{
+ refcount_inc(&pool->user_cnt);
+}
+
static inline bool page_pool_put(struct page_pool *pool)
{
return refcount_dec_and_test(&pool->user_cnt);
@@ -492,4 +505,21 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid)
page_pool_update_nid(pool, new_nid);
}
+/**
+ * page_pool_is_unreadable() - will allocated buffers be unreadable for the CPU
+ * @pool: queried page pool
+ *
+ * Check if page pool will return buffers which are unreadable to the CPU /
+ * kernel. This will only be the case if user space bound a memory provider (mp)
+ * which returns unreadable memory to the queue served by the page pool.
+ * If %PP_FLAG_ALLOW_UNREADABLE_NETMEM was set but there is no mp bound
+ * this helper will return false. See also netif_rxq_has_unreadable_mp().
+ *
+ * Return: true if memory allocated by the page pool may be unreadable
+ */
+static inline bool page_pool_is_unreadable(struct page_pool *pool)
+{
+ return !!pool->mp_ops;
+}
+
#endif /* _NET_PAGE_POOL_HELPERS_H */
diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h
new file mode 100644
index 000000000000..255ce4cfd975
--- /dev/null
+++ b/include/net/page_pool/memory_provider.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _NET_PAGE_POOL_MEMORY_PROVIDER_H
+#define _NET_PAGE_POOL_MEMORY_PROVIDER_H
+
+#include <net/netmem.h>
+#include <net/page_pool/types.h>
+
+struct netdev_rx_queue;
+struct netlink_ext_ack;
+struct sk_buff;
+
+struct memory_provider_ops {
+ netmem_ref (*alloc_netmems)(struct page_pool *pool, gfp_t gfp);
+ bool (*release_netmem)(struct page_pool *pool, netmem_ref netmem);
+ int (*init)(struct page_pool *pool);
+ void (*destroy)(struct page_pool *pool);
+ int (*nl_fill)(void *mp_priv, struct sk_buff *rsp,
+ struct netdev_rx_queue *rxq);
+ void (*uninstall)(void *mp_priv, struct netdev_rx_queue *rxq);
+};
+
+bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr);
+void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov);
+void net_mp_niov_clear_page_pool(struct net_iov *niov);
+
+int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *p,
+ struct netlink_ext_ack *extack);
+void netif_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *old_p);
+
+/**
+ * net_mp_netmem_place_in_cache() - give a netmem to a page pool
+ * @pool: the page pool to place the netmem into
+ * @netmem: netmem to give
+ *
+ * Push an accounted netmem into the page pool's allocation cache. The caller
+ * must ensure that there is space in the cache. It should only be called off
+ * the mp_ops->alloc_netmems() path.
+ */
+static inline void net_mp_netmem_place_in_cache(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ pool->alloc.cache[pool->alloc.count++] = netmem;
+}
+
+#endif
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 7f405672b089..03da138722f5 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -6,6 +6,7 @@
#include <linux/dma-direction.h>
#include <linux/ptr_ring.h>
#include <linux/types.h>
+#include <linux/xarray.h>
#include <net/netmem.h>
#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA
@@ -33,6 +34,9 @@
#define PP_FLAG_ALL (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV | \
PP_FLAG_SYSTEM_POOL | PP_FLAG_ALLOW_UNREADABLE_NETMEM)
+/* Index limit to stay within PP_DMA_INDEX_BITS for DMA indices */
+#define PP_DMA_INDEX_LIMIT XA_LIMIT(1, BIT(PP_DMA_INDEX_BITS) - 1)
+
/*
* Fast allocation side cache array/stack
*
@@ -40,6 +44,8 @@
* use-case. The NAPI budget is 64 packets. After a NAPI poll the RX
* ring is usually refilled and the max consumed elements will be 64,
* thus a natural max size of objects needed in the cache.
+ * The refill watermark is set to 64 for 4KB pages,
+ * and scales to balance its size in bytes across page sizes.
*
* Keeping room for more objects, is due to XDP_DROP use-case. As
* XDP_DROP allows the opportunity to recycle objects directly into
@@ -47,8 +53,15 @@
* cache is already full (or partly full) then the XDP_DROP recycles
* would have to take a slower code path.
*/
-#define PP_ALLOC_CACHE_SIZE 128
+#if PAGE_SIZE >= SZ_64K
+#define PP_ALLOC_CACHE_REFILL 4
+#elif PAGE_SIZE >= SZ_16K
+#define PP_ALLOC_CACHE_REFILL 16
+#else
#define PP_ALLOC_CACHE_REFILL 64
+#endif
+
+#define PP_ALLOC_CACHE_SIZE (PP_ALLOC_CACHE_REFILL * 2)
struct pp_alloc_cache {
u32 count;
netmem_ref cache[PP_ALLOC_CACHE_SIZE];
@@ -152,8 +165,12 @@ struct page_pool_stats {
*/
#define PAGE_POOL_FRAG_GROUP_ALIGN (4 * sizeof(long))
+struct memory_provider_ops;
+
struct pp_memory_provider_params {
void *mp_priv;
+ const struct memory_provider_ops *mp_ops;
+ u32 rx_page_size;
};
struct page_pool {
@@ -216,6 +233,9 @@ struct page_pool {
struct ptr_ring ring;
void *mp_priv;
+ const struct memory_provider_ops *mp_ops;
+
+ struct xarray dma_mapped;
#ifdef CONFIG_PAGE_POOL_STATS
/* recycle stats are per-cpu to avoid locking */
@@ -236,7 +256,7 @@ struct page_pool {
/* User-facing fields, protected by page_pools_lock */
struct {
struct hlist_node list;
- u64 detach_time;
+ ktime_t detach_time;
u32 id;
} user;
};
@@ -255,6 +275,8 @@ struct page_pool *page_pool_create_percpu(const struct page_pool_params *params,
struct xdp_mem_info;
#ifdef CONFIG_PAGE_POOL
+void page_pool_enable_direct_recycling(struct page_pool *pool,
+ struct napi_struct *napi);
void page_pool_disable_direct_recycling(struct page_pool *pool);
void page_pool_destroy(struct page_pool *pool);
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
diff --git a/include/net/pfcp.h b/include/net/pfcp.h
index af14f970b80e..639553797d3e 100644
--- a/include/net/pfcp.h
+++ b/include/net/pfcp.h
@@ -45,7 +45,7 @@ struct pfcphdr_session {
reserved:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
u8 reserved:4,
- message_priprity:4;
+ message_priority:4;
#else
#error "Please fix <asm/byteorder>"
#endif
diff --git a/include/net/phy/realtek_phy.h b/include/net/phy/realtek_phy.h
new file mode 100644
index 000000000000..d683bc1b0659
--- /dev/null
+++ b/include/net/phy/realtek_phy.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _REALTEK_PHY_H
+#define _REALTEK_PHY_H
+
+#define PHY_ID_RTL_DUMMY_SFP 0x001ccbff
+
+#endif /* _REALTEK_PHY_H */
diff --git a/include/net/pie.h b/include/net/pie.h
index 01cbc66825a4..1f3db0c35514 100644
--- a/include/net/pie.h
+++ b/include/net/pie.h
@@ -104,7 +104,7 @@ static inline void pie_vars_init(struct pie_vars *vars)
vars->dq_tstamp = DTIME_INVALID;
vars->accu_prob = 0;
vars->dq_count = DQCOUNT_INVALID;
- vars->avg_dq_rate = 0;
+ WRITE_ONCE(vars->avg_dq_rate, 0);
}
static inline struct pie_skb_cb *get_pie_cb(const struct sk_buff *skb)
diff --git a/include/net/ping.h b/include/net/ping.h
index bc7779262e60..bcbdb5a136e3 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -20,8 +20,7 @@
/* Compatibility glue so we can support IPv6 when it's compiled as a module */
struct pingv6_ops {
- int (*ipv6_recv_error)(struct sock *sk, struct msghdr *msg, int len,
- int *addr_len);
+ int (*ipv6_recv_error)(struct sock *sk, struct msghdr *msg, int len);
void (*ip6_datagram_recv_common_ctl)(struct sock *sk,
struct msghdr *msg,
struct sk_buff *skb);
@@ -54,18 +53,17 @@ struct pingfakehdr {
};
int ping_get_port(struct sock *sk, unsigned short ident);
-int ping_hash(struct sock *sk);
void ping_unhash(struct sock *sk);
int ping_init_sock(struct sock *sk);
void ping_close(struct sock *sk, long timeout);
-int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
void ping_err(struct sk_buff *skb, int offset, u32 info);
int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd,
struct sk_buff *);
int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int flags, int *addr_len);
+ int flags);
int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
void *user_icmph, size_t icmph_len);
int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index c64fd896b1f9..99ac747b7906 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -536,6 +536,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer)
case TCF_LAYER_NETWORK:
return skb_network_header(skb);
case TCF_LAYER_TRANSPORT:
+ if (!skb_transport_header_was_set(skb))
+ break;
return skb_transport_header(skb);
}
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index d7b7b6cd4aa1..18a419cd9d94 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -25,11 +25,6 @@ struct qdisc_walker {
const struct Qdisc * : (const void *)&q->privdata, \
struct Qdisc * : (void *)&q->privdata)
-static inline struct Qdisc *qdisc_from_priv(void *priv)
-{
- return container_of(priv, struct Qdisc, privdata);
-}
-
/*
Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth
@@ -48,7 +43,6 @@ static inline struct Qdisc *qdisc_from_priv(void *priv)
*/
typedef u64 psched_time_t;
-typedef long psched_tdiff_t;
/* Avoid doing 64 bit divide */
#define PSCHED_SHIFT 6
@@ -114,19 +108,19 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
struct netlink_ext_ack *extack);
void qdisc_put_rtab(struct qdisc_rate_table *tab);
void qdisc_put_stab(struct qdisc_size_table *tab);
-void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc);
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev, struct netdev_queue *txq,
spinlock_t *root_lock, bool validate);
void __qdisc_run(struct Qdisc *q);
-static inline void qdisc_run(struct Qdisc *q)
+static inline struct sk_buff *qdisc_run(struct Qdisc *q)
{
if (qdisc_run_begin(q)) {
__qdisc_run(q);
- qdisc_run_end(q);
+ return qdisc_run_end(q);
}
+ return NULL;
}
extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1];
@@ -290,4 +284,52 @@ static inline bool tc_qdisc_stats_dump(struct Qdisc *sch,
return true;
}
+static inline void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
+{
+ if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
+ pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
+ txt, qdisc->ops->id, qdisc->handle >> 16);
+ qdisc->flags |= TCQ_F_WARN_NONWC;
+ }
+}
+
+static inline unsigned int qdisc_peek_len(struct Qdisc *sch)
+{
+ struct sk_buff *skb;
+ unsigned int len;
+
+ skb = sch->ops->peek(sch);
+ if (unlikely(skb == NULL)) {
+ qdisc_warn_nonwc("qdisc_peek_len", sch);
+ return 0;
+ }
+ len = qdisc_pkt_len(skb);
+
+ return len;
+}
+
+static inline void qdisc_lock_init(struct Qdisc *sch,
+ const struct Qdisc_ops *ops)
+{
+ spin_lock_init(&sch->q.lock);
+
+ /* Skip dynamic keys if nesting is not possible */
+ if (ops->static_flags & TCQ_F_INGRESS ||
+ ops == &noqueue_qdisc_ops)
+ return;
+
+ lockdep_register_key(&sch->root_lock_key);
+ lockdep_set_class(&sch->q.lock, &sch->root_lock_key);
+}
+
+static inline void qdisc_lock_uninit(struct Qdisc *sch,
+ const struct Qdisc_ops *ops)
+{
+ if (ops->static_flags & TCQ_F_INGRESS ||
+ ops == &noqueue_qdisc_ops)
+ return;
+
+ lockdep_unregister_key(&sch->root_lock_key);
+}
+
#endif
diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h
index a6ab2f4f5e28..ad6d703ce6fe 100644
--- a/include/net/proto_memory.h
+++ b/include/net/proto_memory.h
@@ -31,10 +31,13 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
if (!sk->sk_prot->memory_pressure)
return false;
- if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- mem_cgroup_under_socket_pressure(sk->sk_memcg))
+ if (mem_cgroup_sk_enabled(sk) &&
+ mem_cgroup_sk_under_memory_pressure(sk))
return true;
+ if (sk->sk_bypass_prot_mem)
+ return false;
+
return !!READ_ONCE(*sk->sk_prot->memory_pressure);
}
diff --git a/include/net/psp.h b/include/net/psp.h
new file mode 100644
index 000000000000..33bb4d1dc46e
--- /dev/null
+++ b/include/net/psp.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __NET_PSP_ALL_H
+#define __NET_PSP_ALL_H
+
+#include <uapi/linux/psp.h>
+#include <net/psp/functions.h>
+#include <net/psp/types.h>
+
+/* Do not add any code here. Put it in the sub-headers instead. */
+
+#endif /* __NET_PSP_ALL_H */
diff --git a/include/net/psp/functions.h b/include/net/psp/functions.h
new file mode 100644
index 000000000000..c5c23a54774e
--- /dev/null
+++ b/include/net/psp/functions.h
@@ -0,0 +1,209 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __NET_PSP_HELPERS_H
+#define __NET_PSP_HELPERS_H
+
+#include <linux/skbuff.h>
+#include <linux/rcupdate.h>
+#include <linux/udp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/psp/types.h>
+
+struct inet_timewait_sock;
+
+/* Driver-facing API */
+struct psp_dev *
+psp_dev_create(struct net_device *netdev, struct psp_dev_ops *psd_ops,
+ struct psp_dev_caps *psd_caps, void *priv_ptr);
+void psp_dev_unregister(struct psp_dev *psd);
+bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi,
+ u8 ver, __be16 sport);
+int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv);
+
+/* Kernel-facing API */
+void psp_assoc_put(struct psp_assoc *pas);
+
+static inline void *psp_assoc_drv_data(struct psp_assoc *pas)
+{
+ return pas->drv_data;
+}
+
+#if IS_ENABLED(CONFIG_INET_PSP)
+unsigned int psp_key_size(u32 version);
+void psp_sk_assoc_free(struct sock *sk);
+void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk);
+void psp_twsk_assoc_free(struct inet_timewait_sock *tw);
+void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb);
+
+static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk)
+{
+ return rcu_dereference_check(sk->psp_assoc, lockdep_sock_is_held(sk));
+}
+
+static inline void
+psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb)
+{
+ struct psp_assoc *pas;
+
+ pas = psp_sk_assoc(sk);
+ if (pas && pas->tx.spi)
+ skb->decrypted = 1;
+}
+
+static inline unsigned long
+__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two,
+ unsigned long diffs)
+{
+ struct psp_skb_ext *a, *b;
+
+ a = skb_ext_find(one, SKB_EXT_PSP);
+ b = skb_ext_find(two, SKB_EXT_PSP);
+
+ diffs |= (!!a) ^ (!!b);
+ if (!diffs && unlikely(a))
+ diffs |= memcmp(a, b, sizeof(*a));
+ return diffs;
+}
+
+static inline bool
+psp_is_allowed_nondata(struct sk_buff *skb, struct psp_assoc *pas)
+{
+ bool fin = !!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN);
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+ u32 seq = TCP_SKB_CB(skb)->seq;
+ bool pure_fin;
+
+ pure_fin = fin && end_seq - seq == 1;
+
+ return seq == end_seq || (pure_fin && seq == pas->upgrade_seq);
+}
+
+static inline bool
+psp_pse_matches_pas(struct psp_skb_ext *pse, struct psp_assoc *pas)
+{
+ return pse && pas->rx.spi == pse->spi &&
+ pas->generation == pse->generation &&
+ pas->version == pse->version &&
+ pas->dev_id == pse->dev_id;
+}
+
+static inline enum skb_drop_reason
+__psp_sk_rx_policy_check(struct sk_buff *skb, struct psp_assoc *pas)
+{
+ struct psp_skb_ext *pse = skb_ext_find(skb, SKB_EXT_PSP);
+
+ if (!pas)
+ return pse ? SKB_DROP_REASON_PSP_INPUT : 0;
+
+ if (likely(psp_pse_matches_pas(pse, pas))) {
+ if (unlikely(!pas->peer_tx))
+ pas->peer_tx = 1;
+
+ return 0;
+ }
+
+ if (!pse) {
+ if (!pas->tx.spi ||
+ (!pas->peer_tx && psp_is_allowed_nondata(skb, pas)))
+ return 0;
+ }
+
+ return SKB_DROP_REASON_PSP_INPUT;
+}
+
+static inline enum skb_drop_reason
+psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb)
+{
+ return __psp_sk_rx_policy_check(skb, psp_sk_assoc(sk));
+}
+
+static inline enum skb_drop_reason
+psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb)
+{
+ return __psp_sk_rx_policy_check(skb, rcu_dereference(tw->psp_assoc));
+}
+
+static inline struct psp_assoc *psp_sk_get_assoc_rcu(const struct sock *sk)
+{
+ struct psp_assoc *pas;
+ int state;
+
+ state = READ_ONCE(sk->sk_state);
+ if (!sk_is_inet(sk) || state == TCP_NEW_SYN_RECV)
+ return NULL;
+
+ pas = state == TCP_TIME_WAIT ?
+ rcu_dereference(inet_twsk(sk)->psp_assoc) :
+ rcu_dereference(sk->psp_assoc);
+ return pas;
+}
+
+static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb)
+{
+ if (!skb->decrypted || !skb->sk)
+ return NULL;
+
+ return psp_sk_get_assoc_rcu(skb->sk);
+}
+
+static inline unsigned int psp_sk_overhead(const struct sock *sk)
+{
+ int psp_encap = sizeof(struct udphdr) + PSP_HDR_SIZE + PSP_TRL_SIZE;
+ bool has_psp = rcu_access_pointer(sk->psp_assoc);
+
+ return has_psp ? psp_encap : 0;
+}
+#else
+static inline void psp_sk_assoc_free(struct sock *sk) { }
+static inline void
+psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) { }
+static inline void psp_twsk_assoc_free(struct inet_timewait_sock *tw) { }
+static inline void
+psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) { }
+
+static inline struct psp_assoc *psp_sk_assoc(const struct sock *sk)
+{
+ return NULL;
+}
+
+static inline void
+psp_enqueue_set_decrypted(struct sock *sk, struct sk_buff *skb) { }
+
+static inline unsigned long
+__psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two,
+ unsigned long diffs)
+{
+ return diffs;
+}
+
+static inline enum skb_drop_reason
+psp_sk_rx_policy_check(struct sock *sk, struct sk_buff *skb)
+{
+ return 0;
+}
+
+static inline enum skb_drop_reason
+psp_twsk_rx_policy_check(struct inet_timewait_sock *tw, struct sk_buff *skb)
+{
+ return 0;
+}
+
+static inline struct psp_assoc *psp_skb_get_assoc_rcu(struct sk_buff *skb)
+{
+ return NULL;
+}
+
+static inline unsigned int psp_sk_overhead(const struct sock *sk)
+{
+ return 0;
+}
+#endif
+
+static inline unsigned long
+psp_skb_coalesce_diff(const struct sk_buff *one, const struct sk_buff *two)
+{
+ return __psp_skb_coalesce_diff(one, two, 0);
+}
+
+#endif /* __NET_PSP_HELPERS_H */
diff --git a/include/net/psp/types.h b/include/net/psp/types.h
new file mode 100644
index 000000000000..25a9096d4e7d
--- /dev/null
+++ b/include/net/psp/types.h
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __NET_PSP_H
+#define __NET_PSP_H
+
+#include <linux/mutex.h>
+#include <linux/refcount.h>
+
+struct netlink_ext_ack;
+
+#define PSP_DEFAULT_UDP_PORT 1000
+
+struct psphdr {
+ u8 nexthdr;
+ u8 hdrlen;
+ u8 crypt_offset;
+ u8 verfl;
+ __be32 spi;
+ __be64 iv;
+ __be64 vc[]; /* optional */
+};
+
+#define PSP_ENCAP_HLEN (sizeof(struct udphdr) + sizeof(struct psphdr))
+
+#define PSP_SPI_KEY_ID GENMASK(30, 0)
+#define PSP_SPI_KEY_PHASE BIT(31)
+
+#define PSPHDR_CRYPT_OFFSET GENMASK(5, 0)
+
+#define PSPHDR_VERFL_SAMPLE BIT(7)
+#define PSPHDR_VERFL_DROP BIT(6)
+#define PSPHDR_VERFL_VERSION GENMASK(5, 2)
+#define PSPHDR_VERFL_VIRT BIT(1)
+#define PSPHDR_VERFL_ONE BIT(0)
+
+#define PSP_HDRLEN_NOOPT ((sizeof(struct psphdr) - 8) / 8)
+
+/**
+ * struct psp_dev_config - PSP device configuration
+ * @versions: PSP versions enabled on the device
+ */
+struct psp_dev_config {
+ u32 versions;
+};
+
+/**
+ * struct psp_dev - PSP device struct
+ * @main_netdev: original netdevice of this PSP device
+ * @ops: driver callbacks
+ * @caps: device capabilities
+ * @drv_priv: driver priv pointer
+ * @lock: instance lock, protects all fields
+ * @refcnt: reference count for the instance
+ * @id: instance id
+ * @generation: current generation of the device key
+ * @config: current device configuration
+ * @active_assocs: list of registered associations
+ * @prev_assocs: associations which use old (but still usable)
+ * device key
+ * @stale_assocs: associations which use a rotated out key
+ *
+ * @stats: statistics maintained by the core
+ * @stats.rotations: See stats attr key-rotations
+ * @stats.stales: See stats attr stale-events
+ *
+ * @rcu: RCU head for freeing the structure
+ */
+struct psp_dev {
+ struct net_device *main_netdev;
+
+ struct psp_dev_ops *ops;
+ struct psp_dev_caps *caps;
+ void *drv_priv;
+
+ struct mutex lock;
+ refcount_t refcnt;
+
+ u32 id;
+
+ u8 generation;
+
+ struct psp_dev_config config;
+
+ struct list_head active_assocs;
+ struct list_head prev_assocs;
+ struct list_head stale_assocs;
+
+ struct {
+ unsigned long rotations;
+ unsigned long stales;
+ } stats;
+
+ struct rcu_head rcu;
+};
+
+#define PSP_GEN_VALID_MASK 0x7f
+
+/**
+ * struct psp_dev_caps - PSP device capabilities
+ */
+struct psp_dev_caps {
+ /**
+ * @versions: mask of supported PSP versions
+ * Set this field to 0 to indicate PSP is not supported at all.
+ */
+ u32 versions;
+
+ /**
+ * @assoc_drv_spc: size of driver-specific state in Tx assoc
+ * Determines the size of struct psp_assoc::drv_data
+ */
+ u32 assoc_drv_spc;
+};
+
+#define PSP_MAX_KEY 32
+
+#define PSP_HDR_SIZE 16 /* We don't support optional fields, yet */
+#define PSP_TRL_SIZE 16 /* AES-GCM/GMAC trailer size */
+
+struct psp_skb_ext {
+ __be32 spi;
+ u16 dev_id;
+ u8 generation;
+ u8 version;
+};
+
+struct psp_key_parsed {
+ __be32 spi;
+ u8 key[PSP_MAX_KEY];
+};
+
+struct psp_assoc {
+ struct psp_dev *psd;
+
+ u16 dev_id;
+ u8 generation;
+ u8 version;
+ u8 peer_tx;
+
+ u32 upgrade_seq;
+
+ struct psp_key_parsed tx;
+ struct psp_key_parsed rx;
+
+ refcount_t refcnt;
+ struct rcu_head rcu;
+ struct work_struct work;
+ struct list_head assocs_list;
+
+ u8 drv_data[] __aligned(8);
+};
+
+struct psp_dev_stats {
+ union {
+ struct {
+ u64 rx_packets;
+ u64 rx_bytes;
+ u64 rx_auth_fail;
+ u64 rx_error;
+ u64 rx_bad;
+ u64 tx_packets;
+ u64 tx_bytes;
+ u64 tx_error;
+ };
+ DECLARE_FLEX_ARRAY(u64, required);
+ };
+};
+
+/**
+ * struct psp_dev_ops - netdev driver facing PSP callbacks
+ */
+struct psp_dev_ops {
+ /**
+ * @set_config: set configuration of a PSP device
+ * Driver can inspect @psd->config for the previous configuration.
+ * Core will update @psd->config with @config on success.
+ */
+ int (*set_config)(struct psp_dev *psd, struct psp_dev_config *conf,
+ struct netlink_ext_ack *extack);
+
+ /**
+ * @key_rotate: rotate the device key
+ */
+ int (*key_rotate)(struct psp_dev *psd, struct netlink_ext_ack *extack);
+
+ /**
+ * @rx_spi_alloc: allocate an Rx SPI+key pair
+ * Allocate an Rx SPI and resulting derived key.
+ * This key should remain valid until key rotation.
+ */
+ int (*rx_spi_alloc)(struct psp_dev *psd, u32 version,
+ struct psp_key_parsed *assoc,
+ struct netlink_ext_ack *extack);
+
+ /**
+ * @tx_key_add: add a Tx key to the device
+ * Install an association in the device. Core will allocate space
+ * for the driver to use at drv_data.
+ */
+ int (*tx_key_add)(struct psp_dev *psd, struct psp_assoc *pas,
+ struct netlink_ext_ack *extack);
+ /**
+ * @tx_key_del: remove a Tx key from the device
+ * Remove an association from the device.
+ */
+ void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas);
+
+ /**
+ * @get_stats: get statistics from the device
+ * Stats required by the spec must be maintained and filled in.
+ * Stats must be filled in member-by-member, never memset the struct.
+ */
+ void (*get_stats)(struct psp_dev *psd, struct psp_dev_stats *stats);
+};
+
+#endif /* __NET_PSP_H */
diff --git a/include/net/raw.h b/include/net/raw.h
index 32a61481a253..66c0ffeada2e 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -81,6 +81,7 @@ struct raw_sock {
struct inet_sock inet;
struct icmp_filter filter;
u32 ipmr_table;
+ struct numa_drop_counters drop_counters;
};
#define raw_sk(ptr) container_of_const(ptr, struct raw_sock, inet.sk)
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index b07b1cd14e9f..5a9c826a7092 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -30,19 +30,14 @@ struct request_sock_ops {
unsigned int obj_size;
struct kmem_cache *slab;
char *slab_name;
- int (*rtx_syn_ack)(const struct sock *sk,
- struct request_sock *req);
void (*send_ack)(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
void (*send_reset)(const struct sock *sk,
struct sk_buff *skb,
enum sk_rst_reason reason);
void (*destructor)(struct request_sock *req);
- void (*syn_ack_timeout)(const struct request_sock *req);
};
-int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req);
-
struct saved_syn {
u32 mac_hdrlen;
u32 network_hdrlen;
@@ -128,14 +123,7 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb,
return sk;
}
-static inline void __reqsk_free(struct request_sock *req)
-{
- req->rsk_ops->destructor(req);
- if (req->rsk_listener)
- sock_put(req->rsk_listener);
- kfree(req->saved_syn);
- kmem_cache_free(req->rsk_ops->slab, req);
-}
+void __reqsk_free(struct request_sock *req);
static inline void reqsk_free(struct request_sock *req)
{
@@ -189,8 +177,8 @@ struct fastopen_queue {
struct request_sock_queue {
spinlock_t rskq_lock;
u8 rskq_defer_accept;
+ u8 synflood_warned;
- u32 synflood_warned;
atomic_t qlen;
atomic_t young;
@@ -201,8 +189,6 @@ struct request_sock_queue {
*/
};
-void reqsk_queue_alloc(struct request_sock_queue *queue);
-
void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
bool reset);
diff --git a/include/net/rose.h b/include/net/rose.h
index 23267b4efcfa..41bfcb224f0b 100644
--- a/include/net/rose.h
+++ b/include/net/rose.h
@@ -1,250 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Declarations of Rose type objects.
- *
- * Jonathan Naylor G4KLX 25/8/96
- */
-
#ifndef _ROSE_H
-#define _ROSE_H
-
-#include <linux/rose.h>
-#include <net/ax25.h>
-#include <net/sock.h>
-
-#define ROSE_ADDR_LEN 5
-
-#define ROSE_MIN_LEN 3
-
-#define ROSE_CALL_REQ_ADDR_LEN_OFF 3
-#define ROSE_CALL_REQ_ADDR_LEN_VAL 0xAA /* each address is 10 digits */
-#define ROSE_CALL_REQ_DEST_ADDR_OFF 4
-#define ROSE_CALL_REQ_SRC_ADDR_OFF 9
-#define ROSE_CALL_REQ_FACILITIES_OFF 14
-
-#define ROSE_GFI 0x10
-#define ROSE_Q_BIT 0x80
-#define ROSE_D_BIT 0x40
-#define ROSE_M_BIT 0x10
-
-#define ROSE_CALL_REQUEST 0x0B
-#define ROSE_CALL_ACCEPTED 0x0F
-#define ROSE_CLEAR_REQUEST 0x13
-#define ROSE_CLEAR_CONFIRMATION 0x17
-#define ROSE_DATA 0x00
-#define ROSE_INTERRUPT 0x23
-#define ROSE_INTERRUPT_CONFIRMATION 0x27
-#define ROSE_RR 0x01
-#define ROSE_RNR 0x05
-#define ROSE_REJ 0x09
-#define ROSE_RESET_REQUEST 0x1B
-#define ROSE_RESET_CONFIRMATION 0x1F
-#define ROSE_REGISTRATION_REQUEST 0xF3
-#define ROSE_REGISTRATION_CONFIRMATION 0xF7
-#define ROSE_RESTART_REQUEST 0xFB
-#define ROSE_RESTART_CONFIRMATION 0xFF
-#define ROSE_DIAGNOSTIC 0xF1
-#define ROSE_ILLEGAL 0xFD
-
-/* Define Link State constants. */
-
-enum {
- ROSE_STATE_0, /* Ready */
- ROSE_STATE_1, /* Awaiting Call Accepted */
- ROSE_STATE_2, /* Awaiting Clear Confirmation */
- ROSE_STATE_3, /* Data Transfer */
- ROSE_STATE_4, /* Awaiting Reset Confirmation */
- ROSE_STATE_5 /* Deferred Call Acceptance */
-};
-
-#define ROSE_DEFAULT_T0 180000 /* Default T10 T20 value */
-#define ROSE_DEFAULT_T1 200000 /* Default T11 T21 value */
-#define ROSE_DEFAULT_T2 180000 /* Default T12 T22 value */
-#define ROSE_DEFAULT_T3 180000 /* Default T13 T23 value */
-#define ROSE_DEFAULT_HB 5000 /* Default Holdback value */
-#define ROSE_DEFAULT_IDLE 0 /* No Activity Timeout - none */
-#define ROSE_DEFAULT_ROUTING 1 /* Default routing flag */
-#define ROSE_DEFAULT_FAIL_TIMEOUT 120000 /* Time until link considered usable */
-#define ROSE_DEFAULT_MAXVC 50 /* Maximum number of VCs per neighbour */
-#define ROSE_DEFAULT_WINDOW_SIZE 7 /* Default window size */
-
-#define ROSE_MODULUS 8
-#define ROSE_MAX_PACKET_SIZE 251 /* Maximum packet size */
-
-#define ROSE_COND_ACK_PENDING 0x01
-#define ROSE_COND_PEER_RX_BUSY 0x02
-#define ROSE_COND_OWN_RX_BUSY 0x04
-
-#define FAC_NATIONAL 0x00
-#define FAC_CCITT 0x0F
-
-#define FAC_NATIONAL_RAND 0x7F
-#define FAC_NATIONAL_FLAGS 0x3F
-#define FAC_NATIONAL_DEST_DIGI 0xE9
-#define FAC_NATIONAL_SRC_DIGI 0xEB
-#define FAC_NATIONAL_FAIL_CALL 0xED
-#define FAC_NATIONAL_FAIL_ADD 0xEE
-#define FAC_NATIONAL_DIGIS 0xEF
-
-#define FAC_CCITT_DEST_NSAP 0xC9
-#define FAC_CCITT_SRC_NSAP 0xCB
-
-struct rose_neigh {
- struct rose_neigh *next;
- ax25_address callsign;
- ax25_digi *digipeat;
- ax25_cb *ax25;
- struct net_device *dev;
- unsigned short count;
- unsigned short use;
- unsigned int number;
- char restarted;
- char dce_mode;
- char loopback;
- struct sk_buff_head queue;
- struct timer_list t0timer;
- struct timer_list ftimer;
-};
-
-struct rose_node {
- struct rose_node *next;
- rose_address address;
- unsigned short mask;
- unsigned char count;
- char loopback;
- struct rose_neigh *neighbour[3];
-};
-
-struct rose_route {
- struct rose_route *next;
- unsigned int lci1, lci2;
- rose_address src_addr, dest_addr;
- ax25_address src_call, dest_call;
- struct rose_neigh *neigh1, *neigh2;
- unsigned int rand;
-};
-
-struct rose_sock {
- struct sock sock;
- rose_address source_addr, dest_addr;
- ax25_address source_call, dest_call;
- unsigned char source_ndigis, dest_ndigis;
- ax25_address source_digis[ROSE_MAX_DIGIS];
- ax25_address dest_digis[ROSE_MAX_DIGIS];
- struct rose_neigh *neighbour;
- struct net_device *device;
- netdevice_tracker dev_tracker;
- unsigned int lci, rand;
- unsigned char state, condition, qbitincl, defer;
- unsigned char cause, diagnostic;
- unsigned short vs, vr, va, vl;
- unsigned long t1, t2, t3, hb, idle;
-#ifdef M_BIT
- unsigned short fraglen;
- struct sk_buff_head frag_queue;
-#endif
- struct sk_buff_head ack_queue;
- struct rose_facilities_struct facilities;
- struct timer_list timer;
- struct timer_list idletimer;
-};
-
-#define rose_sk(sk) ((struct rose_sock *)(sk))
-
-/* af_rose.c */
-extern ax25_address rose_callsign;
-extern int sysctl_rose_restart_request_timeout;
-extern int sysctl_rose_call_request_timeout;
-extern int sysctl_rose_reset_request_timeout;
-extern int sysctl_rose_clear_request_timeout;
-extern int sysctl_rose_no_activity_timeout;
-extern int sysctl_rose_ack_hold_back_timeout;
-extern int sysctl_rose_routing_control;
-extern int sysctl_rose_link_fail_timeout;
-extern int sysctl_rose_maximum_vcs;
-extern int sysctl_rose_window_size;
-
-int rosecmp(const rose_address *, const rose_address *);
-int rosecmpm(const rose_address *, const rose_address *, unsigned short);
-char *rose2asc(char *buf, const rose_address *);
-struct sock *rose_find_socket(unsigned int, struct rose_neigh *);
-void rose_kill_by_neigh(struct rose_neigh *);
-unsigned int rose_new_lci(struct rose_neigh *);
-int rose_rx_call_request(struct sk_buff *, struct net_device *,
- struct rose_neigh *, unsigned int);
-void rose_destroy_socket(struct sock *);
-
-/* rose_dev.c */
-void rose_setup(struct net_device *);
-
-/* rose_in.c */
-int rose_process_rx_frame(struct sock *, struct sk_buff *);
-
-/* rose_link.c */
-void rose_start_ftimer(struct rose_neigh *);
-void rose_stop_ftimer(struct rose_neigh *);
-void rose_stop_t0timer(struct rose_neigh *);
-int rose_ftimer_running(struct rose_neigh *);
-void rose_link_rx_restart(struct sk_buff *, struct rose_neigh *,
- unsigned short);
-void rose_transmit_clear_request(struct rose_neigh *, unsigned int,
- unsigned char, unsigned char);
-void rose_transmit_link(struct sk_buff *, struct rose_neigh *);
-
-/* rose_loopback.c */
-void rose_loopback_init(void);
-void rose_loopback_clear(void);
-int rose_loopback_queue(struct sk_buff *, struct rose_neigh *);
-
-/* rose_out.c */
-void rose_kick(struct sock *);
-void rose_enquiry_response(struct sock *);
-
-/* rose_route.c */
-extern struct rose_neigh *rose_loopback_neigh;
-extern const struct seq_operations rose_neigh_seqops;
-extern const struct seq_operations rose_node_seqops;
-extern struct seq_operations rose_route_seqops;
-
-void rose_add_loopback_neigh(void);
-int __must_check rose_add_loopback_node(const rose_address *);
-void rose_del_loopback_node(const rose_address *);
-void rose_rt_device_down(struct net_device *);
-void rose_link_device_down(struct net_device *);
-struct net_device *rose_dev_first(void);
-struct net_device *rose_dev_get(rose_address *);
-struct rose_route *rose_route_free_lci(unsigned int, struct rose_neigh *);
-struct rose_neigh *rose_get_neigh(rose_address *, unsigned char *,
- unsigned char *, int);
-int rose_rt_ioctl(unsigned int, void __user *);
-void rose_link_failed(ax25_cb *, int);
-int rose_route_frame(struct sk_buff *, ax25_cb *);
-void rose_rt_free(void);
-
-/* rose_subr.c */
-void rose_clear_queues(struct sock *);
-void rose_frames_acked(struct sock *, unsigned short);
-void rose_requeue_frames(struct sock *);
-int rose_validate_nr(struct sock *, unsigned short);
-void rose_write_internal(struct sock *, int);
-int rose_decode(struct sk_buff *, int *, int *, int *, int *, int *);
-int rose_parse_facilities(unsigned char *, unsigned int,
- struct rose_facilities_struct *);
-void rose_disconnect(struct sock *, int, int, int);
-
-/* rose_timer.c */
-void rose_start_heartbeat(struct sock *);
-void rose_start_t1timer(struct sock *);
-void rose_start_t2timer(struct sock *);
-void rose_start_t3timer(struct sock *);
-void rose_start_hbtimer(struct sock *);
-void rose_start_idletimer(struct sock *);
-void rose_stop_heartbeat(struct sock *);
-void rose_stop_timer(struct sock *);
-void rose_stop_idletimer(struct sock *);
+#define _ROSE_H
-/* sysctl_net_rose.c */
-void rose_register_sysctl(void);
-void rose_unregister_sysctl(void);
+#define ROSE_ADDR_LEN 5
#endif
diff --git a/include/net/route.h b/include/net/route.h
index c605fd5ec0c0..f90106f383c5 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -153,7 +153,7 @@ static inline void inet_sk_init_flowi4(const struct inet_sock *inet,
ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
sk->sk_protocol, inet_sk_flowi_flags(sk), daddr,
inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_uid);
+ inet->inet_sport, sk_uid(sk));
security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
}
@@ -189,7 +189,7 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr,
{
struct flowi4 fl4 = {
.flowi4_oif = oif,
- .flowi4_tos = inet_dscp_to_dsfield(dscp),
+ .flowi4_dscp = dscp,
.flowi4_scope = scope,
.daddr = daddr,
.saddr = saddr,
@@ -326,9 +326,12 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst,
if (inet_test_bit(TRANSPARENT, sk))
flow_flags |= FLOWI_FLAG_ANYSRC;
+ if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport)
+ flow_flags |= FLOWI_FLAG_ANY_SPORT;
+
flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk),
ip_sock_rt_scope(sk), protocol, flow_flags, dst,
- src, dport, sport, sk->sk_uid);
+ src, dport, sport, sk_uid(sk));
}
static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst,
@@ -387,7 +390,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
const struct net *net;
rcu_read_lock();
- net = dev_net_rcu(dst->dev);
+ net = dst_dev_net_rcu(dst);
hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
rcu_read_unlock();
}
diff --git a/include/net/rps-types.h b/include/net/rps-types.h
new file mode 100644
index 000000000000..6b90a66866c1
--- /dev/null
+++ b/include/net/rps-types.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_RPS_TYPES_H
+#define _NET_RPS_TYPES_H
+
+/* Define a rps_tag_ptr:
+ * Low order 5 bits are used to store the ilog2(size) of an RPS table.
+ */
+typedef unsigned long rps_tag_ptr;
+
+static inline u8 rps_tag_to_log(rps_tag_ptr tag_ptr)
+{
+ return tag_ptr & 31U;
+}
+
+static inline u32 rps_tag_to_mask(rps_tag_ptr tag_ptr)
+{
+ return (1U << rps_tag_to_log(tag_ptr)) - 1;
+}
+
+static inline void *rps_tag_to_table(rps_tag_ptr tag_ptr)
+{
+ return (void *)(tag_ptr & ~31UL);
+}
+#endif /* _NET_RPS_TYPES_H */
diff --git a/include/net/rps.h b/include/net/rps.h
index a93401d23d66..e33c6a2fa8bb 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -8,6 +8,7 @@
#include <net/hotdata.h>
#ifdef CONFIG_RPS
+#include <net/rps-types.h>
extern struct static_key_false rps_needed;
extern struct static_key_false rfs_needed;
@@ -25,28 +26,20 @@ struct rps_map {
/*
* The rps_dev_flow structure contains the mapping of a flow to a CPU, the
- * tail pointer for that CPU's input queue at the time of last enqueue, and
- * a hardware filter index.
+ * tail pointer for that CPU's input queue at the time of last enqueue, a
+ * hardware filter index, and the hash of the flow if aRFS is enabled.
*/
struct rps_dev_flow {
u16 cpu;
u16 filter;
unsigned int last_qtail;
+#ifdef CONFIG_RFS_ACCEL
+ u32 hash;
+#endif
};
#define RPS_NO_FILTER 0xffff
/*
- * The rps_dev_flow_table structure contains a table of flow mappings.
- */
-struct rps_dev_flow_table {
- unsigned int mask;
- struct rcu_head rcu;
- struct rps_dev_flow flows[];
-};
-#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
- ((_num) * sizeof(struct rps_dev_flow)))
-
-/*
* The rps_sock_flow_table contains mappings of flows to the last CPU
* on which they were processed by the application (set in recvmsg).
* Each entry is a 32bit value. Upper part is the high-order bits
@@ -57,68 +50,119 @@ struct rps_dev_flow_table {
* meaning we use 32-6=26 bits for the hash.
*/
struct rps_sock_flow_table {
- u32 mask;
-
- u32 ents[] ____cacheline_aligned_in_smp;
+ u32 ent;
};
-#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
#define RPS_NO_CPU 0xffff
-static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
- u32 hash)
+static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash)
{
- unsigned int index = hash & table->mask;
+ unsigned int index = hash & rps_tag_to_mask(tag_ptr);
u32 val = hash & ~net_hotdata.rps_cpu_mask;
+ struct rps_sock_flow_table *table;
/* We only give a hint, preemption can change CPU under us */
val |= raw_smp_processor_id();
+ table = rps_tag_to_table(tag_ptr);
/* The following WRITE_ONCE() is paired with the READ_ONCE()
* here, and another one in get_rps_cpu().
*/
- if (READ_ONCE(table->ents[index]) != val)
- WRITE_ONCE(table->ents[index], val);
+ if (READ_ONCE(table[index].ent) != val)
+ WRITE_ONCE(table[index].ent, val);
}
-#endif /* CONFIG_RPS */
+static inline void _sock_rps_record_flow_hash(__u32 hash)
+{
+ rps_tag_ptr tag_ptr;
-static inline void sock_rps_record_flow_hash(__u32 hash)
+ if (!hash)
+ return;
+ rcu_read_lock();
+ tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
+ if (tag_ptr)
+ rps_record_sock_flow(tag_ptr, hash);
+ rcu_read_unlock();
+}
+
+static inline void _sock_rps_record_flow(const struct sock *sk)
{
-#ifdef CONFIG_RPS
- struct rps_sock_flow_table *sock_flow_table;
+ /* Reading sk->sk_rxhash might incur an expensive cache line
+ * miss.
+ *
+ * TCP_ESTABLISHED does cover almost all states where RFS
+ * might be useful, and is cheaper [1] than testing :
+ * IPv4: inet_sk(sk)->inet_daddr
+ * IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
+ * OR an additional socket flag
+ * [1] : sk_state and sk_prot are in the same cache line.
+ */
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ /* This READ_ONCE() is paired with the WRITE_ONCE()
+ * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
+ */
+ _sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
+ }
+}
+
+static inline void _sock_rps_delete_flow(const struct sock *sk)
+{
+ struct rps_sock_flow_table *table;
+ rps_tag_ptr tag_ptr;
+ u32 hash, index;
+ hash = READ_ONCE(sk->sk_rxhash);
if (!hash)
return;
+
rcu_read_lock();
- sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
- if (sock_flow_table)
- rps_record_sock_flow(sock_flow_table, hash);
+ tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
+ if (tag_ptr) {
+ index = hash & rps_tag_to_mask(tag_ptr);
+ table = rps_tag_to_table(tag_ptr);
+ if (READ_ONCE(table[index].ent) != RPS_NO_CPU)
+ WRITE_ONCE(table[index].ent, RPS_NO_CPU);
+ }
rcu_read_unlock();
+}
+#endif /* CONFIG_RPS */
+
+static inline bool rfs_is_needed(void)
+{
+#ifdef CONFIG_RPS
+ return static_branch_unlikely(&rfs_needed);
+#else
+ return false;
+#endif
+}
+
+static inline void sock_rps_record_flow_hash(__u32 hash)
+{
+#ifdef CONFIG_RPS
+ if (!rfs_is_needed())
+ return;
+
+ _sock_rps_record_flow_hash(hash);
#endif
}
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
- if (static_branch_unlikely(&rfs_needed)) {
- /* Reading sk->sk_rxhash might incur an expensive cache line
- * miss.
- *
- * TCP_ESTABLISHED does cover almost all states where RFS
- * might be useful, and is cheaper [1] than testing :
- * IPv4: inet_sk(sk)->inet_daddr
- * IPv6: ipv6_addr_any(&sk->sk_v6_daddr)
- * OR an additional socket flag
- * [1] : sk_state and sk_prot are in the same cache line.
- */
- if (sk->sk_state == TCP_ESTABLISHED) {
- /* This READ_ONCE() is paired with the WRITE_ONCE()
- * from sock_rps_save_rxhash() and sock_rps_reset_rxhash().
- */
- sock_rps_record_flow_hash(READ_ONCE(sk->sk_rxhash));
- }
- }
+ if (!rfs_is_needed())
+ return;
+
+ _sock_rps_record_flow(sk);
+#endif
+}
+
+static inline void sock_rps_delete_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+ if (!rfs_is_needed())
+ return;
+
+ _sock_rps_delete_flow(sk);
#endif
}
diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index 69cb2e52b7da..979ac87b5d99 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -36,7 +36,7 @@
/**
* enum sk_rst_reason - the reasons of socket reset
*
- * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols.
+ * The reasons of sk reset, which are used in TCP/MPTCP protocols.
*
* There are three parts in order:
* 1) skb drop reasons: relying on drop reasons for such as passive reset
diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index bc0069a8b6ea..ec65a8cebb99 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -70,6 +70,40 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh)
}
/**
+ * struct rtnl_newlink_params - parameters of rtnl_link_ops::newlink()
+ *
+ * @src_net: Source netns of rtnetlink socket
+ * @link_net: Link netns by IFLA_LINK_NETNSID, NULL if not specified
+ * @peer_net: Peer netns
+ * @tb: IFLA_* attributes
+ * @data: IFLA_INFO_DATA attributes
+ */
+struct rtnl_newlink_params {
+ struct net *src_net;
+ struct net *link_net;
+ struct net *peer_net;
+ struct nlattr **tb;
+ struct nlattr **data;
+};
+
+/* Get effective link netns from newlink params. Generally, this is link_net
+ * and falls back to src_net. But for compatibility, a driver may * choose to
+ * use dev_net(dev) instead.
+ */
+static inline struct net *rtnl_newlink_link_net(struct rtnl_newlink_params *p)
+{
+ return p->link_net ? : p->src_net;
+}
+
+/* Get peer netns from newlink params. Fallback to link netns if peer netns is
+ * not specified explicitly.
+ */
+static inline struct net *rtnl_newlink_peer_net(struct rtnl_newlink_params *p)
+{
+ return p->peer_net ? : rtnl_newlink_link_net(p);
+}
+
+/**
* struct rtnl_link_ops - rtnetlink link operations
*
* @list: Used internally, protected by link_ops_mutex and SRCU
@@ -125,10 +159,8 @@ struct rtnl_link_ops {
struct nlattr *data[],
struct netlink_ext_ack *extack);
- int (*newlink)(struct net *src_net,
- struct net_device *dev,
- struct nlattr *tb[],
- struct nlattr *data[],
+ int (*newlink)(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack);
int (*changelink)(struct net_device *dev,
struct nlattr *tb[],
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index d48c657191cd..11159a50d6a1 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -20,12 +20,15 @@
#include <net/rtnetlink.h>
#include <net/flow_offload.h>
#include <linux/xarray.h>
+#include <net/dropreason-qdisc.h>
struct Qdisc_ops;
struct qdisc_walker;
struct tcf_walker;
struct module;
struct bpf_flow_keys;
+struct Qdisc;
+struct netdev_queue;
struct qdisc_rate_table {
struct tc_ratespec rate;
@@ -41,13 +44,6 @@ enum qdisc_state_t {
__QDISC_STATE_DRAINING,
};
-enum qdisc_state2_t {
- /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly.
- * Use qdisc_run_begin/end() or qdisc_is_running() instead.
- */
- __QDISC_STATE2_RUNNING,
-};
-
#define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED)
#define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING)
@@ -95,6 +91,8 @@ struct Qdisc {
#define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */
+#define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */
+
u32 limit;
const struct Qdisc_ops *ops;
struct qdisc_size_table __rcu *stab;
@@ -110,20 +108,30 @@ struct Qdisc {
int pad;
refcount_t refcnt;
- /*
- * For performance sake on SMP, we put highly modified fields at the end
- */
- struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
- struct qdisc_skb_head q;
- struct gnet_stats_basic_sync bstats;
- struct gnet_stats_queue qstats;
- int owner;
- unsigned long state;
- unsigned long state2; /* must be written under qdisc spinlock */
- struct Qdisc *next_sched;
- struct sk_buff_head skb_bad_txq;
-
- spinlock_t busylock ____cacheline_aligned_in_smp;
+ /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). */
+ __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned;
+ struct sk_buff_head gso_skb;
+ struct Qdisc *next_sched;
+ struct sk_buff_head skb_bad_txq;
+ __cacheline_group_end(Qdisc_read_mostly);
+
+ /* Fields dirtied in dequeue() fast path. */
+ __cacheline_group_begin(Qdisc_write) ____cacheline_aligned;
+ struct qdisc_skb_head q;
+ unsigned long state;
+ struct gnet_stats_basic_sync bstats;
+ bool running; /* must be written under qdisc spinlock */
+
+ /* Note : we only change qstats.backlog in fast path. */
+ struct gnet_stats_queue qstats;
+
+ struct sk_buff *to_free;
+ __cacheline_group_end(Qdisc_write);
+
+
+ atomic_long_t defer_count ____cacheline_aligned_in_smp;
+ struct llist_head defer_list;
+
spinlock_t seqlock;
struct rcu_head rcu;
@@ -168,7 +176,7 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc)
{
if (qdisc->flags & TCQ_F_NOLOCK)
return spin_is_locked(&qdisc->seqlock);
- return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+ return READ_ONCE(qdisc->running);
}
static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
@@ -211,11 +219,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
*/
return spin_trylock(&qdisc->seqlock);
}
- return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+ if (READ_ONCE(qdisc->running))
+ return false;
+ WRITE_ONCE(qdisc->running, true);
+ return true;
}
-static inline void qdisc_run_end(struct Qdisc *qdisc)
+static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc)
{
+ struct sk_buff *to_free = NULL;
+
if (qdisc->flags & TCQ_F_NOLOCK) {
spin_unlock(&qdisc->seqlock);
@@ -228,9 +241,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc)
if (unlikely(test_bit(__QDISC_STATE_MISSED,
&qdisc->state)))
__netif_schedule(qdisc);
- } else {
- __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
+ return NULL;
}
+
+ if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) {
+ to_free = qdisc->to_free;
+ if (to_free)
+ qdisc->to_free = NULL;
+ }
+ WRITE_ONCE(qdisc->running, false);
+ return to_free;
}
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
@@ -432,13 +452,16 @@ struct tcf_proto {
};
struct qdisc_skb_cb {
- struct {
- unsigned int pkt_len;
- u16 slave_dev_queue_mapping;
- u16 tc_classid;
- };
+ unsigned int pkt_len;
+ u16 pkt_segs;
+ u16 tc_classid;
#define QDISC_CB_PRIV_LEN 20
unsigned char data[QDISC_CB_PRIV_LEN];
+
+ u16 slave_dev_queue_mapping;
+ u8 post_ct:1;
+ u8 post_ct_snat:1;
+ u8 post_ct_dnat:1;
};
typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);
@@ -687,8 +710,8 @@ void dev_qdisc_change_real_num_tx(struct net_device *dev,
void dev_init_scheduler(struct net_device *dev);
void dev_shutdown(struct net_device *dev);
void dev_activate(struct net_device *dev);
-void dev_deactivate(struct net_device *dev);
-void dev_deactivate_many(struct list_head *head);
+void dev_deactivate(struct net_device *dev, bool reset_needed);
+void dev_deactivate_many(struct list_head *head, bool reset_needed);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
struct Qdisc *qdisc);
void qdisc_reset(struct Qdisc *qdisc);
@@ -696,6 +719,34 @@ void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
+
+static inline void dev_reset_queue(struct net_device *dev,
+ struct netdev_queue *dev_queue,
+ void *_unused)
+{
+ struct Qdisc *qdisc;
+ bool nolock;
+
+ qdisc = rtnl_dereference(dev_queue->qdisc_sleeping);
+ if (!qdisc)
+ return;
+
+ nolock = qdisc->flags & TCQ_F_NOLOCK;
+
+ if (nolock)
+ spin_lock_bh(&qdisc->seqlock);
+ spin_lock_bh(qdisc_lock(qdisc));
+
+ qdisc_reset(qdisc);
+
+ spin_unlock_bh(qdisc_lock(qdisc));
+ if (nolock) {
+ clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+ clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
+ spin_unlock_bh(&qdisc->seqlock);
+ }
+}
+
#ifdef CONFIG_NET_SCHED
int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
void *type_data);
@@ -758,13 +809,23 @@ static inline bool skb_skip_tc_classify(struct sk_buff *skb)
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
struct Qdisc *qdisc;
+ bool nolock;
for (; i < dev->num_tx_queues; i++) {
qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
if (qdisc) {
+ nolock = qdisc->flags & TCQ_F_NOLOCK;
+
+ if (nolock)
+ spin_lock_bh(&qdisc->seqlock);
spin_lock_bh(qdisc_lock(qdisc));
qdisc_reset(qdisc);
spin_unlock_bh(qdisc_lock(qdisc));
+ if (nolock) {
+ clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
+ clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
+ spin_unlock_bh(&qdisc->seqlock);
+ }
}
}
}
@@ -803,6 +864,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev)
return false;
}
+/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init() */
+static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq)
+{
+ struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc);
+
+ return qdisc->enqueue == NULL;
+}
+
/* Is the device using the noop qdisc on all queues? */
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
{
@@ -821,6 +890,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
return qdisc_skb_cb(skb)->pkt_len;
}
+static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb)
+{
+ u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs;
+
+ DEBUG_NET_WARN_ON_ONCE(pkt_segs !=
+ (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1));
+ return pkt_segs;
+}
+
/* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */
enum net_xmit_qdisc_t {
__NET_XMIT_STOLEN = 0x00010000,
@@ -862,9 +940,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats,
static inline void bstats_update(struct gnet_stats_basic_sync *bstats,
const struct sk_buff *skb)
{
- _bstats_update(bstats,
- qdisc_pkt_len(skb),
- skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
+ _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb));
}
static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
@@ -965,14 +1041,6 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen,
*backlog = qstats.backlog;
}
-static inline void qdisc_tree_flush_backlog(struct Qdisc *sch)
-{
- __u32 qlen, backlog;
-
- qdisc_qstats_qlen_backlog(sch, &qlen, &backlog);
- qdisc_tree_reduce_backlog(sch, qlen, backlog);
-}
-
static inline void qdisc_purge_queue(struct Qdisc *sch)
{
__u32 qlen, backlog;
@@ -1031,6 +1099,26 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
return skb;
}
+static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct)
+{
+ struct sk_buff *skb;
+
+ skb = __skb_dequeue(&sch->gso_skb);
+ if (skb) {
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ return skb;
+ }
+ if (direct) {
+ skb = __qdisc_dequeue_head(&sch->q);
+ if (skb)
+ qdisc_qstats_backlog_dec(sch, skb);
+ return skb;
+ } else {
+ return sch->dequeue(sch);
+ }
+}
+
static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
{
struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);
@@ -1047,11 +1135,8 @@ struct tc_skb_cb {
struct qdisc_skb_cb qdisc_cb;
u32 drop_reason;
- u16 zone; /* Only valid if post_ct = true */
+ u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */
u16 mru;
- u8 post_ct:1;
- u8 post_ct_snat:1;
- u8 post_ct_dnat:1;
};
static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
@@ -1062,18 +1147,64 @@ static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
return cb;
}
+/* TC classifier accessors - use enum skb_drop_reason */
static inline enum skb_drop_reason
tcf_get_drop_reason(const struct sk_buff *skb)
{
- return tc_skb_cb(skb)->drop_reason;
+ return (enum skb_drop_reason)tc_skb_cb(skb)->drop_reason;
}
static inline void tcf_set_drop_reason(const struct sk_buff *skb,
enum skb_drop_reason reason)
{
+ tc_skb_cb(skb)->drop_reason = (enum qdisc_drop_reason)reason;
+}
+
+/* Qdisc accessors - use enum qdisc_drop_reason */
+static inline enum qdisc_drop_reason
+tcf_get_qdisc_drop_reason(const struct sk_buff *skb)
+{
+ return tc_skb_cb(skb)->drop_reason;
+}
+
+static inline void tcf_set_qdisc_drop_reason(const struct sk_buff *skb,
+ enum qdisc_drop_reason reason)
+{
tc_skb_cb(skb)->drop_reason = reason;
}
+void __tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q,
+ struct netdev_queue *txq, struct net_device *dev);
+
+static inline void tcf_kfree_skb_list(struct sk_buff *skb, struct Qdisc *q,
+ struct netdev_queue *txq,
+ struct net_device *dev)
+{
+ if (unlikely(skb))
+ __tcf_kfree_skb_list(skb, q, txq, dev);
+}
+
+static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb,
+ enum qdisc_drop_reason reason)
+{
+ struct Qdisc *root;
+
+ DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS));
+ DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK);
+
+ rcu_read_lock();
+ root = qdisc_root_sleeping(q);
+
+ if (root->flags & TCQ_F_DEQUEUE_DROPS) {
+ tcf_set_qdisc_drop_reason(skb, reason);
+ skb->next = root->to_free;
+ root->to_free = skb;
+ } else {
+ kfree_skb_reason(skb, (enum skb_drop_reason)reason);
+ }
+ rcu_read_unlock();
+}
+
/* Instead of calling kfree_skb() while root qdisc lock is held,
* queue the skb for future freeing at end of __dev_xmit_skb()
*/
@@ -1246,9 +1377,9 @@ static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free,
- enum skb_drop_reason reason)
+ enum qdisc_drop_reason reason)
{
- tcf_set_drop_reason(skb, reason);
+ tcf_set_qdisc_drop_reason(skb, reason);
return qdisc_drop(skb, sch, to_free);
}
@@ -1353,6 +1484,11 @@ void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
struct tcf_block *block);
+static inline bool mini_qdisc_pair_inited(struct mini_Qdisc_pair *miniqp)
+{
+ return !!miniqp->p_miniq;
+}
+
void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx);
int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb));
diff --git a/include/net/sch_priv.h b/include/net/sch_priv.h
new file mode 100644
index 000000000000..4789f668ae87
--- /dev/null
+++ b/include/net/sch_priv.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_SCHED_PRIV_H
+#define __NET_SCHED_PRIV_H
+
+#include <net/sch_generic.h>
+
+struct mq_sched {
+ struct Qdisc **qdiscs;
+};
+
+int mq_init_common(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack,
+ const struct Qdisc_ops *qdisc_ops);
+void mq_destroy_common(struct Qdisc *sch);
+void mq_attach(struct Qdisc *sch);
+void mq_dump_common(struct Qdisc *sch, struct sk_buff *skb);
+struct netdev_queue *mq_select_queue(struct Qdisc *sch,
+ struct tcmsg *tcm);
+struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl);
+unsigned long mq_find(struct Qdisc *sch, u32 classid);
+int mq_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm);
+int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d);
+void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg);
+
+#endif
diff --git a/include/net/scm.h b/include/net/scm.h
index 22bb49589fde..c52519669349 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -69,7 +69,7 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co
static __inline__ void scm_set_cred(struct scm_cookie *scm,
struct pid *pid, kuid_t uid, kgid_t gid)
{
- scm->pid = get_pid(pid);
+ scm->pid = get_pid(pid);
scm->creds.pid = pid_vnr(pid);
scm->creds.uid = uid;
scm->creds.gid = gid;
@@ -78,7 +78,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm,
static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
{
put_pid(scm->pid);
- scm->pid = NULL;
+ scm->pid = NULL;
}
static __inline__ void scm_destroy(struct scm_cookie *scm)
@@ -102,123 +102,10 @@ static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
return __scm_send(sock, msg, scm);
}
-#ifdef CONFIG_SECURITY_NETWORK
-static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
-{
- struct lsm_context ctx;
- int err;
-
- if (test_bit(SOCK_PASSSEC, &sock->flags)) {
- err = security_secid_to_secctx(scm->secid, &ctx);
-
- if (err >= 0) {
- put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len,
- ctx.context);
- security_release_secctx(&ctx);
- }
- }
-}
-
-static inline bool scm_has_secdata(struct socket *sock)
-{
- return test_bit(SOCK_PASSSEC, &sock->flags);
-}
-#else
-static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm)
-{ }
-
-static inline bool scm_has_secdata(struct socket *sock)
-{
- return false;
-}
-#endif /* CONFIG_SECURITY_NETWORK */
-
-static __inline__ void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
-{
- struct file *pidfd_file = NULL;
- int len, pidfd;
-
- /* put_cmsg() doesn't return an error if CMSG is truncated,
- * that's why we need to opencode these checks here.
- */
- if (msg->msg_flags & MSG_CMSG_COMPAT)
- len = sizeof(struct compat_cmsghdr) + sizeof(int);
- else
- len = sizeof(struct cmsghdr) + sizeof(int);
-
- if (msg->msg_controllen < len) {
- msg->msg_flags |= MSG_CTRUNC;
- return;
- }
-
- if (!scm->pid)
- return;
-
- pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
-
- if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
- if (pidfd_file) {
- put_unused_fd(pidfd);
- fput(pidfd_file);
- }
-
- return;
- }
-
- if (pidfd_file)
- fd_install(pidfd, pidfd_file);
-}
-
-static inline bool __scm_recv_common(struct socket *sock, struct msghdr *msg,
- struct scm_cookie *scm, int flags)
-{
- if (!msg->msg_control) {
- if (test_bit(SOCK_PASSCRED, &sock->flags) ||
- test_bit(SOCK_PASSPIDFD, &sock->flags) ||
- scm->fp || scm_has_secdata(sock))
- msg->msg_flags |= MSG_CTRUNC;
- scm_destroy(scm);
- return false;
- }
-
- if (test_bit(SOCK_PASSCRED, &sock->flags)) {
- struct user_namespace *current_ns = current_user_ns();
- struct ucred ucreds = {
- .pid = scm->creds.pid,
- .uid = from_kuid_munged(current_ns, scm->creds.uid),
- .gid = from_kgid_munged(current_ns, scm->creds.gid),
- };
- put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
- }
-
- scm_passec(sock, msg, scm);
-
- if (scm->fp)
- scm_detach_fds(msg, scm);
-
- return true;
-}
-
-static inline void scm_recv(struct socket *sock, struct msghdr *msg,
- struct scm_cookie *scm, int flags)
-{
- if (!__scm_recv_common(sock, msg, scm, flags))
- return;
-
- scm_destroy_cred(scm);
-}
-
-static inline void scm_recv_unix(struct socket *sock, struct msghdr *msg,
- struct scm_cookie *scm, int flags)
-{
- if (!__scm_recv_common(sock, msg, scm, flags))
- return;
-
- if (test_bit(SOCK_PASSPIDFD, &sock->flags))
- scm_pidfd_recv(msg, scm);
-
- scm_destroy_cred(scm);
-}
+void scm_recv(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags);
+void scm_recv_unix(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags);
static inline int scm_recv_one_fd(struct file *f, int __user *ufd,
unsigned int flags)
diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h
index d4b3b2dcd15b..6f2cd562b1de 100644
--- a/include/net/sctp/auth.h
+++ b/include/net/sctp/auth.h
@@ -22,16 +22,11 @@ struct sctp_endpoint;
struct sctp_association;
struct sctp_authkey;
struct sctp_hmacalgo;
-struct crypto_shash;
-/*
- * Define a generic struct that will hold all the info
- * necessary for an HMAC transform
- */
+/* Defines an HMAC algorithm supported by SCTP chunk authentication */
struct sctp_hmac {
- __u16 hmac_id; /* one of the above ids */
- char *hmac_name; /* name for loading */
- __u16 hmac_len; /* length of the signature */
+ __u16 hmac_id; /* one of SCTP_AUTH_HMAC_ID_* */
+ __u16 hmac_len; /* length of the HMAC value in bytes */
};
/* This is generic structure that containst authentication bytes used
@@ -77,10 +72,9 @@ struct sctp_shared_key *sctp_auth_get_shkey(
int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep,
struct sctp_association *asoc,
gfp_t gfp);
-int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp);
-void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]);
-struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id);
-struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc);
+const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id);
+const struct sctp_hmac *
+sctp_auth_asoc_get_hmac(const struct sctp_association *asoc);
void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
struct sctp_hmac_algo_param *hmacs);
int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc,
diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h
index f514a0aa849e..654d37ec0402 100644
--- a/include/net/sctp/checksum.h
+++ b/include/net/sctp/checksum.h
@@ -15,8 +15,6 @@
* Dinakaran Joseph
* Jon Grimm <jgrimm@us.ibm.com>
* Sridhar Samudrala <sri@us.ibm.com>
- *
- * Rewritten to use libcrc32c by:
* Vlad Yasevich <vladislav.yasevich@hp.com>
*/
@@ -25,42 +23,18 @@
#include <linux/types.h>
#include <linux/sctp.h>
-#include <linux/crc32c.h>
-#include <linux/crc32.h>
-
-static inline __wsum sctp_csum_update(const void *buff, int len, __wsum sum)
-{
- /* This uses the crypto implementation of crc32c, which is either
- * implemented w/ hardware support or resolves to __crc32c_le().
- */
- return (__force __wsum)crc32c((__force __u32)sum, buff, len);
-}
-
-static inline __wsum sctp_csum_combine(__wsum csum, __wsum csum2,
- int offset, int len)
-{
- return (__force __wsum)__crc32c_le_combine((__force __u32)csum,
- (__force __u32)csum2, len);
-}
-
-static const struct skb_checksum_ops sctp_csum_ops = {
- .update = sctp_csum_update,
- .combine = sctp_csum_combine,
-};
static inline __le32 sctp_compute_cksum(const struct sk_buff *skb,
unsigned int offset)
{
struct sctphdr *sh = (struct sctphdr *)(skb->data + offset);
__le32 old = sh->checksum;
- __wsum new;
+ u32 new;
sh->checksum = 0;
- new = ~__skb_checksum(skb, offset, skb->len - offset, ~(__wsum)0,
- &sctp_csum_ops);
+ new = ~skb_crc32c(skb, offset, skb->len - offset, ~0);
sh->checksum = old;
-
- return cpu_to_le32((__force __u32)new);
+ return cpu_to_le32(new);
}
#endif /* __sctp_checksum_h__ */
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 5859e0a16a58..ae3376ba0b99 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -296,9 +296,8 @@ enum { SCTP_MAX_GABS = 16 };
*/
#define SCTP_DEFAULT_MINSEGMENT 512 /* MTU size ... if no mtu disc */
-#define SCTP_SECRET_SIZE 32 /* Number of octets in a 256 bits. */
-
-#define SCTP_SIGNATURE_SIZE 20 /* size of a SLA-1 signature */
+#define SCTP_COOKIE_KEY_SIZE 32 /* size of cookie HMAC key */
+#define SCTP_COOKIE_MAC_SIZE 32 /* size of HMAC field in cookies */
#define SCTP_COOKIE_MULTIPLE 32 /* Pad out our cookie to make our hash
* functions simpler to write.
@@ -417,16 +416,12 @@ enum {
SCTP_AUTH_HMAC_ID_RESERVED_0,
SCTP_AUTH_HMAC_ID_SHA1,
SCTP_AUTH_HMAC_ID_RESERVED_2,
-#if defined (CONFIG_CRYPTO_SHA256) || defined (CONFIG_CRYPTO_SHA256_MODULE)
SCTP_AUTH_HMAC_ID_SHA256,
-#endif
__SCTP_AUTH_HMAC_MAX
};
#define SCTP_AUTH_HMAC_ID_MAX __SCTP_AUTH_HMAC_MAX - 1
#define SCTP_AUTH_NUM_HMACS __SCTP_AUTH_HMAC_MAX
-#define SCTP_SHA1_SIG_SIZE 20
-#define SCTP_SHA256_SIG_SIZE 32
/* SCTP-AUTH, Section 3.2
* The chunk types for INIT, INIT-ACK, SHUTDOWN-COMPLETE and AUTH chunks
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 84e6b9fd5610..58242b37b47a 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -85,7 +85,7 @@ void sctp_udp_sock_stop(struct net *net);
/*
* sctp/socket.c
*/
-int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr,
+int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags);
int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
int sctp_inet_listen(struct socket *sock, int backlog);
@@ -94,8 +94,7 @@ void sctp_data_ready(struct sock *sk);
__poll_t sctp_poll(struct file *file, struct socket *sock,
poll_table *wait);
void sctp_sock_rfree(struct sk_buff *skb);
-void sctp_copy_sock(struct sock *newsk, struct sock *sk,
- struct sctp_association *asoc);
+
extern struct percpu_counter sctp_sockets_allocated;
int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *);
@@ -364,8 +363,6 @@ sctp_assoc_to_state(const struct sctp_association *asoc)
/* Look up the association by its id. */
struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id);
-int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp);
-
/* A macro to walk a list of skbs. */
#define sctp_skb_for_each(pos, head, tmp) \
skb_queue_walk_safe(head, pos, tmp)
@@ -636,7 +633,7 @@ static inline void sctp_transport_pl_reset(struct sctp_transport *t)
}
} else {
if (t->pl.state != SCTP_PL_DISABLED) {
- if (del_timer(&t->probe_timer))
+ if (timer_delete(&t->probe_timer))
sctp_transport_put(t);
t->pl.state = SCTP_PL_DISABLED;
}
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 64c42bd56bb2..3bfd261a53cc 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -161,7 +161,6 @@ const struct sctp_sm_table_entry *sctp_sm_lookup_event(
enum sctp_event_type event_type,
enum sctp_state state,
union sctp_subtype event_subtype);
-int sctp_chunk_iif(const struct sctp_chunk *);
struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *,
struct sctp_chunk *,
gfp_t gfp);
diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h
index 8034bf5febbe..77806ef1cb70 100644
--- a/include/net/sctp/stream_sched.h
+++ b/include/net/sctp/stream_sched.h
@@ -52,10 +52,10 @@ void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch);
void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch);
int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp);
-struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream);
+const struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream);
void sctp_sched_ops_register(enum sctp_sched_type sched,
- struct sctp_sched_ops *sched_ops);
+ const struct sctp_sched_ops *sched_ops);
void sctp_sched_ops_prio_init(void);
void sctp_sched_ops_rr_init(void);
void sctp_sched_ops_fc_init(void);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 31248cfdfb23..affee44bd38e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -32,6 +32,7 @@
#ifndef __sctp_structs_h__
#define __sctp_structs_h__
+#include <crypto/sha2.h>
#include <linux/ktime.h>
#include <linux/generic-radix-tree.h>
#include <linux/rhashtable-types.h>
@@ -51,9 +52,9 @@
* We should wean ourselves off this.
*/
union sctp_addr {
+ struct sockaddr_inet sa; /* Large enough for both address families */
struct sockaddr_in v4;
struct sockaddr_in6 v6;
- struct sockaddr sa;
};
/* Forward declarations for data structures. */
@@ -68,7 +69,6 @@ struct sctp_outq;
struct sctp_bind_addr;
struct sctp_ulpq;
struct sctp_ep_common;
-struct crypto_shash;
struct sctp_stream;
@@ -155,10 +155,6 @@ struct sctp_sock {
/* PF_ family specific functions. */
struct sctp_pf *pf;
- /* Access to HMAC transform. */
- struct crypto_shash *hmac;
- char *sctp_hmac_alg;
-
/* What is our base endpointer? */
struct sctp_endpoint *ep;
@@ -227,14 +223,11 @@ struct sctp_sock {
frag_interleave:1,
recvrcvinfo:1,
recvnxtinfo:1,
- data_ready_signalled:1;
+ data_ready_signalled:1,
+ cookie_auth_enable:1;
atomic_t pd_mode;
- /* Fields after this point will be skipped on copies, like on accept
- * and peeloff operations
- */
-
/* Receive to here while partial delivery is in effect. */
struct sk_buff_head pd_lobby;
@@ -335,7 +328,7 @@ struct sctp_cookie {
/* The format of our cookie that we send to our peer. */
struct sctp_signed_cookie {
- __u8 signature[SCTP_SECRET_SIZE];
+ __u8 mac[SCTP_COOKIE_MAC_SIZE];
__u32 __pad; /* force sctp_cookie alignment to 64 bits */
struct sctp_cookie c;
} __packed;
@@ -500,9 +493,6 @@ struct sctp_pf {
int (*bind_verify) (struct sctp_sock *, union sctp_addr *);
int (*send_verify) (struct sctp_sock *, union sctp_addr *);
int (*supported_addrs)(const struct sctp_sock *, __be16 *);
- struct sock *(*create_accept_sk) (struct sock *sk,
- struct sctp_association *asoc,
- bool kern);
int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr);
void (*to_sk_saddr)(union sctp_addr *, struct sock *sk);
void (*to_sk_daddr)(union sctp_addr *, struct sock *sk);
@@ -775,6 +765,7 @@ struct sctp_transport {
/* Reference counting. */
refcount_t refcnt;
+ __u32 dead:1,
/* RTO-Pending : A flag used to track if one of the DATA
* chunks sent to this address is currently being
* used to compute a RTT. If this flag is 0,
@@ -784,7 +775,7 @@ struct sctp_transport {
* calculation completes (i.e. the DATA chunk
* is SACK'd) clear this flag.
*/
- __u32 rto_pending:1,
+ rto_pending:1,
/*
* hb_sent : a flag that signals that we have a pending
@@ -1078,7 +1069,7 @@ struct sctp_outq {
struct list_head out_chunk_list;
/* Stream scheduler being used */
- struct sctp_sched_ops *sched;
+ const struct sctp_sched_ops *sched;
unsigned int out_qlen; /* Total length of queued data chunks. */
@@ -1306,33 +1297,15 @@ struct sctp_endpoint {
/* This is really a list of struct sctp_association entries. */
struct list_head asocs;
- /* Secret Key: A secret key used by this endpoint to compute
- * the MAC. This SHOULD be a cryptographic quality
- * random number with a sufficient length.
- * Discussion in [RFC1750] can be helpful in
- * selection of the key.
- */
- __u8 secret_key[SCTP_SECRET_SIZE];
-
- /* digest: This is a digest of the sctp cookie. This field is
- * only used on the receive path when we try to validate
- * that the cookie has not been tampered with. We put
- * this here so we pre-allocate this once and can re-use
- * on every receive.
- */
- __u8 *digest;
-
+ /* Cookie authentication key used by this endpoint */
+ struct hmac_sha256_key cookie_auth_key;
+
/* sendbuf acct. policy. */
__u32 sndbuf_policy;
/* rcvbuf acct. policy. */
__u32 rcvbuf_policy;
- /* SCTP AUTH: array of the HMACs that will be allocated
- * we need this per association so that we don't serialize
- */
- struct crypto_shash **auth_hmacs;
-
/* SCTP-AUTH: hmacs for the endpoint encoded into parameter */
struct sctp_hmac_algo_param *auth_hmacs_list;
@@ -2151,8 +2124,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *,
const union sctp_addr *address,
const gfp_t gfp,
const int peer_state);
-void sctp_assoc_del_peer(struct sctp_association *asoc,
- const union sctp_addr *addr);
void sctp_assoc_rm_peer(struct sctp_association *asoc,
struct sctp_transport *peer);
void sctp_assoc_control_transport(struct sctp_association *asoc,
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 21e7fa2a1813..6f996229167b 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -5,20 +5,47 @@
#include <linux/types.h>
struct net;
+extern struct net init_net;
+
+union tcp_seq_and_ts_off {
+ struct {
+ u32 seq;
+ u32 ts_off;
+ };
+ u64 hash64;
+};
u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport);
u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport);
-u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport);
-u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr);
-u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
- __be16 sport, __be16 dport);
-u32 secure_tcpv6_ts_off(const struct net *net,
- const __be32 *saddr, const __be32 *daddr);
-u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport);
-u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
- __be16 sport, __be16 dport);
+union tcp_seq_and_ts_off
+secure_tcp_seq_and_ts_off(const struct net *net, __be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport);
+
+static inline u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport)
+{
+ union tcp_seq_and_ts_off ts;
+
+ ts = secure_tcp_seq_and_ts_off(&init_net, saddr, daddr,
+ sport, dport);
+
+ return ts.seq;
+}
+
+union tcp_seq_and_ts_off
+secure_tcpv6_seq_and_ts_off(const struct net *net, const __be32 *saddr,
+ const __be32 *daddr,
+ __be16 sport, __be16 dport);
+
+static inline u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
+ __be16 sport, __be16 dport)
+{
+ union tcp_seq_and_ts_off ts;
+
+ ts = secure_tcpv6_seq_and_ts_off(&init_net, saddr, daddr,
+ sport, dport);
+ return ts.seq;
+}
#endif /* _NET_SECURE_SEQ */
diff --git a/include/net/seg6_hmac.h b/include/net/seg6_hmac.h
index 24f733b3e3fe..e9f41725933e 100644
--- a/include/net/seg6_hmac.h
+++ b/include/net/seg6_hmac.h
@@ -9,6 +9,8 @@
#ifndef _NET_SEG6_HMAC_H
#define _NET_SEG6_HMAC_H
+#include <crypto/sha1.h>
+#include <crypto/sha2.h>
#include <net/flow.h>
#include <net/ip6_fib.h>
#include <net/sock.h>
@@ -19,7 +21,6 @@
#include <linux/seg6_hmac.h>
#include <linux/rhashtable-types.h>
-#define SEG6_HMAC_MAX_DIGESTSIZE 160
#define SEG6_HMAC_RING_SIZE 256
struct seg6_hmac_info {
@@ -27,16 +28,15 @@ struct seg6_hmac_info {
struct rcu_head rcu;
u32 hmackeyid;
+ /* The raw key, kept only so it can be returned back to userspace */
char secret[SEG6_HMAC_SECRET_LEN];
u8 slen;
u8 alg_id;
-};
-
-struct seg6_hmac_algo {
- u8 alg_id;
- char name[64];
- struct crypto_shash * __percpu *tfms;
- struct shash_desc * __percpu *shashs;
+ /* The prepared key, which the calculations actually use */
+ union {
+ struct hmac_sha1_key sha1;
+ struct hmac_sha256_key sha256;
+ } key;
};
extern int seg6_hmac_compute(struct seg6_hmac_info *hinfo,
@@ -50,13 +50,9 @@ extern int seg6_push_hmac(struct net *net, struct in6_addr *saddr,
struct ipv6_sr_hdr *srh);
extern bool seg6_hmac_validate_skb(struct sk_buff *skb);
#ifdef CONFIG_IPV6_SEG6_HMAC
-extern int seg6_hmac_init(void);
-extern void seg6_hmac_exit(void);
extern int seg6_hmac_net_init(struct net *net);
extern void seg6_hmac_net_exit(struct net *net);
#else
-static inline int seg6_hmac_init(void) { return 0; }
-static inline void seg6_hmac_exit(void) {}
static inline int seg6_hmac_net_init(struct net *net) { return 0; }
static inline void seg6_hmac_net_exit(struct net *net) {}
#endif
diff --git a/include/net/selftests.h b/include/net/selftests.h
index e65e8d230d33..c36e07406ad4 100644
--- a/include/net/selftests.h
+++ b/include/net/selftests.h
@@ -3,9 +3,48 @@
#define _NET_SELFTESTS
#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+struct net_packet_attrs {
+ const unsigned char *src;
+ const unsigned char *dst;
+ u32 ip_src;
+ u32 ip_dst;
+ bool tcp;
+ u16 sport;
+ u16 dport;
+ int timeout;
+ int size;
+ int max_size;
+ u8 id;
+ u16 queue_mapping;
+ bool bad_csum;
+};
+
+struct net_test_priv {
+ struct net_packet_attrs *packet;
+ struct packet_type pt;
+ struct completion comp;
+ int double_vlan;
+ int vlan_id;
+ int ok;
+};
+
+struct netsfhdr {
+ __be32 version;
+ __be64 magic;
+ u8 id;
+} __packed;
+
+#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \
+ sizeof(struct netsfhdr))
+#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL
+#define NET_LB_TIMEOUT msecs_to_jiffies(200)
#if IS_ENABLED(CONFIG_NET_SELFTESTS)
+struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id,
+ struct net_packet_attrs *attr);
void net_selftest(struct net_device *ndev, struct ethtool_test *etest,
u64 *buf);
int net_selftest_get_count(void);
@@ -13,6 +52,12 @@ void net_selftest_get_strings(u8 *data);
#else
+static inline struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id,
+ struct net_packet_attrs *attr)
+{
+ return NULL;
+}
+
static inline void net_selftest(struct net_device *ndev, struct ethtool_test *etest,
u64 *buf)
{
diff --git a/include/net/smc.h b/include/net/smc.h
index db84e4e35080..bfdc4c41f019 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -15,8 +15,10 @@
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/wait.h>
-#include "linux/ism.h"
+#include <linux/dibs.h>
+struct tcp_sock;
+struct inet_request_sock;
struct sock;
#define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */
@@ -27,62 +29,15 @@ struct smc_hashinfo {
};
/* SMCD/ISM device driver interface */
-struct smcd_dmb {
- u64 dmb_tok;
- u64 rgid;
- u32 dmb_len;
- u32 sba_idx;
- u32 vlan_valid;
- u32 vlan_id;
- void *cpu_addr;
- dma_addr_t dma_addr;
-};
-
-#define ISM_EVENT_DMB 0
-#define ISM_EVENT_GID 1
-#define ISM_EVENT_SWR 2
-
#define ISM_RESERVED_VLANID 0x1FFF
-#define ISM_ERROR 0xFFFF
-
-struct smcd_dev;
-
struct smcd_gid {
u64 gid;
u64 gid_ext;
};
-struct smcd_ops {
- int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid,
- u32 vid_valid, u32 vid);
- int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb,
- void *client);
- int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
- int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx,
- bool sf, unsigned int offset, void *data,
- unsigned int size);
- int (*supports_v2)(void);
- void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid);
- u16 (*get_chid)(struct smcd_dev *dev);
- struct device* (*get_dev)(struct smcd_dev *dev);
-
- /* optional operations */
- int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
- int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
- int (*set_vlan_required)(struct smcd_dev *dev);
- int (*reset_vlan_required)(struct smcd_dev *dev);
- int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid,
- u32 trigger_irq, u32 event_code, u64 info);
- int (*support_dmb_nocopy)(struct smcd_dev *dev);
- int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
- int (*detach_dmb)(struct smcd_dev *dev, u64 token);
-};
-
struct smcd_dev {
- const struct smcd_ops *ops;
- void *priv;
- void *client;
+ struct dibs_dev *dibs;
struct list_head list;
spinlock_t lock;
struct smc_connection **conn;
@@ -97,4 +52,55 @@ struct smcd_dev {
u8 going_away : 1;
};
+#define SMC_HS_CTRL_NAME_MAX 16
+
+enum {
+ /* ops can be inherit from init_net */
+ SMC_HS_CTRL_FLAG_INHERITABLE = 0x1,
+
+ SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE,
+};
+
+struct smc_hs_ctrl {
+ /* private */
+
+ struct list_head list;
+ struct module *owner;
+
+ /* public */
+
+ /* unique name */
+ char name[SMC_HS_CTRL_NAME_MAX];
+ int flags;
+
+ /* Invoked before computing SMC option for SYN packets.
+ * We can control whether to set SMC options by returning various value.
+ * Return 0 to disable SMC, or return any other value to enable it.
+ */
+ int (*syn_option)(struct tcp_sock *tp);
+
+ /* Invoked before Set up SMC options for SYN-ACK packets
+ * We can control whether to respond SMC options by returning various
+ * value. Return 0 to disable SMC, or return any other value to enable
+ * it.
+ */
+ int (*synack_option)(const struct tcp_sock *tp,
+ struct inet_request_sock *ireq);
+};
+
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+#define smc_call_hsbpf(init_val, tp, func, ...) ({ \
+ typeof(init_val) __ret = (init_val); \
+ struct smc_hs_ctrl *ctrl; \
+ rcu_read_lock(); \
+ ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl); \
+ if (ctrl && ctrl->func) \
+ __ret = ctrl->func(tp, ##__VA_ARGS__); \
+ rcu_read_unlock(); \
+ __ret; \
+})
+#else
+#define smc_call_hsbpf(init_val, tp, ...) ({ (void)(tp); (init_val); })
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
#endif /* _SMC_H */
diff --git a/include/net/snmp.h b/include/net/snmp.h
index 468a67836e2f..584e70742e9b 100644
--- a/include/net/snmp.h
+++ b/include/net/snmp.h
@@ -36,11 +36,6 @@ struct snmp_mib {
.entry = _entry, \
}
-#define SNMP_MIB_SENTINEL { \
- .name = NULL, \
- .entry = 0, \
-}
-
/*
* We use unsigned longs for most mibs but u64 for ipstats.
*/
@@ -159,7 +154,7 @@ struct linux_tls_mib {
#define __SNMP_ADD_STATS64(mib, field, addend) \
do { \
- __typeof__(*mib) *ptr = raw_cpu_ptr(mib); \
+ TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \
u64_stats_update_begin(&ptr->syncp); \
ptr->mibs[field] += addend; \
u64_stats_update_end(&ptr->syncp); \
@@ -176,8 +171,7 @@ struct linux_tls_mib {
#define SNMP_INC_STATS64(mib, field) SNMP_ADD_STATS64(mib, field, 1)
#define __SNMP_UPD_PO_STATS64(mib, basefield, addend) \
do { \
- __typeof__(*mib) *ptr; \
- ptr = raw_cpu_ptr((mib)); \
+ TYPEOF_UNQUAL(*mib) *ptr = raw_cpu_ptr(mib); \
u64_stats_update_begin(&ptr->syncp); \
ptr->mibs[basefield##PKTS]++; \
ptr->mibs[basefield##OCTETS] += addend; \
diff --git a/include/net/sock.h b/include/net/sock.h
index 7ef728324e4e..dccd3738c368 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -81,8 +81,13 @@
* mini-semaphore synchronizes multiple users amongst themselves.
*/
typedef struct {
- spinlock_t slock;
- int owned;
+ union {
+ struct slock_owned {
+ int owned;
+ spinlock_t slock;
+ };
+ long combined;
+ };
wait_queue_head_t wq;
/*
* We express the mutex-alike socket_lock semantics
@@ -118,16 +123,17 @@ typedef __u64 __bitwise __addrpair;
* @skc_reuseport: %SO_REUSEPORT setting
* @skc_ipv6only: socket is IPV6 only
* @skc_net_refcnt: socket is using net ref counting
+ * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb
* @skc_bound_dev_if: bound device index if != 0
* @skc_bind_node: bind hash linkage for various protocol lookup tables
- * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
+ * @skc_portaddr_node: second hash linkage for UDP
* @skc_prot: protocol handlers inside a network family
* @skc_net: reference to the network namespace of this socket
* @skc_v6_daddr: IPV6 destination address
* @skc_v6_rcv_saddr: IPV6 source address
* @skc_cookie: socket's cookie value
* @skc_node: main hash linkage for various protocol lookup tables
- * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
+ * @skc_nulls_node: main hash linkage for TCP
* @skc_tx_queue_mapping: tx queue number for this connection
* @skc_rx_queue_mapping: rx queue number for this connection
* @skc_flags: place holder for sk_flags
@@ -174,6 +180,7 @@ struct sock_common {
unsigned char skc_reuseport:1;
unsigned char skc_ipv6only:1;
unsigned char skc_net_refcnt:1;
+ unsigned char skc_bypass_prot_mem:1;
int skc_bound_dev_if;
union {
struct hlist_node skc_bind_node;
@@ -249,6 +256,7 @@ struct sk_filter;
* @sk_dst_cache: destination cache
* @sk_dst_pending_confirm: need to confirm neighbour
* @sk_policy: flow policy
+ * @psp_assoc: PSP association, if socket is PSP-secured
* @sk_receive_queue: incoming packets
* @sk_wmem_alloc: transmit queue bytes committed
* @sk_tsq_flags: TCP Small Queues flags
@@ -282,9 +290,11 @@ struct sk_filter;
* @sk_err_soft: errors that don't cause failure but are the cause of a
* persistent failure not just 'timed out'
* @sk_drops: raw/udp drops counter
+ * @sk_drop_counters: optional pointer to numa_drop_counters
* @sk_ack_backlog: current listen backlog
* @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner
+ * @sk_ino: inode number (zero if orphaned)
* @sk_prefer_busy_poll: prefer busypolling over softirq processing
* @sk_busy_poll_budget: napi processing budget when busypolling
* @sk_priority: %SO_PRIORITY setting
@@ -300,15 +310,19 @@ struct sk_filter;
* @sk_txrehash: enable TX hash rethink
* @sk_filter: socket filtering instructions
* @sk_timer: sock cleanup timer
+ * @tcp_retransmit_timer: tcp retransmit timer
+ * @mptcp_retransmit_timer: mptcp retransmit timer
* @sk_stamp: time stamp of last packet received
* @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
* @sk_tsflags: SO_TIMESTAMPING flags
+ * @sk_bpf_cb_flags: used in bpf_setsockopt()
* @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
* Sockets that can be used under memory reclaim should
* set this to false.
* @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
* for timestamping
* @sk_tskey: counter to disambiguate concurrent tstamp requests
+ * @sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh.
* @sk_zckey: counter to order MSG_ZEROCOPY notifications
* @sk_socket: Identd and reporting IO signals
* @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
@@ -332,12 +346,21 @@ struct sk_filter;
* @sk_reuseport_cb: reuseport group container
* @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
* @sk_rcu: used during RCU grace period
+ * @sk_freeptr: used for SLAB_TYPESAFE_BY_RCU managed sockets
* @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
* @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
+ * @sk_scm_recv_flags: all flags used by scm_recv()
+ * @sk_scm_credentials: flagged by SO_PASSCRED to recv SCM_CREDENTIALS
+ * @sk_scm_security: flagged by SO_PASSSEC to recv SCM_SECURITY
+ * @sk_scm_pidfd: flagged by SO_PASSPIDFD to recv SCM_PIDFD
+ * @sk_scm_rights: flagged by SO_PASSRIGHTS to recv SCM_RIGHTS
+ * @sk_scm_unused: unused flags for scm_recv()
* @ns_tracker: tracker for netns reference
* @sk_user_frags: xarray of pages the user is holding a reference on.
+ * @sk_owner: reference to the real owner of the socket that calls
+ * sock_lock_init_class_and_name().
*/
struct sock {
/*
@@ -368,6 +391,7 @@ struct sock {
#define sk_reuseport __sk_common.skc_reuseport
#define sk_ipv6only __sk_common.skc_ipv6only
#define sk_net_refcnt __sk_common.skc_net_refcnt
+#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot
@@ -434,10 +458,15 @@ struct sock {
__cacheline_group_begin(sock_read_rxtx);
int sk_err;
struct socket *sk_socket;
+#ifdef CONFIG_MEMCG
struct mem_cgroup *sk_memcg;
+#endif
#ifdef CONFIG_XFRM
struct xfrm_policy __rcu *sk_policy[2];
#endif
+#if IS_ENABLED(CONFIG_INET_PSP)
+ struct psp_assoc __rcu *psp_assoc;
+#endif
__cacheline_group_end(sock_read_rxtx);
__cacheline_group_begin(sock_write_rxtx);
@@ -450,7 +479,7 @@ struct sock {
__cacheline_group_begin(sock_write_tx);
int sk_write_pending;
atomic_t sk_omem_alloc;
- int sk_sndbuf;
+ int sk_err_soft;
int sk_wmem_queued;
refcount_t sk_wmem_alloc;
@@ -460,21 +489,28 @@ struct sock {
struct rb_root tcp_rtx_queue;
};
struct sk_buff_head sk_write_queue;
- u32 sk_dst_pending_confirm;
- u32 sk_pacing_status; /* see enum sk_pacing */
struct page_frag sk_frag;
- struct timer_list sk_timer;
-
+ union {
+ struct timer_list sk_timer;
+ struct timer_list tcp_retransmit_timer;
+ struct timer_list mptcp_retransmit_timer;
+ };
unsigned long sk_pacing_rate; /* bytes per second */
atomic_t sk_zckey;
atomic_t sk_tskey;
+ unsigned long sk_tx_queue_mapping_jiffies;
__cacheline_group_end(sock_write_tx);
__cacheline_group_begin(sock_read_tx);
+ u32 sk_dst_pending_confirm;
+ u32 sk_pacing_status; /* see enum sk_pacing */
unsigned long sk_max_pacing_rate;
long sk_sndtimeo;
u32 sk_priority;
u32 sk_mark;
+ kuid_t sk_uid;
+ u16 sk_protocol;
+ u16 sk_type;
struct dst_entry __rcu *sk_dst_cache;
netdev_features_t sk_route_caps;
#ifdef CONFIG_SOCK_VALIDATE_XMIT
@@ -487,6 +523,7 @@ struct sock {
unsigned int sk_gso_max_size;
gfp_t sk_allocation;
u32 sk_txhash;
+ int sk_sndbuf;
u8 sk_pacing_shift;
bool sk_use_task_frag;
__cacheline_group_end(sock_read_tx);
@@ -500,15 +537,12 @@ struct sock {
sk_no_check_tx : 1,
sk_no_check_rx : 1;
u8 sk_shutdown;
- u16 sk_type;
- u16 sk_protocol;
unsigned long sk_lingertime;
struct proto *sk_prot_creator;
rwlock_t sk_callback_lock;
- int sk_err_soft;
u32 sk_ack_backlog;
u32 sk_max_ack_backlog;
- kuid_t sk_uid;
+ u64 sk_ino;
spinlock_t sk_peer_lock;
int sk_bind_phc;
struct pid *sk_peer_pid;
@@ -520,11 +554,23 @@ struct sock {
#endif
int sk_disconnects;
- u8 sk_txrehash;
+ union {
+ u8 sk_txrehash;
+ u8 sk_scm_recv_flags;
+ struct {
+ u8 sk_scm_credentials : 1,
+ sk_scm_security : 1,
+ sk_scm_pidfd : 1,
+ sk_scm_rights : 1,
+ sk_scm_unused : 4;
+ };
+ };
u8 sk_clockid;
u8 sk_txtime_deadline_mode : 1,
sk_txtime_report_errors : 1,
sk_txtime_unused : 6;
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+ u8 sk_bpf_cb_flags;
void *sk_user_data;
#ifdef CONFIG_SECURITY
@@ -541,9 +587,21 @@ struct sock {
#ifdef CONFIG_BPF_SYSCALL
struct bpf_local_storage __rcu *sk_bpf_storage;
#endif
- struct rcu_head sk_rcu;
+ struct numa_drop_counters *sk_drop_counters;
+ /* sockets using SLAB_TYPESAFE_BY_RCU can use sk_freeptr.
+ * By the time kfree() is called, sk_rcu can not be in
+ * use and can be mangled.
+ */
+ union {
+ struct rcu_head sk_rcu;
+ freeptr_t sk_freeptr;
+ };
netns_tracker ns_tracker;
struct xarray sk_user_frags;
+
+#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
+ struct module *sk_owner;
+#endif
};
struct sock_bh_locked {
@@ -793,11 +851,9 @@ static inline bool sk_del_node_init(struct sock *sk)
{
bool rc = __sk_del_node_init(sk);
- if (rc) {
- /* paranoid for a while -acme */
- WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
+ if (rc)
__sock_put(sk);
- }
+
return rc;
}
#define sk_del_node_init_rcu(sk) sk_del_node_init(sk)
@@ -815,14 +871,25 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
{
bool rc = __sk_nulls_del_node_init_rcu(sk);
- if (rc) {
- /* paranoid for a while -acme */
- WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
+ if (rc)
__sock_put(sk);
- }
+
return rc;
}
+static inline bool sk_nulls_replace_node_init_rcu(struct sock *old,
+ struct sock *new)
+{
+ if (sk_hashed(old)) {
+ hlist_nulls_replace_init_rcu(&old->sk_nulls_node,
+ &new->sk_nulls_node);
+ __sock_put(old);
+ return true;
+ }
+
+ return false;
+}
+
static inline void __sk_add_node(struct sock *sk, struct hlist_head *list)
{
hlist_add_head(&sk->sk_node, list);
@@ -954,6 +1021,7 @@ enum sock_flags {
SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */
SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */
+ SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */
};
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -1224,10 +1292,10 @@ struct proto {
void (*close)(struct sock *sk,
long timeout);
int (*pre_connect)(struct sock *sk,
- struct sockaddr *uaddr,
+ struct sockaddr_unsized *uaddr,
int addr_len);
int (*connect)(struct sock *sk,
- struct sockaddr *uaddr,
+ struct sockaddr_unsized *uaddr,
int addr_len);
int (*disconnect)(struct sock *sk, int flags);
@@ -1253,12 +1321,12 @@ struct proto {
int (*sendmsg)(struct sock *sk, struct msghdr *msg,
size_t len);
int (*recvmsg)(struct sock *sk, struct msghdr *msg,
- size_t len, int flags, int *addr_len);
+ size_t len, int flags);
void (*splice_eof)(struct socket *sock);
int (*bind)(struct sock *sk,
- struct sockaddr *addr, int addr_len);
+ struct sockaddr_unsized *addr, int addr_len);
int (*bind_add)(struct sock *sk,
- struct sockaddr *addr, int addr_len);
+ struct sockaddr_unsized *addr, int addr_len);
int (*backlog_rcv) (struct sock *sk,
struct sk_buff *skb);
@@ -1284,10 +1352,6 @@ struct proto {
unsigned int inuse_idx;
#endif
-#if IS_ENABLED(CONFIG_MPTCP)
- int (*forward_alloc_get)(const struct sock *sk);
-#endif
-
bool (*stream_memory_free)(const struct sock *sk, int wake);
bool (*sock_is_readable)(struct sock *sk);
/* Memory pressure */
@@ -1317,19 +1381,17 @@ struct proto {
struct kmem_cache *slab;
unsigned int obj_size;
+ unsigned int freeptr_offset;
unsigned int ipv6_pinfo_offset;
slab_flags_t slab_flags;
unsigned int useroffset; /* Usercopy region offset */
unsigned int usersize; /* Usercopy region size */
- unsigned int __percpu *orphan_count;
-
struct request_sock_ops *rsk_prot;
struct timewait_sock_ops *twsk_prot;
union {
struct inet_hashinfo *hashinfo;
- struct udp_table *udp_table;
struct raw_hashinfo *raw_hash;
struct smc_hashinfo *smc_hash;
} h;
@@ -1348,15 +1410,6 @@ int sock_load_diag_module(int family, int protocol);
INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));
-static inline int sk_forward_alloc_get(const struct sock *sk)
-{
-#if IS_ENABLED(CONFIG_MPTCP)
- if (sk->sk_prot->forward_alloc_get)
- return sk->sk_prot->forward_alloc_get(sk);
-#endif
- return READ_ONCE(sk->sk_forward_alloc);
-}
-
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
@@ -1473,6 +1526,10 @@ static inline int __sk_prot_rehash(struct sock *sk)
#define SOCK_BINDADDR_LOCK 4
#define SOCK_BINDPORT_LOCK 8
+/**
+ * define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time
+ */
+#define SOCK_CONNECT_BIND 16
struct socket_alloc {
struct socket socket;
@@ -1540,7 +1597,7 @@ __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
}
static inline bool
-sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
+sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size)
{
return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
}
@@ -1592,6 +1649,37 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
sk_mem_reclaim(sk);
}
+void __sk_charge(struct sock *sk, gfp_t gfp);
+
+#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
+static inline void sk_owner_set(struct sock *sk, struct module *owner)
+{
+ __module_get(owner);
+ sk->sk_owner = owner;
+}
+
+static inline void sk_owner_clear(struct sock *sk)
+{
+ sk->sk_owner = NULL;
+}
+
+static inline void sk_owner_put(struct sock *sk)
+{
+ module_put(sk->sk_owner);
+}
+#else
+static inline void sk_owner_set(struct sock *sk, struct module *owner)
+{
+}
+
+static inline void sk_owner_clear(struct sock *sk)
+{
+}
+
+static inline void sk_owner_put(struct sock *sk)
+{
+}
+#endif
/*
* Macro so as to not evaluate some arguments when
* lockdep is not enabled.
@@ -1601,13 +1689,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)
*/
#define sock_lock_init_class_and_name(sk, sname, skey, name, key) \
do { \
+ sk_owner_set(sk, THIS_MODULE); \
sk->sk_lock.owned = 0; \
init_waitqueue_head(&sk->sk_lock.wq); \
spin_lock_init(&(sk)->sk_lock.slock); \
debug_check_no_locks_freed((void *)&(sk)->sk_lock, \
- sizeof((sk)->sk_lock)); \
+ sizeof((sk)->sk_lock)); \
lockdep_set_class_and_name(&(sk)->sk_lock.slock, \
- (skey), (sname)); \
+ (skey), (sname)); \
lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \
} while (0)
@@ -1624,7 +1713,6 @@ static inline void lock_sock(struct sock *sk)
lock_sock_nested(sk, 0);
}
-void __lock_sock(struct sock *sk);
void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);
@@ -1753,8 +1841,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
void sk_free(struct sock *sk);
void sk_net_refcnt_upgrade(struct sock *sk);
void sk_destruct(struct sock *sk);
-struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
-void sk_free_unlock_clone(struct sock *sk);
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority, bool lock);
+
+static inline struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
+{
+ return sk_clone(sk, priority, true);
+}
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority);
@@ -1806,6 +1898,8 @@ static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
}
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
+void *sock_kmemdup(struct sock *sk, const void *src,
+ int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);
@@ -1823,12 +1917,14 @@ struct sockcm_cookie {
u32 tsflags;
u32 ts_opt_id;
u32 priority;
+ u32 dmabuf_id;
};
static inline void sockcm_init(struct sockcm_cookie *sockc,
const struct sock *sk)
{
*sockc = (struct sockcm_cookie) {
+ .mark = READ_ONCE(sk->sk_mark),
.tsflags = READ_ONCE(sk->sk_tsflags),
.priority = READ_ONCE(sk->sk_priority),
};
@@ -1843,8 +1939,8 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
* Functions to fill in entries in struct proto_ops when a protocol
* does not implement a particular function.
*/
-int sock_no_bind(struct socket *, struct sockaddr *, int);
-int sock_no_connect(struct socket *, struct sockaddr *, int, int);
+int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len);
+int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
int sock_no_getname(struct socket *, struct sockaddr *, int);
@@ -1934,7 +2030,15 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
/* Paired with READ_ONCE() in sk_tx_queue_get() and
* other WRITE_ONCE() because socket lock might be not held.
*/
- WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+ if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) {
+ WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
+ WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
+ return;
+ }
+
+ /* Refresh sk_tx_queue_mapping_jiffies if too old. */
+ if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ))
+ WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies);
}
#define NO_QUEUE_MAPPING USHRT_MAX
@@ -1947,19 +2051,7 @@ static inline void sk_tx_queue_clear(struct sock *sk)
WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING);
}
-static inline int sk_tx_queue_get(const struct sock *sk)
-{
- if (sk) {
- /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
- * and sk_tx_queue_set().
- */
- int val = READ_ONCE(sk->sk_tx_queue_mapping);
-
- if (val != NO_QUEUE_MAPPING)
- return val;
- }
- return -1;
-}
+int sk_tx_queue_get(const struct sock *sk);
static inline void __sk_rx_queue_set(struct sock *sk,
const struct sk_buff *skb,
@@ -2009,7 +2101,14 @@ static inline int sk_rx_queue_get(const struct sock *sk)
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
- sk->sk_socket = sock;
+ WRITE_ONCE(sk->sk_socket, sock);
+ if (sock) {
+ WRITE_ONCE(sk->sk_uid, SOCK_INODE(sock)->i_uid);
+ WRITE_ONCE(sk->sk_ino, SOCK_INODE(sock)->i_ino);
+ } else {
+ /* Note: sk_uid is unchanged. */
+ WRITE_ONCE(sk->sk_ino, 0);
+ }
}
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
@@ -2040,18 +2139,25 @@ static inline void sock_graft(struct sock *sk, struct socket *parent)
rcu_assign_pointer(sk->sk_wq, &parent->wq);
parent->sk = sk;
sk_set_socket(sk, parent);
- sk->sk_uid = SOCK_INODE(parent)->i_uid;
security_sock_graft(sk, parent);
write_unlock_bh(&sk->sk_callback_lock);
}
-kuid_t sock_i_uid(struct sock *sk);
-unsigned long __sock_i_ino(struct sock *sk);
-unsigned long sock_i_ino(struct sock *sk);
+static inline u64 sock_i_ino(const struct sock *sk)
+{
+ /* Paired with WRITE_ONCE() in sock_graft() and sock_orphan() */
+ return READ_ONCE(sk->sk_ino);
+}
+
+static inline kuid_t sk_uid(const struct sock *sk)
+{
+ /* Paired with WRITE_ONCE() in sockfs_setattr() */
+ return READ_ONCE(sk->sk_uid);
+}
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
- return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+ return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0);
}
static inline u32 net_tx_rndhash(void)
@@ -2231,6 +2337,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
return 0;
}
+#define SK_WMEM_ALLOC_BIAS 1
/**
* sk_wmem_alloc_get - returns write allocations
* @sk: socket
@@ -2239,7 +2346,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro
*/
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
- return refcount_read(&sk->sk_wmem_alloc) - 1;
+ return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS;
}
/**
@@ -2395,12 +2502,23 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
-int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
- enum skb_drop_reason *reason);
+enum skb_drop_reason
+sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb);
static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- return sock_queue_rcv_skb_reason(sk, skb, NULL);
+ enum skb_drop_reason drop_reason = sock_queue_rcv_skb_reason(sk, skb);
+
+ switch (drop_reason) {
+ case SKB_DROP_REASON_SOCKET_RCVBUFF:
+ return -ENOMEM;
+ case SKB_DROP_REASON_PROTO_MEM:
+ return -ENOBUFS;
+ case 0:
+ return 0;
+ default:
+ return -EPERM;
+ }
}
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
@@ -2524,12 +2642,16 @@ static inline struct page_frag *sk_page_frag(struct sock *sk)
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
+static inline bool __sock_writeable(const struct sock *sk, int wmem_alloc)
+{
+ return wmem_alloc < (READ_ONCE(sk->sk_sndbuf) >> 1);
+}
/*
* Default write policy as shown to user space via poll/select/SIGIO
*/
static inline bool sock_writeable(const struct sock *sk)
{
- return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
+ return __sock_writeable(sk, refcount_read(&sk->sk_wmem_alloc));
}
static inline gfp_t gfp_any(void)
@@ -2542,14 +2664,62 @@ static inline gfp_t gfp_memcg_charge(void)
return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
}
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
+{
+ return sk->sk_memcg;
+}
+
+static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
+{
+ return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk);
+}
+
+static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_sk(sk);
+
+#ifdef CONFIG_MEMCG_V1
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return !!memcg->tcpmem_pressure;
+#endif /* CONFIG_MEMCG_V1 */
+
+ do {
+ if (time_before64(get_jiffies_64(),
+ mem_cgroup_get_socket_pressure(memcg))) {
+ memcg_memory_event(mem_cgroup_from_sk(sk),
+ MEMCG_SOCK_THROTTLED);
+ return true;
+ }
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ return false;
+}
+#else
+static inline struct mem_cgroup *mem_cgroup_from_sk(const struct sock *sk)
+{
+ return NULL;
+}
+
+static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
+{
+ return false;
+}
+
+static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk)
+{
+ return false;
+}
+#endif
+
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
- return noblock ? 0 : sk->sk_rcvtimeo;
+ return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo);
}
static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
- return noblock ? 0 : sk->sk_sndtimeo;
+ return noblock ? 0 : READ_ONCE(sk->sk_sndtimeo);
}
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
@@ -2575,8 +2745,8 @@ struct sock_skb_cb {
* using skb->cb[] would keep using it directly and utilize its
* alignment guarantee.
*/
-#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
- sizeof(struct sock_skb_cb)))
+#define SOCK_SKB_CB_OFFSET (sizeof_field(struct sk_buff, cb) - \
+ sizeof(struct sock_skb_cb))
#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
SOCK_SKB_CB_OFFSET))
@@ -2584,18 +2754,53 @@ struct sock_skb_cb {
#define sock_skb_cb_check_size(size) \
BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)
+static inline void sk_drops_add(struct sock *sk, int segs)
+{
+ struct numa_drop_counters *ndc = sk->sk_drop_counters;
+
+ if (ndc)
+ numa_drop_add(ndc, segs);
+ else
+ atomic_add(segs, &sk->sk_drops);
+}
+
+static inline void sk_drops_inc(struct sock *sk)
+{
+ sk_drops_add(sk, 1);
+}
+
+static inline int sk_drops_read(const struct sock *sk)
+{
+ const struct numa_drop_counters *ndc = sk->sk_drop_counters;
+
+ if (ndc) {
+ DEBUG_NET_WARN_ON_ONCE(atomic_read(&sk->sk_drops));
+ return numa_drop_read(ndc);
+ }
+ return atomic_read(&sk->sk_drops);
+}
+
+static inline void sk_drops_reset(struct sock *sk)
+{
+ struct numa_drop_counters *ndc = sk->sk_drop_counters;
+
+ if (ndc)
+ numa_drop_reset(ndc);
+ atomic_set(&sk->sk_drops, 0);
+}
+
static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
- atomic_read(&sk->sk_drops) : 0;
+ sk_drops_read(sk) : 0;
}
-static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
+static inline void sk_drops_skbadd(struct sock *sk, const struct sk_buff *skb)
{
int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
- atomic_add(segs, &sk->sk_drops);
+ sk_drops_add(sk, segs);
}
static inline ktime_t sock_read_timestamp(struct sock *sk)
@@ -2631,6 +2836,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb);
+bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk);
+int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk,
+ struct timespec64 *ts);
+
static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
@@ -2665,13 +2874,13 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
{
#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_RCVTSTAMP) | \
- (1UL << SOCK_RCVMARK) |\
- (1UL << SOCK_RCVPRIORITY))
+ (1UL << SOCK_RCVMARK) | \
+ (1UL << SOCK_RCVPRIORITY) | \
+ (1UL << SOCK_TIMESTAMPING_ANY))
#define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \
SOF_TIMESTAMPING_RAW_HARDWARE)
- if (sk->sk_flags & FLAGS_RECV_CMSGS ||
- READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY)
+ if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS)
__sock_recv_cmsgs(msg, sk, skb);
else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
sock_write_timestamp(sk, skb->tstamp);
@@ -2706,8 +2915,6 @@ static inline void _sock_tx_timestamp(struct sock *sk,
*tskey = atomic_inc_return(&sk->sk_tskey) - 1;
}
}
- if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
- *tx_flags |= SKBTX_WIFI_STATUS;
}
static inline void sock_tx_timestamp(struct sock *sk,
@@ -2745,9 +2952,14 @@ static inline bool sk_is_udp(const struct sock *sk)
sk->sk_protocol == IPPROTO_UDP;
}
+static inline bool sk_is_unix(const struct sock *sk)
+{
+ return sk->sk_family == AF_UNIX;
+}
+
static inline bool sk_is_stream_unix(const struct sock *sk)
{
- return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM;
+ return sk_is_unix(sk) && sk->sk_type == SOCK_STREAM;
}
static inline bool sk_is_vsock(const struct sock *sk)
@@ -2755,6 +2967,13 @@ static inline bool sk_is_vsock(const struct sock *sk)
return sk->sk_family == AF_VSOCK;
}
+static inline bool sk_may_scm_recv(const struct sock *sk)
+{
+ return (IS_ENABLED(CONFIG_UNIX) && sk->sk_family == AF_UNIX) ||
+ sk->sk_family == AF_NETLINK ||
+ (IS_ENABLED(CONFIG_BT) && sk->sk_family == AF_BLUETOOTH);
+}
+
/**
* sk_eat_skb - Release a skb if it is no longer needed
* @sk: socket to eat this skb from
@@ -2794,26 +3013,10 @@ sk_is_refcounted(struct sock *sk)
return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
}
-/* Checks if this SKB belongs to an HW offloaded socket
- * and whether any SW fallbacks are required based on dev.
- * Check decrypted mark in case skb_orphan() cleared socket.
- */
-static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
- struct net_device *dev)
+static inline bool
+sk_requests_wifi_status(struct sock *sk)
{
-#ifdef CONFIG_SOCK_VALIDATE_XMIT
- struct sock *sk = skb->sk;
-
- if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
- skb = sk->sk_validate_xmit_skb(sk, dev, skb);
- } else if (unlikely(skb_is_decrypted(skb))) {
- pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
- kfree_skb(skb);
- skb = NULL;
- }
-#endif
-
- return skb;
+ return sk && sk_fullsock(sk) && sock_flag(sk, SOCK_WIFI_STATUS);
}
/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV
@@ -2852,8 +3055,8 @@ void sk_get_meminfo(const struct sock *sk, u32 *meminfo);
*/
#define _SK_MEM_PACKETS 256
#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
-#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
-#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_WMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
+#define SK_RMEM_DEFAULT (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;
@@ -2920,7 +3123,13 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
int sock_set_timestamping(struct sock *sk, int optname,
struct so_timestamping timestamping);
-void sock_enable_timestamps(struct sock *sk);
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
+#else
+static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+}
+#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
@@ -2930,7 +3139,7 @@ void sock_set_reuseaddr(struct sock *sk);
void sock_set_reuseport(struct sock *sk);
void sock_set_sndtimeo(struct sock *sk, s64 secs);
-int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);
+int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len);
int sock_get_timeout(long timeo, void *optval, bool old_timeval);
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
@@ -2941,8 +3150,11 @@ int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
static inline bool sk_is_readable(struct sock *sk)
{
- if (sk->sk_prot->sock_is_readable)
- return sk->sk_prot->sock_is_readable(sk);
+ const struct proto *prot = READ_ONCE(sk->sk_prot);
+
+ if (prot->sock_is_readable)
+ return prot->sock_is_readable(sk);
+
return false;
}
#endif /* _SOCK_H */
diff --git a/include/net/strparser.h b/include/net/strparser.h
index 0a83010b3a64..0ed73e364faa 100644
--- a/include/net/strparser.h
+++ b/include/net/strparser.h
@@ -114,8 +114,6 @@ static inline void strp_pause(struct strparser *strp)
/* May be called without holding lock for attached socket */
void strp_unpause(struct strparser *strp);
-/* Must be called with process lock held (lock_sock) */
-void __strp_unpause(struct strparser *strp);
static inline void save_strp_stats(struct strparser *strp,
struct strp_aggr_stats *agg_stats)
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 8346b0d29542..ee500706496b 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -15,6 +15,7 @@
#define SWITCHDEV_F_NO_RECURSE BIT(0)
#define SWITCHDEV_F_SKIP_EOPNOTSUPP BIT(1)
#define SWITCHDEV_F_DEFER BIT(2)
+#define SWITCHDEV_F_NO_FOREIGN BIT(3)
enum switchdev_attr_id {
SWITCHDEV_ATTR_ID_UNDEFINED,
diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h
index e8dd77a96748..a5ce83f3eea4 100644
--- a/include/net/tc_act/tc_connmark.h
+++ b/include/net/tc_act/tc_connmark.h
@@ -7,6 +7,7 @@
struct tcf_connmark_parms {
struct net *net;
u16 zone;
+ int action;
struct rcu_head rcu;
};
diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h
index 68269e4581b7..8d0c7a9f9345 100644
--- a/include/net/tc_act/tc_csum.h
+++ b/include/net/tc_act/tc_csum.h
@@ -8,6 +8,7 @@
struct tcf_csum_params {
u32 update_flags;
+ int action;
struct rcu_head rcu;
};
@@ -18,15 +19,6 @@ struct tcf_csum {
};
#define to_tcf_csum(a) ((struct tcf_csum *)a)
-static inline bool is_tcf_csum(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
- if (a->ops && a->ops->id == TCA_ID_CSUM)
- return true;
-#endif
- return false;
-}
-
static inline u32 tcf_csum_update_flags(const struct tc_action *a)
{
u32 update_flags;
diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h
index 77f87c622a2e..8b90c86c0b0d 100644
--- a/include/net/tc_act/tc_ct.h
+++ b/include/net/tc_act/tc_ct.h
@@ -13,7 +13,7 @@ struct tcf_ct_params {
struct nf_conntrack_helper *helper;
struct nf_conn *tmpl;
u16 zone;
-
+ int action;
u32 mark;
u32 mark_mask;
@@ -92,13 +92,4 @@ static inline void
tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { }
#endif
-static inline bool is_tcf_ct(const struct tc_action *a)
-{
-#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK)
- if (a->ops && a->ops->id == TCA_ID_CT)
- return true;
-#endif
- return false;
-}
-
#endif /* __NET_TC_CT_H */
diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h
index f071c1d70a25..7fe01ab236da 100644
--- a/include/net/tc_act/tc_ctinfo.h
+++ b/include/net/tc_act/tc_ctinfo.h
@@ -7,6 +7,7 @@
struct tcf_ctinfo_params {
struct rcu_head rcu;
struct net *net;
+ int action;
u32 dscpmask;
u32 dscpstatemask;
u32 cpmarkmask;
@@ -18,9 +19,9 @@ struct tcf_ctinfo_params {
struct tcf_ctinfo {
struct tc_action common;
struct tcf_ctinfo_params __rcu *params;
- u64 stats_dscp_set;
- u64 stats_dscp_error;
- u64 stats_cpmark_set;
+ atomic64_t stats_dscp_set;
+ atomic64_t stats_dscp_error;
+ atomic64_t stats_cpmark_set;
};
enum {
diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h
index c8fa11ebb397..e0fded18e18c 100644
--- a/include/net/tc_act/tc_gate.h
+++ b/include/net/tc_act/tc_gate.h
@@ -32,6 +32,7 @@ struct tcf_gate_params {
s32 tcfg_clockid;
size_t num_entries;
struct list_head entries;
+ struct rcu_head rcu;
};
#define GATE_ACT_GATE_OPEN BIT(0)
@@ -39,7 +40,7 @@ struct tcf_gate_params {
struct tcf_gate {
struct tc_action common;
- struct tcf_gate_params param;
+ struct tcf_gate_params __rcu *param;
u8 current_gate_status;
ktime_t current_close_time;
u32 current_entry_octets;
@@ -51,56 +52,65 @@ struct tcf_gate {
#define to_gate(a) ((struct tcf_gate *)a)
-static inline bool is_tcf_gate(const struct tc_action *a)
+static inline struct tcf_gate_params *tcf_gate_params_locked(const struct tc_action *a)
{
-#ifdef CONFIG_NET_CLS_ACT
- if (a->ops && a->ops->id == TCA_ID_GATE)
- return true;
-#endif
- return false;
+ struct tcf_gate *gact = to_gate(a);
+
+ return rcu_dereference_protected(gact->param,
+ lockdep_is_held(&gact->tcf_lock));
}
static inline s32 tcf_gate_prio(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
s32 tcfg_prio;
- tcfg_prio = to_gate(a)->param.tcfg_priority;
+ p = tcf_gate_params_locked(a);
+ tcfg_prio = p->tcfg_priority;
return tcfg_prio;
}
static inline u64 tcf_gate_basetime(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
u64 tcfg_basetime;
- tcfg_basetime = to_gate(a)->param.tcfg_basetime;
+ p = tcf_gate_params_locked(a);
+ tcfg_basetime = p->tcfg_basetime;
return tcfg_basetime;
}
static inline u64 tcf_gate_cycletime(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
u64 tcfg_cycletime;
- tcfg_cycletime = to_gate(a)->param.tcfg_cycletime;
+ p = tcf_gate_params_locked(a);
+ tcfg_cycletime = p->tcfg_cycletime;
return tcfg_cycletime;
}
static inline u64 tcf_gate_cycletimeext(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
u64 tcfg_cycletimeext;
- tcfg_cycletimeext = to_gate(a)->param.tcfg_cycletime_ext;
+ p = tcf_gate_params_locked(a);
+ tcfg_cycletimeext = p->tcfg_cycletime_ext;
return tcfg_cycletimeext;
}
static inline u32 tcf_gate_num_entries(const struct tc_action *a)
{
+ struct tcf_gate_params *p;
u32 num_entries;
- num_entries = to_gate(a)->param.num_entries;
+ p = tcf_gate_params_locked(a);
+ num_entries = p->num_entries;
return num_entries;
}
@@ -114,7 +124,7 @@ static inline struct action_gate_entry
u32 num_entries;
int i = 0;
- p = &to_gate(a)->param;
+ p = tcf_gate_params_locked(a);
num_entries = p->num_entries;
list_for_each_entry(entry, &p->entries, list)
@@ -123,7 +133,7 @@ static inline struct action_gate_entry
if (i != num_entries)
return NULL;
- oe = kcalloc(num_entries, sizeof(*oe), GFP_ATOMIC);
+ oe = kzalloc_objs(*oe, num_entries, GFP_ATOMIC);
if (!oe)
return NULL;
diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
index c7f24a2da1ca..24d4d5a62b3c 100644
--- a/include/net/tc_act/tc_ife.h
+++ b/include/net/tc_act/tc_ife.h
@@ -13,15 +13,13 @@ struct tcf_ife_params {
u8 eth_src[ETH_ALEN];
u16 eth_type;
u16 flags;
-
+ struct list_head metalist;
struct rcu_head rcu;
};
struct tcf_ife_info {
struct tc_action common;
struct tcf_ife_params __rcu *params;
- /* list of metaids allowed */
- struct list_head metalist;
};
#define to_ife(a) ((struct tcf_ife_info *)a)
diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h
index 721de4f5733a..dd067bd4018d 100644
--- a/include/net/tc_act/tc_mpls.h
+++ b/include/net/tc_act/tc_mpls.h
@@ -10,6 +10,7 @@
struct tcf_mpls_params {
int tcfm_action;
u32 tcfm_label;
+ int action; /* tcf_action */
u8 tcfm_tc;
u8 tcfm_ttl;
u8 tcfm_bos;
@@ -27,15 +28,6 @@ struct tcf_mpls {
};
#define to_mpls(a) ((struct tcf_mpls *)a)
-static inline bool is_tcf_mpls(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
- if (a->ops && a->ops->id == TCA_ID_MPLS)
- return true;
-#endif
- return false;
-}
-
static inline u32 tcf_mpls_action(const struct tc_action *a)
{
u32 tcfm_action;
diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h
index c869274ac529..ae35f4009445 100644
--- a/include/net/tc_act/tc_nat.h
+++ b/include/net/tc_act/tc_nat.h
@@ -6,6 +6,7 @@
#include <net/act_api.h>
struct tcf_nat_parms {
+ int action;
__be32 old_addr;
__be32 new_addr;
__be32 mask;
diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h
index 83fe39931781..cb7b82f2cbc7 100644
--- a/include/net/tc_act/tc_pedit.h
+++ b/include/net/tc_act/tc_pedit.h
@@ -14,7 +14,7 @@ struct tcf_pedit_key_ex {
struct tcf_pedit_parms {
struct tc_pedit_key *tcfp_keys;
struct tcf_pedit_key_ex *tcfp_keys_ex;
- u32 tcfp_off_max_hint;
+ int action;
unsigned char tcfp_nkeys;
unsigned char tcfp_flags;
struct rcu_head rcu;
diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h
index 283bde711a42..a89fc8e68b1e 100644
--- a/include/net/tc_act/tc_police.h
+++ b/include/net/tc_act/tc_police.h
@@ -5,10 +5,11 @@
#include <net/act_api.h>
struct tcf_police_params {
+ int action;
int tcfp_result;
u32 tcfp_ewma_rate;
- s64 tcfp_burst;
u32 tcfp_mtu;
+ s64 tcfp_burst;
s64 tcfp_mtu_ptoks;
s64 tcfp_pkt_burst;
struct psched_ratecfg rate;
@@ -44,15 +45,6 @@ struct tc_police_compat {
struct tc_ratespec peakrate;
};
-static inline bool is_tcf_police(const struct tc_action *act)
-{
-#ifdef CONFIG_NET_CLS_ACT
- if (act->ops && act->ops->id == TCA_ID_POLICE)
- return true;
-#endif
- return false;
-}
-
static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act)
{
struct tcf_police *police = to_police(act);
diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h
index b5d76305e854..abd163ca1864 100644
--- a/include/net/tc_act/tc_sample.h
+++ b/include/net/tc_act/tc_sample.h
@@ -17,15 +17,6 @@ struct tcf_sample {
};
#define to_sample(a) ((struct tcf_sample *)a)
-static inline bool is_tcf_sample(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
- return a->ops && a->ops->id == TCA_ID_SAMPLE;
-#else
- return false;
-#endif
-}
-
static inline __u32 tcf_sample_rate(const struct tc_action *a)
{
return to_sample(a)->rate;
diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h
index 9649600fb3dc..31b2cd0bebb5 100644
--- a/include/net/tc_act/tc_skbedit.h
+++ b/include/net/tc_act/tc_skbedit.h
@@ -12,6 +12,7 @@
#include <linux/tc_act/tc_skbedit.h>
struct tcf_skbedit_params {
+ int action;
u32 flags;
u32 priority;
u32 mark;
diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h
index 7c240d2fed4e..626704cd6241 100644
--- a/include/net/tc_act/tc_skbmod.h
+++ b/include/net/tc_act/tc_skbmod.h
@@ -12,6 +12,7 @@
struct tcf_skbmod_params {
struct rcu_head rcu;
u64 flags; /*up to 64 types of operations; extend if needed */
+ int action;
u8 eth_dst[ETH_ALEN];
u16 eth_type;
u8 eth_src[ETH_ALEN];
diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h
index 879fe8cff581..0f1925f97520 100644
--- a/include/net/tc_act/tc_tunnel_key.h
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -14,6 +14,7 @@
struct tcf_tunnel_key_params {
struct rcu_head rcu;
int tcft_action;
+ int action;
struct metadata_dst *tcft_enc_metadata;
};
diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h
index 904eddfc1826..beadee41669a 100644
--- a/include/net/tc_act/tc_vlan.h
+++ b/include/net/tc_act/tc_vlan.h
@@ -10,6 +10,7 @@
#include <linux/tc_act/tc_vlan.h>
struct tcf_vlan_params {
+ int action;
int tcfv_action;
unsigned char tcfv_push_dst[ETH_ALEN];
unsigned char tcfv_push_src[ETH_ALEN];
@@ -26,15 +27,6 @@ struct tcf_vlan {
};
#define to_vlan(a) ((struct tcf_vlan *)a)
-static inline bool is_tcf_vlan(const struct tc_action *a)
-{
-#ifdef CONFIG_NET_CLS_ACT
- if (a->ops && a->ops->id == TCA_ID_VLAN)
- return true;
-#endif
- return false;
-}
-
static inline u32 tcf_vlan_action(const struct tc_action *a)
{
u32 tcfv_action;
diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h
index ffe58a02537c..4ebb053bb0dd 100644
--- a/include/net/tc_wrapper.h
+++ b/include/net/tc_wrapper.h
@@ -12,7 +12,8 @@
#define TC_INDIRECT_SCOPE
-extern struct static_key_false tc_skip_wrapper;
+extern struct static_key_false tc_skip_wrapper_act;
+extern struct static_key_false tc_skip_wrapper_cls;
/* TC Actions */
#ifdef CONFIG_NET_CLS_ACT
@@ -46,7 +47,7 @@ TC_INDIRECT_ACTION_DECLARE(tunnel_key_act);
static inline int tc_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
- if (static_branch_likely(&tc_skip_wrapper))
+ if (static_branch_likely(&tc_skip_wrapper_act))
goto skip;
#if IS_BUILTIN(CONFIG_NET_ACT_GACT)
@@ -153,7 +154,7 @@ TC_INDIRECT_FILTER_DECLARE(u32_classify);
static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
- if (static_branch_likely(&tc_skip_wrapper))
+ if (static_branch_likely(&tc_skip_wrapper_cls))
goto skip;
#if IS_BUILTIN(CONFIG_NET_CLS_BPF)
@@ -202,8 +203,44 @@ skip:
static inline void tc_wrapper_init(void)
{
#ifdef CONFIG_X86
- if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE))
- static_branch_enable(&tc_skip_wrapper);
+ int cnt_cls = IS_BUILTIN(CONFIG_NET_CLS_BPF) +
+ IS_BUILTIN(CONFIG_NET_CLS_U32) +
+ IS_BUILTIN(CONFIG_NET_CLS_FLOWER) +
+ IS_BUILTIN(CONFIG_NET_CLS_FW) +
+ IS_BUILTIN(CONFIG_NET_CLS_MATCHALL) +
+ IS_BUILTIN(CONFIG_NET_CLS_BASIC) +
+ IS_BUILTIN(CONFIG_NET_CLS_CGROUP) +
+ IS_BUILTIN(CONFIG_NET_CLS_FLOW) +
+ IS_BUILTIN(CONFIG_NET_CLS_ROUTE4);
+
+ int cnt_act = IS_BUILTIN(CONFIG_NET_ACT_GACT) +
+ IS_BUILTIN(CONFIG_NET_ACT_MIRRED) +
+ IS_BUILTIN(CONFIG_NET_ACT_PEDIT) +
+ IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT) +
+ IS_BUILTIN(CONFIG_NET_ACT_SKBMOD) +
+ IS_BUILTIN(CONFIG_NET_ACT_POLICE) +
+ IS_BUILTIN(CONFIG_NET_ACT_BPF) +
+ IS_BUILTIN(CONFIG_NET_ACT_CONNMARK) +
+ IS_BUILTIN(CONFIG_NET_ACT_CSUM) +
+ IS_BUILTIN(CONFIG_NET_ACT_CT) +
+ IS_BUILTIN(CONFIG_NET_ACT_CTINFO) +
+ IS_BUILTIN(CONFIG_NET_ACT_GATE) +
+ IS_BUILTIN(CONFIG_NET_ACT_MPLS) +
+ IS_BUILTIN(CONFIG_NET_ACT_NAT) +
+ IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY) +
+ IS_BUILTIN(CONFIG_NET_ACT_VLAN) +
+ IS_BUILTIN(CONFIG_NET_ACT_IFE) +
+ IS_BUILTIN(CONFIG_NET_ACT_SIMP) +
+ IS_BUILTIN(CONFIG_NET_ACT_SAMPLE);
+
+ if (cpu_feature_enabled(X86_FEATURE_RETPOLINE))
+ return;
+
+ if (cnt_cls > 1)
+ static_branch_enable(&tc_skip_wrapper_cls);
+
+ if (cnt_act > 1)
+ static_branch_enable(&tc_skip_wrapper_act);
#endif
}
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2d08473a6dc0..98848db62894 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -26,6 +26,7 @@
#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/indirect_call_wrapper.h>
+#include <linux/bits.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
@@ -42,6 +43,7 @@
#include <net/dst.h>
#include <net/mptcp.h>
#include <net/xfrm.h>
+#include <net/secure_seq.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
@@ -53,7 +55,15 @@ extern struct inet_hashinfo tcp_hashinfo;
DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
int tcp_orphan_count_sum(void);
-DECLARE_PER_CPU(u32, tcp_tw_isn);
+static inline void tcp_orphan_count_inc(void)
+{
+ this_cpu_inc(tcp_orphan_count);
+}
+
+static inline void tcp_orphan_count_dec(void)
+{
+ this_cpu_dec(tcp_orphan_count);
+}
void tcp_time_wait(struct sock *sk, int state, int timeo);
@@ -89,6 +99,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE 14U
+/* Default sending frequency of accurate ECN option per RTT */
+#define TCP_ACCECN_OPTION_BEACON 3
+
/* urg_data states */
#define TCP_URG_VALID 0x0100
#define TCP_URG_NOTYET 0x0200
@@ -144,8 +157,9 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCP_DELACK_MIN 4U
#define TCP_ATO_MIN 4U
#endif
-#define TCP_RTO_MAX ((unsigned)(120*HZ))
-#define TCP_RTO_MIN ((unsigned)(HZ/5))
+#define TCP_RTO_MAX_SEC 120
+#define TCP_RTO_MAX ((unsigned)(TCP_RTO_MAX_SEC * HZ))
+#define TCP_RTO_MIN ((unsigned)(HZ / 5))
#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */
#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */
@@ -201,6 +215,8 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOPT_AO 29 /* Authentication Option (RFC5925) */
#define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */
#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */
+#define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */
+#define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */
#define TCPOPT_EXP 254 /* Experimental */
/* Magic number to be after the option value for sharing TCP
* experimental options. See draft-ietf-tcpm-experimental-options-00.txt
@@ -218,6 +234,7 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_MD5SIG 18
#define TCPOLEN_FASTOPEN_BASE 2
+#define TCPOLEN_ACCECN_BASE 2
#define TCPOLEN_EXP_FASTOPEN_BASE 4
#define TCPOLEN_EXP_SMC_BASE 6
@@ -231,6 +248,14 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
#define TCPOLEN_MD5SIG_ALIGNED 20
#define TCPOLEN_MSS_ALIGNED 4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
+#define TCPOLEN_ACCECN_PERFIELD 3
+
+/* Maximum number of byte counters in AccECN option + size */
+#define TCP_ACCECN_NUMFIELDS 3
+#define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \
+ TCPOLEN_ACCECN_PERFIELD * \
+ TCP_ACCECN_NUMFIELDS)
+#define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */
/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
@@ -265,7 +290,6 @@ extern long sysctl_tcp_mem[3];
#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */
#define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */
-extern atomic_long_t tcp_memory_allocated;
DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
extern struct percpu_counter tcp_sockets_allocated;
@@ -274,10 +298,13 @@ extern unsigned long tcp_memory_pressure;
/* optimized version of sk_under_memory_pressure() for TCP sockets */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
{
- if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- mem_cgroup_under_socket_pressure(sk->sk_memcg))
+ if (mem_cgroup_sk_enabled(sk) &&
+ mem_cgroup_sk_under_memory_pressure(sk))
return true;
+ if (sk->sk_bypass_prot_mem)
+ return false;
+
return READ_ONCE(tcp_memory_pressure);
}
/*
@@ -319,13 +346,21 @@ extern struct proto tcp_prot;
#define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
#define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
-void tcp_tasklet_init(void);
+/*
+ * TCP splice context
+ */
+struct tcp_splice_state {
+ struct pipe_inode_info *pipe;
+ size_t len;
+ unsigned int flags;
+};
+
+void tcp_tsq_work_init(void);
int tcp_v4_err(struct sk_buff *skb, u32);
void tcp_shutdown(struct sock *sk, int how);
-int tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_rcv(struct sk_buff *skb);
void tcp_remove_empty_skb(struct sock *sk);
@@ -338,17 +373,34 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
int tcp_wmem_schedule(struct sock *sk, int copy);
void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
int size_goal);
+
void tcp_release_cb(struct sock *sk);
+
+static inline bool tcp_release_cb_cond(struct sock *sk)
+{
+#ifdef CONFIG_INET
+ if (likely(sk->sk_prot->release_cb == tcp_release_cb)) {
+ if (unlikely(smp_load_acquire(&sk->sk_tsq_flags) & TCP_DEFERRED_ALL))
+ tcp_release_cb(sk);
+ return true;
+ }
+#endif
+ return false;
+}
+
void tcp_wfree(struct sk_buff *skb);
void tcp_write_timer_handler(struct sock *sk);
void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, int *karg);
enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
+void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
void tcp_twsk_purge(struct list_head *net_exit_list);
+int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len);
ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
@@ -372,31 +424,71 @@ static inline void tcp_dec_quickack_mode(struct sock *sk)
}
}
-#define TCP_ECN_OK 1
-#define TCP_ECN_QUEUE_CWR 2
-#define TCP_ECN_DEMAND_CWR 4
-#define TCP_ECN_SEEN 8
+#define TCP_ECN_MODE_RFC3168 BIT(0)
+#define TCP_ECN_QUEUE_CWR BIT(1)
+#define TCP_ECN_DEMAND_CWR BIT(2)
+#define TCP_ECN_SEEN BIT(3)
+#define TCP_ECN_MODE_ACCECN BIT(4)
+
+#define TCP_ECN_DISABLED 0
+#define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN)
+#define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN)
+
+static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp)
+{
+ return tp->ecn_flags & TCP_ECN_MODE_ANY;
+}
+
+static inline bool tcp_ecn_mode_rfc3168(const struct tcp_sock *tp)
+{
+ return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_RFC3168;
+}
+
+static inline bool tcp_ecn_mode_accecn(const struct tcp_sock *tp)
+{
+ return (tp->ecn_flags & TCP_ECN_MODE_ANY) == TCP_ECN_MODE_ACCECN;
+}
+
+static inline bool tcp_ecn_disabled(const struct tcp_sock *tp)
+{
+ return !tcp_ecn_mode_any(tp);
+}
+
+static inline bool tcp_ecn_mode_pending(const struct tcp_sock *tp)
+{
+ return (tp->ecn_flags & TCP_ECN_MODE_PENDING) == TCP_ECN_MODE_PENDING;
+}
+
+static inline void tcp_ecn_mode_set(struct tcp_sock *tp, u8 mode)
+{
+ tp->ecn_flags &= ~TCP_ECN_MODE_ANY;
+ tp->ecn_flags |= mode;
+}
enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
TCP_TW_RST = 1,
TCP_TW_ACK = 2,
- TCP_TW_SYN = 3
+ TCP_TW_SYN = 3,
+ TCP_TW_ACK_OOW = 4
};
enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
struct sk_buff *skb,
const struct tcphdr *th,
- u32 *tw_isn);
+ u32 *tw_isn,
+ enum skb_drop_reason *drop_reason);
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req, bool fastopen,
- bool *lost_race);
+ bool *lost_race, enum skb_drop_reason *drop_reason);
enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk);
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag);
void tcp_clear_retrans(struct tcp_sock *tp);
+void tcp_update_pacing_rate(struct sock *sk);
+void tcp_set_rto(struct sock *sk);
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
@@ -416,14 +508,23 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen);
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
+void tcp_reset_keepalive_timer(struct sock *sk, unsigned long timeout);
void tcp_set_keepalive(struct sock *sk, int val);
void tcp_syn_ack_timeout(const struct request_sock *req);
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int flags, int *addr_len);
+ int flags);
int tcp_set_rcvlowat(struct sock *sk, int val);
+void tcp_set_rcvbuf(struct sock *sk, int val);
int tcp_set_window_clamp(struct sock *sk, int val);
-void tcp_update_recv_tstamps(struct sk_buff *skb,
- struct scm_timestamping_internal *tss);
+
+static inline void
+tcp_update_recv_tstamps(struct sk_buff *skb,
+ struct scm_timestamping_internal *tss)
+{
+ tss->ts[0] = skb->tstamp;
+ tss->ts[2] = skb_hwtstamps(skb)->hwtstamp;
+}
+
void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
struct scm_timestamping_internal *tss);
void tcp_data_ready(struct sock *sk);
@@ -450,7 +551,6 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
* TCP v4 functions exported for the inet6 API
*/
-void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
void tcp_v4_mtu_reduced(struct sock *sk);
void tcp_req_err(struct sock *sk, u32 seq, bool abort);
void tcp_ld_RTO_revert(struct sock *sk, u32 seq);
@@ -463,14 +563,17 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst,
struct request_sock *req_unhash,
- bool *own_req);
+ bool *own_req,
+ void (*opt_child_init)(struct sock *newsk,
+ const struct sock *sk));
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
int tcp_connect(struct sock *sk);
enum tcp_synack_type {
TCP_SYNACK_NORMAL,
TCP_SYNACK_FASTOPEN,
TCP_SYNACK_COOKIE,
+ TCP_SYNACK_RETRANS,
};
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
@@ -667,7 +770,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority,
enum sk_rst_reason reason);
int tcp_send_synack(struct sock *);
void tcp_push_one(struct sock *, unsigned int mss_now);
-void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags);
void tcp_send_ack(struct sock *sk);
void tcp_send_delayed_ack(struct sock *sk);
void tcp_send_loss_probe(struct sock *sk);
@@ -681,7 +784,15 @@ void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_done_with_error(struct sock *sk, int err);
void tcp_reset(struct sock *sk, struct sk_buff *skb);
void tcp_fin(struct sock *sk);
-void tcp_check_space(struct sock *sk);
+void __tcp_check_space(struct sock *sk);
+static inline void tcp_check_space(struct sock *sk)
+{
+ /* pairs with tcp_poll() */
+ smp_mb();
+
+ if (sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ __tcp_check_space(sk);
+}
void tcp_sack_compress_send_ack(struct sock *sk);
static inline void tcp_cleanup_skb(struct sk_buff *skb)
@@ -739,6 +850,7 @@ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
/* tcp.c */
void tcp_get_info(struct sock *, struct tcp_info *);
+void tcp_rate_check_app_limited(struct sock *sk);
/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
@@ -756,42 +868,27 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu);
int tcp_mss_to_mtu(struct sock *sk, int mss);
void tcp_mtup_init(struct sock *sk);
-static inline void tcp_bound_rto(struct sock *sk)
+static inline unsigned int tcp_rto_max(const struct sock *sk)
{
- if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
- inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
+ return READ_ONCE(inet_csk(sk)->icsk_rto_max);
}
-static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
+static inline void tcp_bound_rto(struct sock *sk)
{
- return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
+ inet_csk(sk)->icsk_rto = min(inet_csk(sk)->icsk_rto, tcp_rto_max(sk));
}
-static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
-{
- /* mptcp hooks are only on the slow path */
- if (sk_is_mptcp((struct sock *)tp))
- return;
-
- tp->pred_flags = htonl((tp->tcp_header_len << 26) |
- ntohl(TCP_FLAG_ACK) |
- snd_wnd);
-}
-
-static inline void tcp_fast_path_on(struct tcp_sock *tp)
+static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{
- __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
+ return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}
-static inline void tcp_fast_path_check(struct sock *sk)
+static inline unsigned long tcp_reqsk_timeout(struct request_sock *req)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ u64 timeout = (u64)req->timeout << req->num_timeout;
- if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
- tp->rcv_wnd &&
- atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
- !tp->urg_data)
- tcp_fast_path_on(tp);
+ return (unsigned long)min_t(u64, timeout,
+ tcp_rto_max(req->rsk_listener));
}
u32 tcp_delack_max(const struct sock *sk);
@@ -800,7 +897,7 @@ u32 tcp_delack_max(const struct sock *sk);
static inline u32 tcp_rto_min(const struct sock *sk)
{
const struct dst_entry *dst = __sk_dst_get(sk);
- u32 rto_min = inet_csk(sk)->icsk_rto_min;
+ u32 rto_min = READ_ONCE(inet_csk(sk)->icsk_rto_min);
if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
@@ -836,6 +933,28 @@ static inline u32 tcp_receive_window(const struct tcp_sock *tp)
return (u32) win;
}
+/* Compute the maximum receive window we ever advertised.
+ * Rcv_nxt can be after the window if our peer push more data
+ * than the offered window.
+ */
+static inline u32 tcp_max_receive_window(const struct tcp_sock *tp)
+{
+ s32 win = tp->rcv_mwnd_seq - tp->rcv_nxt;
+
+ if (win < 0)
+ win = 0;
+ return (u32) win;
+}
+
+/* Check if we need to update the maximum receive window sequence number */
+static inline void tcp_update_max_rcv_wnd_seq(struct tcp_sock *tp)
+{
+ u32 wre = tp->rcv_wup + tp->rcv_wnd;
+
+ if (after(wre, tp->rcv_mwnd_seq))
+ tp->rcv_mwnd_seq = wre;
+}
+
/* Choose a new window, without checks for shrinking, and without
* scaling applied to the result. The caller does these things
* if necessary. This is a "raw" window selection.
@@ -928,16 +1047,35 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
#define tcp_flag_byte(th) (((u_int8_t *)th)[13])
-#define TCPHDR_FIN 0x01
-#define TCPHDR_SYN 0x02
-#define TCPHDR_RST 0x04
-#define TCPHDR_PSH 0x08
-#define TCPHDR_ACK 0x10
-#define TCPHDR_URG 0x20
-#define TCPHDR_ECE 0x40
-#define TCPHDR_CWR 0x80
-
+#define TCPHDR_FIN BIT(0)
+#define TCPHDR_SYN BIT(1)
+#define TCPHDR_RST BIT(2)
+#define TCPHDR_PSH BIT(3)
+#define TCPHDR_ACK BIT(4)
+#define TCPHDR_URG BIT(5)
+#define TCPHDR_ECE BIT(6)
+#define TCPHDR_CWR BIT(7)
+#define TCPHDR_AE BIT(8)
+#define TCPHDR_FLAGS_MASK (TCPHDR_FIN | TCPHDR_SYN | TCPHDR_RST | \
+ TCPHDR_PSH | TCPHDR_ACK | TCPHDR_URG | \
+ TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)
+#define tcp_flags_ntohs(th) (ntohs(*(__be16 *)&tcp_flag_word(th)) & \
+ TCPHDR_FLAGS_MASK)
+
+#define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)
#define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
+#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR)
+
+#define TCP_ACCECN_CEP_ACE_MASK 0x7
+#define TCP_ACCECN_ACE_MAX_DELTA 6
+
+/* To avoid/detect middlebox interference, not all counters start at 0.
+ * See draft-ietf-tcpm-accurate-ecn for the latest values.
+ */
+#define TCP_ACCECN_CEP_INIT_OFFSET 5
+#define TCP_ACCECN_E1B_INIT_OFFSET 1
+#define TCP_ACCECN_E0B_INIT_OFFSET 1
+#define TCP_ACCECN_CEB_INIT_OFFSET 0
/* State flags for sacked in struct tcp_skb_cb */
enum tcp_skb_cb_sacked_flags {
@@ -962,23 +1100,28 @@ struct tcp_skb_cb {
__u32 seq; /* Starting sequence number */
__u32 end_seq; /* SEQ + FIN + SYN + datalen */
union {
- /* Note :
+ /* Notes :
+ * tcp_tw_isn is used in input path only
+ * (isn chosen by tcp_timewait_state_process())
* tcp_gso_segs/size are used in write queue only,
* cf tcp_skb_pcount()/tcp_skb_mss()
*/
+ u32 tcp_tw_isn;
struct {
u16 tcp_gso_segs;
u16 tcp_gso_size;
};
};
- __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
+ __u16 tcp_flags; /* TCP header flags (tcp[12-13])*/
__u8 sacked; /* State flags for SACK. */
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
- __u8 txstamp_ack:1, /* Record TX timestamp for ack? */
+#define TSTAMP_ACK_SK 0x1
+#define TSTAMP_ACK_BPF 0x2
+ __u8 txstamp_ack:2, /* Record TX timestamp for ack? */
eor:1, /* Is skb MSG_EOR marked? */
has_rxtstamp:1, /* SKB has a RX timestamp */
- unused:5;
+ unused:4;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
@@ -1035,9 +1178,7 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb)
extern const struct inet_connection_sock_af_ops ipv6_specific;
-INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb));
-void tcp_v6_early_demux(struct sk_buff *skb);
#endif
@@ -1124,10 +1265,18 @@ enum tcp_ca_ack_event_flags {
#define TCP_CA_UNSPEC 0
/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
-#define TCP_CONG_NON_RESTRICTED 0x1
+#define TCP_CONG_NON_RESTRICTED BIT(0)
/* Requires ECN/ECT set on all packets */
-#define TCP_CONG_NEEDS_ECN 0x2
-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
+#define TCP_CONG_NEEDS_ECN BIT(1)
+/* Require successfully negotiated AccECN capability */
+#define TCP_CONG_NEEDS_ACCECN BIT(2)
+/* Use ECT(1) instead of ECT(0) while the CA is uninitialized */
+#define TCP_CONG_ECT_1_NEGOTIATION BIT(3)
+/* Cannot fallback to RFC3168 during AccECN negotiation */
+#define TCP_CONG_NO_FALLBACK_RFC3168 BIT(4)
+#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN | \
+ TCP_CONG_NEEDS_ACCECN | TCP_CONG_ECT_1_NEGOTIATION | \
+ TCP_CONG_NO_FALLBACK_RFC3168)
union tcp_cc_info;
@@ -1167,33 +1316,45 @@ struct rate_sample {
struct tcp_congestion_ops {
/* fast path fields are put first to fill one cache line */
+ /* A congestion control (CC) must provide one of either:
+ *
+ * (a) a cong_avoid function, if the CC wants to use the core TCP
+ * stack's default functionality to implement a "classic"
+ * (Reno/CUBIC-style) response to packet loss, RFC3168 ECN,
+ * idle periods, pacing rate computations, etc.
+ *
+ * (b) a cong_control function, if the CC wants custom behavior and
+ * complete control of all congestion control behaviors.
+ */
+ /* (a) "classic" response: calculate new cwnd.
+ */
+ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
+ /* (b) "custom" response: call when packets are delivered to update
+ * cwnd and pacing rate, after all the ca_state processing.
+ */
+ void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs);
+
/* return slow start threshold (required) */
u32 (*ssthresh)(struct sock *sk);
- /* do new cwnd calculation (required) */
- void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
-
/* call before changing ca_state (optional) */
void (*set_state)(struct sock *sk, u8 new_state);
/* call when cwnd event occurs (optional) */
void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+ /* call when CA_EVENT_TX_START cwnd event occurs (optional) */
+ void (*cwnd_event_tx_start)(struct sock *sk);
+
/* call when ack arrives (optional) */
void (*in_ack_event)(struct sock *sk, u32 flags);
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
- /* override sysctl_tcp_min_tso_segs */
+ /* override sysctl_tcp_min_tso_segs (optional) */
u32 (*min_tso_segs)(struct sock *sk);
- /* call when packets are delivered to update cwnd and pacing rate,
- * after all the ca_state processing. (optional)
- */
- void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs);
-
-
/* new value of cwnd after loss (required) */
u32 (*undo_cwnd)(struct sock *sk);
/* returns the multiplier used in tcp_sndbuf_expand (optional) */
@@ -1259,10 +1420,36 @@ static inline bool tcp_ca_needs_ecn(const struct sock *sk)
return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
}
+static inline bool tcp_ca_needs_accecn(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ACCECN;
+}
+
+static inline bool tcp_ca_ect_1_negotiation(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ca_ops->flags & TCP_CONG_ECT_1_NEGOTIATION;
+}
+
+static inline bool tcp_ca_no_fallback_rfc3168(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ return icsk->icsk_ca_ops->flags & TCP_CONG_NO_FALLBACK_RFC3168;
+}
+
static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
+ if (event == CA_EVENT_TX_START) {
+ if (icsk->icsk_ca_ops->cwnd_event_tx_start)
+ icsk->icsk_ca_ops->cwnd_event_tx_start(sk);
+ return;
+ }
if (icsk->icsk_ca_ops->cwnd_event)
icsk->icsk_ca_ops->cwnd_event(sk, event);
}
@@ -1270,13 +1457,6 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
/* From tcp_cong.c */
void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
-/* From tcp_rate.c */
-void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
-void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
- struct rate_sample *rs);
-void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
- bool is_sack_reneg, struct rate_sample *rs);
-void tcp_rate_check_app_limited(struct sock *sk);
static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
@@ -1334,7 +1514,7 @@ static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp)
static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val)
{
WARN_ON_ONCE((int)val <= 0);
- tp->snd_cwnd = val;
+ WRITE_ONCE(tp->snd_cwnd, val);
}
static inline bool tcp_in_slow_start(const struct tcp_sock *tp)
@@ -1440,10 +1620,12 @@ static inline unsigned long tcp_pacing_delay(const struct sock *sk)
static inline void tcp_reset_xmit_timer(struct sock *sk,
const int what,
unsigned long when,
- const unsigned long max_when)
+ bool pace_delay)
{
- inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk),
- max_when);
+ if (pace_delay)
+ when += tcp_pacing_delay(sk);
+ inet_csk_reset_xmit_timer(sk, what, when,
+ tcp_rto_max(sk));
}
/* Something is really bad, we could not queue an additional packet,
@@ -1472,7 +1654,7 @@ static inline void tcp_check_probe_timer(struct sock *sk)
{
if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- tcp_probe0_base(sk), TCP_RTO_MAX);
+ tcp_probe0_base(sk), true);
}
static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
@@ -1500,11 +1682,16 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb)
__skb_checksum_complete(skb);
}
-bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
- enum skb_drop_reason *reason);
+enum skb_drop_reason tcp_add_backlog(struct sock *sk, struct sk_buff *skb);
+static inline enum skb_drop_reason
+tcp_filter(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = (const struct tcphdr *)skb->data;
+
+ return sk_filter_trim_cap(sk, skb, __tcp_hdrlen(th));
+}
-int tcp_filter(struct sock *sk, struct sk_buff *skb);
void tcp_set_state(struct sock *sk, int state);
void tcp_done(struct sock *sk);
int tcp_abort(struct sock *sk, int err);
@@ -1742,6 +1929,40 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
return true;
}
+static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
+{
+ u32 ace;
+
+ /* mptcp hooks are only on the slow path */
+ if (sk_is_mptcp((struct sock *)tp))
+ return;
+
+ ace = tcp_ecn_mode_accecn(tp) ?
+ ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) &
+ TCP_ACCECN_CEP_ACE_MASK) : 0;
+
+ tp->pred_flags = htonl((tp->tcp_header_len << 26) |
+ (ace << 22) |
+ ntohl(TCP_FLAG_ACK) |
+ snd_wnd);
+}
+
+static inline void tcp_fast_path_on(struct tcp_sock *tp)
+{
+ __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
+}
+
+static inline void tcp_fast_path_check(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
+ tp->rcv_wnd &&
+ atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
+ !tp->urg_data)
+ tcp_fast_path_on(tp);
+}
+
bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
int mib_idx, u32 *last_oow_ack_time);
@@ -1755,14 +1976,8 @@ static inline void tcp_mib_init(struct net *net)
}
/* from STCP */
-static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp)
-{
- tp->lost_skb_hint = NULL;
-}
-
static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
{
- tcp_clear_retrans_hints_partial(tp);
tp->retransmit_skb_hint = NULL;
}
@@ -1803,13 +2018,6 @@ struct tcp6_pseudohdr {
__be32 protocol; /* including padding */
};
-union tcp_md5sum_block {
- struct tcp4_pseudohdr ip4;
-#if IS_ENABLED(CONFIG_IPV6)
- struct tcp6_pseudohdr ip6;
-#endif
-};
-
/*
* struct tcp_sigpool - per-CPU pool of ahash_requests
* @scratch: per-CPU temporary area, that can be used between
@@ -1844,8 +2052,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c);
void tcp_sigpool_end(struct tcp_sigpool *c);
size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len);
/* - functions */
-int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
- const struct sock *sk, const struct sk_buff *skb);
+void tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk, const struct sk_buff *skb);
int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
int family, u8 prefixlen, int l3index, u8 flags,
const u8 *newkey, u8 newkeylen);
@@ -1882,6 +2090,7 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk,
}
#define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key)
+void tcp_md5_destruct_sock(struct sock *sk);
#else
static inline struct tcp_md5sig_key *
tcp_md5_do_lookup(const struct sock *sk, int l3index,
@@ -1898,15 +2107,15 @@ tcp_md5_do_lookup_any_l3index(const struct sock *sk,
}
#define tcp_twsk_md5_key(twsk) NULL
+static inline void tcp_md5_destruct_sock(struct sock *sk)
+{
+}
#endif
-int tcp_md5_alloc_sigpool(void);
-void tcp_md5_release_sigpool(void);
-void tcp_md5_add_sigpool(void);
-extern int tcp_md5_sigpool_id;
-
-int tcp_md5_hash_key(struct tcp_sigpool *hp,
- const struct tcp_md5sig_key *key);
+struct md5_ctx;
+void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb,
+ unsigned int header_len);
+void tcp_md5_hash_key(struct md5_ctx *ctx, const struct tcp_md5sig_key *key);
/* From tcp_fastopen.c */
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
@@ -1995,7 +2204,34 @@ enum tcp_chrono {
__TCP_CHRONO_MAX,
};
-void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
+static inline void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+ const u32 now = tcp_jiffies32;
+ enum tcp_chrono old = tp->chrono_type;
+
+ /* Following WRITE_ONCE()s pair with READ_ONCE()s in
+ * tcp_get_info_chrono_stats().
+ */
+ if (old > TCP_CHRONO_UNSPEC)
+ WRITE_ONCE(tp->chrono_stat[old - 1],
+ tp->chrono_stat[old - 1] + now - tp->chrono_start);
+ WRITE_ONCE(tp->chrono_start, now);
+ WRITE_ONCE(tp->chrono_type, new);
+}
+
+static inline void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If there are multiple conditions worthy of tracking in a
+ * chronograph then the highest priority enum takes precedence
+ * over the other conditions. So that if something "more interesting"
+ * starts happening, stop the previous chrono and start a new one.
+ */
+ if (type > tp->chrono_type)
+ tcp_chrono_set(tp, type);
+}
+
void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
/* This helper is needed, because skb->tcp_tsorted_anchor uses
@@ -2213,21 +2449,26 @@ void tcp_v4_destroy_sock(struct sock *sk);
struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
netdev_features_t features);
-struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
struct tcphdr *th);
INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
-INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
-INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb));
#ifdef CONFIG_INET
void tcp_gro_complete(struct sk_buff *skb);
#else
static inline void tcp_gro_complete(struct sk_buff *skb) { }
#endif
-void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
+static inline void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr,
+ __be32 daddr)
+{
+ struct tcphdr *th = tcp_hdr(skb);
+
+ th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+}
static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
{
@@ -2256,7 +2497,7 @@ struct tcp_sock_af_ops {
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk,
const struct sock *addr_sk);
- int (*calc_md5_hash)(char *location,
+ void (*calc_md5_hash)(char *location,
const struct tcp_md5sig_key *md5,
const struct sock *sk,
const struct sk_buff *skb);
@@ -2284,7 +2525,7 @@ struct tcp_request_sock_ops {
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk,
const struct sock *addr_sk);
- int (*calc_md5_hash) (char *location,
+ void (*calc_md5_hash) (char *location,
const struct tcp_md5sig_key *md5,
const struct sock *sk,
const struct sk_buff *skb);
@@ -2307,8 +2548,9 @@ struct tcp_request_sock_ops {
struct flowi *fl,
struct request_sock *req,
u32 tw_isn);
- u32 (*init_seq)(const struct sk_buff *skb);
- u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
+ union tcp_seq_and_ts_off (*init_seq_and_ts_off)(
+ const struct net *net,
+ const struct sk_buff *skb);
int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct request_sock *req,
struct tcp_fastopen_cookie *foc,
@@ -2414,10 +2656,7 @@ void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
u32 reo_wnd);
extern bool tcp_rack_mark_lost(struct sock *sk);
-extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
- u64 xmit_time);
extern void tcp_rack_reo_timeout(struct sock *sk);
-extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);
/* tcp_plb.c */
@@ -2563,7 +2802,7 @@ static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
*/
static inline void tcp_listendrop(const struct sock *sk)
{
- atomic_inc(&((struct sock *)sk)->sk_drops);
+ sk_drops_inc((struct sock *)sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
}
@@ -2588,8 +2827,8 @@ struct tcp_ulp_ops {
/* cleanup ulp */
void (*release)(struct sock *sk);
/* diagnostic */
- int (*get_info)(struct sock *sk, struct sk_buff *skb);
- size_t (*get_info_size)(const struct sock *sk);
+ int (*get_info)(struct sock *sk, struct sk_buff *skb, bool net_admin);
+ size_t (*get_info_size)(const struct sock *sk, bool net_admin);
/* clone ulp */
void (*clone)(const struct request_sock *req, struct sock *newsk,
const gfp_t priority);
@@ -2606,8 +2845,8 @@ void tcp_update_ulp(struct sock *sk, struct proto *p,
void (*write_space)(struct sock *sk));
#define MODULE_ALIAS_TCP_ULP(name) \
- __MODULE_INFO(alias, alias_userspace, name); \
- __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
+ MODULE_INFO(alias, name); \
+ MODULE_INFO(alias, "tcp-ulp-" name)
#ifdef CONFIG_NET_SOCK_MSG
struct sk_msg;
@@ -2671,6 +2910,7 @@ static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
if (sk_fullsock(sk)) {
sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
sock_owned_by_me(sk);
}
@@ -2759,9 +2999,9 @@ extern struct static_key_false tcp_have_smc;
#endif
#if IS_ENABLED(CONFIG_TLS_DEVICE)
-void clean_acked_data_enable(struct inet_connection_sock *icsk,
+void clean_acked_data_enable(struct tcp_sock *tp,
void (*cad)(struct sock *sk, u32 ack_seq));
-void clean_acked_data_disable(struct inet_connection_sock *icsk);
+void clean_acked_data_disable(struct tcp_sock *tp);
void clean_acked_data_flush(void);
#endif
@@ -2842,4 +3082,18 @@ enum skb_drop_reason tcp_inbound_hash(struct sock *sk,
const void *saddr, const void *daddr,
int family, int dif, int sdif);
+static inline int tcp_recv_should_stop(struct sock *sk)
+{
+ return sk->sk_err ||
+ sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current);
+}
+
+INDIRECT_CALLABLE_DECLARE(union tcp_seq_and_ts_off
+ tcp_v4_init_seq_and_ts_off(const struct net *net,
+ const struct sk_buff *skb));
+INDIRECT_CALLABLE_DECLARE(union tcp_seq_and_ts_off
+ tcp_v6_init_seq_and_ts_off(const struct net *net,
+ const struct sk_buff *skb));
#endif /* _TCP_H */
diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h
index df655ce6987d..1e9e27d6e06b 100644
--- a/include/net/tcp_ao.h
+++ b/include/net/tcp_ao.h
@@ -130,7 +130,6 @@ struct tcp_ao_info {
u32 snd_sne;
u32 rcv_sne;
refcount_t refcnt; /* Protects twsk destruction */
- struct rcu_head rcu;
};
#ifdef CONFIG_TCP_MD5SIG
diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
new file mode 100644
index 000000000000..865d5c5a7718
--- /dev/null
+++ b/include/net/tcp_ecn.h
@@ -0,0 +1,675 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _TCP_ECN_H
+#define _TCP_ECN_H
+
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <linux/bitfield.h>
+
+#include <net/inet_connection_sock.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/inet_ecn.h>
+
+/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
+ * attemped to be negotiated and requested for incoming connection
+ * and outgoing connection, respectively.
+ */
+enum tcp_ecn_mode {
+ TCP_ECN_IN_NOECN_OUT_NOECN = 0,
+ TCP_ECN_IN_ECN_OUT_ECN = 1,
+ TCP_ECN_IN_ECN_OUT_NOECN = 2,
+ TCP_ECN_IN_ACCECN_OUT_ACCECN = 3,
+ TCP_ECN_IN_ACCECN_OUT_ECN = 4,
+ TCP_ECN_IN_ACCECN_OUT_NOECN = 5,
+};
+
+/* AccECN option sending when AccECN has been successfully negotiated */
+enum tcp_accecn_option {
+ TCP_ACCECN_OPTION_DISABLED = 0,
+ TCP_ACCECN_OPTION_MINIMUM = 1,
+ TCP_ACCECN_OPTION_FULL = 2,
+ TCP_ACCECN_OPTION_PERSIST = 3,
+};
+
+/* Apply either ECT(0) or ECT(1) based on TCP_CONG_ECT_1_NEGOTIATION flag */
+static inline void INET_ECN_xmit_ect_1_negotiation(struct sock *sk)
+{
+ __INET_ECN_xmit(sk, tcp_ca_ect_1_negotiation(sk));
+}
+
+static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp)
+{
+ /* Do not set CWR if in AccECN mode! */
+ if (tcp_ecn_mode_rfc3168(tp))
+ tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
+}
+
+static inline void tcp_ecn_accept_cwr(struct sock *sk,
+ const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) {
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+
+ /* If the sender is telling us it has entered CWR, then its
+ * cwnd may be very low (even just 1 packet), so we should ACK
+ * immediately.
+ */
+ if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ }
+}
+
+static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
+{
+ tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
+}
+
+static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
+{
+ return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
+}
+
+static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp)
+{
+ return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV;
+}
+
+static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp)
+{
+ return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND;
+}
+
+static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp)
+{
+ return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV;
+}
+
+static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
+{
+ tp->accecn_fail_mode |= mode;
+}
+
+static inline u8 tcp_accecn_ace(const struct tcphdr *th)
+{
+ return (th->ae << 2) | (th->cwr << 1) | th->ece;
+}
+
+/* Infer the ECT value our SYN arrived with from the echoed ACE field */
+static inline int tcp_accecn_extract_syn_ect(u8 ace)
+{
+ /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */
+ static const int ace_to_ecn[8] = {
+ INET_ECN_ECT_0, /* 0b000 (Undefined) */
+ INET_ECN_ECT_1, /* 0b001 (Undefined) */
+ INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */
+ INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */
+ INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */
+ INET_ECN_ECT_1, /* 0b101 (Reserved) */
+ INET_ECN_CE, /* 0b110 (CE is received) */
+ INET_ECN_ECT_1 /* 0b111 (Undefined) */
+ };
+
+ return ace_to_ecn[ace & 0x7];
+}
+
+/* Check ECN field transition to detect invalid transitions */
+static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv)
+{
+ if (rcv == snt)
+ return true;
+
+ /* Non-ECT altered to something or something became non-ECT */
+ if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT)
+ return false;
+ /* CE -> ECT(0/1)? */
+ if (snt == INET_ECN_CE)
+ return false;
+ return true;
+}
+
+static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace,
+ u8 sent_ect)
+{
+ u8 ect = tcp_accecn_extract_syn_ect(ace);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
+ return true;
+
+ if (!tcp_ect_transition_valid(sent_ect, ect)) {
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
+ return false;
+ }
+
+ return true;
+}
+
+static inline void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp,
+ u8 saw_opt)
+{
+ tp->saw_accecn_opt = saw_opt;
+ if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+}
+
+/* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */
+static inline void tcp_accecn_third_ack(struct sock *sk,
+ const struct sk_buff *skb, u8 sent_ect)
+{
+ u8 ace = tcp_accecn_ace(tcp_hdr(skb));
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ switch (ace) {
+ case 0x0:
+ /* Invalid value */
+ if (!TCP_SKB_CB(skb)->sacked)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV |
+ TCP_ACCECN_OPT_FAIL_RECV);
+ break;
+ case 0x7:
+ case 0x5:
+ case 0x1:
+ /* Unused but legal values */
+ break;
+ default:
+ /* Validation only applies to first non-data packet */
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
+ !TCP_SKB_CB(skb)->sacked &&
+ tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) {
+ if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) &&
+ !tp->delivered_ce)
+ WRITE_ONCE(tp->delivered_ce, 1);
+ }
+ break;
+ }
+}
+
+/* Demand the minimum # to send AccECN optnio */
+static inline void tcp_accecn_opt_demand_min(struct sock *sk,
+ u8 opt_demand_min)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u8 opt_demand;
+
+ opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand);
+ tp->accecn_opt_demand = opt_demand;
+}
+
+/* Maps IP ECN field ECT/CE code point to AccECN option field number, given
+ * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
+ */
+static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield)
+{
+ switch (ecnfield & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
+ return 0; /* AccECN does not send counts of NOT_ECT */
+ case INET_ECN_ECT_1:
+ return 1;
+ case INET_ECN_CE:
+ return 2;
+ case INET_ECN_ECT_0:
+ return 3;
+ }
+ return 0;
+}
+
+/* Maps IP ECN field ECT/CE code point to AccECN option field value offset.
+ * Some fields do not start from zero, to detect zeroing by middleboxes.
+ */
+static inline u32 tcp_accecn_field_init_offset(u8 ecnfield)
+{
+ switch (ecnfield & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
+ return 0; /* AccECN does not send counts of NOT_ECT */
+ case INET_ECN_ECT_1:
+ return TCP_ACCECN_E1B_INIT_OFFSET;
+ case INET_ECN_CE:
+ return TCP_ACCECN_CEB_INIT_OFFSET;
+ case INET_ECN_ECT_0:
+ return TCP_ACCECN_E0B_INIT_OFFSET;
+ }
+ return 0;
+}
+
+/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */
+static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option,
+ bool order)
+{
+ /* Based on Table 5 of the AccECN spec to map (option, order) to
+ * the corresponding ECN conuters (ECT-1, ECT-0, or CE).
+ */
+ static const u8 optfield_lookup[2][3] = {
+ /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */
+ { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 },
+ /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */
+ { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 }
+ };
+
+ return optfield_lookup[order][option % 3];
+}
+
+/* Handles AccECN option ECT and CE 24-bit byte counters update into
+ * the u32 value in tcp_sock. As we're processing TCP options, it is
+ * safe to access from - 1.
+ */
+static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from,
+ u32 init_offset)
+{
+ u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
+ 0xFFFFFFU;
+ u32 delta = (truncated - *cnt) & 0xFFFFFFU;
+
+ /* If delta has the highest bit set (24th bit) indicating
+ * negative, sign extend to correct an estimation using
+ * sign_extend32(delta, 24 - 1)
+ */
+ delta = sign_extend32(delta, 23);
+ *cnt += delta;
+ return (s32)delta;
+}
+
+/* Updates Accurate ECN received counters from the received IP ECN field */
+static inline void tcp_ecn_received_counters(struct sock *sk,
+ const struct sk_buff *skb, u32 len)
+{
+ u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK;
+ u8 is_ce = INET_ECN_is_ce(ecnfield);
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool ecn_edge;
+
+ if (!INET_ECN_is_not_ect(ecnfield)) {
+ u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+
+ /* As for accurate ECN, the TCP_ECN_SEEN flag is set by
+ * tcp_ecn_received_counters() when the ECN codepoint of
+ * received TCP data or ACK contains ECT(0), ECT(1), or CE.
+ */
+ if (!tcp_ecn_mode_rfc3168(tp))
+ tp->ecn_flags |= TCP_ECN_SEEN;
+
+ /* ACE counter tracks *all* segments including pure ACKs */
+ tp->received_ce += pcount;
+ tp->received_ce_pending = min(tp->received_ce_pending + pcount,
+ 0xfU);
+
+ if (len > 0) {
+ u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
+ u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
+ u32 bytes_mask = GENMASK_U32(31, 22);
+
+ tp->received_ecn_bytes[ecnfield - 1] += len;
+ tp->accecn_minlen = max_t(u8, tp->accecn_minlen,
+ minlen);
+
+ /* Send AccECN option at least once per 2^22-byte
+ * increase in any ECN byte counter.
+ */
+ if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
+ bytes_mask) {
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
+ }
+ }
+
+ ecn_edge = tp->prev_ecnfield != ecnfield;
+ if (ecn_edge || is_ce) {
+ tp->prev_ecnfield = ecnfield;
+ /* Demand Accurate ECN change-triggered ACKs. Two ACK are
+ * demanded to indicate unambiguously the ecnfield value
+ * in the latter ACK.
+ */
+ if (tcp_ecn_mode_accecn(tp)) {
+ if (ecn_edge)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ tp->accecn_opt_demand = 2;
+ }
+ }
+}
+
+/* AccECN specification, 2.2: [...] A Data Receiver maintains four counters
+ * initialized at the start of the half-connection. [...] These byte counters
+ * reflect only the TCP payload length, excluding TCP header and TCP options.
+ */
+static inline void tcp_ecn_received_counters_payload(struct sock *sk,
+ const struct sk_buff *skb)
+{
+ const struct tcphdr *th = (const struct tcphdr *)skb->data;
+
+ tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4);
+}
+
+/* AccECN specification, 5.1: [...] a server can determine that it
+ * negotiated AccECN as [...] if the ACK contains an ACE field with
+ * the value 0b010 to 0b111 (decimal 2 to 7).
+ */
+static inline bool cookie_accecn_ok(const struct tcphdr *th)
+{
+ return tcp_accecn_ace(th) > 0x1;
+}
+
+/* Used to form the ACE flags for SYN/ACK */
+static inline u16 tcp_accecn_reflector_flags(u8 ect)
+{
+ /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN.
+ * Below is an excerpt from the 1st block of Table 2 of AccECN spec,
+ * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE
+ */
+ static const u8 ecn_to_ace_flags[4] = {
+ 0b010, /* Not-ECT is received */
+ 0b011, /* ECT(1) is received */
+ 0b100, /* ECT(0) is received */
+ 0b110 /* CE is received */
+ };
+
+ return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]);
+}
+
+/* AccECN specification, 3.1.2: If a TCP server that implements AccECN
+ * receives a SYN with the three TCP header flags (AE, CWR and ECE) set
+ * to any combination other than 000, 011 or 111, it MUST negotiate the
+ * use of AccECN as if they had been set to 111.
+ */
+static inline bool tcp_accecn_syn_requested(const struct tcphdr *th)
+{
+ u8 ace = tcp_accecn_ace(th);
+
+ return ace && ace != 0x3;
+}
+
+static inline void __tcp_accecn_init_bytes_counters(int *counter_array)
+{
+ BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1);
+ BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2);
+ BUILD_BUG_ON(INET_ECN_CE != 0x3);
+
+ counter_array[INET_ECN_ECT_1 - 1] = 0;
+ counter_array[INET_ECN_ECT_0 - 1] = 0;
+ counter_array[INET_ECN_CE - 1] = 0;
+}
+
+static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
+{
+ tp->received_ce = 0;
+ tp->received_ce_pending = 0;
+ __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
+ __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
+ tp->accecn_opt_sent_w_dsack = 0;
+ tp->accecn_minlen = 0;
+ tp->accecn_opt_demand = 0;
+ tp->est_ecnfield = 0;
+}
+
+/* Used for make_synack to form the ACE flags */
+static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect)
+{
+ /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received
+ * from SYN. Below is an excerpt from Table 2 of the AccECN spec:
+ * +====================+====================================+
+ * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK |
+ * | received on SYN | AE CWR ECE |
+ * +====================+====================================+
+ * | Not-ECT | 0 1 0 |
+ * | ECT(1) | 0 1 1 |
+ * | ECT(0) | 1 0 0 |
+ * | CE | 1 1 0 |
+ * +====================+====================================+
+ */
+ th->ae = !!(ect & INET_ECN_ECT_0);
+ th->cwr = ect != INET_ECN_ECT_0;
+ th->ece = ect == INET_ECN_ECT_1;
+}
+
+static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ u32 wire_ace;
+
+ /* The final packet of the 3WHS or anything like it must reflect
+ * the SYN/ACK ECT instead of putting CEP into ACE field, such
+ * case show up in tcp_flags.
+ */
+ if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) {
+ wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET;
+ th->ece = !!(wire_ace & 0x1);
+ th->cwr = !!(wire_ace & 0x2);
+ th->ae = !!(wire_ace & 0x4);
+ tp->received_ce_pending = 0;
+ }
+}
+
+static inline u8 tcp_accecn_option_init(const struct sk_buff *skb,
+ u8 opt_offset)
+{
+ u8 *ptr = skb_transport_header(skb) + opt_offset;
+ unsigned int optlen = ptr[1] - 2;
+
+ if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
+ return TCP_ACCECN_OPT_FAIL_SEEN;
+ ptr += 2;
+
+ /* Detect option zeroing: an AccECN connection "MAY check that the
+ * initial value of the EE0B field or the EE1B field is non-zero"
+ */
+ if (optlen < TCPOLEN_ACCECN_PERFIELD)
+ return TCP_ACCECN_OPT_EMPTY_SEEN;
+ if (get_unaligned_be24(ptr) == 0)
+ return TCP_ACCECN_OPT_FAIL_SEEN;
+ if (optlen < TCPOLEN_ACCECN_PERFIELD * 3)
+ return TCP_ACCECN_OPT_COUNTER_SEEN;
+ ptr += TCPOLEN_ACCECN_PERFIELD * 2;
+ if (get_unaligned_be24(ptr) == 0)
+ return TCP_ACCECN_OPT_FAIL_SEEN;
+
+ return TCP_ACCECN_OPT_COUNTER_SEEN;
+}
+
+static inline void tcp_ecn_rcv_synack_accecn(struct sock *sk,
+ const struct sk_buff *skb, u8 dsf)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
+ tp->syn_ect_rcv = dsf & INET_ECN_MASK;
+ /* Demand Accurate ECN option in response to the SYN on the SYN/ACK
+ * and the TCP server will try to send one more packet with an AccECN
+ * Option at a later point during the connection.
+ */
+ if (tp->rx_opt.accecn &&
+ tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ tp->accecn_opt_demand = 2;
+ }
+}
+
+/* See Table 2 of the AccECN draft */
+static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb,
+ const struct tcphdr *th, u8 ip_dsfield)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u8 ace = tcp_accecn_ace(th);
+
+ switch (ace) {
+ case 0x0:
+ case 0x7:
+ /* +========+========+============+=============+
+ * | A | B | SYN/ACK | Feedback |
+ * | | | B->A | Mode of A |
+ * | | | AE CWR ECE | |
+ * +========+========+============+=============+
+ * | AccECN | No ECN | 0 0 0 | Not ECN |
+ * | AccECN | Broken | 1 1 1 | Not ECN |
+ * +========+========+============+=============+
+ */
+ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
+ break;
+ case 0x1:
+ /* +========+========+============+=============+
+ * | A | B | SYN/ACK | Feedback |
+ * | | | B->A | Mode of A |
+ * | | | AE CWR ECE | |
+ * +========+========+============+=============+
+ * | AccECN | ECN | 0 0 1 | Classic ECN |
+ * | Nonce | AccECN | 0 0 1 | Classic ECN |
+ * | ECN | AccECN | 0 0 1 | Classic ECN |
+ * +========+========+============+=============+
+ */
+ if (tcp_ca_no_fallback_rfc3168(sk))
+ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
+ else
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ break;
+ case 0x5:
+ if (tcp_ecn_mode_pending(tp)) {
+ tcp_ecn_rcv_synack_accecn(sk, skb, ip_dsfield);
+ if (INET_ECN_is_ce(ip_dsfield)) {
+ tp->received_ce++;
+ tp->received_ce_pending++;
+ }
+ }
+ break;
+ default:
+ tcp_ecn_rcv_synack_accecn(sk, skb, ip_dsfield);
+ if (INET_ECN_is_ce(ip_dsfield) &&
+ tcp_accecn_validate_syn_feedback(sk, ace,
+ tp->syn_ect_snt)) {
+ tp->received_ce++;
+ tp->received_ce_pending++;
+ }
+ break;
+ }
+}
+
+static inline void tcp_ecn_rcv_syn(struct sock *sk, const struct tcphdr *th,
+ const struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tcp_ecn_mode_pending(tp)) {
+ if (!tcp_accecn_syn_requested(th)) {
+ /* Downgrade to classic ECN feedback */
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ } else {
+ tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+ tp->prev_ecnfield = tp->syn_ect_rcv;
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
+ }
+ }
+ if (tcp_ecn_mode_rfc3168(tp) &&
+ (!th->ece || !th->cwr || tcp_ca_no_fallback_rfc3168(sk)))
+ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
+}
+
+static inline bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp,
+ const struct tcphdr *th)
+{
+ if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
+ return true;
+ return false;
+}
+
+/* Packet ECN state for a SYN-ACK */
+static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
+ if (tcp_ecn_disabled(tp))
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
+ else if (tcp_ca_needs_ecn(sk) ||
+ tcp_bpf_ca_needs_ecn(sk))
+ INET_ECN_xmit_ect_1_negotiation(sk);
+
+ if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) {
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
+ TCP_SKB_CB(skb)->tcp_flags |=
+ tcp_accecn_reflector_flags(tp->syn_ect_rcv);
+ tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
+ }
+}
+
+/* Packet ECN state for a SYN. */
+static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
+ bool use_ecn, use_accecn;
+ u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn);
+
+ use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN ||
+ tcp_ca_needs_accecn(sk);
+ use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN ||
+ tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN ||
+ tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn;
+
+ if (!use_ecn) {
+ const struct dst_entry *dst = __sk_dst_get(sk);
+
+ if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
+ use_ecn = true;
+ }
+
+ tp->ecn_flags = 0;
+
+ if (use_ecn) {
+ if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
+ INET_ECN_xmit_ect_1_negotiation(sk);
+
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
+ if (use_accecn) {
+ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE;
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING);
+ tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
+ } else {
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ }
+ }
+}
+
+static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
+{
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) {
+ /* tp->ecn_flags are cleared at a later point in time when
+ * SYN ACK is ultimatively being received.
+ */
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE;
+ }
+}
+
+static inline void
+tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th,
+ enum tcp_synack_type synack_type)
+{
+ /* Accurate ECN shall retransmit SYN/ACK with ACE=0 if the
+ * previously retransmitted SYN/ACK also times out.
+ */
+ if (!req->num_timeout || synack_type != TCP_SYNACK_RETRANS) {
+ if (tcp_rsk(req)->accecn_ok)
+ tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv);
+ else if (inet_rsk(req)->ecn_ok)
+ th->ece = 1;
+ } else if (tcp_rsk(req)->accecn_ok) {
+ th->ae = 0;
+ th->cwr = 0;
+ th->ece = 0;
+ }
+}
+
+static inline bool tcp_accecn_option_beacon_check(const struct sock *sk)
+{
+ u32 ecn_beacon = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon);
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!ecn_beacon)
+ return false;
+
+ return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * ecn_beacon >=
+ (tp->srtt_us >> 3);
+}
+
+#endif /* _LINUX_TCP_ECN_H */
diff --git a/include/net/tcx.h b/include/net/tcx.h
index 5ce0ce9e0c02..23a61af13547 100644
--- a/include/net/tcx.h
+++ b/include/net/tcx.h
@@ -20,7 +20,6 @@ struct tcx_entry {
struct tcx_link {
struct bpf_link link;
struct net_device *dev;
- u32 location;
};
static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress)
diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
index 62b3e9f2aed4..0a85ac64a66d 100644
--- a/include/net/timewait_sock.h
+++ b/include/net/timewait_sock.h
@@ -15,13 +15,6 @@ struct timewait_sock_ops {
struct kmem_cache *twsk_slab;
char *twsk_slab_name;
unsigned int twsk_obj_size;
- void (*twsk_destructor)(struct sock *sk);
};
-static inline void twsk_destructor(struct sock *sk)
-{
- if (sk->sk_prot->twsk_prot->twsk_destructor != NULL)
- sk->sk_prot->twsk_prot->twsk_destructor(sk);
-}
-
#endif /* _TIMEWAIT_SOCK_H */
diff --git a/include/net/tls.h b/include/net/tls.h
index 857340338b69..ebd2550280ae 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -53,6 +53,8 @@ struct tls_rec;
/* Maximum data size carried in a TLS record */
#define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14)
+/* Minimum record size limit as per RFC8449 */
+#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6)
#define TLS_HEADER_SIZE 5
#define TLS_NONCE_OFFSET TLS_HEADER_SIZE
@@ -226,6 +228,7 @@ struct tls_context {
u8 rx_conf:3;
u8 zerocopy_sendfile:1;
u8 rx_no_pad:1;
+ u16 tx_max_payload_len;
int (*push_pending_record)(struct sock *sk, int flags);
void (*sk_write_space)(struct sock *sk);
@@ -451,25 +454,26 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq)
/* Log all TLS record header TCP sequences in [seq, seq+len] */
static inline void
-tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len)
+tls_offload_rx_resync_async_request_start(struct tls_offload_resync_async *resync_async,
+ __be32 seq, u16 len)
{
- struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx);
-
- atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) |
+ atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) |
((u64)len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC);
- rx_ctx->resync_async->loglen = 0;
- rx_ctx->resync_async->rcd_delta = 0;
+ resync_async->loglen = 0;
+ resync_async->rcd_delta = 0;
}
static inline void
-tls_offload_rx_resync_async_request_end(struct sock *sk, __be32 seq)
+tls_offload_rx_resync_async_request_end(struct tls_offload_resync_async *resync_async,
+ __be32 seq)
{
- struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx);
+ atomic64_set(&resync_async->req, ((u64)ntohl(seq) << 32) | RESYNC_REQ);
+}
- atomic64_set(&rx_ctx->resync_async->req,
- ((u64)ntohl(seq) << 32) | RESYNC_REQ);
+static inline void
+tls_offload_rx_resync_async_request_cancel(struct tls_offload_resync_async *resync_async)
+{
+ atomic64_set(&resync_async->req, 0);
}
static inline void
diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h
index 1a97e3f32029..c0a421fe0c2a 100644
--- a/include/net/transp_v6.h
+++ b/include/net/transp_v6.h
@@ -8,7 +8,6 @@
/* IPv6 transport protocols */
extern struct proto rawv6_prot;
extern struct proto udpv6_prot;
-extern struct proto udplitev6_prot;
extern struct proto tcpv6_prot;
extern struct proto pingv6_prot;
@@ -28,8 +27,6 @@ int rawv6_init(void);
void rawv6_exit(void);
int udpv6_init(void);
void udpv6_exit(void);
-int udplitev6_init(void);
-void udplitev6_exit(void);
int tcpv6_init(void);
void tcpv6_exit(void);
diff --git a/include/net/tso.h b/include/net/tso.h
index e7e157ae0526..da82aabd1d48 100644
--- a/include/net/tso.h
+++ b/include/net/tso.h
@@ -3,6 +3,7 @@
#define _TSO_H
#include <linux/skbuff.h>
+#include <linux/dma-mapping.h>
#include <net/ip.h>
#define TSO_HEADER_SIZE 256
@@ -28,4 +29,103 @@ void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size);
int tso_start(struct sk_buff *skb, struct tso_t *tso);
+/**
+ * struct tso_dma_map - DMA mapping state for GSO payload
+ * @dev: device used for DMA mapping
+ * @skb: the GSO skb being mapped
+ * @hdr_len: per-segment header length
+ * @iova_state: DMA IOVA state (when IOMMU available)
+ * @iova_offset: global byte offset into IOVA range (IOVA path only)
+ * @total_len: total payload length
+ * @frag_idx: current region (-1 = linear, 0..nr_frags-1 = frag)
+ * @offset: byte offset within current region
+ * @linear_dma: DMA address of the linear payload
+ * @linear_len: length of the linear payload
+ * @nr_frags: number of frags successfully DMA-mapped
+ * @frags: per-frag DMA address and length
+ *
+ * DMA-maps the payload regions of a GSO skb (linear data + frags).
+ * Prefers the DMA IOVA API for a single contiguous mapping with one
+ * IOTLB sync; falls back to per-region dma_map_phys() otherwise.
+ */
+struct tso_dma_map {
+ struct device *dev;
+ const struct sk_buff *skb;
+ unsigned int hdr_len;
+ /* IOVA path */
+ struct dma_iova_state iova_state;
+ size_t iova_offset;
+ size_t total_len;
+ /* Fallback path if IOVA path fails */
+ int frag_idx;
+ unsigned int offset;
+ dma_addr_t linear_dma;
+ unsigned int linear_len;
+ unsigned int nr_frags;
+ struct {
+ dma_addr_t dma;
+ unsigned int len;
+ } frags[MAX_SKB_FRAGS];
+};
+
+/**
+ * struct tso_dma_map_completion_state - Completion-time cleanup state
+ * @iova_state: DMA IOVA state (when IOMMU available)
+ * @total_len: total payload length of the IOVA mapping
+ *
+ * Drivers store this on their SW ring at xmit time via
+ * tso_dma_map_completion_save(), then call tso_dma_map_complete() at
+ * completion time.
+ */
+struct tso_dma_map_completion_state {
+ struct dma_iova_state iova_state;
+ size_t total_len;
+};
+
+int tso_dma_map_init(struct tso_dma_map *map, struct device *dev,
+ const struct sk_buff *skb, unsigned int hdr_len);
+void tso_dma_map_cleanup(struct tso_dma_map *map);
+unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len);
+bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr,
+ unsigned int *chunk_len, unsigned int *mapping_len,
+ unsigned int seg_remaining);
+
+/**
+ * tso_dma_map_completion_save - save state needed for completion-time cleanup
+ * @map: the xmit-time DMA map
+ * @cstate: driver-owned storage that persists until completion
+ *
+ * Should be called at xmit time to update the completion state and later passed
+ * to tso_dma_map_complete().
+ */
+static inline void
+tso_dma_map_completion_save(const struct tso_dma_map *map,
+ struct tso_dma_map_completion_state *cstate)
+{
+ cstate->iova_state = map->iova_state;
+ cstate->total_len = map->total_len;
+}
+
+/**
+ * tso_dma_map_complete - tear down mapping at completion time
+ * @dev: the device that owns the mapping
+ * @cstate: state saved by tso_dma_map_completion_save()
+ *
+ * Return: true if the IOVA path was used and the mapping has been
+ * destroyed; false if the fallback per-region path was used and the
+ * driver must unmap via its normal completion path.
+ */
+static inline bool
+tso_dma_map_complete(struct device *dev,
+ struct tso_dma_map_completion_state *cstate)
+{
+ if (dma_use_iova(&cstate->iova_state)) {
+ dma_iova_destroy(dev, &cstate->iova_state, cstate->total_len,
+ DMA_TO_DEVICE, 0);
+ return true;
+ }
+
+ return false;
+}
+
#endif /* _TSO_H */
diff --git a/include/net/udp.h b/include/net/udp.h
index 6e89520e100d..8262e2b215b4 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -29,13 +29,12 @@
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/indirect_call_wrapper.h>
+#include <linux/math.h>
/**
- * struct udp_skb_cb - UDP(-Lite) private variables
+ * struct udp_skb_cb - UDP private variables
*
* @header: private variables used by IPv4/IPv6
- * @cscov: checksum coverage length (UDP-Lite only)
- * @partial_cov: if set indicates partial csum coverage
*/
struct udp_skb_cb {
union {
@@ -44,8 +43,6 @@ struct udp_skb_cb {
struct inet6_skb_parm h6;
#endif
} header;
- __u16 cscov;
- __u8 partial_cov;
};
#define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb))
@@ -104,7 +101,7 @@ struct udp_table {
unsigned int log;
};
extern struct udp_table udp_table;
-void udp_table_init(struct udp_table *, const char *);
+
static inline struct udp_hslot *udp_hashslot(struct udp_table *table,
const struct net *net,
unsigned int num)
@@ -205,7 +202,6 @@ static inline void udp_hash4_dec(struct udp_hslot *hslot2)
extern struct proto udp_prot;
-extern atomic_long_t udp_memory_allocated;
DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
/* sysctl variables for udp */
@@ -216,13 +212,11 @@ extern int sysctl_udp_wmem_min;
struct sk_buff;
/*
- * Generic checksumming routines for UDP(-Lite) v4 and v6
+ * Generic checksumming routines for UDP v4 and v6
*/
static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb)
{
- return (UDP_SKB_CB(skb)->cscov == skb->len ?
- __skb_checksum_complete(skb) :
- __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov));
+ return __skb_checksum_complete(skb);
}
static inline int udp_lib_checksum_complete(struct sk_buff *skb)
@@ -273,7 +267,6 @@ static inline void udp_csum_pull_header(struct sk_buff *skb)
skb->csum = csum_partial(skb->data, sizeof(struct udphdr),
skb->csum);
skb_pull_rcsum(skb, sizeof(struct udphdr));
- UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr);
}
typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport,
@@ -282,19 +275,37 @@ typedef struct sock *(*udp_lookup_t)(const struct sk_buff *skb, __be16 sport,
void udp_v6_early_demux(struct sk_buff *skb);
INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *));
+int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *sk, struct msghdr *msg,
+ size_t len, int flags));
+
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
netdev_features_t features, bool is_ipv6);
-static inline void udp_lib_init_sock(struct sock *sk)
+static inline int udp_lib_init_sock(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
+ sk->sk_drop_counters = &up->drop_counters;
skb_queue_head_init(&up->reader_queue);
+ INIT_HLIST_NODE(&up->tunnel_list);
up->forward_threshold = sk->sk_rcvbuf >> 2;
set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
+
+ up->udp_prod_queue = kzalloc_objs(*up->udp_prod_queue, nr_node_ids);
+ if (!up->udp_prod_queue)
+ return -ENOMEM;
+ for (int i = 0; i < nr_node_ids; i++)
+ init_llist_head(&up->udp_prod_queue[i].ll_root);
+ return 0;
}
-/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
+static inline void udp_drops_inc(struct sock *sk)
+{
+ numa_drop_add(&udp_sk(sk)->drop_counters, 1);
+}
+
+/* hash routines shared between UDPv4/6 */
static inline int udp_lib_hash(struct sock *sk)
{
BUG();
@@ -363,7 +374,7 @@ static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb,
*/
hash ^= hash << 16;
- return htons((((u64) hash * (max - min)) >> 32) + min);
+ return htons(reciprocal_scale(hash, max - min + 1) + min);
}
static inline int udp_rqueue_get(struct sock *sk)
@@ -397,11 +408,13 @@ static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags,
return __skb_recv_udp(sk, flags, &off, err);
}
-int udp_v4_early_demux(struct sk_buff *skb);
+enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb);
bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst);
int udp_err(struct sk_buff *, u32);
int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
+INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *sk, struct msghdr *msg,
+ size_t len, int flags));
void udp_splice_eof(struct socket *sock);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
@@ -409,8 +422,7 @@ int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, int *karg);
-int udp_init_sock(struct sock *sk);
-int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len);
int __udp_disconnect(struct sock *sk, int flags);
int udp_disconnect(struct sock *sk, int flags);
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
@@ -425,9 +437,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif);
struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
- __be16 sport,
- __be32 daddr, __be16 dport, int dif, int sdif,
- struct udp_table *tbl, struct sk_buff *skb);
+ __be16 sport, __be32 daddr, __be16 dport,
+ int dif, int sdif, struct sk_buff *skb);
struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport);
struct sock *udp6_lib_lookup(const struct net *net,
@@ -437,8 +448,7 @@ struct sock *udp6_lib_lookup(const struct net *net,
struct sock *__udp6_lib_lookup(const struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,
- int dif, int sdif, struct udp_table *tbl,
- struct sk_buff *skb);
+ int dif, int sdif, struct sk_buff *skb);
struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport);
int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
@@ -510,38 +520,28 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
}
/*
- * SNMP statistics for UDP and UDP-Lite
+ * SNMP statistics for UDP
*/
-#define UDP_INC_STATS(net, field, is_udplite) do { \
- if (is_udplite) SNMP_INC_STATS((net)->mib.udplite_statistics, field); \
- else SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0)
-#define __UDP_INC_STATS(net, field, is_udplite) do { \
- if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_statistics, field); \
- else __SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0)
-
-#define __UDP6_INC_STATS(net, field, is_udplite) do { \
- if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);\
- else __SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \
-} while(0)
-#define UDP6_INC_STATS(net, field, __lite) do { \
- if (__lite) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field); \
- else SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \
-} while(0)
+#define __UDP_INC_STATS(net, field) \
+ __SNMP_INC_STATS((net)->mib.udp_statistics, field)
+#define UDP_INC_STATS(net, field) \
+ SNMP_INC_STATS((net)->mib.udp_statistics, field)
+#define __UDP6_INC_STATS(net, field) \
+ __SNMP_INC_STATS((net)->mib.udp_stats_in6, field)
+#define UDP6_INC_STATS(net, field) \
+ SNMP_INC_STATS((net)->mib.udp_stats_in6, field)
#if IS_ENABLED(CONFIG_IPV6)
-#define __UDPX_MIB(sk, ipv4) \
-({ \
- ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \
- sock_net(sk)->mib.udp_statistics) : \
- (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 : \
- sock_net(sk)->mib.udp_stats_in6); \
-})
+#define __UDPX_MIB(sk, ipv4) \
+ ({ \
+ ipv4 ? sock_net(sk)->mib.udp_statistics : \
+ sock_net(sk)->mib.udp_stats_in6; \
+ })
#else
-#define __UDPX_MIB(sk, ipv4) \
-({ \
- IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \
- sock_net(sk)->mib.udp_statistics; \
-})
+#define __UDPX_MIB(sk, ipv4) \
+ ({ \
+ sock_net(sk)->mib.udp_statistics; \
+ })
#endif
#define __UDPX_INC_STATS(sk, field) \
@@ -550,7 +550,6 @@ static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
#ifdef CONFIG_PROC_FS
struct udp_seq_afinfo {
sa_family_t family;
- struct udp_table *udp_table;
};
struct udp_iter_state {
@@ -562,9 +561,6 @@ void *udp_seq_start(struct seq_file *seq, loff_t *pos);
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos);
void udp_seq_stop(struct seq_file *seq, void *v);
-extern const struct seq_operations udp_seq_ops;
-extern const struct seq_operations udp6_seq_ops;
-
int udp4_proc_init(void);
void udp4_proc_exit(void);
#endif /* CONFIG_PROC_FS */
@@ -586,6 +582,16 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
{
netdev_features_t features = NETIF_F_SG;
struct sk_buff *segs;
+ int drop_count;
+
+ /*
+ * Segmentation in UDP receive path is only for UDP GRO, drop udp
+ * fragmentation offload (UFO) packets.
+ */
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) {
+ drop_count = 1;
+ goto drop;
+ }
/* Avoid csum recalculation by skb_segment unless userspace explicitly
* asks for the final checksum values
@@ -609,23 +615,22 @@ static inline struct sk_buff *udp_rcv_segment(struct sock *sk,
*/
segs = __skb_gso_segment(skb, features, false);
if (IS_ERR_OR_NULL(segs)) {
- int segs_nr = skb_shinfo(skb)->gso_segs;
-
- atomic_add(segs_nr, &sk->sk_drops);
- SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr);
- kfree_skb(skb);
- return NULL;
+ drop_count = skb_shinfo(skb)->gso_segs;
+ goto drop;
}
consume_skb(skb);
return segs;
+
+drop:
+ sk_drops_add(sk, drop_count);
+ SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, drop_count);
+ kfree_skb(skb);
+ return NULL;
}
static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
{
- /* UDP-lite can't land here - no GRO */
- WARN_ON_ONCE(UDP_SKB_CB(skb)->partial_cov);
-
/* UDP packets generated with UDP_SEGMENT and traversing:
*
* UDP tunnel(xmit) -> veth (segmentation) -> veth (gro) -> UDP tunnel (rx)
@@ -639,7 +644,6 @@ static inline void udp_post_segment_fix_csum(struct sk_buff *skb)
* a valid csum after the segmentation.
* Additionally fixup the UDP CB.
*/
- UDP_SKB_CB(skb)->cscov = skb->len;
if (skb->ip_summed == CHECKSUM_NONE && !skb->csum_valid)
skb->csum_valid = 1;
}
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index a93dc51f6323..47c23d4a1740 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -7,9 +7,13 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
-#include <net/ipv6_stubs.h>
#endif
+#define UDP_TUNNEL_PARTIAL_FEATURES NETIF_F_GSO_ENCAP_ALL
+#define UDP_TUNNEL_STRIPPED_GSO_TYPES ((UDP_TUNNEL_PARTIAL_FEATURES | \
+ NETIF_F_GSO_PARTIAL) >> \
+ NETIF_F_GSO_SHIFT)
+
struct udp_port_cfg {
u8 family;
@@ -47,7 +51,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
struct socket **sockp)
{
- return 0;
+ return -EPFNOSUPPORT;
}
#endif
@@ -130,35 +134,47 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type);
void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type);
-static inline void udp_tunnel_get_rx_info(struct net_device *dev)
-{
- ASSERT_RTNL();
- if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- return;
- call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
-}
-
-static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
-{
- ASSERT_RTNL();
- if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- return;
- call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev);
-}
-
/* Transmit the skb using UDP encapsulation. */
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl,
__be16 df, __be16 src_port, __be16 dst_port,
- bool xnet, bool nocheck);
+ bool xnet, bool nocheck, u16 ipcb_flags);
+
+void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb,
+ struct net_device *dev,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u8 prio, __u8 ttl, __be32 label,
+ __be16 src_port, __be16 dst_port, bool nocheck,
+ u16 ip6cb_flags);
+
+static inline bool udp_tunnel_handle_partial(struct sk_buff *skb)
+{
+ bool double_encap = !!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL);
-int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb,
- struct net_device *dev,
- const struct in6_addr *saddr,
- const struct in6_addr *daddr,
- __u8 prio, __u8 ttl, __be32 label,
- __be16 src_port, __be16 dst_port, bool nocheck);
+ /*
+ * If the skb went through partial segmentation, lower devices
+ * will not need to offload the related features - except for
+ * UDP_TUNNEL, that will be re-added by the later
+ * udp_tunnel_handle_offloads().
+ */
+ if (double_encap)
+ skb_shinfo(skb)->gso_type &= ~UDP_TUNNEL_STRIPPED_GSO_TYPES;
+ return double_encap;
+}
+
+static inline void udp_tunnel_set_inner_protocol(struct sk_buff *skb,
+ bool double_encap,
+ __be16 inner_proto)
+{
+ /*
+ * The inner protocol has been set by the nested tunnel, don't
+ * overraid it.
+ */
+ if (!double_encap)
+ skb_set_inner_protocol(skb, inner_proto);
+}
void udp_tunnel_sock_release(struct socket *sock);
@@ -191,6 +207,21 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
}
#endif
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add);
+void udp_tunnel_update_gro_rcv(struct sock *sk, bool add);
+#else
+static inline void udp_tunnel_update_gro_lookup(struct net *net,
+ struct sock *sk, bool add) {}
+static inline void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) {}
+#endif
+
+static inline void udp_tunnel_cleanup_gro(struct sock *sk)
+{
+ udp_tunnel_update_gro_rcv(sk, false);
+ udp_tunnel_update_gro_lookup(sock_net(sk), sk, false);
+}
+
static inline void udp_tunnel_encap_enable(struct sock *sk)
{
if (udp_test_and_set_bit(ENCAP_ENABLED, sk))
@@ -198,7 +229,7 @@ static inline void udp_tunnel_encap_enable(struct sock *sk)
#if IS_ENABLED(CONFIG_IPV6)
if (READ_ONCE(sk->sk_family) == PF_INET6)
- ipv6_stub->udpv6_encap_enable();
+ udpv6_encap_enable();
#endif
udp_encap_enable();
}
@@ -206,19 +237,17 @@ static inline void udp_tunnel_encap_enable(struct sock *sk)
#define UDP_TUNNEL_NIC_MAX_TABLES 4
enum udp_tunnel_nic_info_flags {
- /* Device callbacks may sleep */
- UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0),
/* Device only supports offloads when it's open, all ports
* will be removed before close and re-added after open.
*/
- UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1),
+ UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(0),
/* Device supports only IPv4 tunnels */
- UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2),
+ UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(1),
/* Device has hard-coded the IANA VXLAN port (4789) as VXLAN.
* This port must not be counted towards n_entries of any table.
* Driver will not receive any callback associated with port 4789.
*/
- UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(3),
+ UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(2),
};
struct udp_tunnel_nic;
@@ -309,6 +338,9 @@ struct udp_tunnel_nic_ops {
size_t (*dump_size)(struct net_device *dev, unsigned int table);
int (*dump_write)(struct net_device *dev, unsigned int table,
struct sk_buff *skb);
+ void (*assert_locked)(struct net_device *dev);
+ void (*lock)(struct net_device *dev);
+ void (*unlock)(struct net_device *dev);
};
#ifdef CONFIG_INET
@@ -337,8 +369,28 @@ static inline void
udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
unsigned int idx, u8 priv)
{
- if (udp_tunnel_nic_ops)
+ if (udp_tunnel_nic_ops) {
+ udp_tunnel_nic_ops->assert_locked(dev);
udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv);
+ }
+}
+
+static inline void udp_tunnel_nic_assert_locked(struct net_device *dev)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->assert_locked(dev);
+}
+
+static inline void udp_tunnel_nic_lock(struct net_device *dev)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->lock(dev);
+}
+
+static inline void udp_tunnel_nic_unlock(struct net_device *dev)
+{
+ if (udp_tunnel_nic_ops)
+ udp_tunnel_nic_ops->unlock(dev);
}
static inline void
@@ -380,17 +432,50 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev)
static inline size_t
udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
{
+ size_t ret;
+
if (!udp_tunnel_nic_ops)
return 0;
- return udp_tunnel_nic_ops->dump_size(dev, table);
+
+ udp_tunnel_nic_ops->lock(dev);
+ ret = udp_tunnel_nic_ops->dump_size(dev, table);
+ udp_tunnel_nic_ops->unlock(dev);
+
+ return ret;
}
static inline int
udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
struct sk_buff *skb)
{
+ int ret;
+
if (!udp_tunnel_nic_ops)
return 0;
- return udp_tunnel_nic_ops->dump_write(dev, table, skb);
+
+ udp_tunnel_nic_ops->lock(dev);
+ ret = udp_tunnel_nic_ops->dump_write(dev, table, skb);
+ udp_tunnel_nic_ops->unlock(dev);
+
+ return ret;
}
+
+static inline void udp_tunnel_get_rx_info(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ return;
+ udp_tunnel_nic_assert_locked(dev);
+ call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
+}
+
+static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
+ return;
+ udp_tunnel_nic_assert_locked(dev);
+ call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev);
+}
+
#endif
diff --git a/include/net/udplite.h b/include/net/udplite.h
deleted file mode 100644
index 786919d29f8d..000000000000
--- a/include/net/udplite.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Definitions for the UDP-Lite (RFC 3828) code.
- */
-#ifndef _UDPLITE_H
-#define _UDPLITE_H
-
-#include <net/ip6_checksum.h>
-#include <net/udp.h>
-
-/* UDP-Lite socket options */
-#define UDPLITE_SEND_CSCOV 10 /* sender partial coverage (as sent) */
-#define UDPLITE_RECV_CSCOV 11 /* receiver partial coverage (threshold ) */
-
-extern struct proto udplite_prot;
-extern struct udp_table udplite_table;
-
-/*
- * Checksum computation is all in software, hence simpler getfrag.
- */
-static __inline__ int udplite_getfrag(void *from, char *to, int offset,
- int len, int odd, struct sk_buff *skb)
-{
- struct msghdr *msg = from;
- return copy_from_iter_full(to, len, &msg->msg_iter) ? 0 : -EFAULT;
-}
-
-/*
- * Checksumming routines
- */
-static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh)
-{
- u16 cscov;
-
- /* In UDPv4 a zero checksum means that the transmitter generated no
- * checksum. UDP-Lite (like IPv6) mandates checksums, hence packets
- * with a zero checksum field are illegal. */
- if (uh->check == 0) {
- net_dbg_ratelimited("UDPLite: zeroed checksum field\n");
- return 1;
- }
-
- cscov = ntohs(uh->len);
-
- if (cscov == 0) /* Indicates that full coverage is required. */
- ;
- else if (cscov < 8 || cscov > skb->len) {
- /*
- * Coverage length violates RFC 3828: log and discard silently.
- */
- net_dbg_ratelimited("UDPLite: bad csum coverage %d/%d\n",
- cscov, skb->len);
- return 1;
-
- } else if (cscov < skb->len) {
- UDP_SKB_CB(skb)->partial_cov = 1;
- UDP_SKB_CB(skb)->cscov = cscov;
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_NONE;
- skb->csum_valid = 0;
- }
-
- return 0;
-}
-
-/* Fast-path computation of checksum. Socket may not be locked. */
-static inline __wsum udplite_csum(struct sk_buff *skb)
-{
- const int off = skb_transport_offset(skb);
- const struct sock *sk = skb->sk;
- int len = skb->len - off;
-
- if (udp_test_bit(UDPLITE_SEND_CC, sk)) {
- u16 pcslen = READ_ONCE(udp_sk(sk)->pcslen);
-
- if (pcslen < len) {
- if (pcslen > 0)
- len = pcslen;
- udp_hdr(skb)->len = htons(pcslen);
- }
- }
- skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */
-
- return skb_checksum(skb, off, len, 0);
-}
-
-void udplite4_register(void);
-#endif /* _UDPLITE_H */
diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h
index cf8cc140d68d..c3f4cc206198 100644
--- a/include/net/vsock_addr.h
+++ b/include/net/vsock_addr.h
@@ -16,7 +16,7 @@ bool vsock_addr_bound(const struct sockaddr_vm *addr);
void vsock_addr_unbind(struct sockaddr_vm *addr);
bool vsock_addr_equals_addr(const struct sockaddr_vm *addr,
const struct sockaddr_vm *other);
-int vsock_addr_cast(const struct sockaddr *addr, size_t len,
+int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len,
struct sockaddr_vm **out_addr);
#endif
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 2dd23ee2bacd..0ee50785f4f1 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -296,7 +296,7 @@ struct vxlan_dev {
struct vxlan_rdst default_dst; /* default destination */
struct timer_list age_timer;
- spinlock_t hash_lock[FDB_HASH_SIZE];
+ spinlock_t hash_lock;
unsigned int addrcnt;
struct gro_cells gro_cells;
@@ -304,9 +304,10 @@ struct vxlan_dev {
struct vxlan_vni_group __rcu *vnigrp;
- struct hlist_head fdb_head[FDB_HASH_SIZE];
+ struct rhashtable fdb_hash_tbl;
struct rhashtable mdb_tbl;
+ struct hlist_head fdb_list;
struct hlist_head mdb_list;
unsigned int mdb_seq;
};
@@ -331,6 +332,7 @@ struct vxlan_dev {
#define VXLAN_F_VNIFILTER 0x20000
#define VXLAN_F_MDB 0x40000
#define VXLAN_F_LOCALBYPASS 0x80000
+#define VXLAN_F_MC_ROUTE 0x100000
/* Flags that are used in the receive path. These flags must match in
* order for a socket to be shareable
@@ -352,7 +354,9 @@ struct vxlan_dev {
VXLAN_F_UDP_ZERO_CSUM6_RX | \
VXLAN_F_COLLECT_METADATA | \
VXLAN_F_VNIFILTER | \
- VXLAN_F_LOCALBYPASS)
+ VXLAN_F_LOCALBYPASS | \
+ VXLAN_F_MC_ROUTE | \
+ 0)
struct net_device *vxlan_dev_create(struct net *net, const char *name,
u8 name_assign_type, struct vxlan_config *conf);
diff --git a/include/net/x25.h b/include/net/x25.h
index 5e833cfc864e..414f3fd99345 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -203,7 +203,6 @@ void x25_send_frame(struct sk_buff *, struct x25_neigh *);
int x25_lapb_receive_frame(struct sk_buff *, struct net_device *,
struct packet_type *, struct net_device *);
void x25_establish_link(struct x25_neigh *);
-void x25_terminate_link(struct x25_neigh *);
/* x25_facilities.c */
int x25_parse_facilities(struct sk_buff *, struct x25_facilities *,
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 4dafc5e021f1..aa742f413c35 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -76,6 +76,11 @@ enum xdp_buff_flags {
XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under
* pressure
*/
+ /* frags have unreadable mem, this can't be true for real XDP packets,
+ * but drivers may use XDP helpers to construct Rx pkt state even when
+ * XDP program is not attached.
+ */
+ XDP_FLAGS_FRAGS_UNREADABLE = BIT(2),
};
struct xdp_buff {
@@ -85,8 +90,20 @@ struct xdp_buff {
void *data_hard_start;
struct xdp_rxq_info *rxq;
struct xdp_txq_info *txq;
- u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
- u32 flags; /* supported values defined in xdp_buff_flags */
+
+ union {
+ struct {
+ /* frame size to deduce data_hard_end/tailroom */
+ u32 frame_sz;
+ /* supported values defined in xdp_buff_flags */
+ u32 flags;
+ };
+
+#ifdef __LITTLE_ENDIAN
+ /* Used to micro-optimize xdp_init_buff(), don't use directly */
+ u64 frame_sz_flags_init;
+#endif
+ };
};
static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp)
@@ -104,23 +121,42 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
xdp->flags &= ~XDP_FLAGS_HAS_FRAGS;
}
-static __always_inline bool
-xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp)
+static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
{
- return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
+ xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
}
-static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
+static __always_inline void xdp_buff_set_frag_unreadable(struct xdp_buff *xdp)
{
- xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
+ xdp->flags |= XDP_FLAGS_FRAGS_UNREADABLE;
+}
+
+static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp)
+{
+ return xdp->flags;
+}
+
+static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp)
+{
+ xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC;
}
static __always_inline void
xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
{
- xdp->frame_sz = frame_sz;
xdp->rxq = rxq;
+
+#ifdef __LITTLE_ENDIAN
+ /*
+ * Force the compilers to initialize ::flags and assign ::frame_sz with
+ * one write on 64-bit LE architectures as they're often unable to do
+ * it themselves.
+ */
+ xdp->frame_sz_flags_init = frame_sz;
+#else
+ xdp->frame_sz = frame_sz;
xdp->flags = 0;
+#endif
}
static __always_inline void
@@ -249,6 +285,8 @@ static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem,
if (unlikely(netmem_is_pfmemalloc(netmem)))
xdp_buff_set_frag_pfmemalloc(xdp);
+ if (unlikely(netmem_is_net_iov(netmem)))
+ xdp_buff_set_frag_unreadable(xdp);
return true;
}
@@ -272,10 +310,10 @@ static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame)
return !!(frame->flags & XDP_FLAGS_HAS_FRAGS);
}
-static __always_inline bool
-xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame)
+static __always_inline u32
+xdp_frame_get_skb_flags(const struct xdp_frame *frame)
{
- return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
+ return frame->flags;
}
#define XDP_BULK_QUEUE_SIZE 16
@@ -312,9 +350,9 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame)
}
static inline void
-xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags,
- unsigned int size, unsigned int truesize,
- bool pfmemalloc)
+xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags,
+ unsigned int size, unsigned int truesize,
+ u32 xdp_flags)
{
struct skb_shared_info *sinfo = skb_shinfo(skb);
@@ -328,7 +366,8 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags,
skb->len += size;
skb->data_len += size;
skb->truesize += truesize;
- skb->pfmemalloc |= pfmemalloc;
+ skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC);
+ skb->unreadable |= !!(xdp_flags & XDP_FLAGS_FRAGS_UNREADABLE);
}
/* Avoids inlining WARN macro in fast-path */
@@ -343,7 +382,6 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct net_device *dev);
struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct net_device *dev);
-int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp);
struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);
static inline
@@ -617,8 +655,12 @@ struct xdp_metadata_ops {
u32 bpf_xdp_metadata_kfunc_id(int id);
bool bpf_dev_bound_kfunc_id(u32 btf_id);
void xdp_set_features_flag(struct net_device *dev, xdp_features_t val);
+void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val);
void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg);
+void xdp_features_set_redirect_target_locked(struct net_device *dev,
+ bool support_sg);
void xdp_features_clear_redirect_target(struct net_device *dev);
+void xdp_features_clear_redirect_target_locked(struct net_device *dev);
#else
static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; }
static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; }
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index bfe625b55d55..ebac60a3d8a1 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -14,7 +14,7 @@
#include <linux/mm.h>
#include <net/sock.h>
-#define XDP_UMEM_SG_FLAG (1 << 1)
+#define XDP_UMEM_SG_FLAG BIT(3)
struct net_device;
struct xsk_queue;
@@ -71,9 +71,6 @@ struct xdp_sock {
*/
u32 tx_budget_spent;
- /* Protects generic receive. */
- spinlock_t rx_lock;
-
/* Statistics */
u64 rx_dropped;
u64 rx_queue_full;
@@ -87,6 +84,7 @@ struct xdp_sock {
struct list_head map_list;
/* Protects map_list */
spinlock_t map_list_lock;
+ u32 max_tx_budget;
/* Protects multiple processes in the control path */
struct mutex mutex;
struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
@@ -110,11 +108,16 @@ struct xdp_sock {
* indicates position where checksumming should start.
* csum_offset indicates position where checksum should be stored.
*
+ * void (*tmo_request_launch_time)(u64 launch_time, void *priv)
+ * Called when AF_XDP frame requested launch time HW offload support.
+ * launch_time indicates the PTP time at which the device can schedule the
+ * packet for transmission.
*/
struct xsk_tx_metadata_ops {
void (*tmo_request_timestamp)(void *priv);
u64 (*tmo_fill_timestamp)(void *priv);
void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv);
+ void (*tmo_request_launch_time)(u64 launch_time, void *priv);
};
#ifdef CONFIG_XDP_SOCKETS
@@ -122,6 +125,7 @@ struct xsk_tx_metadata_ops {
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp);
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp);
void __xsk_map_flush(struct list_head *flush_list);
+INDIRECT_CALLABLE_DECLARE(void xsk_destruct_skb(struct sk_buff *));
/**
* xsk_tx_metadata_to_compl - Save enough relevant metadata information
@@ -162,6 +166,11 @@ static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta,
if (!meta)
return;
+ if (ops->tmo_request_launch_time)
+ if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
+ ops->tmo_request_launch_time(meta->request.launch_time,
+ priv);
+
if (ops->tmo_request_timestamp)
if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP)
ops->tmo_request_timestamp(priv);
@@ -210,6 +219,12 @@ static inline void __xsk_map_flush(struct list_head *flush_list)
{
}
+#ifdef CONFIG_MITIGATION_RETPOLINE
+static inline void xsk_destruct_skb(struct sk_buff *skb)
+{
+}
+#endif
+
static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta,
struct xsk_tx_metadata_compl *compl)
{
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 784cd34f5bba..46797645a0c2 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -12,6 +12,10 @@
#define XDP_UMEM_MIN_CHUNK_SHIFT 11
#define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT)
+#define NETDEV_XDP_ACT_XSK (NETDEV_XDP_ACT_BASIC | \
+ NETDEV_XDP_ACT_REDIRECT | \
+ NETDEV_XDP_ACT_XSK_ZEROCOPY)
+
struct xsk_cb_desc {
void *src;
u8 off;
@@ -37,16 +41,42 @@ static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool)
return XDP_PACKET_HEADROOM + pool->headroom;
}
+static inline u32 xsk_pool_get_tailroom(bool mbuf)
+{
+ return mbuf ? SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) : 0;
+}
+
static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool)
{
return pool->chunk_size;
}
-static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
+static inline u32 __xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
{
return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool);
}
+static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
+{
+ u32 frame_size = __xsk_pool_get_rx_frame_size(pool);
+ struct xdp_umem *umem = pool->umem;
+ bool mbuf;
+
+ /* Reserve tailroom only for zero-copy pools that opted into
+ * multi-buffer. The reserved area is used for skb_shared_info,
+ * matching the XDP core's xdp_data_hard_end() layout.
+ */
+ mbuf = pool->dev && (umem->flags & XDP_UMEM_SG_FLAG);
+ frame_size -= xsk_pool_get_tailroom(mbuf);
+
+ return ALIGN_DOWN(frame_size, 128);
+}
+
+static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool)
+{
+ return pool->unaligned ? 0 : xsk_pool_get_chunk_size(pool);
+}
+
static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
struct xdp_rxq_info *rxq)
{
@@ -118,7 +148,7 @@ static inline void xsk_buff_free(struct xdp_buff *xdp)
goto out;
list_for_each_entry_safe(pos, tmp, xskb_list, list_node) {
- list_del(&pos->list_node);
+ list_del_init(&pos->list_node);
xp_free(pos);
}
@@ -153,18 +183,28 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first)
frag = list_first_entry_or_null(&xskb->pool->xskb_list,
struct xdp_buff_xsk, list_node);
if (frag) {
- list_del(&frag->list_node);
+ list_del_init(&frag->list_node);
ret = &frag->xdp;
}
return ret;
}
-static inline void xsk_buff_del_tail(struct xdp_buff *tail)
+static inline void xsk_buff_del_frag(struct xdp_buff *xdp)
{
- struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
+ struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
+
+ list_del_init(&xskb->list_node);
+}
- list_del(&xskb->list_node);
+static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first)
+{
+ struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
+ struct xdp_buff_xsk *frag;
+
+ frag = list_first_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
+ list_node);
+ return &frag->xdp;
}
static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
@@ -196,9 +236,27 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return xp_raw_get_data(pool, addr);
}
+/**
+ * xsk_buff_raw_get_ctx - get &xdp_desc context
+ * @pool: XSk buff pool desc address belongs to
+ * @addr: desc address (from userspace)
+ *
+ * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for
+ * details.
+ *
+ * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata
+ * pointer, if it is present and valid (initialized to %NULL otherwise).
+ */
+static inline struct xdp_desc_ctx
+xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
+{
+ return xp_raw_get_ctx(pool, addr);
+}
+
#define XDP_TXMD_FLAGS_VALID ( \
XDP_TXMD_FLAGS_TIMESTAMP | \
XDP_TXMD_FLAGS_CHECKSUM | \
+ XDP_TXMD_FLAGS_LAUNCH_TIME | \
0)
static inline bool
@@ -207,20 +265,27 @@ xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta)
return !(meta->flags & ~XDP_TXMD_FLAGS_VALID);
}
-static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+static inline struct xsk_tx_metadata *
+__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data)
{
struct xsk_tx_metadata *meta;
if (!pool->tx_metadata_len)
return NULL;
- meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len;
+ meta = data - pool->tx_metadata_len;
if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
return NULL; /* no way to signal the error to the user */
return meta;
}
+static inline struct xsk_tx_metadata *
+xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+{
+ return __xsk_buff_get_metadata(pool, xp_raw_get_data(pool, addr));
+}
+
static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
{
struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
@@ -298,6 +363,11 @@ static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool)
return 0;
}
+static inline u32 xsk_pool_get_rx_frag_step(struct xsk_buff_pool *pool)
+{
+ return 0;
+}
+
static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool,
struct xdp_rxq_info *rxq)
{
@@ -364,10 +434,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first)
return NULL;
}
-static inline void xsk_buff_del_tail(struct xdp_buff *tail)
+static inline void xsk_buff_del_frag(struct xdp_buff *xdp)
{
}
+static inline struct xdp_buff *xsk_buff_get_head(struct xdp_buff *first)
+{
+ return NULL;
+}
+
static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
{
return NULL;
@@ -388,12 +463,25 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
return NULL;
}
+static inline struct xdp_desc_ctx
+xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr)
+{
+ return (struct xdp_desc_ctx){ };
+}
+
static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta)
{
return false;
}
-static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
+static inline struct xsk_tx_metadata *
+__xsk_buff_get_metadata(const struct xsk_buff_pool *pool, void *data)
+{
+ return NULL;
+}
+
+static inline struct xsk_tx_metadata *
+xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr)
{
return NULL;
}
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index ed4b83696c77..874409127e29 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -147,8 +147,19 @@ enum {
};
struct xfrm_dev_offload {
+ /* The device for this offload.
+ * Device drivers should not use this directly, as that will prevent
+ * them from working with bonding device. Instead, the device passed
+ * to the add/delete callbacks should be used.
+ */
struct net_device *dev;
netdevice_tracker dev_tracker;
+ /* This is a private pointer used by the bonding driver (and eventually
+ * should be moved there). Device drivers should not use it.
+ * Protected by xfrm_state.lock AND bond.ipsec_lock in most cases,
+ * except in the .xdo_dev_state_del() flow, where only xfrm_state.lock
+ * is held.
+ */
struct net_device *real_dev;
unsigned long offload_handle;
u8 dir : 2;
@@ -236,7 +247,6 @@ struct xfrm_state {
/* Data for encapsulator */
struct xfrm_encap_tmpl *encap;
- struct sock __rcu *encap_sk;
/* NAT keepalive */
u32 nat_keepalive_interval; /* seconds */
@@ -431,7 +441,6 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo);
int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo);
void xfrm_flush_gc(void);
-void xfrm_state_delete_tunnel(struct xfrm_state *x);
struct xfrm_type {
struct module *owner;
@@ -464,6 +473,15 @@ struct xfrm_type_offload {
int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family);
void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family);
+void xfrm_set_type_offload(struct xfrm_state *x, bool try_load);
+static inline void xfrm_unset_type_offload(struct xfrm_state *x)
+{
+ if (!x->type_offload)
+ return;
+
+ module_put(x->type_offload->owner);
+ x->type_offload = NULL;
+}
/**
* struct xfrm_mode_cbs - XFRM mode callbacks
@@ -518,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family)
static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto)
{
- if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
+ if ((x->sel.family != AF_UNSPEC) ||
+ (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) ||
(ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6))
return &x->inner_mode;
else
@@ -696,6 +715,7 @@ struct xfrm_mgr {
const struct xfrm_migrate *m,
int num_bundles,
const struct xfrm_kmaddress *k,
+ struct net *net,
const struct xfrm_encap_tmpl *encap);
bool (*is_alive)(const struct km_event *c);
};
@@ -897,7 +917,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols)
xfrm_pol_put(pols[i]);
}
-void __xfrm_state_destroy(struct xfrm_state *, bool);
+void __xfrm_state_destroy(struct xfrm_state *);
static inline void __xfrm_state_put(struct xfrm_state *x)
{
@@ -907,13 +927,7 @@ static inline void __xfrm_state_put(struct xfrm_state *x)
static inline void xfrm_state_put(struct xfrm_state *x)
{
if (refcount_dec_and_test(&x->refcnt))
- __xfrm_state_destroy(x, false);
-}
-
-static inline void xfrm_state_put_sync(struct xfrm_state *x)
-{
- if (refcount_dec_and_test(&x->refcnt))
- __xfrm_state_destroy(x, true);
+ __xfrm_state_destroy(x);
}
static inline void xfrm_state_hold(struct xfrm_state *x)
@@ -1143,19 +1157,19 @@ struct xfrm_offload {
#define CRYPTO_INVALID_PROTOCOL 128
/* Used to keep whole l2 header for transport mode GRO */
- __u32 orig_mac_len;
+ __u16 orig_mac_len;
__u8 proto;
__u8 inner_ipproto;
};
struct sec_path {
- int len;
- int olen;
- int verified_cnt;
-
struct xfrm_state *xvec[XFRM_MAX_DEPTH];
struct xfrm_offload ovec[XFRM_MAX_OFFLOAD_DEPTH];
+
+ u8 len;
+ u8 olen;
+ u8 verified_cnt;
};
struct sec_path *secpath_set(struct sk_buff *skb);
@@ -1751,7 +1765,7 @@ struct xfrmk_spdinfo {
struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num);
int xfrm_state_delete(struct xfrm_state *x);
-int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync);
+int xfrm_state_flush(struct net *net, u8 proto, bool task_valid);
int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid);
int xfrm_dev_policy_flush(struct net *net, struct net_device *dev,
bool task_valid);
@@ -1760,8 +1774,7 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack);
u32 xfrm_state_mtu(struct xfrm_state *x, int mtu);
-int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
- struct netlink_ext_ack *extack);
+int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack);
int xfrm_init_state(struct xfrm_state *x);
int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type);
int xfrm_input_resume(struct sk_buff *skb, int nexthdr);
@@ -1773,6 +1786,15 @@ int xfrm_trans_queue(struct sk_buff *skb,
struct sk_buff *));
int xfrm_output_resume(struct sock *sk, struct sk_buff *skb, int err);
int xfrm_output(struct sock *sk, struct sk_buff *skb);
+int xfrm4_tunnel_check_size(struct sk_buff *skb);
+#if IS_ENABLED(CONFIG_IPV6)
+int xfrm6_tunnel_check_size(struct sk_buff *skb);
+#else
+static inline int xfrm6_tunnel_check_size(struct sk_buff *skb)
+{
+ return -EMSGSIZE;
+}
+#endif
#if IS_ENABLED(CONFIG_NET_PKTGEN)
int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb);
@@ -1870,18 +1892,22 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol);
#ifdef CONFIG_XFRM_MIGRATE
int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
const struct xfrm_migrate *m, int num_bundles,
- const struct xfrm_kmaddress *k,
+ const struct xfrm_kmaddress *k, struct net *net,
const struct xfrm_encap_tmpl *encap);
struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net,
u32 if_id);
struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x,
struct xfrm_migrate *m,
- struct xfrm_encap_tmpl *encap);
+ struct xfrm_encap_tmpl *encap,
+ struct net *net,
+ struct xfrm_user_offload *xuo,
+ struct netlink_ext_ack *extack);
int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
struct xfrm_migrate *m, int num_bundles,
struct xfrm_kmaddress *k, struct net *net,
struct xfrm_encap_tmpl *encap, u32 if_id,
- struct netlink_ext_ack *extack);
+ struct netlink_ext_ack *extack,
+ struct xfrm_user_offload *xuo);
#endif
int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 50779406bc2d..ccb3b350001f 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -53,6 +53,8 @@ struct xsk_buff_pool {
refcount_t users;
struct xdp_umem *umem;
struct work_struct work;
+ /* Protects generic receive in shared and non-shared umem mode. */
+ spinlock_t rx_lock;
struct list_head free_list;
struct list_head xskb_list;
u32 heads_cnt;
@@ -83,11 +85,11 @@ struct xsk_buff_pool {
bool unaligned;
bool tx_sw_csum;
void *addrs;
- /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
- * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when
- * sockets share a single cq when the same netdev and queue id is shared.
+ /* Mutual exclusion of the completion ring in the SKB mode.
+ * Protect: NAPI TX thread and sendmsg error paths in the SKB
+ * destructor callback.
*/
- spinlock_t cq_lock;
+ spinlock_t cq_prod_lock;
struct xdp_buff_xsk *free_heads[];
};
@@ -141,6 +143,14 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max);
bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count);
void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr);
dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr);
+
+struct xdp_desc_ctx {
+ dma_addr_t dma;
+ struct xsk_tx_metadata *meta;
+};
+
+struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr);
+
static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb)
{
return xskb->dma;
@@ -164,13 +174,6 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
dma_sync_single_for_device(pool->dev, dma, size, DMA_BIDIRECTIONAL);
}
-/* Masks for xdp_umem_page flags.
- * The low 12-bits of the addr will be 0 since this is the page address, so we
- * can use them for flags.
- */
-#define XSK_NEXT_PG_CONTIG_SHIFT 0
-#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT)
-
static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
u64 addr, u32 len)
{
@@ -230,8 +233,8 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb,
return orig_addr;
offset = xskb->xdp.data - xskb->xdp.data_hard_start;
- orig_addr -= offset;
offset += pool->headroom;
+ orig_addr -= offset;
return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
}