summaryrefslogtreecommitdiff
path: root/include/uapi
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-11-18 17:02:57 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2024-11-18 17:02:57 -0800
commit8350142a4b4cedebfa76cd4cc6e5a7ba6a330629 (patch)
tree14310648caff08366a28cd6d2b8dab44688a9995 /include/uapi
parent77a0cfafa9af9c0d5b43534eb90d530c189edca1 (diff)
parenta652958888fb1ada3e4f6b548576c2d2c1b60d66 (diff)
downloadlwn-8350142a4b4cedebfa76cd4cc6e5a7ba6a330629.tar.gz
lwn-8350142a4b4cedebfa76cd4cc6e5a7ba6a330629.zip
Merge tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: - Cleanups of the eventfd handling code, making it fully private. - Support for sending a sync message to another ring, without having a ring available to send a normal async message. - Get rid of the separate unlocked hash table, unify everything around the single locked one. - Add support for ring resizing. It can be hard to appropriately size the CQ ring upfront, if the application doesn't know how busy it will be. This results in applications sizing rings for the most busy case, which can be wasteful. With ring resizing, they can start small and grow the ring, if needed. - Add support for fixed wait regions, rather than needing to copy the same wait data tons of times for each wait operation. - Rewrite the resource node handling, which before was serialized per ring. This caused issues with particularly fixed files, where one file waiting on IO could hold up putting and freeing of other unrelated files. Now each node is handled separately. New code is much simpler too, and was a net 250 line reduction in code. - Add support for just doing partial buffer clones, rather than always cloning the entire buffer table. - Series adding static NAPI support, where a specific NAPI instance is used rather than having a list of them available that need lookup. - Add support for mapped regions, and also convert the fixed wait support mentioned above to that concept. This avoids doing special mappings for various planned features, and folds the existing registered wait into that too. - Add support for hybrid IO polling, which is a variant of strict IOPOLL but with an initial sleep delay to avoid spinning too early and wasting resources on devices that aren't necessarily in the < 5 usec category wrt latencies. - Various cleanups and little fixes. * tag 'for-6.13/io_uring-20241118' of git://git.kernel.dk/linux: (79 commits) io_uring/region: fix error codes after failed vmap io_uring: restore back registered wait arguments io_uring: add memory region registration io_uring: introduce concept of memory regions io_uring: temporarily disable registered waits io_uring: disable ENTER_EXT_ARG_REG for IOPOLL io_uring: fortify io_pin_pages with a warning switch io_msg_ring() to CLASS(fd) io_uring: fix invalid hybrid polling ctx leaks io_uring/uring_cmd: fix buffer index retrieval io_uring/rsrc: add & apply io_req_assign_buf_node() io_uring/rsrc: remove '->ctx_ptr' of 'struct io_rsrc_node' io_uring/rsrc: pass 'struct io_ring_ctx' reference to rsrc helpers io_uring: avoid normal tw intermediate fallback io_uring/napi: add static napi tracking strategy io_uring/napi: clean up __io_napi_do_busy_loop io_uring/napi: Use lock guards io_uring/napi: improve __io_napi_add io_uring/napi: fix io_napi_entry RCU accesses io_uring/napi: protect concurrent io_napi_entry timeout accesses ...
Diffstat (limited to 'include/uapi')
-rw-r--r--include/uapi/linux/io_uring.h119
1 files changed, 115 insertions, 4 deletions
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1fe79e750470..4418d0192959 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -200,6 +200,9 @@ enum io_uring_sqe_flags_bit {
*/
#define IORING_SETUP_NO_SQARRAY (1U << 16)
+/* Use hybrid poll in iopoll process */
+#define IORING_SETUP_HYBRID_IOPOLL (1U << 17)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -416,6 +419,9 @@ enum io_uring_msg_ring_flags {
* IORING_NOP_INJECT_RESULT Inject result from sqe->result
*/
#define IORING_NOP_INJECT_RESULT (1U << 0)
+#define IORING_NOP_FILE (1U << 1)
+#define IORING_NOP_FIXED_FILE (1U << 2)
+#define IORING_NOP_FIXED_BUFFER (1U << 3)
/*
* IO completion data structure (Completion Queue Entry)
@@ -518,6 +524,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4)
#define IORING_ENTER_ABS_TIMER (1U << 5)
+#define IORING_ENTER_EXT_ARG_REG (1U << 6)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -612,6 +619,16 @@ enum io_uring_register_op {
/* clone registered buffers from source ring to current ring */
IORING_REGISTER_CLONE_BUFFERS = 30,
+ /* send MSG_RING without having a ring */
+ IORING_REGISTER_SEND_MSG_RING = 31,
+
+ /* 32 reserved for zc rx */
+
+ /* resize CQ ring */
+ IORING_REGISTER_RESIZE_RINGS = 33,
+
+ IORING_REGISTER_MEM_REGION = 34,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -632,6 +649,31 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
+enum {
+ /* initialise with user provided memory pointed by user_addr */
+ IORING_MEM_REGION_TYPE_USER = 1,
+};
+
+struct io_uring_region_desc {
+ __u64 user_addr;
+ __u64 size;
+ __u32 flags;
+ __u32 id;
+ __u64 mmap_offset;
+ __u64 __resv[4];
+};
+
+enum {
+ /* expose the region as registered wait arguments */
+ IORING_MEM_REGION_REG_WAIT_ARG = 1,
+};
+
+struct io_uring_mem_region_reg {
+ __u64 region_uptr; /* struct io_uring_region_desc * */
+ __u64 flags;
+ __u64 __resv[2];
+};
+
/*
* Register a fully sparse file space, rather than pass in an array of all
* -1 file descriptors.
@@ -698,13 +740,17 @@ struct io_uring_clock_register {
};
enum {
- IORING_REGISTER_SRC_REGISTERED = 1,
+ IORING_REGISTER_SRC_REGISTERED = (1U << 0),
+ IORING_REGISTER_DST_REPLACE = (1U << 1),
};
struct io_uring_clone_buffers {
__u32 src_fd;
__u32 flags;
- __u32 pad[6];
+ __u32 src_off;
+ __u32 dst_off;
+ __u32 nr;
+ __u32 pad[3];
};
struct io_uring_buf {
@@ -768,12 +814,40 @@ struct io_uring_buf_status {
__u32 resv[8];
};
+enum io_uring_napi_op {
+ /* register/ungister backward compatible opcode */
+ IO_URING_NAPI_REGISTER_OP = 0,
+
+ /* opcodes to update napi_list when static tracking is used */
+ IO_URING_NAPI_STATIC_ADD_ID = 1,
+ IO_URING_NAPI_STATIC_DEL_ID = 2
+};
+
+enum io_uring_napi_tracking_strategy {
+ /* value must be 0 for backward compatibility */
+ IO_URING_NAPI_TRACKING_DYNAMIC = 0,
+ IO_URING_NAPI_TRACKING_STATIC = 1,
+ IO_URING_NAPI_TRACKING_INACTIVE = 255
+};
+
/* argument for IORING_(UN)REGISTER_NAPI */
struct io_uring_napi {
__u32 busy_poll_to;
__u8 prefer_busy_poll;
- __u8 pad[3];
- __u64 resv;
+
+ /* a io_uring_napi_op value */
+ __u8 opcode;
+ __u8 pad[2];
+
+ /*
+ * for IO_URING_NAPI_REGISTER_OP, it is a
+ * io_uring_napi_tracking_strategy value.
+ *
+ * for IO_URING_NAPI_STATIC_ADD_ID/IO_URING_NAPI_STATIC_DEL_ID
+ * it is the napi id to add/del from napi_list.
+ */
+ __u32 op_param;
+ __u32 resv;
};
/*
@@ -795,6 +869,43 @@ enum io_uring_register_restriction_op {
IORING_RESTRICTION_LAST
};
+enum {
+ IORING_REG_WAIT_TS = (1U << 0),
+};
+
+/*
+ * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
+ * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
+ * called rather than pass in a wait argument structure separately.
+ */
+struct io_uring_cqwait_reg_arg {
+ __u32 flags;
+ __u32 struct_size;
+ __u32 nr_entries;
+ __u32 pad;
+ __u64 user_addr;
+ __u64 pad2[3];
+};
+
+/*
+ * Argument for io_uring_enter(2) with
+ * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
+ * is an index into a previously registered fixed wait region described by
+ * the below structure.
+ */
+struct io_uring_reg_wait {
+ struct __kernel_timespec ts;
+ __u32 min_wait_usec;
+ __u32 flags;
+ __u64 sigmask;
+ __u32 sigmask_sz;
+ __u32 pad[3];
+ __u64 pad2[2];
+};
+
+/*
+ * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
+ */
struct io_uring_getevents_arg {
__u64 sigmask;
__u32 sigmask_sz;