author     Jonathan Corbet <corbet@lwn.net>  2024-12-13 09:34:13 -0700
committer  Jonathan Corbet <corbet@lwn.net>  2024-12-13 09:34:13 -0700
commit     5fad5e0a6014c824fbbf1dec9f76eb11581d8ee7 (patch)
tree       db4cc60d0b71f3a4ba61f74731074de2b4388148
parent     92b3d24de890ced604a741ab3cb9095c210be2c9 (diff)
parent     5c14b68596e748241d84431628c0b8aea41db2be (diff)
Merge branch 'docs-mw' into docs-next
-rw-r--r--  Documentation/accounting/taskstats-struct.rst      2
-rw-r--r--  Documentation/admin-guide/blockdev/zram.rst         6
-rw-r--r--  Documentation/admin-guide/index.rst                 1
-rw-r--r--  Documentation/admin-guide/nvme-multipath.rst       72
-rw-r--r--  Documentation/core-api/kref.rst                     7
-rw-r--r--  Documentation/filesystems/iomap/operations.rst      2
-rw-r--r--  Documentation/filesystems/overlayfs.rst             2
-rw-r--r--  Documentation/filesystems/porting.rst               2
-rw-r--r--  Documentation/scheduler/sched-deadline.rst         13
-rw-r--r--  Documentation/scheduler/sched-rt-group.rst          8
-rw-r--r--  Documentation/trace/rv/runtime-verification.rst     4
-rw-r--r--  include/linux/kref.h                               48
12 files changed, 133 insertions, 34 deletions
diff --git a/Documentation/accounting/taskstats-struct.rst b/Documentation/accounting/taskstats-struct.rst
index ca90fd489c9a..acca51c34157 100644
--- a/Documentation/accounting/taskstats-struct.rst
+++ b/Documentation/accounting/taskstats-struct.rst
@@ -47,7 +47,7 @@ should not change the relative position of each field within the struct.
1) Common and basic accounting fields::
/* The version number of this struct. This field is always set to
- * TAKSTATS_VERSION, which is defined in <linux/taskstats.h>.
+ * TASKSTATS_VERSION, which is defined in <linux/taskstats.h>.
* Each time the struct is changed, the value should be incremented.
*/
__u16 version;
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
index 714a5171bfc0..1576fb93f06c 100644
--- a/Documentation/admin-guide/blockdev/zram.rst
+++ b/Documentation/admin-guide/blockdev/zram.rst
@@ -121,14 +121,14 @@ compression algorithm to use external pre-trained dictionary, pass full
path to the `dict` along with other parameters::
#pass path to pre-trained zstd dictionary
- echo "algo=zstd dict=/etc/dictioary" > /sys/block/zram0/algorithm_params
+ echo "algo=zstd dict=/etc/dictionary" > /sys/block/zram0/algorithm_params
#same, but using algorithm priority
- echo "priority=1 dict=/etc/dictioary" > \
+ echo "priority=1 dict=/etc/dictionary" > \
/sys/block/zram0/algorithm_params
#pass path to pre-trained zstd dictionary and compression level
- echo "algo=zstd level=8 dict=/etc/dictioary" > \
+ echo "algo=zstd level=8 dict=/etc/dictionary" > \
/sys/block/zram0/algorithm_params
Parameters are algorithm specific: not all algorithms support pre-trained
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index e85b1adf5908..15a522a96e76 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -136,6 +136,7 @@ configure specific aspects of kernel behavior to your liking.
vga-softcursor
video-output
xfs
+ nvme-multipath
.. only:: subproject and html
diff --git a/Documentation/admin-guide/nvme-multipath.rst b/Documentation/admin-guide/nvme-multipath.rst
new file mode 100644
index 000000000000..97ca1ccef459
--- /dev/null
+++ b/Documentation/admin-guide/nvme-multipath.rst
@@ -0,0 +1,72 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Linux NVMe multipath
+====================
+
+This document describes NVMe multipath and the path selection policies
+supported by the Linux NVMe host driver.
+
+
+Introduction
+============
+
+The NVMe multipath feature in Linux integrates namespaces with the same
+identifier into a single block device. Using multipath enhances the reliability
+and stability of I/O access while improving bandwidth performance. When a user
+sends I/O to this merged block device, the multipath mechanism selects one of
+the underlying block devices (paths) according to the configured policy.
+Different policies result in different path selections.
+
+
+Policies
+========
+
+All policies follow the ANA (Asymmetric Namespace Access) mechanism, meaning
+that when an optimized path is available, it will be chosen over a non-optimized
+one. Currently, the NVMe multipath policies include numa (default),
+round-robin, and queue-depth.
+
+To set the desired policy (e.g., round-robin), use one of the following methods:
+ 1. echo -n "round-robin" > /sys/module/nvme_core/parameters/iopolicy
+ 2. add "nvme_core.iopolicy=round-robin" to the kernel command line.
+
+
+NUMA
+----
+
+The NUMA policy selects the path closest to the NUMA node of the current CPU for
+I/O distribution. This policy maintains the nearest paths to each NUMA node
+based on network interface connections.
+
+When to use the NUMA policy:
+ 1. Multi-core Systems: Optimizes memory access in multi-core and
+ multi-processor systems, especially under NUMA architecture.
+ 2. High Affinity Workloads: Binds I/O processing to the CPU to reduce
+ communication and data transfer delays across nodes.
+
+
+Round-Robin
+-----------
+
+The round-robin policy distributes I/O requests evenly across all paths to
+enhance throughput and resource utilization. Each I/O operation is sent to the
+next path in sequence.
+
+When to use the round-robin policy:
+ 1. Balanced Workloads: Effective for balanced and predictable workloads with
+ similar I/O size and type.
+ 2. Homogeneous Path Performance: Utilizes all paths efficiently when
+ performance characteristics (e.g., latency, bandwidth) are similar.
+
+
+Queue-Depth
+-----------
+
+The queue-depth policy manages I/O requests based on the current queue depth
+of each path, selecting the path with the least number of in-flight I/Os.
+
+When to use the queue-depth policy:
+ 1. High load with small I/Os: Effectively balances load across paths when
+ the load is high and I/O operations consist of small, relatively
+ fixed-size requests.
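As a rough sketch of the queue-depth selection described above: prefer ANA-optimized paths, and among those pick the one with the fewest in-flight I/Os. The ``struct path`` layout and ``inflight`` counter below are hypothetical illustrations, not the NVMe host driver's actual data structures::

    struct path {
        int inflight;    /* I/Os currently outstanding on this path */
        bool optimized;  /* ANA-optimized path */
    };

    /* Pick an ANA-optimized path with the fewest in-flight I/Os. */
    static struct path *pick_queue_depth_path(struct path *paths, int npaths)
    {
        struct path *best = NULL;
        int i;

        for (i = 0; i < npaths; i++) {
            struct path *p = &paths[i];

            /* Never trade an optimized path for a non-optimized one. */
            if (best && best->optimized && !p->optimized)
                continue;
            if (!best || (p->optimized && !best->optimized) ||
                p->inflight < best->inflight)
                best = p;
        }
        return best;
    }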
diff --git a/Documentation/core-api/kref.rst b/Documentation/core-api/kref.rst
index c61eea6f1bf2..8db9ff03d952 100644
--- a/Documentation/core-api/kref.rst
+++ b/Documentation/core-api/kref.rst
@@ -3,7 +3,7 @@ Adding reference counters (krefs) to kernel objects
===================================================
:Author: Corey Minyard <minyard@acm.org>
-:Author: Thomas Hellstrom <thellstrom@vmware.com>
+:Author: Thomas Hellström <thomas.hellstrom@linux.intel.com>
A lot of this was lifted from Greg Kroah-Hartman's 2004 OLS paper and
presentation on krefs, which can be found at:
@@ -321,3 +321,8 @@ rcu grace period after release_entry_rcu was called. That can be accomplished
by using kfree_rcu(entry, rhead) as done above, or by calling synchronize_rcu()
before using kfree, but note that synchronize_rcu() may sleep for a
substantial amount of time.
+
+Functions and structures
+========================
+
+.. kernel-doc:: include/linux/kref.h
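With the kernel-doc now pulled into kref.rst, a minimal usage sketch may help readers; ``struct my_data`` and its helpers are hypothetical, while kref_init(), kref_get(), kref_put() and container_of() are the real API::

    #include <linux/kref.h>
    #include <linux/slab.h>

    struct my_data {
        struct kref refcount;
        int payload;
    };

    static void my_data_release(struct kref *kref)
    {
        struct my_data *d = container_of(kref, struct my_data, refcount);

        kfree(d);
    }

    static struct my_data *my_data_alloc(void)
    {
        struct my_data *d = kzalloc(sizeof(*d), GFP_KERNEL);

        if (d)
            kref_init(&d->refcount);  /* refcount starts at 1 */
        return d;
    }

    static void my_data_put(struct my_data *d)
    {
        /* my_data_release() runs when the last reference drops. */
        kref_put(&d->refcount, my_data_release);
    }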
diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index ef082e5a4e0c..2c7f5df9d8b0 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -104,7 +104,7 @@ iomap calls these functions:
For the pagecache, races can happen if writeback doesn't take
``i_rwsem`` or ``invalidate_lock`` and updates mapping information.
- Races can also happen if the filesytem allows concurrent writes.
+ Races can also happen if the filesystem allows concurrent writes.
For such files, the mapping *must* be revalidated after the folio
lock has been taken so that iomap can manage the folio correctly.
diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst
index 4c8387e1c880..d2a277e3976e 100644
--- a/Documentation/filesystems/overlayfs.rst
+++ b/Documentation/filesystems/overlayfs.rst
@@ -156,7 +156,7 @@ A directory is made opaque by setting the xattr "trusted.overlay.opaque"
to "y". Where the upper filesystem contains an opaque directory, any
directory in the lower filesystem with the same name is ignored.
-An opaque directory should not conntain any whiteouts, because they do not
+An opaque directory should not contain any whiteouts, because they do not
serve any purpose. A merge directory containing regular files with the xattr
"trusted.overlay.whiteout", should be additionally marked by setting the xattr
"trusted.overlay.opaque" to "x" on the merge directory itself.
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 9ab2a3d6f2b4..c1c121055204 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -313,7 +313,7 @@ done.
**mandatory**
-block truncatation on error exit from ->write_begin, and ->direct_IO
+block truncation on error exit from ->write_begin, and ->direct_IO
moved from generic methods (block_write_begin, cont_write_begin,
nobh_write_begin, blockdev_direct_IO*) to callers. Take a look at
ext2_write_failed and callers for an example.
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
index 22838ed8e13a..a727827b8dd5 100644
--- a/Documentation/scheduler/sched-deadline.rst
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -591,12 +591,13 @@ Deadline Task Scheduling
The system wide settings are configured under the /proc virtual file system.
- For now the -rt knobs are used for -deadline admission control and the
- -deadline runtime is accounted against the -rt runtime. We realize that this
- isn't entirely desirable; however, it is better to have a small interface for
- now, and be able to change it easily later. The ideal situation (see 5.) is to
- run -rt tasks from a -deadline server; in which case the -rt bandwidth is a
- direct subset of dl_bw.
+ For now the -rt knobs are used for -deadline admission control and, with
+ CONFIG_RT_GROUP_SCHED, the -deadline runtime is accounted against the (root)
+ -rt runtime. With !CONFIG_RT_GROUP_SCHED the knob is used only for -deadline
+ admission control. We realize that this isn't entirely desirable; however, it
+ is better to have a small interface for now, and be able to change it easily
+ later. The ideal situation (see 5.) is to run -rt tasks from a -deadline
+ server; in which case the -rt bandwidth is a direct subset of dl_bw.
This means that, for a root_domain comprising M CPUs, -deadline tasks
can be created while the sum of their bandwidths stays below:
diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst
index d685609ed3d7..80b05a3009ea 100644
--- a/Documentation/scheduler/sched-rt-group.rst
+++ b/Documentation/scheduler/sched-rt-group.rst
@@ -92,10 +92,10 @@ The system wide settings are configured under the /proc virtual file system:
/proc/sys/kernel/sched_rt_runtime_us:
A global limit on how much time real-time scheduling may use. This is always
less or equal to the period_us, as it denotes the time allocated from the
- period_us for the real-time tasks. Even without CONFIG_RT_GROUP_SCHED enabled,
- this will limit time reserved to real-time processes. With
- CONFIG_RT_GROUP_SCHED=y it signifies the total bandwidth available to all
- real-time groups.
+ period_us for the real-time tasks. Without CONFIG_RT_GROUP_SCHED enabled,
+ this is used only for admission control of deadline tasks. With
+ CONFIG_RT_GROUP_SCHED=y it also signifies the total bandwidth available to
+ all real-time groups.
* Time is specified in us because the interface is s32. This gives an
operating range from 1us to about 35 minutes.
diff --git a/Documentation/trace/rv/runtime-verification.rst b/Documentation/trace/rv/runtime-verification.rst
index dae78dfa7cdc..c700dde9259c 100644
--- a/Documentation/trace/rv/runtime-verification.rst
+++ b/Documentation/trace/rv/runtime-verification.rst
@@ -8,14 +8,14 @@ checking* and *theorem proving*) with a more practical approach for complex
systems.
Instead of relying on a fine-grained model of a system (e.g., a
-re-implementation a instruction level), RV works by analyzing the trace of the
+re-implementation at the instruction level), RV works by analyzing the trace of the
system's actual execution, comparing it against a formal specification of
the system behavior.
The main advantage is that RV can give precise information on the runtime
behavior of the monitored system, without the pitfalls of developing models
that require a re-implementation of the entire system in a modeling language.
-Moreover, given an efficient monitoring method, it is possible execute an
+Moreover, given an efficient monitoring method, it is possible to execute an
*online* verification of a system, enabling the *reaction* for unexpected
events, avoiding, for example, the propagation of a failure on safety-critical
systems.
diff --git a/include/linux/kref.h b/include/linux/kref.h
index d32e21a2538c..88e82ab1367c 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -46,18 +46,18 @@ static inline void kref_get(struct kref *kref)
}
/**
- * kref_put - decrement refcount for object.
- * @kref: object.
- * @release: pointer to the function that will clean up the object when the
+ * kref_put - Decrement refcount for object
+ * @kref: Object
+ * @release: Pointer to the function that will clean up the object when the
* last reference to the object is released.
- * This pointer is required, and it is not acceptable to pass kfree
- * in as this function.
*
- * Decrement the refcount, and if 0, call release().
- * Return 1 if the object was removed, otherwise return 0. Beware, if this
- * function returns 0, you still can not count on the kref from remaining in
- * memory. Only use the return value if you want to see if the kref is now
- * gone, not present.
+ * Decrement the refcount, and if 0, call @release. The caller may not
+ * pass NULL or kfree() as the release function.
+ *
+ * Return: 1 if this call removed the object, otherwise return 0. Beware,
+ * if this function returns 0, another caller may have removed the object
+ * by the time this function returns. Use the return value only to check
+ * that the object is definitely gone, never to conclude it is still present.
*/
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
@@ -68,17 +68,37 @@ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)
return 0;
}
+/**
+ * kref_put_mutex - Decrement refcount for object
+ * @kref: Object
+ * @release: Pointer to the function that will clean up the object when the
+ * last reference to the object is released.
+ * @mutex: Mutex which protects the release function.
+ *
+ * This variant of kref_put() calls the @release function with the @mutex
+ * held. The @release function will release the mutex.
+ */
static inline int kref_put_mutex(struct kref *kref,
void (*release)(struct kref *kref),
- struct mutex *lock)
+ struct mutex *mutex)
{
- if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) {
+ if (refcount_dec_and_mutex_lock(&kref->refcount, mutex)) {
release(kref);
return 1;
}
return 0;
}
+/**
+ * kref_put_lock - Decrement refcount for object
+ * @kref: Object
+ * @release: Pointer to the function that will clean up the object when the
+ * last reference to the object is released.
+ * @lock: Spinlock which protects the release function.
+ *
+ * This variant of kref_put() calls the @release function with the @lock
+ * held. The @release function will release the lock.
+ */
static inline int kref_put_lock(struct kref *kref,
void (*release)(struct kref *kref),
spinlock_t *lock)
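The mutex-held release contract documented above is easy to get wrong, so a hedged sketch may help; ``struct my_obj`` and ``obj_mutex`` are made-up names, while the requirement that @release drop the mutex comes from the kernel-doc itself::

    #include <linux/kref.h>
    #include <linux/mutex.h>
    #include <linux/slab.h>

    static DEFINE_MUTEX(obj_mutex);

    struct my_obj {
        struct kref refcount;
    };

    static void my_obj_release(struct kref *kref)
    {
        struct my_obj *obj = container_of(kref, struct my_obj, refcount);

        /* Called with obj_mutex held; the release function must drop it. */
        mutex_unlock(&obj_mutex);
        kfree(obj);
    }

    static void my_obj_put(struct my_obj *obj)
    {
        /* Takes obj_mutex only if the refcount is about to hit zero. */
        kref_put_mutex(&obj->refcount, my_obj_release, &obj_mutex);
    }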
@@ -94,8 +114,6 @@ static inline int kref_put_lock(struct kref *kref,
* kref_get_unless_zero - Increment refcount for object unless it is zero.
* @kref: object.
*
- * Return non-zero if the increment succeeded. Otherwise return 0.
- *
* This function is intended to simplify locking around refcounting for
* objects that can be looked up from a lookup structure, and which are
* removed from that lookup structure in the object destructor.
@@ -105,6 +123,8 @@ static inline int kref_put_lock(struct kref *kref,
* With a lookup followed by a kref_get_unless_zero *with return value check*
* locking in the kref_put path can be deferred to the actual removal from
* the lookup structure and RCU lookups become trivial.
+ *
+ * Return: non-zero if the increment succeeded. Otherwise return 0.
*/
static inline int __must_check kref_get_unless_zero(struct kref *kref)
{
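The lookup pattern this kernel-doc describes can be sketched as follows; the global list, its spinlock and ``struct entry`` are hypothetical stand-ins (kref.rst's own example uses RCU instead), while kref_get_unless_zero() is used exactly as documented::

    #include <linux/kref.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct entry {
        struct list_head head;
        struct kref refcount;
        int key;
    };

    static DEFINE_SPINLOCK(entry_lock);
    static LIST_HEAD(entries);

    static struct entry *entry_lookup(int key)
    {
        struct entry *e, *found = NULL;

        spin_lock(&entry_lock);
        list_for_each_entry(e, &entries, head) {
            /* Skip objects whose refcount already dropped to zero. */
            if (e->key == key && kref_get_unless_zero(&e->refcount)) {
                found = e;
                break;
            }
        }
        spin_unlock(&entry_lock);
        return found;
    }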