summaryrefslogtreecommitdiff
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/Makefile12
-rw-r--r--Documentation/RCU/Design/Requirements/Requirements.html130
-rw-r--r--Documentation/RCU/checklist.txt121
-rw-r--r--Documentation/RCU/rcu.txt9
-rw-r--r--Documentation/RCU/rcu_dereference.txt61
-rw-r--r--Documentation/RCU/rcubarrier.txt5
-rw-r--r--Documentation/RCU/torture.txt20
-rw-r--r--Documentation/RCU/whatisRCU.txt5
-rw-r--r--Documentation/admin-guide/kernel-parameters.rst1
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt30
-rw-r--r--Documentation/arm/firmware.txt2
-rw-r--r--Documentation/atomic_bitops.txt66
-rw-r--r--Documentation/atomic_t.txt242
-rw-r--r--Documentation/conf.py6
-rw-r--r--Documentation/core-api/genalloc.rst144
-rw-r--r--Documentation/core-api/index.rst1
-rw-r--r--Documentation/core-api/kernel-api.rst49
-rw-r--r--Documentation/dev-tools/gdb-kernel-debugging.rst6
-rw-r--r--Documentation/dev-tools/kgdb.rst11
-rw-r--r--Documentation/devicetree/bindings/display/bridge/dw_mipi_dsi.txt32
-rw-r--r--Documentation/devicetree/bindings/display/exynos/exynos5433-decon.txt12
-rw-r--r--Documentation/devicetree/bindings/display/repaper.txt52
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt7
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt4
-rw-r--r--Documentation/devicetree/bindings/display/sitronix,st7586.txt22
-rw-r--r--Documentation/devicetree/bindings/display/st,stm32-ltdc.txt105
-rw-r--r--Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt36
-rw-r--r--Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt9
-rw-r--r--Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt21
-rw-r--r--Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt10
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/fsl,ls-scfg-msi.txt8
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/socionext,uniphier-aidet.txt32
-rw-r--r--Documentation/devicetree/bindings/net/dwmac-sun8i.txt84
-rw-r--r--Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt28
-rw-r--r--Documentation/devicetree/bindings/timer/renesas,cmt.txt73
-rw-r--r--Documentation/devicetree/bindings/vendor-prefixes.txt1
-rw-r--r--Documentation/doc-guide/sphinx.rst107
-rw-r--r--Documentation/driver-api/basics.rst5
-rw-r--r--Documentation/driver-api/dma-buf.rst3
-rw-r--r--Documentation/driver-api/miscellaneous.rst1
-rw-r--r--Documentation/driver-api/s390-drivers.rst2
-rw-r--r--Documentation/driver-api/scsi.rst8
-rw-r--r--Documentation/driver-model/devres.txt1
-rw-r--r--Documentation/fb/efifb.txt6
-rw-r--r--Documentation/features/core/tracehook/arch-support.txt2
-rw-r--r--Documentation/filesystems/vfs.txt4
-rw-r--r--Documentation/gpio/gpio-legacy.txt2
-rw-r--r--Documentation/gpu/drm-internals.rst2
-rw-r--r--Documentation/gpu/drm-kms-helpers.rst9
-rw-r--r--Documentation/gpu/drm-kms.rst59
-rw-r--r--Documentation/gpu/drm-mm.rst4
-rw-r--r--Documentation/gpu/drm-uapi.rst2
-rw-r--r--Documentation/gpu/i915.rst18
-rw-r--r--Documentation/gpu/todo.rst4
-rw-r--r--Documentation/hwmon/ftsteutates4
-rw-r--r--Documentation/hwmon/ibm-cffps54
-rw-r--r--Documentation/hwmon/lm250669
-rw-r--r--Documentation/infiniband/tag_matching.txt64
-rw-r--r--Documentation/input/input.rst2
-rw-r--r--Documentation/input/joydev/index.rst1
-rw-r--r--Documentation/kbuild/makefiles.txt6
-rw-r--r--Documentation/locking/crossrelease.txt874
-rw-r--r--Documentation/locking/rt-mutex-design.txt432
-rw-r--r--Documentation/locking/rt-mutex.txt58
-rw-r--r--Documentation/media/uapi/cec/cec-funcs.rst1
-rw-r--r--Documentation/memory-barriers.txt142
-rw-r--r--Documentation/networking/ieee802154.txt16
-rw-r--r--Documentation/networking/switchdev.txt4
-rw-r--r--Documentation/nvmem/nvmem.txt2
-rw-r--r--Documentation/printk-formats.txt29
-rw-r--r--Documentation/process/applying-patches.rst43
-rw-r--r--Documentation/process/changes.rst16
-rw-r--r--Documentation/process/stable-kernel-rules.rst4
-rw-r--r--Documentation/process/submitting-patches.rst2
-rw-r--r--Documentation/security/keys/core.rst16
-rw-r--r--Documentation/security/keys/request-key.rst2
-rw-r--r--Documentation/security/keys/trusted-encrypted.rst2
-rw-r--r--Documentation/sphinx-static/theme_overrides.css17
-rw-r--r--Documentation/sphinx/kerneldoc.py8
-rw-r--r--Documentation/sphinx/requirements.txt3
-rw-r--r--Documentation/static-keys.txt20
-rw-r--r--Documentation/sysctl/kernel.txt13
-rw-r--r--Documentation/sysctl/net.txt47
-rw-r--r--Documentation/translations/ko_KR/memory-barriers.txt5
-rw-r--r--Documentation/translations/zh_CN/HOWTO2
-rw-r--r--Documentation/x86/amd-memory-encryption.txt68
-rw-r--r--Documentation/x86/early-microcode.txt70
-rw-r--r--Documentation/x86/intel_rdt_ui.txt323
-rw-r--r--Documentation/x86/microcode.txt137
-rw-r--r--Documentation/x86/orc-unwinder.txt179
-rw-r--r--Documentation/x86/protection-keys.txt6
-rw-r--r--Documentation/x86/x86_64/5level-paging.txt64
92 files changed, 3391 insertions, 1050 deletions
diff --git a/Documentation/Makefile b/Documentation/Makefile
index a42320385df3..85f7856f0092 100644
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -22,6 +22,8 @@ ifeq ($(HAVE_SPHINX),0)
.DEFAULT:
$(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.)
+ @echo
+ @./scripts/sphinx-pre-install
@echo " SKIP Sphinx $@ target."
else # HAVE_SPHINX
@@ -95,16 +97,6 @@ endif # HAVE_SPHINX
# The following targets are independent of HAVE_SPHINX, and the rules should
# work or silently pass without Sphinx.
-# no-ops for the Sphinx toolchain
-sgmldocs:
- @:
-psdocs:
- @:
-mandocs:
- @:
-installmandocs:
- @:
-
cleandocs:
$(Q)rm -rf $(BUILDDIR)
$(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean
diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html
index 95b30fa25d56..62e847bcdcdd 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -2080,6 +2080,8 @@ Some of the relevant points of interest are as follows:
<li> <a href="#Scheduler and RCU">Scheduler and RCU</a>.
<li> <a href="#Tracing and RCU">Tracing and RCU</a>.
<li> <a href="#Energy Efficiency">Energy Efficiency</a>.
+<li> <a href="#Scheduling-Clock Interrupts and RCU">
+ Scheduling-Clock Interrupts and RCU</a>.
<li> <a href="#Memory Efficiency">Memory Efficiency</a>.
<li> <a href="#Performance, Scalability, Response Time, and Reliability">
Performance, Scalability, Response Time, and Reliability</a>.
@@ -2532,6 +2534,134 @@ I learned of many of these requirements via angry phone calls:
Flaming me on the Linux-kernel mailing list was apparently not
sufficient to fully vent their ire at RCU's energy-efficiency bugs!
+<h3><a name="Scheduling-Clock Interrupts and RCU">
+Scheduling-Clock Interrupts and RCU</a></h3>
+
+<p>
+The kernel transitions between in-kernel non-idle execution, userspace
+execution, and the idle loop.
+Depending on kernel configuration, RCU handles these states differently:
+
+<table border=3>
+<tr><th><tt>HZ</tt> Kconfig</th>
+ <th>In-Kernel</th>
+ <th>Usermode</th>
+ <th>Idle</th></tr>
+<tr><th align="left"><tt>HZ_PERIODIC</tt></th>
+ <td>Can rely on scheduling-clock interrupt.</td>
+ <td>Can rely on scheduling-clock interrupt and its
+ detection of interrupt from usermode.</td>
+ <td>Can rely on RCU's dyntick-idle detection.</td></tr>
+<tr><th align="left"><tt>NO_HZ_IDLE</tt></th>
+ <td>Can rely on scheduling-clock interrupt.</td>
+ <td>Can rely on scheduling-clock interrupt and its
+ detection of interrupt from usermode.</td>
+ <td>Can rely on RCU's dyntick-idle detection.</td></tr>
+<tr><th align="left"><tt>NO_HZ_FULL</tt></th>
+ <td>Can only sometimes rely on scheduling-clock interrupt.
+ In other cases, it is necessary to bound kernel execution
+ times and/or use IPIs.</td>
+ <td>Can rely on RCU's dyntick-idle detection.</td>
+ <td>Can rely on RCU's dyntick-idle detection.</td></tr>
+</table>
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ Why can't <tt>NO_HZ_FULL</tt> in-kernel execution rely on the
+ scheduling-clock interrupt, just like <tt>HZ_PERIODIC</tt>
+ and <tt>NO_HZ_IDLE</tt> do?
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ Because, as a performance optimization, <tt>NO_HZ_FULL</tt>
+ does not necessarily re-enable the scheduling-clock interrupt
+ on entry to each and every system call.
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<p>
+However, RCU must be reliably informed as to whether any given
+CPU is currently in the idle loop, and, for <tt>NO_HZ_FULL</tt>,
+also whether that CPU is executing in usermode, as discussed
+<a href="#Energy Efficiency">earlier</a>.
+It also requires that the scheduling-clock interrupt be enabled when
+RCU needs it to be:
+
+<ol>
+<li> If a CPU is either idle or executing in usermode, and RCU believes
+ it is non-idle, the scheduling-clock tick had better be running.
+ Otherwise, you will get RCU CPU stall warnings. Or at best,
+ very long (11-second) grace periods, with a pointless IPI waking
+ the CPU from time to time.
+<li> If a CPU is in a portion of the kernel that executes RCU read-side
+ critical sections, and RCU believes this CPU to be idle, you will get
+ random memory corruption. <b>DON'T DO THIS!!!</b>
+
+ <br>This is one reason to test with lockdep, which will complain
+ about this sort of thing.
+<li> If a CPU is in a portion of the kernel that is absolutely
+ positively no-joking guaranteed to never execute any RCU read-side
+ critical sections, and RCU believes this CPU to to be idle,
+ no problem. This sort of thing is used by some architectures
+ for light-weight exception handlers, which can then avoid the
+ overhead of <tt>rcu_irq_enter()</tt> and <tt>rcu_irq_exit()</tt>
+ at exception entry and exit, respectively.
+ Some go further and avoid the entireties of <tt>irq_enter()</tt>
+ and <tt>irq_exit()</tt>.
+
+ <br>Just make very sure you are running some of your tests with
+ <tt>CONFIG_PROVE_RCU=y</tt>, just in case one of your code paths
+ was in fact joking about not doing RCU read-side critical sections.
+<li> If a CPU is executing in the kernel with the scheduling-clock
+ interrupt disabled and RCU believes this CPU to be non-idle,
+ and if the CPU goes idle (from an RCU perspective) every few
+ jiffies, no problem. It is usually OK for there to be the
+ occasional gap between idle periods of up to a second or so.
+
+ <br>If the gap grows too long, you get RCU CPU stall warnings.
+<li> If a CPU is either idle or executing in usermode, and RCU believes
+ it to be idle, of course no problem.
+<li> If a CPU is executing in the kernel, the kernel code
+ path is passing through quiescent states at a reasonable
+ frequency (preferably about once per few jiffies, but the
+ occasional excursion to a second or so is usually OK) and the
+ scheduling-clock interrupt is enabled, of course no problem.
+
+ <br>If the gap between a successive pair of quiescent states grows
+ too long, you get RCU CPU stall warnings.
+</ol>
+
+<table>
+<tr><th>&nbsp;</th></tr>
+<tr><th align="left">Quick Quiz:</th></tr>
+<tr><td>
+ But what if my driver has a hardware interrupt handler
+ that can run for many seconds?
+ I cannot invoke <tt>schedule()</tt> from an hardware
+ interrupt handler, after all!
+</td></tr>
+<tr><th align="left">Answer:</th></tr>
+<tr><td bgcolor="#ffffff"><font color="ffffff">
+ One approach is to do <tt>rcu_irq_exit();rcu_irq_enter();</tt>
+ every so often.
+ But given that long-running interrupt handlers can cause
+ other problems, not least for response time, shouldn't you
+ work to keep your interrupt handler's runtime within reasonable
+ bounds?
+</font></td></tr>
+<tr><td>&nbsp;</td></tr>
+</table>
+
+<p>
+But as long as RCU is properly informed of kernel state transitions between
+in-kernel execution, usermode execution, and idle, and as long as the
+scheduling-clock interrupt is enabled when RCU needs it to be, you
+can rest assured that the bugs you encounter will be in some other
+part of RCU or some other part of the kernel!
+
<h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
<p>
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 6beda556faf3..49747717d905 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -23,6 +23,14 @@ over a rather long period of time, but improvements are always welcome!
Yet another exception is where the low real-time latency of RCU's
read-side primitives is critically important.
+ One final exception is where RCU readers are used to prevent
+ the ABA problem (https://en.wikipedia.org/wiki/ABA_problem)
+ for lockless updates. This does result in the mildly
+ counter-intuitive situation where rcu_read_lock() and
+ rcu_read_unlock() are used to protect updates, however, this
+ approach provides the same potential simplifications that garbage
+ collectors do.
+
1. Does the update code have proper mutual exclusion?
RCU does allow -readers- to run (almost) naked, but -writers- must
@@ -40,7 +48,9 @@ over a rather long period of time, but improvements are always welcome!
explain how this single task does not become a major bottleneck on
big multiprocessor machines (for example, if the task is updating
information relating to itself that other tasks can read, there
- by definition can be no bottleneck).
+ by definition can be no bottleneck). Note that the definition
+ of "large" has changed significantly: Eight CPUs was "large"
+ in the year 2000, but a hundred CPUs was unremarkable in 2017.
2. Do the RCU read-side critical sections make proper use of
rcu_read_lock() and friends? These primitives are needed
@@ -55,6 +65,12 @@ over a rather long period of time, but improvements are always welcome!
Disabling of preemption can serve as rcu_read_lock_sched(), but
is less readable.
+ Letting RCU-protected pointers "leak" out of an RCU read-side
+ critical section is every bid as bad as letting them leak out
+ from under a lock. Unless, of course, you have arranged some
+ other means of protection, such as a lock or a reference count
+ -before- letting them out of the RCU read-side critical section.
+
3. Does the update code tolerate concurrent accesses?
The whole point of RCU is to permit readers to run without
@@ -78,10 +94,10 @@ over a rather long period of time, but improvements are always welcome!
This works quite well, also.
- c. Make updates appear atomic to readers. For example,
+ c. Make updates appear atomic to readers. For example,
pointer updates to properly aligned fields will
appear atomic, as will individual atomic primitives.
- Sequences of perations performed under a lock will -not-
+ Sequences of operations performed under a lock will -not-
appear to be atomic to RCU readers, nor will sequences
of multiple atomic primitives.
@@ -168,8 +184,8 @@ over a rather long period of time, but improvements are always welcome!
5. If call_rcu(), or a related primitive such as call_rcu_bh(),
call_rcu_sched(), or call_srcu() is used, the callback function
- must be written to be called from softirq context. In particular,
- it cannot block.
+ will be called from softirq context. In particular, it cannot
+ block.
6. Since synchronize_rcu() can block, it cannot be called from
any sort of irq context. The same rule applies for
@@ -178,11 +194,14 @@ over a rather long period of time, but improvements are always welcome!
synchronize_sched_expedite(), and synchronize_srcu_expedited().
The expedited forms of these primitives have the same semantics
- as the non-expedited forms, but expediting is both expensive
- and unfriendly to real-time workloads. Use of the expedited
- primitives should be restricted to rare configuration-change
- operations that would not normally be undertaken while a real-time
- workload is running.
+ as the non-expedited forms, but expediting is both expensive and
+ (with the exception of synchronize_srcu_expedited()) unfriendly
+ to real-time workloads. Use of the expedited primitives should
+ be restricted to rare configuration-change operations that would
+ not normally be undertaken while a real-time workload is running.
+ However, real-time workloads can use rcupdate.rcu_normal kernel
+ boot parameter to completely disable expedited grace periods,
+ though this might have performance implications.
In particular, if you find yourself invoking one of the expedited
primitives repeatedly in a loop, please do everyone a favor:
@@ -193,11 +212,6 @@ over a rather long period of time, but improvements are always welcome!
of the system, especially to real-time workloads running on
the rest of the system.
- In addition, it is illegal to call the expedited forms from
- a CPU-hotplug notifier, or while holding a lock that is acquired
- by a CPU-hotplug notifier. Failing to observe this restriction
- will result in deadlock.
-
7. If the updater uses call_rcu() or synchronize_rcu(), then the
corresponding readers must use rcu_read_lock() and
rcu_read_unlock(). If the updater uses call_rcu_bh() or
@@ -321,7 +335,7 @@ over a rather long period of time, but improvements are always welcome!
Similarly, disabling preemption is not an acceptable substitute
for rcu_read_lock(). Code that attempts to use preemption
disabling where it should be using rcu_read_lock() will break
- in real-time kernel builds.
+ in CONFIG_PREEMPT=y kernel builds.
If you want to wait for interrupt handlers, NMI handlers, and
code under the influence of preempt_disable(), you instead
@@ -356,23 +370,22 @@ over a rather long period of time, but improvements are always welcome!
not the case, a self-spawning RCU callback would prevent the
victim CPU from ever going offline.)
-14. SRCU (srcu_read_lock(), srcu_read_unlock(), srcu_dereference(),
- synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu())
- may only be invoked from process context. Unlike other forms of
- RCU, it -is- permissible to block in an SRCU read-side critical
- section (demarked by srcu_read_lock() and srcu_read_unlock()),
- hence the "SRCU": "sleepable RCU". Please note that if you
- don't need to sleep in read-side critical sections, you should be
- using RCU rather than SRCU, because RCU is almost always faster
- and easier to use than is SRCU.
-
- Also unlike other forms of RCU, explicit initialization
- and cleanup is required via init_srcu_struct() and
- cleanup_srcu_struct(). These are passed a "struct srcu_struct"
- that defines the scope of a given SRCU domain. Once initialized,
- the srcu_struct is passed to srcu_read_lock(), srcu_read_unlock()
- synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu().
- A given synchronize_srcu() waits only for SRCU read-side critical
+14. Unlike other forms of RCU, it -is- permissible to block in an
+ SRCU read-side critical section (demarked by srcu_read_lock()
+ and srcu_read_unlock()), hence the "SRCU": "sleepable RCU".
+ Please note that if you don't need to sleep in read-side critical
+ sections, you should be using RCU rather than SRCU, because RCU
+ is almost always faster and easier to use than is SRCU.
+
+ Also unlike other forms of RCU, explicit initialization and
+ cleanup is required either at build time via DEFINE_SRCU()
+ or DEFINE_STATIC_SRCU() or at runtime via init_srcu_struct()
+ and cleanup_srcu_struct(). These last two are passed a
+ "struct srcu_struct" that defines the scope of a given
+ SRCU domain. Once initialized, the srcu_struct is passed
+ to srcu_read_lock(), srcu_read_unlock() synchronize_srcu(),
+ synchronize_srcu_expedited(), and call_srcu(). A given
+ synchronize_srcu() waits only for SRCU read-side critical
sections governed by srcu_read_lock() and srcu_read_unlock()
calls that have been passed the same srcu_struct. This property
is what makes sleeping read-side critical sections tolerable --
@@ -390,10 +403,16 @@ over a rather long period of time, but improvements are always welcome!
Therefore, SRCU should be used in preference to rw_semaphore
only in extremely read-intensive situations, or in situations
requiring SRCU's read-side deadlock immunity or low read-side
- realtime latency.
+ realtime latency. You should also consider percpu_rw_semaphore
+ when you need lightweight readers.
- Note that, rcu_assign_pointer() relates to SRCU just as it does
- to other forms of RCU.
+ SRCU's expedited primitive (synchronize_srcu_expedited())
+ never sends IPIs to other CPUs, so it is easier on
+ real-time workloads than is synchronize_rcu_expedited(),
+ synchronize_rcu_bh_expedited() or synchronize_sched_expedited().
+
+ Note that rcu_dereference() and rcu_assign_pointer() relate to
+ SRCU just as they do to other forms of RCU.
15. The whole point of call_rcu(), synchronize_rcu(), and friends
is to wait until all pre-existing readers have finished before
@@ -435,3 +454,33 @@ over a rather long period of time, but improvements are always welcome!
These debugging aids can help you find problems that are
otherwise extremely difficult to spot.
+
+18. If you register a callback using call_rcu(), call_rcu_bh(),
+ call_rcu_sched(), or call_srcu(), and pass in a function defined
+ within a loadable module, then it in necessary to wait for
+ all pending callbacks to be invoked after the last invocation
+ and before unloading that module. Note that it is absolutely
+ -not- sufficient to wait for a grace period! The current (say)
+ synchronize_rcu() implementation waits only for all previous
+ callbacks registered on the CPU that synchronize_rcu() is running
+ on, but it is -not- guaranteed to wait for callbacks registered
+ on other CPUs.
+
+ You instead need to use one of the barrier functions:
+
+ o call_rcu() -> rcu_barrier()
+ o call_rcu_bh() -> rcu_barrier_bh()
+ o call_rcu_sched() -> rcu_barrier_sched()
+ o call_srcu() -> srcu_barrier()
+
+ However, these barrier functions are absolutely -not- guaranteed
+ to wait for a grace period. In fact, if there are no call_rcu()
+ callbacks waiting anywhere in the system, rcu_barrier() is within
+ its rights to return immediately.
+
+ So if you need to wait for both an RCU grace period and for
+ all pre-existing call_rcu() callbacks, you will need to execute
+ both rcu_barrier() and synchronize_rcu(), if necessary, using
+ something like workqueues to to execute them concurrently.
+
+ See rcubarrier.txt for more information.
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 745f429fda79..7d4ae110c2c9 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -76,15 +76,12 @@ o I hear that RCU is patented? What is with that?
Of these, one was allowed to lapse by the assignee, and the
others have been contributed to the Linux kernel under GPL.
There are now also LGPL implementations of user-level RCU
- available (http://lttng.org/?q=node/18).
+ available (http://liburcu.org/).
o I hear that RCU needs work in order to support realtime kernels?
- This work is largely completed. Realtime-friendly RCU can be
- enabled via the CONFIG_PREEMPT_RCU kernel configuration
- parameter. However, work is in progress for enabling priority
- boosting of preempted RCU read-side critical sections. This is
- needed if you have CPU-bound realtime threads.
+ Realtime-friendly RCU can be enabled via the CONFIG_PREEMPT_RCU
+ kernel configuration parameter.
o Where can I find more information on RCU?
diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt
index b2a613f16d74..1acb26b09b48 100644
--- a/Documentation/RCU/rcu_dereference.txt
+++ b/Documentation/RCU/rcu_dereference.txt
@@ -25,35 +25,35 @@ o You must use one of the rcu_dereference() family of primitives
for an example where the compiler can in fact deduce the exact
value of the pointer, and thus cause misordering.
+o You are only permitted to use rcu_dereference on pointer values.
+ The compiler simply knows too much about integral values to
+ trust it to carry dependencies through integer operations.
+ There are a very few exceptions, namely that you can temporarily
+ cast the pointer to uintptr_t in order to:
+
+ o Set bits and clear bits down in the must-be-zero low-order
+ bits of that pointer. This clearly means that the pointer
+ must have alignment constraints, for example, this does
+ -not- work in general for char* pointers.
+
+ o XOR bits to translate pointers, as is done in some
+ classic buddy-allocator algorithms.
+
+ It is important to cast the value back to pointer before
+ doing much of anything else with it.
+
o Avoid cancellation when using the "+" and "-" infix arithmetic
operators. For example, for a given variable "x", avoid
- "(x-x)". There are similar arithmetic pitfalls from other
- arithmetic operators, such as "(x*0)", "(x/(x+1))" or "(x%1)".
- The compiler is within its rights to substitute zero for all of
- these expressions, so that subsequent accesses no longer depend
- on the rcu_dereference(), again possibly resulting in bugs due
- to misordering.
+ "(x-(uintptr_t)x)" for char* pointers. The compiler is within its
+ rights to substitute zero for this sort of expression, so that
+ subsequent accesses no longer depend on the rcu_dereference(),
+ again possibly resulting in bugs due to misordering.
Of course, if "p" is a pointer from rcu_dereference(), and "a"
and "b" are integers that happen to be equal, the expression
"p+a-b" is safe because its value still necessarily depends on
the rcu_dereference(), thus maintaining proper ordering.
-o Avoid all-zero operands to the bitwise "&" operator, and
- similarly avoid all-ones operands to the bitwise "|" operator.
- If the compiler is able to deduce the value of such operands,
- it is within its rights to substitute the corresponding constant
- for the bitwise operation. Once again, this causes subsequent
- accesses to no longer depend on the rcu_dereference(), causing
- bugs due to misordering.
-
- Please note that single-bit operands to bitwise "&" can also
- be dangerous. At this point, the compiler knows that the
- resulting value can only take on one of two possible values.
- Therefore, a very small amount of additional information will
- allow the compiler to deduce the exact value, which again can
- result in misordering.
-
o If you are using RCU to protect JITed functions, so that the
"()" function-invocation operator is applied to a value obtained
(directly or indirectly) from rcu_dereference(), you may need to
@@ -61,25 +61,6 @@ o If you are using RCU to protect JITed functions, so that the
This issue arises on some systems when a newly JITed function is
using the same memory that was used by an earlier JITed function.
-o Do not use the results from the boolean "&&" and "||" when
- dereferencing. For example, the following (rather improbable)
- code is buggy:
-
- int *p;
- int *q;
-
- ...
-
- p = rcu_dereference(gp)
- q = &global_q;
- q += p != &oom_p1 && p != &oom_p2;
- r1 = *q; /* BUGGY!!! */
-
- The reason this is buggy is that "&&" and "||" are often compiled
- using branches. While weak-memory machines such as ARM or PowerPC
- do order stores after such branches, they can speculate loads,
- which can result in misordering bugs.
-
o Do not use the results from relational operators ("==", "!=",
">", ">=", "<", or "<=") when dereferencing. For example,
the following (quite strange) code is buggy:
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
index b10cfe711e68..5d7759071a3e 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.txt
@@ -263,6 +263,11 @@ Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes
are delayed for a full grace period? Couldn't this result in
rcu_barrier() returning prematurely?
+The current rcu_barrier() implementation is more complex, due to the need
+to avoid disturbing idle CPUs (especially on battery-powered systems)
+and the need to minimally disturb non-idle CPUs in real-time systems.
+However, the code above illustrates the concepts.
+
rcu_barrier() Summary
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 278f6a9383b6..55918b54808b 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -276,15 +276,17 @@ o "Free-Block Circulation": Shows the number of torture structures
somehow gets incremented farther than it should.
Different implementations of RCU can provide implementation-specific
-additional information. For example, SRCU provides the following
+additional information. For example, Tree SRCU provides the following
additional line:
- srcu-torture: per-CPU(idx=1): 0(0,1) 1(0,1) 2(0,0) 3(0,1)
+ srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6)
-This line shows the per-CPU counter state. The numbers in parentheses are
-the values of the "old" and "current" counters for the corresponding CPU.
-The "idx" value maps the "old" and "current" values to the underlying
-array, and is useful for debugging.
+This line shows the per-CPU counter state, in this case for Tree SRCU
+using a dynamically allocated srcu_struct (hence "srcud-" rather than
+"srcu-"). The numbers in parentheses are the values of the "old" and
+"current" counters for the corresponding CPU. The "idx" value maps the
+"old" and "current" values to the underlying array, and is useful for
+debugging. The final "T" entry contains the totals of the counters.
USAGE
@@ -304,3 +306,9 @@ checked for such errors. The "rmmod" command forces a "SUCCESS",
"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed. The first
two are self-explanatory, while the last indicates that while there
were no RCU failures, CPU-hotplug problems were detected.
+
+However, the tools/testing/selftests/rcutorture/bin/kvm.sh script
+provides better automation, including automatic failure analysis.
+It assumes a qemu/kvm-enabled platform, and runs guest OSes out of initrd.
+See tools/testing/selftests/rcutorture/doc/initrd.txt for instructions
+on setting up such an initrd.
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 8ed6c9f6133c..df62466da4e0 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -890,6 +890,8 @@ SRCU: Critical sections Grace period Barrier
srcu_read_lock_held
SRCU: Initialization/cleanup
+ DEFINE_SRCU
+ DEFINE_STATIC_SRCU
init_srcu_struct
cleanup_srcu_struct
@@ -913,7 +915,8 @@ a. Will readers need to block? If so, you need SRCU.
b. What about the -rt patchset? If readers would need to block
in an non-rt kernel, you need SRCU. If readers would block
in a -rt kernel, but not in a non-rt kernel, SRCU is not
- necessary.
+ necessary. (The -rt patchset turns spinlocks into sleeplocks,
+ hence this distinction.)
c. Do you need to treat NMI handlers, hardirq handlers,
and code segments with preemption disabled (whether
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index d76ab3907e2b..b2598cc9834c 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -138,6 +138,7 @@ parameter is applicable::
PPT Parallel port support is enabled.
PS2 Appropriate PS/2 support is enabled.
RAM RAM disk support is enabled.
+ RDT Intel Resource Director Technology.
S390 S390 architecture is enabled.
SCSI Appropriate SCSI support is enabled.
A lot of drivers have their options described inside
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index d9c171ce4190..6996b7727b85 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2233,6 +2233,17 @@
memory contents and reserves bad memory
regions that are detected.
+ mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control
+ Valid arguments: on, off
+ Default (depends on kernel configuration option):
+ on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y)
+ off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n)
+ mem_encrypt=on: Activate SME
+ mem_encrypt=off: Do not activate SME
+
+ Refer to Documentation/x86/amd-memory-encryption.txt
+ for details on when memory encryption can be activated.
+
mem_sleep_default= [SUSPEND] Default system suspend mode:
s2idle - Suspend-To-Idle
shallow - Power-On Suspend or equivalent (if supported)
@@ -2633,9 +2644,10 @@
In kernels built with CONFIG_NO_HZ_FULL=y, set
the specified list of CPUs whose tick will be stopped
whenever possible. The boot CPU will be forced outside
- the range to maintain the timekeeping.
- The CPUs in this range must also be included in the
- rcu_nocbs= set.
+ the range to maintain the timekeeping. Any CPUs
+ in this list will have their RCU callbacks offloaded,
+ just as if they had also been called out in the
+ rcu_nocbs= boot parameter.
noiotrap [SH] Disables trapped I/O port accesses.
@@ -2696,6 +2708,8 @@
nopat [X86] Disable PAT (page attribute table extension of
pagetables) support.
+ nopcid [X86-64] Disable the PCID cpu feature.
+
norandmaps Don't use address space randomization. Equivalent to
echo 0 > /proc/sys/kernel/randomize_va_space
@@ -3598,6 +3612,12 @@
Run specified binary instead of /init from the ramdisk,
used for early userspace startup. See initrd.
+ rdt= [HW,X86,RDT]
+ Turn on/off individual RDT features. List is:
+ cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, mba.
+ E.g. to turn on cmt and turn off mba use:
+ rdt=cmt,!mba
+
reboot= [KNL]
Format (x86 or x86_64):
[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
@@ -4375,6 +4395,10 @@
decrease the size and leave more room for directly
mapped kernel RAM.
+ vmcp_cma=nn[MG] [KNL,S390]
+ Sets the memory size reserved for contiguous memory
+ allocations for the vmcp device driver.
+
vmhalt= [KNL,S390] Perform z/VM CP command after system halt.
Format: <command>
diff --git a/Documentation/arm/firmware.txt b/Documentation/arm/firmware.txt
index da6713adac8a..7f175dbb427e 100644
--- a/Documentation/arm/firmware.txt
+++ b/Documentation/arm/firmware.txt
@@ -60,7 +60,7 @@ Example of using a firmware operation:
/* some platform code, e.g. SMP initialization */
- __raw_writel(virt_to_phys(exynos4_secondary_startup),
+ __raw_writel(__pa_symbol(exynos4_secondary_startup),
CPU1_BOOT_REG);
/* Call Exynos specific smc call */
diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt
new file mode 100644
index 000000000000..5550bfdcce5f
--- /dev/null
+++ b/Documentation/atomic_bitops.txt
@@ -0,0 +1,66 @@
+
+On atomic bitops.
+
+
+While our bitmap_{}() functions are non-atomic, we have a number of operations
+operating on single bits in a bitmap that are atomic.
+
+
+API
+---
+
+The single bit operations are:
+
+Non-RMW ops:
+
+ test_bit()
+
+RMW atomic operations without return value:
+
+ {set,clear,change}_bit()
+ clear_bit_unlock()
+
+RMW atomic operations with return value:
+
+ test_and_{set,clear,change}_bit()
+ test_and_set_bit_lock()
+
+Barriers:
+
+ smp_mb__{before,after}_atomic()
+
+
+All RMW atomic operations have a '__' prefixed variant which is non-atomic.
+
+
+SEMANTICS
+---------
+
+Non-atomic ops:
+
+In particular __clear_bit_unlock() suffers the same issue as atomic_set(),
+which is why the generic version maps to clear_bit_unlock(), see atomic_t.txt.
+
+
+RMW ops:
+
+The test_and_{}_bit() operations return the original value of the bit.
+
+
+ORDERING
+--------
+
+Like with atomic_t, the rule of thumb is:
+
+ - non-RMW operations are unordered;
+
+ - RMW operations that have no return value are unordered;
+
+ - RMW operations that have a return value are fully ordered.
+
+Except for test_and_set_bit_lock() which has ACQUIRE semantics and
+clear_bit_unlock() which has RELEASE semantics.
+
+Since a platform only has a single means of achieving atomic operations
+the same barriers as for atomic_t are used, see atomic_t.txt.
+
diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
new file mode 100644
index 000000000000..913396ac5824
--- /dev/null
+++ b/Documentation/atomic_t.txt
@@ -0,0 +1,242 @@
+
+On atomic types (atomic_t atomic64_t and atomic_long_t).
+
+The atomic type provides an interface to the architecture's means of atomic
+RMW operations between CPUs (atomic operations on MMIO are not supported and
+can lead to fatal traps on some platforms).
+
+API
+---
+
+The 'full' API consists of (atomic64_ and atomic_long_ prefixes omitted for
+brevity):
+
+Non-RMW ops:
+
+ atomic_read(), atomic_set()
+ atomic_read_acquire(), atomic_set_release()
+
+
+RMW atomic operations:
+
+Arithmetic:
+
+ atomic_{add,sub,inc,dec}()
+ atomic_{add,sub,inc,dec}_return{,_relaxed,_acquire,_release}()
+ atomic_fetch_{add,sub,inc,dec}{,_relaxed,_acquire,_release}()
+
+
+Bitwise:
+
+ atomic_{and,or,xor,andnot}()
+ atomic_fetch_{and,or,xor,andnot}{,_relaxed,_acquire,_release}()
+
+
+Swap:
+
+ atomic_xchg{,_relaxed,_acquire,_release}()
+ atomic_cmpxchg{,_relaxed,_acquire,_release}()
+ atomic_try_cmpxchg{,_relaxed,_acquire,_release}()
+
+
+Reference count (but please see refcount_t):
+
+ atomic_add_unless(), atomic_inc_not_zero()
+ atomic_sub_and_test(), atomic_dec_and_test()
+
+
+Misc:
+
+ atomic_inc_and_test(), atomic_add_negative()
+ atomic_dec_unless_positive(), atomic_inc_unless_negative()
+
+
+Barriers:
+
+ smp_mb__{before,after}_atomic()
+
+
+
+SEMANTICS
+---------
+
+Non-RMW ops:
+
+The non-RMW ops are (typically) regular LOADs and STOREs and are canonically
+implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and
+smp_store_release() respectively.
+
+The one detail to this is that atomic_set{}() should be observable to the RMW
+ops. That is:
+
+ C atomic-set
+
+ {
+ atomic_set(v, 1);
+ }
+
+ P1(atomic_t *v)
+ {
+ atomic_add_unless(v, 1, 0);
+ }
+
+ P2(atomic_t *v)
+ {
+ atomic_set(v, 0);
+ }
+
+ exists
+ (v=2)
+
+In this case we would expect the atomic_set() from CPU1 to either happen
+before the atomic_add_unless(), in which case that latter one would no-op, or
+_after_ in which case we'd overwrite its result. In no case is "2" a valid
+outcome.
+
+This is typically true on 'normal' platforms, where a regular competing STORE
+will invalidate a LL/SC or fail a CMPXCHG.
+
+The obvious case where this is not so is when we need to implement atomic ops
+with a lock:
+
+ CPU0 CPU1
+
+ atomic_add_unless(v, 1, 0);
+ lock();
+ ret = READ_ONCE(v->counter); // == 1
+ atomic_set(v, 0);
+ if (ret != u) WRITE_ONCE(v->counter, 0);
+ WRITE_ONCE(v->counter, ret + 1);
+ unlock();
+
+the typical solution is to then implement atomic_set{}() with atomic_xchg().
+
+
+RMW ops:
+
+These come in various forms:
+
+ - plain operations without return value: atomic_{}()
+
+ - operations which return the modified value: atomic_{}_return()
+
+ these are limited to the arithmetic operations because those are
+ reversible. Bitops are irreversible and therefore the modified value
+ is of dubious utility.
+
+ - operations which return the original value: atomic_fetch_{}()
+
+ - swap operations: xchg(), cmpxchg() and try_cmpxchg()
+
+ - misc; the special purpose operations that are commonly used and would,
+ given the interface, normally be implemented using (try_)cmpxchg loops but
+ are time critical and can, (typically) on LL/SC architectures, be more
+ efficiently implemented.
+
+All these operations are SMP atomic; that is, the operations (for a single
+atomic variable) can be fully ordered and no intermediate state is lost or
+visible.
+
+
+ORDERING (go read memory-barriers.txt first)
+--------
+
+The rule of thumb:
+
+ - non-RMW operations are unordered;
+
+ - RMW operations that have no return value are unordered;
+
+ - RMW operations that have a return value are fully ordered;
+
+ - RMW operations that are conditional are unordered on FAILURE,
+ otherwise the above rules apply.
+
+Except of course when an operation has an explicit ordering like:
+
+ {}_relaxed: unordered
+ {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE
+ {}_release: the W of the RMW (or atomic_set) is a RELEASE
+
+Where 'unordered' is against other memory locations. Address dependencies are
+not defeated.
+
+Fully ordered primitives are ordered against everything prior and everything
+subsequent. Therefore a fully ordered primitive is like having an smp_mb()
+before and an smp_mb() after the primitive.
+
+
+The barriers:
+
+ smp_mb__{before,after}_atomic()
+
+only apply to the RMW ops and can be used to augment/upgrade the ordering
+inherent to the used atomic op. These barriers provide a full smp_mb().
+
+These helper barriers exist because architectures have varying implicit
+ordering on their SMP atomic primitives. For example our TSO architectures
+provide full ordered atomics and these barriers are no-ops.
+
+Thus:
+
+ atomic_fetch_add();
+
+is equivalent to:
+
+ smp_mb__before_atomic();
+ atomic_fetch_add_relaxed();
+ smp_mb__after_atomic();
+
+However the atomic_fetch_add() might be implemented more efficiently.
+
+Further, while something like:
+
+ smp_mb__before_atomic();
+ atomic_dec(&X);
+
+is a 'typical' RELEASE pattern, the barrier is strictly stronger than
+a RELEASE. Similarly for something like:
+
+ atomic_inc(&X);
+ smp_mb__after_atomic();
+
+is an ACQUIRE pattern (though very much not typical), but again the barrier is
+strictly stronger than ACQUIRE. As illustrated:
+
+ C strong-acquire
+
+ {
+ }
+
+ P1(int *x, atomic_t *y)
+ {
+ r0 = READ_ONCE(*x);
+ smp_rmb();
+ r1 = atomic_read(y);
+ }
+
+ P2(int *x, atomic_t *y)
+ {
+ atomic_inc(y);
+ smp_mb__after_atomic();
+ WRITE_ONCE(*x, 1);
+ }
+
+ exists
+ (r0=1 /\ r1=0)
+
+This should not happen; but a hypothetical atomic_inc_acquire() --
+(void)atomic_fetch_inc_acquire() for instance -- would allow the outcome,
+since then:
+
+ P1 P2
+
+ t = LL.acq *y (0)
+ t++;
+ *x = 1;
+ r0 = *x (1)
+ RMB
+ r1 = *y (0)
+ SC *y, t;
+
+is allowed.
diff --git a/Documentation/conf.py b/Documentation/conf.py
index 71b032bb44fd..f9054ab60cb1 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -29,7 +29,7 @@ from load_config import loadConfig
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
-needs_sphinx = '1.2'
+needs_sphinx = '1.3'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
@@ -344,8 +344,8 @@ if major == 1 and minor > 3:
if major == 1 and minor <= 4:
latex_elements['preamble'] += '\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}'
elif major == 1 and (minor > 5 or (minor == 5 and patch >= 3)):
- latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=0.5in'
-
+ latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=1in'
+ latex_elements['preamble'] += '\\fvset{fontsize=auto}\n'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
diff --git a/Documentation/core-api/genalloc.rst b/Documentation/core-api/genalloc.rst
new file mode 100644
index 000000000000..6b38a39fab24
--- /dev/null
+++ b/Documentation/core-api/genalloc.rst
@@ -0,0 +1,144 @@
+The genalloc/genpool subsystem
+==============================
+
+There are a number of memory-allocation subsystems in the kernel, each
+aimed at a specific need. Sometimes, however, a kernel developer needs to
+implement a new allocator for a specific range of special-purpose memory;
+often that memory is located on a device somewhere. The author of the
+driver for that device can certainly write a little allocator to get the
+job done, but that is the way to fill the kernel with dozens of poorly
+tested allocators. Back in 2005, Jes Sorensen lifted one of those
+allocators from the sym53c8xx_2 driver and posted_ it as a generic module
+for the creation of ad hoc memory allocators. This code was merged
+for the 2.6.13 release; it has been modified considerably since then.
+
+.. _posted: https://lwn.net/Articles/125842/
+
+Code using this allocator should include <linux/genalloc.h>. The action
+begins with the creation of a pool using one of:
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_create
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: devm_gen_pool_create
+
+A call to :c:func:`gen_pool_create` will create a pool. The granularity of
+allocations is set with min_alloc_order; it is a log-base-2 number like
+those used by the page allocator, but it refers to bytes rather than pages.
+So, if min_alloc_order is passed as 3, then all allocations will be a
+multiple of eight bytes. Increasing min_alloc_order decreases the memory
+required to track the memory in the pool. The nid parameter specifies
+which NUMA node should be used for the allocation of the housekeeping
+structures; it can be -1 if the caller doesn't care.
+
+The "managed" interface :c:func:`devm_gen_pool_create` ties the pool to a
+specific device. Among other things, it will automatically clean up the
+pool when the given device is destroyed.
+
+A pool is shut down with:
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_destroy
+
+It's worth noting that, if there are still allocations outstanding from the
+given pool, this function will take the rather extreme step of invoking
+BUG(), crashing the entire system. You have been warned.
+
+A freshly created pool has no memory to allocate. It is fairly useless in
+that state, so one of the first orders of business is usually to add memory
+to the pool. That can be done with one of:
+
+.. kernel-doc:: include/linux/genalloc.h
+ :functions: gen_pool_add
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_add_virt
+
+A call to :c:func:`gen_pool_add` will place the size bytes of memory
+starting at addr (in the kernel's virtual address space) into the given
+pool, once again using nid as the node ID for ancillary memory allocations.
+The :c:func:`gen_pool_add_virt` variant associates an explicit physical
+address with the memory; this is only necessary if the pool will be used
+for DMA allocations.
+
+The functions for allocating memory from the pool (and putting it back)
+are:
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_alloc
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_dma_alloc
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_free
+
+As one would expect, :c:func:`gen_pool_alloc` will allocate size< bytes
+from the given pool. The :c:func:`gen_pool_dma_alloc` variant allocates
+memory for use with DMA operations, returning the associated physical
+address in the space pointed to by dma. This will only work if the memory
+was added with :c:func:`gen_pool_add_virt`. Note that this function
+departs from the usual genpool pattern of using unsigned long values to
+represent kernel addresses; it returns a void * instead.
+
+That all seems relatively simple; indeed, some developers clearly found it
+to be too simple. After all, the interface above provides no control over
+how the allocation functions choose which specific piece of memory to
+return. If that sort of control is needed, the following functions will be
+of interest:
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_alloc_algo
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_set_algo
+
+Allocations with :c:func:`gen_pool_alloc_algo` specify an algorithm to be
+used to choose the memory to be allocated; the default algorithm can be set
+with :c:func:`gen_pool_set_algo`. The data value is passed to the
+algorithm; most ignore it, but it is occasionally needed. One can,
+naturally, write a special-purpose algorithm, but there is a fair set
+already available:
+
+- gen_pool_first_fit is a simple first-fit allocator; this is the default
+ algorithm if none other has been specified.
+
+- gen_pool_first_fit_align forces the allocation to have a specific
+ alignment (passed via data in a genpool_data_align structure).
+
+- gen_pool_first_fit_order_align aligns the allocation to the order of the
+ size. A 60-byte allocation will thus be 64-byte aligned, for example.
+
+- gen_pool_best_fit, as one would expect, is a simple best-fit allocator.
+
+- gen_pool_fixed_alloc allocates at a specific offset (passed in a
+ genpool_data_fixed structure via the data parameter) within the pool.
+ If the indicated memory is not available the allocation fails.
+
+There is a handful of other functions, mostly for purposes like querying
+the space available in the pool or iterating through chunks of memory.
+Most users, however, should not need much beyond what has been described
+above. With luck, wider awareness of this module will help to prevent the
+writing of special-purpose memory allocators in the future.
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_virt_to_phys
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_for_each_chunk
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: addr_in_gen_pool
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_avail
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_size
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: gen_pool_get
+
+.. kernel-doc:: lib/genalloc.c
+ :functions: of_gen_pool_get
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 0606be3a3111..d5bbe035316d 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -20,6 +20,7 @@ Core utilities
genericirq
flexible-arrays
librs
+ genalloc
Interfaces for kernel debugging
===============================
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 17b00914c6ab..8282099e0cbf 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -344,3 +344,52 @@ codecs, and devices with strict requirements for interface clocking.
.. kernel-doc:: include/linux/clk.h
:internal:
+
+Synchronization Primitives
+==========================
+
+Read-Copy Update (RCU)
+----------------------
+
+.. kernel-doc:: include/linux/rcupdate.h
+ :external:
+
+.. kernel-doc:: include/linux/rcupdate_wait.h
+ :external:
+
+.. kernel-doc:: include/linux/rcutree.h
+ :external:
+
+.. kernel-doc:: kernel/rcu/tree.c
+ :external:
+
+.. kernel-doc:: kernel/rcu/tree_plugin.h
+ :external:
+
+.. kernel-doc:: kernel/rcu/tree_exp.h
+ :external:
+
+.. kernel-doc:: kernel/rcu/update.c
+ :external:
+
+.. kernel-doc:: include/linux/srcu.h
+ :external:
+
+.. kernel-doc:: kernel/rcu/srcutree.c
+ :external:
+
+.. kernel-doc:: include/linux/rculist_bl.h
+ :external:
+
+.. kernel-doc:: include/linux/rculist.h
+ :external:
+
+.. kernel-doc:: include/linux/rculist_nulls.h
+ :external:
+
+.. kernel-doc:: include/linux/rcu_sync.h
+ :external:
+
+.. kernel-doc:: kernel/rcu/sync.c
+ :external:
+
diff --git a/Documentation/dev-tools/gdb-kernel-debugging.rst b/Documentation/dev-tools/gdb-kernel-debugging.rst
index 5e93c9bc6619..19df79286f00 100644
--- a/Documentation/dev-tools/gdb-kernel-debugging.rst
+++ b/Documentation/dev-tools/gdb-kernel-debugging.rst
@@ -31,11 +31,13 @@ Setup
CONFIG_DEBUG_INFO_REDUCED off. If your architecture supports
CONFIG_FRAME_POINTER, keep it enabled.
-- Install that kernel on the guest.
+- Install that kernel on the guest, turn off KASLR if necessary by adding
+ "nokaslr" to the kernel command line.
Alternatively, QEMU allows to boot the kernel directly using -kernel,
-append, -initrd command line switches. This is generally only useful if
you do not depend on modules. See QEMU documentation for more details on
- this mode.
+ this mode. In this case, you should build the kernel with
+ CONFIG_RANDOMIZE_BASE disabled if the architecture supports KASLR.
- Enable the gdb stub of QEMU/KVM, either
diff --git a/Documentation/dev-tools/kgdb.rst b/Documentation/dev-tools/kgdb.rst
index 75273203a35a..d38be58f872a 100644
--- a/Documentation/dev-tools/kgdb.rst
+++ b/Documentation/dev-tools/kgdb.rst
@@ -348,6 +348,15 @@ default behavior is always set to 0.
- ``echo 1 > /sys/module/debug_core/parameters/kgdbreboot``
- Enter the debugger on reboot notify.
+Kernel parameter: ``nokaslr``
+-----------------------------
+
+If the architecture that you are using enable KASLR by default,
+you should consider turning it off. KASLR randomizes the
+virtual address where the kernel image is mapped and confuse
+gdb which resolve kernel symbol address from symbol table
+of vmlinux.
+
Using kdb
=========
@@ -358,7 +367,7 @@ This is a quick example of how to use kdb.
1. Configure kgdboc at boot using kernel parameters::
- console=ttyS0,115200 kgdboc=ttyS0,115200
+ console=ttyS0,115200 kgdboc=ttyS0,115200 nokaslr
OR
diff --git a/Documentation/devicetree/bindings/display/bridge/dw_mipi_dsi.txt b/Documentation/devicetree/bindings/display/bridge/dw_mipi_dsi.txt
new file mode 100644
index 000000000000..b13adf30b8d3
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/bridge/dw_mipi_dsi.txt
@@ -0,0 +1,32 @@
+Synopsys DesignWare MIPI DSI host controller
+============================================
+
+This document defines device tree properties for the Synopsys DesignWare MIPI
+DSI host controller. It doesn't constitue a device tree binding specification
+by itself but is meant to be referenced by platform-specific device tree
+bindings.
+
+When referenced from platform device tree bindings the properties defined in
+this document are defined as follows. The platform device tree bindings are
+responsible for defining whether each optional property is used or not.
+
+- reg: Memory mapped base address and length of the DesignWare MIPI DSI
+ host controller registers. (mandatory)
+
+- clocks: References to all the clocks specified in the clock-names property
+ as specified in [1]. (mandatory)
+
+- clock-names:
+ - "pclk" is the peripheral clock for either AHB and APB. (mandatory)
+ - "px_clk" is the pixel clock for the DPI/RGB input. (optional)
+
+- resets: References to all the resets specified in the reset-names property
+ as specified in [2]. (optional)
+
+- reset-names: string reset name, must be "apb" if used. (optional)
+
+- panel or bridge node: see [3]. (mandatory)
+
+[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
+[2] Documentation/devicetree/bindings/reset/reset.txt
+[3] Documentation/devicetree/bindings/display/mipi-dsi-bus.txt
diff --git a/Documentation/devicetree/bindings/display/exynos/exynos5433-decon.txt b/Documentation/devicetree/bindings/display/exynos/exynos5433-decon.txt
index 549c538b38a5..fc2588292a68 100644
--- a/Documentation/devicetree/bindings/display/exynos/exynos5433-decon.txt
+++ b/Documentation/devicetree/bindings/display/exynos/exynos5433-decon.txt
@@ -25,12 +25,6 @@ Required properties:
size-cells must 1 and 0, respectively.
- port: contains an endpoint node which is connected to the endpoint in the mic
node. The reg value muset be 0.
-- i80-if-timings: specify whether the panel which is connected to decon uses
- i80 lcd interface or mipi video interface. This node contains
- no timing information as that of fimd does. Because there is
- no register in decon to specify i80 interface timing value,
- it is not needed, but make it remain to use same kind of node
- in fimd and exynos7 decon.
Example:
SoC specific DT entry:
@@ -59,9 +53,3 @@ decon: decon@13800000 {
};
};
};
-
-Board specific DT entry:
-&decon {
- i80-if-timings {
- };
-};
diff --git a/Documentation/devicetree/bindings/display/repaper.txt b/Documentation/devicetree/bindings/display/repaper.txt
new file mode 100644
index 000000000000..f5f9f9cf6a25
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/repaper.txt
@@ -0,0 +1,52 @@
+Pervasive Displays RePaper branded e-ink displays
+
+Required properties:
+- compatible: "pervasive,e1144cs021" for 1.44" display
+ "pervasive,e1190cs021" for 1.9" display
+ "pervasive,e2200cs021" for 2.0" display
+ "pervasive,e2271cs021" for 2.7" display
+
+- panel-on-gpios: Timing controller power control
+- discharge-gpios: Discharge control
+- reset-gpios: RESET pin
+- busy-gpios: BUSY pin
+
+Required property for e2271cs021:
+- border-gpios: Border control
+
+The node for this driver must be a child node of a SPI controller, hence
+all mandatory properties described in ../spi/spi-bus.txt must be specified.
+
+Optional property:
+- pervasive,thermal-zone: name of thermometer's thermal zone
+
+Example:
+
+ display_temp: lm75@48 {
+ compatible = "lm75b";
+ reg = <0x48>;
+ #thermal-sensor-cells = <0>;
+ };
+
+ thermal-zones {
+ display {
+ polling-delay-passive = <0>;
+ polling-delay = <0>;
+ thermal-sensors = <&display_temp>;
+ };
+ };
+
+ papirus27@0{
+ compatible = "pervasive,e2271cs021";
+ reg = <0>;
+
+ spi-max-frequency = <8000000>;
+
+ panel-on-gpios = <&gpio 23 0>;
+ border-gpios = <&gpio 14 0>;
+ discharge-gpios = <&gpio 15 0>;
+ reset-gpios = <&gpio 24 0>;
+ busy-gpios = <&gpio 25 0>;
+
+ pervasive,thermal-zone = "display";
+ };
diff --git a/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt b/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt
index 046076c6b277..fad8b7619647 100644
--- a/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt
+++ b/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt
@@ -11,7 +11,9 @@ following device-specific properties.
Required properties:
-- compatible: Shall contain "rockchip,rk3288-dw-hdmi".
+- compatible: should be one of the following:
+ "rockchip,rk3288-dw-hdmi"
+ "rockchip,rk3399-dw-hdmi"
- reg: See dw_hdmi.txt.
- reg-io-width: See dw_hdmi.txt. Shall be 4.
- interrupts: HDMI interrupt number
@@ -30,7 +32,8 @@ Optional properties
I2C master controller.
- clock-names: See dw_hdmi.txt. The "cec" clock is optional.
- clock-names: May contain "cec" as defined in dw_hdmi.txt.
-
+- clock-names: May contain "grf", power for grf io.
+- clock-names: May contain "vpll", external clock for some hdmi phy.
Example:
diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt b/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt
index 9eb3f0a2a078..5d835d9c1ba8 100644
--- a/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt
+++ b/Documentation/devicetree/bindings/display/rockchip/rockchip-vop.txt
@@ -8,8 +8,12 @@ Required properties:
- compatible: value should be one of the following
"rockchip,rk3036-vop";
"rockchip,rk3288-vop";
+ "rockchip,rk3368-vop";
+ "rockchip,rk3366-vop";
"rockchip,rk3399-vop-big";
"rockchip,rk3399-vop-lit";
+ "rockchip,rk3228-vop";
+ "rockchip,rk3328-vop";
- interrupts: should contain a list of all VOP IP block interrupts in the
order: VSYNC, LCD_SYSTEM. The interrupt specifier
diff --git a/Documentation/devicetree/bindings/display/sitronix,st7586.txt b/Documentation/devicetree/bindings/display/sitronix,st7586.txt
new file mode 100644
index 000000000000..1d0dad1210d3
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/sitronix,st7586.txt
@@ -0,0 +1,22 @@
+Sitronix ST7586 display panel
+
+Required properties:
+- compatible: "lego,ev3-lcd".
+- a0-gpios: The A0 signal (since this binding is for serial mode, this is
+ the pin labeled D1 on the controller, not the pin labeled A0)
+- reset-gpios: Reset pin
+
+The node for this driver must be a child node of a SPI controller, hence
+all mandatory properties described in ../spi/spi-bus.txt must be specified.
+
+Optional properties:
+- rotation: panel rotation in degrees counter clockwise (0,90,180,270)
+
+Example:
+ display@0{
+ compatible = "lego,ev3-lcd";
+ reg = <0>;
+ spi-max-frequency = <10000000>;
+ a0-gpios = <&gpio 43 GPIO_ACTIVE_HIGH>;
+ reset-gpios = <&gpio 80 GPIO_ACTIVE_HIGH>;
+ };
diff --git a/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt b/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt
index 8e1476941c0f..74b5ac7b26d6 100644
--- a/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt
+++ b/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt
@@ -1,7 +1,6 @@
* STMicroelectronics STM32 lcd-tft display controller
- ltdc: lcd-tft display controller host
- must be a sub-node of st-display-subsystem
Required properties:
- compatible: "st,stm32-ltdc"
- reg: Physical base address of the IP registers and length of memory mapped region.
@@ -13,8 +12,40 @@
Required nodes:
- Video port for RGB output.
-Example:
+* STMicroelectronics STM32 DSI controller specific extensions to Synopsys
+ DesignWare MIPI DSI host controller
+The STMicroelectronics STM32 DSI controller uses the Synopsys DesignWare MIPI
+DSI host controller. For all mandatory properties & nodes, please refer
+to the related documentation in [5].
+
+Mandatory properties specific to STM32 DSI:
+- #address-cells: Should be <1>.
+- #size-cells: Should be <0>.
+- compatible: "st,stm32-dsi".
+- clock-names:
+ - phy pll reference clock string name, must be "ref".
+- resets: see [5].
+- reset-names: see [5].
+
+Mandatory nodes specific to STM32 DSI:
+- ports: A node containing DSI input & output port nodes with endpoint
+ definitions as documented in [3] & [4].
+ - port@0: DSI input port node, connected to the ltdc rgb output port.
+ - port@1: DSI output port node, connected to a panel or a bridge input port.
+- panel or bridge node: A node containing the panel or bridge description as
+ documented in [6].
+ - port: panel or bridge port node, connected to the DSI output port (port@1).
+
+Note: You can find more documentation in the following references
+[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
+[2] Documentation/devicetree/bindings/reset/reset.txt
+[3] Documentation/devicetree/bindings/media/video-interfaces.txt
+[4] Documentation/devicetree/bindings/graph.txt
+[5] Documentation/devicetree/bindings/display/bridge/dw_mipi_dsi.txt
+[6] Documentation/devicetree/bindings/display/mipi-dsi-bus.txt
+
+Example 1: RGB panel
/ {
...
soc {
@@ -34,3 +65,73 @@ Example:
};
};
};
+
+Example 2: DSI panel
+
+/ {
+ ...
+ soc {
+ ...
+ ltdc: display-controller@40016800 {
+ compatible = "st,stm32-ltdc";
+ reg = <0x40016800 0x200>;
+ interrupts = <88>, <89>;
+ resets = <&rcc STM32F4_APB2_RESET(LTDC)>;
+ clocks = <&rcc 1 CLK_LCD>;
+ clock-names = "lcd";
+
+ port {
+ ltdc_out_dsi: endpoint {
+ remote-endpoint = <&dsi_in>;
+ };
+ };
+ };
+
+
+ dsi: dsi@40016c00 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "st,stm32-dsi";
+ reg = <0x40016c00 0x800>;
+ clocks = <&rcc 1 CLK_F469_DSI>, <&clk_hse>;
+ clock-names = "ref", "pclk";
+ resets = <&rcc STM32F4_APB2_RESET(DSI)>;
+ reset-names = "apb";
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ dsi_in: endpoint {
+ remote-endpoint = <&ltdc_out_dsi>;
+ };
+ };
+
+ port@1 {
+ reg = <1>;
+ dsi_out: endpoint {
+ remote-endpoint = <&dsi_in_panel>;
+ };
+ };
+
+ };
+
+ panel-dsi@0 {
+ reg = <0>; /* dsi virtual channel (0..3) */
+ compatible = ...;
+ enable-gpios = ...;
+
+ port {
+ dsi_in_panel: endpoint {
+ remote-endpoint = <&dsi_out>;
+ };
+ };
+
+ };
+
+ };
+
+ };
+};
diff --git a/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt b/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt
index b83e6018041d..2ee6ff0ef98e 100644
--- a/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt
+++ b/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt
@@ -4,15 +4,33 @@ Allwinner A10 Display Pipeline
The Allwinner A10 Display pipeline is composed of several components
that are going to be documented below:
-For the input port of all components up to the TCON in the display
-pipeline, if there are multiple components, the local endpoint IDs
-must correspond to the index of the upstream block. For example, if
-the remote endpoint is Frontend 1, then the local endpoint ID must
-be 1.
-
-Conversely, for the output ports of the same group, the remote endpoint
-ID must be the index of the local hardware block. If the local backend
-is backend 1, then the remote endpoint ID must be 1.
+For all connections between components up to the TCONs in the display
+pipeline, when there are multiple components of the same type at the
+same depth, the local endpoint ID must be the same as the remote
+component's index. For example, if the remote endpoint is Frontend 1,
+then the local endpoint ID must be 1.
+
+ Frontend 0 [0] ------- [0] Backend 0 [0] ------- [0] TCON 0
+ [1] -- -- [1] [1] -- -- [1]
+ \ / \ /
+ X X
+ / \ / \
+ [0] -- -- [0] [0] -- -- [0]
+ Frontend 1 [1] ------- [1] Backend 1 [1] ------- [1] TCON 1
+
+For a two pipeline system such as the one depicted above, the lines
+represent the connections between the components, while the numbers
+within the square brackets corresponds to the ID of the local endpoint.
+
+The same rule also applies to DE 2.0 mixer-TCON connections:
+
+ Mixer 0 [0] ----------- [0] TCON 0
+ [1] ---- ---- [1]
+ \ /
+ X
+ / \
+ [0] ---- ---- [0]
+ Mixer 1 [1] ----------- [1] TCON 1
HDMI Encoder
------------
diff --git a/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt b/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt
index cf4460564adb..367c8203213b 100644
--- a/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt
+++ b/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt
@@ -11,6 +11,8 @@ Required properties for pwm-tacho node:
- #size-cells : should be 1.
+- #cooling-cells: should be 2.
+
- reg : address and length of the register set for the device.
- pinctrl-names : a pinctrl state named "default" must be defined.
@@ -28,12 +30,17 @@ fan subnode format:
Under fan subnode there can upto 8 child nodes, with each child node
representing a fan. If there are 8 fans each fan can have one PWM port and
one/two Fan tach inputs.
+For PWM port can be configured cooling-levels to create cooling device.
+Cooling device could be bound to a thermal zone for the thermal control.
Required properties for each child node:
- reg : should specify PWM source port.
integer value in the range 0 to 7 with 0 indicating PWM port A and
7 indicating PWM port H.
+- cooling-levels: PWM duty cycle values in a range from 0 to 255
+ which correspond to thermal cooling states.
+
- aspeed,fan-tach-ch : should specify the Fan tach input channel.
integer value in the range 0 through 15, with 0 indicating
Fan tach channel 0 and 15 indicating Fan tach channel 15.
@@ -50,6 +57,7 @@ pwm_tacho_fixed_clk: fixedclk {
pwm_tacho: pwmtachocontroller@1e786000 {
#address-cells = <1>;
#size-cells = <1>;
+ #cooling-cells = <2>;
reg = <0x1E786000 0x1000>;
compatible = "aspeed,ast2500-pwm-tacho";
clocks = <&pwm_tacho_fixed_clk>;
@@ -58,6 +66,7 @@ pwm_tacho: pwmtachocontroller@1e786000 {
fan@0 {
reg = <0x00>;
+ cooling-levels = /bits/ 8 <125 151 177 203 229 255>;
aspeed,fan-tach-ch = /bits/ 8 <0x00>;
};
diff --git a/Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt b/Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt
new file mode 100644
index 000000000000..f68a0a68fc52
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ibm,cffps1.txt
@@ -0,0 +1,21 @@
+Device-tree bindings for IBM Common Form Factor Power Supply Version 1
+----------------------------------------------------------------------
+
+Required properties:
+ - compatible = "ibm,cffps1";
+ - reg = < I2C bus address >; : Address of the power supply on the
+ I2C bus.
+
+Example:
+
+ i2c-bus@100 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ #interrupt-cells = <1>;
+ < more properties >
+
+ power-supply@68 {
+ compatible = "ibm,cffps1";
+ reg = <0x68>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt b/Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt
new file mode 100644
index 000000000000..33fd00a987c7
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/ltq-cputemp.txt
@@ -0,0 +1,10 @@
+Lantiq cpu temperatur sensor
+
+Requires node properties:
+- compatible value :
+ "lantiq,cputemp"
+
+Example:
+ cputemp@0 {
+ compatible = "lantiq,cputemp";
+ };
diff --git a/Documentation/devicetree/bindings/interrupt-controller/fsl,ls-scfg-msi.txt b/Documentation/devicetree/bindings/interrupt-controller/fsl,ls-scfg-msi.txt
index 9e389493203f..49ccabbfa6f3 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/fsl,ls-scfg-msi.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/fsl,ls-scfg-msi.txt
@@ -4,8 +4,10 @@ Required properties:
- compatible: should be "fsl,<soc-name>-msi" to identify
Layerscape PCIe MSI controller block such as:
- "fsl,1s1021a-msi"
- "fsl,1s1043a-msi"
+ "fsl,ls1021a-msi"
+ "fsl,ls1043a-msi"
+ "fsl,ls1046a-msi"
+ "fsl,ls1043a-v1.1-msi"
- msi-controller: indicates that this is a PCIe MSI controller node
- reg: physical base address of the controller and length of memory mapped.
- interrupts: an interrupt to the parent interrupt controller.
@@ -23,7 +25,7 @@ MSI controller node
Examples:
msi1: msi-controller@1571000 {
- compatible = "fsl,1s1043a-msi";
+ compatible = "fsl,ls1043a-msi";
reg = <0x0 0x1571000 0x0 0x8>,
msi-controller;
interrupts = <0 116 0x4>;
diff --git a/Documentation/devicetree/bindings/interrupt-controller/socionext,uniphier-aidet.txt b/Documentation/devicetree/bindings/interrupt-controller/socionext,uniphier-aidet.txt
new file mode 100644
index 000000000000..48e71d3ac2ad
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/socionext,uniphier-aidet.txt
@@ -0,0 +1,32 @@
+UniPhier AIDET
+
+UniPhier AIDET (ARM Interrupt Detector) is an add-on block for ARM GIC (Generic
+Interrupt Controller). GIC itself can handle only high level and rising edge
+interrupts. The AIDET provides logic inverter to support low level and falling
+edge interrupts.
+
+Required properties:
+- compatible: Should be one of the following:
+ "socionext,uniphier-ld4-aidet" - for LD4 SoC
+ "socionext,uniphier-pro4-aidet" - for Pro4 SoC
+ "socionext,uniphier-sld8-aidet" - for sLD8 SoC
+ "socionext,uniphier-pro5-aidet" - for Pro5 SoC
+ "socionext,uniphier-pxs2-aidet" - for PXs2/LD6b SoC
+ "socionext,uniphier-ld11-aidet" - for LD11 SoC
+ "socionext,uniphier-ld20-aidet" - for LD20 SoC
+ "socionext,uniphier-pxs3-aidet" - for PXs3 SoC
+- reg: Specifies offset and length of the register set for the device.
+- interrupt-controller: Identifies the node as an interrupt controller
+- #interrupt-cells : Specifies the number of cells needed to encode an interrupt
+ source. The value should be 2. The first cell defines the interrupt number
+ (corresponds to the SPI interrupt number of GIC). The second cell specifies
+ the trigger type as defined in interrupts.txt in this directory.
+
+Example:
+
+ aidet: aidet@5fc20000 {
+ compatible = "socionext,uniphier-pro4-aidet";
+ reg = <0x5fc20000 0x200>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ };
diff --git a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
deleted file mode 100644
index 725f3b187886..000000000000
--- a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
+++ /dev/null
@@ -1,84 +0,0 @@
-* Allwinner sun8i GMAC ethernet controller
-
-This device is a platform glue layer for stmmac.
-Please see stmmac.txt for the other unchanged properties.
-
-Required properties:
-- compatible: should be one of the following string:
- "allwinner,sun8i-a83t-emac"
- "allwinner,sun8i-h3-emac"
- "allwinner,sun8i-v3s-emac"
- "allwinner,sun50i-a64-emac"
-- reg: address and length of the register for the device.
-- interrupts: interrupt for the device
-- interrupt-names: should be "macirq"
-- clocks: A phandle to the reference clock for this device
-- clock-names: should be "stmmaceth"
-- resets: A phandle to the reset control for this device
-- reset-names: should be "stmmaceth"
-- phy-mode: See ethernet.txt
-- phy-handle: See ethernet.txt
-- #address-cells: shall be 1
-- #size-cells: shall be 0
-- syscon: A phandle to the syscon of the SoC with one of the following
- compatible string:
- - allwinner,sun8i-h3-system-controller
- - allwinner,sun8i-v3s-system-controller
- - allwinner,sun50i-a64-system-controller
- - allwinner,sun8i-a83t-system-controller
-
-Optional properties:
-- allwinner,tx-delay-ps: TX clock delay chain value in ps. Range value is 0-700. Default is 0)
-- allwinner,rx-delay-ps: RX clock delay chain value in ps. Range value is 0-3100. Default is 0)
-Both delay properties need to be a multiple of 100. They control the delay for
-external PHY.
-
-Optional properties for the following compatibles:
- - "allwinner,sun8i-h3-emac",
- - "allwinner,sun8i-v3s-emac":
-- allwinner,leds-active-low: EPHY LEDs are active low
-
-Required child node of emac:
-- mdio bus node: should be named mdio
-
-Required properties of the mdio node:
-- #address-cells: shall be 1
-- #size-cells: shall be 0
-
-The device node referenced by "phy" or "phy-handle" should be a child node
-of the mdio node. See phy.txt for the generic PHY bindings.
-
-Required properties of the phy node with the following compatibles:
- - "allwinner,sun8i-h3-emac",
- - "allwinner,sun8i-v3s-emac":
-- clocks: a phandle to the reference clock for the EPHY
-- resets: a phandle to the reset control for the EPHY
-
-Example:
-
-emac: ethernet@1c0b000 {
- compatible = "allwinner,sun8i-h3-emac";
- syscon = <&syscon>;
- reg = <0x01c0b000 0x104>;
- interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "macirq";
- resets = <&ccu RST_BUS_EMAC>;
- reset-names = "stmmaceth";
- clocks = <&ccu CLK_BUS_EMAC>;
- clock-names = "stmmaceth";
- #address-cells = <1>;
- #size-cells = <0>;
-
- phy-handle = <&int_mii_phy>;
- phy-mode = "mii";
- allwinner,leds-active-low;
- mdio: mdio {
- #address-cells = <1>;
- #size-cells = <0>;
- int_mii_phy: ethernet-phy@1 {
- reg = <1>;
- clocks = <&ccu CLK_BUS_EPHY>;
- resets = <&ccu RST_BUS_EPHY>;
- };
- };
-};
diff --git a/Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt b/Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt
new file mode 100644
index 000000000000..b4aa7ddb5b13
--- /dev/null
+++ b/Documentation/devicetree/bindings/timer/nxp,tpm-timer.txt
@@ -0,0 +1,28 @@
+NXP Low Power Timer/Pulse Width Modulation Module (TPM)
+
+The Timer/PWM Module (TPM) supports input capture, output compare,
+and the generation of PWM signals to control electric motor and power
+management applications. The counter, compare and capture registers
+are clocked by an asynchronous clock that can remain enabled in low
+power modes. TPM can support global counter bus where one TPM drives
+the counter bus for the others, provided bit width is the same.
+
+Required properties:
+
+- compatible : should be "fsl,imx7ulp-tpm"
+- reg : Specifies base physical address and size of the register sets
+ for the clock event device and clock source device.
+- interrupts : Should be the clock event device interrupt.
+- clocks : The clocks provided by the SoC to drive the timer, must contain
+ an entry for each entry in clock-names.
+- clock-names : Must include the following entries: "igp" and "per".
+
+Example:
+tpm5: tpm@40260000 {
+ compatible = "fsl,imx7ulp-tpm";
+ reg = <0x40260000 0x1000>;
+ interrupts = <GIC_SPI 22 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&clks IMX7ULP_CLK_NIC1_BUS_DIV>,
+ <&clks IMX7ULP_CLK_LPTPM5>;
+ clock-names = "ipg", "per";
+};
diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt
index 1a05c1b243c1..6ca6b9e582a0 100644
--- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt
+++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt
@@ -12,46 +12,29 @@ datasheets.
Required Properties:
- compatible: must contain one or more of the following:
- - "renesas,cmt-32-r8a7740" for the r8a7740 32-bit CMT
- (CMT0)
- - "renesas,cmt-32-sh7372" for the sh7372 32-bit CMT
- (CMT0)
- - "renesas,cmt-32-sh73a0" for the sh73a0 32-bit CMT
- (CMT0)
- - "renesas,cmt-32" for all 32-bit CMT without fast clock support
- (CMT0 on sh7372, sh73a0 and r8a7740)
- This is a fallback for the above renesas,cmt-32-* entries.
-
- - "renesas,cmt-32-fast-r8a7740" for the r8a7740 32-bit CMT with fast
- clock support (CMT[234])
- - "renesas,cmt-32-fast-sh7372" for the sh7372 32-bit CMT with fast
- clock support (CMT[234])
- - "renesas,cmt-32-fast-sh73a0" for the sh73A0 32-bit CMT with fast
- clock support (CMT[234])
- - "renesas,cmt-32-fast" for all 32-bit CMT with fast clock support
- (CMT[234] on sh7372, sh73a0 and r8a7740)
- This is a fallback for the above renesas,cmt-32-fast-* entries.
-
- - "renesas,cmt-48-sh7372" for the sh7372 48-bit CMT
- (CMT1)
- "renesas,cmt-48-sh73a0" for the sh73A0 48-bit CMT
(CMT1)
- "renesas,cmt-48-r8a7740" for the r8a7740 48-bit CMT
(CMT1)
- "renesas,cmt-48" for all non-second generation 48-bit CMT
- (CMT1 on sh7372, sh73a0 and r8a7740)
+ (CMT1 on sh73a0 and r8a7740)
This is a fallback for the above renesas,cmt-48-* entries.
- - "renesas,cmt-48-r8a73a4" for the r8a73a4 48-bit CMT
- (CMT[01])
- - "renesas,cmt-48-r8a7790" for the r8a7790 48-bit CMT
- (CMT[01])
- - "renesas,cmt-48-r8a7791" for the r8a7791 48-bit CMT
- (CMT[01])
- - "renesas,cmt-48-gen2" for all second generation 48-bit CMT
- (CMT[01] on r8a73a4, r8a7790 and r8a7791)
- This is a fallback for the renesas,cmt-48-r8a73a4,
- renesas,cmt-48-r8a7790 and renesas,cmt-48-r8a7791 entries.
+ - "renesas,cmt0-r8a73a4" for the 32-bit CMT0 device included in r8a73a4.
+ - "renesas,cmt1-r8a73a4" for the 48-bit CMT1 device included in r8a73a4.
+ - "renesas,cmt0-r8a7790" for the 32-bit CMT0 device included in r8a7790.
+ - "renesas,cmt1-r8a7790" for the 48-bit CMT1 device included in r8a7790.
+ - "renesas,cmt0-r8a7791" for the 32-bit CMT0 device included in r8a7791.
+ - "renesas,cmt1-r8a7791" for the 48-bit CMT1 device included in r8a7791.
+ - "renesas,cmt0-r8a7793" for the 32-bit CMT0 device included in r8a7793.
+ - "renesas,cmt1-r8a7793" for the 48-bit CMT1 device included in r8a7793.
+ - "renesas,cmt0-r8a7794" for the 32-bit CMT0 device included in r8a7794.
+ - "renesas,cmt1-r8a7794" for the 48-bit CMT1 device included in r8a7794.
+
+ - "renesas,rcar-gen2-cmt0" for 32-bit CMT0 devices included in R-Car Gen2.
+ - "renesas,rcar-gen2-cmt1" for 48-bit CMT1 devices included in R-Car Gen2.
+ These are fallbacks for r8a73a4 and all the R-Car Gen2
+ entries listed above.
- reg: base address and length of the registers block for the timer module.
- interrupts: interrupt-specifier for the timer, one per channel.
@@ -59,21 +42,29 @@ Required Properties:
in clock-names.
- clock-names: must contain "fck" for the functional clock.
- - renesas,channels-mask: bitmask of the available channels.
-
-Example: R8A7790 (R-Car H2) CMT0 node
-
- CMT0 on R8A7790 implements hardware channels 5 and 6 only and names
- them channels 0 and 1 in the documentation.
+Example: R8A7790 (R-Car H2) CMT0 and CMT1 nodes
cmt0: timer@ffca0000 {
- compatible = "renesas,cmt-48-r8a7790", "renesas,cmt-48-gen2";
+ compatible = "renesas,cmt0-r8a7790", "renesas,rcar-gen2-cmt0";
reg = <0 0xffca0000 0 0x1004>;
interrupts = <0 142 IRQ_TYPE_LEVEL_HIGH>,
<0 142 IRQ_TYPE_LEVEL_HIGH>;
clocks = <&mstp1_clks R8A7790_CLK_CMT0>;
clock-names = "fck";
+ };
- renesas,channels-mask = <0x60>;
+ cmt1: timer@e6130000 {
+ compatible = "renesas,cmt1-r8a7790", "renesas,rcar-gen2-cmt1";
+ reg = <0 0xe6130000 0 0x1004>;
+ interrupts = <0 120 IRQ_TYPE_LEVEL_HIGH>,
+ <0 121 IRQ_TYPE_LEVEL_HIGH>,
+ <0 122 IRQ_TYPE_LEVEL_HIGH>,
+ <0 123 IRQ_TYPE_LEVEL_HIGH>,
+ <0 124 IRQ_TYPE_LEVEL_HIGH>,
+ <0 125 IRQ_TYPE_LEVEL_HIGH>,
+ <0 126 IRQ_TYPE_LEVEL_HIGH>,
+ <0 127 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&mstp3_clks R8A7790_CLK_CMT1>;
+ clock-names = "fck";
};
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt
index daf465bef758..36b27b1efec6 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -249,6 +249,7 @@ oxsemi Oxford Semiconductor, Ltd.
panasonic Panasonic Corporation
parade Parade Technologies Inc.
pericom Pericom Technology Inc.
+pervasive Pervasive Displays, Inc.
phytec PHYTEC Messtechnik GmbH
picochip Picochip Ltd
pine64 Pine64
diff --git a/Documentation/doc-guide/sphinx.rst b/Documentation/doc-guide/sphinx.rst
index 84e8e8a9cbdb..a2417633fdd8 100644
--- a/Documentation/doc-guide/sphinx.rst
+++ b/Documentation/doc-guide/sphinx.rst
@@ -19,6 +19,110 @@ Finally, there are thousands of plain text documentation files scattered around
``Documentation``. Some of these will likely be converted to reStructuredText
over time, but the bulk of them will remain in plain text.
+.. _sphinx_install:
+
+Sphinx Install
+==============
+
+The ReST markups currently used by the Documentation/ files are meant to be
+built with ``Sphinx`` version 1.3 or upper. If you're desiring to build
+PDF outputs, it is recommended to use version 1.4.6 or upper.
+
+There's a script that checks for the Spinx requirements. Please see
+:ref:`sphinx-pre-install` for further details.
+
+Most distributions are shipped with Sphinx, but its toolchain is fragile,
+and it is not uncommon that upgrading it or some other Python packages
+on your machine would cause the documentation build to break.
+
+A way to get rid of that is to use a different version than the one shipped
+on your distributions. In order to do that, it is recommended to install
+Sphinx inside a virtual environment, using ``virtualenv-3``
+or ``virtualenv``, depending on how your distribution packaged Python 3.
+
+.. note::
+
+ #) Sphinx versions below 1.5 don't work properly with Python's
+ docutils version 0.13.1 or upper. So, if you're willing to use
+ those versions, you should run ``pip install 'docutils==0.12'``.
+
+ #) It is recommended to use the RTD theme for html output. Depending
+ on the Sphinx version, it should be installed in separate,
+ with ``pip install sphinx_rtd_theme``.
+
+ #) Some ReST pages contain math expressions. Due to the way Sphinx work,
+ those expressions are written using LaTeX notation. It needs texlive
+ installed with amdfonts and amsmath in order to evaluate them.
+
+In summary, if you want to install Sphinx version 1.4.9, you should do::
+
+ $ virtualenv sphinx_1.4
+ $ . sphinx_1.4/bin/activate
+ (sphinx_1.4) $ pip install -r Documentation/sphinx/requirements.txt
+
+After running ``. sphinx_1.4/bin/activate``, the prompt will change,
+in order to indicate that you're using the new environment. If you
+open a new shell, you need to rerun this command to enter again at
+the virtual environment before building the documentation.
+
+Image output
+------------
+
+The kernel documentation build system contains an extension that
+handles images on both GraphViz and SVG formats (see
+:ref:`sphinx_kfigure`).
+
+For it to work, you need to install both GraphViz and ImageMagick
+packages. If those packages are not installed, the build system will
+still build the documentation, but won't include any images at the
+output.
+
+PDF and LaTeX builds
+--------------------
+
+Such builds are currently supported only with Sphinx versions 1.4 and upper.
+
+For PDF and LaTeX output, you'll also need ``XeLaTeX`` version 3.14159265.
+
+Depending on the distribution, you may also need to install a series of
+``texlive`` packages that provide the minimal set of functionalities
+required for ``XeLaTeX`` to work.
+
+.. _sphinx-pre-install:
+
+Checking for Sphinx dependencies
+--------------------------------
+
+There's a script that automatically check for Sphinx dependencies. If it can
+recognize your distribution, it will also give a hint about the install
+command line options for your distro::
+
+ $ ./scripts/sphinx-pre-install
+ Checking if the needed tools for Fedora release 26 (Twenty Six) are available
+ Warning: better to also install "texlive-luatex85".
+ You should run:
+
+ sudo dnf install -y texlive-luatex85
+ /usr/bin/virtualenv sphinx_1.4
+ . sphinx_1.4/bin/activate
+ pip install -r Documentation/sphinx/requirements.txt
+
+ Can't build as 1 mandatory dependency is missing at ./scripts/sphinx-pre-install line 468.
+
+By default, it checks all the requirements for both html and PDF, including
+the requirements for images, math expressions and LaTeX build, and assumes
+that a virtual Python environment will be used. The ones needed for html
+builds are assumed to be mandatory; the others to be optional.
+
+It supports two optional parameters:
+
+``--no-pdf``
+ Disable checks for PDF;
+
+``--no-virtualenv``
+ Use OS packaging for Sphinx instead of Python virtual environment.
+
+
Sphinx Build
============
@@ -118,7 +222,7 @@ Here are some specific guidelines for the kernel documentation:
the C domain
------------
-The `Sphinx C Domain`_ (name c) is suited for documentation of C API. E.g. a
+The **Sphinx C Domain** (name c) is suited for documentation of C API. E.g. a
function prototype:
.. code-block:: rst
@@ -229,6 +333,7 @@ Rendered as:
- column 3
+.. _sphinx_kfigure:
Figures & Images
================
diff --git a/Documentation/driver-api/basics.rst b/Documentation/driver-api/basics.rst
index ab82250c7727..73fa7d42bbba 100644
--- a/Documentation/driver-api/basics.rst
+++ b/Documentation/driver-api/basics.rst
@@ -4,7 +4,7 @@ Driver Basics
Driver Entry and Exit points
----------------------------
-.. kernel-doc:: include/linux/init.h
+.. kernel-doc:: include/linux/module.h
:internal:
Driver device table
@@ -103,9 +103,6 @@ Kernel utility functions
.. kernel-doc:: kernel/panic.c
:export:
-.. kernel-doc:: kernel/sys.c
- :export:
-
.. kernel-doc:: kernel/rcu/tree.c
:export:
diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst
index 31671b469627..dc384f2f7f34 100644
--- a/Documentation/driver-api/dma-buf.rst
+++ b/Documentation/driver-api/dma-buf.rst
@@ -139,9 +139,6 @@ DMA Fences
Seqno Hardware Fences
~~~~~~~~~~~~~~~~~~~~~
-.. kernel-doc:: drivers/dma-buf/seqno-fence.c
- :export:
-
.. kernel-doc:: include/linux/seqno-fence.h
:internal:
diff --git a/Documentation/driver-api/miscellaneous.rst b/Documentation/driver-api/miscellaneous.rst
index 8da7d115bafc..304ffb146cf9 100644
--- a/Documentation/driver-api/miscellaneous.rst
+++ b/Documentation/driver-api/miscellaneous.rst
@@ -47,4 +47,3 @@ used by one consumer at a time.
.. kernel-doc:: drivers/pwm/core.c
:export:
-
diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst
index 7060da136095..ecf8851d3565 100644
--- a/Documentation/driver-api/s390-drivers.rst
+++ b/Documentation/driver-api/s390-drivers.rst
@@ -75,7 +75,7 @@ The channel-measurement facility provides a means to collect measurement
data which is made available by the channel subsystem for each channel
attached device.
-.. kernel-doc:: arch/s390/include/asm/cmb.h
+.. kernel-doc:: arch/s390/include/uapi/asm/cmb.h
:internal:
.. kernel-doc:: drivers/s390/cio/cmf.c
diff --git a/Documentation/driver-api/scsi.rst b/Documentation/driver-api/scsi.rst
index 859fb672319f..5a2aa7a377d9 100644
--- a/Documentation/driver-api/scsi.rst
+++ b/Documentation/driver-api/scsi.rst
@@ -224,14 +224,6 @@ mid to lowlevel SCSI driver interface
.. kernel-doc:: drivers/scsi/hosts.c
:export:
-drivers/scsi/constants.c
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-mid to lowlevel SCSI driver interface
-
-.. kernel-doc:: drivers/scsi/constants.c
- :export:
-
Transport classes
-----------------
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
index 30e04f7a690d..69f08c0f23a8 100644
--- a/Documentation/driver-model/devres.txt
+++ b/Documentation/driver-model/devres.txt
@@ -312,6 +312,7 @@ IRQ
devm_irq_alloc_descs_from()
devm_irq_alloc_generic_chip()
devm_irq_setup_generic_chip()
+ devm_irq_sim_init()
LED
devm_led_classdev_register()
diff --git a/Documentation/fb/efifb.txt b/Documentation/fb/efifb.txt
index a59916c29b33..1a85c1bdaf38 100644
--- a/Documentation/fb/efifb.txt
+++ b/Documentation/fb/efifb.txt
@@ -27,5 +27,11 @@ You have to add the following kernel parameters in your elilo.conf:
Macbook Pro 17", iMac 20" :
video=efifb:i20
+Accepted options:
+
+nowc Don't map the framebuffer write combined. This can be used
+ to workaround side-effects and slowdowns on other CPU cores
+ when large amounts of console data are written.
+
--
Edgar Hucek <gimli@dark-green.com>
diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt
index 5e97a89420ef..dfb638c2f842 100644
--- a/Documentation/features/core/tracehook/arch-support.txt
+++ b/Documentation/features/core/tracehook/arch-support.txt
@@ -25,7 +25,7 @@
| mn10300: | ok |
| nios2: | ok |
| openrisc: | ok |
- | parisc: | TODO |
+ | parisc: | ok |
| powerpc: | ok |
| s390: | ok |
| score: | TODO |
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 73e7d91f03dc..405a3df759b3 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -829,9 +829,7 @@ struct address_space_operations {
swap_activate: Called when swapon is used on a file to allocate
space if necessary and pin the block lookup information in
memory. A return value of zero indicates success,
- in which case this file can be used to back swapspace. The
- swapspace operations will be proxied to this address space's
- ->swap_{out,in} methods.
+ in which case this file can be used to back swapspace.
swap_deactivate: Called during swapoff on files where swap_activate
was successful.
diff --git a/Documentation/gpio/gpio-legacy.txt b/Documentation/gpio/gpio-legacy.txt
index b34fd94f7089..5eacc147ea87 100644
--- a/Documentation/gpio/gpio-legacy.txt
+++ b/Documentation/gpio/gpio-legacy.txt
@@ -459,7 +459,7 @@ pin controller?
This is done by registering "ranges" of pins, which are essentially
cross-reference tables. These are described in
-Documentation/pinctrl.txt
+Documentation/driver-api/pinctl.rst
While the pin allocation is totally managed by the pinctrl subsystem,
gpio (under gpiolib) is still maintained by gpio drivers. It may happen
diff --git a/Documentation/gpu/drm-internals.rst b/Documentation/gpu/drm-internals.rst
index 0d936c67bf7d..5ee9674fb9e9 100644
--- a/Documentation/gpu/drm-internals.rst
+++ b/Documentation/gpu/drm-internals.rst
@@ -201,6 +201,8 @@ drivers.
Open/Close, File Operations and IOCTLs
======================================
+.. _drm_driver_fops:
+
File Operations
---------------
diff --git a/Documentation/gpu/drm-kms-helpers.rst b/Documentation/gpu/drm-kms-helpers.rst
index 7c5e2549a58a..13dd237418cc 100644
--- a/Documentation/gpu/drm-kms-helpers.rst
+++ b/Documentation/gpu/drm-kms-helpers.rst
@@ -296,3 +296,12 @@ Auxiliary Modeset Helpers
.. kernel-doc:: drivers/gpu/drm/drm_modeset_helper.c
:export:
+
+Framebuffer GEM Helper Reference
+================================
+
+.. kernel-doc:: drivers/gpu/drm/drm_gem_framebuffer_helper.c
+ :doc: overview
+
+.. kernel-doc:: drivers/gpu/drm/drm_gem_framebuffer_helper.c
+ :export:
diff --git a/Documentation/gpu/drm-kms.rst b/Documentation/gpu/drm-kms.rst
index 2d77c9580164..307284125d7a 100644
--- a/Documentation/gpu/drm-kms.rst
+++ b/Documentation/gpu/drm-kms.rst
@@ -523,9 +523,6 @@ Color Management Properties
.. kernel-doc:: drivers/gpu/drm/drm_color_mgmt.c
:doc: overview
-.. kernel-doc:: include/drm/drm_color_mgmt.h
- :internal:
-
.. kernel-doc:: drivers/gpu/drm/drm_color_mgmt.c
:export:
@@ -554,60 +551,8 @@ various modules/drivers.
Vertical Blanking
=================
-Vertical blanking plays a major role in graphics rendering. To achieve
-tear-free display, users must synchronize page flips and/or rendering to
-vertical blanking. The DRM API offers ioctls to perform page flips
-synchronized to vertical blanking and wait for vertical blanking.
-
-The DRM core handles most of the vertical blanking management logic,
-which involves filtering out spurious interrupts, keeping race-free
-blanking counters, coping with counter wrap-around and resets and
-keeping use counts. It relies on the driver to generate vertical
-blanking interrupts and optionally provide a hardware vertical blanking
-counter. Drivers must implement the following operations.
-
-- int (\*enable_vblank) (struct drm_device \*dev, int crtc); void
- (\*disable_vblank) (struct drm_device \*dev, int crtc);
- Enable or disable vertical blanking interrupts for the given CRTC.
-
-- u32 (\*get_vblank_counter) (struct drm_device \*dev, int crtc);
- Retrieve the value of the vertical blanking counter for the given
- CRTC. If the hardware maintains a vertical blanking counter its value
- should be returned. Otherwise drivers can use the
- :c:func:`drm_vblank_count()` helper function to handle this
- operation.
-
-Drivers must initialize the vertical blanking handling core with a call
-to :c:func:`drm_vblank_init()` in their load operation.
-
-Vertical blanking interrupts can be enabled by the DRM core or by
-drivers themselves (for instance to handle page flipping operations).
-The DRM core maintains a vertical blanking use count to ensure that the
-interrupts are not disabled while a user still needs them. To increment
-the use count, drivers call :c:func:`drm_vblank_get()`. Upon
-return vertical blanking interrupts are guaranteed to be enabled.
-
-To decrement the use count drivers call
-:c:func:`drm_vblank_put()`. Only when the use count drops to zero
-will the DRM core disable the vertical blanking interrupts after a delay
-by scheduling a timer. The delay is accessible through the
-vblankoffdelay module parameter or the ``drm_vblank_offdelay`` global
-variable and expressed in milliseconds. Its default value is 5000 ms.
-Zero means never disable, and a negative value means disable
-immediately. Drivers may override the behaviour by setting the
-:c:type:`struct drm_device <drm_device>`
-vblank_disable_immediate flag, which when set causes vblank interrupts
-to be disabled immediately regardless of the drm_vblank_offdelay
-value. The flag should only be set if there's a properly working
-hardware vblank counter present.
-
-When a vertical blanking interrupt occurs drivers only need to call the
-:c:func:`drm_handle_vblank()` function to account for the
-interrupt.
-
-Resources allocated by :c:func:`drm_vblank_init()` must be freed
-with a call to :c:func:`drm_vblank_cleanup()` in the driver unload
-operation handler.
+.. kernel-doc:: drivers/gpu/drm/drm_vblank.c
+ :doc: vblank handling
Vertical Blanking and Interrupt Handling Functions Reference
------------------------------------------------------------
diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
index 9412798645c1..b08e9dcd9177 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -191,7 +191,7 @@ acquired and release by :c:func:`calling drm_gem_object_get()` and
holding the lock.
When the last reference to a GEM object is released the GEM core calls
-the :c:type:`struct drm_driver <drm_driver>` gem_free_object
+the :c:type:`struct drm_driver <drm_driver>` gem_free_object_unlocked
operation. That operation is mandatory for GEM-enabled drivers and must
free the GEM object and all associated resources.
@@ -492,7 +492,7 @@ DRM Sync Objects
:doc: Overview
.. kernel-doc:: include/drm/drm_syncobj.h
- :export:
+ :internal:
.. kernel-doc:: drivers/gpu/drm/drm_syncobj.c
:export:
diff --git a/Documentation/gpu/drm-uapi.rst b/Documentation/gpu/drm-uapi.rst
index 858457567d3d..679373b4a03f 100644
--- a/Documentation/gpu/drm-uapi.rst
+++ b/Documentation/gpu/drm-uapi.rst
@@ -160,6 +160,8 @@ other hand, a driver requires shared state between clients which is
visible to user-space and accessible beyond open-file boundaries, they
cannot support render nodes.
+.. _drm_driver_ioctl:
+
IOCTL Support on Device Nodes
=============================
diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst
index 9c7ed3e3f1e9..2e7ee0313c1c 100644
--- a/Documentation/gpu/i915.rst
+++ b/Documentation/gpu/i915.rst
@@ -417,6 +417,10 @@ integrate with drm/i915 and to handle the `DRM_I915_PERF_OPEN` ioctl.
:functions: i915_perf_open_ioctl
.. kernel-doc:: drivers/gpu/drm/i915/i915_perf.c
:functions: i915_perf_release
+.. kernel-doc:: drivers/gpu/drm/i915/i915_perf.c
+ :functions: i915_perf_add_config_ioctl
+.. kernel-doc:: drivers/gpu/drm/i915/i915_perf.c
+ :functions: i915_perf_remove_config_ioctl
i915 Perf Stream
----------------
@@ -477,4 +481,16 @@ specific details than found in the more high-level sections.
.. kernel-doc:: drivers/gpu/drm/i915/i915_perf.c
:internal:
-.. WARNING: DOCPROC directive not supported: !Cdrivers/gpu/drm/i915/i915_irq.c
+Style
+=====
+
+The drm/i915 driver codebase has some style rules in addition to (and, in some
+cases, deviating from) the kernel coding style.
+
+Register macro definition style
+-------------------------------
+
+The style guide for ``i915_reg.h``.
+
+.. kernel-doc:: drivers/gpu/drm/i915/i915_reg.h
+ :doc: The i915 register macro definition style guide
diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
index 1ae42006deea..22af55d06ab8 100644
--- a/Documentation/gpu/todo.rst
+++ b/Documentation/gpu/todo.rst
@@ -108,8 +108,8 @@ This would be especially useful for tinydrm:
crtc state, clear that to the max values, x/y = 0 and w/h = MAX_INT, in
__drm_atomic_helper_crtc_duplicate_state().
-- Move tinydrm_merge_clips into drm_framebuffer.c, dropping the tinydrm_
- prefix ofc and using drm_fb_. drm_framebuffer.c makes sense since this
+- Move tinydrm_merge_clips into drm_framebuffer.c, dropping the tinydrm\_
+ prefix ofc and using drm_fb\_. drm_framebuffer.c makes sense since this
is a function useful to implement the fb->dirty function.
- Create a new drm_fb_dirty function which does essentially what e.g.
diff --git a/Documentation/hwmon/ftsteutates b/Documentation/hwmon/ftsteutates
index 8c10a916de20..af54db92391b 100644
--- a/Documentation/hwmon/ftsteutates
+++ b/Documentation/hwmon/ftsteutates
@@ -18,6 +18,10 @@ enhancements. It can monitor up to 4 voltages, 16 temperatures and
8 fans. It also contains an integrated watchdog which is currently
implemented in this driver.
+To clear a temperature or fan alarm, execute the following command with the
+correct path to the alarm file:
+ echo 0 >XXXX_alarm
+
Specification of the chip can be found here:
ftp://ftp.ts.fujitsu.com/pub/Mainboard-OEM-Sales/Services/Software&Tools/Linux_SystemMonitoring&Watchdog&GPIO/BMC-Teutates_Specification_V1.21.pdf
ftp://ftp.ts.fujitsu.com/pub/Mainboard-OEM-Sales/Services/Software&Tools/Linux_SystemMonitoring&Watchdog&GPIO/Fujitsu_mainboards-1-Sensors_HowTo-en-US.pdf
diff --git a/Documentation/hwmon/ibm-cffps b/Documentation/hwmon/ibm-cffps
new file mode 100644
index 000000000000..e05ecd8ecfcf
--- /dev/null
+++ b/Documentation/hwmon/ibm-cffps
@@ -0,0 +1,54 @@
+Kernel driver ibm-cffps
+=======================
+
+Supported chips:
+ * IBM Common Form Factor power supply
+
+Author: Eddie James <eajames@us.ibm.com>
+
+Description
+-----------
+
+This driver supports IBM Common Form Factor (CFF) power supplies. This driver
+is a client to the core PMBus driver.
+
+Usage Notes
+-----------
+
+This driver does not auto-detect devices. You will have to instantiate the
+devices explicitly. Please see Documentation/i2c/instantiating-devices for
+details.
+
+Sysfs entries
+-------------
+
+The following attributes are supported:
+
+curr1_alarm Output current over-current alarm.
+curr1_input Measured output current in mA.
+curr1_label "iout1"
+
+fan1_alarm Fan 1 warning.
+fan1_fault Fan 1 fault.
+fan1_input Fan 1 speed in RPM.
+fan2_alarm Fan 2 warning.
+fan2_fault Fan 2 fault.
+fan2_input Fan 2 speed in RPM.
+
+in1_alarm Input voltage under-voltage alarm.
+in1_input Measured input voltage in mV.
+in1_label "vin"
+in2_alarm Output voltage over-voltage alarm.
+in2_input Measured output voltage in mV.
+in2_label "vout1"
+
+power1_alarm Input fault or alarm.
+power1_input Measured input power in uW.
+power1_label "pin"
+
+temp1_alarm PSU inlet ambient temperature over-temperature alarm.
+temp1_input Measured PSU inlet ambient temp in millidegrees C.
+temp2_alarm Secondary rectifier temp over-temperature alarm.
+temp2_input Measured secondary rectifier temp in millidegrees C.
+temp3_alarm ORing FET temperature over-temperature alarm.
+temp3_input Measured ORing FET temperature in millidegrees C.
diff --git a/Documentation/hwmon/lm25066 b/Documentation/hwmon/lm25066
index 2cb20ebb234d..3fa6bf820c88 100644
--- a/Documentation/hwmon/lm25066
+++ b/Documentation/hwmon/lm25066
@@ -29,6 +29,11 @@ Supported chips:
Addresses scanned: -
Datasheet:
http://www.national.com/pf/LM/LM5066.html
+ * Texas Instruments LM5066I
+ Prefix: 'lm5066i'
+ Addresses scanned: -
+ Datasheet:
+ http://www.ti.com/product/LM5066I
Author: Guenter Roeck <linux@roeck-us.net>
@@ -37,8 +42,8 @@ Description
-----------
This driver supports hardware monitoring for National Semiconductor / TI LM25056,
-LM25063, LM25066, LM5064, and LM5066 Power Management, Monitoring, Control, and
-Protection ICs.
+LM25063, LM25066, LM5064, and LM5066/LM5066I Power Management, Monitoring,
+Control, and Protection ICs.
The driver is a client driver to the core PMBus driver. Please see
Documentation/hwmon/pmbus for details on PMBus client drivers.
diff --git a/Documentation/infiniband/tag_matching.txt b/Documentation/infiniband/tag_matching.txt
new file mode 100644
index 000000000000..d2a3bf819226
--- /dev/null
+++ b/Documentation/infiniband/tag_matching.txt
@@ -0,0 +1,64 @@
+Tag matching logic
+
+The MPI standard defines a set of rules, known as tag-matching, for matching
+source send operations to destination receives. The following parameters must
+match the following source and destination parameters:
+* Communicator
+* User tag - wild card may be specified by the receiver
+* Source rank – wild car may be specified by the receiver
+* Destination rank – wild
+The ordering rules require that when more than one pair of send and receive
+message envelopes may match, the pair that includes the earliest posted-send
+and the earliest posted-receive is the pair that must be used to satisfy the
+matching operation. However, this doesn’t imply that tags are consumed in
+the order they are created, e.g., a later generated tag may be consumed, if
+earlier tags can’t be used to satisfy the matching rules.
+
+When a message is sent from the sender to the receiver, the communication
+library may attempt to process the operation either after or before the
+corresponding matching receive is posted. If a matching receive is posted,
+this is an expected message, otherwise it is called an unexpected message.
+Implementations frequently use different matching schemes for these two
+different matching instances.
+
+To keep MPI library memory footprint down, MPI implementations typically use
+two different protocols for this purpose:
+
+1. The Eager protocol- the complete message is sent when the send is
+processed by the sender. A completion send is received in the send_cq
+notifying that the buffer can be reused.
+
+2. The Rendezvous Protocol - the sender sends the tag-matching header,
+and perhaps a portion of data when first notifying the receiver. When the
+corresponding buffer is posted, the responder will use the information from
+the header to initiate an RDMA READ operation directly to the matching buffer.
+A fin message needs to be received in order for the buffer to be reused.
+
+Tag matching implementation
+
+There are two types of matching objects used, the posted receive list and the
+unexpected message list. The application posts receive buffers through calls
+to the MPI receive routines in the posted receive list and posts send messages
+using the MPI send routines. The head of the posted receive list may be
+maintained by the hardware, with the software expected to shadow this list.
+
+When send is initiated and arrives at the receive side, if there is no
+pre-posted receive for this arriving message, it is passed to the software and
+placed in the unexpected message list. Otherwise the match is processed,
+including rendezvous processing, if appropriate, delivering the data to the
+specified receive buffer. This allows overlapping receive-side MPI tag
+matching with computation.
+
+When a receive-message is posted, the communication library will first check
+the software unexpected message list for a matching receive. If a match is
+found, data is delivered to the user buffer, using a software controlled
+protocol. The UCX implementation uses either an eager or rendezvous protocol,
+depending on data size. If no match is found, the entire pre-posted receive
+list is maintained by the hardware, and there is space to add one more
+pre-posted receive to this list, this receive is passed to the hardware.
+Software is expected to shadow this list, to help with processing MPI cancel
+operations. In addition, because hardware and software are not expected to be
+tightly synchronized with respect to the tag-matching operation, this shadow
+list is used to detect the case that a pre-posted receive is passed to the
+hardware, as the matching unexpected message is being passed from the hardware
+to the software.
diff --git a/Documentation/input/input.rst b/Documentation/input/input.rst
index 3b3a22975106..47f86a4bf16c 100644
--- a/Documentation/input/input.rst
+++ b/Documentation/input/input.rst
@@ -109,7 +109,7 @@ evdev nodes are created with minors starting with 256.
keyboard
~~~~~~~~
-``keyboard`` is in-kernel input handler ad is a part of VT code. It
+``keyboard`` is in-kernel input handler and is a part of VT code. It
consumes keyboard keystrokes and handles user input for VT consoles.
mousedev
diff --git a/Documentation/input/joydev/index.rst b/Documentation/input/joydev/index.rst
index 8d9666c7561c..ebcff43056e2 100644
--- a/Documentation/input/joydev/index.rst
+++ b/Documentation/input/joydev/index.rst
@@ -12,7 +12,6 @@ Linux Joystick support
.. toctree::
:maxdepth: 3
- :numbered:
joystick
joystick-api
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index 7003141a6d4f..329e740adea7 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -297,9 +297,9 @@ more details, with real examples.
ccflags-y specifies options for compiling with $(CC).
Example:
- # drivers/acpi/Makefile
- ccflags-y := -Os
- ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT
+ # drivers/acpi/acpica/Makefile
+ ccflags-y := -Os -D_LINUX -DBUILDING_ACPICA
+ ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT
This variable is necessary because the top Makefile owns the
variable $(KBUILD_CFLAGS) and uses it for compilation flags for the
diff --git a/Documentation/locking/crossrelease.txt b/Documentation/locking/crossrelease.txt
new file mode 100644
index 000000000000..bdf1423d5f99
--- /dev/null
+++ b/Documentation/locking/crossrelease.txt
@@ -0,0 +1,874 @@
+Crossrelease
+============
+
+Started by Byungchul Park <byungchul.park@lge.com>
+
+Contents:
+
+ (*) Background
+
+ - What causes deadlock
+ - How lockdep works
+
+ (*) Limitation
+
+ - Limit lockdep
+ - Pros from the limitation
+ - Cons from the limitation
+ - Relax the limitation
+
+ (*) Crossrelease
+
+ - Introduce crossrelease
+ - Introduce commit
+
+ (*) Implementation
+
+ - Data structures
+ - How crossrelease works
+
+ (*) Optimizations
+
+ - Avoid duplication
+ - Lockless for hot paths
+
+ (*) APPENDIX A: What lockdep does to work aggresively
+
+ (*) APPENDIX B: How to avoid adding false dependencies
+
+
+==========
+Background
+==========
+
+What causes deadlock
+--------------------
+
+A deadlock occurs when a context is waiting for an event to happen,
+which is impossible because another (or the) context who can trigger the
+event is also waiting for another (or the) event to happen, which is
+also impossible due to the same reason.
+
+For example:
+
+ A context going to trigger event C is waiting for event A to happen.
+ A context going to trigger event A is waiting for event B to happen.
+ A context going to trigger event B is waiting for event C to happen.
+
+A deadlock occurs when these three wait operations run at the same time,
+because event C cannot be triggered if event A does not happen, which in
+turn cannot be triggered if event B does not happen, which in turn
+cannot be triggered if event C does not happen. After all, no event can
+be triggered since any of them never meets its condition to wake up.
+
+A dependency might exist between two waiters and a deadlock might happen
+due to an incorrect releationship between dependencies. Thus, we must
+define what a dependency is first. A dependency exists between them if:
+
+ 1. There are two waiters waiting for each event at a given time.
+ 2. The only way to wake up each waiter is to trigger its event.
+ 3. Whether one can be woken up depends on whether the other can.
+
+Each wait in the example creates its dependency like:
+
+ Event C depends on event A.
+ Event A depends on event B.
+ Event B depends on event C.
+
+ NOTE: Precisely speaking, a dependency is one between whether a
+ waiter for an event can be woken up and whether another waiter for
+ another event can be woken up. However from now on, we will describe
+ a dependency as if it's one between an event and another event for
+ simplicity.
+
+And they form circular dependencies like:
+
+ -> C -> A -> B -
+ / \
+ \ /
+ ----------------
+
+ where 'A -> B' means that event A depends on event B.
+
+Such circular dependencies lead to a deadlock since no waiter can meet
+its condition to wake up as described.
+
+CONCLUSION
+
+Circular dependencies cause a deadlock.
+
+
+How lockdep works
+-----------------
+
+Lockdep tries to detect a deadlock by checking dependencies created by
+lock operations, acquire and release. Waiting for a lock corresponds to
+waiting for an event, and releasing a lock corresponds to triggering an
+event in the previous section.
+
+In short, lockdep does:
+
+ 1. Detect a new dependency.
+ 2. Add the dependency into a global graph.
+ 3. Check if that makes dependencies circular.
+ 4. Report a deadlock or its possibility if so.
+
+For example, consider a graph built by lockdep that looks like:
+
+ A -> B -
+ \
+ -> E
+ /
+ C -> D -
+
+ where A, B,..., E are different lock classes.
+
+Lockdep will add a dependency into the graph on detection of a new
+dependency. For example, it will add a dependency 'E -> C' when a new
+dependency between lock E and lock C is detected. Then the graph will be:
+
+ A -> B -
+ \
+ -> E -
+ / \
+ -> C -> D - \
+ / /
+ \ /
+ ------------------
+
+ where A, B,..., E are different lock classes.
+
+This graph contains a subgraph which demonstrates circular dependencies:
+
+ -> E -
+ / \
+ -> C -> D - \
+ / /
+ \ /
+ ------------------
+
+ where C, D and E are different lock classes.
+
+This is the condition under which a deadlock might occur. Lockdep
+reports it on detection after adding a new dependency. This is the way
+how lockdep works.
+
+CONCLUSION
+
+Lockdep detects a deadlock or its possibility by checking if circular
+dependencies were created after adding each new dependency.
+
+
+==========
+Limitation
+==========
+
+Limit lockdep
+-------------
+
+Limiting lockdep to work on only typical locks e.g. spin locks and
+mutexes, which are released within the acquire context, the
+implementation becomes simple but its capacity for detection becomes
+limited. Let's check pros and cons in next section.
+
+
+Pros from the limitation
+------------------------
+
+Given the limitation, when acquiring a lock, locks in a held_locks
+cannot be released if the context cannot acquire it so has to wait to
+acquire it, which means all waiters for the locks in the held_locks are
+stuck. It's an exact case to create dependencies between each lock in
+the held_locks and the lock to acquire.
+
+For example:
+
+ CONTEXT X
+ ---------
+ acquire A
+ acquire B /* Add a dependency 'A -> B' */
+ release B
+ release A
+
+ where A and B are different lock classes.
+
+When acquiring lock A, the held_locks of CONTEXT X is empty thus no
+dependency is added. But when acquiring lock B, lockdep detects and adds
+a new dependency 'A -> B' between lock A in the held_locks and lock B.
+They can be simply added whenever acquiring each lock.
+
+And data required by lockdep exists in a local structure, held_locks
+embedded in task_struct. Forcing to access the data within the context,
+lockdep can avoid racy problems without explicit locks while handling
+the local data.
+
+Lastly, lockdep only needs to keep locks currently being held, to build
+a dependency graph. However, relaxing the limitation, it needs to keep
+even locks already released, because a decision whether they created
+dependencies might be long-deferred.
+
+To sum up, we can expect several advantages from the limitation:
+
+ 1. Lockdep can easily identify a dependency when acquiring a lock.
+ 2. Races are avoidable while accessing local locks in a held_locks.
+ 3. Lockdep only needs to keep locks currently being held.
+
+CONCLUSION
+
+Given the limitation, the implementation becomes simple and efficient.
+
+
+Cons from the limitation
+------------------------
+
+Given the limitation, lockdep is applicable only to typical locks. For
+example, page locks for page access or completions for synchronization
+cannot work with lockdep.
+
+Can we detect deadlocks below, under the limitation?
+
+Example 1:
+
+ CONTEXT X CONTEXT Y CONTEXT Z
+ --------- --------- ----------
+ mutex_lock A
+ lock_page B
+ lock_page B
+ mutex_lock A /* DEADLOCK */
+ unlock_page B held by X
+ unlock_page B
+ mutex_unlock A
+ mutex_unlock A
+
+ where A and B are different lock classes.
+
+No, we cannot.
+
+Example 2:
+
+ CONTEXT X CONTEXT Y
+ --------- ---------
+ mutex_lock A
+ mutex_lock A
+ wait_for_complete B /* DEADLOCK */
+ complete B
+ mutex_unlock A
+ mutex_unlock A
+
+ where A is a lock class and B is a completion variable.
+
+No, we cannot.
+
+CONCLUSION
+
+Given the limitation, lockdep cannot detect a deadlock or its
+possibility caused by page locks or completions.
+
+
+Relax the limitation
+--------------------
+
+Under the limitation, things to create dependencies are limited to
+typical locks. However, synchronization primitives like page locks and
+completions, which are allowed to be released in any context, also
+create dependencies and can cause a deadlock. So lockdep should track
+these locks to do a better job. We have to relax the limitation for
+these locks to work with lockdep.
+
+Detecting dependencies is very important for lockdep to work because
+adding a dependency means adding an opportunity to check whether it
+causes a deadlock. The more lockdep adds dependencies, the more it
+thoroughly works. Thus Lockdep has to do its best to detect and add as
+many true dependencies into a graph as possible.
+
+For example, considering only typical locks, lockdep builds a graph like:
+
+ A -> B -
+ \
+ -> E
+ /
+ C -> D -
+
+ where A, B,..., E are different lock classes.
+
+On the other hand, under the relaxation, additional dependencies might
+be created and added. Assuming additional 'FX -> C' and 'E -> GX' are
+added thanks to the relaxation, the graph will be:
+
+ A -> B -
+ \
+ -> E -> GX
+ /
+ FX -> C -> D -
+
+ where A, B,..., E, FX and GX are different lock classes, and a suffix
+ 'X' is added on non-typical locks.
+
+The latter graph gives us more chances to check circular dependencies
+than the former. However, it might suffer performance degradation since
+relaxing the limitation, with which design and implementation of lockdep
+can be efficient, might introduce inefficiency inevitably. So lockdep
+should provide two options, strong detection and efficient detection.
+
+Choosing efficient detection:
+
+ Lockdep works with only locks restricted to be released within the
+ acquire context. However, lockdep works efficiently.
+
+Choosing strong detection:
+
+ Lockdep works with all synchronization primitives. However, lockdep
+ suffers performance degradation.
+
+CONCLUSION
+
+Relaxing the limitation, lockdep can add additional dependencies giving
+additional opportunities to check circular dependencies.
+
+
+============
+Crossrelease
+============
+
+Introduce crossrelease
+----------------------
+
+In order to allow lockdep to handle additional dependencies by what
+might be released in any context, namely 'crosslock', we have to be able
+to identify those created by crosslocks. The proposed 'crossrelease'
+feature provoides a way to do that.
+
+Crossrelease feature has to do:
+
+ 1. Identify dependencies created by crosslocks.
+ 2. Add the dependencies into a dependency graph.
+
+That's all. Once a meaningful dependency is added into graph, then
+lockdep would work with the graph as it did. The most important thing
+crossrelease feature has to do is to correctly identify and add true
+dependencies into the global graph.
+
+A dependency e.g. 'A -> B' can be identified only in the A's release
+context because a decision required to identify the dependency can be
+made only in the release context. That is to decide whether A can be
+released so that a waiter for A can be woken up. It cannot be made in
+other than the A's release context.
+
+It's no matter for typical locks because each acquire context is same as
+its release context, thus lockdep can decide whether a lock can be
+released in the acquire context. However for crosslocks, lockdep cannot
+make the decision in the acquire context but has to wait until the
+release context is identified.
+
+Therefore, deadlocks by crosslocks cannot be detected just when it
+happens, because those cannot be identified until the crosslocks are
+released. However, deadlock possibilities can be detected and it's very
+worth. See 'APPENDIX A' section to check why.
+
+CONCLUSION
+
+Using crossrelease feature, lockdep can work with what might be released
+in any context, namely crosslock.
+
+
+Introduce commit
+----------------
+
+Since crossrelease defers the work adding true dependencies of
+crosslocks until they are actually released, crossrelease has to queue
+all acquisitions which might create dependencies with the crosslocks.
+Then it identifies dependencies using the queued data in batches at a
+proper time. We call it 'commit'.
+
+There are four types of dependencies:
+
+1. TT type: 'typical lock A -> typical lock B'
+
+ Just when acquiring B, lockdep can see it's in the A's release
+ context. So the dependency between A and B can be identified
+ immediately. Commit is unnecessary.
+
+2. TC type: 'typical lock A -> crosslock BX'
+
+ Just when acquiring BX, lockdep can see it's in the A's release
+ context. So the dependency between A and BX can be identified
+ immediately. Commit is unnecessary, too.
+
+3. CT type: 'crosslock AX -> typical lock B'
+
+ When acquiring B, lockdep cannot identify the dependency because
+ there's no way to know if it's in the AX's release context. It has
+ to wait until the decision can be made. Commit is necessary.
+
+4. CC type: 'crosslock AX -> crosslock BX'
+
+ When acquiring BX, lockdep cannot identify the dependency because
+ there's no way to know if it's in the AX's release context. It has
+ to wait until the decision can be made. Commit is necessary.
+ But, handling CC type is not implemented yet. It's a future work.
+
+Lockdep can work without commit for typical locks, but commit step is
+necessary once crosslocks are involved. Introducing commit, lockdep
+performs three steps. What lockdep does in each step is:
+
+1. Acquisition: For typical locks, lockdep does what it originally did
+ and queues the lock so that CT type dependencies can be checked using
+ it at the commit step. For crosslocks, it saves data which will be
+ used at the commit step and increases a reference count for it.
+
+2. Commit: No action is reauired for typical locks. For crosslocks,
+ lockdep adds CT type dependencies using the data saved at the
+ acquisition step.
+
+3. Release: No changes are required for typical locks. When a crosslock
+ is released, it decreases a reference count for it.
+
+CONCLUSION
+
+Crossrelease introduces commit step to handle dependencies of crosslocks
+in batches at a proper time.
+
+
+==============
+Implementation
+==============
+
+Data structures
+---------------
+
+Crossrelease introduces two main data structures.
+
+1. hist_lock
+
+ This is an array embedded in task_struct, for keeping lock history so
+ that dependencies can be added using them at the commit step. Since
+ it's local data, it can be accessed locklessly in the owner context.
+ The array is filled at the acquisition step and consumed at the
+ commit step. And it's managed in circular manner.
+
+2. cross_lock
+
+ One per lockdep_map exists. This is for keeping data of crosslocks
+ and used at the commit step.
+
+
+How crossrelease works
+----------------------
+
+It's the key of how crossrelease works, to defer necessary works to an
+appropriate point in time and perform in at once at the commit step.
+Let's take a look with examples step by step, starting from how lockdep
+works without crossrelease for typical locks.
+
+ acquire A /* Push A onto held_locks */
+ acquire B /* Push B onto held_locks and add 'A -> B' */
+ acquire C /* Push C onto held_locks and add 'B -> C' */
+ release C /* Pop C from held_locks */
+ release B /* Pop B from held_locks */
+ release A /* Pop A from held_locks */
+
+ where A, B and C are different lock classes.
+
+ NOTE: This document assumes that readers already understand how
+ lockdep works without crossrelease thus omits details. But there's
+ one thing to note. Lockdep pretends to pop a lock from held_locks
+ when releasing it. But it's subtly different from the original pop
+ operation because lockdep allows other than the top to be poped.
+
+In this case, lockdep adds 'the top of held_locks -> the lock to acquire'
+dependency every time acquiring a lock.
+
+After adding 'A -> B', a dependency graph will be:
+
+ A -> B
+
+ where A and B are different lock classes.
+
+And after adding 'B -> C', the graph will be:
+
+ A -> B -> C
+
+ where A, B and C are different lock classes.
+
+Let's performs commit step even for typical locks to add dependencies.
+Of course, commit step is not necessary for them, however, it would work
+well because this is a more general way.
+
+ acquire A
+ /*
+ * Queue A into hist_locks
+ *
+ * In hist_locks: A
+ * In graph: Empty
+ */
+
+ acquire B
+ /*
+ * Queue B into hist_locks
+ *
+ * In hist_locks: A, B
+ * In graph: Empty
+ */
+
+ acquire C
+ /*
+ * Queue C into hist_locks
+ *
+ * In hist_locks: A, B, C
+ * In graph: Empty
+ */
+
+ commit C
+ /*
+ * Add 'C -> ?'
+ * Answer the following to decide '?'
+ * What has been queued since acquire C: Nothing
+ *
+ * In hist_locks: A, B, C
+ * In graph: Empty
+ */
+
+ release C
+
+ commit B
+ /*
+ * Add 'B -> ?'
+ * Answer the following to decide '?'
+ * What has been queued since acquire B: C
+ *
+ * In hist_locks: A, B, C
+ * In graph: 'B -> C'
+ */
+
+ release B
+
+ commit A
+ /*
+ * Add 'A -> ?'
+ * Answer the following to decide '?'
+ * What has been queued since acquire A: B, C
+ *
+ * In hist_locks: A, B, C
+ * In graph: 'B -> C', 'A -> B', 'A -> C'
+ */
+
+ release A
+
+ where A, B and C are different lock classes.
+
+In this case, dependencies are added at the commit step as described.
+
+After commits for A, B and C, the graph will be:
+
+ A -> B -> C
+
+ where A, B and C are different lock classes.
+
+ NOTE: A dependency 'A -> C' is optimized out.
+
+We can see the former graph built without commit step is same as the
+latter graph built using commit steps. Of course the former way leads to
+earlier finish for building the graph, which means we can detect a
+deadlock or its possibility sooner. So the former way would be prefered
+when possible. But we cannot avoid using the latter way for crosslocks.
+
+Let's look at how commit steps work for crosslocks. In this case, the
+commit step is performed only on crosslock AX as real. And it assumes
+that the AX release context is different from the AX acquire context.
+
+ BX RELEASE CONTEXT BX ACQUIRE CONTEXT
+ ------------------ ------------------
+ acquire A
+ /*
+ * Push A onto held_locks
+ * Queue A into hist_locks
+ *
+ * In held_locks: A
+ * In hist_locks: A
+ * In graph: Empty
+ */
+
+ acquire BX
+ /*
+ * Add 'the top of held_locks -> BX'
+ *
+ * In held_locks: A
+ * In hist_locks: A
+ * In graph: 'A -> BX'
+ */
+
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ It must be guaranteed that the following operations are seen after
+ acquiring BX globally. It can be done by things like barrier.
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ acquire C
+ /*
+ * Push C onto held_locks
+ * Queue C into hist_locks
+ *
+ * In held_locks: C
+ * In hist_locks: C
+ * In graph: 'A -> BX'
+ */
+
+ release C
+ /*
+ * Pop C from held_locks
+ *
+ * In held_locks: Empty
+ * In hist_locks: C
+ * In graph: 'A -> BX'
+ */
+ acquire D
+ /*
+ * Push D onto held_locks
+ * Queue D into hist_locks
+ * Add 'the top of held_locks -> D'
+ *
+ * In held_locks: A, D
+ * In hist_locks: A, D
+ * In graph: 'A -> BX', 'A -> D'
+ */
+ acquire E
+ /*
+ * Push E onto held_locks
+ * Queue E into hist_locks
+ *
+ * In held_locks: E
+ * In hist_locks: C, E
+ * In graph: 'A -> BX', 'A -> D'
+ */
+
+ release E
+ /*
+ * Pop E from held_locks
+ *
+ * In held_locks: Empty
+ * In hist_locks: D, E
+ * In graph: 'A -> BX', 'A -> D'
+ */
+ release D
+ /*
+ * Pop D from held_locks
+ *
+ * In held_locks: A
+ * In hist_locks: A, D
+ * In graph: 'A -> BX', 'A -> D'
+ */
+ commit BX
+ /*
+ * Add 'BX -> ?'
+ * What has been queued since acquire BX: C, E
+ *
+ * In held_locks: Empty
+ * In hist_locks: D, E
+ * In graph: 'A -> BX', 'A -> D',
+ * 'BX -> C', 'BX -> E'
+ */
+
+ release BX
+ /*
+ * In held_locks: Empty
+ * In hist_locks: D, E
+ * In graph: 'A -> BX', 'A -> D',
+ * 'BX -> C', 'BX -> E'
+ */
+ release A
+ /*
+ * Pop A from held_locks
+ *
+ * In held_locks: Empty
+ * In hist_locks: A, D
+ * In graph: 'A -> BX', 'A -> D',
+ * 'BX -> C', 'BX -> E'
+ */
+
+ where A, BX, C,..., E are different lock classes, and a suffix 'X' is
+ added on crosslocks.
+
+Crossrelease considers all acquisitions after acqiuring BX are
+candidates which might create dependencies with BX. True dependencies
+will be determined when identifying the release context of BX. Meanwhile,
+all typical locks are queued so that they can be used at the commit step.
+And then two dependencies 'BX -> C' and 'BX -> E' are added at the
+commit step when identifying the release context.
+
+The final graph will be, with crossrelease:
+
+ -> C
+ /
+ -> BX -
+ / \
+ A - -> E
+ \
+ -> D
+
+ where A, BX, C,..., E are different lock classes, and a suffix 'X' is
+ added on crosslocks.
+
+However, the final graph will be, without crossrelease:
+
+ A -> D
+
+ where A and D are different lock classes.
+
+The former graph has three more dependencies, 'A -> BX', 'BX -> C' and
+'BX -> E' giving additional opportunities to check if they cause
+deadlocks. This way lockdep can detect a deadlock or its possibility
+caused by crosslocks.
+
+CONCLUSION
+
+We checked how crossrelease works with several examples.
+
+
+=============
+Optimizations
+=============
+
+Avoid duplication
+-----------------
+
+Crossrelease feature uses a cache like what lockdep already uses for
+dependency chains, but this time it's for caching CT type dependencies.
+Once that dependency is cached, the same will never be added again.
+
+
+Lockless for hot paths
+----------------------
+
+To keep all locks for later use at the commit step, crossrelease adopts
+a local array embedded in task_struct, which makes access to the data
+lockless by forcing it to happen only within the owner context. It's
+like how lockdep handles held_locks. Lockless implmentation is important
+since typical locks are very frequently acquired and released.
+
+
+=================================================
+APPENDIX A: What lockdep does to work aggresively
+=================================================
+
+A deadlock actually occurs when all wait operations creating circular
+dependencies run at the same time. Even though they don't, a potential
+deadlock exists if the problematic dependencies exist. Thus it's
+meaningful to detect not only an actual deadlock but also its potential
+possibility. The latter is rather valuable. When a deadlock occurs
+actually, we can identify what happens in the system by some means or
+other even without lockdep. However, there's no way to detect possiblity
+without lockdep unless the whole code is parsed in head. It's terrible.
+Lockdep does the both, and crossrelease only focuses on the latter.
+
+Whether or not a deadlock actually occurs depends on several factors.
+For example, what order contexts are switched in is a factor. Assuming
+circular dependencies exist, a deadlock would occur when contexts are
+switched so that all wait operations creating the dependencies run
+simultaneously. Thus to detect a deadlock possibility even in the case
+that it has not occured yet, lockdep should consider all possible
+combinations of dependencies, trying to:
+
+1. Use a global dependency graph.
+
+ Lockdep combines all dependencies into one global graph and uses them,
+ regardless of which context generates them or what order contexts are
+ switched in. Aggregated dependencies are only considered so they are
+ prone to be circular if a problem exists.
+
+2. Check dependencies between classes instead of instances.
+
+ What actually causes a deadlock are instances of lock. However,
+ lockdep checks dependencies between classes instead of instances.
+ This way lockdep can detect a deadlock which has not happened but
+ might happen in future by others but the same class.
+
+3. Assume all acquisitions lead to waiting.
+
+ Although locks might be acquired without waiting which is essential
+ to create dependencies, lockdep assumes all acquisitions lead to
+ waiting since it might be true some time or another.
+
+CONCLUSION
+
+Lockdep detects not only an actual deadlock but also its possibility,
+and the latter is more valuable.
+
+
+==================================================
+APPENDIX B: How to avoid adding false dependencies
+==================================================
+
+Remind what a dependency is. A dependency exists if:
+
+ 1. There are two waiters waiting for each event at a given time.
+ 2. The only way to wake up each waiter is to trigger its event.
+ 3. Whether one can be woken up depends on whether the other can.
+
+For example:
+
+ acquire A
+ acquire B /* A dependency 'A -> B' exists */
+ release B
+ release A
+
+ where A and B are different lock classes.
+
+A depedency 'A -> B' exists since:
+
+ 1. A waiter for A and a waiter for B might exist when acquiring B.
+ 2. Only way to wake up each is to release what it waits for.
+ 3. Whether the waiter for A can be woken up depends on whether the
+ other can. IOW, TASK X cannot release A if it fails to acquire B.
+
+For another example:
+
+ TASK X TASK Y
+ ------ ------
+ acquire AX
+ acquire B /* A dependency 'AX -> B' exists */
+ release B
+ release AX held by Y
+
+ where AX and B are different lock classes, and a suffix 'X' is added
+ on crosslocks.
+
+Even in this case involving crosslocks, the same rule can be applied. A
+depedency 'AX -> B' exists since:
+
+ 1. A waiter for AX and a waiter for B might exist when acquiring B.
+ 2. Only way to wake up each is to release what it waits for.
+ 3. Whether the waiter for AX can be woken up depends on whether the
+ other can. IOW, TASK X cannot release AX if it fails to acquire B.
+
+Let's take a look at more complicated example:
+
+ TASK X TASK Y
+ ------ ------
+ acquire B
+ release B
+ fork Y
+ acquire AX
+ acquire C /* A dependency 'AX -> C' exists */
+ release C
+ release AX held by Y
+
+ where AX, B and C are different lock classes, and a suffix 'X' is
+ added on crosslocks.
+
+Does a dependency 'AX -> B' exist? Nope.
+
+Two waiters are essential to create a dependency. However, waiters for
+AX and B to create 'AX -> B' cannot exist at the same time in this
+example. Thus the dependency 'AX -> B' cannot be created.
+
+It would be ideal if the full set of true ones can be considered. But
+we can ensure nothing but what actually happened. Relying on what
+actually happens at runtime, we can anyway add only true ones, though
+they might be a subset of true ones. It's similar to how lockdep works
+for typical locks. There might be more true dependencies than what
+lockdep has detected in runtime. Lockdep has no choice but to rely on
+what actually happens. Crossrelease also relies on it.
+
+CONCLUSION
+
+Relying on what actually happens, lockdep can avoid adding false
+dependencies.
diff --git a/Documentation/locking/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt
index 8666070d3189..6c6e8c2410de 100644
--- a/Documentation/locking/rt-mutex-design.txt
+++ b/Documentation/locking/rt-mutex-design.txt
@@ -97,9 +97,9 @@ waiter - A waiter is a struct that is stored on the stack of a blocked
a process being blocked on the mutex, it is fine to allocate
the waiter on the process's stack (local variable). This
structure holds a pointer to the task, as well as the mutex that
- the task is blocked on. It also has the plist node structures to
- place the task in the waiter_list of a mutex as well as the
- pi_list of a mutex owner task (described below).
+ the task is blocked on. It also has rbtree node structures to
+ place the task in the waiters rbtree of a mutex as well as the
+ pi_waiters rbtree of a mutex owner task (described below).
waiter is sometimes used in reference to the task that is waiting
on a mutex. This is the same as waiter->task.
@@ -179,53 +179,34 @@ again.
|
F->L5-+
+If process G has the highest priority in the chain, then all the tasks up
+the chain (A and B in this example), must have their priorities increased
+to that of G.
-Plist
------
-
-Before I go further and talk about how the PI chain is stored through lists
-on both mutexes and processes, I'll explain the plist. This is similar to
-the struct list_head functionality that is already in the kernel.
-The implementation of plist is out of scope for this document, but it is
-very important to understand what it does.
-
-There are a few differences between plist and list, the most important one
-being that plist is a priority sorted linked list. This means that the
-priorities of the plist are sorted, such that it takes O(1) to retrieve the
-highest priority item in the list. Obviously this is useful to store processes
-based on their priorities.
-
-Another difference, which is important for implementation, is that, unlike
-list, the head of the list is a different element than the nodes of a list.
-So the head of the list is declared as struct plist_head and nodes that will
-be added to the list are declared as struct plist_node.
-
-
-Mutex Waiter List
+Mutex Waiters Tree
-----------------
-Every mutex keeps track of all the waiters that are blocked on itself. The mutex
-has a plist to store these waiters by priority. This list is protected by
-a spin lock that is located in the struct of the mutex. This lock is called
-wait_lock. Since the modification of the waiter list is never done in
-interrupt context, the wait_lock can be taken without disabling interrupts.
+Every mutex keeps track of all the waiters that are blocked on itself. The
+mutex has a rbtree to store these waiters by priority. This tree is protected
+by a spin lock that is located in the struct of the mutex. This lock is called
+wait_lock.
-Task PI List
+Task PI Tree
------------
-To keep track of the PI chains, each process has its own PI list. This is
-a list of all top waiters of the mutexes that are owned by the process.
-Note that this list only holds the top waiters and not all waiters that are
+To keep track of the PI chains, each process has its own PI rbtree. This is
+a tree of all top waiters of the mutexes that are owned by the process.
+Note that this tree only holds the top waiters and not all waiters that are
blocked on mutexes owned by the process.
-The top of the task's PI list is always the highest priority task that
+The top of the task's PI tree is always the highest priority task that
is waiting on a mutex that is owned by the task. So if the task has
inherited a priority, it will always be the priority of the task that is
-at the top of this list.
+at the top of this tree.
-This list is stored in the task structure of a process as a plist called
-pi_list. This list is protected by a spin lock also in the task structure,
+This tree is stored in the task structure of a process as a rbtree called
+pi_waiters. It is protected by a spin lock also in the task structure,
called pi_lock. This lock may also be taken in interrupt context, so when
locking the pi_lock, interrupts must be disabled.
@@ -312,15 +293,12 @@ Mutex owner and flags
The mutex structure contains a pointer to the owner of the mutex. If the
mutex is not owned, this owner is set to NULL. Since all architectures
-have the task structure on at least a four byte alignment (and if this is
-not true, the rtmutex.c code will be broken!), this allows for the two
-least significant bits to be used as flags. This part is also described
-in Documentation/rt-mutex.txt, but will also be briefly described here.
-
-Bit 0 is used as the "Pending Owner" flag. This is described later.
-Bit 1 is used as the "Has Waiters" flags. This is also described later
- in more detail, but is set whenever there are waiters on a mutex.
+have the task structure on at least a two byte alignment (and if this is
+not true, the rtmutex.c code will be broken!), this allows for the least
+significant bit to be used as a flag. Bit 0 is used as the "Has Waiters"
+flag. It's set whenever there are waiters on a mutex.
+See Documentation/locking/rt-mutex.txt for further details.
cmpxchg Tricks
--------------
@@ -359,40 +337,31 @@ Priority adjustments
--------------------
The implementation of the PI code in rtmutex.c has several places that a
-process must adjust its priority. With the help of the pi_list of a
+process must adjust its priority. With the help of the pi_waiters of a
process this is rather easy to know what needs to be adjusted.
-The functions implementing the task adjustments are rt_mutex_adjust_prio,
-__rt_mutex_adjust_prio (same as the former, but expects the task pi_lock
-to already be taken), rt_mutex_getprio, and rt_mutex_setprio.
+The functions implementing the task adjustments are rt_mutex_adjust_prio
+and rt_mutex_setprio. rt_mutex_setprio is only used in rt_mutex_adjust_prio.
-rt_mutex_getprio and rt_mutex_setprio are only used in __rt_mutex_adjust_prio.
+rt_mutex_adjust_prio examines the priority of the task, and the highest
+priority process that is waiting any of mutexes owned by the task. Since
+the pi_waiters of a task holds an order by priority of all the top waiters
+of all the mutexes that the task owns, we simply need to compare the top
+pi waiter to its own normal/deadline priority and take the higher one.
+Then rt_mutex_setprio is called to adjust the priority of the task to the
+new priority. Note that rt_mutex_setprio is defined in kernel/sched/core.c
+to implement the actual change in priority.
-rt_mutex_getprio returns the priority that the task should have. Either the
-task's own normal priority, or if a process of a higher priority is waiting on
-a mutex owned by the task, then that higher priority should be returned.
-Since the pi_list of a task holds an order by priority list of all the top
-waiters of all the mutexes that the task owns, rt_mutex_getprio simply needs
-to compare the top pi waiter to its own normal priority, and return the higher
-priority back.
+(Note: For the "prio" field in task_struct, the lower the number, the
+ higher the priority. A "prio" of 5 is of higher priority than a
+ "prio" of 10.)
-(Note: if looking at the code, you will notice that the lower number of
- prio is returned. This is because the prio field in the task structure
- is an inverse order of the actual priority. So a "prio" of 5 is
- of higher priority than a "prio" of 10.)
-
-__rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the
-result does not equal the task's current priority, then rt_mutex_setprio
-is called to adjust the priority of the task to the new priority.
-Note that rt_mutex_setprio is defined in kernel/sched/core.c to implement the
-actual change in priority.
-
-It is interesting to note that __rt_mutex_adjust_prio can either increase
+It is interesting to note that rt_mutex_adjust_prio can either increase
or decrease the priority of the task. In the case that a higher priority
-process has just blocked on a mutex owned by the task, __rt_mutex_adjust_prio
+process has just blocked on a mutex owned by the task, rt_mutex_adjust_prio
would increase/boost the task's priority. But if a higher priority task
were for some reason to leave the mutex (timeout or signal), this same function
-would decrease/unboost the priority of the task. That is because the pi_list
+would decrease/unboost the priority of the task. That is because the pi_waiters
always contains the highest priority task that is waiting on a mutex owned
by the task, so we only need to compare the priority of that top pi waiter
to the normal priority of the given task.
@@ -412,9 +381,10 @@ priorities.
rt_mutex_adjust_prio_chain is called with a task to be checked for PI
(de)boosting (the owner of a mutex that a process is blocking on), a flag to
-check for deadlocking, the mutex that the task owns, and a pointer to a waiter
+check for deadlocking, the mutex that the task owns, a pointer to a waiter
that is the process's waiter struct that is blocked on the mutex (although this
-parameter may be NULL for deboosting).
+parameter may be NULL for deboosting), a pointer to the mutex on which the task
+is blocked, and a top_task as the top waiter of the mutex.
For this explanation, I will not mention deadlock detection. This explanation
will try to stay at a high level.
@@ -424,133 +394,14 @@ that the state of the owner and lock can change when entered into this function.
Before this function is called, the task has already had rt_mutex_adjust_prio
performed on it. This means that the task is set to the priority that it
-should be at, but the plist nodes of the task's waiter have not been updated
-with the new priorities, and that this task may not be in the proper locations
-in the pi_lists and wait_lists that the task is blocked on. This function
+should be at, but the rbtree nodes of the task's waiter have not been updated
+with the new priorities, and this task may not be in the proper locations
+in the pi_waiters and waiters trees that the task is blocked on. This function
solves all that.
-A loop is entered, where task is the owner to be checked for PI changes that
-was passed by parameter (for the first iteration). The pi_lock of this task is
-taken to prevent any more changes to the pi_list of the task. This also
-prevents new tasks from completing the blocking on a mutex that is owned by this
-task.
-
-If the task is not blocked on a mutex then the loop is exited. We are at
-the top of the PI chain.
-
-A check is now done to see if the original waiter (the process that is blocked
-on the current mutex) is the top pi waiter of the task. That is, is this
-waiter on the top of the task's pi_list. If it is not, it either means that
-there is another process higher in priority that is blocked on one of the
-mutexes that the task owns, or that the waiter has just woken up via a signal
-or timeout and has left the PI chain. In either case, the loop is exited, since
-we don't need to do any more changes to the priority of the current task, or any
-task that owns a mutex that this current task is waiting on. A priority chain
-walk is only needed when a new top pi waiter is made to a task.
-
-The next check sees if the task's waiter plist node has the priority equal to
-the priority the task is set at. If they are equal, then we are done with
-the loop. Remember that the function started with the priority of the
-task adjusted, but the plist nodes that hold the task in other processes
-pi_lists have not been adjusted.
-
-Next, we look at the mutex that the task is blocked on. The mutex's wait_lock
-is taken. This is done by a spin_trylock, because the locking order of the
-pi_lock and wait_lock goes in the opposite direction. If we fail to grab the
-lock, the pi_lock is released, and we restart the loop.
-
-Now that we have both the pi_lock of the task as well as the wait_lock of
-the mutex the task is blocked on, we update the task's waiter's plist node
-that is located on the mutex's wait_list.
-
-Now we release the pi_lock of the task.
-
-Next the owner of the mutex has its pi_lock taken, so we can update the
-task's entry in the owner's pi_list. If the task is the highest priority
-process on the mutex's wait_list, then we remove the previous top waiter
-from the owner's pi_list, and replace it with the task.
-
-Note: It is possible that the task was the current top waiter on the mutex,
- in which case the task is not yet on the pi_list of the waiter. This
- is OK, since plist_del does nothing if the plist node is not on any
- list.
-
-If the task was not the top waiter of the mutex, but it was before we
-did the priority updates, that means we are deboosting/lowering the
-task. In this case, the task is removed from the pi_list of the owner,
-and the new top waiter is added.
-
-Lastly, we unlock both the pi_lock of the task, as well as the mutex's
-wait_lock, and continue the loop again. On the next iteration of the
-loop, the previous owner of the mutex will be the task that will be
-processed.
-
-Note: One might think that the owner of this mutex might have changed
- since we just grab the mutex's wait_lock. And one could be right.
- The important thing to remember is that the owner could not have
- become the task that is being processed in the PI chain, since
- we have taken that task's pi_lock at the beginning of the loop.
- So as long as there is an owner of this mutex that is not the same
- process as the tasked being worked on, we are OK.
-
- Looking closely at the code, one might be confused. The check for the
- end of the PI chain is when the task isn't blocked on anything or the
- task's waiter structure "task" element is NULL. This check is
- protected only by the task's pi_lock. But the code to unlock the mutex
- sets the task's waiter structure "task" element to NULL with only
- the protection of the mutex's wait_lock, which was not taken yet.
- Isn't this a race condition if the task becomes the new owner?
-
- The answer is No! The trick is the spin_trylock of the mutex's
- wait_lock. If we fail that lock, we release the pi_lock of the
- task and continue the loop, doing the end of PI chain check again.
-
- In the code to release the lock, the wait_lock of the mutex is held
- the entire time, and it is not let go when we grab the pi_lock of the
- new owner of the mutex. So if the switch of a new owner were to happen
- after the check for end of the PI chain and the grabbing of the
- wait_lock, the unlocking code would spin on the new owner's pi_lock
- but never give up the wait_lock. So the PI chain loop is guaranteed to
- fail the spin_trylock on the wait_lock, release the pi_lock, and
- try again.
-
- If you don't quite understand the above, that's OK. You don't have to,
- unless you really want to make a proof out of it ;)
-
-
-Pending Owners and Lock stealing
---------------------------------
-
-One of the flags in the owner field of the mutex structure is "Pending Owner".
-What this means is that an owner was chosen by the process releasing the
-mutex, but that owner has yet to wake up and actually take the mutex.
-
-Why is this important? Why can't we just give the mutex to another process
-and be done with it?
-
-The PI code is to help with real-time processes, and to let the highest
-priority process run as long as possible with little latencies and delays.
-If a high priority process owns a mutex that a lower priority process is
-blocked on, when the mutex is released it would be given to the lower priority
-process. What if the higher priority process wants to take that mutex again.
-The high priority process would fail to take that mutex that it just gave up
-and it would need to boost the lower priority process to run with full
-latency of that critical section (since the low priority process just entered
-it).
-
-There's no reason a high priority process that gives up a mutex should be
-penalized if it tries to take that mutex again. If the new owner of the
-mutex has not woken up yet, there's no reason that the higher priority process
-could not take that mutex away.
-
-To solve this, we introduced Pending Ownership and Lock Stealing. When a
-new process is given a mutex that it was blocked on, it is only given
-pending ownership. This means that it's the new owner, unless a higher
-priority process comes in and tries to grab that mutex. If a higher priority
-process does come along and wants that mutex, we let the higher priority
-process "steal" the mutex from the pending owner (only if it is still pending)
-and continue with the mutex.
-
+The main operation of this function is summarized by Thomas Gleixner in
+rtmutex.c. See the 'Chain walk basics and protection scope' comment for further
+details.
Taking of a mutex (The walk through)
------------------------------------
@@ -563,14 +414,14 @@ done when we have CMPXCHG enabled (otherwise the fast taking automatically
fails). Only when the owner field of the mutex is NULL can the lock be
taken with the CMPXCHG and nothing else needs to be done.
-If there is contention on the lock, whether it is owned or pending owner
-we go about the slow path (rt_mutex_slowlock).
+If there is contention on the lock, we go about the slow path
+(rt_mutex_slowlock).
The slow path function is where the task's waiter structure is created on
the stack. This is because the waiter structure is only needed for the
scope of this function. The waiter structure holds the nodes to store
-the task on the wait_list of the mutex, and if need be, the pi_list of
-the owner.
+the task on the waiters tree of the mutex, and if need be, the pi_waiters
+tree of the owner.
The wait_lock of the mutex is taken since the slow path of unlocking the
mutex also takes this lock.
@@ -581,102 +432,45 @@ contention).
try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
slow path. The first thing that is done here is an atomic setting of
-the "Has Waiters" flag of the mutex's owner field. Yes, this could really
-be false, because if the mutex has no owner, there are no waiters and
-the current task also won't have any waiters. But we don't have the lock
-yet, so we assume we are going to be a waiter. The reason for this is to
-play nice for those architectures that do have CMPXCHG. By setting this flag
-now, the owner of the mutex can't release the mutex without going into the
-slow unlock path, and it would then need to grab the wait_lock, which this
-code currently holds. So setting the "Has Waiters" flag forces the owner
-to synchronize with this code.
-
-Now that we know that we can't have any races with the owner releasing the
-mutex, we check to see if we can take the ownership. This is done if the
-mutex doesn't have a owner, or if we can steal the mutex from a pending
-owner. Let's look at the situations we have here.
-
- 1) Has owner that is pending
- ----------------------------
-
- The mutex has a owner, but it hasn't woken up and the mutex flag
- "Pending Owner" is set. The first check is to see if the owner isn't the
- current task. This is because this function is also used for the pending
- owner to grab the mutex. When a pending owner wakes up, it checks to see
- if it can take the mutex, and this is done if the owner is already set to
- itself. If so, we succeed and leave the function, clearing the "Pending
- Owner" bit.
-
- If the pending owner is not current, we check to see if the current priority is
- higher than the pending owner. If not, we fail the function and return.
-
- There's also something special about a pending owner. That is a pending owner
- is never blocked on a mutex. So there is no PI chain to worry about. It also
- means that if the mutex doesn't have any waiters, there's no accounting needed
- to update the pending owner's pi_list, since we only worry about processes
- blocked on the current mutex.
-
- If there are waiters on this mutex, and we just stole the ownership, we need
- to take the top waiter, remove it from the pi_list of the pending owner, and
- add it to the current pi_list. Note that at this moment, the pending owner
- is no longer on the list of waiters. This is fine, since the pending owner
- would add itself back when it realizes that it had the ownership stolen
- from itself. When the pending owner tries to grab the mutex, it will fail
- in try_to_take_rt_mutex if the owner field points to another process.
-
- 2) No owner
- -----------
-
- If there is no owner (or we successfully stole the lock), we set the owner
- of the mutex to current, and set the flag of "Has Waiters" if the current
- mutex actually has waiters, or we clear the flag if it doesn't. See, it was
- OK that we set that flag early, since now it is cleared.
-
- 3) Failed to grab ownership
- ---------------------------
-
- The most interesting case is when we fail to take ownership. This means that
- there exists an owner, or there's a pending owner with equal or higher
- priority than the current task.
-
-We'll continue on the failed case.
-
-If the mutex has a timeout, we set up a timer to go off to break us out
-of this mutex if we failed to get it after a specified amount of time.
-
-Now we enter a loop that will continue to try to take ownership of the mutex, or
-fail from a timeout or signal.
-
-Once again we try to take the mutex. This will usually fail the first time
-in the loop, since it had just failed to get the mutex. But the second time
-in the loop, this would likely succeed, since the task would likely be
-the pending owner.
-
-If the mutex is TASK_INTERRUPTIBLE a check for signals and timeout is done
-here.
-
-The waiter structure has a "task" field that points to the task that is blocked
-on the mutex. This field can be NULL the first time it goes through the loop
-or if the task is a pending owner and had its mutex stolen. If the "task"
-field is NULL then we need to set up the accounting for it.
+the "Has Waiters" flag of the mutex's owner field. By setting this flag
+now, the current owner of the mutex being contended for can't release the mutex
+without going into the slow unlock path, and it would then need to grab the
+wait_lock, which this code currently holds. So setting the "Has Waiters" flag
+forces the current owner to synchronize with this code.
+
+The lock is taken if the following are true:
+ 1) The lock has no owner
+ 2) The current task is the highest priority against all other
+ waiters of the lock
+
+If the task succeeds to acquire the lock, then the task is set as the
+owner of the lock, and if the lock still has waiters, the top_waiter
+(highest priority task waiting on the lock) is added to this task's
+pi_waiters tree.
+
+If the lock is not taken by try_to_take_rt_mutex(), then the
+task_blocks_on_rt_mutex() function is called. This will add the task to
+the lock's waiter tree and propagate the pi chain of the lock as well
+as the lock's owner's pi_waiters tree. This is described in the next
+section.
Task blocks on mutex
--------------------
The accounting of a mutex and process is done with the waiter structure of
the process. The "task" field is set to the process, and the "lock" field
-to the mutex. The plist nodes are initialized to the processes current
-priority.
+to the mutex. The rbtree node of waiter are initialized to the processes
+current priority.
Since the wait_lock was taken at the entry of the slow lock, we can safely
-add the waiter to the wait_list. If the current process is the highest
-priority process currently waiting on this mutex, then we remove the
-previous top waiter process (if it exists) from the pi_list of the owner,
-and add the current process to that list. Since the pi_list of the owner
+add the waiter to the task waiter tree. If the current process is the
+highest priority process currently waiting on this mutex, then we remove the
+previous top waiter process (if it exists) from the pi_waiters of the owner,
+and add the current process to that tree. Since the pi_waiter of the owner
has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
should adjust its priority accordingly.
-If the owner is also blocked on a lock, and had its pi_list changed
+If the owner is also blocked on a lock, and had its pi_waiters changed
(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
@@ -686,30 +480,23 @@ mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
Waking up in the loop
---------------------
-The schedule can then wake up for a few reasons.
- 1) we were given pending ownership of the mutex.
- 2) we received a signal and was TASK_INTERRUPTIBLE
- 3) we had a timeout and was TASK_INTERRUPTIBLE
+The task can then wake up for a couple of reasons:
+ 1) The previous lock owner released the lock, and the task now is top_waiter
+ 2) we received a signal or timeout
-In any of these cases, we continue the loop and once again try to grab the
-ownership of the mutex. If we succeed, we exit the loop, otherwise we continue
-and on signal and timeout, will exit the loop, or if we had the mutex stolen
-we just simply add ourselves back on the lists and go back to sleep.
+In both cases, the task will try again to acquire the lock. If it
+does, then it will take itself off the waiters tree and set itself back
+to the TASK_RUNNING state.
-Note: For various reasons, because of timeout and signals, the steal mutex
- algorithm needs to be careful. This is because the current process is
- still on the wait_list. And because of dynamic changing of priorities,
- especially on SCHED_OTHER tasks, the current process can be the
- highest priority task on the wait_list.
-
-Failed to get mutex on Timeout or Signal
-----------------------------------------
+In first case, if the lock was acquired by another task before this task
+could get the lock, then it will go back to sleep and wait to be woken again.
-If a timeout or signal occurred, the waiter's "task" field would not be
-NULL and the task needs to be taken off the wait_list of the mutex and perhaps
-pi_list of the owner. If this process was a high priority process, then
-the rt_mutex_adjust_prio_chain needs to be executed again on the owner,
-but this time it will be lowering the priorities.
+The second case is only applicable for tasks that are grabbing a mutex
+that can wake up before getting the lock, either due to a signal or
+a timeout (i.e. rt_mutex_timed_futex_lock()). When woken, it will try to
+take the lock again, if it succeeds, then the task will return with the
+lock held, otherwise it will return with -EINTR if the task was woken
+by a signal, or -ETIMEDOUT if it timed out.
Unlocking the Mutex
@@ -739,25 +526,12 @@ owner still needs to make this check. If there are no waiters then the mutex
owner field is set to NULL, the wait_lock is released and nothing more is
needed.
-If there are waiters, then we need to wake one up and give that waiter
-pending ownership.
+If there are waiters, then we need to wake one up.
On the wake up code, the pi_lock of the current owner is taken. The top
-waiter of the lock is found and removed from the wait_list of the mutex
-as well as the pi_list of the current owner. The task field of the new
-pending owner's waiter structure is set to NULL, and the owner field of the
-mutex is set to the new owner with the "Pending Owner" bit set, as well
-as the "Has Waiters" bit if there still are other processes blocked on the
-mutex.
-
-The pi_lock of the previous owner is released, and the new pending owner's
-pi_lock is taken. Remember that this is the trick to prevent the race
-condition in rt_mutex_adjust_prio_chain from adding itself as a waiter
-on the mutex.
-
-We now clear the "pi_blocked_on" field of the new pending owner, and if
-the mutex still has waiters pending, we add the new top waiter to the pi_list
-of the pending owner.
+waiter of the lock is found and removed from the waiters tree of the mutex
+as well as the pi_waiters tree of the current owner. The "Has Waiters" bit is
+marked to prevent lower priority tasks from stealing the lock.
Finally we unlock the pi_lock of the pending owner and wake it up.
@@ -772,10 +546,14 @@ Credits
-------
Author: Steven Rostedt <rostedt@goodmis.org>
+Updated: Alex Shi <alex.shi@linaro.org> - 7/6/2017
-Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and Randy Dunlap
+Original Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and
+ Randy Dunlap
+Update (7/6/2017) Reviewers: Steven Rostedt and Sebastian Siewior
Updates
-------
This document was originally written for 2.6.17-rc3-mm1
+was updated on 4.12
diff --git a/Documentation/locking/rt-mutex.txt b/Documentation/locking/rt-mutex.txt
index 243393d882ee..35793e003041 100644
--- a/Documentation/locking/rt-mutex.txt
+++ b/Documentation/locking/rt-mutex.txt
@@ -28,14 +28,13 @@ magic bullet for poorly designed applications, but it allows
well-designed applications to use userspace locks in critical parts of
an high priority thread, without losing determinism.
-The enqueueing of the waiters into the rtmutex waiter list is done in
+The enqueueing of the waiters into the rtmutex waiter tree is done in
priority order. For same priorities FIFO order is chosen. For each
rtmutex, only the top priority waiter is enqueued into the owner's
-priority waiters list. This list too queues in priority order. Whenever
+priority waiters tree. This tree too queues in priority order. Whenever
the top priority waiter of a task changes (for example it timed out or
-got a signal), the priority of the owner task is readjusted. [The
-priority enqueueing is handled by "plists", see include/linux/plist.h
-for more details.]
+got a signal), the priority of the owner task is readjusted. The
+priority enqueueing is handled by "pi_waiters".
RT-mutexes are optimized for fastpath operations and have no internal
locking overhead when locking an uncontended mutex or unlocking a mutex
@@ -46,34 +45,29 @@ is used]
The state of the rt-mutex is tracked via the owner field of the rt-mutex
structure:
-rt_mutex->owner holds the task_struct pointer of the owner. Bit 0 and 1
-are used to keep track of the "owner is pending" and "rtmutex has
-waiters" state.
+lock->owner holds the task_struct pointer of the owner. Bit 0 is used to
+keep track of the "lock has waiters" state.
- owner bit1 bit0
- NULL 0 0 mutex is free (fast acquire possible)
- NULL 0 1 invalid state
- NULL 1 0 Transitional state*
- NULL 1 1 invalid state
- taskpointer 0 0 mutex is held (fast release possible)
- taskpointer 0 1 task is pending owner
- taskpointer 1 0 mutex is held and has waiters
- taskpointer 1 1 task is pending owner and mutex has waiters
+ owner bit0
+ NULL 0 lock is free (fast acquire possible)
+ NULL 1 lock is free and has waiters and the top waiter
+ is going to take the lock*
+ taskpointer 0 lock is held (fast release possible)
+ taskpointer 1 lock is held and has waiters**
-Pending-ownership handling is a performance optimization:
-pending-ownership is assigned to the first (highest priority) waiter of
-the mutex, when the mutex is released. The thread is woken up and once
-it starts executing it can acquire the mutex. Until the mutex is taken
-by it (bit 0 is cleared) a competing higher priority thread can "steal"
-the mutex which puts the woken up thread back on the waiters list.
+The fast atomic compare exchange based acquire and release is only
+possible when bit 0 of lock->owner is 0.
-The pending-ownership optimization is especially important for the
-uninterrupted workflow of high-prio tasks which repeatedly
-takes/releases locks that have lower-prio waiters. Without this
-optimization the higher-prio thread would ping-pong to the lower-prio
-task [because at unlock time we always assign a new owner].
+(*) It also can be a transitional state when grabbing the lock
+with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+we need to set the bit0 before looking at the lock, and the owner may be
+NULL in this small time, hence this can be a transitional state.
-(*) The "mutex has waiters" bit gets set to take the lock. If the lock
-doesn't already have an owner, this bit is quickly cleared if there are
-no waiters. So this is a transitional state to synchronize with looking
-at the owner field of the mutex and the mutex owner releasing the lock.
+(**) There is a small time when bit 0 is set but there are no
+waiters. This can happen when grabbing the lock in the slow path.
+To prevent a cmpxchg of the owner releasing the lock, we need to
+set this bit before looking at the lock.
+
+BTW, there is still technically a "Pending Owner", it's just not called
+that anymore. The pending owner happens to be the top_waiter of a lock
+that has no owner and has been woken up to grab the lock.
diff --git a/Documentation/media/uapi/cec/cec-funcs.rst b/Documentation/media/uapi/cec/cec-funcs.rst
index 5b7630f2e076..6d696cead5cb 100644
--- a/Documentation/media/uapi/cec/cec-funcs.rst
+++ b/Documentation/media/uapi/cec/cec-funcs.rst
@@ -7,7 +7,6 @@ Function Reference
.. toctree::
:maxdepth: 1
- :numbered:
cec-func-open
cec-func-close
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index c4ddfcd5ee32..b759a60624fd 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -498,11 +498,11 @@ And a couple of implicit varieties:
This means that ACQUIRE acts as a minimal "acquire" operation and
RELEASE acts as a minimal "release" operation.
-A subset of the atomic operations described in core-api/atomic_ops.rst have
-ACQUIRE and RELEASE variants in addition to fully-ordered and relaxed (no
-barrier semantics) definitions. For compound atomics performing both a load
-and a store, ACQUIRE semantics apply only to the load and RELEASE semantics
-apply only to the store portion of the operation.
+A subset of the atomic operations described in atomic_t.txt have ACQUIRE and
+RELEASE variants in addition to fully-ordered and relaxed (no barrier
+semantics) definitions. For compound atomics performing both a load and a
+store, ACQUIRE semantics apply only to the load and RELEASE semantics apply
+only to the store portion of the operation.
Memory barriers are only required where there's a possibility of interaction
between two CPUs or between a CPU and a device. If it can be guaranteed that
@@ -594,7 +594,24 @@ between the address load and the data load:
This enforces the occurrence of one of the two implications, and prevents the
third possibility from arising.
-A data-dependency barrier must also order against dependent writes:
+
+[!] Note that this extremely counterintuitive situation arises most easily on
+machines with split caches, so that, for example, one cache bank processes
+even-numbered cache lines and the other bank processes odd-numbered cache
+lines. The pointer P might be stored in an odd-numbered cache line, and the
+variable B might be stored in an even-numbered cache line. Then, if the
+even-numbered bank of the reading CPU's cache is extremely busy while the
+odd-numbered bank is idle, one can see the new value of the pointer P (&B),
+but the old value of the variable B (2).
+
+
+A data-dependency barrier is not required to order dependent writes
+because the CPUs that the Linux kernel supports don't do writes
+until they are certain (1) that the write will actually happen, (2)
+of the location of the write, and (3) of the value to be written.
+But please carefully read the "CONTROL DEPENDENCIES" section and the
+Documentation/RCU/rcu_dereference.txt file: The compiler can and does
+break dependencies in a great many highly creative ways.
CPU 1 CPU 2
=============== ===============
@@ -603,29 +620,19 @@ A data-dependency barrier must also order against dependent writes:
<write barrier>
WRITE_ONCE(P, &B);
Q = READ_ONCE(P);
- <data dependency barrier>
- *Q = 5;
+ WRITE_ONCE(*Q, 5);
-The data-dependency barrier must order the read into Q with the store
-into *Q. This prohibits this outcome:
+Therefore, no data-dependency barrier is required to order the read into
+Q with the store into *Q. In other words, this outcome is prohibited,
+even without a data-dependency barrier:
(Q == &B) && (B == 4)
Please note that this pattern should be rare. After all, the whole point
of dependency ordering is to -prevent- writes to the data structure, along
with the expensive cache misses associated with those writes. This pattern
-can be used to record rare error conditions and the like, and the ordering
-prevents such records from being lost.
-
-
-[!] Note that this extremely counterintuitive situation arises most easily on
-machines with split caches, so that, for example, one cache bank processes
-even-numbered cache lines and the other bank processes odd-numbered cache
-lines. The pointer P might be stored in an odd-numbered cache line, and the
-variable B might be stored in an even-numbered cache line. Then, if the
-even-numbered bank of the reading CPU's cache is extremely busy while the
-odd-numbered bank is idle, one can see the new value of the pointer P (&B),
-but the old value of the variable B (2).
+can be used to record rare error conditions and the like, and the CPUs'
+naturally occurring ordering prevents such records from being lost.
The data dependency barrier is very important to the RCU system,
@@ -1876,8 +1883,7 @@ There are some more advanced barrier functions:
This makes sure that the death mark on the object is perceived to be set
*before* the reference counter is decremented.
- See Documentation/core-api/atomic_ops.rst for more information. See the
- "Atomic operations" subsection for information on where to use these.
+ See Documentation/atomic_{t,bitops}.txt for more information.
(*) lockless_dereference();
@@ -1982,10 +1988,7 @@ for each construct. These operations all imply certain barriers:
ACQUIRE operation has completed.
Memory operations issued before the ACQUIRE may be completed after
- the ACQUIRE operation has completed. An smp_mb__before_spinlock(),
- combined with a following ACQUIRE, orders prior stores against
- subsequent loads and stores. Note that this is weaker than smp_mb()!
- The smp_mb__before_spinlock() primitive is free on many architectures.
+ the ACQUIRE operation has completed.
(2) RELEASE operation implication:
@@ -2503,88 +2506,7 @@ operations are noted specially as some of them imply full memory barriers and
some don't, but they're very heavily relied on as a group throughout the
kernel.
-Any atomic operation that modifies some state in memory and returns information
-about the state (old or new) implies an SMP-conditional general memory barrier
-(smp_mb()) on each side of the actual operation (with the exception of
-explicit lock operations, described later). These include:
-
- xchg();
- atomic_xchg(); atomic_long_xchg();
- atomic_inc_return(); atomic_long_inc_return();
- atomic_dec_return(); atomic_long_dec_return();
- atomic_add_return(); atomic_long_add_return();
- atomic_sub_return(); atomic_long_sub_return();
- atomic_inc_and_test(); atomic_long_inc_and_test();
- atomic_dec_and_test(); atomic_long_dec_and_test();
- atomic_sub_and_test(); atomic_long_sub_and_test();
- atomic_add_negative(); atomic_long_add_negative();
- test_and_set_bit();
- test_and_clear_bit();
- test_and_change_bit();
-
- /* when succeeds */
- cmpxchg();
- atomic_cmpxchg(); atomic_long_cmpxchg();
- atomic_add_unless(); atomic_long_add_unless();
-
-These are used for such things as implementing ACQUIRE-class and RELEASE-class
-operations and adjusting reference counters towards object destruction, and as
-such the implicit memory barrier effects are necessary.
-
-
-The following operations are potential problems as they do _not_ imply memory
-barriers, but might be used for implementing such things as RELEASE-class
-operations:
-
- atomic_set();
- set_bit();
- clear_bit();
- change_bit();
-
-With these the appropriate explicit memory barrier should be used if necessary
-(smp_mb__before_atomic() for instance).
-
-
-The following also do _not_ imply memory barriers, and so may require explicit
-memory barriers under some circumstances (smp_mb__before_atomic() for
-instance):
-
- atomic_add();
- atomic_sub();
- atomic_inc();
- atomic_dec();
-
-If they're used for statistics generation, then they probably don't need memory
-barriers, unless there's a coupling between statistical data.
-
-If they're used for reference counting on an object to control its lifetime,
-they probably don't need memory barriers because either the reference count
-will be adjusted inside a locked section, or the caller will already hold
-sufficient references to make the lock, and thus a memory barrier unnecessary.
-
-If they're used for constructing a lock of some description, then they probably
-do need memory barriers as a lock primitive generally has to do things in a
-specific order.
-
-Basically, each usage case has to be carefully considered as to whether memory
-barriers are needed or not.
-
-The following operations are special locking primitives:
-
- test_and_set_bit_lock();
- clear_bit_unlock();
- __clear_bit_unlock();
-
-These implement ACQUIRE-class and RELEASE-class operations. These should be
-used in preference to other operations when implementing locking primitives,
-because their implementations can be optimised on many architectures.
-
-[!] Note that special memory barrier primitives are available for these
-situations because on some CPUs the atomic instructions used imply full memory
-barriers, and so barrier instructions are superfluous in conjunction with them,
-and in such cases the special barrier primitives will be no-ops.
-
-See Documentation/core-api/atomic_ops.rst for more information.
+See Documentation/atomic_t.txt for more information.
ACCESSING DEVICES
diff --git a/Documentation/networking/ieee802154.txt b/Documentation/networking/ieee802154.txt
index c4114346f054..057e9fdbfac9 100644
--- a/Documentation/networking/ieee802154.txt
+++ b/Documentation/networking/ieee802154.txt
@@ -84,17 +84,17 @@ Device drivers API
==================
The include/net/mac802154.h defines following functions:
- - struct ieee802154_dev *ieee802154_alloc_device
- (size_t priv_size, struct ieee802154_ops *ops):
- allocation of IEEE 802.15.4 compatible device
+ - struct ieee802154_hw *
+ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops):
+ allocation of IEEE 802.15.4 compatible hardware device
- - void ieee802154_free_device(struct ieee802154_dev *dev):
- freeing allocated device
+ - void ieee802154_free_hw(struct ieee802154_hw *hw):
+ freeing allocated hardware device
- - int ieee802154_register_device(struct ieee802154_dev *dev):
- register PHY in the system
+ - int ieee802154_register_hw(struct ieee802154_hw *hw):
+ register PHY which is the allocated hardware device, in the system
- - void ieee802154_unregister_device(struct ieee802154_dev *dev):
+ - void ieee802154_unregister_hw(struct ieee802154_hw *hw):
freeing registered PHY
Moreover IEEE 802.15.4 device operations structure should be filled.
diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt
index 3e7b946dea27..5e40e1f68873 100644
--- a/Documentation/networking/switchdev.txt
+++ b/Documentation/networking/switchdev.txt
@@ -228,7 +228,7 @@ Learning on the device port should be enabled, as well as learning_sync:
bridge link set dev DEV learning on self
bridge link set dev DEV learning_sync on self
-Learning_sync attribute enables syncing of the learned/forgotton FDB entry to
+Learning_sync attribute enables syncing of the learned/forgotten FDB entry to
the bridge's FDB. It's possible, but not optimal, to enable learning on the
device port and on the bridge port, and disable learning_sync.
@@ -245,7 +245,7 @@ the responsibility of the port driver/device to age out these entries. If the
port device supports ageing, when the FDB entry expires, it will notify the
driver which in turn will notify the bridge with SWITCHDEV_FDB_DEL. If the
device does not support ageing, the driver can simulate ageing using a
-garbage collection timer to monitor FBD entries. Expired entries will be
+garbage collection timer to monitor FDB entries. Expired entries will be
notified to the bridge using SWITCHDEV_FDB_DEL. See rocker driver for
example of driver running ageing timer.
diff --git a/Documentation/nvmem/nvmem.txt b/Documentation/nvmem/nvmem.txt
index dbd40d879239..8d8d8f58f96f 100644
--- a/Documentation/nvmem/nvmem.txt
+++ b/Documentation/nvmem/nvmem.txt
@@ -112,7 +112,7 @@ take nvmem_device as parameter.
5. Releasing a reference to the NVMEM
=====================================
-When a consumers no longer needs the NVMEM, it has to release the reference
+When a consumer no longer needs the NVMEM, it has to release the reference
to the NVMEM it has obtained using the APIs mentioned in the above section.
The NVMEM framework provides 2 APIs to release a reference to the NVMEM.
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 65ea5915178b..361789df51ec 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -58,20 +58,33 @@ Symbols/Function Pointers
%ps versatile_init
%pB prev_fn_of_versatile_init+0x88/0x88
-For printing symbols and function pointers. The ``S`` and ``s`` specifiers
-result in the symbol name with (``S``) or without (``s``) offsets. Where
-this is used on a kernel without KALLSYMS - the symbol address is
-printed instead.
+The ``F`` and ``f`` specifiers are for printing function pointers,
+for example, f->func, &gettimeofday. They have the same result as
+``S`` and ``s`` specifiers. But they do an extra conversion on
+ia64, ppc64 and parisc64 architectures where the function pointers
+are actually function descriptors.
+
+The ``S`` and ``s`` specifiers can be used for printing symbols
+from direct addresses, for example, __builtin_return_address(0),
+(void *)regs->ip. They result in the symbol name with (``S``) or
+without (``s``) offsets. If KALLSYMS are disabled then the symbol
+address is printed instead.
The ``B`` specifier results in the symbol name with offsets and should be
used when printing stack backtraces. The specifier takes into
consideration the effect of compiler optimisations which may occur
when tail-call``s are used and marked with the noreturn GCC attribute.
-On ia64, ppc64 and parisc64 architectures function pointers are
-actually function descriptors which must first be resolved. The ``F`` and
-``f`` specifiers perform this resolution and then provide the same
-functionality as the ``S`` and ``s`` specifiers.
+Examples::
+
+ printk("Going to call: %pF\n", gettimeofday);
+ printk("Going to call: %pF\n", p->func);
+ printk("%s: called from %pS\n", __func__, (void *)_RET_IP_);
+ printk("%s: called from %pS\n", __func__,
+ (void *)__builtin_return_address(0));
+ printk("Faulted at %pS\n", (void *)regs->ip);
+ printk(" %s%pB\n", (reliable ? "" : "? "), (void *)*stack);
+
Kernel Pointers
===============
diff --git a/Documentation/process/applying-patches.rst b/Documentation/process/applying-patches.rst
index a0d058cc6d25..dc2ddc345044 100644
--- a/Documentation/process/applying-patches.rst
+++ b/Documentation/process/applying-patches.rst
@@ -6,9 +6,6 @@ Applying Patches To The Linux Kernel
Original by:
Jesper Juhl, August 2005
-Last update:
- 2016-09-14
-
.. note::
This document is obsolete. In most cases, rather than using ``patch``
@@ -344,7 +341,7 @@ possible.
This is a good branch to run for people who want to help out testing
development kernels but do not want to run some of the really experimental
-stuff (such people should see the sections about -git and -mm kernels below).
+stuff (such people should see the sections about -next and -mm kernels below).
The -rc patches are not incremental, they apply to a base 4.x kernel, just
like the 4.x.y patches described above. The kernel version before the -rcN
@@ -380,44 +377,6 @@ Here are 3 examples of how to apply these patches::
$ mv linux-4.7.3 linux-4.8-rc5 # rename the kernel source dir
-The -git kernels
-================
-
-These are daily snapshots of Linus' kernel tree (managed in a git
-repository, hence the name).
-
-These patches are usually released daily and represent the current state of
-Linus's tree. They are more experimental than -rc kernels since they are
-generated automatically without even a cursory glance to see if they are
-sane.
-
--git patches are not incremental and apply either to a base 4.x kernel or
-a base 4.x-rc kernel -- you can see which from their name.
-A patch named 4.7-git1 applies to the 4.7 kernel source and a patch
-named 4.8-rc3-git2 applies to the source of the 4.8-rc3 kernel.
-
-Here are some examples of how to apply these patches::
-
- # moving from 4.7 to 4.7-git1
-
- $ cd ~/linux-4.7 # change to the kernel source dir
- $ patch -p1 < ../patch-4.7-git1 # apply the 4.7-git1 patch
- $ cd ..
- $ mv linux-4.7 linux-4.7-git1 # rename the kernel source dir
-
- # moving from 4.7-git1 to 4.8-rc2-git3
-
- $ cd ~/linux-4.7-git1 # change to the kernel source dir
- $ patch -p1 -R < ../patch-4.7-git1 # revert the 4.7-git1 patch
- # we now have a 4.7 kernel
- $ patch -p1 < ../patch-4.8-rc2 # apply the 4.8-rc2 patch
- # the kernel is now 4.8-rc2
- $ patch -p1 < ../patch-4.8-rc2-git3 # apply the 4.8-rc2-git3 patch
- # the kernel is now 4.8-rc2-git3
- $ cd ..
- $ mv linux-4.7-git1 linux-4.8-rc2-git3 # rename source dir
-
-
The -mm patches and the linux-next tree
=======================================
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index adbb50ae5246..560beaef5a7c 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -53,7 +53,7 @@ mcelog 0.6 mcelog --version
iptables 1.4.2 iptables -V
openssl & libcrypto 1.0.0 openssl version
bc 1.06.95 bc --version
-Sphinx\ [#f1]_ 1.2 sphinx-build --version
+Sphinx\ [#f1]_ 1.3 sphinx-build --version
====================== =============== ========================================
.. [#f1] Sphinx is needed only to build the Kernel documentation
@@ -309,18 +309,8 @@ Kernel documentation
Sphinx
------
-The ReST markups currently used by the Documentation/ files are meant to be
-built with ``Sphinx`` version 1.2 or upper. If you're desiring to build
-PDF outputs, it is recommended to use version 1.4.6.
-
-.. note::
-
- Please notice that, for PDF and LaTeX output, you'll also need ``XeLaTeX``
- version 3.14159265. Depending on the distribution, you may also need to
- install a series of ``texlive`` packages that provide the minimal set of
- functionalities required for ``XeLaTex`` to work. For PDF output you'll also
- need ``convert(1)`` from ImageMagick (https://www.imagemagick.org).
-
+Please see :ref:`sphinx_install` in ``Documentation/doc-guide/sphinx.rst``
+for details about Sphinx requirements.
Getting updated software
========================
diff --git a/Documentation/process/stable-kernel-rules.rst b/Documentation/process/stable-kernel-rules.rst
index 61e9c78bd6d1..36a2dded525b 100644
--- a/Documentation/process/stable-kernel-rules.rst
+++ b/Documentation/process/stable-kernel-rules.rst
@@ -166,12 +166,12 @@ Trees
- The queues of patches, for both completed versions and in progress
versions can be found at:
- http://git.kernel.org/?p=linux/kernel/git/stable/stable-queue.git
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/stable-queue.git
- The finalized and tagged releases of all stable kernels can be found
in separate branches per version at:
- http://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git
+ https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git
Review committee
diff --git a/Documentation/process/submitting-patches.rst b/Documentation/process/submitting-patches.rst
index 3e10719fee35..733478ade91b 100644
--- a/Documentation/process/submitting-patches.rst
+++ b/Documentation/process/submitting-patches.rst
@@ -413,7 +413,7 @@ e-mail discussions.
-11) Sign your work — the Developer's Certificate of Origin
+11) Sign your work - the Developer's Certificate of Origin
----------------------------------------------------------
To improve tracking of who did what, especially with patches that can
diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index 1648fa80b3bf..1266eeae45f6 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -16,17 +16,7 @@ The key service can be configured on by enabling:
This document has the following sections:
- - Key overview
- - Key service overview
- - Key access permissions
- - SELinux support
- - New procfs files
- - Userspace system call interface
- - Kernel services
- - Notes on accessing payload contents
- - Defining a key type
- - Request-key callback service
- - Garbage collection
+.. contents:: :local:
Key Overview
@@ -443,7 +433,7 @@ The main syscalls are:
/sbin/request-key will be invoked in an attempt to obtain a key. The
callout_info string will be passed as an argument to the program.
- See also Documentation/security/keys-request-key.txt.
+ See also Documentation/security/keys/request-key.rst.
The keyctl syscall functions are:
@@ -973,7 +963,7 @@ payload contents" for more information.
If successful, the key will have been attached to the default keyring for
implicitly obtained request-key keys, as set by KEYCTL_SET_REQKEY_KEYRING.
- See also Documentation/security/keys-request-key.txt.
+ See also Documentation/security/keys/request-key.rst.
* To search for a key, passing auxiliary data to the upcaller, call::
diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst
index aba32784174c..b2d16abaa9e9 100644
--- a/Documentation/security/keys/request-key.rst
+++ b/Documentation/security/keys/request-key.rst
@@ -3,7 +3,7 @@ Key Request Service
===================
The key request service is part of the key retention service (refer to
-Documentation/security/keys.txt). This document explains more fully how
+Documentation/security/core.rst). This document explains more fully how
the requesting algorithm works.
The process starts by either the kernel requesting a service by calling
diff --git a/Documentation/security/keys/trusted-encrypted.rst b/Documentation/security/keys/trusted-encrypted.rst
index 7b503831bdea..3bb24e09a332 100644
--- a/Documentation/security/keys/trusted-encrypted.rst
+++ b/Documentation/security/keys/trusted-encrypted.rst
@@ -172,4 +172,4 @@ Other uses for trusted and encrypted keys, such as for disk and file encryption
are anticipated. In particular the new format 'ecryptfs' has been defined in
in order to use encrypted keys to mount an eCryptfs filesystem. More details
about the usage can be found in the file
-``Documentation/security/keys-ecryptfs.txt``.
+``Documentation/security/keys/ecryptfs.rst``.
diff --git a/Documentation/sphinx-static/theme_overrides.css b/Documentation/sphinx-static/theme_overrides.css
index d5764a4de5a2..522b6d4c49d4 100644
--- a/Documentation/sphinx-static/theme_overrides.css
+++ b/Documentation/sphinx-static/theme_overrides.css
@@ -4,6 +4,17 @@
*
*/
+/* Interim: Code-blocks with line nos - lines and line numbers don't line up.
+ * see: https://github.com/rtfd/sphinx_rtd_theme/issues/419
+ */
+
+div[class^="highlight"] pre {
+ line-height: normal;
+}
+.rst-content .highlight > pre {
+ line-height: normal;
+}
+
@media screen {
/* content column
@@ -56,6 +67,12 @@
font-family: "Courier New", Courier, monospace
}
+ /* fix bottom margin of lists items */
+
+ .rst-content .section ul li:last-child, .rst-content .section ul li p:last-child {
+ margin-bottom: 12px;
+ }
+
/* inline literal: drop the borderbox, padding and red color */
code, .rst-content tt, .rst-content code {
diff --git a/Documentation/sphinx/kerneldoc.py b/Documentation/sphinx/kerneldoc.py
index d15e07f36881..39aa9e8697cc 100644
--- a/Documentation/sphinx/kerneldoc.py
+++ b/Documentation/sphinx/kerneldoc.py
@@ -27,6 +27,7 @@
# Please make sure this works on both python2 and python3.
#
+import codecs
import os
import subprocess
import sys
@@ -88,13 +89,10 @@ class KernelDocDirective(Directive):
try:
env.app.verbose('calling kernel-doc \'%s\'' % (" ".join(cmd)))
- p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
+ p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
- # python2 needs conversion to unicode.
- # python3 with universal_newlines=True returns strings.
- if sys.version_info.major < 3:
- out, err = unicode(out, 'utf-8'), unicode(err, 'utf-8')
+ out, err = codecs.decode(out, 'utf-8'), codecs.decode(err, 'utf-8')
if p.returncode != 0:
sys.stderr.write(err)
diff --git a/Documentation/sphinx/requirements.txt b/Documentation/sphinx/requirements.txt
new file mode 100644
index 000000000000..742be3e12619
--- /dev/null
+++ b/Documentation/sphinx/requirements.txt
@@ -0,0 +1,3 @@
+docutils==0.12
+Sphinx==1.4.9
+sphinx_rtd_theme
diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt
index b83dfa1c0602..ab16efe0c79d 100644
--- a/Documentation/static-keys.txt
+++ b/Documentation/static-keys.txt
@@ -149,6 +149,26 @@ static_branch_inc(), will change the branch back to true. Likewise, if the
key is initialized false, a 'static_branch_inc()', will change the branch to
true. And then a 'static_branch_dec()', will again make the branch false.
+The state and the reference count can be retrieved with 'static_key_enabled()'
+and 'static_key_count()'. In general, if you use these functions, they
+should be protected with the same mutex used around the enable/disable
+or increment/decrement function.
+
+Note that switching branches results in some locks being taken,
+particularly the CPU hotplug lock (in order to avoid races against
+CPUs being brought in the kernel whilst the kernel is getting
+patched). Calling the static key API from within a hotplug notifier is
+thus a sure deadlock recipe. In order to still allow use of the
+functionnality, the following functions are provided:
+
+ static_key_enable_cpuslocked()
+ static_key_disable_cpuslocked()
+ static_branch_enable_cpuslocked()
+ static_branch_disable_cpuslocked()
+
+These functions are *not* general purpose, and must only be used when
+you really know that you're in the above context, and no other.
+
Where an array of keys is required, it can be defined as::
DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count);
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index bac23c198360..ce61d1fe08ca 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -61,6 +61,7 @@ show up in /proc/sys/kernel:
- perf_cpu_time_max_percent
- perf_event_paranoid
- perf_event_max_stack
+- perf_event_mlock_kb
- perf_event_max_contexts_per_stack
- pid_max
- powersave-nap [ PPC only ]
@@ -654,7 +655,9 @@ Controls use of the performance events system by unprivileged
users (without CAP_SYS_ADMIN). The default value is 2.
-1: Allow use of (almost) all events by all users
->=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
+ Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
+>=0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN
+ Disallow raw tracepoint access by users without CAP_SYS_ADMIN
>=1: Disallow CPU event access by users without CAP_SYS_ADMIN
>=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
@@ -673,6 +676,14 @@ The default value is 127.
==============================================================
+perf_event_mlock_kb:
+
+Control size of per-cpu ring buffer not counted agains mlock limit.
+
+The default value is 512 + 1 page
+
+==============================================================
+
perf_event_max_contexts_per_stack:
Controls maximum number of stack frame context entries for
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 14db18c970b1..28596e03220b 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -35,9 +35,34 @@ Table : Subdirectories in /proc/sys/net
bpf_jit_enable
--------------
-This enables Berkeley Packet Filter Just in Time compiler.
-Currently supported on x86_64 architecture, bpf_jit provides a framework
-to speed packet filtering, the one used by tcpdump/libpcap for example.
+This enables the BPF Just in Time (JIT) compiler. BPF is a flexible
+and efficient infrastructure allowing to execute bytecode at various
+hook points. It is used in a number of Linux kernel subsystems such
+as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints)
+and security (e.g. seccomp). LLVM has a BPF back end that can compile
+restricted C into a sequence of BPF instructions. After program load
+through bpf(2) and passing a verifier in the kernel, a JIT will then
+translate these BPF proglets into native CPU instructions. There are
+two flavors of JITs, the newer eBPF JIT currently supported on:
+ - x86_64
+ - arm64
+ - ppc64
+ - sparc64
+ - mips64
+ - s390x
+
+And the older cBPF JIT supported on the following archs:
+ - arm
+ - mips
+ - ppc
+ - sparc
+
+eBPF JITs are a superset of cBPF JITs, meaning the kernel will
+migrate cBPF instructions into eBPF instructions and then JIT
+compile them transparently. Older cBPF JITs can only translate
+tcpdump filters, seccomp rules, etc, but not mentioned eBPF
+programs loaded through bpf(2).
+
Values :
0 - disable the JIT (default value)
1 - enable the JIT
@@ -46,9 +71,9 @@ Values :
bpf_jit_harden
--------------
-This enables hardening for the Berkeley Packet Filter Just in Time compiler.
-Supported are eBPF JIT backends. Enabling hardening trades off performance,
-but can mitigate JIT spraying.
+This enables hardening for the BPF JIT compiler. Supported are eBPF
+JIT backends. Enabling hardening trades off performance, but can
+mitigate JIT spraying.
Values :
0 - disable JIT hardening (default value)
1 - enable JIT hardening for unprivileged users only
@@ -57,11 +82,11 @@ Values :
bpf_jit_kallsyms
----------------
-When Berkeley Packet Filter Just in Time compiler is enabled, then compiled
-images are unknown addresses to the kernel, meaning they neither show up in
-traces nor in /proc/kallsyms. This enables export of these addresses, which
-can be used for debugging/tracing. If bpf_jit_harden is enabled, this feature
-is disabled.
+When BPF JIT compiler is enabled, then compiled images are unknown
+addresses to the kernel, meaning they neither show up in traces nor
+in /proc/kallsyms. This enables export of these addresses, which can
+be used for debugging/tracing. If bpf_jit_harden is enabled, this
+feature is disabled.
Values :
0 - disable JIT kallsyms export (default value)
1 - enable JIT kallsyms export for privileged users only
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 38310dcd6620..bc80fc0e210f 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -1956,10 +1956,7 @@ MMIO 쓰기 배리어
뒤에 완료됩니다.
ACQUIRE 앞에서 요청된 메모리 오퍼레이션은 ACQUIRE 오퍼레이션이 완료된 후에
- 완료될 수 있습니다. smp_mb__before_spinlock() 뒤에 ACQUIRE 가 실행되는
- 코드 블록은 블록 앞의 스토어를 블록 뒤의 로드와 스토어에 대해 순서
- 맞춥니다. 이건 smp_mb() 보다 완화된 것임을 기억하세요! 많은 아키텍쳐에서
- smp_mb__before_spinlock() 은 사실 아무일도 하지 않습니다.
+ 완료될 수 있습니다.
(2) RELEASE 오퍼레이션의 영향:
diff --git a/Documentation/translations/zh_CN/HOWTO b/Documentation/translations/zh_CN/HOWTO
index 11be075ba5fa..5f6d09edc9ac 100644
--- a/Documentation/translations/zh_CN/HOWTO
+++ b/Documentation/translations/zh_CN/HOWTO
@@ -149,9 +149,7 @@ Linux内核代码中包含有大量的文档。这些文档对于学习如何与
核源码的主目录中使用以下不同命令将会分别生成PDF、Postscript、HTML和手册
页等不同格式的文档:
make pdfdocs
- make psdocs
make htmldocs
- make mandocs
如何成为内核开发者
diff --git a/Documentation/x86/amd-memory-encryption.txt b/Documentation/x86/amd-memory-encryption.txt
new file mode 100644
index 000000000000..f512ab718541
--- /dev/null
+++ b/Documentation/x86/amd-memory-encryption.txt
@@ -0,0 +1,68 @@
+Secure Memory Encryption (SME) is a feature found on AMD processors.
+
+SME provides the ability to mark individual pages of memory as encrypted using
+the standard x86 page tables. A page that is marked encrypted will be
+automatically decrypted when read from DRAM and encrypted when written to
+DRAM. SME can therefore be used to protect the contents of DRAM from physical
+attacks on the system.
+
+A page is encrypted when a page table entry has the encryption bit set (see
+below on how to determine its position). The encryption bit can also be
+specified in the cr3 register, allowing the PGD table to be encrypted. Each
+successive level of page tables can also be encrypted by setting the encryption
+bit in the page table entry that points to the next table. This allows the full
+page table hierarchy to be encrypted. Note, this means that just because the
+encryption bit is set in cr3, doesn't imply the full hierarchy is encyrpted.
+Each page table entry in the hierarchy needs to have the encryption bit set to
+achieve that. So, theoretically, you could have the encryption bit set in cr3
+so that the PGD is encrypted, but not set the encryption bit in the PGD entry
+for a PUD which results in the PUD pointed to by that entry to not be
+encrypted.
+
+Support for SME can be determined through the CPUID instruction. The CPUID
+function 0x8000001f reports information related to SME:
+
+ 0x8000001f[eax]:
+ Bit[0] indicates support for SME
+ 0x8000001f[ebx]:
+ Bits[5:0] pagetable bit number used to activate memory
+ encryption
+ Bits[11:6] reduction in physical address space, in bits, when
+ memory encryption is enabled (this only affects
+ system physical addresses, not guest physical
+ addresses)
+
+If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to
+determine if SME is enabled and/or to enable memory encryption:
+
+ 0xc0010010:
+ Bit[23] 0 = memory encryption features are disabled
+ 1 = memory encryption features are enabled
+
+Linux relies on BIOS to set this bit if BIOS has determined that the reduction
+in the physical address space as a result of enabling memory encryption (see
+CPUID information above) will not conflict with the address space resource
+requirements for the system. If this bit is not set upon Linux startup then
+Linux itself will not set it and memory encryption will not be possible.
+
+The state of SME in the Linux kernel can be documented as follows:
+ - Supported:
+ The CPU supports SME (determined through CPUID instruction).
+
+ - Enabled:
+ Supported and bit 23 of MSR_K8_SYSCFG is set.
+
+ - Active:
+ Supported, Enabled and the Linux kernel is actively applying
+ the encryption bit to page table entries (the SME mask in the
+ kernel is non-zero).
+
+SME can also be enabled and activated in the BIOS. If SME is enabled and
+activated in the BIOS, then all memory accesses will be encrypted and it will
+not be necessary to activate the Linux memory encryption support. If the BIOS
+merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate
+memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or
+by supplying mem_encrypt=on on the kernel command line. However, if BIOS does
+not enable SME, then Linux will not be able to activate memory encryption, even
+if configured to do so by default or the mem_encrypt=on command line parameter
+is specified.
diff --git a/Documentation/x86/early-microcode.txt b/Documentation/x86/early-microcode.txt
deleted file mode 100644
index 07749e7f3d50..000000000000
--- a/Documentation/x86/early-microcode.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-Early load microcode
-====================
-By Fenghua Yu <fenghua.yu@intel.com>
-
-Kernel can update microcode in early phase of boot time. Loading microcode early
-can fix CPU issues before they are observed during kernel boot time.
-
-Microcode is stored in an initrd file. The microcode is read from the initrd
-file and loaded to CPUs during boot time.
-
-The format of the combined initrd image is microcode in cpio format followed by
-the initrd image (maybe compressed). Kernel parses the combined initrd image
-during boot time. The microcode file in cpio name space is:
-on Intel: kernel/x86/microcode/GenuineIntel.bin
-on AMD : kernel/x86/microcode/AuthenticAMD.bin
-
-During BSP boot (before SMP starts), if the kernel finds the microcode file in
-the initrd file, it parses the microcode and saves matching microcode in memory.
-If matching microcode is found, it will be uploaded in BSP and later on in all
-APs.
-
-The cached microcode patch is applied when CPUs resume from a sleep state.
-
-There are two legacy user space interfaces to load microcode, either through
-/dev/cpu/microcode or through /sys/devices/system/cpu/microcode/reload file
-in sysfs.
-
-In addition to these two legacy methods, the early loading method described
-here is the third method with which microcode can be uploaded to a system's
-CPUs.
-
-The following example script shows how to generate a new combined initrd file in
-/boot/initrd-3.5.0.ucode.img with original microcode microcode.bin and
-original initrd image /boot/initrd-3.5.0.img.
-
-mkdir initrd
-cd initrd
-mkdir -p kernel/x86/microcode
-cp ../microcode.bin kernel/x86/microcode/GenuineIntel.bin (or AuthenticAMD.bin)
-find . | cpio -o -H newc >../ucode.cpio
-cd ..
-cat ucode.cpio /boot/initrd-3.5.0.img >/boot/initrd-3.5.0.ucode.img
-
-Builtin microcode
-=================
-
-We can also load builtin microcode supplied through the regular firmware
-builtin method CONFIG_FIRMWARE_IN_KERNEL. Only 64-bit is currently
-supported.
-
-Here's an example:
-
-CONFIG_FIRMWARE_IN_KERNEL=y
-CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin"
-CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware"
-
-This basically means, you have the following tree structure locally:
-
-/lib/firmware/
-|-- amd-ucode
-...
-| |-- microcode_amd_fam15h.bin
-...
-|-- intel-ucode
-...
-| |-- 06-3a-09
-...
-
-so that the build system can find those files and integrate them into
-the final kernel image. The early loader finds them and applies them.
diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index c491a1b82de2..4d8848e4e224 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -6,8 +6,8 @@ Fenghua Yu <fenghua.yu@intel.com>
Tony Luck <tony.luck@intel.com>
Vikas Shivappa <vikas.shivappa@intel.com>
-This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the
-X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3".
+This feature is enabled by the CONFIG_INTEL_RDT Kconfig and the
+X86 /proc/cpuinfo flag bits "rdt", "cqm", "cat_l3" and "cdp_l3".
To use the feature mount the file system:
@@ -17,6 +17,13 @@ mount options are:
"cdp": Enable code/data prioritization in L3 cache allocations.
+RDT features are orthogonal. A particular system may support only
+monitoring, only control, or both monitoring and control.
+
+The mount succeeds if either of allocation or monitoring is present, but
+only those files and directories supported by the system will be created.
+For more details on the behavior of the interface during monitoring
+and allocation, see the "Resource alloc and monitor groups" section.
Info directory
--------------
@@ -24,7 +31,12 @@ Info directory
The 'info' directory contains information about the enabled
resources. Each resource has its own subdirectory. The subdirectory
names reflect the resource names.
-Cache resource(L3/L2) subdirectory contains the following files:
+
+Each subdirectory contains the following files with respect to
+allocation:
+
+Cache resource(L3/L2) subdirectory contains the following files
+related to allocation:
"num_closids": The number of CLOSIDs which are valid for this
resource. The kernel uses the smallest number of
@@ -36,7 +48,15 @@ Cache resource(L3/L2) subdirectory contains the following files:
"min_cbm_bits": The minimum number of consecutive bits which
must be set when writing a mask.
-Memory bandwitdh(MB) subdirectory contains the following files:
+"shareable_bits": Bitmask of shareable resource with other executing
+ entities (e.g. I/O). User can use this when
+ setting up exclusive cache partitions. Note that
+ some platforms support devices that have their
+ own settings for cache use which can over-ride
+ these bits.
+
+Memory bandwitdh(MB) subdirectory contains the following files
+with respect to allocation:
"min_bandwidth": The minimum memory bandwidth percentage which
user can request.
@@ -52,48 +72,152 @@ Memory bandwitdh(MB) subdirectory contains the following files:
non-linear. This field is purely informational
only.
-Resource groups
----------------
+If RDT monitoring is available there will be an "L3_MON" directory
+with the following files:
+
+"num_rmids": The number of RMIDs available. This is the
+ upper bound for how many "CTRL_MON" + "MON"
+ groups can be created.
+
+"mon_features": Lists the monitoring events if
+ monitoring is enabled for the resource.
+
+"max_threshold_occupancy":
+ Read/write file provides the largest value (in
+ bytes) at which a previously used LLC_occupancy
+ counter can be considered for re-use.
+
+
+Resource alloc and monitor groups
+---------------------------------
+
Resource groups are represented as directories in the resctrl file
-system. The default group is the root directory. Other groups may be
-created as desired by the system administrator using the "mkdir(1)"
-command, and removed using "rmdir(1)".
+system. The default group is the root directory which, immediately
+after mounting, owns all the tasks and cpus in the system and can make
+full use of all resources.
+
+On a system with RDT control features additional directories can be
+created in the root directory that specify different amounts of each
+resource (see "schemata" below). The root and these additional top level
+directories are referred to as "CTRL_MON" groups below.
+
+On a system with RDT monitoring the root directory and other top level
+directories contain a directory named "mon_groups" in which additional
+directories can be created to monitor subsets of tasks in the CTRL_MON
+group that is their ancestor. These are called "MON" groups in the rest
+of this document.
+
+Removing a directory will move all tasks and cpus owned by the group it
+represents to the parent. Removing one of the created CTRL_MON groups
+will automatically remove all MON groups below it.
+
+All groups contain the following files:
+
+"tasks":
+ Reading this file shows the list of all tasks that belong to
+ this group. Writing a task id to the file will add a task to the
+ group. If the group is a CTRL_MON group the task is removed from
+ whichever previous CTRL_MON group owned the task and also from
+ any MON group that owned the task. If the group is a MON group,
+ then the task must already belong to the CTRL_MON parent of this
+ group. The task is removed from any previous MON group.
+
+
+"cpus":
+ Reading this file shows a bitmask of the logical CPUs owned by
+ this group. Writing a mask to this file will add and remove
+ CPUs to/from this group. As with the tasks file a hierarchy is
+ maintained where MON groups may only include CPUs owned by the
+ parent CTRL_MON group.
+
-There are three files associated with each group:
+"cpus_list":
+ Just like "cpus", only using ranges of CPUs instead of bitmasks.
-"tasks": A list of tasks that belongs to this group. Tasks can be
- added to a group by writing the task ID to the "tasks" file
- (which will automatically remove them from the previous
- group to which they belonged). New tasks created by fork(2)
- and clone(2) are added to the same group as their parent.
- If a pid is not in any sub partition, it is in root partition
- (i.e. default partition).
-"cpus": A bitmask of logical CPUs assigned to this group. Writing
- a new mask can add/remove CPUs from this group. Added CPUs
- are removed from their previous group. Removed ones are
- given to the default (root) group. You cannot remove CPUs
- from the default group.
+When control is enabled all CTRL_MON groups will also contain:
-"cpus_list": One or more CPU ranges of logical CPUs assigned to this
- group. Same rules apply like for the "cpus" file.
+"schemata":
+ A list of all the resources available to this group.
+ Each resource has its own line and format - see below for details.
-"schemata": A list of all the resources available to this group.
- Each resource has its own line and format - see below for
- details.
+When monitoring is enabled all MON groups will also contain:
-When a task is running the following rules define which resources
-are available to it:
+"mon_data":
+ This contains a set of files organized by L3 domain and by
+ RDT event. E.g. on a system with two L3 domains there will
+ be subdirectories "mon_L3_00" and "mon_L3_01". Each of these
+ directories have one file per event (e.g. "llc_occupancy",
+ "mbm_total_bytes", and "mbm_local_bytes"). In a MON group these
+ files provide a read out of the current value of the event for
+ all tasks in the group. In CTRL_MON groups these files provide
+ the sum for all tasks in the CTRL_MON group and all tasks in
+ MON groups. Please see example section for more details on usage.
+
+Resource allocation rules
+-------------------------
+When a task is running the following rules define which resources are
+available to it:
1) If the task is a member of a non-default group, then the schemata
-for that group is used.
+ for that group is used.
2) Else if the task belongs to the default group, but is running on a
-CPU that is assigned to some specific group, then the schemata for
-the CPU's group is used.
+ CPU that is assigned to some specific group, then the schemata for the
+ CPU's group is used.
3) Otherwise the schemata for the default group is used.
+Resource monitoring rules
+-------------------------
+1) If a task is a member of a MON group, or non-default CTRL_MON group
+ then RDT events for the task will be reported in that group.
+
+2) If a task is a member of the default CTRL_MON group, but is running
+ on a CPU that is assigned to some specific group, then the RDT events
+ for the task will be reported in that group.
+
+3) Otherwise RDT events for the task will be reported in the root level
+ "mon_data" group.
+
+
+Notes on cache occupancy monitoring and control
+-----------------------------------------------
+When moving a task from one group to another you should remember that
+this only affects *new* cache allocations by the task. E.g. you may have
+a task in a monitor group showing 3 MB of cache occupancy. If you move
+to a new group and immediately check the occupancy of the old and new
+groups you will likely see that the old group is still showing 3 MB and
+the new group zero. When the task accesses locations still in cache from
+before the move, the h/w does not update any counters. On a busy system
+you will likely see the occupancy in the old group go down as cache lines
+are evicted and re-used while the occupancy in the new group rises as
+the task accesses memory and loads into the cache are counted based on
+membership in the new group.
+
+The same applies to cache allocation control. Moving a task to a group
+with a smaller cache partition will not evict any cache lines. The
+process may continue to use them from the old partition.
+
+Hardware uses CLOSid(Class of service ID) and an RMID(Resource monitoring ID)
+to identify a control group and a monitoring group respectively. Each of
+the resource groups are mapped to these IDs based on the kind of group. The
+number of CLOSid and RMID are limited by the hardware and hence the creation of
+a "CTRL_MON" directory may fail if we run out of either CLOSID or RMID
+and creation of "MON" group may fail if we run out of RMIDs.
+
+max_threshold_occupancy - generic concepts
+------------------------------------------
+
+Note that an RMID once freed may not be immediately available for use as
+the RMID is still tagged the cache lines of the previous user of RMID.
+Hence such RMIDs are placed on limbo list and checked back if the cache
+occupancy has gone down. If there is a time when system has a lot of
+limbo RMIDs but which are not ready to be used, user may see an -EBUSY
+during mkdir.
+
+max_threshold_occupancy is a user configurable value to determine the
+occupancy at which an RMID can be freed.
Schemata files - general concepts
---------------------------------
@@ -143,22 +267,22 @@ SKUs. Using a high bandwidth and a low bandwidth setting on two threads
sharing a core will result in both threads being throttled to use the
low bandwidth.
-L3 details (code and data prioritization disabled)
---------------------------------------------------
+L3 schemata file details (code and data prioritization disabled)
+----------------------------------------------------------------
With CDP disabled the L3 schemata format is:
L3:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
-L3 details (CDP enabled via mount option to resctrl)
-----------------------------------------------------
+L3 schemata file details (CDP enabled via mount option to resctrl)
+------------------------------------------------------------------
When CDP is enabled L3 control is split into two separate resources
so you can specify independent masks for code and data like this:
L3data:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
L3code:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
-L2 details
-----------
+L2 schemata file details
+------------------------
L2 cache does not support code and data prioritization, so the
schemata format is always:
@@ -185,6 +309,8 @@ L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
+Examples for RDT allocation usage:
+
Example 1
---------
On a two socket machine (one L3 cache per socket) with just four bits
@@ -410,3 +536,124 @@ void main(void)
/* code to read and write directory contents */
resctrl_release_lock(fd);
}
+
+Examples for RDT Monitoring along with allocation usage:
+
+Reading monitored data
+----------------------
+Reading an event file (for ex: mon_data/mon_L3_00/llc_occupancy) would
+show the current snapshot of LLC occupancy of the corresponding MON
+group or CTRL_MON group.
+
+
+Example 1 (Monitor CTRL_MON group and subset of tasks in CTRL_MON group)
+---------
+On a two socket machine (one L3 cache per socket) with just four bits
+for cache bit masks
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p0 p1
+# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata
+# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata
+# echo 5678 > p1/tasks
+# echo 5679 > p1/tasks
+
+The default resource group is unmodified, so we have access to all parts
+of all caches (its schemata file reads "L3:0=f;1=f").
+
+Tasks that are under the control of group "p0" may only allocate from the
+"lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1.
+Tasks in group "p1" use the "lower" 50% of cache on both sockets.
+
+Create monitor groups and assign a subset of tasks to each monitor group.
+
+# cd /sys/fs/resctrl/p1/mon_groups
+# mkdir m11 m12
+# echo 5678 > m11/tasks
+# echo 5679 > m12/tasks
+
+fetch data (data shown in bytes)
+
+# cat m11/mon_data/mon_L3_00/llc_occupancy
+16234000
+# cat m11/mon_data/mon_L3_01/llc_occupancy
+14789000
+# cat m12/mon_data/mon_L3_00/llc_occupancy
+16789000
+
+The parent ctrl_mon group shows the aggregated data.
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy
+31234000
+
+Example 2 (Monitor a task from its creation)
+---------
+On a two socket machine (one L3 cache per socket)
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p0 p1
+
+An RMID is allocated to the group once its created and hence the <cmd>
+below is monitored from its creation.
+
+# echo $$ > /sys/fs/resctrl/p1/tasks
+# <cmd>
+
+Fetch the data
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy
+31789000
+
+Example 3 (Monitor without CAT support or before creating CAT groups)
+---------
+
+Assume a system like HSW has only CQM and no CAT support. In this case
+the resctrl will still mount but cannot create CTRL_MON directories.
+But user can create different MON groups within the root group thereby
+able to monitor all tasks including kernel threads.
+
+This can also be used to profile jobs cache size footprint before being
+able to allocate them to different allocation groups.
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir mon_groups/m01
+# mkdir mon_groups/m02
+
+# echo 3478 > /sys/fs/resctrl/mon_groups/m01/tasks
+# echo 2467 > /sys/fs/resctrl/mon_groups/m02/tasks
+
+Monitor the groups separately and also get per domain data. From the
+below its apparent that the tasks are mostly doing work on
+domain(socket) 0.
+
+# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_00/llc_occupancy
+31234000
+# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_01/llc_occupancy
+34555
+# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_00/llc_occupancy
+31234000
+# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_01/llc_occupancy
+32789
+
+
+Example 4 (Monitor real time tasks)
+-----------------------------------
+
+A single socket system which has real time tasks running on cores 4-7
+and non real time tasks on other cpus. We want to monitor the cache
+occupancy of the real time threads on these cores.
+
+# mount -t resctrl resctrl /sys/fs/resctrl
+# cd /sys/fs/resctrl
+# mkdir p1
+
+Move the cpus 4-7 over to p1
+# echo f0 > p0/cpus
+
+View the llc occupancy snapshot
+
+# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
+11234000
diff --git a/Documentation/x86/microcode.txt b/Documentation/x86/microcode.txt
new file mode 100644
index 000000000000..f57e1b45e628
--- /dev/null
+++ b/Documentation/x86/microcode.txt
@@ -0,0 +1,137 @@
+ The Linux Microcode Loader
+
+Authors: Fenghua Yu <fenghua.yu@intel.com>
+ Borislav Petkov <bp@suse.de>
+
+The kernel has a x86 microcode loading facility which is supposed to
+provide microcode loading methods in the OS. Potential use cases are
+updating the microcode on platforms beyond the OEM End-Of-Life support,
+and updating the microcode on long-running systems without rebooting.
+
+The loader supports three loading methods:
+
+1. Early load microcode
+=======================
+
+The kernel can update microcode very early during boot. Loading
+microcode early can fix CPU issues before they are observed during
+kernel boot time.
+
+The microcode is stored in an initrd file. During boot, it is read from
+it and loaded into the CPU cores.
+
+The format of the combined initrd image is microcode in (uncompressed)
+cpio format followed by the (possibly compressed) initrd image. The
+loader parses the combined initrd image during boot.
+
+The microcode files in cpio name space are:
+
+on Intel: kernel/x86/microcode/GenuineIntel.bin
+on AMD : kernel/x86/microcode/AuthenticAMD.bin
+
+During BSP (BootStrapping Processor) boot (pre-SMP), the kernel
+scans the microcode file in the initrd. If microcode matching the
+CPU is found, it will be applied in the BSP and later on in all APs
+(Application Processors).
+
+The loader also saves the matching microcode for the CPU in memory.
+Thus, the cached microcode patch is applied when CPUs resume from a
+sleep state.
+
+Here's a crude example how to prepare an initrd with microcode (this is
+normally done automatically by the distribution, when recreating the
+initrd, so you don't really have to do it yourself. It is documented
+here for future reference only).
+
+---
+ #!/bin/bash
+
+ if [ -z "$1" ]; then
+ echo "You need to supply an initrd file"
+ exit 1
+ fi
+
+ INITRD="$1"
+
+ DSTDIR=kernel/x86/microcode
+ TMPDIR=/tmp/initrd
+
+ rm -rf $TMPDIR
+
+ mkdir $TMPDIR
+ cd $TMPDIR
+ mkdir -p $DSTDIR
+
+ if [ -d /lib/firmware/amd-ucode ]; then
+ cat /lib/firmware/amd-ucode/microcode_amd*.bin > $DSTDIR/AuthenticAMD.bin
+ fi
+
+ if [ -d /lib/firmware/intel-ucode ]; then
+ cat /lib/firmware/intel-ucode/* > $DSTDIR/GenuineIntel.bin
+ fi
+
+ find . | cpio -o -H newc >../ucode.cpio
+ cd ..
+ mv $INITRD $INITRD.orig
+ cat ucode.cpio $INITRD.orig > $INITRD
+
+ rm -rf $TMPDIR
+---
+
+The system needs to have the microcode packages installed into
+/lib/firmware or you need to fixup the paths above if yours are
+somewhere else and/or you've downloaded them directly from the processor
+vendor's site.
+
+2. Late loading
+===============
+
+There are two legacy user space interfaces to load microcode, either through
+/dev/cpu/microcode or through /sys/devices/system/cpu/microcode/reload file
+in sysfs.
+
+The /dev/cpu/microcode method is deprecated because it needs a special
+userspace tool for that.
+
+The easier method is simply installing the microcode packages your distro
+supplies and running:
+
+# echo 1 > /sys/devices/system/cpu/microcode/reload
+
+as root.
+
+The loading mechanism looks for microcode blobs in
+/lib/firmware/{intel-ucode,amd-ucode}. The default distro installation
+packages already put them there.
+
+3. Builtin microcode
+====================
+
+The loader supports also loading of a builtin microcode supplied through
+the regular firmware builtin method CONFIG_FIRMWARE_IN_KERNEL. Only
+64-bit is currently supported.
+
+Here's an example:
+
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE="intel-ucode/06-3a-09 amd-ucode/microcode_amd_fam15h.bin"
+CONFIG_EXTRA_FIRMWARE_DIR="/lib/firmware"
+
+This basically means, you have the following tree structure locally:
+
+/lib/firmware/
+|-- amd-ucode
+...
+| |-- microcode_amd_fam15h.bin
+...
+|-- intel-ucode
+...
+| |-- 06-3a-09
+...
+
+so that the build system can find those files and integrate them into
+the final kernel image. The early loader finds them and applies them.
+
+Needless to say, this method is not the most flexible one because it
+requires rebuilding the kernel each time updated microcode from the CPU
+vendor is available.
diff --git a/Documentation/x86/orc-unwinder.txt b/Documentation/x86/orc-unwinder.txt
new file mode 100644
index 000000000000..af0c9a4c65a6
--- /dev/null
+++ b/Documentation/x86/orc-unwinder.txt
@@ -0,0 +1,179 @@
+ORC unwinder
+============
+
+Overview
+--------
+
+The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is
+similar in concept to a DWARF unwinder. The difference is that the
+format of the ORC data is much simpler than DWARF, which in turn allows
+the ORC unwinder to be much simpler and faster.
+
+The ORC data consists of unwind tables which are generated by objtool.
+They contain out-of-band data which is used by the in-kernel ORC
+unwinder. Objtool generates the ORC data by first doing compile-time
+stack metadata validation (CONFIG_STACK_VALIDATION). After analyzing
+all the code paths of a .o file, it determines information about the
+stack state at each instruction address in the file and outputs that
+information to the .orc_unwind and .orc_unwind_ip sections.
+
+The per-object ORC sections are combined at link time and are sorted and
+post-processed at boot time. The unwinder uses the resulting data to
+correlate instruction addresses with their stack states at run time.
+
+
+ORC vs frame pointers
+---------------------
+
+With frame pointers enabled, GCC adds instrumentation code to every
+function in the kernel. The kernel's .text size increases by about
+3.2%, resulting in a broad kernel-wide slowdown. Measurements by Mel
+Gorman [1] have shown a slowdown of 5-10% for some workloads.
+
+In contrast, the ORC unwinder has no effect on text size or runtime
+performance, because the debuginfo is out of band. So if you disable
+frame pointers and enable the ORC unwinder, you get a nice performance
+improvement across the board, and still have reliable stack traces.
+
+Ingo Molnar says:
+
+ "Note that it's not just a performance improvement, but also an
+ instruction cache locality improvement: 3.2% .text savings almost
+ directly transform into a similarly sized reduction in cache
+ footprint. That can transform to even higher speedups for workloads
+ whose cache locality is borderline."
+
+Another benefit of ORC compared to frame pointers is that it can
+reliably unwind across interrupts and exceptions. Frame pointer based
+unwinds can sometimes skip the caller of the interrupted function, if it
+was a leaf function or if the interrupt hit before the frame pointer was
+saved.
+
+The main disadvantage of the ORC unwinder compared to frame pointers is
+that it needs more memory to store the ORC unwind tables: roughly 2-4MB
+depending on the kernel config.
+
+
+ORC vs DWARF
+------------
+
+ORC debuginfo's advantage over DWARF itself is that it's much simpler.
+It gets rid of the complex DWARF CFI state machine and also gets rid of
+the tracking of unnecessary registers. This allows the unwinder to be
+much simpler, meaning fewer bugs, which is especially important for
+mission critical oops code.
+
+The simpler debuginfo format also enables the unwinder to be much faster
+than DWARF, which is important for perf and lockdep. In a basic
+performance test by Jiri Slaby [2], the ORC unwinder was about 20x
+faster than an out-of-tree DWARF unwinder. (Note: That measurement was
+taken before some performance tweaks were added, which doubled
+performance, so the speedup over DWARF may be closer to 40x.)
+
+The ORC data format does have a few downsides compared to DWARF. ORC
+unwind tables take up ~50% more RAM (+1.3MB on an x86 defconfig kernel)
+than DWARF-based eh_frame tables.
+
+Another potential downside is that, as GCC evolves, it's conceivable
+that the ORC data may end up being *too* simple to describe the state of
+the stack for certain optimizations. But IMO this is unlikely because
+GCC saves the frame pointer for any unusual stack adjustments it does,
+so I suspect we'll really only ever need to keep track of the stack
+pointer and the frame pointer between call frames. But even if we do
+end up having to track all the registers DWARF tracks, at least we will
+still be able to control the format, e.g. no complex state machines.
+
+
+ORC unwind table generation
+---------------------------
+
+The ORC data is generated by objtool. With the existing compile-time
+stack metadata validation feature, objtool already follows all code
+paths, and so it already has all the information it needs to be able to
+generate ORC data from scratch. So it's an easy step to go from stack
+validation to ORC data generation.
+
+It should be possible to instead generate the ORC data with a simple
+tool which converts DWARF to ORC data. However, such a solution would
+be incomplete due to the kernel's extensive use of asm, inline asm, and
+special sections like exception tables.
+
+That could be rectified by manually annotating those special code paths
+using GNU assembler .cfi annotations in .S files, and homegrown
+annotations for inline asm in .c files. But asm annotations were tried
+in the past and were found to be unmaintainable. They were often
+incorrect/incomplete and made the code harder to read and keep updated.
+And based on looking at glibc code, annotating inline asm in .c files
+might be even worse.
+
+Objtool still needs a few annotations, but only in code which does
+unusual things to the stack like entry code. And even then, far fewer
+annotations are needed than what DWARF would need, so they're much more
+maintainable than DWARF CFI annotations.
+
+So the advantages of using objtool to generate ORC data are that it
+gives more accurate debuginfo, with very few annotations. It also
+insulates the kernel from toolchain bugs which can be very painful to
+deal with in the kernel since we often have to workaround issues in
+older versions of the toolchain for years.
+
+The downside is that the unwinder now becomes dependent on objtool's
+ability to reverse engineer GCC code flow. If GCC optimizations become
+too complicated for objtool to follow, the ORC data generation might
+stop working or become incomplete. (It's worth noting that livepatch
+already has such a dependency on objtool's ability to follow GCC code
+flow.)
+
+If newer versions of GCC come up with some optimizations which break
+objtool, we may need to revisit the current implementation. Some
+possible solutions would be asking GCC to make the optimizations more
+palatable, or having objtool use DWARF as an additional input, or
+creating a GCC plugin to assist objtool with its analysis. But for now,
+objtool follows GCC code quite well.
+
+
+Unwinder implementation details
+-------------------------------
+
+Objtool generates the ORC data by integrating with the compile-time
+stack metadata validation feature, which is described in detail in
+tools/objtool/Documentation/stack-validation.txt. After analyzing all
+the code paths of a .o file, it creates an array of orc_entry structs,
+and a parallel array of instruction addresses associated with those
+structs, and writes them to the .orc_unwind and .orc_unwind_ip sections
+respectively.
+
+The ORC data is split into the two arrays for performance reasons, to
+make the searchable part of the data (.orc_unwind_ip) more compact. The
+arrays are sorted in parallel at boot time.
+
+Performance is further improved by the use of a fast lookup table which
+is created at runtime. The fast lookup table associates a given address
+with a range of indices for the .orc_unwind table, so that only a small
+subset of the table needs to be searched.
+
+
+Etymology
+---------
+
+Orcs, fearsome creatures of medieval folklore, are the Dwarves' natural
+enemies. Similarly, the ORC unwinder was created in opposition to the
+complexity and slowness of DWARF.
+
+"Although Orcs rarely consider multiple solutions to a problem, they do
+excel at getting things done because they are creatures of action, not
+thought." [3] Similarly, unlike the esoteric DWARF unwinder, the
+veracious ORC unwinder wastes no time or siloconic effort decoding
+variable-length zero-extended unsigned-integer byte-coded
+state-machine-based debug information entries.
+
+Similar to how Orcs frequently unravel the well-intentioned plans of
+their adversaries, the ORC unwinder frequently unravels stacks with
+brutal, unyielding efficiency.
+
+ORC stands for Oops Rewind Capability.
+
+
+[1] https://lkml.kernel.org/r/20170602104048.jkkzssljsompjdwy@suse.de
+[2] https://lkml.kernel.org/r/d2ca5435-6386-29b8-db87-7f227c2b713a@suse.cz
+[3] http://dustin.wikidot.com/half-orcs-and-orcs
diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt
index b64304540821..fa46dcb347bc 100644
--- a/Documentation/x86/protection-keys.txt
+++ b/Documentation/x86/protection-keys.txt
@@ -34,7 +34,7 @@ with a key. In this example WRPKRU is wrapped by a C function
called pkey_set().
int real_prot = PROT_READ|PROT_WRITE;
- pkey = pkey_alloc(0, PKEY_DENY_WRITE);
+ pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
... application runs here
@@ -42,9 +42,9 @@ called pkey_set().
Now, if the application needs to update the data at 'ptr', it can
gain access, do the update, then remove its write access:
- pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
+ pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE
*ptr = foo; // assign something
- pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
+ pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again
Now when it frees the memory, it will also free the pkey since it
is no longer in use:
diff --git a/Documentation/x86/x86_64/5level-paging.txt b/Documentation/x86/x86_64/5level-paging.txt
new file mode 100644
index 000000000000..087251a0d99c
--- /dev/null
+++ b/Documentation/x86/x86_64/5level-paging.txt
@@ -0,0 +1,64 @@
+== Overview ==
+
+Original x86-64 was limited by 4-level paing to 256 TiB of virtual address
+space and 64 TiB of physical address space. We are already bumping into
+this limit: some vendors offers servers with 64 TiB of memory today.
+
+To overcome the limitation upcoming hardware will introduce support for
+5-level paging. It is a straight-forward extension of the current page
+table structure adding one more layer of translation.
+
+It bumps the limits to 128 PiB of virtual address space and 4 PiB of
+physical address space. This "ought to be enough for anybody" ©.
+
+QEMU 2.9 and later support 5-level paging.
+
+Virtual memory layout for 5-level paging is described in
+Documentation/x86/x86_64/mm.txt
+
+== Enabling 5-level paging ==
+
+CONFIG_X86_5LEVEL=y enables the feature.
+
+So far, a kernel compiled with the option enabled will be able to boot
+only on machines that supports the feature -- see for 'la57' flag in
+/proc/cpuinfo.
+
+The plan is to implement boot-time switching between 4- and 5-level paging
+in the future.
+
+== User-space and large virtual address space ==
+
+On x86, 5-level paging enables 56-bit userspace virtual address space.
+Not all user space is ready to handle wide addresses. It's known that
+at least some JIT compilers use higher bits in pointers to encode their
+information. It collides with valid pointers with 5-level paging and
+leads to crashes.
+
+To mitigate this, we are not going to allocate virtual address space
+above 47-bit by default.
+
+But userspace can ask for allocation from full address space by
+specifying hint address (with or without MAP_FIXED) above 47-bits.
+
+If hint address set above 47-bit, but MAP_FIXED is not specified, we try
+to look for unmapped area by specified address. If it's already
+occupied, we look for unmapped area in *full* address space, rather than
+from 47-bit window.
+
+A high hint address would only affect the allocation in question, but not
+any future mmap()s.
+
+Specifying high hint address on older kernel or on machine without 5-level
+paging support is safe. The hint will be ignored and kernel will fall back
+to allocation from 47-bit address space.
+
+This approach helps to easily make application's memory allocator aware
+about large address space without manually tracking allocated virtual
+address space.
+
+One important case we need to handle here is interaction with MPX.
+MPX (without MAWA extension) cannot handle addresses above 47-bit, so we
+need to make sure that MPX cannot be enabled we already have VMA above
+the boundary and forbid creating such VMAs once MPX is enabled.
+