summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-05-26 12:32:41 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-05-26 12:32:41 -0700
commit98931dd95fd489fcbfa97da563505a6f071d7c77 (patch)
tree44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm
parentdf202b452fe6c6d6f1351bad485e2367ef1e644e (diff)
parentf403f22f8ccb12860b2b62fec3173c6ccd45938b (diff)
downloadlwn-98931dd95fd489fcbfa97da563505a6f071d7c77.tar.gz
lwn-98931dd95fd489fcbfa97da563505a6f071d7c77.zip
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: "Almost all of MM here. A few things are still getting finished off, reviewed, etc. - Yang Shi has improved the behaviour of khugepaged collapsing of readonly file-backed transparent hugepages. - Johannes Weiner has arranged for zswap memory use to be tracked and managed on a per-cgroup basis. - Munchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for runtime enablement of the recent huge page vmemmap optimization feature. - Baolin Wang contributes a series to fix some issues around hugetlb pagetable invalidation. - Zhenwei Pi has fixed some interactions between hwpoisoned pages and virtualization. - Tong Tiangen has enabled the use of the presently x86-only page_table_check debugging feature on arm64 and riscv. - David Vernet has done some fixup work on the memcg selftests. - Peter Xu has taught userfaultfd to handle write protection faults against shmem- and hugetlbfs-backed files. - More DAMON development from SeongJae Park - adding online tuning of the feature and support for monitoring of fixed virtual address ranges. Also easier discovery of which monitoring operations are available. - Nadav Amit has done some optimization of TLB flushing during mprotect(). - Neil Brown continues to labor away at improving our swap-over-NFS support. - David Hildenbrand has some fixes to anon page COWing versus get_user_pages(). - Peng Liu fixed some errors in the core hugetlb code. - Joao Martins has reduced the amount of memory consumed by device-dax's compound devmaps. - Some cleanups of the arch-specific pagemap code from Anshuman Khandual. - Muchun Song has found and fixed some errors in the TLB flushing of transparent hugepages. - Roman Gushchin has done more work on the memcg selftests. ... and, of course, many smaller fixes and cleanups. Notably, the customary million cleanup serieses from Miaohe Lin" * tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits) mm: kfence: use PAGE_ALIGNED helper selftests: vm: add the "settings" file with timeout variable selftests: vm: add "test_hmm.sh" to TEST_FILES selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests selftests: vm: add migration to the .gitignore selftests/vm/pkeys: fix typo in comment ksm: fix typo in comment selftests: vm: add process_mrelease tests Revert "mm/vmscan: never demote for memcg reclaim" mm/kfence: print disabling or re-enabling message include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace" include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion" mm: fix a potential infinite loop in start_isolate_page_range() MAINTAINERS: add Muchun as co-maintainer for HugeTLB zram: fix Kconfig dependency warning mm/shmem: fix shmem folio swapoff hang cgroup: fix an error handling path in alloc_pagecache_max_30M() mm: damon: use HPAGE_PMD_SIZE tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate nodemask.h: fix compilation error with GCC12 ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig577
-rw-r--r--mm/Makefile2
-rw-r--r--mm/compaction.c93
-rw-r--r--mm/damon/core-test.h36
-rw-r--r--mm/damon/core.c115
-rw-r--r--mm/damon/ops-common.c3
-rw-r--r--mm/damon/paddr.c2
-rw-r--r--mm/damon/reclaim.c124
-rw-r--r--mm/damon/sysfs.c406
-rw-r--r--mm/damon/vaddr-test.h14
-rw-r--r--mm/damon/vaddr.c90
-rw-r--r--mm/debug_vm_pgtable.c46
-rw-r--r--mm/failslab.c3
-rw-r--r--mm/filemap.c5
-rw-r--r--mm/gup.c127
-rw-r--r--mm/hmm.c2
-rw-r--r--mm/huge_memory.c177
-rw-r--r--mm/hugetlb.c428
-rw-r--r--mm/hugetlb_vmemmap.c323
-rw-r--r--mm/hugetlb_vmemmap.h24
-rw-r--r--mm/hwpoison-inject.c1
-rw-r--r--mm/internal.h177
-rw-r--r--mm/kasan/common.c12
-rw-r--r--mm/kasan/generic.c6
-rw-r--r--mm/kasan/kasan.h92
-rw-r--r--mm/kasan/quarantine.c54
-rw-r--r--mm/kasan/report_generic.c8
-rw-r--r--mm/kfence/core.c40
-rw-r--r--mm/kfence/kfence_test.c5
-rw-r--r--mm/khugepaged.c81
-rw-r--r--mm/ksm.c35
-rw-r--r--mm/madvise.c17
-rw-r--r--mm/memcontrol.c329
-rw-r--r--mm/memory-failure.c126
-rw-r--r--mm/memory.c573
-rw-r--r--mm/memory_hotplug.c35
-rw-r--r--mm/mempolicy.c74
-rw-r--r--mm/memremap.c10
-rw-r--r--mm/migrate.c194
-rw-r--r--mm/migrate_device.c23
-rw-r--r--mm/mincore.c4
-rw-r--r--mm/mmap.c67
-rw-r--r--mm/mmu_gather.c16
-rw-r--r--mm/mprotect.c176
-rw-r--r--mm/mremap.c16
-rw-r--r--mm/page-writeback.c17
-rw-r--r--mm/page_alloc.c375
-rw-r--r--mm/page_ext.c2
-rw-r--r--mm/page_idle.c7
-rw-r--r--mm/page_io.c253
-rw-r--r--mm/page_isolation.c391
-rw-r--r--mm/page_owner.c4
-rw-r--r--mm/page_table_check.c25
-rw-r--r--mm/page_vma_mapped.c17
-rw-r--r--mm/percpu-internal.h8
-rw-r--r--mm/percpu.c5
-rw-r--r--mm/pgtable-generic.c8
-rw-r--r--mm/rmap.c409
-rw-r--r--mm/shmem.c353
-rw-r--r--mm/slab.c7
-rw-r--r--mm/slab_common.c3
-rw-r--r--mm/slob.c16
-rw-r--r--mm/sparse-vmemmap.c176
-rw-r--r--mm/sparse.c53
-rw-r--r--mm/swap.c10
-rw-r--r--mm/swap.h157
-rw-r--r--mm/swap_slots.c20
-rw-r--r--mm/swap_state.c90
-rw-r--r--mm/swapfile.c135
-rw-r--r--mm/userfaultfd.c62
-rw-r--r--mm/util.c1
-rw-r--r--mm/vmalloc.c18
-rw-r--r--mm/vmscan.c476
-rw-r--r--mm/vmstat.c9
-rw-r--r--mm/z3fold.c64
-rw-r--r--mm/zswap.c52
76 files changed, 5256 insertions, 2735 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 034d87953600..905c205e14f3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -2,6 +2,311 @@
menu "Memory Management options"
+#
+# For some reason microblaze and nios2 hard code SWAP=n. Hopefully we can
+# add proper SWAP support to them, in which case this can be remove.
+#
+config ARCH_NO_SWAP
+ bool
+
+config ZPOOL
+ bool
+
+menuconfig SWAP
+ bool "Support for paging of anonymous memory (swap)"
+ depends on MMU && BLOCK && !ARCH_NO_SWAP
+ default y
+ help
+ This option allows you to choose whether you want to have support
+ for so called swap devices or swap files in your kernel that are
+ used to provide more virtual memory than the actual RAM present
+ in your computer. If unsure say Y.
+
+config ZSWAP
+ bool "Compressed cache for swap pages (EXPERIMENTAL)"
+ depends on SWAP
+ select FRONTSWAP
+ select CRYPTO
+ select ZPOOL
+ help
+ A lightweight compressed cache for swap pages. It takes
+ pages that are in the process of being swapped out and attempts to
+ compress them into a dynamically allocated RAM-based memory pool.
+ This can result in a significant I/O reduction on swap device and,
+ in the case where decompressing from RAM is faster that swap device
+ reads, can also improve workload performance.
+
+ This is marked experimental because it is a new feature (as of
+ v3.11) that interacts heavily with memory reclaim. While these
+ interactions don't cause any known issues on simple memory setups,
+ they have not be fully explored on the large set of potential
+ configurations and workloads that exist.
+
+config ZSWAP_DEFAULT_ON
+ bool "Enable the compressed cache for swap pages by default"
+ depends on ZSWAP
+ help
+ If selected, the compressed cache for swap pages will be enabled
+ at boot, otherwise it will be disabled.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.enabled=' option.
+
+choice
+ prompt "Default compressor"
+ depends on ZSWAP
+ default ZSWAP_COMPRESSOR_DEFAULT_LZO
+ help
+ Selects the default compression algorithm for the compressed cache
+ for swap pages.
+
+ For an overview what kind of performance can be expected from
+ a particular compression algorithm please refer to the benchmarks
+ available at the following LWN page:
+ https://lwn.net/Articles/751795/
+
+ If in doubt, select 'LZO'.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.compressor=' option.
+
+config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ bool "Deflate"
+ select CRYPTO_DEFLATE
+ help
+ Use the Deflate algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZO
+ bool "LZO"
+ select CRYPTO_LZO
+ help
+ Use the LZO algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_842
+ bool "842"
+ select CRYPTO_842
+ help
+ Use the 842 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ bool "LZ4"
+ select CRYPTO_LZ4
+ help
+ Use the LZ4 algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ bool "LZ4HC"
+ select CRYPTO_LZ4HC
+ help
+ Use the LZ4HC algorithm as the default compression algorithm.
+
+config ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ bool "zstd"
+ select CRYPTO_ZSTD
+ help
+ Use the zstd algorithm as the default compression algorithm.
+endchoice
+
+config ZSWAP_COMPRESSOR_DEFAULT
+ string
+ depends on ZSWAP
+ default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
+ default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO
+ default "842" if ZSWAP_COMPRESSOR_DEFAULT_842
+ default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4
+ default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
+ default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
+ default ""
+
+choice
+ prompt "Default allocator"
+ depends on ZSWAP
+ default ZSWAP_ZPOOL_DEFAULT_ZBUD
+ help
+ Selects the default allocator for the compressed cache for
+ swap pages.
+ The default is 'zbud' for compatibility, however please do
+ read the description of each of the allocators below before
+ making a right choice.
+
+ The selection made here can be overridden by using the kernel
+ command line 'zswap.zpool=' option.
+
+config ZSWAP_ZPOOL_DEFAULT_ZBUD
+ bool "zbud"
+ select ZBUD
+ help
+ Use the zbud allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ bool "z3fold"
+ select Z3FOLD
+ help
+ Use the z3fold allocator as the default allocator.
+
+config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ bool "zsmalloc"
+ select ZSMALLOC
+ help
+ Use the zsmalloc allocator as the default allocator.
+endchoice
+
+config ZSWAP_ZPOOL_DEFAULT
+ string
+ depends on ZSWAP
+ default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
+ default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
+ default ""
+
+config ZBUD
+ tristate "2:1 compression allocator (zbud)"
+ depends on ZSWAP
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to two compressed pages per physical
+ page. While this design limits storage density, it has simple and
+ deterministic reclaim properties that make it preferable to a higher
+ density approach when reclaim will be used.
+
+config Z3FOLD
+ tristate "3:1 compression allocator (z3fold)"
+ depends on ZSWAP
+ help
+ A special purpose allocator for storing compressed pages.
+ It is designed to store up to three compressed pages per physical
+ page. It is a ZBUD derivative so the simplicity and determinism are
+ still there.
+
+config ZSMALLOC
+ tristate
+ prompt "N:1 compression allocator (zsmalloc)" if ZSWAP
+ depends on MMU
+ help
+ zsmalloc is a slab-based memory allocator designed to store
+ pages of various compression levels efficiently. It achieves
+ the highest storage density with the least amount of fragmentation.
+
+config ZSMALLOC_STAT
+ bool "Export zsmalloc statistics"
+ depends on ZSMALLOC
+ select DEBUG_FS
+ help
+ This option enables code in the zsmalloc to collect various
+ statistics about what's happening in zsmalloc and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
+menu "SLAB allocator options"
+
+choice
+ prompt "Choose SLAB allocator"
+ default SLUB
+ help
+ This option allows to select a slab allocator.
+
+config SLAB
+ bool "SLAB"
+ depends on !PREEMPT_RT
+ select HAVE_HARDENED_USERCOPY_ALLOCATOR
+ help
+ The regular slab allocator that is established and known to work
+ well in all environments. It organizes cache hot objects in
+ per cpu and per node queues.
+
+config SLUB
+ bool "SLUB (Unqueued Allocator)"
+ select HAVE_HARDENED_USERCOPY_ALLOCATOR
+ help
+ SLUB is a slab allocator that minimizes cache line usage
+ instead of managing queues of cached objects (SLAB approach).
+ Per cpu caching is realized using slabs of objects instead
+ of queues of objects. SLUB can use memory efficiently
+ and has enhanced diagnostics. SLUB is the default choice for
+ a slab allocator.
+
+config SLOB
+ depends on EXPERT
+ bool "SLOB (Simple Allocator)"
+ depends on !PREEMPT_RT
+ help
+ SLOB replaces the stock allocator with a drastically simpler
+ allocator. SLOB is generally more space efficient but
+ does not perform as well on large systems.
+
+endchoice
+
+config SLAB_MERGE_DEFAULT
+ bool "Allow slab caches to be merged"
+ default y
+ depends on SLAB || SLUB
+ help
+ For reduced kernel memory fragmentation, slab caches can be
+ merged when they share the same size and other characteristics.
+ This carries a risk of kernel heap overflows being able to
+ overwrite objects from merged caches (and more easily control
+ cache layout), which makes such heap attacks easier to exploit
+ by attackers. By keeping caches unmerged, these kinds of exploits
+ can usually only damage objects in the same cache. To disable
+ merging at runtime, "slab_nomerge" can be passed on the kernel
+ command line.
+
+config SLAB_FREELIST_RANDOM
+ bool "Randomize slab freelist"
+ depends on SLAB || SLUB
+ help
+ Randomizes the freelist order used on creating new pages. This
+ security feature reduces the predictability of the kernel slab
+ allocator against heap overflows.
+
+config SLAB_FREELIST_HARDENED
+ bool "Harden slab freelist metadata"
+ depends on SLAB || SLUB
+ help
+ Many kernel heap attacks try to target slab cache metadata and
+ other infrastructure. This options makes minor performance
+ sacrifices to harden the kernel slab allocator against common
+ freelist exploit methods. Some slab implementations have more
+ sanity-checking than others. This option is most effective with
+ CONFIG_SLUB.
+
+config SLUB_CPU_PARTIAL
+ default y
+ depends on SLUB && SMP
+ bool "SLUB per cpu partial cache"
+ help
+ Per cpu partial caches accelerate objects allocation and freeing
+ that is local to a processor at the price of more indeterminism
+ in the latency of the free. On overflow these caches will be cleared
+ which requires the taking of locks that may cause latency spikes.
+ Typically one would choose no for a realtime system.
+
+endmenu # SLAB allocator options
+
+config SHUFFLE_PAGE_ALLOCATOR
+ bool "Page allocator randomization"
+ default SLAB_FREELIST_RANDOM && ACPI_NUMA
+ help
+ Randomization of the page allocator improves the average
+ utilization of a direct-mapped memory-side-cache. See section
+ 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI
+ 6.2a specification for an example of how a platform advertises
+ the presence of a memory-side-cache. There are also incidental
+ security benefits as it reduces the predictability of page
+ allocations to compliment SLAB_FREELIST_RANDOM, but the
+ default granularity of shuffling on the "MAX_ORDER - 1" i.e,
+ 10th order of pages is selected based on cache utilization
+ benefits on x86.
+
+ While the randomization improves cache utilization it may
+ negatively impact workloads on platforms without a cache. For
+ this reason, by default, the randomization is enabled only
+ after runtime detection of a direct-mapped memory-side-cache.
+ Otherwise, the randomization may be force enabled with the
+ 'page_alloc.shuffle' kernel command line parameter.
+
+ Say Y if unsure.
+
config SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SELECT_MEMORY_MODEL
@@ -126,15 +431,20 @@ config HAVE_BOOTMEM_INFO_NODE
config ARCH_ENABLE_MEMORY_HOTPLUG
bool
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ bool
+
# eventually, we can have this option just 'select SPARSEMEM'
-config MEMORY_HOTPLUG
- bool "Allow for memory hot-add"
+menuconfig MEMORY_HOTPLUG
+ bool "Memory hotplug"
select MEMORY_ISOLATION
depends on SPARSEMEM
depends on ARCH_ENABLE_MEMORY_HOTPLUG
depends on 64BIT
select NUMA_KEEP_MEMINFO if NUMA
+if MEMORY_HOTPLUG
+
config MEMORY_HOTPLUG_DEFAULT_ONLINE
bool "Online the newly added memory blocks by default"
depends on MEMORY_HOTPLUG
@@ -150,9 +460,6 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
Say N here if you want the default policy to keep all hot-plugged
memory blocks in 'offline' state.
-config ARCH_ENABLE_MEMORY_HOTREMOVE
- bool
-
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
@@ -164,6 +471,8 @@ config MHP_MEMMAP_ON_MEMORY
depends on MEMORY_HOTPLUG && SPARSEMEM_VMEMMAP
depends on ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+endif # MEMORY_HOTPLUG
+
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
@@ -378,7 +687,13 @@ config NOMMU_INITIAL_TRIM_EXCESS
See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
-config TRANSPARENT_HUGEPAGE
+config ARCH_WANT_GENERAL_HUGETLB
+ bool
+
+config ARCH_WANTS_THP_SWAP
+ def_bool n
+
+menuconfig TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
@@ -393,6 +708,8 @@ config TRANSPARENT_HUGEPAGE
If memory constrained on embedded, you may want to say N.
+if TRANSPARENT_HUGEPAGE
+
choice
prompt "Transparent Hugepage Support sysfs defaults"
depends on TRANSPARENT_HUGEPAGE
@@ -417,12 +734,6 @@ choice
benefit.
endchoice
-config ARCH_WANT_GENERAL_HUGETLB
- bool
-
-config ARCH_WANTS_THP_SWAP
- def_bool n
-
config THP_SWAP
def_bool y
depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
@@ -433,6 +744,19 @@ config THP_SWAP
For selection by architectures with reasonable THP sizes.
+config READ_ONLY_THP_FOR_FS
+ bool "Read-only THP for filesystems (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGEPAGE && SHMEM
+
+ help
+ Allow khugepaged to put read-only file-backed pages in THP.
+
+ This is marked experimental because it is a new feature. Write
+ support of file THPs will be developed in the next few release
+ cycles.
+
+endif # TRANSPARENT_HUGEPAGE
+
#
# UP and nommu archs use km based percpu allocator
#
@@ -517,188 +841,6 @@ config MEM_SOFT_DIRTY
See Documentation/admin-guide/mm/soft-dirty.rst for more details.
-config ZSWAP
- bool "Compressed cache for swap pages (EXPERIMENTAL)"
- depends on SWAP && CRYPTO=y
- select FRONTSWAP
- select ZPOOL
- help
- A lightweight compressed cache for swap pages. It takes
- pages that are in the process of being swapped out and attempts to
- compress them into a dynamically allocated RAM-based memory pool.
- This can result in a significant I/O reduction on swap device and,
- in the case where decompressing from RAM is faster that swap device
- reads, can also improve workload performance.
-
- This is marked experimental because it is a new feature (as of
- v3.11) that interacts heavily with memory reclaim. While these
- interactions don't cause any known issues on simple memory setups,
- they have not be fully explored on the large set of potential
- configurations and workloads that exist.
-
-choice
- prompt "Compressed cache for swap pages default compressor"
- depends on ZSWAP
- default ZSWAP_COMPRESSOR_DEFAULT_LZO
- help
- Selects the default compression algorithm for the compressed cache
- for swap pages.
-
- For an overview what kind of performance can be expected from
- a particular compression algorithm please refer to the benchmarks
- available at the following LWN page:
- https://lwn.net/Articles/751795/
-
- If in doubt, select 'LZO'.
-
- The selection made here can be overridden by using the kernel
- command line 'zswap.compressor=' option.
-
-config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
- bool "Deflate"
- select CRYPTO_DEFLATE
- help
- Use the Deflate algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_LZO
- bool "LZO"
- select CRYPTO_LZO
- help
- Use the LZO algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_842
- bool "842"
- select CRYPTO_842
- help
- Use the 842 algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_LZ4
- bool "LZ4"
- select CRYPTO_LZ4
- help
- Use the LZ4 algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
- bool "LZ4HC"
- select CRYPTO_LZ4HC
- help
- Use the LZ4HC algorithm as the default compression algorithm.
-
-config ZSWAP_COMPRESSOR_DEFAULT_ZSTD
- bool "zstd"
- select CRYPTO_ZSTD
- help
- Use the zstd algorithm as the default compression algorithm.
-endchoice
-
-config ZSWAP_COMPRESSOR_DEFAULT
- string
- depends on ZSWAP
- default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE
- default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO
- default "842" if ZSWAP_COMPRESSOR_DEFAULT_842
- default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4
- default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC
- default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD
- default ""
-
-choice
- prompt "Compressed cache for swap pages default allocator"
- depends on ZSWAP
- default ZSWAP_ZPOOL_DEFAULT_ZBUD
- help
- Selects the default allocator for the compressed cache for
- swap pages.
- The default is 'zbud' for compatibility, however please do
- read the description of each of the allocators below before
- making a right choice.
-
- The selection made here can be overridden by using the kernel
- command line 'zswap.zpool=' option.
-
-config ZSWAP_ZPOOL_DEFAULT_ZBUD
- bool "zbud"
- select ZBUD
- help
- Use the zbud allocator as the default allocator.
-
-config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
- bool "z3fold"
- select Z3FOLD
- help
- Use the z3fold allocator as the default allocator.
-
-config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
- bool "zsmalloc"
- select ZSMALLOC
- help
- Use the zsmalloc allocator as the default allocator.
-endchoice
-
-config ZSWAP_ZPOOL_DEFAULT
- string
- depends on ZSWAP
- default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
- default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD
- default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
- default ""
-
-config ZSWAP_DEFAULT_ON
- bool "Enable the compressed cache for swap pages by default"
- depends on ZSWAP
- help
- If selected, the compressed cache for swap pages will be enabled
- at boot, otherwise it will be disabled.
-
- The selection made here can be overridden by using the kernel
- command line 'zswap.enabled=' option.
-
-config ZPOOL
- tristate "Common API for compressed memory storage"
- help
- Compressed memory storage API. This allows using either zbud or
- zsmalloc.
-
-config ZBUD
- tristate "Low (Up to 2x) density storage for compressed pages"
- depends on ZPOOL
- help
- A special purpose allocator for storing compressed pages.
- It is designed to store up to two compressed pages per physical
- page. While this design limits storage density, it has simple and
- deterministic reclaim properties that make it preferable to a higher
- density approach when reclaim will be used.
-
-config Z3FOLD
- tristate "Up to 3x density storage for compressed pages"
- depends on ZPOOL
- help
- A special purpose allocator for storing compressed pages.
- It is designed to store up to three compressed pages per physical
- page. It is a ZBUD derivative so the simplicity and determinism are
- still there.
-
-config ZSMALLOC
- tristate "Memory allocator for compressed pages"
- depends on MMU
- help
- zsmalloc is a slab-based memory allocator designed to store
- compressed RAM pages. zsmalloc uses virtual memory mapping
- in order to reduce fragmentation. However, this results in a
- non-standard allocator interface where a handle, not a pointer, is
- returned by an alloc(). This handle must be mapped in order to
- access the allocated space.
-
-config ZSMALLOC_STAT
- bool "Export zsmalloc statistics"
- depends on ZSMALLOC
- select DEBUG_FS
- help
- This option enables code in the zsmalloc to collect various
- statistics about what's happening in zsmalloc and exports that
- information to userspace via debugfs.
- If unsure, say N.
-
config GENERIC_EARLY_IOREMAP
bool
@@ -762,7 +904,7 @@ config ARCH_HAS_CURRENT_STACK_POINTER
register alias named "current_stack_pointer", this config can be
selected.
-config ARCH_HAS_FILTER_PGPROT
+config ARCH_HAS_VM_GET_PAGE_PROT
bool
config ARCH_HAS_PTE_DEVMAP
@@ -855,17 +997,6 @@ comment "GUP_TEST needs to have DEBUG_FS enabled"
config GUP_GET_PTE_LOW_HIGH
bool
-config READ_ONLY_THP_FOR_FS
- bool "Read-only THP for filesystems (EXPERIMENTAL)"
- depends on TRANSPARENT_HUGEPAGE && SHMEM
-
- help
- Allow khugepaged to put read-only file-backed pages in THP.
-
- This is marked experimental because it is a new feature. Write
- support of file THPs will be developed in the next few release
- cycles.
-
config ARCH_HAS_PTE_SPECIAL
bool
@@ -909,6 +1040,40 @@ config ANON_VMA_NAME
area from being merged with adjacent virtual memory areas due to the
difference in their name.
+config USERFAULTFD
+ bool "Enable userfaultfd() system call"
+ depends on MMU
+ help
+ Enable the userfaultfd() system call that allows to intercept and
+ handle page faults in userland.
+
+config HAVE_ARCH_USERFAULTFD_WP
+ bool
+ help
+ Arch has userfaultfd write protection support
+
+config HAVE_ARCH_USERFAULTFD_MINOR
+ bool
+ help
+ Arch has userfaultfd minor fault support
+
+config PTE_MARKER
+ bool
+
+ help
+ Allows to create marker PTEs for file-backed memory.
+
+config PTE_MARKER_UFFD_WP
+ bool "Userfaultfd write protection support for shmem/hugetlbfs"
+ default y
+ depends on HAVE_ARCH_USERFAULTFD_WP
+ select PTE_MARKER
+
+ help
+ Allows to create marker PTEs for userfaultfd write protection
+ purposes. It is required to enable userfaultfd write protection on
+ file-backed memory types like shmem and hugetlbfs.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 4cc13f3179a5..6f9ffa968a1a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -77,7 +77,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
-obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o
+obj-$(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) += hugetlb_vmemmap.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
diff --git a/mm/compaction.c b/mm/compaction.c
index fe915db6149b..1f89b969c12b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -317,7 +317,6 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
}
page += (1 << PAGE_ALLOC_COSTLY_ORDER);
- pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
} while (page <= end_page);
return false;
@@ -514,15 +513,12 @@ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
* very heavily contended. The lock should be periodically unlocked to avoid
* having disabled IRQs for a long time, even when there is nobody waiting on
* the lock. It might also be that allowing the IRQs will result in
- * need_resched() becoming true. If scheduling is needed, async compaction
- * aborts. Sync compaction schedules.
+ * need_resched() becoming true. If scheduling is needed, compaction schedules.
* Either compaction type will also abort if a fatal signal is pending.
* In either case if the lock was locked, it is dropped and not regained.
*
- * Returns true if compaction should abort due to fatal signal pending, or
- * async compaction due to need_resched()
- * Returns false when compaction can continue (sync compaction might have
- * scheduled)
+ * Returns true if compaction should abort due to fatal signal pending.
+ * Returns false when compaction can continue.
*/
static bool compact_unlock_should_abort(spinlock_t *lock,
unsigned long flags, bool *locked, struct compact_control *cc)
@@ -575,9 +571,9 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/*
* Periodically drop the lock (if held) regardless of its
* contention, to give chance to IRQs. Abort if fatal signal
- * pending or async compaction detects need_resched()
+ * pending.
*/
- if (!(blockpfn % SWAP_CLUSTER_MAX)
+ if (!(blockpfn % COMPACT_CLUSTER_MAX)
&& compact_unlock_should_abort(&cc->zone->lock, flags,
&locked, cc))
break;
@@ -603,13 +599,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (!PageBuddy(page))
goto isolate_fail;
- /*
- * If we already hold the lock, we can skip some rechecking.
- * Note that if we hold the lock now, checked_pageblock was
- * already set in some previous iteration (or strict is true),
- * so it is correct to skip the suitable migration target
- * recheck as well.
- */
+ /* If we already hold the lock, we can skip some rechecking. */
if (!locked) {
locked = compact_lock_irqsave(&cc->zone->lock,
&flags, cc);
@@ -872,7 +862,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* contention, to give chance to IRQs. Abort completely if
* a fatal signal is pending.
*/
- if (!(low_pfn % SWAP_CLUSTER_MAX)) {
+ if (!(low_pfn % COMPACT_CLUSTER_MAX)) {
if (locked) {
unlock_page_lruvec_irqrestore(locked, flags);
locked = NULL;
@@ -899,7 +889,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* not falsely conclude that the block should be skipped.
*/
if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
- if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+ if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
page = NULL;
goto isolate_abort;
@@ -918,7 +908,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Do not report -EBUSY down the chain */
if (ret == -EBUSY)
ret = 0;
- low_pfn += (1UL << compound_order(page)) - 1;
+ low_pfn += compound_nr(page) - 1;
goto isolate_fail;
}
@@ -1542,7 +1532,7 @@ fast_isolate_freepages(struct compact_control *cc)
* not found, be pessimistic for direct compaction
* and use the min mark.
*/
- if (highest) {
+ if (highest >= min_pfn) {
page = pfn_to_page(highest);
cc->free_pfn = highest;
} else {
@@ -1587,7 +1577,7 @@ static void isolate_freepages(struct compact_control *cc)
unsigned int stride;
/* Try a small search of the free lists for a candidate */
- isolate_start_pfn = fast_isolate_freepages(cc);
+ fast_isolate_freepages(cc);
if (cc->nr_freepages)
goto splitmap;
@@ -1624,7 +1614,7 @@ static void isolate_freepages(struct compact_control *cc)
* This can iterate a massively long zone without finding any
* suitable migration targets, so periodically check resched.
*/
- if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ if (!(block_start_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
@@ -1858,6 +1848,8 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
update_fast_start_pfn(cc, free_pfn);
pfn = pageblock_start_pfn(free_pfn);
+ if (pfn < cc->zone->zone_start_pfn)
+ pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
set_pageblock_skip(freepage);
@@ -1931,7 +1923,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
* many pageblocks unsuitable, so periodically check if we
* need to schedule.
*/
- if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ if (!(low_pfn % (COMPACT_CLUSTER_MAX * pageblock_nr_pages)))
cond_resched();
page = pageblock_pfn_to_page(block_start_pfn,
@@ -1951,12 +1943,12 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
continue;
/*
- * For async compaction, also only scan in MOVABLE blocks
- * without huge pages. Async compaction is optimistic to see
- * if the minimum amount of work satisfies the allocation.
- * The cached PFN is updated as it's possible that all
- * remaining blocks between source and target are unsuitable
- * and the compaction scanners fail to meet.
+ * For async direct compaction, only scan the pageblocks of the
+ * same migratetype without huge pages. Async direct compaction
+ * is optimistic to see if the minimum amount of work satisfies
+ * the allocation. The cached PFN is updated as it's possible
+ * that all remaining blocks between source and target are
+ * unsuitable and the compaction scanners fail to meet.
*/
if (!suitable_migration_source(cc, page)) {
update_cached_migrate(cc, block_end_pfn);
@@ -2144,29 +2136,16 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* other migratetype buddy lists.
*/
if (find_suitable_fallback(area, order, migratetype,
- true, &can_steal) != -1) {
-
- /* movable pages are OK in any pageblock */
- if (migratetype == MIGRATE_MOVABLE)
- return COMPACT_SUCCESS;
-
+ true, &can_steal) != -1)
/*
- * We are stealing for a non-movable allocation. Make
- * sure we finish compacting the current pageblock
- * first so it is as free as possible and we won't
- * have to steal another one soon. This only applies
- * to sync compaction, as async compaction operates
- * on pageblocks of the same migratetype.
+ * Movable pages are OK in any pageblock. If we are
+ * stealing for a non-movable allocation, make sure
+ * we finish compacting the current pageblock first
+ * (which is assured by the above migrate_pfn align
+ * check) so it is as free as possible and we won't
+ * have to steal another one soon.
*/
- if (cc->mode == MIGRATE_ASYNC ||
- IS_ALIGNED(cc->migrate_pfn,
- pageblock_nr_pages)) {
- return COMPACT_SUCCESS;
- }
-
- ret = COMPACT_CONTINUE;
- break;
- }
+ return COMPACT_SUCCESS;
}
out:
@@ -2301,7 +2280,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
compact_result = __compaction_suitable(zone, order, alloc_flags,
ac->highest_zoneidx, available);
- if (compact_result != COMPACT_SKIPPED)
+ if (compact_result == COMPACT_CONTINUE)
return true;
}
@@ -2592,7 +2571,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, struct page **capture)
{
- int may_perform_io = gfp_mask & __GFP_IO;
+ int may_perform_io = (__force int)(gfp_mask & __GFP_IO);
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
@@ -3016,21 +2995,18 @@ static int kcompactd(void *p)
* This kcompactd start function will be called by init and node-hot-add.
* On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
*/
-int kcompactd_run(int nid)
+void kcompactd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- int ret = 0;
if (pgdat->kcompactd)
- return 0;
+ return;
pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
if (IS_ERR(pgdat->kcompactd)) {
pr_err("Failed to start kcompactd on node %d\n", nid);
- ret = PTR_ERR(pgdat->kcompactd);
pgdat->kcompactd = NULL;
}
- return ret;
}
/*
@@ -3065,7 +3041,8 @@ static int kcompactd_cpu_online(unsigned int cpu)
if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
/* One of our CPUs online: restore mask */
- set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ if (pgdat->kcompactd)
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
}
return 0;
}
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index b4085deb9fa0..573669566f84 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -232,6 +232,41 @@ static void damon_test_split_regions_of(struct kunit *test)
damon_destroy_ctx(c);
}
+static void damon_test_ops_registration(struct kunit *test)
+{
+ struct damon_ctx *c = damon_new_ctx();
+ struct damon_operations ops, bak;
+
+ /* DAMON_OPS_{V,P}ADDR are registered on subsys_initcall */
+ KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_VADDR), 0);
+ KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_PADDR), 0);
+
+ /* Double-registration is prohibited */
+ ops.id = DAMON_OPS_VADDR;
+ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
+ ops.id = DAMON_OPS_PADDR;
+ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
+
+ /* Unknown ops id cannot be registered */
+ KUNIT_EXPECT_EQ(test, damon_select_ops(c, NR_DAMON_OPS), -EINVAL);
+
+ /* Registration should success after unregistration */
+ mutex_lock(&damon_ops_lock);
+ bak = damon_registered_ops[DAMON_OPS_VADDR];
+ damon_registered_ops[DAMON_OPS_VADDR] = (struct damon_operations){};
+ mutex_unlock(&damon_ops_lock);
+
+ ops.id = DAMON_OPS_VADDR;
+ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), 0);
+
+ mutex_lock(&damon_ops_lock);
+ damon_registered_ops[DAMON_OPS_VADDR] = bak;
+ mutex_unlock(&damon_ops_lock);
+
+ /* Check double-registration failure again */
+ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -240,6 +275,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_merge_two),
KUNIT_CASE(damon_test_merge_regions_of),
KUNIT_CASE(damon_test_split_regions_of),
+ KUNIT_CASE(damon_test_ops_registration),
{},
};
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 5ce8d7c867f0..7d25dc582fe3 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -30,7 +30,7 @@ static DEFINE_MUTEX(damon_ops_lock);
static struct damon_operations damon_registered_ops[NR_DAMON_OPS];
/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */
-static bool damon_registered_ops_id(enum damon_ops_id id)
+static bool __damon_is_registered_ops(enum damon_ops_id id)
{
struct damon_operations empty_ops = {};
@@ -40,6 +40,24 @@ static bool damon_registered_ops_id(enum damon_ops_id id)
}
/**
+ * damon_is_registered_ops() - Check if a given damon_operations is registered.
+ * @id: Id of the damon_operations to check if registered.
+ *
+ * Return: true if the ops is set, false otherwise.
+ */
+bool damon_is_registered_ops(enum damon_ops_id id)
+{
+ bool registered;
+
+ if (id >= NR_DAMON_OPS)
+ return false;
+ mutex_lock(&damon_ops_lock);
+ registered = __damon_is_registered_ops(id);
+ mutex_unlock(&damon_ops_lock);
+ return registered;
+}
+
+/**
* damon_register_ops() - Register a monitoring operations set to DAMON.
* @ops: monitoring operations set to register.
*
@@ -56,7 +74,7 @@ int damon_register_ops(struct damon_operations *ops)
return -EINVAL;
mutex_lock(&damon_ops_lock);
/* Fail for already registered ops */
- if (damon_registered_ops_id(ops->id)) {
+ if (__damon_is_registered_ops(ops->id)) {
err = -EINVAL;
goto out;
}
@@ -84,7 +102,7 @@ int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id)
return -EINVAL;
mutex_lock(&damon_ops_lock);
- if (!damon_registered_ops_id(id))
+ if (!__damon_is_registered_ops(id))
err = -EINVAL;
else
ctx->ops = damon_registered_ops[id];
@@ -139,6 +157,79 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t)
damon_free_region(r);
}
+/*
+ * Check whether a region is intersecting an address range
+ *
+ * Returns true if it is.
+ */
+static bool damon_intersect(struct damon_region *r,
+ struct damon_addr_range *re)
+{
+ return !(r->ar.end <= re->start || re->end <= r->ar.start);
+}
+
+/*
+ * damon_set_regions() - Set regions of a target for given address ranges.
+ * @t: the given target.
+ * @ranges: array of new monitoring target ranges.
+ * @nr_ranges: length of @ranges.
+ *
+ * This function adds new regions to, or modify existing regions of a
+ * monitoring target to fit in specific ranges.
+ *
+ * Return: 0 if success, or negative error code otherwise.
+ */
+int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
+ unsigned int nr_ranges)
+{
+ struct damon_region *r, *next;
+ unsigned int i;
+
+ /* Remove regions which are not in the new ranges */
+ damon_for_each_region_safe(r, next, t) {
+ for (i = 0; i < nr_ranges; i++) {
+ if (damon_intersect(r, &ranges[i]))
+ break;
+ }
+ if (i == nr_ranges)
+ damon_destroy_region(r, t);
+ }
+
+ /* Add new regions or resize existing regions to fit in the ranges */
+ for (i = 0; i < nr_ranges; i++) {
+ struct damon_region *first = NULL, *last, *newr;
+ struct damon_addr_range *range;
+
+ range = &ranges[i];
+ /* Get the first/last regions intersecting with the range */
+ damon_for_each_region(r, t) {
+ if (damon_intersect(r, range)) {
+ if (!first)
+ first = r;
+ last = r;
+ }
+ if (r->ar.start >= range->end)
+ break;
+ }
+ if (!first) {
+ /* no region intersects with this range */
+ newr = damon_new_region(
+ ALIGN_DOWN(range->start,
+ DAMON_MIN_REGION),
+ ALIGN(range->end, DAMON_MIN_REGION));
+ if (!newr)
+ return -ENOMEM;
+ damon_insert_region(newr, damon_prev_region(r), r, t);
+ } else {
+ /* resize intersecting regions to fit in this range */
+ first->ar.start = ALIGN_DOWN(range->start,
+ DAMON_MIN_REGION);
+ last->ar.end = ALIGN(range->end, DAMON_MIN_REGION);
+ }
+ }
+ return 0;
+}
+
struct damos *damon_new_scheme(
unsigned long min_sz_region, unsigned long max_sz_region,
unsigned int min_nr_accesses, unsigned int max_nr_accesses,
@@ -1033,6 +1124,10 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
return 0;
kdamond_usleep(min_wait_time);
+
+ if (ctx->callback.after_wmarks_check &&
+ ctx->callback.after_wmarks_check(ctx))
+ break;
}
return -EBUSY;
}
@@ -1042,7 +1137,7 @@ static int kdamond_wait_activation(struct damon_ctx *ctx)
*/
static int kdamond_fn(void *data)
{
- struct damon_ctx *ctx = (struct damon_ctx *)data;
+ struct damon_ctx *ctx = data;
struct damon_target *t;
struct damon_region *r, *next;
unsigned int max_nr_accesses = 0;
@@ -1059,14 +1154,18 @@ static int kdamond_fn(void *data)
sz_limit = damon_region_sz_limit(ctx);
while (!kdamond_need_stop(ctx) && !done) {
- if (kdamond_wait_activation(ctx))
+ if (kdamond_wait_activation(ctx)) {
+ done = true;
continue;
+ }
if (ctx->ops.prepare_access_checks)
ctx->ops.prepare_access_checks(ctx);
if (ctx->callback.after_sampling &&
- ctx->callback.after_sampling(ctx))
+ ctx->callback.after_sampling(ctx)) {
done = true;
+ continue;
+ }
kdamond_usleep(ctx->sample_interval);
@@ -1078,8 +1177,10 @@ static int kdamond_fn(void *data)
max_nr_accesses / 10,
sz_limit);
if (ctx->callback.after_aggregation &&
- ctx->callback.after_aggregation(ctx))
+ ctx->callback.after_aggregation(ctx)) {
done = true;
+ continue;
+ }
kdamond_apply_schemes(ctx);
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index e346cc10d143..10ef20b2003f 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -73,8 +73,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
}
#ifdef CONFIG_MMU_NOTIFIER
- if (mmu_notifier_clear_young(mm, addr,
- addr + ((1UL) << HPAGE_PMD_SHIFT)))
+ if (mmu_notifier_clear_young(mm, addr, addr + HPAGE_PMD_SIZE))
referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index 21474ae63bc7..b40ff5811bb2 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -106,7 +106,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma,
result->accessed = pmd_young(*pvmw.pmd) ||
!folio_test_idle(folio) ||
mmu_notifier_test_young(vma->vm_mm, addr);
- result->page_sz = ((1UL) << HPAGE_PMD_SHIFT);
+ result->page_sz = HPAGE_PMD_SIZE;
#else
WARN_ON_ONCE(1);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index e34c4d0c4d93..8efbfb24f3a1 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -28,7 +28,18 @@
* this.
*/
static bool enabled __read_mostly;
-module_param(enabled, bool, 0600);
+
+/*
+ * Make DAMON_RECLAIM reads the input parameters again, except ``enabled``.
+ *
+ * Input parameters that updated while DAMON_RECLAIM is running are not applied
+ * by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values
+ * of parametrs except ``enabled`` again. Once the re-reading is done, this
+ * parameter is set as ``N``. If invalid parameters are found while the
+ * re-reading, DAMON_RECLAIM will be disabled.
+ */
+static bool commit_inputs __read_mostly;
+module_param(commit_inputs, bool, 0600);
/*
* Time threshold for cold memory regions identification in microseconds.
@@ -227,7 +238,7 @@ static int walk_system_ram(struct resource *res, void *arg)
{
struct damon_reclaim_ram_walk_arg *a = arg;
- if (a->end - a->start < res->end - res->start) {
+ if (a->end - a->start < resource_size(res)) {
a->start = res->start;
a->end = res->end;
}
@@ -290,57 +301,56 @@ static struct damos *damon_reclaim_new_scheme(void)
return scheme;
}
-static int damon_reclaim_turn(bool on)
+static int damon_reclaim_apply_parameters(void)
{
- struct damon_region *region;
struct damos *scheme;
- int err;
-
- if (!on) {
- err = damon_stop(&ctx, 1);
- if (!err)
- kdamond_pid = -1;
- return err;
- }
+ struct damon_addr_range addr_range;
+ int err = 0;
err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
min_nr_regions, max_nr_regions);
if (err)
return err;
+ /* Will be freed by next 'damon_set_schemes()' below */
+ scheme = damon_reclaim_new_scheme();
+ if (!scheme)
+ return -ENOMEM;
+ err = damon_set_schemes(ctx, &scheme, 1);
+ if (err)
+ return err;
+
if (monitor_region_start > monitor_region_end)
return -EINVAL;
if (!monitor_region_start && !monitor_region_end &&
!get_monitoring_region(&monitor_region_start,
&monitor_region_end))
return -EINVAL;
- /* DAMON will free this on its own when finish monitoring */
- region = damon_new_region(monitor_region_start, monitor_region_end);
- if (!region)
- return -ENOMEM;
- damon_add_region(region, target);
+ addr_range.start = monitor_region_start;
+ addr_range.end = monitor_region_end;
+ return damon_set_regions(target, &addr_range, 1);
+}
- /* Will be freed by 'damon_set_schemes()' below */
- scheme = damon_reclaim_new_scheme();
- if (!scheme) {
- err = -ENOMEM;
- goto free_region_out;
+static int damon_reclaim_turn(bool on)
+{
+ int err;
+
+ if (!on) {
+ err = damon_stop(&ctx, 1);
+ if (!err)
+ kdamond_pid = -1;
+ return err;
}
- err = damon_set_schemes(ctx, &scheme, 1);
+
+ err = damon_reclaim_apply_parameters();
if (err)
- goto free_scheme_out;
+ return err;
err = damon_start(&ctx, 1, true);
- if (!err) {
- kdamond_pid = ctx->kdamond->pid;
- return 0;
- }
-
-free_scheme_out:
- damon_destroy_scheme(scheme);
-free_region_out:
- damon_destroy_region(region, target);
- return err;
+ if (err)
+ return err;
+ kdamond_pid = ctx->kdamond->pid;
+ return 0;
}
#define ENABLE_CHECK_INTERVAL_MS 1000
@@ -358,14 +368,39 @@ static void damon_reclaim_timer_fn(struct work_struct *work)
enabled = last_enabled;
}
- schedule_delayed_work(&damon_reclaim_timer,
+ if (enabled)
+ schedule_delayed_work(&damon_reclaim_timer,
msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS));
}
static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
+static int enabled_store(const char *val,
+ const struct kernel_param *kp)
+{
+ int rc = param_set_bool(val, kp);
+
+ if (rc < 0)
+ return rc;
+
+ if (enabled)
+ schedule_delayed_work(&damon_reclaim_timer, 0);
+
+ return 0;
+}
+
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = enabled_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled,
+ "Enable or disable DAMON_RECLAIM (default: disabled)");
+
static int damon_reclaim_after_aggregation(struct damon_ctx *c)
{
struct damos *s;
+ int err = 0;
/* update the stats parameter */
damon_for_each_scheme(s, c) {
@@ -375,7 +410,23 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c)
bytes_reclaimed_regions = s->stat.sz_applied;
nr_quota_exceeds = s->stat.qt_exceeds;
}
- return 0;
+
+ if (commit_inputs) {
+ err = damon_reclaim_apply_parameters();
+ commit_inputs = false;
+ }
+ return err;
+}
+
+static int damon_reclaim_after_wmarks_check(struct damon_ctx *c)
+{
+ int err = 0;
+
+ if (commit_inputs) {
+ err = damon_reclaim_apply_parameters();
+ commit_inputs = false;
+ }
+ return err;
}
static int __init damon_reclaim_init(void)
@@ -387,6 +438,7 @@ static int __init damon_reclaim_init(void)
if (damon_select_ops(ctx, DAMON_OPS_PADDR))
return -EINVAL;
+ ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
target = damon_new_target();
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 48e434cd43d8..09f9e8ca3d1f 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1694,6 +1694,7 @@ static struct kobj_type damon_sysfs_attrs_ktype = {
/* This should match with enum damon_ops_id */
static const char * const damon_sysfs_ops_strs[] = {
"vaddr",
+ "fvaddr",
"paddr",
};
@@ -1810,6 +1811,21 @@ static void damon_sysfs_context_rm_dirs(struct damon_sysfs_context *context)
kobject_put(&context->schemes->kobj);
}
+static ssize_t avail_operations_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ enum damon_ops_id id;
+ int len = 0;
+
+ for (id = 0; id < NR_DAMON_OPS; id++) {
+ if (!damon_is_registered_ops(id))
+ continue;
+ len += sysfs_emit_at(buf, len, "%s\n",
+ damon_sysfs_ops_strs[id]);
+ }
+ return len;
+}
+
static ssize_t operations_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -1840,10 +1856,14 @@ static void damon_sysfs_context_release(struct kobject *kobj)
kfree(container_of(kobj, struct damon_sysfs_context, kobj));
}
+static struct kobj_attribute damon_sysfs_context_avail_operations_attr =
+ __ATTR_RO_MODE(avail_operations, 0400);
+
static struct kobj_attribute damon_sysfs_context_operations_attr =
__ATTR_RW_MODE(operations, 0600);
static struct attribute *damon_sysfs_context_attrs[] = {
+ &damon_sysfs_context_avail_operations_attr.attr,
&damon_sysfs_context_operations_attr.attr,
NULL,
};
@@ -2033,6 +2053,54 @@ static bool damon_sysfs_ctx_running(struct damon_ctx *ctx)
return running;
}
+/*
+ * enum damon_sysfs_cmd - Commands for a specific kdamond.
+ */
+enum damon_sysfs_cmd {
+ /* @DAMON_SYSFS_CMD_ON: Turn the kdamond on. */
+ DAMON_SYSFS_CMD_ON,
+ /* @DAMON_SYSFS_CMD_OFF: Turn the kdamond off. */
+ DAMON_SYSFS_CMD_OFF,
+ /* @DAMON_SYSFS_CMD_COMMIT: Update kdamond inputs. */
+ DAMON_SYSFS_CMD_COMMIT,
+ /*
+ * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS: Update scheme stats sysfs
+ * files.
+ */
+ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS,
+ /*
+ * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands.
+ */
+ NR_DAMON_SYSFS_CMDS,
+};
+
+/* Should match with enum damon_sysfs_cmd */
+static const char * const damon_sysfs_cmd_strs[] = {
+ "on",
+ "off",
+ "commit",
+ "update_schemes_stats",
+};
+
+/*
+ * struct damon_sysfs_cmd_request - A request to the DAMON callback.
+ * @cmd: The command that needs to be handled by the callback.
+ * @kdamond: The kobject wrapper that associated to the kdamond thread.
+ *
+ * This structure represents a sysfs command request that need to access some
+ * DAMON context-internal data. Because DAMON context-internal data can be
+ * safely accessed from DAMON callbacks without additional synchronization, the
+ * request will be handled by the DAMON callback. None-``NULL`` @kdamond means
+ * the request is valid.
+ */
+struct damon_sysfs_cmd_request {
+ enum damon_sysfs_cmd cmd;
+ struct damon_sysfs_kdamond *kdamond;
+};
+
+/* Current DAMON callback request. Protected by damon_sysfs_lock. */
+static struct damon_sysfs_cmd_request damon_sysfs_cmd_request;
+
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -2046,7 +2114,9 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
else
running = damon_sysfs_ctx_running(ctx);
- return sysfs_emit(buf, "%s\n", running ? "on" : "off");
+ return sysfs_emit(buf, "%s\n", running ?
+ damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_ON] :
+ damon_sysfs_cmd_strs[DAMON_SYSFS_CMD_OFF]);
}
static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
@@ -2066,7 +2136,8 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
struct damon_target *t, *next;
damon_for_each_target_safe(t, next, ctx) {
- if (ctx->ops.id == DAMON_OPS_VADDR)
+ if (ctx->ops.id == DAMON_OPS_VADDR ||
+ ctx->ops.id == DAMON_OPS_FVADDR)
put_pid(t->pid);
damon_destroy_target(t);
}
@@ -2075,28 +2146,89 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
static int damon_sysfs_set_regions(struct damon_target *t,
struct damon_sysfs_regions *sysfs_regions)
{
- int i;
+ struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr,
+ sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN);
+ int i, err = -EINVAL;
+ if (!ranges)
+ return -ENOMEM;
for (i = 0; i < sysfs_regions->nr; i++) {
struct damon_sysfs_region *sys_region =
sysfs_regions->regions_arr[i];
- struct damon_region *prev, *r;
if (sys_region->start > sys_region->end)
- return -EINVAL;
- r = damon_new_region(sys_region->start, sys_region->end);
- if (!r)
- return -ENOMEM;
- damon_add_region(r, t);
- if (damon_nr_regions(t) > 1) {
- prev = damon_prev_region(r);
- if (prev->ar.end > r->ar.start) {
- damon_destroy_region(r, t);
- return -EINVAL;
- }
- }
+ goto out;
+
+ ranges[i].start = sys_region->start;
+ ranges[i].end = sys_region->end;
+ if (i == 0)
+ continue;
+ if (ranges[i - 1].end > ranges[i].start)
+ goto out;
}
+ err = damon_set_regions(t, ranges, sysfs_regions->nr);
+out:
+ kfree(ranges);
+ return err;
+
+}
+
+static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
+ struct damon_ctx *ctx)
+{
+ struct damon_target *t = damon_new_target();
+ int err = -EINVAL;
+
+ if (!t)
+ return -ENOMEM;
+ if (ctx->ops.id == DAMON_OPS_VADDR ||
+ ctx->ops.id == DAMON_OPS_FVADDR) {
+ t->pid = find_get_pid(sys_target->pid);
+ if (!t->pid)
+ goto destroy_targets_out;
+ }
+ damon_add_target(ctx, t);
+ err = damon_sysfs_set_regions(t, sys_target->regions);
+ if (err)
+ goto destroy_targets_out;
return 0;
+
+destroy_targets_out:
+ damon_sysfs_destroy_targets(ctx);
+ return err;
+}
+
+/*
+ * Search a target in a context that corresponds to the sysfs target input.
+ *
+ * Return: pointer to the target if found, NULL if not found, or negative
+ * error code if the search failed.
+ */
+static struct damon_target *damon_sysfs_existing_target(
+ struct damon_sysfs_target *sys_target, struct damon_ctx *ctx)
+{
+ struct pid *pid;
+ struct damon_target *t;
+
+ if (ctx->ops.id == DAMON_OPS_PADDR) {
+ /* Up to only one target for paddr could exist */
+ damon_for_each_target(t, ctx)
+ return t;
+ return NULL;
+ }
+
+ /* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */
+ pid = find_get_pid(sys_target->pid);
+ if (!pid)
+ return ERR_PTR(-EINVAL);
+ damon_for_each_target(t, ctx) {
+ if (t->pid == pid) {
+ put_pid(pid);
+ return t;
+ }
+ }
+ put_pid(pid);
+ return NULL;
}
static int damon_sysfs_set_targets(struct damon_ctx *ctx,
@@ -2104,28 +2236,22 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx,
{
int i, err;
- for (i = 0; i < sysfs_targets->nr; i++) {
- struct damon_sysfs_target *sys_target =
- sysfs_targets->targets_arr[i];
- struct damon_target *t = damon_new_target();
+ /* Multiple physical address space monitoring targets makes no sense */
+ if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1)
+ return -EINVAL;
- if (!t) {
- damon_sysfs_destroy_targets(ctx);
- return -ENOMEM;
- }
- if (ctx->ops.id == DAMON_OPS_VADDR) {
- t->pid = find_get_pid(sys_target->pid);
- if (!t->pid) {
- damon_sysfs_destroy_targets(ctx);
- return -EINVAL;
- }
- }
- damon_add_target(ctx, t);
- err = damon_sysfs_set_regions(t, sys_target->regions);
- if (err) {
- damon_sysfs_destroy_targets(ctx);
+ for (i = 0; i < sysfs_targets->nr; i++) {
+ struct damon_sysfs_target *st = sysfs_targets->targets_arr[i];
+ struct damon_target *t = damon_sysfs_existing_target(st, ctx);
+
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ if (!t)
+ err = damon_sysfs_add_target(st, ctx);
+ else
+ err = damon_sysfs_set_regions(t, st->regions);
+ if (err)
return err;
- }
}
return 0;
}
@@ -2183,7 +2309,7 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
- if (ctx->ops.id != DAMON_OPS_VADDR)
+ if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR)
return;
mutex_lock(&ctx->kdamond_lock);
@@ -2194,6 +2320,115 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
mutex_unlock(&ctx->kdamond_lock);
}
+/*
+ * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files.
+ * @kdamond: The kobject wrapper that associated to the kdamond thread.
+ *
+ * This function reads the schemes stats of specific kdamond and update the
+ * related values for sysfs files. This function should be called from DAMON
+ * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON
+ * contexts-internal data and DAMON sysfs variables.
+ */
+static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+ struct damon_sysfs_schemes *sysfs_schemes;
+ struct damos *scheme;
+ int schemes_idx = 0;
+
+ if (!ctx)
+ return -EINVAL;
+ sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes;
+ damon_for_each_scheme(scheme, ctx) {
+ struct damon_sysfs_stats *sysfs_stats;
+
+ sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
+ sysfs_stats->nr_tried = scheme->stat.nr_tried;
+ sysfs_stats->sz_tried = scheme->stat.sz_tried;
+ sysfs_stats->nr_applied = scheme->stat.nr_applied;
+ sysfs_stats->sz_applied = scheme->stat.sz_applied;
+ sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
+ }
+ return 0;
+}
+
+static inline bool damon_sysfs_kdamond_running(
+ struct damon_sysfs_kdamond *kdamond)
+{
+ return kdamond->damon_ctx &&
+ damon_sysfs_ctx_running(kdamond->damon_ctx);
+}
+
+/*
+ * damon_sysfs_commit_input() - Commit user inputs to a running kdamond.
+ * @kdamond: The kobject wrapper for the associated kdamond.
+ *
+ * If the sysfs input is wrong, the kdamond will be terminated.
+ */
+static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
+{
+ struct damon_ctx *ctx = kdamond->damon_ctx;
+ struct damon_sysfs_context *sys_ctx;
+ int err = 0;
+
+ if (!damon_sysfs_kdamond_running(kdamond))
+ return -EINVAL;
+ /* TODO: Support multiple contexts per kdamond */
+ if (kdamond->contexts->nr != 1)
+ return -EINVAL;
+
+ sys_ctx = kdamond->contexts->contexts_arr[0];
+
+ err = damon_select_ops(ctx, sys_ctx->ops_id);
+ if (err)
+ return err;
+ err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
+ if (err)
+ return err;
+ err = damon_sysfs_set_targets(ctx, sys_ctx->targets);
+ if (err)
+ return err;
+ err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes);
+ if (err)
+ return err;
+ return err;
+}
+
+/*
+ * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests.
+ * @c: The DAMON context of the callback.
+ *
+ * This function is periodically called back from the kdamond thread for @c.
+ * Then, it checks if there is a waiting DAMON sysfs request and handles it.
+ */
+static int damon_sysfs_cmd_request_callback(struct damon_ctx *c)
+{
+ struct damon_sysfs_kdamond *kdamond;
+ int err = 0;
+
+ /* avoid deadlock due to concurrent state_store('off') */
+ if (!mutex_trylock(&damon_sysfs_lock))
+ return 0;
+ kdamond = damon_sysfs_cmd_request.kdamond;
+ if (!kdamond || kdamond->damon_ctx != c)
+ goto out;
+ switch (damon_sysfs_cmd_request.cmd) {
+ case DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS:
+ err = damon_sysfs_upd_schemes_stats(kdamond);
+ break;
+ case DAMON_SYSFS_CMD_COMMIT:
+ err = damon_sysfs_commit_input(kdamond);
+ break;
+ default:
+ break;
+ }
+ /* Mark the request as invalid now. */
+ damon_sysfs_cmd_request.kdamond = NULL;
+out:
+ mutex_unlock(&damon_sysfs_lock);
+ return err;
+}
+
static struct damon_ctx *damon_sysfs_build_ctx(
struct damon_sysfs_context *sys_ctx)
{
@@ -2216,6 +2451,8 @@ static struct damon_ctx *damon_sysfs_build_ctx(
if (err)
goto out;
+ ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback;
+ ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback;
ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
@@ -2232,6 +2469,8 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
if (kdamond->damon_ctx &&
damon_sysfs_ctx_running(kdamond->damon_ctx))
return -EBUSY;
+ if (damon_sysfs_cmd_request.kdamond == kdamond)
+ return -EBUSY;
/* TODO: support multiple contexts per kdamond */
if (kdamond->contexts->nr != 1)
return -EINVAL;
@@ -2264,28 +2503,62 @@ static int damon_sysfs_turn_damon_off(struct damon_sysfs_kdamond *kdamond)
*/
}
-static int damon_sysfs_update_schemes_stats(struct damon_sysfs_kdamond *kdamond)
-{
- struct damon_ctx *ctx = kdamond->damon_ctx;
- struct damos *scheme;
- int schemes_idx = 0;
+/*
+ * damon_sysfs_handle_cmd() - Handle a command for a specific kdamond.
+ * @cmd: The command to handle.
+ * @kdamond: The kobject wrapper for the associated kdamond.
+ *
+ * This function handles a DAMON sysfs command for a kdamond. For commands
+ * that need to access running DAMON context-internal data, it requests
+ * handling of the command to the DAMON callback
+ * (@damon_sysfs_cmd_request_callback()) and wait until it is properly handled,
+ * or the context is completed.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd,
+ struct damon_sysfs_kdamond *kdamond)
+{
+ bool need_wait = true;
+
+ /* Handle commands that doesn't access DAMON context-internal data */
+ switch (cmd) {
+ case DAMON_SYSFS_CMD_ON:
+ return damon_sysfs_turn_damon_on(kdamond);
+ case DAMON_SYSFS_CMD_OFF:
+ return damon_sysfs_turn_damon_off(kdamond);
+ default:
+ break;
+ }
- if (!ctx)
+ /* Pass the command to DAMON callback for safe DAMON context access */
+ if (damon_sysfs_cmd_request.kdamond)
+ return -EBUSY;
+ if (!damon_sysfs_kdamond_running(kdamond))
return -EINVAL;
- mutex_lock(&ctx->kdamond_lock);
- damon_for_each_scheme(scheme, ctx) {
- struct damon_sysfs_schemes *sysfs_schemes;
- struct damon_sysfs_stats *sysfs_stats;
+ damon_sysfs_cmd_request.cmd = cmd;
+ damon_sysfs_cmd_request.kdamond = kdamond;
- sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes;
- sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats;
- sysfs_stats->nr_tried = scheme->stat.nr_tried;
- sysfs_stats->sz_tried = scheme->stat.sz_tried;
- sysfs_stats->nr_applied = scheme->stat.nr_applied;
- sysfs_stats->sz_applied = scheme->stat.sz_applied;
- sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds;
+ /*
+ * wait until damon_sysfs_cmd_request_callback() handles the request
+ * from kdamond context
+ */
+ mutex_unlock(&damon_sysfs_lock);
+ while (need_wait) {
+ schedule_timeout_idle(msecs_to_jiffies(100));
+ if (!mutex_trylock(&damon_sysfs_lock))
+ continue;
+ if (!damon_sysfs_cmd_request.kdamond) {
+ /* damon_sysfs_cmd_request_callback() handled */
+ need_wait = false;
+ } else if (!damon_sysfs_kdamond_running(kdamond)) {
+ /* kdamond has already finished */
+ need_wait = false;
+ damon_sysfs_cmd_request.kdamond = NULL;
+ }
+ mutex_unlock(&damon_sysfs_lock);
}
- mutex_unlock(&ctx->kdamond_lock);
+ mutex_lock(&damon_sysfs_lock);
return 0;
}
@@ -2294,18 +2567,17 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
{
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
struct damon_sysfs_kdamond, kobj);
- ssize_t ret;
+ enum damon_sysfs_cmd cmd;
+ ssize_t ret = -EINVAL;
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
- if (sysfs_streq(buf, "on"))
- ret = damon_sysfs_turn_damon_on(kdamond);
- else if (sysfs_streq(buf, "off"))
- ret = damon_sysfs_turn_damon_off(kdamond);
- else if (sysfs_streq(buf, "update_schemes_stats"))
- ret = damon_sysfs_update_schemes_stats(kdamond);
- else
- ret = -EINVAL;
+ for (cmd = 0; cmd < NR_DAMON_SYSFS_CMDS; cmd++) {
+ if (sysfs_streq(buf, damon_sysfs_cmd_strs[cmd])) {
+ ret = damon_sysfs_handle_cmd(cmd, kdamond);
+ break;
+ }
+ }
mutex_unlock(&damon_sysfs_lock);
if (!ret)
ret = count;
@@ -2424,6 +2696,12 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds,
if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr))
return -EBUSY;
+ for (i = 0; i < kdamonds->nr; i++) {
+ if (damon_sysfs_cmd_request.kdamond ==
+ kdamonds->kdamonds_arr[i])
+ return -EBUSY;
+ }
+
damon_sysfs_kdamonds_rm_dirs(kdamonds);
if (!nr_kdamonds)
return 0;
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index 1a55bb6c36c3..d4f55f349100 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -109,7 +109,7 @@ static struct damon_region *__nth_region_of(struct damon_target *t, int idx)
}
/*
- * Test 'damon_va_apply_three_regions()'
+ * Test 'damon_set_regions()'
*
* test kunit object
* regions an array containing start/end addresses of current
@@ -124,7 +124,7 @@ static struct damon_region *__nth_region_of(struct damon_target *t, int idx)
* the change, DAMON periodically reads the mappings, simplifies it to the
* three regions, and updates the monitoring target regions to fit in the three
* regions. The update of current target regions is the role of
- * 'damon_va_apply_three_regions()'.
+ * 'damon_set_regions()'.
*
* This test passes the given target regions and the new three regions that
* need to be applied to the function and check whether it updates the regions
@@ -145,7 +145,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
damon_add_region(r, t);
}
- damon_va_apply_three_regions(t, three_regions);
+ damon_set_regions(t, three_regions, 3);
for (i = 0; i < nr_expected / 2; i++) {
r = __nth_region_of(t, i);
@@ -281,14 +281,16 @@ static void damon_test_split_evenly_succ(struct kunit *test,
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces);
damon_for_each_region(r, t) {
- if (i == nr_pieces - 1)
+ if (i == nr_pieces - 1) {
+ KUNIT_EXPECT_EQ(test,
+ r->ar.start, start + i * expected_width);
+ KUNIT_EXPECT_EQ(test, r->ar.end, end);
break;
+ }
KUNIT_EXPECT_EQ(test,
r->ar.start, start + i++ * expected_width);
KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width);
}
- KUNIT_EXPECT_EQ(test, r->ar.start, start + i * expected_width);
- KUNIT_EXPECT_EQ(test, r->ar.end, end);
damon_free_target(t);
}
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index b2ec0aa1ff45..59e1653799f8 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -282,77 +282,6 @@ static void damon_va_init(struct damon_ctx *ctx)
}
/*
- * Functions for the dynamic monitoring target regions update
- */
-
-/*
- * Check whether a region is intersecting an address range
- *
- * Returns true if it is.
- */
-static bool damon_intersect(struct damon_region *r,
- struct damon_addr_range *re)
-{
- return !(r->ar.end <= re->start || re->end <= r->ar.start);
-}
-
-/*
- * Update damon regions for the three big regions of the given target
- *
- * t the given target
- * bregions the three big regions of the target
- */
-static void damon_va_apply_three_regions(struct damon_target *t,
- struct damon_addr_range bregions[3])
-{
- struct damon_region *r, *next;
- unsigned int i;
-
- /* Remove regions which are not in the three big regions now */
- damon_for_each_region_safe(r, next, t) {
- for (i = 0; i < 3; i++) {
- if (damon_intersect(r, &bregions[i]))
- break;
- }
- if (i == 3)
- damon_destroy_region(r, t);
- }
-
- /* Adjust intersecting regions to fit with the three big regions */
- for (i = 0; i < 3; i++) {
- struct damon_region *first = NULL, *last;
- struct damon_region *newr;
- struct damon_addr_range *br;
-
- br = &bregions[i];
- /* Get the first and last regions which intersects with br */
- damon_for_each_region(r, t) {
- if (damon_intersect(r, br)) {
- if (!first)
- first = r;
- last = r;
- }
- if (r->ar.start >= br->end)
- break;
- }
- if (!first) {
- /* no damon_region intersects with this big region */
- newr = damon_new_region(
- ALIGN_DOWN(br->start,
- DAMON_MIN_REGION),
- ALIGN(br->end, DAMON_MIN_REGION));
- if (!newr)
- continue;
- damon_insert_region(newr, damon_prev_region(r), r, t);
- } else {
- first->ar.start = ALIGN_DOWN(br->start,
- DAMON_MIN_REGION);
- last->ar.end = ALIGN(br->end, DAMON_MIN_REGION);
- }
- }
-}
-
-/*
* Update regions for current memory mappings
*/
static void damon_va_update(struct damon_ctx *ctx)
@@ -363,7 +292,7 @@ static void damon_va_update(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
if (damon_va_three_regions(t, three_regions))
continue;
- damon_va_apply_three_regions(t, three_regions);
+ damon_set_regions(t, three_regions, 3);
}
}
@@ -513,7 +442,7 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
if (pmd_young(*pmd) || !page_is_idle(page) ||
mmu_notifier_test_young(walk->mm,
addr)) {
- *priv->page_sz = ((1UL) << HPAGE_PMD_SHIFT);
+ *priv->page_sz = HPAGE_PMD_SIZE;
priv->young = true;
}
put_page(page);
@@ -753,8 +682,19 @@ static int __init damon_va_initcall(void)
.apply_scheme = damon_va_apply_scheme,
.get_scheme_score = damon_va_scheme_score,
};
-
- return damon_register_ops(&ops);
+ /* ops for fixed virtual address ranges */
+ struct damon_operations ops_fvaddr = ops;
+ int err;
+
+ /* Don't set the monitoring target regions for the entire mapping */
+ ops_fvaddr.id = DAMON_OPS_FVADDR;
+ ops_fvaddr.init = NULL;
+ ops_fvaddr.update = NULL;
+
+ err = damon_register_ops(&ops);
+ if (err)
+ return err;
+ return damon_register_ops(&ops_fvaddr);
};
subsys_initcall(damon_va_initcall);
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index db2abd9e415b..1ab091f49fc0 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -93,7 +93,7 @@ struct pgtable_debug_args {
static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
{
- pgprot_t prot = protection_map[idx];
+ pgprot_t prot = vm_get_page_prot(idx);
pte_t pte = pfn_pte(args->fixed_pte_pfn, prot);
unsigned long val = idx, *ptr = &val;
@@ -101,7 +101,7 @@ static void __init pte_basic_tests(struct pgtable_debug_args *args, int idx)
/*
* This test needs to be executed after the given page table entry
- * is created with pfn_pte() to make sure that protection_map[idx]
+ * is created with pfn_pte() to make sure that vm_get_page_prot(idx)
* does not have the dirty bit enabled from the beginning. This is
* important for platforms like arm64 where (!PTE_RDONLY) indicate
* dirty bit being set.
@@ -190,7 +190,7 @@ static void __init pte_savedwrite_tests(struct pgtable_debug_args *args)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
{
- pgprot_t prot = protection_map[idx];
+ pgprot_t prot = vm_get_page_prot(idx);
unsigned long val = idx, *ptr = &val;
pmd_t pmd;
@@ -202,7 +202,7 @@ static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx)
/*
* This test needs to be executed after the given page table entry
- * is created with pfn_pmd() to make sure that protection_map[idx]
+ * is created with pfn_pmd() to make sure that vm_get_page_prot(idx)
* does not have the dirty bit enabled from the beginning. This is
* important for platforms like arm64 where (!PTE_RDONLY) indicate
* dirty bit being set.
@@ -325,7 +325,7 @@ static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args)
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx)
{
- pgprot_t prot = protection_map[idx];
+ pgprot_t prot = vm_get_page_prot(idx);
unsigned long val = idx, *ptr = &val;
pud_t pud;
@@ -337,7 +337,7 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx)
/*
* This test needs to be executed after the given page table entry
- * is created with pfn_pud() to make sure that protection_map[idx]
+ * is created with pfn_pud() to make sure that vm_get_page_prot(idx)
* does not have the dirty bit enabled from the beginning. This is
* important for platforms like arm64 where (!PTE_RDONLY) indicate
* dirty bit being set.
@@ -837,6 +837,19 @@ static void __init pmd_soft_dirty_tests(struct pgtable_debug_args *args) { }
static void __init pmd_swap_soft_dirty_tests(struct pgtable_debug_args *args) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pte_swap_exclusive_tests(struct pgtable_debug_args *args)
+{
+#ifdef __HAVE_ARCH_PTE_SWP_EXCLUSIVE
+ pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot);
+
+ pr_debug("Validating PTE swap exclusive\n");
+ pte = pte_swp_mkexclusive(pte);
+ WARN_ON(!pte_swp_exclusive(pte));
+ pte = pte_swp_clear_exclusive(pte);
+ WARN_ON(pte_swp_exclusive(pte));
+#endif /* __HAVE_ARCH_PTE_SWP_EXCLUSIVE */
+}
+
static void __init pte_swap_tests(struct pgtable_debug_args *args)
{
swp_entry_t swp;
@@ -1106,14 +1119,14 @@ static int __init init_args(struct pgtable_debug_args *args)
/*
* Initialize the debugging data.
*
- * protection_map[0] (or even protection_map[8]) will help create
- * page table entries with PROT_NONE permission as required for
- * pxx_protnone_tests().
+ * vm_get_page_prot(VM_NONE) or vm_get_page_prot(VM_SHARED|VM_NONE)
+ * will help create page table entries with PROT_NONE permission as
+ * required for pxx_protnone_tests().
*/
memset(args, 0, sizeof(*args));
args->vaddr = get_random_vaddr();
args->page_prot = vm_get_page_prot(VMFLAGS);
- args->page_prot_none = protection_map[0];
+ args->page_prot_none = vm_get_page_prot(VM_NONE);
args->is_contiguous_page = false;
args->pud_pfn = ULONG_MAX;
args->pmd_pfn = ULONG_MAX;
@@ -1248,12 +1261,19 @@ static int __init debug_vm_pgtable(void)
return ret;
/*
- * Iterate over the protection_map[] to make sure that all
+ * Iterate over each possible vm_flags to make sure that all
* the basic page table transformation validations just hold
* true irrespective of the starting protection value for a
* given page table entry.
+ *
+ * Protection based vm_flags combinatins are always linear
+ * and increasing i.e starting from VM_NONE and going upto
+ * (VM_SHARED | READ | WRITE | EXEC).
*/
- for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) {
+#define VM_FLAGS_START (VM_NONE)
+#define VM_FLAGS_END (VM_SHARED | VM_EXEC | VM_WRITE | VM_READ)
+
+ for (idx = VM_FLAGS_START; idx <= VM_FLAGS_END; idx++) {
pte_basic_tests(&args, idx);
pmd_basic_tests(&args, idx);
pud_basic_tests(&args, idx);
@@ -1288,6 +1308,8 @@ static int __init debug_vm_pgtable(void)
pte_swap_soft_dirty_tests(&args);
pmd_swap_soft_dirty_tests(&args);
+ pte_swap_exclusive_tests(&args);
+
pte_swap_tests(&args);
pmd_swap_tests(&args);
diff --git a/mm/failslab.c b/mm/failslab.c
index f92fed91ac23..58df9789f1d2 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -30,6 +30,9 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
return false;
+ if (gfpflags & __GFP_NOWARN)
+ failslab.attr.no_warn = true;
+
return should_fail(&failslab.attr, s->object_size);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index fa0ca674450f..9daeaab36081 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3376,6 +3376,11 @@ again:
vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
+ /*
+ * NOTE: If there're PTE markers, we'll leave them to be
+ * handled in the specific fault path, and it'll prohibit the
+ * fault-around logic.
+ */
if (!pte_none(*vmf->pte))
goto unlock;
diff --git a/mm/gup.c b/mm/gup.c
index 501bc150792c..551264407624 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,6 +29,39 @@ struct follow_page_context {
unsigned int page_mask;
};
+static inline void sanity_check_pinned_pages(struct page **pages,
+ unsigned long npages)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_VM))
+ return;
+
+ /*
+ * We only pin anonymous pages if they are exclusive. Once pinned, we
+ * can no longer turn them possibly shared and PageAnonExclusive() will
+ * stick around until the page is freed.
+ *
+ * We'd like to verify that our pinned anonymous pages are still mapped
+ * exclusively. The issue with anon THP is that we don't know how
+ * they are/were mapped when pinning them. However, for anon
+ * THP we can assume that either the given page (PTE-mapped THP) or
+ * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
+ * neither is the case, there is certainly something wrong.
+ */
+ for (; npages; npages--, pages++) {
+ struct page *page = *pages;
+ struct folio *folio = page_folio(page);
+
+ if (!folio_test_anon(folio))
+ continue;
+ if (!folio_test_large(folio) || folio_test_hugetlb(folio))
+ VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
+ else
+ /* Either a PTE-mapped or a PMD-mapped THP. */
+ VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
+ !PageAnonExclusive(page), page);
+ }
+}
+
/*
* Return the folio with ref appropriately incremented,
* or NULL if that failed.
@@ -204,6 +237,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
*/
void unpin_user_page(struct page *page)
{
+ sanity_check_pinned_pages(&page, 1);
gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);
@@ -272,6 +306,7 @@ void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
return;
}
+ sanity_check_pinned_pages(pages, npages);
for (i = 0; i < npages; i += nr) {
folio = gup_folio_next(pages, npages, i, &nr);
/*
@@ -344,6 +379,23 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
+static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
+{
+ unsigned long i;
+ struct folio *folio;
+ unsigned int nr;
+
+ /*
+ * Don't perform any sanity checks because we might have raced with
+ * fork() and some anonymous pages might now actually be shared --
+ * which is why we're unpinning after all.
+ */
+ for (i = 0; i < npages; i += nr) {
+ folio = gup_folio_next(pages, npages, i, &nr);
+ gup_put_folio(folio, nr, FOLL_PIN);
+ }
+}
+
/**
* unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
@@ -367,6 +419,7 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
if (WARN_ON(IS_ERR_VALUE(npages)))
return;
+ sanity_check_pinned_pages(pages, npages);
for (i = 0; i < npages; i += nr) {
folio = gup_folio_next(pages, npages, i, &nr);
gup_put_folio(folio, nr, FOLL_PIN);
@@ -506,6 +559,14 @@ retry:
}
}
+ if (!pte_write(pte) && gup_must_unshare(flags, page)) {
+ page = ERR_PTR(-EMLINK);
+ goto out;
+ }
+
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
+
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
if (unlikely(!try_grab_page(page, flags))) {
page = ERR_PTR(-ENOMEM);
@@ -732,6 +793,11 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
* When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
* the device's dev_pagemap metadata to avoid repeating expensive lookups.
*
+ * When getting an anonymous page and the caller has to trigger unsharing
+ * of a shared anonymous page first, -EMLINK is returned. The caller should
+ * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
+ * relevant with FOLL_PIN and !FOLL_WRITE.
+ *
* On output, the @ctx->page_mask is set according to the size of the page.
*
* Return: the mapped (struct page *), %NULL if no mapping exists, or
@@ -787,6 +853,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
if (vma_is_secretmem(vma))
return NULL;
+ if (foll_flags & FOLL_PIN)
+ return NULL;
+
page = follow_page_mask(vma, address, foll_flags, &ctx);
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
@@ -852,7 +921,8 @@ unmap:
* is, *@locked will be set to 0 and -EBUSY returned.
*/
static int faultin_page(struct vm_area_struct *vma,
- unsigned long address, unsigned int *flags, int *locked)
+ unsigned long address, unsigned int *flags, bool unshare,
+ int *locked)
{
unsigned int fault_flags = 0;
vm_fault_t ret;
@@ -874,6 +944,11 @@ static int faultin_page(struct vm_area_struct *vma,
*/
fault_flags |= FAULT_FLAG_TRIED;
}
+ if (unshare) {
+ fault_flags |= FAULT_FLAG_UNSHARE;
+ /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
+ VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
+ }
ret = handle_mm_fault(vma, address, fault_flags, NULL);
if (ret & VM_FAULT_ERROR) {
@@ -1095,8 +1170,9 @@ retry:
cond_resched();
page = follow_page_mask(vma, start, foll_flags, &ctx);
- if (!page) {
- ret = faultin_page(vma, start, &foll_flags, locked);
+ if (!page || PTR_ERR(page) == -EMLINK) {
+ ret = faultin_page(vma, start, &foll_flags,
+ PTR_ERR(page) == -EMLINK, locked);
switch (ret) {
case 0:
goto retry;
@@ -2227,6 +2303,11 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
+ if (!pte_write(pte) && gup_must_unshare(flags, page)) {
+ gup_put_folio(folio, 1, flags);
+ goto pte_unmap;
+ }
+
/*
* We need to make the page accessible if and only if we are
* going to access its content (the FOLL_PIN case). Please
@@ -2407,6 +2488,11 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
return 0;
}
+ if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
*nr += refs;
folio_set_referenced(folio);
return 1;
@@ -2468,6 +2554,11 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
*nr += refs;
folio_set_referenced(folio);
return 1;
@@ -2503,6 +2594,11 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) {
+ gup_put_folio(folio, refs, flags);
+ return 0;
+ }
+
*nr += refs;
folio_set_referenced(folio);
return 1;
@@ -2740,8 +2836,10 @@ static unsigned long lockless_pages_from_mm(unsigned long start,
*/
if (gup_flags & FOLL_PIN) {
if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
- unpin_user_pages(pages, nr_pinned);
+ unpin_user_pages_lockless(pages, nr_pinned);
return 0;
+ } else {
+ sanity_check_pinned_pages(pages, nr_pinned);
}
}
return nr_pinned;
@@ -2900,6 +2998,9 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
@@ -2922,6 +3023,9 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
*/
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return 0;
+
+ if (WARN_ON_ONCE(!pages))
+ return 0;
/*
* FOLL_FAST_ONLY is required in order to match the API description of
* this routine: no fall back to regular ("slow") GUP.
@@ -2949,8 +3053,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
+ * Should be at least nr_pages long.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @locked: pointer to lock flag indicating whether lock is held and
@@ -2973,6 +3076,9 @@ long pin_user_pages_remote(struct mm_struct *mm,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
@@ -2986,8 +3092,7 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying lookup behaviour
* @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
+ * Should be at least nr_pages long.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
@@ -3005,6 +3110,9 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags);
@@ -3023,6 +3131,9 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_GET))
return -EINVAL;
+ if (WARN_ON_ONCE(!pages))
+ return -EINVAL;
+
gup_flags |= FOLL_PIN;
return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
diff --git a/mm/hmm.c b/mm/hmm.c
index af71aac3140e..3fd3242c5e50 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -239,7 +239,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
pte_t pte = *ptep;
uint64_t pfn_req_flags = *hmm_pfn;
- if (pte_none(pte)) {
+ if (pte_none_mostly(pte)) {
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 910a138e9859..a77c78a2b6b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -39,6 +39,7 @@
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#include "swap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
@@ -68,13 +69,6 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-static inline bool file_thp_enabled(struct vm_area_struct *vma)
-{
- return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
- !inode_is_open_for_write(vma->vm_file->f_inode) &&
- (vma->vm_flags & VM_EXEC);
-}
-
bool transparent_hugepage_active(struct vm_area_struct *vma)
{
/* The addr is used to check if the vma size fits */
@@ -86,8 +80,8 @@ bool transparent_hugepage_active(struct vm_area_struct *vma)
return __transparent_hugepage_enabled(vma);
if (vma_is_shmem(vma))
return shmem_huge_enabled(vma);
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
- return file_thp_enabled(vma);
+ if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
+ return true;
return false;
}
@@ -647,7 +641,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- page_add_new_anon_rmap(page, vma, haddr, true);
+ page_add_new_anon_rmap(page, vma, haddr);
lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
@@ -725,15 +719,15 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
gfp_t gfp;
- struct page *page;
+ struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
- return VM_FAULT_OOM;
+ khugepaged_enter(vma, vma->vm_flags);
+
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
transparent_hugepage_use_zero_page()) {
@@ -774,13 +768,12 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return ret;
}
gfp = vma_thp_gfp_mask(vma);
- page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
- if (unlikely(!page)) {
+ folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
+ if (unlikely(!folio)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- prep_transhuge_page(page);
- return __do_huge_pmd_anonymous_page(vmf, page, gfp);
+ return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1054,7 +1047,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
swp_entry_t entry = pmd_to_swp_entry(pmd);
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (is_writable_migration_entry(entry)) {
+ if (!is_readable_migration_entry(entry)) {
entry = make_readable_migration_entry(
swp_offset(entry));
pmd = swp_entry_to_pmd(entry);
@@ -1097,23 +1090,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
- /*
- * If this page is a potentially pinned page, split and retry the fault
- * with smaller page size. Normally this should not happen because the
- * userspace should use MADV_DONTFORK upon pinned regions. This is a
- * best effort that the pinned pages won't be replaced by another
- * random page during the coming copy-on-write.
- */
- if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
+ get_page(src_page);
+ if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
+ /* Page maybe pinned: split and retry the fault on PTEs. */
+ put_page(src_page);
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
__split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
return -EAGAIN;
}
-
- get_page(src_page);
- page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
mm_inc_nr_ptes(dst_mm);
@@ -1217,14 +1203,10 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* No huge zero pud yet */
}
- /* Please refer to comments in copy_huge_pmd() */
- if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- __split_huge_pud(vma, src_pud, addr);
- return -EAGAIN;
- }
-
+ /*
+ * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
+ * and split if duplicating fails.
+ */
pudp_set_wrprotect(src_mm, addr, src_pud);
pud = pud_mkold(pud_wrprotect(pud));
set_pud_at(dst_mm, addr, dst_pud, pud);
@@ -1282,6 +1264,7 @@ unlock:
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
@@ -1290,6 +1273,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
+ VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
+ VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
+
if (is_huge_zero_pmd(orig_pmd))
goto fallback;
@@ -1303,6 +1289,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
+ /* Early check when only holding the PT lock. */
+ if (PageAnonExclusive(page))
+ goto reuse;
+
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
@@ -1317,8 +1307,14 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
put_page(page);
}
+ /* Recheck after temporarily dropping the PT lock. */
+ if (PageAnonExclusive(page)) {
+ unlock_page(page);
+ goto reuse;
+ }
+
/*
- * See do_wp_page(): we can only map the page writable if there are
+ * See do_wp_page(): we can only reuse the page exclusively if there are
* no additional references. Note that we always drain the LRU
* pagevecs immediately after adding a THP.
*/
@@ -1328,11 +1324,18 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
try_to_free_swap(page);
if (page_count(page) == 1) {
pmd_t entry;
+
+ page_move_anon_rmap(page, vma);
+ unlock_page(page);
+reuse:
+ if (unlikely(unshare)) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- unlock_page(page);
spin_unlock(vmf->ptl);
return VM_FAULT_WRITE;
}
@@ -1379,6 +1382,12 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+ if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
+ return ERR_PTR(-EMLINK);
+
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
+
if (!try_grab_page(page, flags))
return ERR_PTR(-ENOMEM);
@@ -1692,18 +1701,21 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
* or if prot_numa but THP migration is not supported
* - HPAGE_PMD_NR if protections changed and TLB flush necessary
*/
-int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
+int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr, pgprot_t newprot,
+ unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
- pmd_t entry;
+ pmd_t oldpmd, entry;
bool preserve_write;
int ret;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+
if (prot_numa && !thp_migration_supported())
return 1;
@@ -1717,6 +1729,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (is_swap_pmd(*pmd)) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
+ struct page *page = pfn_swap_entry_to_page(entry);
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
if (is_writable_migration_entry(entry)) {
@@ -1725,8 +1738,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* A protection check is difficult so
* just be safe and disable write
*/
- entry = make_readable_migration_entry(
- swp_offset(entry));
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
@@ -1778,12 +1793,12 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* The race makes MADV_DONTNEED miss the huge pmd and don't clear it
* which may break userspace.
*
- * pmdp_invalidate() is required to make sure we don't miss
+ * pmdp_invalidate_ad() is required to make sure we don't miss
* dirty/young flags set by hardware.
*/
- entry = pmdp_invalidate(vma, addr, pmd);
+ oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
- entry = pmd_modify(entry, newprot);
+ entry = pmd_modify(oldpmd, newprot);
if (preserve_write)
entry = pmd_mk_savedwrite(entry);
if (uffd_wp) {
@@ -1799,6 +1814,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
+
+ if (huge_pmd_needs_flush(oldpmd, entry))
+ tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
+
BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
unlock:
spin_unlock(ptl);
@@ -1946,6 +1965,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
+ bool anon_exclusive = false;
unsigned long addr;
int i;
@@ -2027,6 +2047,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pmd_to_swp_entry(old_pmd);
page = pfn_swap_entry_to_page(entry);
write = is_writable_migration_entry(entry);
+ if (PageAnon(page))
+ anon_exclusive = is_readable_exclusive_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
@@ -2038,8 +2060,26 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
uffd_wp = pmd_uffd_wp(old_pmd);
+
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
+
+ /*
+ * Without "freeze", we'll simply split the PMD, propagating the
+ * PageAnonExclusive() flag for each PTE by setting it for
+ * each subpage -- no need to (temporarily) clear.
+ *
+ * With "freeze" we want to replace mapped pages by
+ * migration entries right away. This is only possible if we
+ * managed to clear PageAnonExclusive() -- see
+ * set_pmd_migration_entry().
+ *
+ * In case we cannot clear PageAnonExclusive(), split the PMD
+ * only and let try_to_migrate_one() fail later.
+ */
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
+ freeze = false;
}
/*
@@ -2061,6 +2101,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
if (write)
swp_entry = make_writable_migration_entry(
page_to_pfn(page + i));
+ else if (anon_exclusive)
+ swp_entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page + i));
else
swp_entry = make_readable_migration_entry(
page_to_pfn(page + i));
@@ -2072,6 +2115,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
entry = maybe_mkwrite(entry, vma);
+ if (anon_exclusive)
+ SetPageAnonExclusive(page + i);
if (!write)
entry = pte_wrprotect(entry);
if (!young)
@@ -2249,8 +2294,6 @@ static void unmap_page(struct page *page)
try_to_migrate(folio, ttu_flags);
else
try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
-
- VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
static void remap_page(struct folio *folio, unsigned long nr)
@@ -2305,6 +2348,13 @@ static void __split_huge_page_tail(struct page *head, int tail,
*
* After successful get_page_unless_zero() might follow flags change,
* for example lock_page() which set PG_waiters.
+ *
+ * Note that for mapped sub-pages of an anonymous THP,
+ * PG_anon_exclusive has been cleared in unmap_page() and is stored in
+ * the migration entry instead from where remap_page() will restore it.
+ * We can still have PG_anon_exclusive set on effectively unmapped and
+ * unreferenced sub-pages of an anonymous THP: we can simply drop
+ * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3035,25 +3085,35 @@ late_initcall(split_huge_pages_debugfs);
#endif
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
struct page *page)
{
struct vm_area_struct *vma = pvmw->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long address = pvmw->address;
+ bool anon_exclusive;
pmd_t pmdval;
swp_entry_t entry;
pmd_t pmdswp;
if (!(pvmw->pmd && !pvmw->pte))
- return;
+ return 0;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (anon_exclusive && page_try_share_anon_rmap(page)) {
+ set_pmd_at(mm, address, pvmw->pmd, pmdval);
+ return -EBUSY;
+ }
+
if (pmd_dirty(pmdval))
set_page_dirty(page);
if (pmd_write(pmdval))
entry = make_writable_migration_entry(page_to_pfn(page));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
else
entry = make_readable_migration_entry(page_to_pfn(page));
pmdswp = swp_entry_to_pmd(entry);
@@ -3063,6 +3123,8 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
page_remove_rmap(page, vma, true);
put_page(page);
trace_set_migration_pmd(address, pmd_val(pmdswp));
+
+ return 0;
}
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -3087,10 +3149,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
- if (PageAnon(new))
- page_add_anon_rmap(new, vma, mmun_start, true);
- else
+ if (PageAnon(new)) {
+ rmap_t rmap_flags = RMAP_COMPOUND;
+
+ if (!is_readable_migration_entry(entry))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ page_add_anon_rmap(new, vma, mmun_start, rmap_flags);
+ } else {
page_add_file_rmap(new, vma, true);
+ }
+ VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
/* No need to invalidate - it was non-present before */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3fc721789743..01f0e2e5ab48 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -370,7 +370,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
}
static inline long
-hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
+hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
long to, struct hstate *h, struct hugetlb_cgroup *cg,
long *regions_needed)
{
@@ -379,7 +379,7 @@ hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(map, from, to);
record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
- list_add(&nrg->link, rg->link.prev);
+ list_add(&nrg->link, rg);
coalesce_file_region(map, nrg);
} else
*regions_needed += 1;
@@ -402,47 +402,52 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
long add = 0;
struct list_head *head = &resv->regions;
long last_accounted_offset = f;
- struct file_region *rg = NULL, *trg = NULL;
+ struct file_region *iter, *trg = NULL;
+ struct list_head *rg = NULL;
if (regions_needed)
*regions_needed = 0;
/* In this loop, we essentially handle an entry for the range
- * [last_accounted_offset, rg->from), at every iteration, with some
+ * [last_accounted_offset, iter->from), at every iteration, with some
* bounds checking.
*/
- list_for_each_entry_safe(rg, trg, head, link) {
+ list_for_each_entry_safe(iter, trg, head, link) {
/* Skip irrelevant regions that start before our range. */
- if (rg->from < f) {
+ if (iter->from < f) {
/* If this region ends after the last accounted offset,
* then we need to update last_accounted_offset.
*/
- if (rg->to > last_accounted_offset)
- last_accounted_offset = rg->to;
+ if (iter->to > last_accounted_offset)
+ last_accounted_offset = iter->to;
continue;
}
/* When we find a region that starts beyond our range, we've
* finished.
*/
- if (rg->from >= t)
+ if (iter->from >= t) {
+ rg = iter->link.prev;
break;
+ }
- /* Add an entry for last_accounted_offset -> rg->from, and
+ /* Add an entry for last_accounted_offset -> iter->from, and
* update last_accounted_offset.
*/
- if (rg->from > last_accounted_offset)
- add += hugetlb_resv_map_add(resv, rg,
+ if (iter->from > last_accounted_offset)
+ add += hugetlb_resv_map_add(resv, iter->link.prev,
last_accounted_offset,
- rg->from, h, h_cg,
+ iter->from, h, h_cg,
regions_needed);
- last_accounted_offset = rg->to;
+ last_accounted_offset = iter->to;
}
/* Handle the case where our range extends beyond
* last_accounted_offset.
*/
+ if (!rg)
+ rg = head->prev;
if (last_accounted_offset < t)
add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
t, h, h_cg, regions_needed);
@@ -1535,7 +1540,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
- if (alloc_huge_page_vmemmap(h, page)) {
+ if (hugetlb_vmemmap_alloc(h, page)) {
spin_lock_irq(&hugetlb_lock);
/*
* If we cannot allocate vmemmap pages, just refuse to free the
@@ -1612,7 +1617,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
static inline void flush_free_hpage_work(struct hstate *h)
{
- if (free_vmemmap_pages_per_hpage(h))
+ if (hugetlb_optimize_vmemmap_pages(h))
flush_work(&free_hpage_work);
}
@@ -1672,6 +1677,8 @@ void free_huge_page(struct page *page)
VM_BUG_ON_PAGE(page_mapcount(page), page);
hugetlb_set_page_subpool(page, NULL);
+ if (PageAnon(page))
+ __ClearPageAnonExclusive(page);
page->mapping = NULL;
restore_reserve = HPageRestoreReserve(page);
ClearHPageRestoreReserve(page);
@@ -1732,7 +1739,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
static void __prep_new_huge_page(struct hstate *h, struct page *page)
{
- free_huge_page_vmemmap(h, page);
+ hugetlb_vmemmap_free(h, page);
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
@@ -2105,7 +2112,7 @@ retry:
* Attempt to allocate vmemmmap here so that we can take
* appropriate action on failure.
*/
- rc = alloc_huge_page_vmemmap(h, head);
+ rc = hugetlb_vmemmap_alloc(h, head);
if (!rc) {
/*
* Move PageHWPoison flag from head page to the raw
@@ -2979,8 +2986,6 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
struct huge_bootmem_page *m = NULL; /* initialize for clang */
int nr_nodes, node;
- if (nid != NUMA_NO_NODE && nid >= nr_online_nodes)
- return 0;
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
@@ -3088,7 +3093,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
}
/* do node specific alloc */
- for (i = 0; i < nr_online_nodes; i++) {
+ for_each_online_node(i) {
if (h->max_huge_pages_node[i] > 0) {
hugetlb_hstate_alloc_pages_onenode(h, i);
node_specific_alloc = true;
@@ -3420,7 +3425,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
remove_hugetlb_page_for_demote(h, page, false);
spin_unlock_irq(&hugetlb_lock);
- rc = alloc_huge_page_vmemmap(h, page);
+ rc = hugetlb_vmemmap_alloc(h, page);
if (rc) {
/* Allocation of vmemmmap failed, we can not demote page */
spin_lock_irq(&hugetlb_lock);
@@ -4052,7 +4057,7 @@ static int __init hugetlb_init(void)
default_hstate.max_huge_pages =
default_hstate_max_huge_pages;
- for (i = 0; i < nr_online_nodes; i++)
+ for_each_online_node(i)
default_hstate.max_huge_pages_node[i] =
default_hugepages_in_node[i];
}
@@ -4119,6 +4124,20 @@ bool __init __weak hugetlb_node_alloc_supported(void)
{
return true;
}
+
+static void __init hugepages_clear_pages_in_node(void)
+{
+ if (!hugetlb_max_hstate) {
+ default_hstate_max_huge_pages = 0;
+ memset(default_hugepages_in_node, 0,
+ MAX_NUMNODES * sizeof(unsigned int));
+ } else {
+ parsed_hstate->max_huge_pages = 0;
+ memset(parsed_hstate->max_huge_pages_node, 0,
+ MAX_NUMNODES * sizeof(unsigned int));
+ }
+}
+
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagsz or default_hugepagsz
@@ -4138,7 +4157,7 @@ static int __init hugepages_setup(char *s)
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 0;
+ return 1;
}
/*
@@ -4154,7 +4173,7 @@ static int __init hugepages_setup(char *s)
if (mhp == last_mhp) {
pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
- return 0;
+ return 1;
}
while (*p) {
@@ -4165,11 +4184,11 @@ static int __init hugepages_setup(char *s)
if (p[count] == ':') {
if (!hugetlb_node_alloc_supported()) {
pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
- return 0;
+ return 1;
}
- if (tmp >= nr_online_nodes)
+ if (tmp >= MAX_NUMNODES || !node_online(tmp))
goto invalid;
- node = array_index_nospec(tmp, nr_online_nodes);
+ node = array_index_nospec(tmp, MAX_NUMNODES);
p += count + 1;
/* Parse hugepages */
if (sscanf(p, "%lu%n", &tmp, &count) != 1)
@@ -4206,7 +4225,8 @@ static int __init hugepages_setup(char *s)
invalid:
pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
- return 0;
+ hugepages_clear_pages_in_node();
+ return 1;
}
__setup("hugepages=", hugepages_setup);
@@ -4227,7 +4247,7 @@ static int __init hugepagesz_setup(char *s)
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
- return 0;
+ return 1;
}
h = size_to_hstate(size);
@@ -4242,7 +4262,7 @@ static int __init hugepagesz_setup(char *s)
if (!parsed_default_hugepagesz || h != &default_hstate ||
default_hstate.max_huge_pages) {
pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
- return 0;
+ return 1;
}
/*
@@ -4273,14 +4293,14 @@ static int __init default_hugepagesz_setup(char *s)
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
- return 0;
+ return 1;
}
size = (unsigned long)memparse(s, NULL);
if (!arch_hugetlb_valid_size(size)) {
pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
- return 0;
+ return 1;
}
hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
@@ -4297,7 +4317,7 @@ static int __init default_hugepagesz_setup(char *s)
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
- for (i = 0; i < nr_online_nodes; i++)
+ for_each_online_node(i)
default_hstate.max_huge_pages_node[i] =
default_hugepages_in_node[i];
if (hstate_is_gigantic(&default_hstate))
@@ -4699,24 +4719,27 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
- bool cow = is_cow_mapping(vma->vm_flags);
- struct hstate *h = hstate_vma(vma);
+ bool cow = is_cow_mapping(src_vma->vm_flags);
+ struct hstate *h = hstate_vma(src_vma);
unsigned long sz = huge_page_size(h);
unsigned long npages = pages_per_huge_page(h);
- struct address_space *mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = src_vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
if (cow) {
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
- vma->vm_start,
- vma->vm_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
+ src_vma->vm_start,
+ src_vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
+ mmap_assert_write_locked(src);
+ raw_write_seqcount_begin(&src->write_protect_seq);
} else {
/*
* For shared mappings i_mmap_rwsem must be held to call
@@ -4727,12 +4750,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
i_mmap_lock_read(mapping);
}
- for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
+ for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
- dst_pte = huge_pte_alloc(dst, vma, addr, sz);
+ dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
break;
@@ -4767,8 +4790,9 @@ again:
} else if (unlikely(is_hugetlb_entry_migration(entry) ||
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
+ bool uffd_wp = huge_pte_uffd_wp(entry);
- if (is_writable_migration_entry(swp_entry) && cow) {
+ if (!is_readable_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
@@ -4776,38 +4800,53 @@ again:
swp_entry = make_readable_migration_entry(
swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
+ if (userfaultfd_wp(src_vma) && uffd_wp)
+ entry = huge_pte_mkuffd_wp(entry);
set_huge_swap_pte_at(src, addr, src_pte,
entry, sz);
}
+ if (!userfaultfd_wp(dst_vma) && uffd_wp)
+ entry = huge_pte_clear_uffd_wp(entry);
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+ } else if (unlikely(is_pte_marker(entry))) {
+ /*
+ * We copy the pte marker only if the dst vma has
+ * uffd-wp enabled.
+ */
+ if (userfaultfd_wp(dst_vma))
+ set_huge_pte_at(dst, addr, dst_pte, entry);
} else {
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
/*
- * This is a rare case where we see pinned hugetlb
- * pages while they're prone to COW. We need to do the
- * COW earlier during fork.
+ * Failing to duplicate the anon rmap is a rare case
+ * where we see pinned hugetlb pages while they're
+ * prone to COW. We need to do the COW earlier during
+ * fork.
*
* When pre-allocating the page or copying data, we
* need to be without the pgtable locks since we could
* sleep during the process.
*/
- if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+ if (!PageAnon(ptepage)) {
+ page_dup_file_rmap(ptepage, true);
+ } else if (page_try_dup_anon_rmap(ptepage, true,
+ src_vma)) {
pte_t src_pte_old = entry;
struct page *new;
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
- new = alloc_huge_page(vma, addr, 1);
+ new = alloc_huge_page(dst_vma, addr, 1);
if (IS_ERR(new)) {
put_page(ptepage);
ret = PTR_ERR(new);
break;
}
- copy_user_huge_page(new, ptepage, addr, vma,
+ copy_user_huge_page(new, ptepage, addr, dst_vma,
npages);
put_page(ptepage);
@@ -4817,13 +4856,13 @@ again:
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
- restore_reserve_on_error(h, vma, addr,
+ restore_reserve_on_error(h, dst_vma, addr,
new);
put_page(new);
/* dst_entry won't change as in child */
goto again;
}
- hugetlb_install_page(vma, dst_pte, addr, new);
+ hugetlb_install_page(dst_vma, dst_pte, addr, new);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
@@ -4841,7 +4880,6 @@ again:
entry = huge_pte_wrprotect(entry);
}
- page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(npages, dst);
}
@@ -4849,10 +4887,12 @@ again:
spin_unlock(dst_ptl);
}
- if (cow)
+ if (cow) {
+ raw_write_seqcount_end(&src->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
- else
+ } else {
i_mmap_unlock_read(mapping);
+ }
return ret;
}
@@ -4896,10 +4936,17 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long old_addr_copy;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
+ bool shared_pmd = false;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ /*
+ * In case of shared PMDs, we should cover the maximum possible
+ * range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+
mmu_notifier_invalidate_range_start(&range);
/* Prevent race with file truncation */
i_mmap_lock_write(mapping);
@@ -4916,8 +4963,10 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
*/
old_addr_copy = old_addr;
- if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
+ if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+ shared_pmd = true;
continue;
+ }
dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
@@ -4925,7 +4974,11 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
}
- flush_tlb_range(vma, old_end - len, old_end);
+
+ if (shared_pmd)
+ flush_tlb_range(vma, range.start, range.end);
+ else
+ flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
@@ -4934,7 +4987,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- struct page *ref_page)
+ struct page *ref_page, zap_flags_t zap_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -4990,7 +5043,18 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
- huge_pte_clear(mm, address, ptep, sz);
+ /*
+ * If the pte was wr-protected by uffd-wp in any of the
+ * swap forms, meanwhile the caller does not want to
+ * drop the uffd-wp bit in this zap, then replace the
+ * pte with a marker.
+ */
+ if (pte_swp_uffd_wp_any(pte) &&
+ !(zap_flags & ZAP_FLAG_DROP_MARKER))
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ else
+ huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
}
@@ -5018,7 +5082,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
-
+ /* Leave a uffd-wp pte marker if needed */
+ if (huge_pte_uffd_wp(pte) &&
+ !(zap_flags & ZAP_FLAG_DROP_MARKER))
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
@@ -5052,9 +5120,10 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+ unsigned long end, struct page *ref_page,
+ zap_flags_t zap_flags)
{
- __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
/*
* Clear this flag so that x86's huge_pmd_share page_table_shareable
@@ -5070,12 +5139,13 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+ unsigned long end, struct page *ref_page,
+ zap_flags_t zap_flags)
{
struct mmu_gather tlb;
tlb_gather_mmu(&tlb, vma->vm_mm);
- __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
tlb_finish_mmu(&tlb);
}
@@ -5130,21 +5200,22 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
unmap_hugepage_range(iter_vma, address,
- address + huge_page_size(h), page);
+ address + huge_page_size(h), page, 0);
}
i_mmap_unlock_write(mapping);
}
/*
- * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ * hugetlb_wp() should be called with page lock of the original hugepage held.
* Called with hugetlb_fault_mutex_table held and pte_page locked so we
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep,
+static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep, unsigned int flags,
struct page *pagecache_page, spinlock_t *ptl)
{
+ const bool unshare = flags & FAULT_FLAG_UNSHARE;
pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
@@ -5153,17 +5224,26 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr = address & huge_page_mask(h);
struct mmu_notifier_range range;
+ VM_BUG_ON(unshare && (flags & FOLL_WRITE));
+ VM_BUG_ON(!unshare && !(flags & FOLL_WRITE));
+
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
retry_avoidcopy:
- /* If no-one else is actually using this page, avoid the copy
- * and just make the page writable */
+ /*
+ * If no-one else is actually using this page, we're the exclusive
+ * owner and can reuse this page.
+ */
if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
- page_move_anon_rmap(old_page, vma);
- set_huge_ptep_writable(vma, haddr, ptep);
+ if (!PageAnonExclusive(old_page))
+ page_move_anon_rmap(old_page, vma);
+ if (likely(!unshare))
+ set_huge_ptep_writable(vma, haddr, ptep);
return 0;
}
+ VM_BUG_ON_PAGE(PageAnon(old_page) && PageAnonExclusive(old_page),
+ old_page);
/*
* If the process that created a MAP_PRIVATE mapping is about to
@@ -5262,13 +5342,13 @@ retry_avoidcopy:
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearHPageRestoreReserve(new_page);
- /* Break COW */
+ /* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
page_remove_rmap(old_page, vma, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
set_huge_pte_at(mm, haddr, ptep,
- make_huge_pte(vma, new_page, 1));
+ make_huge_pte(vma, new_page, !unshare));
SetHPageMigratable(new_page);
/* Make the old page be freed below */
new_page = old_page;
@@ -5276,7 +5356,10 @@ retry_avoidcopy:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
out_release_all:
- /* No restore in case of successful pagetable update (Break COW) */
+ /*
+ * No restore in case of successful pagetable update (Break COW or
+ * unshare)
+ */
if (new_page != old_page)
restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
@@ -5386,7 +5469,8 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
struct vm_area_struct *vma,
struct address_space *mapping, pgoff_t idx,
- unsigned long address, pte_t *ptep, unsigned int flags)
+ unsigned long address, pte_t *ptep,
+ pte_t old_pte, unsigned int flags)
{
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
@@ -5401,7 +5485,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
/*
* Currently, we are forced to kill the process in the event the
* original mapper has unmapped pages from the child due to a failed
- * COW. Warn that such a situation has occurred as it may not be obvious
+ * COW/unsharing. Warn that such a situation has occurred as it may not
+ * be obvious.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
@@ -5512,22 +5597,29 @@ retry:
ptl = huge_pte_lock(h, mm, ptep);
ret = 0;
- if (!huge_pte_none(huge_ptep_get(ptep)))
+ /* If pte changed from under us, retry */
+ if (!pte_same(huge_ptep_get(ptep), old_pte))
goto backout;
if (anon_rmap) {
ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, vma, haddr);
} else
- page_dup_rmap(page, true);
+ page_dup_file_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
+ /*
+ * If this pte was previously wr-protected, keep it wr-protected even
+ * if populated.
+ */
+ if (unlikely(pte_marker_uffd_wp(old_pte)))
+ new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
}
spin_unlock(ptl);
@@ -5639,8 +5731,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
- if (huge_pte_none(entry)) {
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
+ /* PTE markers should be handled the same way as none pte */
+ if (huge_pte_none_mostly(entry)) {
+ ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+ entry, flags);
goto out_mutex;
}
@@ -5657,14 +5751,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_mutex;
/*
- * If we are going to COW the mapping later, we examine the pending
- * reservations for this page now. This will ensure that any
+ * If we are going to COW/unshare the mapping later, we examine the
+ * pending reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
* spinlock. For private mappings, we also lookup the pagecache
* page now as it is used to determine if a reservation has been
* consumed.
*/
- if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+ !huge_pte_write(entry)) {
if (vma_needs_reservation(h, vma, haddr) < 0) {
ret = VM_FAULT_OOM;
goto out_mutex;
@@ -5679,12 +5774,32 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptl = huge_pte_lock(h, mm, ptep);
- /* Check for a racing update before calling hugetlb_cow */
+ /* Check for a racing update before calling hugetlb_wp() */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
goto out_ptl;
+ /* Handle userfault-wp first, before trying to lock more pages */
+ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
+ (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .real_address = address,
+ .flags = flags,
+ };
+
+ spin_unlock(ptl);
+ if (pagecache_page) {
+ unlock_page(pagecache_page);
+ put_page(pagecache_page);
+ }
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+ return handle_userfault(&vmf, VM_UFFD_WP);
+ }
+
/*
- * hugetlb_cow() requires page locks of pte_page(entry) and
+ * hugetlb_wp() requires page locks of pte_page(entry) and
* pagecache_page, so here we need take the former one
* when page != pagecache_page or !pagecache_page.
*/
@@ -5697,13 +5812,14 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
get_page(page);
- if (flags & FAULT_FLAG_WRITE) {
+ if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
- ret = hugetlb_cow(mm, vma, address, ptep,
- pagecache_page, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags,
+ pagecache_page, ptl);
goto out_put_page;
+ } else if (likely(flags & FAULT_FLAG_WRITE)) {
+ entry = huge_pte_mkdirty(entry);
}
- entry = huge_pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
@@ -5746,7 +5862,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
unsigned long dst_addr,
unsigned long src_addr,
enum mcopy_atomic_mode mode,
- struct page **pagep)
+ struct page **pagep,
+ bool wp_copy)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct hstate *h = hstate_vma(dst_vma);
@@ -5876,27 +5993,43 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out_release_unlock;
ret = -EEXIST;
- if (!huge_pte_none(huge_ptep_get(dst_pte)))
+ /*
+ * We allow to overwrite a pte marker: consider when both MISSING|WP
+ * registered, we firstly wr-protect a none pte which has no page cache
+ * page backing it, then access the page.
+ */
+ if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
if (vm_shared) {
- page_dup_rmap(page, true);
+ page_dup_file_rmap(page, true);
} else {
ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}
- /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
- if (is_continue && !vm_shared)
+ /*
+ * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
+ * with wp flag set, don't set pte write bit.
+ */
+ if (wp_copy || (is_continue && !vm_shared))
writable = 0;
else
writable = dst_vma->vm_flags & VM_WRITE;
_dst_pte = make_huge_pte(dst_vma, page, writable);
- if (writable)
- _dst_pte = huge_pte_mkdirty(_dst_pte);
+ /*
+ * Always mark UFFDIO_COPY page dirty; note that this may not be
+ * extremely important for hugetlbfs for now since swapping is not
+ * supported, but we should still be clear in that this page cannot be
+ * thrown away at will, even if write bit not set.
+ */
+ _dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);
+ if (wp_copy)
+ _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
+
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
(void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
@@ -5940,6 +6073,25 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
}
}
+static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
+ bool *unshare)
+{
+ pte_t pteval = huge_ptep_get(pte);
+
+ *unshare = false;
+ if (is_swap_pte(pteval))
+ return true;
+ if (huge_pte_write(pteval))
+ return false;
+ if (flags & FOLL_WRITE)
+ return true;
+ if (gup_must_unshare(flags, pte_page(pteval))) {
+ *unshare = true;
+ return true;
+ }
+ return false;
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -5954,6 +6106,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
spinlock_t *ptl = NULL;
+ bool unshare = false;
int absent;
struct page *page;
@@ -6004,9 +6157,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* both cases, and because we can't follow correct pages
* directly from any kind of swap entries.
*/
- if (absent || is_swap_pte(huge_ptep_get(pte)) ||
- ((flags & FOLL_WRITE) &&
- !huge_pte_write(huge_ptep_get(pte)))) {
+ if (absent ||
+ __follow_hugetlb_must_fault(flags, pte, &unshare)) {
vm_fault_t ret;
unsigned int fault_flags = 0;
@@ -6014,6 +6166,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
+ else if (unshare)
+ fault_flags |= FAULT_FLAG_UNSHARE;
if (locked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_KILLABLE;
@@ -6055,6 +6209,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
+ VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+ !PageAnonExclusive(page), page);
+
/*
* If subpage information not requested, update counters
* and skip the same_page loop below.
@@ -6117,16 +6274,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
- unsigned long address, unsigned long end, pgprot_t newprot)
+ unsigned long address, unsigned long end,
+ pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long start = address;
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- unsigned long pages = 0;
+ unsigned long pages = 0, psize = huge_page_size(h);
bool shared_pmd = false;
struct mmu_notifier_range range;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
/*
* In the case of shared PMDs, the area to flush could be beyond
@@ -6142,13 +6302,19 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
- for (; address < end; address += huge_page_size(h)) {
+ for (; address < end; address += psize) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address, huge_page_size(h));
+ ptep = huge_pte_offset(mm, address, psize);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ /*
+ * When uffd-wp is enabled on the vma, unshare
+ * shouldn't happen at all. Warn about it if it
+ * happened due to some reason.
+ */
+ WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
shared_pmd = true;
@@ -6161,20 +6327,37 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
- if (is_writable_migration_entry(entry)) {
+ if (!is_readable_migration_entry(entry)) {
pte_t newpte;
- entry = make_readable_migration_entry(
- swp_offset(entry));
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(
+ swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
+ if (uffd_wp)
+ newpte = pte_swp_mkuffd_wp(newpte);
+ else if (uffd_wp_resolve)
+ newpte = pte_swp_clear_uffd_wp(newpte);
set_huge_swap_pte_at(mm, address, ptep,
- newpte, huge_page_size(h));
+ newpte, psize);
pages++;
}
spin_unlock(ptl);
continue;
}
+ if (unlikely(pte_marker_uffd_wp(pte))) {
+ /*
+ * This is changing a non-present pte into a none pte,
+ * no need for huge_ptep_modify_prot_start/commit().
+ */
+ if (uffd_wp_resolve)
+ huge_pte_clear(mm, address, ptep, psize);
+ }
if (!huge_pte_none(pte)) {
pte_t old_pte;
unsigned int shift = huge_page_shift(hstate_vma(vma));
@@ -6182,8 +6365,18 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
pte = huge_pte_modify(old_pte, newprot);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
+ if (uffd_wp)
+ pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
+ else if (uffd_wp_resolve)
+ pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ } else {
+ /* None pte */
+ if (unlikely(uffd_wp))
+ /* Safe to modify directly (none->non-present). */
+ set_huge_pte_at(mm, address, ptep,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
}
spin_unlock(ptl);
}
@@ -6686,9 +6879,11 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
spinlock_t *ptl;
pte_t pte;
- /* FOLL_GET and FOLL_PIN are mutually exclusive. */
- if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
- (FOLL_PIN | FOLL_GET)))
+ /*
+ * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+ * follow_hugetlb_page().
+ */
+ if (WARN_ON_ONCE(flags & FOLL_PIN))
return NULL;
retry:
@@ -6776,7 +6971,9 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
spin_lock_irq(&hugetlb_lock);
if (PageHeadHuge(page)) {
*hugetlb = true;
- if (HPageFreed(page) || HPageMigratable(page))
+ if (HPageFreed(page))
+ ret = 0;
+ else if (HPageMigratable(page))
ret = get_page_unless_zero(page);
else
ret = -EBUSY;
@@ -6866,6 +7063,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
if (start >= end)
return;
+ flush_cache_range(vma, start, end);
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
* we have already done the PUD_SIZE alignment.
@@ -6951,7 +7149,7 @@ void __init hugetlb_cma_reserve(int order)
if (hugetlb_cma_size_in_node[nid] == 0)
continue;
- if (!node_state(nid, N_ONLINE)) {
+ if (!node_online(nid)) {
pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
hugetlb_cma_size_in_node[nid] = 0;
@@ -6990,7 +7188,7 @@ void __init hugetlb_cma_reserve(int order)
}
reserved = 0;
- for_each_node_state(nid, N_ONLINE) {
+ for_each_online_node(nid) {
int res;
char name[CMA_MAX_NAME];
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 791626983c2e..fcd9f7872064 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -1,181 +1,16 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Free some vmemmap pages of HugeTLB
+ * Optimize vmemmap pages associated with HugeTLB
*
* Copyright (c) 2020, Bytedance. All rights reserved.
*
* Author: Muchun Song <songmuchun@bytedance.com>
*
- * The struct page structures (page structs) are used to describe a physical
- * page frame. By default, there is a one-to-one mapping from a page frame to
- * it's corresponding page struct.
- *
- * HugeTLB pages consist of multiple base page size pages and is supported by
- * many architectures. See hugetlbpage.rst in the Documentation directory for
- * more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB
- * are currently supported. Since the base page size on x86 is 4KB, a 2MB
- * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
- * 4096 base pages. For each base page, there is a corresponding page struct.
- *
- * Within the HugeTLB subsystem, only the first 4 page structs are used to
- * contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
- * this upper limit. The only 'useful' information in the remaining page structs
- * is the compound_head field, and this field is the same for all tail pages.
- *
- * By removing redundant page structs for HugeTLB pages, memory can be returned
- * to the buddy allocator for other uses.
- *
- * Different architectures support different HugeTLB pages. For example, the
- * following table is the HugeTLB page size supported by x86 and arm64
- * architectures. Because arm64 supports 4k, 16k, and 64k base pages and
- * supports contiguous entries, so it supports many kinds of sizes of HugeTLB
- * page.
- *
- * +--------------+-----------+-----------------------------------------------+
- * | Architecture | Page Size | HugeTLB Page Size |
- * +--------------+-----------+-----------+-----------+-----------+-----------+
- * | x86-64 | 4KB | 2MB | 1GB | | |
- * +--------------+-----------+-----------+-----------+-----------+-----------+
- * | | 4KB | 64KB | 2MB | 32MB | 1GB |
- * | +-----------+-----------+-----------+-----------+-----------+
- * | arm64 | 16KB | 2MB | 32MB | 1GB | |
- * | +-----------+-----------+-----------+-----------+-----------+
- * | | 64KB | 2MB | 512MB | 16GB | |
- * +--------------+-----------+-----------+-----------+-----------+-----------+
- *
- * When the system boot up, every HugeTLB page has more than one struct page
- * structs which size is (unit: pages):
- *
- * struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
- *
- * Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
- * of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
- * relationship.
- *
- * HugeTLB_Size = n * PAGE_SIZE
- *
- * Then,
- *
- * struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
- * = n * sizeof(struct page) / PAGE_SIZE
- *
- * We can use huge mapping at the pud/pmd level for the HugeTLB page.
- *
- * For the HugeTLB page of the pmd level mapping, then
- *
- * struct_size = n * sizeof(struct page) / PAGE_SIZE
- * = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
- * = sizeof(struct page) / sizeof(pte_t)
- * = 64 / 8
- * = 8 (pages)
- *
- * Where n is how many pte entries which one page can contains. So the value of
- * n is (PAGE_SIZE / sizeof(pte_t)).
- *
- * This optimization only supports 64-bit system, so the value of sizeof(pte_t)
- * is 8. And this optimization also applicable only when the size of struct page
- * is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
- * x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
- * size of struct page structs of it is 8 page frames which size depends on the
- * size of the base page.
- *
- * For the HugeTLB page of the pud level mapping, then
- *
- * struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
- * = PAGE_SIZE / 8 * 8 (pages)
- * = PAGE_SIZE (pages)
- *
- * Where the struct_size(pmd) is the size of the struct page structs of a
- * HugeTLB page of the pmd level mapping.
- *
- * E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
- * HugeTLB page consists in 4096.
- *
- * Next, we take the pmd level mapping of the HugeTLB page as an example to
- * show the internal implementation of this optimization. There are 8 pages
- * struct page structs associated with a HugeTLB page which is pmd mapped.
- *
- * Here is how things look before optimization.
- *
- * HugeTLB struct pages(8 pages) page frame(8 pages)
- * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
- * | | | 0 | -------------> | 0 |
- * | | +-----------+ +-----------+
- * | | | 1 | -------------> | 1 |
- * | | +-----------+ +-----------+
- * | | | 2 | -------------> | 2 |
- * | | +-----------+ +-----------+
- * | | | 3 | -------------> | 3 |
- * | | +-----------+ +-----------+
- * | | | 4 | -------------> | 4 |
- * | PMD | +-----------+ +-----------+
- * | level | | 5 | -------------> | 5 |
- * | mapping | +-----------+ +-----------+
- * | | | 6 | -------------> | 6 |
- * | | +-----------+ +-----------+
- * | | | 7 | -------------> | 7 |
- * | | +-----------+ +-----------+
- * | |
- * | |
- * | |
- * +-----------+
- *
- * The value of page->compound_head is the same for all tail pages. The first
- * page of page structs (page 0) associated with the HugeTLB page contains the 4
- * page structs necessary to describe the HugeTLB. The only use of the remaining
- * pages of page structs (page 1 to page 7) is to point to page->compound_head.
- * Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
- * will be used for each HugeTLB page. This will allow us to free the remaining
- * 7 pages to the buddy allocator.
- *
- * Here is how things look after remapping.
- *
- * HugeTLB struct pages(8 pages) page frame(8 pages)
- * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
- * | | | 0 | -------------> | 0 |
- * | | +-----------+ +-----------+
- * | | | 1 | ---------------^ ^ ^ ^ ^ ^ ^
- * | | +-----------+ | | | | | |
- * | | | 2 | -----------------+ | | | | |
- * | | +-----------+ | | | | |
- * | | | 3 | -------------------+ | | | |
- * | | +-----------+ | | | |
- * | | | 4 | ---------------------+ | | |
- * | PMD | +-----------+ | | |
- * | level | | 5 | -----------------------+ | |
- * | mapping | +-----------+ | |
- * | | | 6 | -------------------------+ |
- * | | +-----------+ |
- * | | | 7 | ---------------------------+
- * | | +-----------+
- * | |
- * | |
- * | |
- * +-----------+
- *
- * When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
- * vmemmap pages and restore the previous mapping relationship.
- *
- * For the HugeTLB page of the pud level mapping. It is similar to the former.
- * We also can use this approach to free (PAGE_SIZE - 1) vmemmap pages.
- *
- * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
- * (e.g. aarch64) provides a contiguous bit in the translation table entries
- * that hints to the MMU to indicate that it is one of a contiguous set of
- * entries that can be cached in a single TLB entry.
- *
- * The contiguous bit is used to increase the mapping size at the pmd and pte
- * (last) level. So this type of HugeTLB page can be optimized only when its
- * size of the struct page structs is greater than 1 page.
- *
- * Notice: The head vmemmap page is not freed to the buddy allocator and all
- * tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
- * more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
- * associated with each HugeTLB page. The compound_head() can handle this
- * correctly (more details refer to the comment above compound_head()).
+ * See Documentation/vm/vmemmap_dedup.rst
*/
#define pr_fmt(fmt) "HugeTLB: " fmt
+#include <linux/memory_hotplug.h>
#include "hugetlb_vmemmap.h"
/*
@@ -188,53 +23,63 @@
#define RESERVE_VMEMMAP_NR 1U
#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
-DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON,
- hugetlb_free_vmemmap_enabled_key);
-EXPORT_SYMBOL(hugetlb_free_vmemmap_enabled_key);
+enum vmemmap_optimize_mode {
+ VMEMMAP_OPTIMIZE_OFF,
+ VMEMMAP_OPTIMIZE_ON,
+};
-static int __init early_hugetlb_free_vmemmap_param(char *buf)
-{
- /* We cannot optimize if a "struct page" crosses page boundaries. */
- if (!is_power_of_2(sizeof(struct page))) {
- pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
- return 0;
- }
+DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON,
+ hugetlb_optimize_vmemmap_key);
+EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
- if (!buf)
- return -EINVAL;
+static enum vmemmap_optimize_mode vmemmap_optimize_mode =
+ IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
- if (!strcmp(buf, "on"))
- static_branch_enable(&hugetlb_free_vmemmap_enabled_key);
- else if (!strcmp(buf, "off"))
- static_branch_disable(&hugetlb_free_vmemmap_enabled_key);
- else
- return -EINVAL;
+static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to)
+{
+ if (vmemmap_optimize_mode == to)
+ return;
- return 0;
+ if (to == VMEMMAP_OPTIMIZE_OFF)
+ static_branch_dec(&hugetlb_optimize_vmemmap_key);
+ else
+ static_branch_inc(&hugetlb_optimize_vmemmap_key);
+ WRITE_ONCE(vmemmap_optimize_mode, to);
}
-early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param);
-static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
+static int __init hugetlb_vmemmap_early_param(char *buf)
{
- return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
+ bool enable;
+ enum vmemmap_optimize_mode mode;
+
+ if (kstrtobool(buf, &enable))
+ return -EINVAL;
+
+ mode = enable ? VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF;
+ vmemmap_optimize_mode_switch(mode);
+
+ return 0;
}
+early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param);
/*
* Previously discarded vmemmap pages will be allocated and remapping
* after this function returns zero.
*/
-int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
{
int ret;
unsigned long vmemmap_addr = (unsigned long)head;
- unsigned long vmemmap_end, vmemmap_reuse;
+ unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
if (!HPageVmemmapOptimized(head))
return 0;
- vmemmap_addr += RESERVE_VMEMMAP_SIZE;
- vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
- vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+ vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+ vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
+ vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
+ vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+
/*
* The pages which the vmemmap virtual address range [@vmemmap_addr,
* @vmemmap_end) are mapped to are freed to the buddy allocator, and
@@ -244,30 +89,40 @@ int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
*/
ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
- if (!ret)
+ if (!ret) {
ClearHPageVmemmapOptimized(head);
+ static_branch_dec(&hugetlb_optimize_vmemmap_key);
+ }
return ret;
}
-void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
unsigned long vmemmap_addr = (unsigned long)head;
- unsigned long vmemmap_end, vmemmap_reuse;
+ unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
- if (!free_vmemmap_pages_per_hpage(h))
+ vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
+ if (!vmemmap_pages)
return;
- vmemmap_addr += RESERVE_VMEMMAP_SIZE;
- vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
- vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+ if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+ return;
+
+ static_branch_inc(&hugetlb_optimize_vmemmap_key);
+
+ vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+ vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT);
+ vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
/*
* Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
* to the page which @vmemmap_reuse is mapped to, then free the pages
* which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
*/
- if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+ if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+ static_branch_dec(&hugetlb_optimize_vmemmap_key);
+ else
SetHPageVmemmapOptimized(head);
}
@@ -278,14 +133,17 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
/*
* There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
- * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP,
+ * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP,
* so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
*/
BUILD_BUG_ON(__NR_USED_SUBPAGE >=
RESERVE_VMEMMAP_SIZE / sizeof(struct page));
- if (!hugetlb_free_vmemmap_enabled())
+ if (!is_power_of_2(sizeof(struct page))) {
+ pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n");
+ static_branch_disable(&hugetlb_optimize_vmemmap_key);
return;
+ }
vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
/*
@@ -297,8 +155,57 @@ void __init hugetlb_vmemmap_init(struct hstate *h)
* hugetlbpage.rst for more details.
*/
if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
- h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
+ h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
+
+ pr_info("can optimize %d vmemmap pages for %s\n",
+ h->optimize_vmemmap_pages, h->name);
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *length,
+ loff_t *ppos)
+{
+ int ret;
+ enum vmemmap_optimize_mode mode;
+ static DEFINE_MUTEX(sysctl_mutex);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mutex_lock(&sysctl_mutex);
+ mode = vmemmap_optimize_mode;
+ table->data = &mode;
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (write && !ret)
+ vmemmap_optimize_mode_switch(mode);
+ mutex_unlock(&sysctl_mutex);
- pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages,
- h->name);
+ return ret;
+}
+
+static struct ctl_table hugetlb_vmemmap_sysctls[] = {
+ {
+ .procname = "hugetlb_optimize_vmemmap",
+ .maxlen = sizeof(enum vmemmap_optimize_mode),
+ .mode = 0644,
+ .proc_handler = hugetlb_optimize_vmemmap_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ { }
+};
+
+static __init int hugetlb_vmemmap_sysctls_init(void)
+{
+ /*
+ * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
+ * crosses page boundaries, the vmemmap pages cannot be optimized.
+ */
+ if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
+ register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
+
+ return 0;
}
+late_initcall(hugetlb_vmemmap_sysctls_init);
+#endif /* CONFIG_PROC_SYSCTL */
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index cb2bef8f9e73..109b0a53b6fe 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Free some vmemmap pages of HugeTLB
+ * Optimize vmemmap pages associated with HugeTLB
*
* Copyright (c) 2020, Bytedance. All rights reserved.
*
@@ -10,26 +10,26 @@
#define _LINUX_HUGETLB_VMEMMAP_H
#include <linux/hugetlb.h>
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
-int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
-void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head);
+void hugetlb_vmemmap_free(struct hstate *h, struct page *head);
void hugetlb_vmemmap_init(struct hstate *h);
/*
- * How many vmemmap pages associated with a HugeTLB page that can be freed
- * to the buddy allocator.
+ * How many vmemmap pages associated with a HugeTLB page that can be
+ * optimized and freed to the buddy allocator.
*/
-static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
{
- return h->nr_free_vmemmap_pages;
+ return h->optimize_vmemmap_pages;
}
#else
-static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
{
return 0;
}
-static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
}
@@ -37,9 +37,9 @@ static inline void hugetlb_vmemmap_init(struct hstate *h)
{
}
-static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h)
{
return 0;
}
-#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
#endif /* _LINUX_HUGETLB_VMEMMAP_H */
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index bb0cea5468cb..5c0cddd81505 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -65,6 +65,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
static void pfn_inject_exit(void)
{
+ hwpoison_filter_enable = 0;
debugfs_remove_recursive(hwpoison_dir);
}
diff --git a/mm/internal.h b/mm/internal.h
index cf16280ce132..64e61b032dac 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -35,6 +35,21 @@ struct folio_batch;
/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
+/*
+ * Different from WARN_ON_ONCE(), no warning will be issued
+ * when we specify __GFP_NOWARN.
+ */
+#define WARN_ON_ONCE_GFP(cond, gfp) ({ \
+ static bool __section(".data.once") __warned; \
+ int __ret_warn_once = !!(cond); \
+ \
+ if (unlikely(!(gfp & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
+ __warned = true; \
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn_once); \
+})
+
void page_writeback_init(void);
static inline void *folio_raw_mapping(struct folio *folio)
@@ -212,6 +227,67 @@ struct alloc_context {
};
/*
+ * This function returns the order of a free page in the buddy system. In
+ * general, page_zone(page)->lock must be held by the caller to prevent the
+ * page from being allocated in parallel and returning garbage as the order.
+ * If a caller does not hold page_zone(page)->lock, it must guarantee that the
+ * page cannot be allocated or merged in parallel. Alternatively, it must
+ * handle invalid values gracefully, and use buddy_order_unsafe() below.
+ */
+static inline unsigned int buddy_order(struct page *page)
+{
+ /* PageBuddy() must be checked by the caller */
+ return page_private(page);
+}
+
+/*
+ * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
+ * PageBuddy() should be checked first by the caller to minimize race window,
+ * and invalid values must be handled gracefully.
+ *
+ * READ_ONCE is used so that if the caller assigns the result into a local
+ * variable and e.g. tests it for valid range before using, the compiler cannot
+ * decide to remove the variable and inline the page_private(page) multiple
+ * times, potentially observing different values in the tests and the actual
+ * use of the result.
+ */
+#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
+
+/*
+ * This function checks whether a page is free && is the buddy
+ * we can coalesce a page and its buddy if
+ * (a) the buddy is not in a hole (check before calling!) &&
+ * (b) the buddy is in the buddy system &&
+ * (c) a page and its buddy have the same order &&
+ * (d) a page and its buddy are in the same zone.
+ *
+ * For recording whether a page is in the buddy system, we set PageBuddy.
+ * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
+ *
+ * For recording page's order, we use page_private(page).
+ */
+static inline bool page_is_buddy(struct page *page, struct page *buddy,
+ unsigned int order)
+{
+ if (!page_is_guard(buddy) && !PageBuddy(buddy))
+ return false;
+
+ if (buddy_order(buddy) != order)
+ return false;
+
+ /*
+ * zone check is done late to avoid uselessly calculating
+ * zone/node ids for pages that could never merge.
+ */
+ if (page_zone_id(page) != page_zone_id(buddy))
+ return false;
+
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
+ return true;
+}
+
+/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
*
@@ -234,6 +310,35 @@ __find_buddy_pfn(unsigned long page_pfn, unsigned int order)
return page_pfn ^ (1 << order);
}
+/*
+ * Find the buddy of @page and validate it.
+ * @page: The input page
+ * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
+ * function is used in the performance-critical __free_one_page().
+ * @order: The order of the page
+ * @buddy_pfn: The output pointer to the buddy pfn, it also saves a call to
+ * page_to_pfn().
+ *
+ * The found buddy can be a non PageBuddy, out of @page's zone, or its order is
+ * not the same as @page. The validation is necessary before use it.
+ *
+ * Return: the found buddy page or NULL if not found.
+ */
+static inline struct page *find_buddy_page_pfn(struct page *page,
+ unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
+{
+ unsigned long __buddy_pfn = __find_buddy_pfn(pfn, order);
+ struct page *buddy;
+
+ buddy = page + (__buddy_pfn - pfn);
+ if (buddy_pfn)
+ *buddy_pfn = __buddy_pfn;
+
+ if (page_is_buddy(page, buddy, order))
+ return buddy;
+ return NULL;
+}
+
extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone);
@@ -269,6 +374,9 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
int nid, bool exact_nid);
+void split_free_page(struct page *free_page,
+ int order, unsigned long split_pfn_offset);
+
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/*
@@ -312,7 +420,7 @@ struct compact_control {
bool direct_compaction; /* False from kcompactd or /proc/... */
bool proactive_compaction; /* kcompactd proactive compaction */
bool whole_zone; /* Whole zone should/has been scanned */
- bool contended; /* Signal lock or sched contention */
+ bool contended; /* Signal lock contention */
bool rescan; /* Rescanning the same pageblock */
bool alloc_contig; /* alloc_contig_range allocation */
};
@@ -332,38 +440,14 @@ isolate_freepages_range(struct compact_control *cc,
int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
+
+int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end);
#endif
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal);
/*
- * This function returns the order of a free page in the buddy system. In
- * general, page_zone(page)->lock must be held by the caller to prevent the
- * page from being allocated in parallel and returning garbage as the order.
- * If a caller does not hold page_zone(page)->lock, it must guarantee that the
- * page cannot be allocated or merged in parallel. Alternatively, it must
- * handle invalid values gracefully, and use buddy_order_unsafe() below.
- */
-static inline unsigned int buddy_order(struct page *page)
-{
- /* PageBuddy() must be checked by the caller */
- return page_private(page);
-}
-
-/*
- * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
- * PageBuddy() should be checked first by the caller to minimize race window,
- * and invalid values must be handled gracefully.
- *
- * READ_ONCE is used so that if the caller assigns the result into a local
- * variable and e.g. tests it for valid range before using, the compiler cannot
- * decide to remove the variable and inline the page_private(page) multiple
- * times, potentially observing different values in the tests and the actual
- * use of the result.
- */
-#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
-
-/*
* These three helpers classifies VMAs for virtual memory accounting.
*/
@@ -462,26 +546,22 @@ void mlock_page_drain_remote(int cpu);
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
- * At what user virtual address is page expected in vma?
- * Returns -EFAULT if all of the page is outside the range of vma.
- * If page is a compound head, the entire compound page is considered.
+ * Return the start of user virtual address at the specific offset within
+ * a vma.
*/
static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
{
- pgoff_t pgoff;
unsigned long address;
- VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
- pgoff = page_to_pgoff(page);
if (pgoff >= vma->vm_pgoff) {
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address >= vma->vm_end)
address = -EFAULT;
- } else if (PageHead(page) &&
- pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+ } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
/* Test above avoids possibility of wrap to 0 on 32-bit */
address = vma->vm_start;
} else {
@@ -491,6 +571,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
}
/*
+ * Return the start of user virtual address of a page within a vma.
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
+}
+
+/*
* Then at what user virtual address will none of the range be found in vma?
* Assumes that vma_address() already returned a good starting address.
*/
@@ -634,6 +726,9 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
}
#endif
+/*
+ * mm/memory-failure.c
+ */
extern int hwpoison_filter(struct page *p);
extern u32 hwpoison_filter_dev_major;
@@ -643,6 +738,14 @@ extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
+#ifdef CONFIG_MEMORY_FAILURE
+void clear_hwpoisoned_pages(struct page *memmap, int nr_pages);
+#else
+static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+}
+#endif
+
extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index d9079ec11f31..c40c0e7b3b5f 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -117,7 +117,7 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
{
if (likely(!PageHighMem(page)))
kasan_poison(page_address(page), PAGE_SIZE << order,
- KASAN_FREE_PAGE, init);
+ KASAN_PAGE_FREE, init);
}
/*
@@ -254,7 +254,7 @@ void __kasan_poison_slab(struct slab *slab)
for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
kasan_poison(page_address(page), page_size(page),
- KASAN_KMALLOC_REDZONE, false);
+ KASAN_SLAB_REDZONE, false);
}
void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
@@ -265,7 +265,7 @@ void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
- KASAN_KMALLOC_REDZONE, false);
+ KASAN_SLAB_REDZONE, false);
}
/*
@@ -357,7 +357,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
}
kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
- KASAN_KMALLOC_FREE, init);
+ KASAN_SLAB_FREE, init);
if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
return false;
@@ -414,7 +414,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
if (unlikely(!folio_test_slab(folio))) {
if (____kasan_kfree_large(ptr, ip))
return;
- kasan_poison(ptr, folio_size(folio), KASAN_FREE_PAGE, false);
+ kasan_poison(ptr, folio_size(folio), KASAN_PAGE_FREE, false);
} else {
struct slab *slab = folio_slab(folio);
@@ -505,7 +505,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache,
redzone_end = round_up((unsigned long)(object + cache->object_size),
KASAN_GRANULE_SIZE);
kasan_poison((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE, false);
+ KASAN_SLAB_REDZONE, false);
/*
* Save alloc info (if possible) for kmalloc() allocations.
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index a25ad4090615..437fcc7e77cf 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -369,14 +369,14 @@ void kasan_set_free_info(struct kmem_cache *cache,
kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
/* The object was freed and has free track set. */
- *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK;
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK;
}
struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
void *object, u8 tag)
{
- if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK)
+ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK)
return NULL;
- /* Free meta must be present with KASAN_KMALLOC_FREETRACK. */
+ /* Free meta must be present with KASAN_SLAB_FREETRACK. */
return &kasan_get_free_meta(cache, object)->free_track;
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index b01b4bbe0409..610d60d6e5b8 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -42,6 +42,7 @@ static inline bool kasan_sync_fault_possible(void)
{
return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
}
+
#else
static inline bool kasan_stack_collection_enabled(void)
@@ -73,47 +74,41 @@ static inline bool kasan_sync_fault_possible(void)
#define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT)
#ifdef CONFIG_KASAN_GENERIC
-#define KASAN_FREE_PAGE 0xFF /* page was freed */
-#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
-#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
-#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
-#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
+#define KASAN_PAGE_FREE 0xFF /* freed page */
+#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocation */
+#define KASAN_SLAB_REDZONE 0xFC /* redzone for slab object */
+#define KASAN_SLAB_FREE 0xFB /* freed slab object */
+#define KASAN_VMALLOC_INVALID 0xF8 /* inaccessible space in vmap area */
#else
-#define KASAN_FREE_PAGE KASAN_TAG_INVALID
-#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
-#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
-#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
-#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only for SW_TAGS */
+#define KASAN_PAGE_FREE KASAN_TAG_INVALID
+#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
+#define KASAN_SLAB_REDZONE KASAN_TAG_INVALID
+#define KASAN_SLAB_FREE KASAN_TAG_INVALID
+#define KASAN_VMALLOC_INVALID KASAN_TAG_INVALID /* only used for SW_TAGS */
#endif
#ifdef CONFIG_KASAN_GENERIC
-#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
-#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
+#define KASAN_SLAB_FREETRACK 0xFA /* freed slab object with free track */
+#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
-/*
- * Stack redzone shadow values
- * (Those are compiler's ABI, don't change them)
- */
-#define KASAN_STACK_LEFT 0xF1
-#define KASAN_STACK_MID 0xF2
-#define KASAN_STACK_RIGHT 0xF3
-#define KASAN_STACK_PARTIAL 0xF4
+/* Stack redzone shadow values. Compiler ABI, do not change. */
+#define KASAN_STACK_LEFT 0xF1
+#define KASAN_STACK_MID 0xF2
+#define KASAN_STACK_RIGHT 0xF3
+#define KASAN_STACK_PARTIAL 0xF4
-/*
- * alloca redzone shadow values
- */
+/* alloca redzone shadow values. */
#define KASAN_ALLOCA_LEFT 0xCA
#define KASAN_ALLOCA_RIGHT 0xCB
+/* alloca redzone size. Compiler ABI, do not change. */
#define KASAN_ALLOCA_REDZONE_SIZE 32
-/*
- * Stack frame marker (compiler ABI).
- */
+/* Stack frame marker. Compiler ABI, do not change. */
#define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3
-/* Don't break randconfig/all*config builds */
+/* Dummy value to avoid breaking randconfig/all*config builds. */
#ifndef KASAN_ABI_VERSION
#define KASAN_ABI_VERSION 1
#endif
@@ -141,21 +136,21 @@ struct kasan_report_info {
unsigned long ip;
};
-/* The layout of struct dictated by compiler */
+/* Do not change the struct layout: compiler ABI. */
struct kasan_source_location {
const char *filename;
int line_no;
int column_no;
};
-/* The layout of struct dictated by compiler */
+/* Do not change the struct layout: compiler ABI. */
struct kasan_global {
const void *beg; /* Address of the beginning of the global variable. */
size_t size; /* Size of the global variable. */
- size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */
+ size_t size_with_redzone; /* Size of the variable + size of the redzone. 32 bytes aligned. */
const void *name;
const void *module_name; /* Name of the module where the global variable is declared. */
- unsigned long has_dynamic_init; /* This needed for C++ */
+ unsigned long has_dynamic_init; /* This is needed for C++. */
#if KASAN_ABI_VERSION >= 4
struct kasan_source_location *location;
#endif
@@ -164,9 +159,7 @@ struct kasan_global {
#endif
};
-/**
- * Structures to keep alloc and free tracks *
- */
+/* Structures for keeping alloc and free tracks. */
#define KASAN_STACK_DEPTH 64
@@ -183,11 +176,8 @@ struct kasan_track {
struct kasan_alloc_meta {
struct kasan_track alloc_track;
+ /* Generic mode stores free track in kasan_free_meta. */
#ifdef CONFIG_KASAN_GENERIC
- /*
- * The auxiliary stack is stored into struct kasan_alloc_meta.
- * The free stack is stored into struct kasan_free_meta.
- */
depot_stack_handle_t aux_stack[2];
#else
struct kasan_track free_track[KASAN_NR_FREE_STACKS];
@@ -203,18 +193,18 @@ struct qlist_node {
};
/*
- * Generic mode either stores free meta in the object itself or in the redzone
- * after the object. In the former case free meta offset is 0, in the latter
- * case it has some sane value smaller than INT_MAX. Use INT_MAX as free meta
- * offset when free meta isn't present.
+ * Free meta is stored either in the object itself or in the redzone after the
+ * object. In the former case, free meta offset is 0. In the latter case, the
+ * offset is between 0 and INT_MAX. INT_MAX marks that free meta is not present.
*/
#define KASAN_NO_FREE_META INT_MAX
+/*
+ * Free meta is only used by Generic mode while the object is in quarantine.
+ * After that, slab allocator stores the freelist pointer in the object.
+ */
struct kasan_free_meta {
#ifdef CONFIG_KASAN_GENERIC
- /* This field is used while the object is in the quarantine.
- * Otherwise it might be used for the allocator freelist.
- */
struct qlist_node quarantine_link;
struct kasan_track free_track;
#endif
@@ -417,9 +407,10 @@ static inline void kasan_unpoison(const void *addr, size_t size, bool init)
return;
/*
* Explicitly initialize the memory with the precise object size to
- * avoid overwriting the SLAB redzone. This disables initialization in
- * the arch code and may thus lead to performance penalty. The penalty
- * is accepted since SLAB redzones aren't enabled in production builds.
+ * avoid overwriting the slab redzone. This disables initialization in
+ * the arch code and may thus lead to performance penalty. This penalty
+ * does not affect production builds, as slab redzones are not enabled
+ * there.
*/
if (__slub_debug_enabled() &&
init && ((unsigned long)size & KASAN_GRANULE_MASK)) {
@@ -503,8 +494,9 @@ void kasan_restore_multi_shot(bool enabled);
/*
* Exported functions for interfaces called from assembly or from generated
- * code. Declarations here to avoid warning about missing declarations.
+ * code. Declared here to avoid warnings about missing declarations.
*/
+
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
void __asan_register_globals(struct kasan_global *globals, size_t size);
void __asan_unregister_globals(struct kasan_global *globals, size_t size);
@@ -573,4 +565,4 @@ void __hwasan_storeN_noabort(unsigned long addr, size_t size);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size);
-#endif
+#endif /* __MM_KASAN_KASAN_H */
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 0a9def8ce5e8..75585077eb6d 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -99,6 +99,17 @@ static unsigned long quarantine_size;
static DEFINE_RAW_SPINLOCK(quarantine_lock);
DEFINE_STATIC_SRCU(remove_cache_srcu);
+#ifdef CONFIG_PREEMPT_RT
+struct cpu_shrink_qlist {
+ raw_spinlock_t lock;
+ struct qlist_head qlist;
+};
+
+static DEFINE_PER_CPU(struct cpu_shrink_qlist, shrink_qlist) = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(shrink_qlist.lock),
+};
+#endif
+
/* Maximum size of the global queue. */
static unsigned long quarantine_max_size;
@@ -152,7 +163,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
* As the object now gets freed from the quarantine, assume that its
* free track is no longer valid.
*/
- *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE;
___cache_free(cache, object, _THIS_IP_);
@@ -308,10 +319,31 @@ static void qlist_move_cache(struct qlist_head *from,
}
}
-static void per_cpu_remove_cache(void *arg)
+#ifndef CONFIG_PREEMPT_RT
+static void __per_cpu_remove_cache(struct qlist_head *q, void *arg)
{
struct kmem_cache *cache = arg;
struct qlist_head to_free = QLIST_INIT;
+
+ qlist_move_cache(q, &to_free, cache);
+ qlist_free_all(&to_free, cache);
+}
+#else
+static void __per_cpu_remove_cache(struct qlist_head *q, void *arg)
+{
+ struct kmem_cache *cache = arg;
+ unsigned long flags;
+ struct cpu_shrink_qlist *sq;
+
+ sq = this_cpu_ptr(&shrink_qlist);
+ raw_spin_lock_irqsave(&sq->lock, flags);
+ qlist_move_cache(q, &sq->qlist, cache);
+ raw_spin_unlock_irqrestore(&sq->lock, flags);
+}
+#endif
+
+static void per_cpu_remove_cache(void *arg)
+{
struct qlist_head *q;
q = this_cpu_ptr(&cpu_quarantine);
@@ -322,8 +354,7 @@ static void per_cpu_remove_cache(void *arg)
*/
if (READ_ONCE(q->offline))
return;
- qlist_move_cache(q, &to_free, cache);
- qlist_free_all(&to_free, cache);
+ __per_cpu_remove_cache(q, arg);
}
/* Free all quarantined objects belonging to cache. */
@@ -341,6 +372,21 @@ void kasan_quarantine_remove_cache(struct kmem_cache *cache)
*/
on_each_cpu(per_cpu_remove_cache, cache, 1);
+#ifdef CONFIG_PREEMPT_RT
+ {
+ int cpu;
+ struct cpu_shrink_qlist *sq;
+
+ for_each_online_cpu(cpu) {
+ sq = per_cpu_ptr(&shrink_qlist, cpu);
+ raw_spin_lock_irqsave(&sq->lock, flags);
+ qlist_move_cache(&sq->qlist, &to_free, cache);
+ raw_spin_unlock_irqrestore(&sq->lock, flags);
+ }
+ qlist_free_all(&to_free, cache);
+ }
+#endif
+
raw_spin_lock_irqsave(&quarantine_lock, flags);
for (i = 0; i < QUARANTINE_BATCHES; i++) {
if (qlist_empty(&global_quarantine[i]))
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index efc5e79a103f..6689fb9a919b 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -66,7 +66,7 @@ static const char *get_shadow_bug_type(struct kasan_report_info *info)
bug_type = "out-of-bounds";
break;
case KASAN_PAGE_REDZONE:
- case KASAN_KMALLOC_REDZONE:
+ case KASAN_SLAB_REDZONE:
bug_type = "slab-out-of-bounds";
break;
case KASAN_GLOBAL_REDZONE:
@@ -78,9 +78,9 @@ static const char *get_shadow_bug_type(struct kasan_report_info *info)
case KASAN_STACK_PARTIAL:
bug_type = "stack-out-of-bounds";
break;
- case KASAN_FREE_PAGE:
- case KASAN_KMALLOC_FREE:
- case KASAN_KMALLOC_FREETRACK:
+ case KASAN_PAGE_FREE:
+ case KASAN_SLAB_FREE:
+ case KASAN_SLAB_FREETRACK:
bug_type = "use-after-free";
break;
case KASAN_ALLOCA_LEFT:
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 11a954763be9..4e7cd4c8e687 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -21,6 +21,8 @@
#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/panic_notifier.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/sched/clock.h>
@@ -67,8 +69,11 @@ static int param_set_sample_interval(const char *val, const struct kernel_param
if (ret < 0)
return ret;
- if (!num) /* Using 0 to indicate KFENCE is disabled. */
+ /* Using 0 to indicate KFENCE is disabled. */
+ if (!num && READ_ONCE(kfence_enabled)) {
+ pr_info("disabled\n");
WRITE_ONCE(kfence_enabled, false);
+ }
*((unsigned long *)kp->arg) = num;
@@ -99,6 +104,10 @@ module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644)
static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
module_param_named(deferrable, kfence_deferrable, bool, 0444);
+/* If true, check all canary bytes on panic. */
+static bool kfence_check_on_panic __read_mostly;
+module_param_named(check_on_panic, kfence_check_on_panic, bool, 0444);
+
/* The pool of pages used for guard pages and objects. */
char *__kfence_pool __read_mostly;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@@ -737,6 +746,31 @@ static int __init kfence_debugfs_init(void)
late_initcall(kfence_debugfs_init);
+/* === Panic Notifier ====================================================== */
+
+static void kfence_check_all_canary(void)
+{
+ int i;
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ struct kfence_metadata *meta = &kfence_metadata[i];
+
+ if (meta->state == KFENCE_OBJECT_ALLOCATED)
+ for_each_canary(meta, check_canary_byte);
+ }
+}
+
+static int kfence_check_canary_callback(struct notifier_block *nb,
+ unsigned long reason, void *arg)
+{
+ kfence_check_all_canary();
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kfence_check_canary_notifier = {
+ .notifier_call = kfence_check_canary_callback,
+};
+
/* === Allocation Gate Timer ================================================ */
static struct delayed_work kfence_timer;
@@ -814,6 +848,9 @@ static void kfence_init_enable(void)
else
INIT_DELAYED_WORK(&kfence_timer, toggle_allocation_gate);
+ if (kfence_check_on_panic)
+ atomic_notifier_chain_register(&panic_notifier_list, &kfence_check_canary_notifier);
+
WRITE_ONCE(kfence_enabled, true);
queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
@@ -874,6 +911,7 @@ static int kfence_enable_late(void)
WRITE_ONCE(kfence_enabled, true);
queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
+ pr_info("re-enabled\n");
return 0;
}
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 96206a4ee9ab..a97bffe0cc3e 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -296,10 +296,9 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
if (policy == ALLOCATE_ANY)
return alloc;
- if (policy == ALLOCATE_LEFT && IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+ if (policy == ALLOCATE_LEFT && PAGE_ALIGNED(alloc))
return alloc;
- if (policy == ALLOCATE_RIGHT &&
- !IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+ if (policy == ALLOCATE_RIGHT && !PAGE_ALIGNED(alloc))
return alloc;
} else if (policy == ALLOCATE_NONE)
return alloc;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a4e5eaf3eb01..16be62d493cd 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -365,9 +365,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
* register it here without waiting a page fault that
* may not happen any time soon.
*/
- if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
- khugepaged_enter_vma_merge(vma, *vm_flags))
- return -ENOMEM;
+ khugepaged_enter_vma(vma, *vm_flags);
break;
case MADV_NOHUGEPAGE:
*vm_flags &= ~VM_HUGEPAGE;
@@ -439,12 +437,19 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
-static bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags)
+bool hugepage_vma_check(struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
if (!transhuge_vma_enabled(vma, vm_flags))
return false;
+ if (vm_flags & VM_NO_KHUGEPAGED)
+ return false;
+
+ /* Don't run khugepaged against DAX vma */
+ if (vma_is_dax(vma))
+ return false;
+
if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
vma->vm_pgoff, HPAGE_PMD_NR))
return false;
@@ -458,35 +463,31 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
return false;
/* Only regular file is valid */
- if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
- (vm_flags & VM_EXEC)) {
- struct inode *inode = vma->vm_file->f_inode;
+ if (file_thp_enabled(vma))
+ return true;
- return !inode_is_open_for_write(inode) &&
- S_ISREG(inode->i_mode);
- }
-
- if (!vma->anon_vma || vma->vm_ops)
+ if (!vma->anon_vma || !vma_is_anonymous(vma))
return false;
if (vma_is_temporary_stack(vma))
return false;
- return !(vm_flags & VM_NO_KHUGEPAGED);
+
+ return true;
}
-int __khugepaged_enter(struct mm_struct *mm)
+void __khugepaged_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
int wakeup;
mm_slot = alloc_mm_slot();
if (!mm_slot)
- return -ENOMEM;
+ return;
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
- return 0;
+ return;
}
spin_lock(&khugepaged_mm_lock);
@@ -502,28 +503,18 @@ int __khugepaged_enter(struct mm_struct *mm)
mmgrab(mm);
if (wakeup)
wake_up_interruptible(&khugepaged_wait);
-
- return 0;
}
-int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
- unsigned long vm_flags)
+void khugepaged_enter_vma(struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
- unsigned long hstart, hend;
-
- /*
- * khugepaged only supports read-only files for non-shmem files.
- * khugepaged does not yet work on special mappings. And
- * file-private shmem THP is not supported.
- */
- if (!hugepage_vma_check(vma, vm_flags))
- return 0;
-
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (hstart < hend)
- return khugepaged_enter(vma, vm_flags);
- return 0;
+ if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
+ khugepaged_enabled() &&
+ (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
+ (vma->vm_end & HPAGE_PMD_MASK))) {
+ if (hugepage_vma_check(vma, vm_flags))
+ __khugepaged_enter(vma->vm_mm);
+ }
}
void __khugepaged_exit(struct mm_struct *mm)
@@ -972,7 +963,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!hugepage_vma_check(vma, vma->vm_flags))
return SCAN_VMA_CHECK;
/* Anon VMA expected */
- if (!vma->anon_vma || vma->vm_ops)
+ if (!vma->anon_vma || !vma_is_anonymous(vma))
return SCAN_VMA_CHECK;
return 0;
}
@@ -1183,7 +1174,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address, true);
+ page_add_new_anon_rmap(new_page, vma, address);
lru_cache_add_inactive_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -1456,6 +1447,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
return;
+ /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
+ if (userfaultfd_wp(vma))
+ return;
+
hpage = find_lock_page(vma->vm_file->f_mapping,
linear_page_index(vma, haddr));
if (!hpage)
@@ -1591,7 +1586,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* reverse order. Trylock is a way to avoid deadlock.
*/
if (mmap_write_trylock(mm)) {
- if (!khugepaged_test_exit(mm))
+ /*
+ * When a vma is registered with uffd-wp, we can't
+ * recycle the pmd pgtable because there can be pte
+ * markers installed. Skip it only, so the rest mm/vma
+ * can still have the same file mapped hugely, however
+ * it'll always mapped in small page size for uffd-wp
+ * registered ranges.
+ */
+ if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma))
collapse_and_free_pmd(mm, vma, addr, pmd);
mmap_write_unlock(mm);
} else {
diff --git a/mm/ksm.c b/mm/ksm.c
index 063a48eeb5ee..54f78c9eecae 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -638,6 +638,9 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
ksm_pages_sharing--;
else
ksm_pages_shared--;
+
+ rmap_item->mm->ksm_merging_pages--;
+
VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
@@ -785,6 +788,9 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
ksm_pages_sharing--;
else
ksm_pages_shared--;
+
+ rmap_item->mm->ksm_merging_pages--;
+
VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
stable_node->rmap_hlist_len--;
@@ -866,6 +872,7 @@ static inline struct stable_node *page_stable_node(struct page *page)
static inline void set_page_stable_node(struct page *page,
struct stable_node *stable_node)
{
+ VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
}
@@ -1038,6 +1045,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
int swapped;
int err = -EFAULT;
struct mmu_notifier_range range;
+ bool anon_exclusive;
pvmw.address = page_address_in_vma(page, vma);
if (pvmw.address == -EFAULT)
@@ -1055,9 +1063,10 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
goto out_unlock;
+ anon_exclusive = PageAnonExclusive(page);
if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) ||
(pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) ||
- mm_tlb_flush_pending(mm)) {
+ anon_exclusive || mm_tlb_flush_pending(mm)) {
pte_t entry;
swapped = PageSwapCache(page);
@@ -1085,6 +1094,12 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
}
+
+ if (anon_exclusive && page_try_share_anon_rmap(page)) {
+ set_pte_at(mm, pvmw.address, pvmw.pte, entry);
+ goto out_unlock;
+ }
+
if (pte_dirty(entry))
set_page_dirty(page);
@@ -1143,6 +1158,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pte_unmap_unlock(ptep, ptl);
goto out_mn;
}
+ VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
+ VM_BUG_ON_PAGE(PageAnon(kpage) && PageAnonExclusive(kpage), kpage);
/*
* No need to check ksm_use_zero_pages here: we can only have a
@@ -1150,7 +1167,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
*/
if (!is_zero_pfn(page_to_pfn(kpage))) {
get_page(kpage);
- page_add_anon_rmap(kpage, vma, addr, false);
+ page_add_anon_rmap(kpage, vma, addr, RMAP_NONE);
newpte = mk_pte(kpage, vma->vm_page_prot);
} else {
newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage),
@@ -1573,7 +1590,7 @@ again:
* the rbtree instead as a regular stable_node (in
* order to collapse the stable_node chain if a single
* stable_node dup was found in it). In such case the
- * stable_node is overwritten by the calleee to point
+ * stable_node is overwritten by the callee to point
* to the stable_node_dup that was collapsed in the
* stable rbtree and stable_node will be equal to
* stable_node_dup like if the chain never existed.
@@ -2007,6 +2024,8 @@ static void stable_tree_append(struct rmap_item *rmap_item,
ksm_pages_sharing++;
else
ksm_pages_shared++;
+
+ rmap_item->mm->ksm_merging_pages++;
}
/*
@@ -2591,7 +2610,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
return new_page;
}
-void rmap_walk_ksm(struct folio *folio, const struct rmap_walk_control *rwc)
+void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
@@ -2615,7 +2634,13 @@ again:
struct vm_area_struct *vma;
cond_resched();
- anon_vma_lock_read(anon_vma);
+ if (!anon_vma_trylock_read(anon_vma)) {
+ if (rwc->try_lock) {
+ rwc->contended = true;
+ return;
+ }
+ anon_vma_lock_read(anon_vma);
+ }
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
unsigned long addr;
diff --git a/mm/madvise.c b/mm/madvise.c
index 1873616a37d2..4d6592488b51 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -35,6 +35,7 @@
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
struct madvise_walk_private {
struct mmu_gather *tlb;
@@ -197,6 +198,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
+ struct swap_iocb *splug = NULL;
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
return 0;
@@ -218,10 +220,11 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
continue;
page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
- vma, index, false);
+ vma, index, false, &splug);
if (page)
put_page(page);
}
+ swap_read_unplug(splug);
return 0;
}
@@ -237,6 +240,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
struct page *page;
+ struct swap_iocb *splug = NULL;
rcu_read_lock();
xas_for_each(&xas, page, end_index) {
@@ -249,13 +253,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
- NULL, 0, false);
+ NULL, 0, false, &splug);
if (page)
put_page(page);
rcu_read_lock();
}
rcu_read_unlock();
+ swap_read_unplug(splug);
lru_add_drain(); /* Push any new pages onto the LRU now */
}
@@ -432,12 +437,12 @@ regular_page:
if (split_huge_page(page)) {
unlock_page(page);
put_page(page);
- pte_offset_map_lock(mm, pmd, addr, &ptl);
+ orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
break;
}
unlock_page(page);
put_page(page);
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
continue;
@@ -648,12 +653,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (split_huge_page(page)) {
unlock_page(page);
put_page(page);
- pte_offset_map_lock(mm, pmd, addr, &ptl);
+ orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
goto out;
}
unlock_page(page);
put_page(page);
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
continue;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 598fece89e2b..abec50f31fe6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -67,6 +67,7 @@
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"
+#include "swap.h"
#include <linux/uaccess.h>
@@ -89,7 +90,7 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-bool cgroup_memory_noswap __ro_after_init;
+static bool cgroup_memory_noswap __ro_after_init;
#else
#define cgroup_memory_noswap 1
#endif
@@ -209,7 +210,6 @@ static struct move_charge_struct {
enum res_type {
_MEM,
_MEMSWAP,
- _OOM_TYPE,
_KMEM,
_TCP,
};
@@ -217,8 +217,6 @@ enum res_type {
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-/* Used for OOM notifier */
-#define OOM_CONTROL (0)
/*
* Iteration constructs for visiting all cgroups (under a tree). If
@@ -1013,9 +1011,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (!root)
root = root_mem_cgroup;
- if (prev && !reclaim)
- pos = prev;
-
rcu_read_lock();
if (reclaim) {
@@ -1024,7 +1019,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
mz = root->nodeinfo[reclaim->pgdat->node_id];
iter = &mz->iter;
- if (prev && reclaim->generation != iter->generation)
+ /*
+ * On start, join the current reclaim iteration cycle.
+ * Exit when a concurrent walker completes it.
+ */
+ if (!prev)
+ reclaim->generation = iter->generation;
+ else if (reclaim->generation != iter->generation)
goto out_unlock;
while (1) {
@@ -1041,6 +1042,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
*/
(void)cmpxchg(&iter->position, pos, NULL);
}
+ } else if (prev) {
+ pos = prev;
}
if (pos)
@@ -1065,15 +1068,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
* is provided by the caller, so we know it's alive
* and kicking, and don't take an extra reference.
*/
- memcg = mem_cgroup_from_css(css);
-
- if (css == &root->css)
- break;
-
- if (css_tryget(css))
+ if (css == &root->css || css_tryget(css)) {
+ memcg = mem_cgroup_from_css(css);
break;
-
- memcg = NULL;
+ }
}
if (reclaim) {
@@ -1089,8 +1087,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (!memcg)
iter->generation++;
- else if (!prev)
- reclaim->generation = iter->generation;
}
out_unlock:
@@ -1402,6 +1398,10 @@ static const struct memory_stat memory_stats[] = {
{ "sock", MEMCG_SOCK },
{ "vmalloc", MEMCG_VMALLOC },
{ "shmem", NR_SHMEM },
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ { "zswap", MEMCG_ZSWAP_B },
+ { "zswapped", MEMCG_ZSWAPPED },
+#endif
{ "file_mapped", NR_FILE_MAPPED },
{ "file_dirty", NR_FILE_DIRTY },
{ "file_writeback", NR_WRITEBACK },
@@ -1436,6 +1436,7 @@ static int memcg_page_state_unit(int item)
{
switch (item) {
case MEMCG_PERCPU_B:
+ case MEMCG_ZSWAP_B:
case NR_SLAB_RECLAIMABLE_B:
case NR_SLAB_UNRECLAIMABLE_B:
case WORKINGSET_REFAULT_ANON:
@@ -1516,6 +1517,13 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
memcg_events(memcg, PGLAZYFREED));
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN),
+ memcg_events(memcg, ZSWPIN));
+ seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT),
+ memcg_events(memcg, ZSWPOUT));
+#endif
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
memcg_events(memcg, THP_FAULT_ALLOC));
@@ -2887,6 +2895,19 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
return page_memcg_check(folio_page(folio, 0));
}
+static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
+{
+ struct obj_cgroup *objcg = NULL;
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ objcg = NULL;
+ }
+ return objcg;
+}
+
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
struct obj_cgroup *objcg = NULL;
@@ -2900,15 +2921,32 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
memcg = active_memcg();
else
memcg = mem_cgroup_from_task(current);
-
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
- objcg = rcu_dereference(memcg->objcg);
- if (objcg && obj_cgroup_tryget(objcg))
- break;
- objcg = NULL;
- }
+ objcg = __get_obj_cgroup_from_memcg(memcg);
rcu_read_unlock();
+ return objcg;
+}
+
+struct obj_cgroup *get_obj_cgroup_from_page(struct page *page)
+{
+ struct obj_cgroup *objcg;
+
+ if (!memcg_kmem_enabled() || memcg_kmem_bypass())
+ return NULL;
+
+ if (PageMemcgKmem(page)) {
+ objcg = __folio_objcg(page_folio(page));
+ obj_cgroup_get(objcg);
+ } else {
+ struct mem_cgroup *memcg;
+ rcu_read_lock();
+ memcg = __folio_memcg(page_folio(page));
+ if (memcg)
+ objcg = __get_obj_cgroup_from_memcg(memcg);
+ else
+ objcg = NULL;
+ rcu_read_unlock();
+ }
return objcg;
}
@@ -3387,7 +3425,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
int loop = 0;
struct mem_cgroup_tree_per_node *mctz;
unsigned long excess;
- unsigned long nr_scanned;
if (order > 0)
return 0;
@@ -3415,13 +3452,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
if (!mz)
break;
- nr_scanned = 0;
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
- gfp_mask, &nr_scanned);
+ gfp_mask, total_scanned);
nr_reclaimed += reclaimed;
- *total_scanned += nr_scanned;
spin_lock_irq(&mctz->lock);
- __mem_cgroup_remove_exceeded(mz, mctz);
/*
* If we failed to reclaim anything from this memory cgroup
@@ -4893,7 +4927,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.name = "oom_control",
.seq_show = mem_cgroup_oom_control_read,
.write_u64 = mem_cgroup_oom_control_write,
- .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
},
{
.name = "pressure_level",
@@ -5151,6 +5184,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ memcg->zswap_max = PAGE_COUNTER_MAX;
+#endif
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
@@ -5649,10 +5685,14 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
if (pte_present(ptent))
page = mc_handle_present_pte(vma, addr, ptent);
+ else if (pte_none_mostly(ptent))
+ /*
+ * PTE markers should be treated as a none pte here, separated
+ * from other swap handling below.
+ */
+ page = mc_handle_file_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
- else if (pte_none(ptent))
- page = mc_handle_file_pte(vma, addr, ptent);
if (!page && !ent.val)
return ret;
@@ -6108,6 +6148,14 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
+static u64 memory_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ return (u64)memcg->memory.watermark * PAGE_SIZE;
+}
+
static int memory_min_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -6365,6 +6413,46 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
+static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "", &nr_to_reclaim);
+ if (err)
+ return err;
+
+ while (nr_reclaimed < nr_to_reclaim) {
+ unsigned long reclaimed;
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ /*
+ * This is the final attempt, drain percpu lru caches in the
+ * hope of introducing more evictable pages for
+ * try_to_free_mem_cgroup_pages().
+ */
+ if (!nr_retries)
+ lru_add_drain_all();
+
+ reclaimed = try_to_free_mem_cgroup_pages(memcg,
+ nr_to_reclaim - nr_reclaimed,
+ GFP_KERNEL, true);
+
+ if (!reclaimed && !nr_retries--)
+ return -EAGAIN;
+
+ nr_reclaimed += reclaimed;
+ }
+
+ return nbytes;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -6372,6 +6460,11 @@ static struct cftype memory_files[] = {
.read_u64 = memory_current_read,
},
{
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = memory_peak_read,
+ },
+ {
.name = "min",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_min_show,
@@ -6423,6 +6516,11 @@ static struct cftype memory_files[] = {
.seq_show = memory_oom_group_show,
.write = memory_oom_group_write,
},
+ {
+ .name = "reclaim",
+ .flags = CFTYPE_NS_DELEGATABLE,
+ .write = memory_reclaim,
+ },
{ } /* terminate */
};
@@ -6593,9 +6691,6 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
return;
parent = parent_mem_cgroup(memcg);
- /* No parent means a non-hierarchical mode on v1 memcg */
- if (!parent)
- return;
if (parent == root) {
memcg->memory.emin = READ_ONCE(memcg->memory.min);
@@ -7125,17 +7220,17 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
}
/**
- * __mem_cgroup_try_charge_swap - try charging swap space for a page
- * @page: page being added to swap
+ * __mem_cgroup_try_charge_swap - try charging swap space for a folio
+ * @folio: folio being added to swap
* @entry: swap entry to charge
*
- * Try to charge @page's memcg for the swap space at @entry.
+ * Try to charge @folio's memcg for the swap space at @entry.
*
* Returns 0 on success, -ENOMEM on failure.
*/
-int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
{
- unsigned int nr_pages = thp_nr_pages(page);
+ unsigned int nr_pages = folio_nr_pages(folio);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
@@ -7143,9 +7238,9 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
- VM_WARN_ON_ONCE_PAGE(!memcg, page);
+ VM_WARN_ON_ONCE_FOLIO(!memcg, folio);
if (!memcg)
return 0;
@@ -7168,7 +7263,7 @@ int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
if (nr_pages > 1)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
- VM_BUG_ON_PAGE(oldid, page);
+ VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
return 0;
@@ -7371,6 +7466,148 @@ static struct cftype memsw_files[] = {
{ }, /* terminate */
};
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+/**
+ * obj_cgroup_may_zswap - check if this cgroup can zswap
+ * @objcg: the object cgroup
+ *
+ * Check if the hierarchical zswap limit has been reached.
+ *
+ * This doesn't check for specific headroom, and it is not atomic
+ * either. But with zswap, the size of the allocation is only known
+ * once compression has occured, and this optimistic pre-check avoids
+ * spending cycles on compression when there is already no room left
+ * or zswap is disabled altogether somewhere in the hierarchy.
+ */
+bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
+{
+ struct mem_cgroup *memcg, *original_memcg;
+ bool ret = true;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return true;
+
+ original_memcg = get_mem_cgroup_from_objcg(objcg);
+ for (memcg = original_memcg; memcg != root_mem_cgroup;
+ memcg = parent_mem_cgroup(memcg)) {
+ unsigned long max = READ_ONCE(memcg->zswap_max);
+ unsigned long pages;
+
+ if (max == PAGE_COUNTER_MAX)
+ continue;
+ if (max == 0) {
+ ret = false;
+ break;
+ }
+
+ cgroup_rstat_flush(memcg->css.cgroup);
+ pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE;
+ if (pages < max)
+ continue;
+ ret = false;
+ break;
+ }
+ mem_cgroup_put(original_memcg);
+ return ret;
+}
+
+/**
+ * obj_cgroup_charge_zswap - charge compression backend memory
+ * @objcg: the object cgroup
+ * @size: size of compressed object
+ *
+ * This forces the charge after obj_cgroup_may_swap() allowed
+ * compression and storage in zwap for this cgroup to go ahead.
+ */
+void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size)
+{
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ VM_WARN_ON_ONCE(!(current->flags & PF_MEMALLOC));
+
+ /* PF_MEMALLOC context, charging must succeed */
+ if (obj_cgroup_charge(objcg, GFP_KERNEL, size))
+ VM_WARN_ON_ONCE(1);
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ mod_memcg_state(memcg, MEMCG_ZSWAP_B, size);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED, 1);
+ rcu_read_unlock();
+}
+
+/**
+ * obj_cgroup_uncharge_zswap - uncharge compression backend memory
+ * @objcg: the object cgroup
+ * @size: size of compressed object
+ *
+ * Uncharges zswap memory on page in.
+ */
+void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size)
+{
+ struct mem_cgroup *memcg;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return;
+
+ obj_cgroup_uncharge(objcg, size);
+
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ mod_memcg_state(memcg, MEMCG_ZSWAP_B, -size);
+ mod_memcg_state(memcg, MEMCG_ZSWAPPED, -1);
+ rcu_read_unlock();
+}
+
+static u64 zswap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ cgroup_rstat_flush(css->cgroup);
+ return memcg_page_state(mem_cgroup_from_css(css), MEMCG_ZSWAP_B);
+}
+
+static int zswap_max_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->zswap_max));
+}
+
+static ssize_t zswap_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ xchg(&memcg->zswap_max, max);
+
+ return nbytes;
+}
+
+static struct cftype zswap_files[] = {
+ {
+ .name = "zswap.current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = zswap_current_read,
+ },
+ {
+ .name = "zswap.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = zswap_max_show,
+ .write = zswap_max_write,
+ },
+ { } /* terminate */
+};
+#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
+
/*
* If mem_cgroup_swap_init() is implemented as a subsys_initcall()
* instead of a core_initcall(), this could mean cgroup_memory_noswap still
@@ -7389,7 +7626,9 @@ static int __init mem_cgroup_swap_init(void)
WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
-
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
+#endif
return 0;
}
core_initcall(mem_cgroup_swap_init);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d4a4adcca01f..b85661cbdc4a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -59,6 +59,7 @@
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"
@@ -484,7 +485,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct anon_vma *av;
pgoff_t pgoff;
- av = folio_lock_anon_vma_read(folio);
+ av = folio_lock_anon_vma_read(folio, NULL);
if (av == NULL) /* Not actually mapped anymore */
return;
@@ -622,7 +623,7 @@ static int check_hwpoisoned_pmd_entry(pmd_t *pmdp, unsigned long addr,
static int hwpoison_pte_range(pmd_t *pmdp, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+ struct hwp_walk *hwp = walk->private;
int ret = 0;
pte_t *ptep, *mapped_pte;
spinlock_t *ptl;
@@ -656,7 +657,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct hwp_walk *hwp = (struct hwp_walk *)walk->private;
+ struct hwp_walk *hwp = walk->private;
pte_t pte = huge_ptep_get(ptep);
struct hstate *h = hstate_vma(walk->vma);
@@ -733,7 +734,6 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
- [MF_MSG_DIFFERENT_PAGE_SIZE] = "different page size",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -1041,12 +1041,11 @@ static int me_huge_page(struct page_state *ps, struct page *p)
res = MF_FAILED;
unlock_page(hpage);
/*
- * migration entry prevents later access on error anonymous
- * hugepage, so we can free and dissolve it into buddy to
- * save healthy subpages.
+ * migration entry prevents later access on error hugepage,
+ * so we can free and dissolve it into buddy to save healthy
+ * subpages.
*/
- if (PageAnon(hpage))
- put_page(hpage);
+ put_page(hpage);
if (__page_handle_poison(p)) {
page_ref_inc(p);
res = MF_RECOVERED;
@@ -1133,6 +1132,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
{
trace_memory_failure_event(pfn, type, result);
+ num_poisoned_pages_inc();
pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
}
@@ -1179,13 +1179,11 @@ void ClearPageHWPoisonTakenOff(struct page *page)
*/
static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
{
- bool movable = false;
-
- /* Soft offline could mirgate non-LRU movable pages */
+ /* Soft offline could migrate non-LRU movable pages */
if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
- movable = true;
+ return true;
- return movable || PageLRU(page) || is_free_buddy_page(page);
+ return PageLRU(page) || is_free_buddy_page(page);
}
static int __get_hwpoison_page(struct page *page, unsigned long flags)
@@ -1521,7 +1519,9 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
if (flags & MF_COUNT_INCREASED) {
ret = 1;
count_increased = true;
- } else if (HPageFreed(head) || HPageMigratable(head)) {
+ } else if (HPageFreed(head)) {
+ ret = 0;
+ } else if (HPageMigratable(head)) {
ret = get_page_unless_zero(head);
if (ret)
count_increased = true;
@@ -1588,8 +1588,6 @@ retry:
goto out;
}
- num_poisoned_pages_inc();
-
/*
* Handling free hugepage. The possible race with hugepage allocation
* or demotion can be prevented by PageHWPoison flag.
@@ -1605,16 +1603,6 @@ retry:
return res == MF_RECOVERED ? 0 : -EBUSY;
}
- /*
- * The page could have changed compound pages due to race window.
- * If this happens just bail out.
- */
- if (!PageHuge(p) || compound_head(p) != head) {
- action_result(pfn, MF_MSG_DIFFERENT_PAGE_SIZE, MF_IGNORED);
- res = -EBUSY;
- goto out;
- }
-
page_flags = head->flags;
/*
@@ -1762,7 +1750,7 @@ static DEFINE_MUTEX(mf_mutex);
* enabled and no spinlocks hold.
*
* Return: 0 for successfully handled the memory error,
- * -EOPNOTSUPP for memory_filter() filtered the error event,
+ * -EOPNOTSUPP for hwpoison_filter() filtered the error event,
* < 0(except -EOPNOTSUPP) on failure.
*/
int memory_failure(unsigned long pfn, int flags)
@@ -1811,11 +1799,12 @@ try_again:
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
+ if (flags & MF_COUNT_INCREASED)
+ put_page(p);
goto unlock_mutex;
}
hpage = compound_head(p);
- num_poisoned_pages_inc();
/*
* We need/can do nothing about count=0 pages.
@@ -1839,7 +1828,6 @@ try_again:
/* We lost the race, try again */
if (retry) {
ClearPageHWPoison(p);
- num_poisoned_pages_dec();
retry = false;
goto try_again;
}
@@ -1902,8 +1890,7 @@ try_again:
*/
if (PageCompound(p)) {
if (retry) {
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
+ ClearPageHWPoison(p);
unlock_page(p);
put_page(p);
flags &= ~MF_COUNT_INCREASED;
@@ -1925,8 +1912,7 @@ try_again:
page_flags = p->flags;
if (hwpoison_filter(p)) {
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
+ TestClearPageHWPoison(p);
unlock_page(p);
put_page(p);
res = -EOPNOTSUPP;
@@ -2088,28 +2074,6 @@ core_initcall(memory_failure_init);
pr_info(fmt, pfn); \
})
-static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p)
-{
- if (TestClearPageHWPoison(p)) {
- unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
- page_to_pfn(p), rs);
- num_poisoned_pages_dec();
- return 1;
- }
- return 0;
-}
-
-static inline int unpoison_taken_off_page(struct ratelimit_state *rs,
- struct page *p)
-{
- if (put_page_back_buddy(p)) {
- unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
- page_to_pfn(p), rs);
- return 0;
- }
- return -EBUSY;
-}
-
/**
* unpoison_memory - Unpoison a previously poisoned page
* @pfn: Page number of the to be unpoisoned page
@@ -2127,6 +2091,7 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int ret = -EBUSY;
+ int freeit = 0;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -2167,18 +2132,15 @@ int unpoison_memory(unsigned long pfn)
ret = get_hwpoison_page(p, MF_UNPOISON);
if (!ret) {
- if (clear_page_hwpoison(&unpoison_rs, page))
- ret = 0;
- else
- ret = -EBUSY;
+ ret = TestClearPageHWPoison(page) ? 0 : -EBUSY;
} else if (ret < 0) {
if (ret == -EHWPOISON) {
- ret = unpoison_taken_off_page(&unpoison_rs, p);
+ ret = put_page_back_buddy(p) ? 0 : -EBUSY;
} else
unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
pfn, &unpoison_rs);
} else {
- int freeit = clear_page_hwpoison(&unpoison_rs, p);
+ freeit = !!TestClearPageHWPoison(p);
put_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
@@ -2189,6 +2151,11 @@ int unpoison_memory(unsigned long pfn)
unlock_mutex:
mutex_unlock(&mf_mutex);
+ if (!ret || freeit) {
+ num_poisoned_pages_dec();
+ unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
+ page_to_pfn(p), &unpoison_rs);
+ }
return ret;
}
EXPORT_SYMBOL(unpoison_memory);
@@ -2323,7 +2290,9 @@ static void put_ref_page(struct page *page)
* @pfn: pfn to soft-offline
* @flags: flags. Same as memory_failure().
*
- * Returns 0 on success, otherwise negated errno.
+ * Returns 0 on success
+ * -EOPNOTSUPP for hwpoison_filter() filtered the error event
+ * < 0 otherwise negated errno.
*
* Soft offline a page, by migration or invalidation,
* without killing anything. This is for the case when
@@ -2374,6 +2343,16 @@ retry:
ret = get_hwpoison_page(page, flags | MF_SOFT_OFFLINE);
put_online_mems();
+ if (hwpoison_filter(page)) {
+ if (ret > 0)
+ put_page(page);
+ else
+ put_ref_page(ref_page);
+
+ mutex_unlock(&mf_mutex);
+ return -EOPNOTSUPP;
+ }
+
if (ret > 0) {
ret = soft_offline_in_use_page(page);
} else if (ret == 0) {
@@ -2388,3 +2367,24 @@ retry:
return ret;
}
+
+void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
+{
+ int i;
+
+ /*
+ * A further optimization is to have per section refcounted
+ * num_poisoned_pages. But that would need more space per memmap, so
+ * for now just do a quick global check to speed up this routine in the
+ * absence of bad pages.
+ */
+ if (atomic_long_read(&num_poisoned_pages) == 0)
+ return;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (PageHWPoison(&memmap[i])) {
+ num_poisoned_pages_dec();
+ ClearPageHWPoison(&memmap[i]);
+ }
+ }
+}
diff --git a/mm/memory.c b/mm/memory.c
index 2a12028a3749..54bcd5327b74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -86,6 +86,7 @@
#include "pgalloc-track.h"
#include "internal.h"
+#include "swap.h"
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
@@ -99,6 +100,8 @@ struct page *mem_map;
EXPORT_SYMBOL(mem_map);
#endif
+static vm_fault_t do_fault(struct vm_fault *vmf);
+
/*
* A number of key systems in x86 including ioremap() rely on the assumption
* that high_memory defines the upper bound on direct map memory, then end
@@ -720,12 +723,14 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ VM_BUG_ON(pte_write(pte) && !(PageAnon(page) && PageAnonExclusive(page)));
+
/*
* No need to take a page reference as one was already
* created when the swap entry was made.
*/
if (PageAnon(page))
- page_add_anon_rmap(page, vma, address, false);
+ page_add_anon_rmap(page, vma, address, RMAP_NONE);
else
/*
* Currently device exclusive access only supports anonymous
@@ -790,17 +795,23 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
&src_mm->mmlist);
spin_unlock(&mmlist_lock);
}
+ /* Mark the swap entry as shared. */
+ if (pte_swp_exclusive(*src_pte)) {
+ pte = pte_swp_clear_exclusive(*src_pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
page = pfn_swap_entry_to_page(entry);
rss[mm_counter(page)]++;
- if (is_writable_migration_entry(entry) &&
+ if (!is_readable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
- * COW mappings require pages in both
- * parent and child to be set to read.
+ * COW mappings require pages in both parent and child
+ * to be set to read. A previously exclusive entry is
+ * now shared.
*/
entry = make_readable_migration_entry(
swp_offset(entry));
@@ -825,7 +836,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
*/
get_page(page);
rss[mm_counter(page)]++;
- page_dup_rmap(page, false);
+ /* Cannot fail as these pages cannot get pinned. */
+ BUG_ON(page_try_dup_anon_rmap(page, false, src_vma));
/*
* We do not preserve soft-dirty information, because so
@@ -854,6 +866,14 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (try_restore_exclusive_pte(src_pte, src_vma, addr))
return -EBUSY;
return -ENOENT;
+ } else if (is_pte_marker_entry(entry)) {
+ /*
+ * We're copying the pgtable should only because dst_vma has
+ * uffd-wp enabled, do sanity check.
+ */
+ WARN_ON_ONCE(!userfaultfd_wp(dst_vma));
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
}
if (!userfaultfd_wp(dst_vma))
pte = pte_swp_clear_uffd_wp(pte);
@@ -862,19 +882,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
/*
- * Copy a present and normal page if necessary.
+ * Copy a present and normal page.
*
- * NOTE! The usual case is that this doesn't need to do
- * anything, and can just return a positive value. That
- * will let the caller know that it can just increase
- * the page refcount and re-use the pte the traditional
- * way.
- *
- * But _if_ we need to copy it because it needs to be
- * pinned in the parent (and the child should get its own
- * copy rather than just a reference to the same page),
- * we'll do that here and return zero to let the caller
- * know we're done.
+ * NOTE! The usual case is that this isn't required;
+ * instead, the caller can just increase the page refcount
+ * and re-use the pte the traditional way.
*
* And if we need a pre-allocated page but don't yet have
* one, return a negative error to let the preallocation
@@ -884,25 +896,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct page **prealloc, pte_t pte, struct page *page)
+ struct page **prealloc, struct page *page)
{
struct page *new_page;
-
- /*
- * What we want to do is to check whether this page may
- * have been pinned by the parent process. If so,
- * instead of wrprotect the pte on both sides, we copy
- * the page immediately so that we'll always guarantee
- * the pinned page won't be randomly replaced in the
- * future.
- *
- * The page pinning checks are just "has this mm ever
- * seen pinning", along with the (inexact) check of
- * the page count. That might give false positives for
- * for pinning, but it will work correctly.
- */
- if (likely(!page_needs_cow_for_dma(src_vma, page)))
- return 1;
+ pte_t pte;
new_page = *prealloc;
if (!new_page)
@@ -915,7 +912,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
*prealloc = NULL;
copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
- page_add_new_anon_rmap(new_page, dst_vma, addr, false);
+ page_add_new_anon_rmap(new_page, dst_vma, addr);
lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
rss[mm_counter(new_page)]++;
@@ -944,16 +941,24 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
struct page *page;
page = vm_normal_page(src_vma, addr, pte);
- if (page) {
- int retval;
-
- retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, prealloc, pte, page);
- if (retval <= 0)
- return retval;
-
+ if (page && PageAnon(page)) {
+ /*
+ * If this page may have been pinned by the parent process,
+ * copy the page immediately for the child so that we'll always
+ * guarantee the pinned page won't be randomly replaced in the
+ * future.
+ */
get_page(page);
- page_dup_rmap(page, false);
+ if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
+ /* Page maybe pinned, we have to copy. */
+ put_page(page);
+ return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, page);
+ }
+ rss[mm_counter(page)]++;
+ } else if (page) {
+ get_page(page);
+ page_dup_file_rmap(page, false);
rss[mm_counter(page)]++;
}
@@ -965,6 +970,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
ptep_set_wrprotect(src_mm, addr, src_pte);
pte = pte_wrprotect(pte);
}
+ VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page));
/*
* If it's a shared mapping, mark it clean in
@@ -1222,6 +1228,38 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
return 0;
}
+/*
+ * Return true if the vma needs to copy the pgtable during this fork(). Return
+ * false when we can speed up fork() by allowing lazy page faults later until
+ * when the child accesses the memory range.
+ */
+static bool
+vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
+{
+ /*
+ * Always copy pgtables when dst_vma has uffd-wp enabled even if it's
+ * file-backed (e.g. shmem). Because when uffd-wp is enabled, pgtable
+ * contains uffd-wp protection information, that's something we can't
+ * retrieve from page cache, and skip copying will lose those info.
+ */
+ if (userfaultfd_wp(dst_vma))
+ return true;
+
+ if (src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP))
+ return true;
+
+ if (src_vma->anon_vma)
+ return true;
+
+ /*
+ * Don't copy ptes where a page fault will fill them correctly. Fork
+ * becomes much lighter when there are big shared or private readonly
+ * mappings. The tradeoff is that copy_page_range is more efficient
+ * than faulting.
+ */
+ return false;
+}
+
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
@@ -1235,18 +1273,11 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
bool is_cow;
int ret;
- /*
- * Don't copy ptes where a page fault will fill them correctly.
- * Fork becomes much lighter when there are big shared or private
- * readonly mappings. The tradeoff is that copy_page_range is more
- * efficient than faulting.
- */
- if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
- !src_vma->anon_vma)
+ if (!vma_needs_copy(dst_vma, src_vma))
return 0;
if (is_vm_hugetlb_page(src_vma))
- return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
+ return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
/*
@@ -1308,6 +1339,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
struct zap_details {
struct folio *single_folio; /* Locked folio to be unmapped */
bool even_cows; /* Zap COWed private pages too? */
+ zap_flags_t zap_flags; /* Extra flags for zapping */
};
/* Whether we should zap all COWed (private) pages too */
@@ -1336,6 +1368,29 @@ static inline bool should_zap_page(struct zap_details *details, struct page *pag
return !PageAnon(page);
}
+static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
+{
+ if (!details)
+ return false;
+
+ return details->zap_flags & ZAP_FLAG_DROP_MARKER;
+}
+
+/*
+ * This function makes sure that we'll replace the none pte with an uffd-wp
+ * swap special pte marker when necessary. Must be with the pgtable lock held.
+ */
+static inline void
+zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte,
+ struct zap_details *details, pte_t pteval)
+{
+ if (zap_drop_file_uffd_wp(details))
+ return;
+
+ pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1373,6 +1428,8 @@ again:
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, details,
+ ptent);
if (unlikely(!page))
continue;
@@ -1403,6 +1460,13 @@ again:
page = pfn_swap_entry_to_page(entry);
if (unlikely(!should_zap_page(details, page)))
continue;
+ /*
+ * Both device private/exclusive mappings should only
+ * work with anonymous page so far, so we don't need to
+ * consider uffd-wp bit when zap. For more information,
+ * see zap_install_uffd_wp_if_needed().
+ */
+ WARN_ON_ONCE(!vma_is_anonymous(vma));
rss[mm_counter(page)]--;
if (is_device_private_entry(entry))
page_remove_rmap(page, vma, false);
@@ -1419,6 +1483,10 @@ again:
if (!should_zap_page(details, page))
continue;
rss[mm_counter(page)]--;
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /* Only drop the uffd-wp marker if explicitly requested */
+ if (!zap_drop_file_uffd_wp(details))
+ continue;
} else if (is_hwpoison_entry(entry)) {
if (!should_zap_cows(details))
continue;
@@ -1427,6 +1495,7 @@ again:
WARN_ON_ONCE(1);
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
} while (pte++, addr += PAGE_SIZE, addr != end);
add_mm_rss_vec(mm, rss);
@@ -1605,8 +1674,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* safe to do nothing in this case.
*/
if (vma->vm_file) {
+ zap_flags_t zap_flags = details ?
+ details->zap_flags : 0;
i_mmap_lock_write(vma->vm_file->f_mapping);
- __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
+ __unmap_hugepage_range_final(tlb, vma, start, end,
+ NULL, zap_flags);
i_mmap_unlock_write(vma->vm_file->f_mapping);
}
} else
@@ -1637,12 +1709,17 @@ void unmap_vmas(struct mmu_gather *tlb,
unsigned long end_addr)
{
struct mmu_notifier_range range;
+ struct zap_details details = {
+ .zap_flags = ZAP_FLAG_DROP_MARKER,
+ /* Careful - we need to zap private pages too! */
+ .even_cows = true,
+ };
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
- unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
+ unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
mmu_notifier_invalidate_range_end(&range);
}
@@ -2755,8 +2832,8 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
return same;
}
-static inline bool cow_user_page(struct page *dst, struct page *src,
- struct vm_fault *vmf)
+static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
{
bool ret;
void *kaddr;
@@ -2963,6 +3040,10 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct page *page = vmf->page;
pte_t entry;
+
+ VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
+ VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page));
+
/*
* Clear the pages cpupid information as the existing
* information potentially belongs to a now completely
@@ -2981,7 +3062,8 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
}
/*
- * Handle the case of a page which we actually need to copy to a new page.
+ * Handle the case of a page which we actually need to copy to a new page,
+ * either due to COW or unsharing.
*
* Called with mmap_lock locked and the old page referenced, but
* without the ptl held.
@@ -2998,6 +3080,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
*/
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct page *old_page = vmf->page;
@@ -3020,7 +3103,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
if (!new_page)
goto oom;
- if (!cow_user_page(new_page, old_page, vmf)) {
+ if (!__wp_page_copy_user(new_page, old_page, vmf)) {
/*
* COW failed, if the fault was solved by other,
* it's fine. If not, userspace would re-fault on
@@ -3062,7 +3145,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (unlikely(unshare)) {
+ if (pte_soft_dirty(vmf->orig_pte))
+ entry = pte_mksoft_dirty(entry);
+ if (pte_uffd_wp(vmf->orig_pte))
+ entry = pte_mkuffd_wp(entry);
+ } else {
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ }
/*
* Clear the pte entry and flush it first, before updating the
@@ -3072,13 +3162,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* some TLBs while the old PTE remains in others.
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
- page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+ page_add_new_anon_rmap(new_page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
* new page to be mapped directly into the secondary page table.
*/
+ BUG_ON(unshare && pte_write(entry));
set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
update_mmu_cache(vma, vmf->address, vmf->pte);
if (old_page) {
@@ -3128,7 +3219,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
free_swap_cache(old_page);
put_page(old_page);
}
- return page_copied ? VM_FAULT_WRITE : 0;
+ return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
oom_free_new:
put_page(new_page);
oom:
@@ -3228,18 +3319,22 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
}
/*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
+ * This routine handles present pages, when
+ * * users try to write to a shared page (FAULT_FLAG_WRITE)
+ * * GUP wants to take a R/O pin on a possibly shared anonymous page
+ * (FAULT_FLAG_UNSHARE)
+ *
+ * It is done by copying the page to a new address and decrementing the
+ * shared-page counter for the old page.
*
* Note that this routine assumes that the protection checks have been
* done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
+ * done any necessary COW.
*
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
+ * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
+ * though the page will change only once the write actually happens. This
+ * avoids a few races, and potentially makes it more efficient.
*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), with pte both mapped and locked.
@@ -3248,23 +3343,35 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
- if (userfaultfd_pte_wp(vma, *vmf->pte)) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return handle_userfault(vmf, VM_UFFD_WP);
- }
+ VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
+ VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
- /*
- * Userfaultfd write-protect can defer flushes. Ensure the TLB
- * is flushed in this case before copying.
- */
- if (unlikely(userfaultfd_wp(vmf->vma) &&
- mm_tlb_flush_pending(vmf->vma->vm_mm)))
- flush_tlb_page(vmf->vma, vmf->address);
+ if (likely(!unshare)) {
+ if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_WP);
+ }
+
+ /*
+ * Userfaultfd write-protect can defer flushes. Ensure the TLB
+ * is flushed in this case before copying.
+ */
+ if (unlikely(userfaultfd_wp(vmf->vma) &&
+ mm_tlb_flush_pending(vmf->vma->vm_mm)))
+ flush_tlb_page(vmf->vma, vmf->address);
+ }
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
+ if (unlikely(unshare)) {
+ /* No anonymous page -> nothing to do. */
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
+
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
@@ -3288,6 +3395,13 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
struct page *page = vmf->page;
/*
+ * If the page is exclusive to this process we must reuse the
+ * page without further checks.
+ */
+ if (PageAnonExclusive(page))
+ goto reuse;
+
+ /*
* We have to verify under page lock: these early checks are
* just an optimization to avoid locking the page and freeing
* the swapcache if there is little hope that we can reuse.
@@ -3317,9 +3431,19 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* and the page is locked, it's dark out, and we're wearing
* sunglasses. Hit it.
*/
+ page_move_anon_rmap(page, vma);
unlock_page(page);
+reuse:
+ if (unlikely(unshare)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
wp_page_reuse(vmf);
return VM_FAULT_WRITE;
+ } else if (unshare) {
+ /* No anonymous page -> nothing to do. */
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
@@ -3331,6 +3455,10 @@ copy:
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
+#ifdef CONFIG_KSM
+ if (PageKsm(vmf->page))
+ count_vm_event(COW_KSM);
+#endif
return wp_page_copy(vmf);
}
@@ -3387,6 +3515,7 @@ void unmap_mapping_folio(struct folio *folio)
details.even_cows = false;
details.single_folio = folio;
+ details.zap_flags = ZAP_FLAG_DROP_MARKER;
i_mmap_lock_read(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
@@ -3508,6 +3637,59 @@ static inline bool should_try_to_free_swap(struct page *page,
page_count(page) == 2;
}
+static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
+{
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ /*
+ * Be careful so that we will only recover a special uffd-wp pte into a
+ * none pte. Otherwise it means the pte could have changed, so retry.
+ */
+ if (is_pte_marker(*vmf->pte))
+ pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+}
+
+/*
+ * This is actually a page-missing access, but with uffd-wp special pte
+ * installed. It means this pte was wr-protected before being unmapped.
+ */
+static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
+{
+ /*
+ * Just in case there're leftover special ptes even after the region
+ * got unregistered - we can simply clear them. We can also do that
+ * proactively when e.g. when we do UFFDIO_UNREGISTER upon some uffd-wp
+ * ranges, but it should be more efficient to be done lazily here.
+ */
+ if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+ return pte_marker_clear(vmf);
+
+ /* do_fault() can handle pte markers too like none pte */
+ return do_fault(vmf);
+}
+
+static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
+{
+ swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
+ unsigned long marker = pte_marker_get(entry);
+
+ /*
+ * PTE markers should always be with file-backed memories, and the
+ * marker should never be empty. If anything weird happened, the best
+ * thing to do is to kill the process along with its mm.
+ */
+ if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
+ return VM_FAULT_SIGBUS;
+
+ if (pte_marker_entry_uffd_wp(entry))
+ return pte_marker_handle_uffd_wp(vmf);
+
+ /* This is an unknown pte marker */
+ return VM_FAULT_SIGBUS;
+}
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -3521,10 +3703,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
struct swap_info_struct *si = NULL;
+ rmap_t rmap_flags = RMAP_NONE;
+ bool exclusive = false;
swp_entry_t entry;
pte_t pte;
int locked;
- int exclusive = 0;
vm_fault_t ret = 0;
void *shadow = NULL;
@@ -3544,6 +3727,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
+ } else if (is_pte_marker_entry(entry)) {
+ ret = handle_pte_marker(vmf);
} else {
print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
ret = VM_FAULT_SIGBUS;
@@ -3585,7 +3770,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* To provide entry to swap_readpage() */
set_page_private(page, entry.val);
- swap_readpage(page, true);
+ swap_readpage(page, true, NULL);
set_page_private(page, 0);
}
} else {
@@ -3677,6 +3862,57 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
/*
+ * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
+ * must never point at an anonymous page in the swapcache that is
+ * PG_anon_exclusive. Sanity check that this holds and especially, that
+ * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
+ * check after taking the PT lock and making sure that nobody
+ * concurrently faulted in this page and set PG_anon_exclusive.
+ */
+ BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
+ BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+
+ /*
+ * Check under PT lock (to protect against concurrent fork() sharing
+ * the swap entry concurrently) for certainly exclusive pages.
+ */
+ if (!PageKsm(page)) {
+ /*
+ * Note that pte_swp_exclusive() == false for architectures
+ * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE.
+ */
+ exclusive = pte_swp_exclusive(vmf->orig_pte);
+ if (page != swapcache) {
+ /*
+ * We have a fresh page that is not exposed to the
+ * swapcache -> certainly exclusive.
+ */
+ exclusive = true;
+ } else if (exclusive && PageWriteback(page) &&
+ data_race(si->flags & SWP_STABLE_WRITES)) {
+ /*
+ * This is tricky: not all swap backends support
+ * concurrent page modifications while under writeback.
+ *
+ * So if we stumble over such a page in the swapcache
+ * we must not set the page exclusive, otherwise we can
+ * map it writable without further checks and modify it
+ * while still under writeback.
+ *
+ * For these problematic swap backends, simply drop the
+ * exclusive marker: this is perfectly fine as we start
+ * writeback only if we fully unmapped the page and
+ * there are no unexpected references on the page after
+ * unmapping succeeded. After fully unmapped, no
+ * further GUP references (FOLL_GET and FOLL_PIN) can
+ * appear, so dropping the exclusive marker and mapping
+ * it only R/O is fine.
+ */
+ exclusive = false;
+ }
+ }
+
+ /*
* Remove the swap entry and conditionally try to free up the swapcache.
* We're already holding a reference on the page but haven't mapped it
* yet.
@@ -3690,16 +3926,18 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte = mk_pte(page, vma->vm_page_prot);
/*
- * Same logic as in do_wp_page(); however, optimize for fresh pages
- * that are certainly not shared because we just allocated them without
- * exposing them to the swapcache.
+ * Same logic as in do_wp_page(); however, optimize for pages that are
+ * certainly not shared either because we just allocated them without
+ * exposing them to the swapcache or because the swap entry indicates
+ * exclusivity.
*/
- if ((vmf->flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
- (page != swapcache || page_count(page) == 1)) {
- pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- vmf->flags &= ~FAULT_FLAG_WRITE;
- ret |= VM_FAULT_WRITE;
- exclusive = RMAP_EXCLUSIVE;
+ if (!PageKsm(page) && (exclusive || page_count(page) == 1)) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+ vmf->flags &= ~FAULT_FLAG_WRITE;
+ ret |= VM_FAULT_WRITE;
+ }
+ rmap_flags |= RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(vmf->orig_pte))
@@ -3712,12 +3950,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
- page_add_new_anon_rmap(page, vma, vmf->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
- do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
+ page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
}
+ VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page)));
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
@@ -3862,7 +4101,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, vmf->address, false);
+ page_add_new_anon_rmap(page, vma, vmf->address);
lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -4032,6 +4271,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
+ bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool prefault = vmf->address != addr;
pte_t entry;
@@ -4046,10 +4286,12 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (unlikely(uffd_wp))
+ entry = pte_mkuffd_wp(pte_wrprotect(entry));
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, addr, false);
+ page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
@@ -4058,6 +4300,14 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
}
+static bool vmf_pte_changed(struct vm_fault *vmf)
+{
+ if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
+ return !pte_same(*vmf->pte, vmf->orig_pte);
+
+ return !pte_none(*vmf->pte);
+}
+
/**
* finish_fault - finish page fault once we have prepared the page to fault
*
@@ -4116,7 +4366,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
ret = 0;
/* Re-check under ptl */
- if (likely(pte_none(*vmf->pte)))
+ if (likely(!vmf_pte_changed(vmf)))
do_set_pte(vmf, page, vmf->address);
else
ret = VM_FAULT_NOPAGE;
@@ -4219,9 +4469,21 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
}
+/* Return true if we should do read fault-around, false otherwise */
+static inline bool should_fault_around(struct vm_fault *vmf)
+{
+ /* No ->map_pages? No way to fault around... */
+ if (!vmf->vma->vm_ops->map_pages)
+ return false;
+
+ if (uffd_disable_fault_around(vmf->vma))
+ return false;
+
+ return fault_around_bytes >> PAGE_SHIFT > 1;
+}
+
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
- struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret = 0;
/*
@@ -4229,12 +4491,10 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
- if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- if (likely(!userfaultfd_minor(vmf->vma))) {
- ret = do_fault_around(vmf);
- if (ret)
- return ret;
- }
+ if (should_fault_around(vmf)) {
+ ret = do_fault_around(vmf);
+ if (ret)
+ return ret;
}
ret = __do_fault(vmf);
@@ -4504,8 +4764,11 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
/* `inline' is required to avoid gcc 4.1.2 build error */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
+ const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+
if (vma_is_anonymous(vmf->vma)) {
- if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
+ if (likely(!unshare) &&
+ userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf);
}
@@ -4581,6 +4844,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* concurrent faults and from rmap lookups.
*/
vmf->pte = NULL;
+ vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
} else {
/*
* If a huge pmd materialized under us just retry later. Use
@@ -4604,6 +4868,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
*/
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
vmf->orig_pte = *vmf->pte;
+ vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
/*
* some architectures can have larger ptes than wordsize,
@@ -4640,10 +4905,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
}
- if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!pte_write(entry))
return do_wp_page(vmf);
- entry = pte_mkdirty(entry);
+ else if (likely(vmf->flags & FAULT_FLAG_WRITE))
+ entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
@@ -4684,7 +4950,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
.pgoff = linear_page_index(vma, address),
.gfp_mask = __get_fault_gfp_mask(vma),
};
- unsigned int dirty = flags & FAULT_FLAG_WRITE;
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
p4d_t *p4d;
@@ -4709,9 +4974,11 @@ retry_pud:
barrier();
if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
- /* NUMA case for anonymous PUDs would go here */
-
- if (dirty && !pud_write(orig_pud)) {
+ /*
+ * TODO once we support anonymous PUDs: NUMA case and
+ * FAULT_FLAG_UNSHARE handling.
+ */
+ if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
ret = wp_huge_pud(&vmf, orig_pud);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4749,7 +5016,8 @@ retry_pud:
if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf);
- if (dirty && !pmd_write(vmf.orig_pmd)) {
+ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+ !pmd_write(vmf.orig_pmd)) {
ret = wp_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4949,9 +5217,29 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp)
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4974,35 +5262,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_huge(*pmd)) {
- if (!pmdpp)
- goto out;
-
- if (range) {
- mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
- NULL, mm, address & PMD_MASK,
- (address & PMD_MASK) + PMD_SIZE);
- mmu_notifier_invalidate_range_start(range);
- }
- *ptlp = pmd_lock(mm, pmd);
- if (pmd_huge(*pmd)) {
- *pmdpp = pmd;
- return 0;
- }
- spin_unlock(*ptlp);
- if (range)
- mmu_notifier_invalidate_range_end(range);
- }
-
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
- if (range) {
- mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
- address & PAGE_MASK,
- (address & PAGE_MASK) + PAGE_SIZE);
- mmu_notifier_invalidate_range_start(range);
- }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -5010,38 +5272,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
- if (range)
- mmu_notifier_invalidate_range_end(range);
out:
return -EINVAL;
}
-
-/**
- * follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
- * @address: user virtual address
- * @ptepp: location to store found PTE
- * @ptlp: location to store the lock for the PTE
- *
- * On a successful return, the pointer to the PTE is stored in @ptepp;
- * the corresponding lock is taken and its location is stored in @ptlp.
- * The contents of the PTE are only stable until @ptlp is released;
- * any further use, if any, must be protected against invalidation
- * with MMU notifiers.
- *
- * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
- * should be taken for read.
- *
- * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
- * it is not a good general-purpose API.
- *
- * Return: zero on success, -ve otherwise.
- */
-int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
-{
- return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
-}
EXPORT_SYMBOL_GPL(follow_pte);
/**
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 416b38ca8def..1213d0c67a53 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,14 +42,31 @@
#include "internal.h"
#include "shuffle.h"
+#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
+static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
+{
+ if (hugetlb_optimize_vmemmap_enabled())
+ return 0;
+ return param_set_bool(val, kp);
+}
+
+static const struct kernel_param_ops memmap_on_memory_ops = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = memmap_on_memory_set,
+ .get = param_get_bool,
+};
/*
* memory_hotplug.memmap_on_memory parameter
*/
static bool memmap_on_memory __ro_after_init;
-#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-module_param(memmap_on_memory, bool, 0444);
+module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
+
+bool mhp_memmap_on_memory(void)
+{
+ return memmap_on_memory;
+}
#endif
enum {
@@ -303,7 +320,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
int err;
struct vmem_altmap *altmap = params->altmap;
- if (WARN_ON_ONCE(!params->pgprot.pgprot))
+ if (WARN_ON_ONCE(!pgprot_val(params->pgprot)))
return -EINVAL;
VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
@@ -328,7 +345,8 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
/* Select all remaining pages up to the next section boundary */
cur_nr_pages = min(end_pfn - pfn,
SECTION_ALIGN_UP(pfn + 1) - pfn);
- err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
+ err = sparse_add_section(nid, pfn, cur_nr_pages, altmap,
+ params->pgmap);
if (err)
break;
cond_resched();
@@ -1288,9 +1306,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
* altmap as an alternative source of memory, and we do not exactly
* populate a single PMD.
*/
- return memmap_on_memory &&
- !hugetlb_free_vmemmap_enabled() &&
- IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
+ return mhp_memmap_on_memory() &&
size == memory_block_size_bytes() &&
IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
@@ -1836,7 +1852,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- MEMORY_OFFLINE | REPORT_FAILURE);
+ MEMORY_OFFLINE | REPORT_FAILURE,
+ GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL);
if (ret) {
reason = "failure to isolate range";
goto failed_removal_pcplists_disabled;
@@ -2074,7 +2091,7 @@ static int __ref try_remove_memory(u64 start, u64 size)
* We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
* the same granularity it was added - a single memory block.
*/
- if (memmap_on_memory) {
+ if (mhp_memmap_on_memory()) {
nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
get_nr_vmemmap_pages_cb);
if (nr_vmemmap_pages) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8c74107a2b15..d39b01fd52fe 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -104,6 +104,7 @@
#include <linux/swapops.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#include <linux/uaccess.h>
#include "internal.h"
@@ -350,7 +351,7 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
*/
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
- if (!pol)
+ if (!pol || pol->mode == MPOL_LOCAL)
return;
if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
@@ -440,12 +441,11 @@ static inline bool queue_pages_required(struct page *page,
}
/*
- * queue_pages_pmd() has four possible return values:
+ * queue_pages_pmd() has three possible return values:
* 0 - pages are placed on the right node or queued successfully, or
* special page is met, i.e. huge zero page.
* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
- * 2 - THP was split.
* -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
* existing page was already on a node that does not follow the
* policy.
@@ -507,18 +507,13 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int ret;
bool has_unmovable = false;
pte_t *pte, *mapped_pte;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
- if (ret != 2)
- return ret;
- }
- /* THP was split, fall through to pte walk */
+ if (ptl)
+ return queue_pages_pmd(pmd, ptl, addr, end, walk);
if (pmd_trans_unstable(pmd))
return 0;
@@ -636,12 +631,18 @@ unlock:
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
+ struct mmu_gather tlb;
int nr_updated;
- nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+
+ nr_updated = change_protection(&tlb, vma, addr, end, PAGE_NONE,
+ MM_CP_PROT_NUMA);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+ tlb_finish_mmu(&tlb);
+
return nr_updated;
}
#else
@@ -2135,44 +2136,55 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
}
/**
- * alloc_pages_vma - Allocate a page for a VMA.
+ * vma_alloc_folio - Allocate a folio for a VMA.
* @gfp: GFP flags.
- * @order: Order of the GFP allocation.
+ * @order: Order of the folio.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual address of the allocation. Must be inside @vma.
* @hugepage: For hugepages try only the preferred node if possible.
*
- * Allocate a page for a specific address in @vma, using the appropriate
+ * Allocate a folio for a specific address in @vma, using the appropriate
* NUMA policy. When @vma is not NULL the caller must hold the mmap_lock
* of the mm_struct of the VMA to prevent it from going away. Should be
- * used for all allocations for pages that will be mapped into user space.
+ * used for all allocations for folios that will be mapped into user space.
*
- * Return: The page on success or NULL if allocation fails.
+ * Return: The folio on success or NULL if allocation fails.
*/
-struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, bool hugepage)
{
struct mempolicy *pol;
int node = numa_node_id();
- struct page *page;
+ struct folio *folio;
int preferred_nid;
nodemask_t *nmask;
pol = get_vma_policy(vma, addr);
if (pol->mode == MPOL_INTERLEAVE) {
+ struct page *page;
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
+ gfp |= __GFP_COMP;
page = alloc_page_interleave(gfp, order, nid);
+ if (page && order > 1)
+ prep_transhuge_page(page);
+ folio = (struct folio *)page;
goto out;
}
if (pol->mode == MPOL_PREFERRED_MANY) {
+ struct page *page;
+
node = policy_node(gfp, pol, node);
+ gfp |= __GFP_COMP;
page = alloc_pages_preferred_many(gfp, order, node, pol);
mpol_cond_put(pol);
+ if (page && order > 1)
+ prep_transhuge_page(page);
+ folio = (struct folio *)page;
goto out;
}
@@ -2199,8 +2211,8 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
- page = __alloc_pages_node(hpage_node,
- gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+ folio = __folio_alloc_node(gfp | __GFP_THISNODE |
+ __GFP_NORETRY, order, hpage_node);
/*
* If hugepage allocations are configured to always
@@ -2208,8 +2220,9 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* to prefer hugepage backing, retry allowing remote
* memory with both reclaim and compact as well.
*/
- if (!page && (gfp & __GFP_DIRECT_RECLAIM))
- page = __alloc_pages(gfp, order, hpage_node, nmask);
+ if (!folio && (gfp & __GFP_DIRECT_RECLAIM))
+ folio = __folio_alloc(gfp, order, hpage_node,
+ nmask);
goto out;
}
@@ -2217,25 +2230,12 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
- page = __alloc_pages(gfp, order, preferred_nid, nmask);
+ folio = __folio_alloc(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
- return page;
-}
-EXPORT_SYMBOL(alloc_pages_vma);
-
-struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, bool hugepage)
-{
- struct folio *folio;
-
- folio = (struct folio *)alloc_pages_vma(gfp, order, vma, addr,
- hugepage);
- if (folio && order > 1)
- prep_transhuge_page(&folio->page);
-
return folio;
}
+EXPORT_SYMBOL(vma_alloc_folio);
/**
* alloc_pages - Allocate pages.
diff --git a/mm/memremap.c b/mm/memremap.c
index af0223605e69..2b92e97cb25b 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -287,6 +287,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
{
struct mhp_params params = {
.altmap = pgmap_altmap(pgmap),
+ .pgmap = pgmap,
.pgprot = PAGE_KERNEL,
};
const int nr_range = pgmap->nr_range;
@@ -459,6 +460,15 @@ void free_zone_device_page(struct page *page)
mem_cgroup_uncharge(page_folio(page));
/*
+ * Note: we don't expect anonymous compound pages yet. Once supported
+ * and we could PTE-map them similar to THP, we'd have to clear
+ * PG_anon_exclusive on all tail pages.
+ */
+ VM_BUG_ON_PAGE(PageAnon(page) && PageCompound(page), page);
+ if (PageAnon(page))
+ __ClearPageAnonExclusive(page);
+
+ /*
* When a device managed page is freed, the page->mapping field
* may still contain a (stale) mapping value. For example, the
* lower bits of page->mapping may still identify the page as an
diff --git a/mm/migrate.c b/mm/migrate.c
index 21d82636c291..e51588e95f57 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -177,6 +177,7 @@ static bool remove_migration_pte(struct folio *folio,
DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) {
+ rmap_t rmap_flags = RMAP_NONE;
pte_t pte;
swp_entry_t entry;
struct page *new;
@@ -211,6 +212,9 @@ static bool remove_migration_pte(struct folio *folio,
else if (pte_swp_uffd_wp(*pvmw.pte))
pte = pte_mkuffd_wp(pte);
+ if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
if (unlikely(is_device_private_page(new))) {
if (pte_write(pte))
entry = make_writable_device_private_entry(
@@ -232,15 +236,17 @@ static bool remove_migration_pte(struct folio *folio,
pte = pte_mkhuge(pte);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
if (folio_test_anon(folio))
- hugepage_add_anon_rmap(new, vma, pvmw.address);
+ hugepage_add_anon_rmap(new, vma, pvmw.address,
+ rmap_flags);
else
- page_dup_rmap(new, true);
+ page_dup_file_rmap(new, true);
set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
} else
#endif
{
if (folio_test_anon(folio))
- page_add_anon_rmap(new, vma, pvmw.address, false);
+ page_add_anon_rmap(new, vma, pvmw.address,
+ rmap_flags);
else
page_add_file_rmap(new, vma, false);
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
@@ -471,11 +477,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
xas_lock_irq(&xas);
expected_count = 2 + page_has_private(page);
- if (page_count(page) != expected_count || xas_load(&xas) != page) {
- xas_unlock_irq(&xas);
- return -EAGAIN;
- }
-
if (!page_ref_freeze(page, expected_count)) {
xas_unlock_irq(&xas);
return -EAGAIN;
@@ -517,6 +518,12 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_set_workingset(newfolio);
if (folio_test_checked(folio))
folio_set_checked(newfolio);
+ /*
+ * PG_anon_exclusive (-> PG_mappedtodisk) is always migrated via
+ * migration entries. We can still have PG_anon_exclusive set on an
+ * effectively unmapped and unreferenced first sub-pages of an
+ * anonymous THP: we can simply copy it here via PG_mappedtodisk.
+ */
if (folio_test_mappedtodisk(folio))
folio_set_mappedtodisk(newfolio);
@@ -836,21 +843,21 @@ static int fallback_migrate_page(struct address_space *mapping,
* < 0 - error code
* MIGRATEPAGE_SUCCESS - success
*/
-static int move_to_new_page(struct page *newpage, struct page *page,
+static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
struct address_space *mapping;
int rc = -EAGAIN;
- bool is_lru = !__PageMovable(page);
+ bool is_lru = !__PageMovable(&src->page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+ VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
+ VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
- mapping = page_mapping(page);
+ mapping = folio_mapping(src);
if (likely(is_lru)) {
if (!mapping)
- rc = migrate_page(mapping, newpage, page, mode);
+ rc = migrate_page(mapping, &dst->page, &src->page, mode);
else if (mapping->a_ops->migratepage)
/*
* Most pages have a mapping and most filesystems
@@ -859,54 +866,54 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* migratepage callback. This is the most common path
* for page migration.
*/
- rc = mapping->a_ops->migratepage(mapping, newpage,
- page, mode);
+ rc = mapping->a_ops->migratepage(mapping, &dst->page,
+ &src->page, mode);
else
- rc = fallback_migrate_page(mapping, newpage,
- page, mode);
+ rc = fallback_migrate_page(mapping, &dst->page,
+ &src->page, mode);
} else {
/*
* In case of non-lru page, it could be released after
* isolation step. In that case, we shouldn't try migration.
*/
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
- if (!PageMovable(page)) {
+ VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
+ if (!folio_test_movable(src)) {
rc = MIGRATEPAGE_SUCCESS;
- ClearPageIsolated(page);
+ folio_clear_isolated(src);
goto out;
}
- rc = mapping->a_ops->migratepage(mapping, newpage,
- page, mode);
+ rc = mapping->a_ops->migratepage(mapping, &dst->page,
+ &src->page, mode);
WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
- !PageIsolated(page));
+ !folio_test_isolated(src));
}
/*
- * When successful, old pagecache page->mapping must be cleared before
- * page is freed; but stats require that PageAnon be left as PageAnon.
+ * When successful, old pagecache src->mapping must be cleared before
+ * src is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (__PageMovable(page)) {
- VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ if (__PageMovable(&src->page)) {
+ VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
/*
* We clear PG_movable under page_lock so any compactor
* cannot try to migrate this page.
*/
- ClearPageIsolated(page);
+ folio_clear_isolated(src);
}
/*
- * Anonymous and movable page->mapping will be cleared by
+ * Anonymous and movable src->mapping will be cleared by
* free_pages_prepare so don't reset it here for keeping
* the type to work PageAnon, for example.
*/
- if (!PageMappingFlags(page))
- page->mapping = NULL;
+ if (!folio_mapping_flags(src))
+ src->mapping = NULL;
- if (likely(!is_zone_device_page(newpage)))
- flush_dcache_folio(page_folio(newpage));
+ if (likely(!folio_is_zone_device(dst)))
+ flush_dcache_folio(dst);
}
out:
return rc;
@@ -994,7 +1001,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
goto out_unlock;
if (unlikely(!is_lru)) {
- rc = move_to_new_page(newpage, page, mode);
+ rc = move_to_new_folio(dst, folio, mode);
goto out_unlock_both;
}
@@ -1025,7 +1032,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, mode);
+ rc = move_to_new_folio(dst, folio, mode);
/*
* When successful, push newpage to LRU immediately: so that if it
@@ -1230,7 +1237,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
goto put_anon;
if (page_mapped(hpage)) {
- bool mapping_locked = false;
enum ttu_flags ttu = 0;
if (!PageAnon(hpage)) {
@@ -1244,19 +1250,18 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (unlikely(!mapping))
goto unlock_put_anon;
- mapping_locked = true;
- ttu |= TTU_RMAP_LOCKED;
+ ttu = TTU_RMAP_LOCKED;
}
try_to_migrate(src, ttu);
page_was_mapped = 1;
- if (mapping_locked)
+ if (ttu & TTU_RMAP_LOCKED)
i_mmap_unlock_write(mapping);
}
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, mode);
+ rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
remove_migration_ptes(src,
@@ -1412,14 +1417,11 @@ retry:
nr_thp_split++;
goto retry;
}
-
- nr_failed_pages += nr_subpages;
- break;
- }
-
/* Hugetlb migration is unsupported */
- if (!no_subpage_counting)
+ } else if (!no_subpage_counting) {
nr_failed++;
+ }
+
nr_failed_pages += nr_subpages;
break;
case -ENOMEM:
@@ -1434,28 +1436,30 @@ retry:
nr_thp_split++;
goto retry;
}
-
- nr_failed_pages += nr_subpages;
- goto out;
+ } else if (!no_subpage_counting) {
+ nr_failed++;
}
- if (!no_subpage_counting)
- nr_failed++;
nr_failed_pages += nr_subpages;
+ /*
+ * There might be some subpages of fail-to-migrate THPs
+ * left in thp_split_pages list. Move them back to migration
+ * list so that they could be put back to the right list by
+ * the caller otherwise the page refcnt will be leaked.
+ */
+ list_splice_init(&thp_split_pages, from);
+ nr_thp_failed += thp_retry;
goto out;
case -EAGAIN:
- if (is_thp) {
+ if (is_thp)
thp_retry++;
- break;
- }
- retry++;
+ else
+ retry++;
break;
case MIGRATEPAGE_SUCCESS:
nr_succeeded += nr_subpages;
- if (is_thp) {
+ if (is_thp)
nr_thp_succeeded++;
- break;
- }
break;
default:
/*
@@ -1464,14 +1468,11 @@ retry:
* removed from migration page list and not
* retried in the next outer loop.
*/
- if (is_thp) {
+ if (is_thp)
nr_thp_failed++;
- nr_failed_pages += nr_subpages;
- break;
- }
-
- if (!no_subpage_counting)
+ else if (!no_subpage_counting)
nr_failed++;
+
nr_failed_pages += nr_subpages;
break;
}
@@ -1606,8 +1607,8 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
mmap_read_lock(mm);
err = -EFAULT;
- vma = find_vma(mm, addr);
- if (!vma || addr < vma->vm_start || !vma_migratable(vma))
+ vma = vma_lookup(mm, addr);
+ if (!vma || !vma_migratable(vma))
goto out;
/* FOLL_DUMP to ignore special (like zero) pages */
@@ -1802,13 +1803,18 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
goto set_status;
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_DUMP);
+ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
err = PTR_ERR(page);
if (IS_ERR(page))
goto set_status;
- err = page ? page_to_nid(page) : -ENOENT;
+ if (page) {
+ err = page_to_nid(page);
+ put_page(page);
+ } else {
+ err = -ENOENT;
+ }
set_status:
*status = err;
@@ -1844,16 +1850,12 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
const void __user * __user *pages,
int __user *status)
{
-#define DO_PAGES_STAT_CHUNK_NR 16
+#define DO_PAGES_STAT_CHUNK_NR 16UL
const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
int chunk_status[DO_PAGES_STAT_CHUNK_NR];
while (nr_pages) {
- unsigned long chunk_nr;
-
- chunk_nr = nr_pages;
- if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
- chunk_nr = DO_PAGES_STAT_CHUNK_NR;
+ unsigned long chunk_nr = min(nr_pages, DO_PAGES_STAT_CHUNK_NR);
if (in_compat_syscall()) {
if (get_compat_pages_array(chunk_pages, pages,
@@ -1969,7 +1971,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
#ifdef CONFIG_NUMA_BALANCING
/*
* Returns true if this is a safe migration target node for misplaced NUMA
- * pages. Currently it only checks the watermarks which crude
+ * pages. Currently it only checks the watermarks which is crude.
*/
static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
unsigned long nr_migrate_pages)
@@ -1979,7 +1981,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
/* Avoid waking kswapd by allocating pages_to_migrate pages. */
@@ -2015,7 +2017,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
- int page_lru;
int nr_pages = thp_nr_pages(page);
int order = compound_order(page);
@@ -2032,7 +2033,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING))
return 0;
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
- if (populated_zone(pgdat->node_zones + z))
+ if (managed_zone(pgdat->node_zones + z))
break;
}
wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
@@ -2042,8 +2043,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
if (isolate_lru_page(page))
return 0;
- page_lru = page_is_file_lru(page);
- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
nr_pages);
/*
@@ -2116,7 +2116,6 @@ out:
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_NUMA */
/*
* node_demotion[] example:
@@ -2250,7 +2249,6 @@ out:
return target;
}
-#if defined(CONFIG_HOTPLUG_CPU)
/* Disable reclaim-based migration. */
static void __disable_all_migrate_targets(void)
{
@@ -2353,8 +2351,8 @@ out_clear:
*/
static void __set_migration_target_nodes(void)
{
- nodemask_t next_pass = NODE_MASK_NONE;
- nodemask_t this_pass = NODE_MASK_NONE;
+ nodemask_t next_pass;
+ nodemask_t this_pass;
nodemask_t used_targets = NODE_MASK_NONE;
int node, best_distance;
@@ -2443,6 +2441,7 @@ void set_migration_target_nodes(void)
* __set_migration_target_nodes() can be used as opposed to
* set_migration_target_nodes().
*/
+#ifdef CONFIG_MEMORY_HOTPLUG
static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
unsigned long action, void *_arg)
{
@@ -2488,15 +2487,17 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
return notifier_from_errno(0);
}
+#endif
void __init migrate_on_reclaim_init(void)
{
- node_demotion = kmalloc_array(nr_node_ids,
- sizeof(struct demotion_nodes),
- GFP_KERNEL);
+ node_demotion = kcalloc(nr_node_ids,
+ sizeof(struct demotion_nodes),
+ GFP_KERNEL);
WARN_ON(!node_demotion);
-
+#ifdef CONFIG_MEMORY_HOTPLUG
hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+#endif
/*
* At this point, all numa nodes with memory/CPus have their state
* properly set, so we can build the demotion order now.
@@ -2507,7 +2508,6 @@ void __init migrate_on_reclaim_init(void)
set_migration_target_nodes();
cpus_read_unlock();
}
-#endif /* CONFIG_HOTPLUG_CPU */
bool numa_demotion_enabled = false;
@@ -2523,12 +2523,11 @@ static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
- numa_demotion_enabled = true;
- else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
- numa_demotion_enabled = false;
- else
- return -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &numa_demotion_enabled);
+ if (ret)
+ return ret;
return count;
}
@@ -2568,4 +2567,5 @@ delete_obj:
return err;
}
subsys_initcall(numa_init_sysfs);
-#endif
+#endif /* CONFIG_SYSFS */
+#endif /* CONFIG_NUMA */
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 70c7dc05bbfc..5052093d0262 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -184,15 +184,34 @@ again:
* set up a special migration page table entry now.
*/
if (trylock_page(page)) {
+ bool anon_exclusive;
pte_t swp_pte;
+ anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+ if (anon_exclusive) {
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
+ ptep_clear_flush(vma, addr, ptep);
+
+ if (page_try_share_anon_rmap(page)) {
+ set_pte_at(mm, addr, ptep, pte);
+ unlock_page(page);
+ put_page(page);
+ mpfn = 0;
+ goto next;
+ }
+ } else {
+ ptep_get_and_clear(mm, addr, ptep);
+ }
+
migrate->cpages++;
- ptep_get_and_clear(mm, addr, ptep);
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
page_to_pfn(page));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page));
else
entry = make_readable_migration_entry(
page_to_pfn(page));
@@ -610,7 +629,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
goto unlock_abort;
inc_mm_counter(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, addr, false);
+ page_add_new_anon_rmap(page, vma, addr);
if (!is_zone_device_page(page))
lru_cache_add_inactive_or_unevictable(page, vma);
get_page(page);
diff --git a/mm/mincore.c b/mm/mincore.c
index 9122676b54d6..fa200c14185f 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -20,6 +20,7 @@
#include <linux/pgtable.h>
#include <linux/uaccess.h>
+#include "swap.h"
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -121,7 +122,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
for (; addr != end; ptep++, addr += PAGE_SIZE) {
pte_t pte = *ptep;
- if (pte_none(pte))
+ /* We need to do cache lookup too for pte markers */
+ if (pte_none_mostly(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE,
vma, vec);
else if (pte_present(pte))
diff --git a/mm/mmap.c b/mm/mmap.c
index 313b57d55a63..2b9305ed0dda 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -102,26 +102,31 @@ static void unmap_region(struct mm_struct *mm,
* x: (yes) yes
*/
pgprot_t protection_map[16] __ro_after_init = {
- __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
- __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
+ [VM_NONE] = __P000,
+ [VM_READ] = __P001,
+ [VM_WRITE] = __P010,
+ [VM_WRITE | VM_READ] = __P011,
+ [VM_EXEC] = __P100,
+ [VM_EXEC | VM_READ] = __P101,
+ [VM_EXEC | VM_WRITE] = __P110,
+ [VM_EXEC | VM_WRITE | VM_READ] = __P111,
+ [VM_SHARED] = __S000,
+ [VM_SHARED | VM_READ] = __S001,
+ [VM_SHARED | VM_WRITE] = __S010,
+ [VM_SHARED | VM_WRITE | VM_READ] = __S011,
+ [VM_SHARED | VM_EXEC] = __S100,
+ [VM_SHARED | VM_EXEC | VM_READ] = __S101,
+ [VM_SHARED | VM_EXEC | VM_WRITE] = __S110,
+ [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111
};
-#ifndef CONFIG_ARCH_HAS_FILTER_PGPROT
-static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
-{
- return prot;
-}
-#endif
-
+#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
- pgprot_t ret = __pgprot(pgprot_val(protection_map[vm_flags &
- (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]) |
- pgprot_val(arch_vm_get_page_prot(vm_flags)));
-
- return arch_filter_pgprot(ret);
+ return protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
}
EXPORT_SYMBOL(vm_get_page_prot);
+#endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
@@ -1218,7 +1223,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
end, prev->vm_pgoff, NULL, prev);
if (err)
return NULL;
- khugepaged_enter_vma_merge(prev, vm_flags);
+ khugepaged_enter_vma(prev, vm_flags);
return prev;
}
@@ -1245,7 +1250,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
}
if (err)
return NULL;
- khugepaged_enter_vma_merge(area, vm_flags);
+ khugepaged_enter_vma(area, vm_flags);
return area;
}
@@ -1280,7 +1285,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
* the same as 'old', the other will be the new one that is trying
* to share the anon_vma.
*
- * NOTE! This runs with mm_sem held for reading, so it is possible that
+ * NOTE! This runs with mmap_lock held for reading, so it is possible that
* the anon_vma of 'old' is concurrently in the process of being set up
* by another page fault trying to merge _that_. But that's ok: if it
* is being set up, that automatically means that it will be a singleton
@@ -1294,7 +1299,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
*
* We also make sure that the two vma's are compatible (adjacent,
* and with the same memory policies). That's all stable, even with just
- * a read lock on the mm_sem.
+ * a read lock on the mmap_lock.
*/
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
@@ -1842,6 +1847,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
vma_link(mm, vma, prev, rb_link, rb_parent);
+
+ /*
+ * vma_merge() calls khugepaged_enter_vma() either, the below
+ * call covers the non-merge case.
+ */
+ khugepaged_enter_vma(vma, vma->vm_flags);
+
/* Once vma denies write, undo our temporary denial count */
unmap_writable:
if (file && vm_flags & VM_SHARED)
@@ -2340,15 +2352,8 @@ static int acct_stack_growth(struct vm_area_struct *vma,
return -ENOMEM;
/* mlock limit tests */
- if (vma->vm_flags & VM_LOCKED) {
- unsigned long locked;
- unsigned long limit;
- locked = mm->locked_vm + grow;
- limit = rlimit(RLIMIT_MEMLOCK);
- limit >>= PAGE_SHIFT;
- if (locked > limit && !capable(CAP_IPC_LOCK))
- return -ENOMEM;
- }
+ if (mlock_future_check(mm, vma->vm_flags, grow << PAGE_SHIFT))
+ return -ENOMEM;
/* Check to ensure the stack will not grow into a hugetlb-only region */
new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start :
@@ -2452,7 +2457,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma_merge(vma, vma->vm_flags);
+ khugepaged_enter_vma(vma, vma->vm_flags);
validate_mm(mm);
return error;
}
@@ -2530,7 +2535,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
}
anon_vma_unlock_write(vma->anon_vma);
- khugepaged_enter_vma_merge(vma, vma->vm_flags);
+ khugepaged_enter_vma(vma, vma->vm_flags);
validate_mm(mm);
return error;
}
@@ -3553,7 +3558,7 @@ int mm_take_all_locks(struct mm_struct *mm)
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- BUG_ON(mmap_read_trylock(mm));
+ mmap_assert_write_locked(mm);
mutex_lock(&mm_all_locks_mutex);
@@ -3633,7 +3638,7 @@ void mm_drop_all_locks(struct mm_struct *mm)
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- BUG_ON(mmap_read_trylock(mm));
+ mmap_assert_write_locked(mm);
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
for (vma = mm->mmap; vma; vma = vma->vm_next) {
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index afb7185ffdc4..a71924bd38c0 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -47,8 +47,20 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
struct mmu_gather_batch *batch;
for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- free_pages_and_swap_cache(batch->pages, batch->nr);
- batch->nr = 0;
+ struct page **pages = batch->pages;
+
+ do {
+ /*
+ * limit free batch count when PAGE_SIZE > 4K
+ */
+ unsigned int nr = min(512U, batch->nr);
+
+ free_pages_and_swap_cache(pages, nr);
+ pages += nr;
+ batch->nr -= nr;
+
+ cond_resched();
+ } while (batch->nr);
}
tlb->active = &tlb->local;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b69ce7a7b2b7..ba5592655ee3 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -30,15 +30,17 @@
#include <linux/mm_inline.h>
#include <linux/pgtable.h>
#include <linux/sched/sysctl.h>
+#include <linux/userfaultfd_k.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#include "internal.h"
-static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
+static unsigned long change_pte_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pte_t *pte, oldpte;
spinlock_t *ptl;
@@ -49,6 +51,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ tlb_change_page_size(tlb, PAGE_SIZE);
+
/*
* Can be called with only the mmap_lock for reading by
* prot_numa so we must check the pmd isn't constantly
@@ -149,9 +153,12 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
ptent = pte_mkwrite(ptent);
}
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
+ if (pte_needs_flush(oldpte, ptent))
+ tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
pages++;
} else if (is_swap_pte(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
+ struct page *page = pfn_swap_entry_to_page(entry);
pte_t newpte;
if (is_writable_migration_entry(entry)) {
@@ -159,8 +166,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* A protection check is difficult so
* just be safe and disable write
*/
- entry = make_readable_migration_entry(
- swp_offset(entry));
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(
+ swp_offset(entry));
+ else
+ entry = make_readable_migration_entry(swp_offset(entry));
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
@@ -184,6 +194,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
newpte = pte_swp_mksoft_dirty(newpte);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /*
+ * If this is uffd-wp pte marker and we'd like
+ * to unprotect it, drop it; the next page
+ * fault will trigger without uffd trapping.
+ */
+ if (uffd_wp_resolve) {
+ pte_clear(vma->vm_mm, addr, pte);
+ pages++;
+ }
+ continue;
} else {
newpte = oldpte;
}
@@ -197,6 +218,20 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
+ } else {
+ /* It must be an none page, or what else?.. */
+ WARN_ON_ONCE(!pte_none(oldpte));
+ if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
+ /*
+ * For file-backed mem, we need to be able to
+ * wr-protect a none pte, because even if the
+ * pte is none, the page/swap cache could
+ * exist. Doing that by install a marker.
+ */
+ set_pte_at(vma->vm_mm, addr, pte,
+ make_pte_marker(PTE_MARKER_UFFD_WP));
+ pages++;
+ }
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -230,9 +265,42 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
return 0;
}
-static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
- pud_t *pud, unsigned long addr, unsigned long end,
- pgprot_t newprot, unsigned long cp_flags)
+/* Return true if we're uffd wr-protecting file-backed memory, or false */
+static inline bool
+uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
+{
+ return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
+}
+
+/*
+ * If wr-protecting the range for file-backed, populate pgtable for the case
+ * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc()
+ * failed it means no memory, we don't have a better option but stop.
+ */
+#define change_pmd_prepare(vma, pmd, cp_flags) \
+ do { \
+ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
+ if (WARN_ON_ONCE(pte_alloc(vma->vm_mm, pmd))) \
+ break; \
+ } \
+ } while (0)
+/*
+ * This is the general pud/p4d/pgd version of change_pmd_prepare(). We need to
+ * have separate change_pmd_prepare() because pte_alloc() returns 0 on success,
+ * while {pmd|pud|p4d}_alloc() returns the valid pointer on success.
+ */
+#define change_prepare(vma, high, low, addr, cp_flags) \
+ do { \
+ if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
+ low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
+ if (WARN_ON_ONCE(p == NULL)) \
+ break; \
+ } \
+ } while (0)
+
+static inline unsigned long change_pmd_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pud_t *pud, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pmd_t *pmd;
unsigned long next;
@@ -248,6 +316,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
next = pmd_addr_end(addr, end);
+ change_pmd_prepare(vma, pmd, cp_flags);
/*
* Automatic NUMA balancing walks the tables with mmap_lock
* held for read. It's possible a parallel update to occur
@@ -269,11 +338,22 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
- if (next - addr != HPAGE_PMD_SIZE) {
+ if ((next - addr != HPAGE_PMD_SIZE) ||
+ uffd_wp_protect_file(vma, cp_flags)) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
+ /*
+ * For file-backed, the pmd could have been
+ * cleared; make sure pmd populated if
+ * necessary, then fall-through to pte level.
+ */
+ change_pmd_prepare(vma, pmd, cp_flags);
} else {
- int nr_ptes = change_huge_pmd(vma, pmd, addr,
- newprot, cp_flags);
+ /*
+ * change_huge_pmd() does not defer TLB flushes,
+ * so no need to propagate the tlb argument.
+ */
+ int nr_ptes = change_huge_pmd(tlb, vma, pmd,
+ addr, newprot, cp_flags);
if (nr_ptes) {
if (nr_ptes == HPAGE_PMD_NR) {
@@ -287,8 +367,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
/* fall through, the trans huge pmd just split */
}
- this_pages = change_pte_range(vma, pmd, addr, next, newprot,
- cp_flags);
+ this_pages = change_pte_range(tlb, vma, pmd, addr, next,
+ newprot, cp_flags);
pages += this_pages;
next:
cond_resched();
@@ -302,9 +382,9 @@ next:
return pages;
}
-static inline unsigned long change_pud_range(struct vm_area_struct *vma,
- p4d_t *p4d, unsigned long addr, unsigned long end,
- pgprot_t newprot, unsigned long cp_flags)
+static inline unsigned long change_pud_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
pud_t *pud;
unsigned long next;
@@ -313,18 +393,19 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma,
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
+ change_prepare(vma, pud, pmd, addr, cp_flags);
if (pud_none_or_clear_bad(pud))
continue;
- pages += change_pmd_range(vma, pud, addr, next, newprot,
+ pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
cp_flags);
} while (pud++, addr = next, addr != end);
return pages;
}
-static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
- pgd_t *pgd, unsigned long addr, unsigned long end,
- pgprot_t newprot, unsigned long cp_flags)
+static inline unsigned long change_p4d_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
p4d_t *p4d;
unsigned long next;
@@ -333,46 +414,44 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma,
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
+ change_prepare(vma, p4d, pud, addr, cp_flags);
if (p4d_none_or_clear_bad(p4d))
continue;
- pages += change_pud_range(vma, p4d, addr, next, newprot,
+ pages += change_pud_range(tlb, vma, p4d, addr, next, newprot,
cp_flags);
} while (p4d++, addr = next, addr != end);
return pages;
}
-static unsigned long change_protection_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end, pgprot_t newprot,
- unsigned long cp_flags)
+static unsigned long change_protection_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long addr,
+ unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
- unsigned long start = addr;
unsigned long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
- flush_cache_range(vma, addr, end);
- inc_tlb_flush_pending(mm);
+ tlb_start_vma(tlb, vma);
do {
next = pgd_addr_end(addr, end);
+ change_prepare(vma, pgd, p4d, addr, cp_flags);
if (pgd_none_or_clear_bad(pgd))
continue;
- pages += change_p4d_range(vma, pgd, addr, next, newprot,
+ pages += change_p4d_range(tlb, vma, pgd, addr, next, newprot,
cp_flags);
} while (pgd++, addr = next, addr != end);
- /* Only flush the TLB if we actually modified any entries: */
- if (pages)
- flush_tlb_range(vma, start, end);
- dec_tlb_flush_pending(mm);
+ tlb_end_vma(tlb, vma);
return pages;
}
-unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+unsigned long change_protection(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
unsigned long cp_flags)
{
@@ -381,9 +460,10 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
if (is_vm_hugetlb_page(vma))
- pages = hugetlb_change_protection(vma, start, end, newprot);
+ pages = hugetlb_change_protection(vma, start, end, newprot,
+ cp_flags);
else
- pages = change_protection_range(vma, start, end, newprot,
+ pages = change_protection_range(tlb, vma, start, end, newprot,
cp_flags);
return pages;
@@ -417,8 +497,9 @@ static const struct mm_walk_ops prot_none_walk_ops = {
};
int
-mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
- unsigned long start, unsigned long end, unsigned long newflags)
+mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ struct vm_area_struct **pprev, unsigned long start,
+ unsigned long end, unsigned long newflags)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long oldflags = vma->vm_flags;
@@ -505,7 +586,7 @@ success:
dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
vma_set_page_prot(vma);
- change_protection(vma, start, end, vma->vm_page_prot,
+ change_protection(tlb, vma, start, end, vma->vm_page_prot,
dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
/*
@@ -539,6 +620,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
+ struct mmu_gather tlb;
start = untagged_addr(start);
@@ -598,6 +680,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
else
prev = vma->vm_prev;
+ tlb_gather_mmu(&tlb, current->mm);
for (nstart = start ; ; ) {
unsigned long mask_off_old_flags;
unsigned long newflags;
@@ -624,18 +707,18 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
/* newflags >> 4 shift VM_MAY% in place of VM_% */
if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) {
error = -EACCES;
- goto out;
+ break;
}
/* Allow architectures to sanity-check the new flags */
if (!arch_validate_flags(newflags)) {
error = -EINVAL;
- goto out;
+ break;
}
error = security_file_mprotect(vma, reqprot, prot);
if (error)
- goto out;
+ break;
tmp = vma->vm_end;
if (tmp > end)
@@ -644,27 +727,28 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (vma->vm_ops && vma->vm_ops->mprotect) {
error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
if (error)
- goto out;
+ break;
}
- error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
+ error = mprotect_fixup(&tlb, vma, &prev, nstart, tmp, newflags);
if (error)
- goto out;
+ break;
nstart = tmp;
if (nstart < prev->vm_end)
nstart = prev->vm_end;
if (nstart >= end)
- goto out;
+ break;
vma = prev->vm_next;
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
- goto out;
+ break;
}
prot = reqprot;
}
+ tlb_finish_mmu(&tlb);
out:
mmap_write_unlock(current->mm);
return error;
diff --git a/mm/mremap.c b/mm/mremap.c
index 0b93fac76851..b522cd0259a0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -490,12 +490,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
return 0;
old_end = old_addr + len;
- flush_cache_range(vma, old_addr, old_end);
if (is_vm_hugetlb_page(vma))
return move_hugetlb_page_tables(vma, new_vma, old_addr,
new_addr, len);
+ flush_cache_range(vma, old_addr, old_end);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
old_addr, old_end);
mmu_notifier_invalidate_range_start(&range);
@@ -766,14 +766,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
return ERR_PTR(-EFAULT);
- if (vma->vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = mm->locked_vm << PAGE_SHIFT;
- lock_limit = rlimit(RLIMIT_MEMLOCK);
- locked += new_len - old_len;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- return ERR_PTR(-EAGAIN);
- }
+ if (mlock_future_check(mm, vma->vm_flags, new_len - old_len))
+ return ERR_PTR(-EAGAIN);
if (!may_expand_vm(mm, vma->vm_flags,
(new_len - old_len) >> PAGE_SHIFT))
@@ -826,9 +820,9 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
goto out;
}
- if (old_len >= new_len) {
+ if (old_len > new_len) {
ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
- if (ret && old_len != new_len)
+ if (ret)
goto out;
old_len = new_len;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fa1117db4610..359dc1da3636 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -650,18 +650,25 @@ static unsigned int bdi_min_ratio;
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
+ unsigned int delta;
int ret = 0;
spin_lock_bh(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
- min_ratio -= bdi->min_ratio;
- if (bdi_min_ratio + min_ratio < 100) {
- bdi_min_ratio += min_ratio;
- bdi->min_ratio += min_ratio;
+ if (min_ratio < bdi->min_ratio) {
+ delta = bdi->min_ratio - min_ratio;
+ bdi_min_ratio -= delta;
+ bdi->min_ratio = min_ratio;
} else {
- ret = -EINVAL;
+ delta = min_ratio - bdi->min_ratio;
+ if (bdi_min_ratio + delta < 100) {
+ bdi_min_ratio += delta;
+ bdi->min_ratio = min_ratio;
+ } else {
+ ret = -EINVAL;
+ }
}
}
spin_unlock_bh(&bdi_lock);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0e42038382c1..bc93a82e51e6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -81,6 +81,7 @@
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
+#include "swap.h"
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
@@ -867,40 +868,6 @@ static inline void set_buddy_order(struct page *page, unsigned int order)
__SetPageBuddy(page);
}
-/*
- * This function checks whether a page is free && is the buddy
- * we can coalesce a page and its buddy if
- * (a) the buddy is not in a hole (check before calling!) &&
- * (b) the buddy is in the buddy system &&
- * (c) a page and its buddy have the same order &&
- * (d) a page and its buddy are in the same zone.
- *
- * For recording whether a page is in the buddy system, we set PageBuddy.
- * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
- *
- * For recording page's order, we use page_private(page).
- */
-static inline bool page_is_buddy(struct page *page, struct page *buddy,
- unsigned int order)
-{
- if (!page_is_guard(buddy) && !PageBuddy(buddy))
- return false;
-
- if (buddy_order(buddy) != order)
- return false;
-
- /*
- * zone check is done late to avoid uselessly calculating
- * zone/node ids for pages that could never merge.
- */
- if (page_zone_id(page) != page_zone_id(buddy))
- return false;
-
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
- return true;
-}
-
#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
@@ -1009,18 +976,17 @@ static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
struct page *page, unsigned int order)
{
- struct page *higher_page, *higher_buddy;
- unsigned long combined_pfn;
+ unsigned long higher_page_pfn;
+ struct page *higher_page;
if (order >= MAX_ORDER - 2)
return false;
- combined_pfn = buddy_pfn & pfn;
- higher_page = page + (combined_pfn - pfn);
- buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
- higher_buddy = higher_page + (buddy_pfn - combined_pfn);
+ higher_page_pfn = buddy_pfn & pfn;
+ higher_page = page + (higher_page_pfn - pfn);
- return page_is_buddy(higher_page, higher_buddy, order + 1);
+ return find_buddy_page_pfn(higher_page, higher_page_pfn, order + 1,
+ NULL) != NULL;
}
/*
@@ -1053,7 +1019,6 @@ static inline void __free_one_page(struct page *page,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
- unsigned int max_order = pageblock_order;
unsigned long buddy_pfn;
unsigned long combined_pfn;
struct page *buddy;
@@ -1069,18 +1034,32 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
-continue_merging:
- while (order < max_order) {
+ while (order < MAX_ORDER - 1) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
return;
}
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
- if (!page_is_buddy(page, buddy, order))
+ buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
+ if (!buddy)
goto done_merging;
+
+ if (unlikely(order >= pageblock_order)) {
+ /*
+ * We want to prevent merge between freepages on pageblock
+ * without fallbacks and normal pageblock. Without this,
+ * pageblock isolation could cause incorrect freepage or CMA
+ * accounting or HIGHATOMIC accounting.
+ */
+ int buddy_mt = get_pageblock_migratetype(buddy);
+
+ if (migratetype != buddy_mt
+ && (!migratetype_is_mergeable(migratetype) ||
+ !migratetype_is_mergeable(buddy_mt)))
+ goto done_merging;
+ }
+
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
@@ -1094,32 +1073,6 @@ continue_merging:
pfn = combined_pfn;
order++;
}
- if (order < MAX_ORDER - 1) {
- /* If we are here, it means order is >= pageblock_order.
- * We want to prevent merge between freepages on pageblock
- * without fallbacks and normal pageblock. Without this,
- * pageblock isolation could cause incorrect freepage or CMA
- * accounting or HIGHATOMIC accounting.
- *
- * We don't want to hit this code for the more frequent
- * low-order merging.
- */
- int buddy_mt;
-
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
-
- if (!page_is_buddy(page, buddy, order))
- goto done_merging;
- buddy_mt = get_pageblock_migratetype(buddy);
-
- if (migratetype != buddy_mt
- && (!migratetype_is_mergeable(migratetype) ||
- !migratetype_is_mergeable(buddy_mt)))
- goto done_merging;
- max_order = order + 1;
- goto continue_merging;
- }
done_merging:
set_buddy_order(page, order);
@@ -1141,6 +1094,48 @@ done_merging:
page_reporting_notify_free(order);
}
+/**
+ * split_free_page() -- split a free page at split_pfn_offset
+ * @free_page: the original free page
+ * @order: the order of the page
+ * @split_pfn_offset: split offset within the page
+ *
+ * It is used when the free page crosses two pageblocks with different migratetypes
+ * at split_pfn_offset within the page. The split free page will be put into
+ * separate migratetype lists afterwards. Otherwise, the function achieves
+ * nothing.
+ */
+void split_free_page(struct page *free_page,
+ int order, unsigned long split_pfn_offset)
+{
+ struct zone *zone = page_zone(free_page);
+ unsigned long free_page_pfn = page_to_pfn(free_page);
+ unsigned long pfn;
+ unsigned long flags;
+ int free_page_order;
+
+ if (split_pfn_offset == 0)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ del_page_from_free_list(free_page, zone, order);
+ for (pfn = free_page_pfn;
+ pfn < free_page_pfn + (1UL << order);) {
+ int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
+
+ free_page_order = min_t(int,
+ pfn ? __ffs(pfn) : order,
+ __fls(split_pfn_offset));
+ __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
+ mt, FPI_NONE);
+ pfn += 1UL << free_page_order;
+ split_pfn_offset -= (1UL << free_page_order);
+ /* we have done the first part, now switch to second part */
+ if (split_pfn_offset == 0)
+ split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
* try and check multiple fields with one check. The caller must do a detailed
@@ -2476,6 +2471,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
+ trace_mm_page_alloc_zone_locked(page, order, migratetype,
+ pcp_allowed_order(order) &&
+ migratetype < MIGRATE_PCPTYPES);
return page;
}
@@ -2999,7 +2997,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
zone_page_state(zone, NR_FREE_PAGES) / 2) {
page = __rmqueue_cma_fallback(zone, order);
if (page)
- goto out;
+ return page;
}
}
retry:
@@ -3012,9 +3010,6 @@ retry:
alloc_flags))
goto retry;
}
-out:
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
@@ -3733,11 +3728,8 @@ struct page *rmqueue(struct zone *preferred_zone,
* reserved for high-order atomic allocation, so order-0
* request should skip it.
*/
- if (order > 0 && alloc_flags & ALLOC_HARDER) {
+ if (order > 0 && alloc_flags & ALLOC_HARDER)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (page)
- trace_mm_page_alloc_zone_locked(page, order, migratetype);
- }
if (!page) {
page = __rmqueue(zone, order, migratetype, alloc_flags);
if (!page)
@@ -3799,6 +3791,9 @@ static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
+ if (gfp_mask & __GFP_NOWARN)
+ fail_page_alloc.attr.no_warn = true;
+
return should_fail(&fail_page_alloc.attr, 1 << order);
}
@@ -4068,7 +4063,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
{
struct zoneref *z;
struct zone *zone;
- struct pglist_data *last_pgdat_dirty_limit = NULL;
+ struct pglist_data *last_pgdat = NULL;
+ bool last_pgdat_dirty_ok = false;
bool no_fallback;
retry:
@@ -4107,13 +4103,13 @@ retry:
* dirty-throttling and the flusher threads.
*/
if (ac->spread_dirty_pages) {
- if (last_pgdat_dirty_limit == zone->zone_pgdat)
- continue;
+ if (last_pgdat != zone->zone_pgdat) {
+ last_pgdat = zone->zone_pgdat;
+ last_pgdat_dirty_ok = node_dirty_ok(zone->zone_pgdat);
+ }
- if (!node_dirty_ok(zone->zone_pgdat)) {
- last_pgdat_dirty_limit = zone->zone_pgdat;
+ if (!last_pgdat_dirty_ok)
continue;
- }
}
if (no_fallback && nr_online_nodes > 1 &&
@@ -4346,7 +4342,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*/
/* Exhausted what can be done so it's blame time */
- if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+ if (out_of_memory(&oc) ||
+ WARN_ON_ONCE_GFP(gfp_mask & __GFP_NOFAIL, gfp_mask)) {
*did_some_progress = 1;
/*
@@ -4677,9 +4674,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
- if (last_pgdat != zone->zone_pgdat)
+ if (!managed_zone(zone))
+ continue;
+ if (last_pgdat != zone->zone_pgdat) {
wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
- last_pgdat = zone->zone_pgdat;
+ last_pgdat = zone->zone_pgdat;
+ }
}
}
@@ -5117,7 +5117,7 @@ nopage:
* All existing users of the __GFP_NOFAIL are blockable, so warn
* of any new users that actually require GFP_NOWAIT
*/
- if (WARN_ON_ONCE(!can_direct_reclaim))
+ if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
goto fail;
/*
@@ -5125,7 +5125,7 @@ nopage:
* because we cannot reclaim anything and only can loop waiting
* for somebody to do a work for us
*/
- WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+ WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
/*
* non failing costly orders are a hard requirement which we
@@ -5133,7 +5133,7 @@ nopage:
* so that we can identify them and convert them to something
* else.
*/
- WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
+ WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask);
/*
* Help non-failing allocations by giving them access to memory
@@ -5379,10 +5379,8 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
* There are several places where we assume that the order value is sane
* so bail out early if the request is out of bound.
*/
- if (unlikely(order >= MAX_ORDER)) {
- WARN_ON_ONCE(!(gfp & __GFP_NOWARN));
+ if (WARN_ON_ONCE_GFP(order >= MAX_ORDER, gfp))
return NULL;
- }
gfp &= gfp_allowed_mask;
/*
@@ -6171,7 +6169,6 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
}
-#define MAX_NODE_LOAD (nr_online_nodes)
static int node_load[MAX_NUMNODES];
/**
@@ -6218,7 +6215,7 @@ int find_next_best_node(int node, nodemask_t *used_node_mask)
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
- val *= (MAX_NODE_LOAD*MAX_NUMNODES);
+ val *= MAX_NUMNODES;
val += node_load[n];
if (val < min_val) {
@@ -6284,13 +6281,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
- int node, load, nr_nodes = 0;
+ int node, nr_nodes = 0;
nodemask_t used_mask = NODE_MASK_NONE;
int local_node, prev_node;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
- load = nr_online_nodes;
prev_node = local_node;
memset(node_order, 0, sizeof(node_order));
@@ -6302,11 +6298,10 @@ static void build_zonelists(pg_data_t *pgdat)
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
- node_load[node] += load;
+ node_load[node] += 1;
node_order[nr_nodes++] = node;
prev_node = node;
- load--;
}
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
@@ -6645,6 +6640,21 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
}
}
+/*
+ * With compound page geometry and when struct pages are stored in ram most
+ * tail pages are reused. Consequently, the amount of unique struct pages to
+ * initialize is a lot smaller that the total amount of struct pages being
+ * mapped. This is a paired / mild layering violation with explicit knowledge
+ * of how the sparse_vmemmap internals handle compound pages in the lack
+ * of an altmap. See vmemmap_populate_compound_pages().
+ */
+static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap,
+ unsigned long nr_pages)
+{
+ return is_power_of_2(sizeof(struct page)) &&
+ !altmap ? 2 * (PAGE_SIZE / sizeof(struct page)) : nr_pages;
+}
+
static void __ref memmap_init_compound(struct page *head,
unsigned long head_pfn,
unsigned long zone_idx, int nid,
@@ -6709,7 +6719,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
continue;
memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
- pfns_per_compound);
+ compound_nr_pages(altmap, pfns_per_compound));
}
pr_info("%s initialised %lu pages in %ums\n", __func__,
@@ -7870,7 +7880,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
usable_startpfn = memblock_region_memory_base_pfn(r);
- if (usable_startpfn < 0x100000) {
+ if (usable_startpfn < PHYS_PFN(SZ_4G)) {
mem_below_4gb_not_mirrored = true;
continue;
}
@@ -8949,136 +8959,7 @@ void *__init alloc_large_system_hash(const char *tablename,
return table;
}
-/*
- * This function checks whether pageblock includes unmovable pages or not.
- *
- * PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
- * check without lock_page also may miss some movable non-lru pages at
- * race condition. So you can't expect this function should be exact.
- *
- * Returns a page without holding a reference. If the caller wants to
- * dereference that page (e.g., dumping), it has to make sure that it
- * cannot get removed (e.g., via memory unplug) concurrently.
- *
- */
-struct page *has_unmovable_pages(struct zone *zone, struct page *page,
- int migratetype, int flags)
-{
- unsigned long iter = 0;
- unsigned long pfn = page_to_pfn(page);
- unsigned long offset = pfn % pageblock_nr_pages;
-
- if (is_migrate_cma_page(page)) {
- /*
- * CMA allocations (alloc_contig_range) really need to mark
- * isolate CMA pageblocks even when they are not movable in fact
- * so consider them movable here.
- */
- if (is_migrate_cma(migratetype))
- return NULL;
-
- return page;
- }
-
- for (; iter < pageblock_nr_pages - offset; iter++) {
- page = pfn_to_page(pfn + iter);
-
- /*
- * Both, bootmem allocations and memory holes are marked
- * PG_reserved and are unmovable. We can even have unmovable
- * allocations inside ZONE_MOVABLE, for example when
- * specifying "movablecore".
- */
- if (PageReserved(page))
- return page;
-
- /*
- * If the zone is movable and we have ruled out all reserved
- * pages then it should be reasonably safe to assume the rest
- * is movable.
- */
- if (zone_idx(zone) == ZONE_MOVABLE)
- continue;
-
- /*
- * Hugepages are not in LRU lists, but they're movable.
- * THPs are on the LRU, but need to be counted as #small pages.
- * We need not scan over tail pages because we don't
- * handle each tail page individually in migration.
- */
- if (PageHuge(page) || PageTransCompound(page)) {
- struct page *head = compound_head(page);
- unsigned int skip_pages;
-
- if (PageHuge(page)) {
- if (!hugepage_migration_supported(page_hstate(head)))
- return page;
- } else if (!PageLRU(head) && !__PageMovable(head)) {
- return page;
- }
-
- skip_pages = compound_nr(head) - (page - head);
- iter += skip_pages - 1;
- continue;
- }
-
- /*
- * We can't use page_count without pin a page
- * because another CPU can free compound page.
- * This check already skips compound tails of THP
- * because their page->_refcount is zero at all time.
- */
- if (!page_ref_count(page)) {
- if (PageBuddy(page))
- iter += (1 << buddy_order(page)) - 1;
- continue;
- }
-
- /*
- * The HWPoisoned page may be not in buddy system, and
- * page_count() is not 0.
- */
- if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
- continue;
-
- /*
- * We treat all PageOffline() pages as movable when offlining
- * to give drivers a chance to decrement their reference count
- * in MEM_GOING_OFFLINE in order to indicate that these pages
- * can be offlined as there are no direct references anymore.
- * For actually unmovable PageOffline() where the driver does
- * not support this, we will fail later when trying to actually
- * move these pages that still have a reference count > 0.
- * (false negatives in this function only)
- */
- if ((flags & MEMORY_OFFLINE) && PageOffline(page))
- continue;
-
- if (__PageMovable(page) || PageLRU(page))
- continue;
-
- /*
- * If there are RECLAIMABLE pages, we need to check
- * it. But now, memory offline itself doesn't call
- * shrink_node_slabs() and it still to be fixed.
- */
- return page;
- }
- return NULL;
-}
-
#ifdef CONFIG_CONTIG_ALLOC
-static unsigned long pfn_max_align_down(unsigned long pfn)
-{
- return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
-}
-
-static unsigned long pfn_max_align_up(unsigned long pfn)
-{
- return ALIGN(pfn, MAX_ORDER_NR_PAGES);
-}
-
#if defined(CONFIG_DYNAMIC_DEBUG) || \
(defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* Usage: See admin-guide/dynamic-debug-howto.rst */
@@ -9101,7 +8982,7 @@ static inline void alloc_contig_dump_pages(struct list_head *page_list)
#endif
/* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(struct compact_control *cc,
+int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
@@ -9151,7 +9032,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
lru_cache_enable();
if (ret < 0) {
- if (ret == -EBUSY)
+ if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY)
alloc_contig_dump_pages(&cc->migratepages);
putback_movable_pages(&cc->migratepages);
return ret;
@@ -9169,8 +9050,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
* be either of the two.
* @gfp_mask: GFP mask to use during compaction
*
- * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
- * aligned. The PFN range must belong to a single zone.
+ * The PFN range does not have to be pageblock aligned. The PFN range must
+ * belong to a single zone.
*
* The first thing this routine does is attempt to MIGRATE_ISOLATE all
* pageblocks in the range. Once isolated, the pageblocks should not
@@ -9184,7 +9065,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype, gfp_t gfp_mask)
{
unsigned long outer_start, outer_end;
- unsigned int order;
+ int order;
int ret = 0;
struct compact_control cc = {
@@ -9203,14 +9084,11 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
* have different sizes, and due to the way page allocator
- * work, we align the range to biggest of the two pages so
- * that page allocator won't try to merge buddies from
- * different pageblocks and change MIGRATE_ISOLATE to some
- * other migration type.
+ * work, start_isolate_page_range() has special handlings for this.
*
* Once the pageblocks are marked as MIGRATE_ISOLATE, we
* migrate the pages from an unaligned range (ie. pages that
- * we are interested in). This will put all the pages in
+ * we are interested in). This will put all the pages in
* range back to page allocator as MIGRATE_ISOLATE.
*
* When this is done, we take the pages in range from page
@@ -9223,10 +9101,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
* put back to page allocator so that buddy can use them.
*/
- ret = start_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype, 0);
+ ret = start_isolate_page_range(start, end, migratetype, 0, gfp_mask);
if (ret)
- return ret;
+ goto done;
drain_all_pages(cc.zone);
@@ -9246,7 +9123,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = 0;
/*
- * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
+ * Pages from [start, end) are within a pageblock_nr_pages
* aligned blocks that are marked as MIGRATE_ISOLATE. What's
* more, all pages in [start, end) are free in page allocator.
* What we are going to do is to allocate all pages from
@@ -9305,8 +9182,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
free_contig_range(end, outer_end - end);
done:
- undo_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype);
+ undo_isolate_page_range(start, end, migratetype);
return ret;
}
EXPORT_SYMBOL(alloc_contig_range);
@@ -9625,7 +9501,6 @@ bool put_page_back_buddy(struct page *page)
ClearPageHWPoisonTakenOff(page);
__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
if (TestClearPageHWPoison(page)) {
- num_poisoned_pages_dec();
ret = true;
}
}
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2e66d934d63f..3dc715d7ac29 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -320,7 +320,7 @@ static int __meminit online_page_ext(unsigned long start_pfn,
* online__pages(), and start_pfn should exist.
*/
nid = pfn_to_nid(start_pfn);
- VM_BUG_ON(!node_state(nid, N_ONLINE));
+ VM_BUG_ON(!node_online(nid));
}
for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
diff --git a/mm/page_idle.c b/mm/page_idle.c
index fc0435abf909..bc08332a609c 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -86,11 +86,12 @@ static bool page_idle_clear_pte_refs_one(struct folio *folio,
static void page_idle_clear_pte_refs(struct page *page)
{
struct folio *folio = page_folio(page);
+
/*
- * Since rwc.arg is unused, rwc is effectively immutable, so we
- * can make it static const to save some cycles and stack.
+ * Since rwc.try_lock is unused, rwc is effectively immutable, so we
+ * can make it static to save some cycles and stack.
*/
- static const struct rmap_walk_control rwc = {
+ static struct rmap_walk_control rwc = {
.rmap_one = page_idle_clear_pte_refs_one,
.anon_lock = folio_lock_anon_vma_read,
};
diff --git a/mm/page_io.c b/mm/page_io.c
index a9444e67ec20..68318134dc92 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -26,6 +26,7 @@
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <linux/delayacct.h>
+#include "swap.h"
void end_swap_bio_write(struct bio *bio)
{
@@ -234,55 +235,119 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
#define bio_associate_blkg_from_page(bio, page) do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func)
+struct swap_iocb {
+ struct kiocb iocb;
+ struct bio_vec bvec[SWAP_CLUSTER_MAX];
+ int pages;
+ int len;
+};
+static mempool_t *sio_pool;
+
+int sio_pool_init(void)
{
- struct bio *bio;
- int ret;
- struct swap_info_struct *sis = page_swap_info(page);
+ if (!sio_pool) {
+ mempool_t *pool = mempool_create_kmalloc_pool(
+ SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
+ if (cmpxchg(&sio_pool, NULL, pool))
+ mempool_destroy(pool);
+ }
+ if (!sio_pool)
+ return -ENOMEM;
+ return 0;
+}
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (data_race(sis->flags & SWP_FS_OPS)) {
- struct kiocb kiocb;
- struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
- struct bio_vec bv = {
- .bv_page = page,
- .bv_len = PAGE_SIZE,
- .bv_offset = 0
- };
- struct iov_iter from;
-
- iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
- init_sync_kiocb(&kiocb, swap_file);
- kiocb.ki_pos = page_file_offset(page);
+static void sio_write_complete(struct kiocb *iocb, long ret)
+{
+ struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+ struct page *page = sio->bvec[0].bv_page;
+ int p;
- set_page_writeback(page);
- unlock_page(page);
- ret = mapping->a_ops->direct_IO(&kiocb, &from);
- if (ret == PAGE_SIZE) {
- count_vm_event(PSWPOUT);
- ret = 0;
- } else {
- /*
- * In the case of swap-over-nfs, this can be a
- * temporary failure if the system has limited
- * memory for allocating transmit buffers.
- * Mark the page dirty and avoid
- * folio_rotate_reclaimable but rate-limit the
- * messages but do not flag PageError like
- * the normal direct-to-bio case as it could
- * be temporary.
- */
+ if (ret != sio->len) {
+ /*
+ * In the case of swap-over-nfs, this can be a
+ * temporary failure if the system has limited
+ * memory for allocating transmit buffers.
+ * Mark the page dirty and avoid
+ * folio_rotate_reclaimable but rate-limit the
+ * messages but do not flag PageError like
+ * the normal direct-to-bio case as it could
+ * be temporary.
+ */
+ pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
+ ret, page_file_offset(page));
+ for (p = 0; p < sio->pages; p++) {
+ page = sio->bvec[p].bv_page;
set_page_dirty(page);
ClearPageReclaim(page);
- pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
- page_file_offset(page));
}
- end_page_writeback(page);
- return ret;
+ } else {
+ for (p = 0; p < sio->pages; p++)
+ count_swpout_vm_event(sio->bvec[p].bv_page);
}
+ for (p = 0; p < sio->pages; p++)
+ end_page_writeback(sio->bvec[p].bv_page);
+
+ mempool_free(sio, sio_pool);
+}
+
+static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
+{
+ struct swap_iocb *sio = NULL;
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct file *swap_file = sis->swap_file;
+ loff_t pos = page_file_offset(page);
+
+ set_page_writeback(page);
+ unlock_page(page);
+ if (wbc->swap_plug)
+ sio = *wbc->swap_plug;
+ if (sio) {
+ if (sio->iocb.ki_filp != swap_file ||
+ sio->iocb.ki_pos + sio->len != pos) {
+ swap_write_unplug(sio);
+ sio = NULL;
+ }
+ }
+ if (!sio) {
+ sio = mempool_alloc(sio_pool, GFP_NOIO);
+ init_sync_kiocb(&sio->iocb, swap_file);
+ sio->iocb.ki_complete = sio_write_complete;
+ sio->iocb.ki_pos = pos;
+ sio->pages = 0;
+ sio->len = 0;
+ }
+ sio->bvec[sio->pages].bv_page = page;
+ sio->bvec[sio->pages].bv_len = thp_size(page);
+ sio->bvec[sio->pages].bv_offset = 0;
+ sio->len += thp_size(page);
+ sio->pages += 1;
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
+ swap_write_unplug(sio);
+ sio = NULL;
+ }
+ if (wbc->swap_plug)
+ *wbc->swap_plug = sio;
+
+ return 0;
+}
+
+int __swap_writepage(struct page *page, struct writeback_control *wbc,
+ bio_end_io_t end_write_func)
+{
+ struct bio *bio;
+ int ret;
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ /*
+ * ->flags can be updated non-atomicially (scan_swap_map_slots),
+ * but that will never affect SWP_FS_OPS, so the data_race
+ * is safe.
+ */
+ if (data_race(sis->flags & SWP_FS_OPS))
+ return swap_writepage_fs(page, wbc);
+
ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc);
if (!ret) {
count_swpout_vm_event(page);
@@ -305,7 +370,83 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
-int swap_readpage(struct page *page, bool synchronous)
+void swap_write_unplug(struct swap_iocb *sio)
+{
+ struct iov_iter from;
+ struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+ int ret;
+
+ iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, sio->len);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ if (ret != -EIOCBQUEUED)
+ sio_write_complete(&sio->iocb, ret);
+}
+
+static void sio_read_complete(struct kiocb *iocb, long ret)
+{
+ struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
+ int p;
+
+ if (ret == sio->len) {
+ for (p = 0; p < sio->pages; p++) {
+ struct page *page = sio->bvec[p].bv_page;
+
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ count_vm_events(PSWPIN, sio->pages);
+ } else {
+ for (p = 0; p < sio->pages; p++) {
+ struct page *page = sio->bvec[p].bv_page;
+
+ SetPageError(page);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ }
+ pr_alert_ratelimited("Read-error on swap-device\n");
+ }
+ mempool_free(sio, sio_pool);
+}
+
+static void swap_readpage_fs(struct page *page,
+ struct swap_iocb **plug)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct swap_iocb *sio = NULL;
+ loff_t pos = page_file_offset(page);
+
+ if (plug)
+ sio = *plug;
+ if (sio) {
+ if (sio->iocb.ki_filp != sis->swap_file ||
+ sio->iocb.ki_pos + sio->len != pos) {
+ swap_read_unplug(sio);
+ sio = NULL;
+ }
+ }
+ if (!sio) {
+ sio = mempool_alloc(sio_pool, GFP_KERNEL);
+ init_sync_kiocb(&sio->iocb, sis->swap_file);
+ sio->iocb.ki_pos = pos;
+ sio->iocb.ki_complete = sio_read_complete;
+ sio->pages = 0;
+ sio->len = 0;
+ }
+ sio->bvec[sio->pages].bv_page = page;
+ sio->bvec[sio->pages].bv_len = thp_size(page);
+ sio->bvec[sio->pages].bv_offset = 0;
+ sio->len += thp_size(page);
+ sio->pages += 1;
+ if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
+ swap_read_unplug(sio);
+ sio = NULL;
+ }
+ if (plug)
+ *plug = sio;
+}
+
+int swap_readpage(struct page *page, bool synchronous,
+ struct swap_iocb **plug)
{
struct bio *bio;
int ret = 0;
@@ -333,12 +474,7 @@ int swap_readpage(struct page *page, bool synchronous)
}
if (data_race(sis->flags & SWP_FS_OPS)) {
- struct file *swap_file = sis->swap_file;
- struct address_space *mapping = swap_file->f_mapping;
-
- ret = mapping->a_ops->read_folio(swap_file, page_folio(page));
- if (!ret)
- count_vm_event(PSWPIN);
+ swap_readpage_fs(page, plug);
goto out;
}
@@ -383,19 +519,14 @@ out:
return ret;
}
-bool swap_dirty_folio(struct address_space *mapping, struct folio *folio)
+void __swap_read_unplug(struct swap_iocb *sio)
{
- struct swap_info_struct *sis = swp_swap_info(folio_swap_entry(folio));
-
- if (data_race(sis->flags & SWP_FS_OPS)) {
- const struct address_space_operations *aops;
-
- mapping = sis->swap_file->f_mapping;
- aops = mapping->a_ops;
+ struct iov_iter from;
+ struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
+ int ret;
- VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
- return aops->dirty_folio(mapping, folio);
- } else {
- return noop_dirty_folio(mapping, folio);
- }
+ iov_iter_bvec(&from, READ, sio->bvec, sio->pages, sio->len);
+ ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
+ if (ret != -EIOCBQUEUED)
+ sio_read_complete(&sio->iocb, ret);
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f67c4c70f17f..c643c8420809 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,11 +15,142 @@
#define CREATE_TRACE_POINTS
#include <trace/events/page_isolation.h>
-static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
+/*
+ * This function checks whether the range [start_pfn, end_pfn) includes
+ * unmovable pages or not. The range must fall into a single pageblock and
+ * consequently belong to a single zone.
+ *
+ * PageLRU check without isolation or lru_lock could race so that
+ * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
+ * check without lock_page also may miss some movable non-lru pages at
+ * race condition. So you can't expect this function should be exact.
+ *
+ * Returns a page without holding a reference. If the caller wants to
+ * dereference that page (e.g., dumping), it has to make sure that it
+ * cannot get removed (e.g., via memory unplug) concurrently.
+ *
+ */
+static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long end_pfn,
+ int migratetype, int flags)
+{
+ struct page *page = pfn_to_page(start_pfn);
+ struct zone *zone = page_zone(page);
+ unsigned long pfn;
+
+ VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
+ ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
+
+ if (is_migrate_cma_page(page)) {
+ /*
+ * CMA allocations (alloc_contig_range) really need to mark
+ * isolate CMA pageblocks even when they are not movable in fact
+ * so consider them movable here.
+ */
+ if (is_migrate_cma(migratetype))
+ return NULL;
+
+ return page;
+ }
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ page = pfn_to_page(pfn);
+
+ /*
+ * Both, bootmem allocations and memory holes are marked
+ * PG_reserved and are unmovable. We can even have unmovable
+ * allocations inside ZONE_MOVABLE, for example when
+ * specifying "movablecore".
+ */
+ if (PageReserved(page))
+ return page;
+
+ /*
+ * If the zone is movable and we have ruled out all reserved
+ * pages then it should be reasonably safe to assume the rest
+ * is movable.
+ */
+ if (zone_idx(zone) == ZONE_MOVABLE)
+ continue;
+
+ /*
+ * Hugepages are not in LRU lists, but they're movable.
+ * THPs are on the LRU, but need to be counted as #small pages.
+ * We need not scan over tail pages because we don't
+ * handle each tail page individually in migration.
+ */
+ if (PageHuge(page) || PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+ unsigned int skip_pages;
+
+ if (PageHuge(page)) {
+ if (!hugepage_migration_supported(page_hstate(head)))
+ return page;
+ } else if (!PageLRU(head) && !__PageMovable(head)) {
+ return page;
+ }
+
+ skip_pages = compound_nr(head) - (page - head);
+ pfn += skip_pages - 1;
+ continue;
+ }
+
+ /*
+ * We can't use page_count without pin a page
+ * because another CPU can free compound page.
+ * This check already skips compound tails of THP
+ * because their page->_refcount is zero at all time.
+ */
+ if (!page_ref_count(page)) {
+ if (PageBuddy(page))
+ pfn += (1 << buddy_order(page)) - 1;
+ continue;
+ }
+
+ /*
+ * The HWPoisoned page may be not in buddy system, and
+ * page_count() is not 0.
+ */
+ if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
+ continue;
+
+ /*
+ * We treat all PageOffline() pages as movable when offlining
+ * to give drivers a chance to decrement their reference count
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
+ * can be offlined as there are no direct references anymore.
+ * For actually unmovable PageOffline() where the driver does
+ * not support this, we will fail later when trying to actually
+ * move these pages that still have a reference count > 0.
+ * (false negatives in this function only)
+ */
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+ continue;
+
+ if (__PageMovable(page) || PageLRU(page))
+ continue;
+
+ /*
+ * If there are RECLAIMABLE pages, we need to check
+ * it. But now, memory offline itself doesn't call
+ * shrink_node_slabs() and it still to be fixed.
+ */
+ return page;
+ }
+ return NULL;
+}
+
+/*
+ * This function set pageblock migratetype to isolate if no unmovable page is
+ * present in [start_pfn, end_pfn). The pageblock must intersect with
+ * [start_pfn, end_pfn).
+ */
+static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags,
+ unsigned long start_pfn, unsigned long end_pfn)
{
struct zone *zone = page_zone(page);
struct page *unmovable;
unsigned long flags;
+ unsigned long check_unmovable_start, check_unmovable_end;
spin_lock_irqsave(&zone->lock, flags);
@@ -36,8 +167,16 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
/*
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
+ *
+ * Pass the intersection of [start_pfn, end_pfn) and the page's pageblock
+ * to avoid redundant checks.
*/
- unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
+ check_unmovable_start = max(page_to_pfn(page), start_pfn);
+ check_unmovable_end = min(ALIGN(page_to_pfn(page) + 1, pageblock_nr_pages),
+ end_pfn);
+
+ unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
+ migratetype, isol_flags);
if (!unmovable) {
unsigned long nr_pages;
int mt = get_pageblock_migratetype(page);
@@ -64,13 +203,12 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
return -EBUSY;
}
-static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+static void unset_migratetype_isolate(struct page *page, int migratetype)
{
struct zone *zone;
unsigned long flags, nr_pages;
bool isolated_page = false;
unsigned int order;
- unsigned long pfn, buddy_pfn;
struct page *buddy;
zone = page_zone(page);
@@ -89,11 +227,9 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
if (PageBuddy(page)) {
order = buddy_order(page);
if (order >= pageblock_order && order < MAX_ORDER - 1) {
- pfn = page_to_pfn(page);
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
-
- if (!is_migrate_isolate_page(buddy)) {
+ buddy = find_buddy_page_pfn(page, page_to_pfn(page),
+ order, NULL);
+ if (buddy && !is_migrate_isolate_page(buddy)) {
isolated_page = !!__isolate_free_page(page, order);
/*
* Isolating a free page in an isolated pageblock
@@ -144,11 +280,198 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
}
/**
+ * isolate_single_pageblock() -- tries to isolate a pageblock that might be
+ * within a free or in-use page.
+ * @boundary_pfn: pageblock-aligned pfn that a page might cross
+ * @flags: isolation flags
+ * @gfp_flags: GFP flags used for migrating pages
+ * @isolate_before: isolate the pageblock before the boundary_pfn
+ *
+ * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
+ * pageblock. When not all pageblocks within a page are isolated at the same
+ * time, free page accounting can go wrong. For example, in the case of
+ * MAX_ORDER-1 = pageblock_order + 1, a MAX_ORDER-1 page has two pagelbocks.
+ * [ MAX_ORDER-1 ]
+ * [ pageblock0 | pageblock1 ]
+ * When either pageblock is isolated, if it is a free page, the page is not
+ * split into separate migratetype lists, which is supposed to; if it is an
+ * in-use page and freed later, __free_one_page() does not split the free page
+ * either. The function handles this by splitting the free page or migrating
+ * the in-use page then splitting the free page.
+ */
+static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
+ gfp_t gfp_flags, bool isolate_before)
+{
+ unsigned char saved_mt;
+ unsigned long start_pfn;
+ unsigned long isolate_pageblock;
+ unsigned long pfn;
+ struct zone *zone;
+ int ret;
+
+ VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
+
+ if (isolate_before)
+ isolate_pageblock = boundary_pfn - pageblock_nr_pages;
+ else
+ isolate_pageblock = boundary_pfn;
+
+ /*
+ * scan at the beginning of MAX_ORDER_NR_PAGES aligned range to avoid
+ * only isolating a subset of pageblocks from a bigger than pageblock
+ * free or in-use page. Also make sure all to-be-isolated pageblocks
+ * are within the same zone.
+ */
+ zone = page_zone(pfn_to_page(isolate_pageblock));
+ start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
+ zone->zone_start_pfn);
+
+ saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
+ isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Bail out early when the to-be-isolated pageblock does not form
+ * a free or in-use page across boundary_pfn:
+ *
+ * 1. isolate before boundary_pfn: the page after is not online
+ * 2. isolate after boundary_pfn: the page before is not online
+ *
+ * This also ensures correctness. Without it, when isolate after
+ * boundary_pfn and [start_pfn, boundary_pfn) are not online,
+ * __first_valid_page() will return unexpected NULL in the for loop
+ * below.
+ */
+ if (isolate_before) {
+ if (!pfn_to_online_page(boundary_pfn))
+ return 0;
+ } else {
+ if (!pfn_to_online_page(boundary_pfn - 1))
+ return 0;
+ }
+
+ for (pfn = start_pfn; pfn < boundary_pfn;) {
+ struct page *page = __first_valid_page(pfn, boundary_pfn - pfn);
+
+ VM_BUG_ON(!page);
+ pfn = page_to_pfn(page);
+ /*
+ * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
+ * free pages in [start_pfn, boundary_pfn), its head page will
+ * always be in the range.
+ */
+ if (PageBuddy(page)) {
+ int order = buddy_order(page);
+
+ if (pfn + (1UL << order) > boundary_pfn)
+ split_free_page(page, order, boundary_pfn - pfn);
+ pfn += (1UL << order);
+ continue;
+ }
+ /*
+ * migrate compound pages then let the free page handling code
+ * above do the rest. If migration is not possible, just fail.
+ */
+ if (PageCompound(page)) {
+ unsigned long nr_pages = compound_nr(page);
+ struct page *head = compound_head(page);
+ unsigned long head_pfn = page_to_pfn(head);
+
+ if (head_pfn + nr_pages <= boundary_pfn) {
+ pfn = head_pfn + nr_pages;
+ continue;
+ }
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ /*
+ * hugetlb, lru compound (THP), and movable compound pages
+ * can be migrated. Otherwise, fail the isolation.
+ */
+ if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
+ int order;
+ unsigned long outer_pfn;
+ int page_mt = get_pageblock_migratetype(page);
+ bool isolate_page = !is_migrate_isolate_page(page);
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(head_pfn)),
+ .mode = MIGRATE_SYNC,
+ .ignore_skip_hint = true,
+ .no_set_skip_hint = true,
+ .gfp_mask = gfp_flags,
+ .alloc_contig = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ /*
+ * XXX: mark the page as MIGRATE_ISOLATE so that
+ * no one else can grab the freed page after migration.
+ * Ideally, the page should be freed as two separate
+ * pages to be added into separate migratetype free
+ * lists.
+ */
+ if (isolate_page) {
+ ret = set_migratetype_isolate(page, page_mt,
+ flags, head_pfn, head_pfn + nr_pages);
+ if (ret)
+ goto failed;
+ }
+
+ ret = __alloc_contig_migrate_range(&cc, head_pfn,
+ head_pfn + nr_pages);
+
+ /*
+ * restore the page's migratetype so that it can
+ * be split into separate migratetype free lists
+ * later.
+ */
+ if (isolate_page)
+ unset_migratetype_isolate(page, page_mt);
+
+ if (ret)
+ goto failed;
+ /*
+ * reset pfn to the head of the free page, so
+ * that the free page handling code above can split
+ * the free page to the right migratetype list.
+ *
+ * head_pfn is not used here as a hugetlb page order
+ * can be bigger than MAX_ORDER-1, but after it is
+ * freed, the free page order is not. Use pfn within
+ * the range to find the head of the free page.
+ */
+ order = 0;
+ outer_pfn = pfn;
+ while (!PageBuddy(pfn_to_page(outer_pfn))) {
+ /* stop if we cannot find the free page */
+ if (++order >= MAX_ORDER)
+ goto failed;
+ outer_pfn &= ~0UL << order;
+ }
+ pfn = outer_pfn;
+ continue;
+ } else
+#endif
+ goto failed;
+ }
+
+ pfn++;
+ }
+ return 0;
+failed:
+ /* restore the original migratetype */
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+ return -EBUSY;
+}
+
+/**
* start_isolate_page_range() - make page-allocation-type of range of pages to
* be MIGRATE_ISOLATE.
* @start_pfn: The lower PFN of the range to be isolated.
* @end_pfn: The upper PFN of the range to be isolated.
- * start_pfn/end_pfn must be aligned to pageblock_order.
* @migratetype: Migrate type to set in error recovery.
* @flags: The following flags are allowed (they can be combined in
* a bit mask)
@@ -157,6 +480,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* and PageOffline() pages.
* REPORT_FAILURE - report details about the failure to
* isolate the range
+ * @gfp_flags: GFP flags used for migrating pages that sit across the
+ * range boundaries.
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
@@ -165,6 +490,10 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* pages in the range finally, the caller have to free all pages in the range.
* test_page_isolated() can be used for test it.
*
+ * The function first tries to isolate the pageblocks at the beginning and end
+ * of the range, since there might be pages across the range boundaries.
+ * Afterwards, it isolates the rest of the range.
+ *
* There is no high level synchronization mechanism that prevents two threads
* from trying to isolate overlapping ranges. If this happens, one thread
* will notice pageblocks in the overlapping range already set to isolate.
@@ -185,20 +514,38 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype, int flags)
+ int migratetype, int flags, gfp_t gfp_flags)
{
unsigned long pfn;
struct page *page;
+ /* isolation is done at page block granularity */
+ unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
+ unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
+ int ret;
+
+ /* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
+ ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false);
+ if (ret)
+ return ret;
- BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
- BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+ /* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
+ ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true);
+ if (ret) {
+ unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
+ return ret;
+ }
- for (pfn = start_pfn;
- pfn < end_pfn;
+ /* skip isolated pageblocks at the beginning and end */
+ for (pfn = isolate_start + pageblock_nr_pages;
+ pfn < isolate_end - pageblock_nr_pages;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page && set_migratetype_isolate(page, migratetype, flags)) {
- undo_isolate_page_range(start_pfn, pfn, migratetype);
+ if (page && set_migratetype_isolate(page, migratetype, flags,
+ start_pfn, end_pfn)) {
+ undo_isolate_page_range(isolate_start, pfn, migratetype);
+ unset_migratetype_isolate(
+ pfn_to_page(isolate_end - pageblock_nr_pages),
+ migratetype);
return -EBUSY;
}
}
@@ -209,16 +556,16 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
* Make isolated pages available again.
*/
void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype)
+ int migratetype)
{
unsigned long pfn;
struct page *page;
+ unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
+ unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
- BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
- BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
- for (pfn = start_pfn;
- pfn < end_pfn;
+ for (pfn = isolate_start;
+ pfn < isolate_end;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
if (!page || !is_migrate_isolate_page(page))
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 2743062e92c2..e4c6f3f1695b 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -34,7 +34,7 @@ struct page_owner {
pid_t tgid;
};
-static bool page_owner_enabled = false;
+static bool page_owner_enabled __initdata;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static depot_stack_handle_t dummy_handle;
@@ -171,7 +171,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
page_owner->pid = current->pid;
page_owner->tgid = current->tgid;
page_owner->ts_nsec = local_clock();
- strlcpy(page_owner->comm, current->comm,
+ strscpy(page_owner->comm, current->comm,
sizeof(page_owner->comm));
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 2458281bff89..3692bea2ea2c 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -52,23 +52,6 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
return (void *)(page_ext) + page_table_check_ops.offset;
}
-static inline bool pte_user_accessible_page(pte_t pte)
-{
- return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER);
-}
-
-static inline bool pmd_user_accessible_page(pmd_t pmd)
-{
- return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) &&
- (pmd_val(pmd) & _PAGE_USER);
-}
-
-static inline bool pud_user_accessible_page(pud_t pud)
-{
- return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) &&
- (pud_val(pud) & _PAGE_USER);
-}
-
/*
* An enty is removed from the page table, decrement the counters for that page
* verify that it is of correct type and counters do not become negative.
@@ -177,7 +160,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
if (pmd_user_accessible_page(pmd)) {
page_table_check_clear(mm, addr, pmd_pfn(pmd),
- PMD_PAGE_SIZE >> PAGE_SHIFT);
+ PMD_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pmd_clear);
@@ -190,7 +173,7 @@ void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
if (pud_user_accessible_page(pud)) {
page_table_check_clear(mm, addr, pud_pfn(pud),
- PUD_PAGE_SIZE >> PAGE_SHIFT);
+ PUD_SIZE >> PAGE_SHIFT);
}
}
EXPORT_SYMBOL(__page_table_check_pud_clear);
@@ -219,7 +202,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
__page_table_check_pmd_clear(mm, addr, *pmdp);
if (pmd_user_accessible_page(pmd)) {
page_table_check_set(mm, addr, pmd_pfn(pmd),
- PMD_PAGE_SIZE >> PAGE_SHIFT,
+ PMD_SIZE >> PAGE_SHIFT,
pmd_write(pmd));
}
}
@@ -234,7 +217,7 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
__page_table_check_pud_clear(mm, addr, *pudp);
if (pud_user_accessible_page(pud)) {
page_table_check_set(mm, addr, pud_pfn(pud),
- PUD_PAGE_SIZE >> PAGE_SHIFT,
+ PUD_SIZE >> PAGE_SHIFT,
pud_write(pud));
}
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 14a5cda73dee..c10f839fc410 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -210,16 +210,10 @@ restart:
*/
pmde = READ_ONCE(*pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
+ (pmd_present(pmde) && pmd_devmap(pmde))) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
- if (likely(pmd_trans_huge(pmde))) {
- if (pvmw->flags & PVMW_MIGRATION)
- return not_found(pvmw);
- if (!check_pmd(pmd_pfn(pmde), pvmw))
- return not_found(pvmw);
- return true;
- }
if (!pmd_present(pmde)) {
swp_entry_t entry;
@@ -232,6 +226,13 @@ restart:
return not_found(pvmw);
return true;
}
+ if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
+ if (pvmw->flags & PVMW_MIGRATION)
+ return not_found(pvmw);
+ if (!check_pmd(pmd_pfn(pmde), pvmw))
+ return not_found(pvmw);
+ return true;
+ }
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 411d1593ef23..70b1ea23f4d2 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -113,7 +113,6 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
-#ifdef CONFIG_MEMCG_KMEM
/**
* pcpu_obj_full_size - helper to calculate size of each accounted object
* @size: size of area to allocate in bytes
@@ -123,13 +122,14 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
*/
static inline size_t pcpu_obj_full_size(size_t size)
{
- size_t extra_size;
+ size_t extra_size = 0;
- extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#ifdef CONFIG_MEMCG_KMEM
+ extra_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+#endif
return size * num_possible_cpus() + extra_size;
}
-#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_PERCPU_STATS
diff --git a/mm/percpu.c b/mm/percpu.c
index ea28db283044..3633eeefaa0d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1884,8 +1884,9 @@ area_found:
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
kmemleak_alloc_percpu(ptr, size, gfp);
- trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
- chunk->base_addr, off, ptr);
+ trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
+ chunk->base_addr, off, ptr,
+ pcpu_obj_full_size(size), gfp);
pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 6523fda274e5..90ab721a12a8 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -201,6 +201,14 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
}
#endif
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
+pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ return pmdp_invalidate(vma, address, pmdp);
+}
+#endif
+
#ifndef pmdp_collapse_flush
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
diff --git a/mm/rmap.c b/mm/rmap.c
index fedb82371efe..5bcb334cd6f2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -73,6 +73,7 @@
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
#include <asm/tlbflush.h>
@@ -298,7 +299,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
* Reuse existing anon_vma if its degree lower than two,
* that means it has no vma and only one anon_vma child.
*
- * Do not chose parent anon_vma, otherwise first child
+ * Do not choose parent anon_vma, otherwise first child
* will always reuse it. Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
@@ -526,9 +527,11 @@ out:
*
* Its a little more complex as it tries to keep the fast path to a single
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex.
+ * reference like with page_get_anon_vma() and then block on the mutex
+ * on !rwc->try_lock case.
*/
-struct anon_vma *folio_lock_anon_vma_read(struct folio *folio)
+struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+ struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
@@ -556,6 +559,12 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio)
goto out;
}
+ if (rwc && rwc->try_lock) {
+ anon_vma = NULL;
+ rwc->contended = true;
+ goto out;
+ }
+
/* trylock failed, we got to sleep */
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
@@ -882,7 +891,8 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
*
* Quick test_and_clear_referenced for all mappings of a folio,
*
- * Return: The number of mappings which referenced the folio.
+ * Return: The number of mappings which referenced the folio. Return -1 if
+ * the function bailed out due to rmap lock contention.
*/
int folio_referenced(struct folio *folio, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags)
@@ -896,6 +906,7 @@ int folio_referenced(struct folio *folio, int is_locked,
.rmap_one = folio_referenced_one,
.arg = (void *)&pra,
.anon_lock = folio_lock_anon_vma_read,
+ .try_lock = true,
};
*vm_flags = 0;
@@ -926,15 +937,15 @@ int folio_referenced(struct folio *folio, int is_locked,
if (we_locked)
folio_unlock(folio);
- return pra.referenced;
+ return rwc.contended ? -1 : pra.referenced;
}
-static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
- unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+ int cleaned = 0;
+ struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
- int *cleaned = arg;
+ unsigned long address = pvmw->address;
/*
* We have to assume the worse case ie pmd for invalidation. Note that
@@ -942,16 +953,16 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- vma_address_end(&pvmw));
+ vma_address_end(pvmw));
mmu_notifier_invalidate_range_start(&range);
- while (page_vma_mapped_walk(&pvmw)) {
+ while (page_vma_mapped_walk(pvmw)) {
int ret = 0;
- address = pvmw.address;
- if (pvmw.pte) {
+ address = pvmw->address;
+ if (pvmw->pte) {
pte_t entry;
- pte_t *pte = pvmw.pte;
+ pte_t *pte = pvmw->pte;
if (!pte_dirty(*pte) && !pte_write(*pte))
continue;
@@ -964,13 +975,14 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- pmd_t *pmd = pvmw.pmd;
+ pmd_t *pmd = pvmw->pmd;
pmd_t entry;
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
- flush_cache_page(vma, address, folio_pfn(folio));
+ flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
@@ -990,11 +1002,22 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
* See Documentation/vm/mmu_notifier.rst
*/
if (ret)
- (*cleaned)++;
+ cleaned++;
}
mmu_notifier_invalidate_range_end(&range);
+ return cleaned;
+}
+
+static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+ int *cleaned = arg;
+
+ *cleaned += page_vma_mkclean_one(&pvmw);
+
return true;
}
@@ -1032,6 +1055,38 @@ int folio_mkclean(struct folio *folio)
EXPORT_SYMBOL_GPL(folio_mkclean);
/**
+ * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
+ * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
+ * within the @vma of shared mappings. And since clean PTEs
+ * should also be readonly, write protects them too.
+ * @pfn: start pfn.
+ * @nr_pages: number of physically contiguous pages srarting with @pfn.
+ * @pgoff: page offset that the @pfn mapped with.
+ * @vma: vma that @pfn mapped within.
+ *
+ * Returns the number of cleaned PTEs (including PMDs).
+ */
+int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
+ struct vm_area_struct *vma)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .pfn = pfn,
+ .nr_pages = nr_pages,
+ .pgoff = pgoff,
+ .vma = vma,
+ .flags = PVMW_SYNC,
+ };
+
+ if (invalid_mkclean_vma(vma, NULL))
+ return 0;
+
+ pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
+ VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
+
+ return page_vma_mkclean_one(&pvmw);
+}
+
+/**
* page_move_anon_rmap - move a page to our anon_vma
* @page: the page to move to our anon_vma
* @vma: the vma the page belongs to
@@ -1044,6 +1099,7 @@ EXPORT_SYMBOL_GPL(folio_mkclean);
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
+ struct page *subpage = page;
page = compound_head(page);
@@ -1057,6 +1113,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
* folio_test_anon()) will not see one without the other.
*/
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
+ SetPageAnonExclusive(subpage);
}
/**
@@ -1074,7 +1131,7 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
if (PageAnon(page))
- return;
+ goto out;
/*
* If the page isn't exclusively mapped into this vma,
@@ -1093,6 +1150,9 @@ static void __page_set_anon_rmap(struct page *page,
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
page->index = linear_page_index(vma, address);
+out:
+ if (exclusive)
+ SetPageAnonExclusive(page);
}
/**
@@ -1127,7 +1187,7 @@ static void __page_check_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
- * @compound: charge the page as compound or small page
+ * @flags: the rmap flags
*
* The caller needs to hold the pte lock, and the page must be locked in
* the anon_vma case: to serialize mapping,index checking after setting,
@@ -1135,18 +1195,7 @@ static void __page_check_anon_rmap(struct page *page,
* (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, bool compound)
-{
- do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
-}
-
-/*
- * Special version of the above for do_swap_page, which often runs
- * into pages that are exclusively owned by the current process.
- * Everybody else should continue to use page_add_anon_rmap above.
- */
-void do_page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int flags)
+ struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
bool compound = flags & RMAP_COMPOUND;
bool first;
@@ -1165,6 +1214,8 @@ void do_page_add_anon_rmap(struct page *page,
} else {
first = atomic_inc_and_test(&page->_mapcount);
}
+ VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+ VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (first) {
int nr = compound ? thp_nr_pages(page) : 1;
@@ -1185,7 +1236,7 @@ void do_page_add_anon_rmap(struct page *page,
/* address might be in next vma when migration races vma_adjust */
else if (first)
__page_set_anon_rmap(page, vma, address,
- flags & RMAP_EXCLUSIVE);
+ !!(flags & RMAP_EXCLUSIVE));
else
__page_check_anon_rmap(page, vma, address);
@@ -1193,19 +1244,22 @@ void do_page_add_anon_rmap(struct page *page,
}
/**
- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * page_add_new_anon_rmap - add mapping to a new anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
- * @compound: charge the page as compound or small page
+ *
+ * If it's a compound page, it is accounted as a compound page. As the page
+ * is new, it's assume to get mapped exclusively by a single process.
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, bool compound)
+ struct vm_area_struct *vma, unsigned long address)
{
+ const bool compound = PageCompound(page);
int nr = compound ? thp_nr_pages(page) : 1;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
@@ -1218,8 +1272,6 @@ void page_add_new_anon_rmap(struct page *page,
__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
} else {
- /* Anon THP always mapped first with PMD */
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */
atomic_set(&page->_mapcount, 0);
}
@@ -1425,7 +1477,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
pte_t pteval;
struct page *subpage;
- bool ret = true;
+ bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
@@ -1481,59 +1533,81 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
subpage = folio_page(folio,
pte_pfn(*pvmw.pte) - folio_pfn(folio));
address = pvmw.address;
+ anon_exclusive = folio_test_anon(folio) &&
+ PageAnonExclusive(subpage);
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
+ if (folio_test_hugetlb(folio)) {
/*
- * To call huge_pmd_unshare, i_mmap_rwsem must be
- * held in write mode. Caller needs to explicitly
- * do this outside rmap routines.
+ * The try_to_unmap() is only passed a hugetlb page
+ * in the case where the hugetlb page is poisoned.
*/
- VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
- /*
- * huge_pmd_unshare unmapped an entire PMD
- * page. There is no way of knowing exactly
- * which PMDs may be cached for this mm, so
- * we must flush them all. start/end were
- * already adjusted above to cover this range.
- */
- flush_cache_range(vma, range.start, range.end);
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
+ VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
+ /*
+ * huge_pmd_unshare may unmap an entire PMD page.
+ * There is no way of knowing exactly which PMDs may
+ * be cached for this mm, so we must flush them all.
+ * start/end were already adjusted above to cover this
+ * range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+ if (!folio_test_anon(folio)) {
/*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
*/
- page_vma_mapped_walk_done(&pvmw);
- break;
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
}
- }
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- if (should_defer_flush(mm, flags)) {
+ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+ } else {
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
/*
- * We clear the PTE but do not flush so potentially
- * a remote CPU could still be writing to the folio.
- * If the entry was previously clean then the
- * architecture must guarantee that a clear->dirty
- * transition on a cached TLB entry is written through
- * and traps if the PTE is unmapped.
+ * Nuke the page table entry. When having to clear
+ * PageAnonExclusive(), we always have to flush.
*/
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+ if (should_defer_flush(mm, flags) && !anon_exclusive) {
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
- } else {
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
+ } else {
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ }
}
+ /*
+ * Now the pte is cleared. If this pte was uffd-wp armed,
+ * we may want to replace a none pte with a marker pte if
+ * it's file-backed, so we don't lose the tracking info.
+ */
+ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
+
/* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pteval))
folio_mark_dirty(folio);
@@ -1637,11 +1711,31 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ swap_free(entry);
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ if (anon_exclusive &&
+ page_try_share_anon_rmap(subpage)) {
+ swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
+ /*
+ * Note: We *don't* remember if the page was mapped
+ * exclusively in the swap pte if the architecture
+ * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In
+ * that case, swapin code has to re-determine that
+ * manually and might detect the page as possibly
+ * shared, for example, if there are other references on
+ * the page or if the page is under writeback. We made
+ * sure that there are no GUP pins on the page that
+ * would rely on it, so for GUP pins this is fine.
+ */
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
@@ -1651,6 +1745,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, MM_ANONPAGES);
inc_mm_counter(mm, MM_SWAPENTS);
swp_pte = swp_entry_to_pte(entry);
+ if (anon_exclusive)
+ swp_pte = pte_swp_mkexclusive(swp_pte);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
@@ -1741,7 +1837,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
pte_t pteval;
struct page *subpage;
- bool ret = true;
+ bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
@@ -1791,7 +1887,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
!folio_test_pmd_mappable(folio), folio);
- set_pmd_migration_entry(&pvmw, subpage);
+ if (set_pmd_migration_entry(&pvmw, subpage)) {
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
continue;
}
#endif
@@ -1802,44 +1902,53 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
subpage = folio_page(folio,
pte_pfn(*pvmw.pte) - folio_pfn(folio));
address = pvmw.address;
+ anon_exclusive = folio_test_anon(folio) &&
+ PageAnonExclusive(subpage);
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
+ if (folio_test_hugetlb(folio)) {
/*
- * To call huge_pmd_unshare, i_mmap_rwsem must be
- * held in write mode. Caller needs to explicitly
- * do this outside rmap routines.
+ * huge_pmd_unshare may unmap an entire PMD page.
+ * There is no way of knowing exactly which PMDs may
+ * be cached for this mm, so we must flush them all.
+ * start/end were already adjusted above to cover this
+ * range.
*/
- VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
- /*
- * huge_pmd_unshare unmapped an entire PMD
- * page. There is no way of knowing exactly
- * which PMDs may be cached for this mm, so
- * we must flush them all. start/end were
- * already adjusted above to cover this range.
- */
- flush_cache_range(vma, range.start, range.end);
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
+ flush_cache_range(vma, range.start, range.end);
+ if (!folio_test_anon(folio)) {
/*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
*/
- page_vma_mapped_walk_done(&pvmw);
- break;
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
}
- }
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ /* Nuke the hugetlb page table entry */
+ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+ } else {
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ /* Nuke the page table entry. */
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ }
/* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pteval))
@@ -1853,6 +1962,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t swp_pte;
+ if (anon_exclusive)
+ BUG_ON(page_try_share_anon_rmap(subpage));
+
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
@@ -1861,6 +1973,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
entry = pte_to_swp_entry(pteval);
if (is_writable_device_private_entry(entry))
entry = make_writable_migration_entry(pfn);
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(pfn);
else
entry = make_readable_migration_entry(pfn);
swp_pte = swp_entry_to_pte(entry);
@@ -1920,7 +2034,22 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pte_t swp_pte;
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- set_pte_at(mm, address, pvmw.pte, pteval);
+ if (folio_test_hugetlb(folio))
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ else
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
+ !anon_exclusive, subpage);
+ if (anon_exclusive &&
+ page_try_share_anon_rmap(subpage)) {
+ if (folio_test_hugetlb(folio))
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ else
+ set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
@@ -1934,6 +2063,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_write(pteval))
entry = make_writable_migration_entry(
page_to_pfn(subpage));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(subpage));
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
@@ -1943,7 +2075,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
+ if (folio_test_hugetlb(folio))
+ set_huge_swap_pte_at(mm, address, pvmw.pte,
+ swp_pte, vma_mmu_pagesize(vma));
+ else
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
trace_set_migration_pte(address, pte_val(swp_pte),
compound_order(&folio->page));
/*
@@ -2148,7 +2284,7 @@ static bool folio_make_device_exclusive(struct folio *folio,
/**
* make_device_exclusive_range() - Mark a range for exclusive use by a device
- * @mm: mm_struct of assoicated target process
+ * @mm: mm_struct of associated target process
* @start: start of the region to mark for exclusive device access
* @end: end address of region
* @pages: returns the pages which were successfully marked for exclusive access
@@ -2210,12 +2346,12 @@ void __put_anon_vma(struct anon_vma *anon_vma)
}
static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
- const struct rmap_walk_control *rwc)
+ struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
if (rwc->anon_lock)
- return rwc->anon_lock(folio);
+ return rwc->anon_lock(folio, rwc);
/*
* Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
@@ -2227,7 +2363,17 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
if (!anon_vma)
return NULL;
+ if (anon_vma_trylock_read(anon_vma))
+ goto out;
+
+ if (rwc->try_lock) {
+ anon_vma = NULL;
+ rwc->contended = true;
+ goto out;
+ }
+
anon_vma_lock_read(anon_vma);
+out:
return anon_vma;
}
@@ -2241,7 +2387,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
* contained in the anon_vma struct it points to.
*/
static void rmap_walk_anon(struct folio *folio,
- const struct rmap_walk_control *rwc, bool locked)
+ struct rmap_walk_control *rwc, bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff_start, pgoff_end;
@@ -2289,7 +2435,7 @@ static void rmap_walk_anon(struct folio *folio,
* contained in the address_space struct it points to.
*/
static void rmap_walk_file(struct folio *folio,
- const struct rmap_walk_control *rwc, bool locked)
+ struct rmap_walk_control *rwc, bool locked)
{
struct address_space *mapping = folio_mapping(folio);
pgoff_t pgoff_start, pgoff_end;
@@ -2308,8 +2454,18 @@ static void rmap_walk_file(struct folio *folio,
pgoff_start = folio_pgoff(folio);
pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
- if (!locked)
+ if (!locked) {
+ if (i_mmap_trylock_read(mapping))
+ goto lookup;
+
+ if (rwc->try_lock) {
+ rwc->contended = true;
+ return;
+ }
+
i_mmap_lock_read(mapping);
+ }
+lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
unsigned long address = vma_address(&folio->page, vma);
@@ -2331,7 +2487,7 @@ done:
i_mmap_unlock_read(mapping);
}
-void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc)
+void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
if (unlikely(folio_test_ksm(folio)))
rmap_walk_ksm(folio, rwc);
@@ -2342,7 +2498,7 @@ void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc)
}
/* Like rmap_walk, but caller holds relevant rmap lock */
-void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc)
+void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
{
/* no ksm support for now */
VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
@@ -2357,9 +2513,11 @@ void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc)
* The following two functions are for anonymous (private mapped) hugepages.
* Unlike common anonymous pages, anonymous hugepages have no accounting code
* and no lru code, because we handle hugepages differently from common pages.
+ *
+ * RMAP_COMPOUND is ignored.
*/
-void hugepage_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, rmap_t flags)
{
struct anon_vma *anon_vma = vma->anon_vma;
int first;
@@ -2368,8 +2526,11 @@ void hugepage_add_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
/* address might be in next vma when migration races vma_adjust */
first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+ VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (first)
- __page_set_anon_rmap(page, vma, address, 0);
+ __page_set_anon_rmap(page, vma, address,
+ !!(flags & RMAP_EXCLUSIVE));
}
void hugepage_add_new_anon_rmap(struct page *page,
diff --git a/mm/shmem.c b/mm/shmem.c
index f3e8de8ff75c..da30c769b376 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -34,10 +34,10 @@
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
-#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
+#include "swap.h"
static struct vfsmount *shm_mnt;
@@ -134,8 +134,8 @@ static unsigned long shmem_default_max_inodes(void)
}
#endif
-static int shmem_swapin_page(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
+static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
vm_fault_t *fault_type);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -553,7 +553,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
LIST_HEAD(to_remove);
struct inode *inode;
struct shmem_inode_info *info;
- struct page *page;
+ struct folio *folio;
unsigned long batch = sc ? sc->nr_to_scan : 128;
int split = 0;
@@ -597,6 +597,7 @@ next:
list_for_each_safe(pos, next, &list) {
int ret;
+ pgoff_t index;
info = list_entry(pos, struct shmem_inode_info, shrinklist);
inode = &info->vfs_inode;
@@ -604,14 +605,14 @@ next:
if (nr_to_split && split >= nr_to_split)
goto move_back;
- page = find_get_page(inode->i_mapping,
- (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
- if (!page)
+ index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
+ folio = filemap_get_folio(inode->i_mapping, index);
+ if (!folio)
goto drop;
/* No huge page at the end of the file: nothing to split */
- if (!PageTransHuge(page)) {
- put_page(page);
+ if (!folio_test_large(folio)) {
+ folio_put(folio);
goto drop;
}
@@ -622,14 +623,14 @@ next:
* Waiting for the lock may lead to deadlock in the
* reclaim path.
*/
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
goto move_back;
}
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ ret = split_huge_page(&folio->page);
+ folio_unlock(folio);
+ folio_put(folio);
/* If split failed move the inode on the list back to shrinklist */
if (ret)
@@ -694,36 +695,35 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
/*
* Like add_to_page_cache_locked, but error if expected item has gone.
*/
-static int shmem_add_to_page_cache(struct page *page,
+static int shmem_add_to_page_cache(struct folio *folio,
struct address_space *mapping,
pgoff_t index, void *expected, gfp_t gfp,
struct mm_struct *charge_mm)
{
- XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
- unsigned long nr = compound_nr(page);
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
+ long nr = folio_nr_pages(folio);
int error;
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(index != round_down(index, nr), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- VM_BUG_ON(expected && PageTransHuge(page));
+ VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
+ VM_BUG_ON(expected && folio_test_large(folio));
- page_ref_add(page, nr);
- page->mapping = mapping;
- page->index = index;
+ folio_ref_add(folio, nr);
+ folio->mapping = mapping;
+ folio->index = index;
- if (!PageSwapCache(page)) {
- error = mem_cgroup_charge(page_folio(page), charge_mm, gfp);
+ if (!folio_test_swapcache(folio)) {
+ error = mem_cgroup_charge(folio, charge_mm, gfp);
if (error) {
- if (PageTransHuge(page)) {
+ if (folio_test_pmd_mappable(folio)) {
count_vm_event(THP_FILE_FALLBACK);
count_vm_event(THP_FILE_FALLBACK_CHARGE);
}
goto error;
}
}
- cgroup_throttle_swaprate(page, gfp);
+ folio_throttle_swaprate(folio, gfp);
do {
xas_lock_irq(&xas);
@@ -735,16 +735,16 @@ static int shmem_add_to_page_cache(struct page *page,
xas_set_err(&xas, -EEXIST);
goto unlock;
}
- xas_store(&xas, page);
+ xas_store(&xas, folio);
if (xas_error(&xas))
goto unlock;
- if (PageTransHuge(page)) {
+ if (folio_test_pmd_mappable(folio)) {
count_vm_event(THP_FILE_ALLOC);
- __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
}
mapping->nrpages += nr;
- __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
- __mod_lruvec_page_state(page, NR_SHMEM, nr);
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
@@ -756,8 +756,8 @@ unlock:
return 0;
error:
- page->mapping = NULL;
- page_ref_sub(page, nr);
+ folio->mapping = NULL;
+ folio_ref_sub(folio, nr);
return error;
}
@@ -1158,69 +1158,63 @@ static void shmem_evict_inode(struct inode *inode)
}
static int shmem_find_swap_entries(struct address_space *mapping,
- pgoff_t start, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices,
- unsigned int type)
+ pgoff_t start, struct folio_batch *fbatch,
+ pgoff_t *indices, unsigned int type)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
+ struct folio *folio;
swp_entry_t entry;
- unsigned int ret = 0;
-
- if (!nr_entries)
- return 0;
rcu_read_lock();
- xas_for_each(&xas, page, ULONG_MAX) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ if (xas_retry(&xas, folio))
continue;
- if (!xa_is_value(page))
+ if (!xa_is_value(folio))
continue;
- entry = radix_to_swp_entry(page);
+ entry = radix_to_swp_entry(folio);
if (swp_type(entry) != type)
continue;
- indices[ret] = xas.xa_index;
- entries[ret] = page;
+ indices[folio_batch_count(fbatch)] = xas.xa_index;
+ if (!folio_batch_add(fbatch, folio))
+ break;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
}
- if (++ret == nr_entries)
- break;
}
rcu_read_unlock();
- return ret;
+ return xas.xa_index;
}
/*
* Move the swapped pages for an inode to page cache. Returns the count
* of pages swapped in, or the error in case of failure.
*/
-static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
- pgoff_t *indices)
+static int shmem_unuse_swap_entries(struct inode *inode,
+ struct folio_batch *fbatch, pgoff_t *indices)
{
int i = 0;
int ret = 0;
int error = 0;
struct address_space *mapping = inode->i_mapping;
- for (i = 0; i < pvec.nr; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(page))
+ if (!xa_is_value(folio))
continue;
- error = shmem_swapin_page(inode, indices[i],
- &page, SGP_CACHE,
+ error = shmem_swapin_folio(inode, indices[i],
+ &folio, SGP_CACHE,
mapping_gfp_mask(mapping),
NULL, NULL);
if (error == 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
ret++;
}
if (error == -ENOMEM)
@@ -1237,26 +1231,23 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t start = 0;
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
int ret = 0;
- pagevec_init(&pvec);
do {
- unsigned int nr_entries = PAGEVEC_SIZE;
-
- pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
- pvec.pages, indices, type);
- if (pvec.nr == 0) {
+ folio_batch_init(&fbatch);
+ shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
+ if (folio_batch_count(&fbatch) == 0) {
ret = 0;
break;
}
- ret = shmem_unuse_swap_entries(inode, pvec, indices);
+ ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
if (ret < 0)
break;
- start = indices[pvec.nr - 1];
+ start = indices[folio_batch_count(&fbatch) - 1];
} while (true);
return ret;
@@ -1312,6 +1303,7 @@ int shmem_unuse(unsigned int type)
*/
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
struct shmem_inode_info *info;
struct address_space *mapping;
struct inode *inode;
@@ -1385,7 +1377,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
SetPageUptodate(page);
}
- swap = get_swap_page(page);
+ swap = folio_alloc_swap(folio);
if (!swap.val)
goto redirty;
@@ -1521,13 +1513,13 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
return result;
}
-static struct page *shmem_alloc_hugepage(gfp_t gfp,
+static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
struct address_space *mapping = info->vfs_inode.i_mapping;
pgoff_t hindex;
- struct page *page;
+ struct folio *folio;
hindex = round_down(index, HPAGE_PMD_NR);
if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
@@ -1535,34 +1527,37 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex);
- page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
+ folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
shmem_pseudo_vma_destroy(&pvma);
- if (page)
- prep_transhuge_page(page);
- else
+ if (!folio)
count_vm_event(THP_FILE_FALLBACK);
- return page;
+ return folio;
}
-static struct page *shmem_alloc_page(gfp_t gfp,
+static struct folio *shmem_alloc_folio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
- struct page *page;
+ struct folio *folio;
shmem_pseudo_vma_init(&pvma, info, index);
- page = alloc_page_vma(gfp, &pvma, 0);
+ folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
shmem_pseudo_vma_destroy(&pvma);
- return page;
+ return folio;
+}
+
+static struct page *shmem_alloc_page(gfp_t gfp,
+ struct shmem_inode_info *info, pgoff_t index)
+{
+ return &shmem_alloc_folio(gfp, info, index)->page;
}
-static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
- struct inode *inode,
+static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
pgoff_t index, bool huge)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- struct page *page;
+ struct folio *folio;
int nr;
int err = -ENOSPC;
@@ -1574,13 +1569,13 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
goto failed;
if (huge)
- page = shmem_alloc_hugepage(gfp, info, index);
+ folio = shmem_alloc_hugefolio(gfp, info, index);
else
- page = shmem_alloc_page(gfp, info, index);
- if (page) {
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- return page;
+ folio = shmem_alloc_folio(gfp, info, index);
+ if (folio) {
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ return folio;
}
err = -ENOMEM;
@@ -1601,9 +1596,9 @@ failed:
* NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
* but for now it is a simple matter of zone.
*/
-static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{
- return page_zonenum(page) > gfp_zone(gfp);
+ return folio_zonenum(folio) > gfp_zone(gfp);
}
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
@@ -1682,8 +1677,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* Returns 0 and the page in pagep if success. On failure, returns the
* error code and NULL in *pagep.
*/
-static int shmem_swapin_page(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
+static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
vm_fault_t *fault_type)
{
@@ -1691,12 +1686,13 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
struct page *page;
+ struct folio *folio = NULL;
swp_entry_t swap;
int error;
- VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
- swap = radix_to_swp_entry(*pagep);
- *pagep = NULL;
+ VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
+ swap = radix_to_swp_entry(*foliop);
+ *foliop = NULL;
/* Look it up and read it in.. */
page = lookup_swap_cache(swap, NULL, 0);
@@ -1714,33 +1710,35 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
goto failed;
}
}
+ folio = page_folio(page);
/* We have to do this with page locked to prevent races */
- lock_page(page);
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ folio_lock(folio);
+ if (!folio_test_swapcache(folio) ||
+ folio_swap_entry(folio).val != swap.val ||
!shmem_confirm_swap(mapping, index, swap)) {
error = -EEXIST;
goto unlock;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
error = -EIO;
goto failed;
}
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
/*
* Some architectures may have to restore extra metadata to the
- * physical page after reading from swap.
+ * folio after reading from swap.
*/
- arch_swap_restore(swap, page);
+ arch_swap_restore(swap, folio);
- if (shmem_should_replace_page(page, gfp)) {
+ if (shmem_should_replace_folio(folio, gfp)) {
error = shmem_replace_page(&page, gfp, info, index);
if (error)
goto failed;
}
- error = shmem_add_to_page_cache(page, mapping, index,
+ error = shmem_add_to_page_cache(folio, mapping, index,
swp_to_radix_entry(swap), gfp,
charge_mm);
if (error)
@@ -1752,21 +1750,21 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
spin_unlock_irq(&info->lock);
if (sgp == SGP_WRITE)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
- delete_from_swap_cache(page);
- set_page_dirty(page);
+ delete_from_swap_cache(&folio->page);
+ folio_mark_dirty(folio);
swap_free(swap);
- *pagep = page;
+ *foliop = folio;
return 0;
failed:
if (!shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
unlock:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
return error;
@@ -1791,7 +1789,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
- struct page *page;
+ struct folio *folio;
pgoff_t hindex = index;
gfp_t huge_gfp;
int error;
@@ -1809,39 +1807,37 @@ repeat:
sbinfo = SHMEM_SB(inode->i_sb);
charge_mm = vma ? vma->vm_mm : NULL;
- page = pagecache_get_page(mapping, index,
- FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
-
- if (page && vma && userfaultfd_minor(vma)) {
- if (!xa_is_value(page)) {
- unlock_page(page);
- put_page(page);
+ folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
+ if (folio && vma && userfaultfd_minor(vma)) {
+ if (!xa_is_value(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
}
*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
return 0;
}
- if (xa_is_value(page)) {
- error = shmem_swapin_page(inode, index, &page,
+ if (xa_is_value(folio)) {
+ error = shmem_swapin_folio(inode, index, &folio,
sgp, gfp, vma, fault_type);
if (error == -EEXIST)
goto repeat;
- *pagep = page;
+ *pagep = &folio->page;
return error;
}
- if (page) {
- hindex = page->index;
+ if (folio) {
+ hindex = folio->index;
if (sgp == SGP_WRITE)
- mark_page_accessed(page);
- if (PageUptodate(page))
+ folio_mark_accessed(folio);
+ if (folio_test_uptodate(folio))
goto out;
/* fallocated page */
if (sgp != SGP_READ)
goto clear;
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
/*
@@ -1868,17 +1864,16 @@ repeat:
huge_gfp = vma_thp_gfp_mask(vma);
huge_gfp = limit_gfp_mask(huge_gfp, gfp);
- page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
- if (IS_ERR(page)) {
+ folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
+ if (IS_ERR(folio)) {
alloc_nohuge:
- page = shmem_alloc_and_acct_page(gfp, inode,
- index, false);
+ folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
}
- if (IS_ERR(page)) {
+ if (IS_ERR(folio)) {
int retry = 5;
- error = PTR_ERR(page);
- page = NULL;
+ error = PTR_ERR(folio);
+ folio = NULL;
if (error != -ENOSPC)
goto unlock;
/*
@@ -1897,29 +1892,26 @@ alloc_nohuge:
goto unlock;
}
- if (PageTransHuge(page))
- hindex = round_down(index, HPAGE_PMD_NR);
- else
- hindex = index;
+ hindex = round_down(index, folio_nr_pages(folio));
if (sgp == SGP_WRITE)
- __SetPageReferenced(page);
+ __folio_set_referenced(folio);
- error = shmem_add_to_page_cache(page, mapping, hindex,
+ error = shmem_add_to_page_cache(folio, mapping, hindex,
NULL, gfp & GFP_RECLAIM_MASK,
charge_mm);
if (error)
goto unacct;
- lru_cache_add(page);
+ folio_add_lru(folio);
spin_lock_irq(&info->lock);
- info->alloced += compound_nr(page);
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
+ info->alloced += folio_nr_pages(folio);
+ inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
alloced = true;
- if (PageTransHuge(page) &&
+ if (folio_test_pmd_mappable(folio) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
hindex + HPAGE_PMD_NR - 1) {
/*
@@ -1950,22 +1942,21 @@ clear:
* but SGP_FALLOC on a page fallocated earlier must initialize
* it now, lest undo on failure cancel our earlier guarantee.
*/
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
- int i;
+ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
+ long i, n = folio_nr_pages(folio);
- for (i = 0; i < compound_nr(page); i++) {
- clear_highpage(page + i);
- flush_dcache_page(page + i);
- }
- SetPageUptodate(page);
+ for (i = 0; i < n; i++)
+ clear_highpage(folio_page(folio, i));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
}
/* Perhaps the file has been truncated since we checked */
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
if (alloced) {
- ClearPageDirty(page);
- delete_from_page_cache(page);
+ folio_clear_dirty(folio);
+ filemap_remove_folio(folio);
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
@@ -1974,24 +1965,24 @@ clear:
goto unlock;
}
out:
- *pagep = page + index - hindex;
+ *pagep = folio_page(folio, index - hindex);
return 0;
/*
* Error recovery.
*/
unacct:
- shmem_inode_unacct_blocks(inode, compound_nr(page));
+ shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
- if (PageTransHuge(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio_test_large(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
goto alloc_nohuge;
}
unlock:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
if (error == -ENOSPC && !once++) {
spin_lock_irq(&info->lock);
@@ -2239,11 +2230,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
- (vma->vm_end & HPAGE_PMD_MASK)) {
- khugepaged_enter(vma, vma->vm_flags);
- }
return 0;
}
@@ -2318,7 +2304,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- bool zeropage,
+ bool zeropage, bool wp_copy,
struct page **pagep)
{
struct inode *inode = file_inode(dst_vma->vm_file);
@@ -2327,6 +2313,7 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
void *page_kaddr;
+ struct folio *folio;
struct page *page;
int ret;
pgoff_t max_off;
@@ -2385,13 +2372,14 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (unlikely(pgoff >= max_off))
goto out_release;
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
+ folio = page_folio(page);
+ ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
goto out_release;
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- page, true, false);
+ page, true, wp_copy);
if (ret)
goto out_delete_from_cache;
@@ -3487,6 +3475,10 @@ static int shmem_reconfigure(struct fs_context *fc)
raw_spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+ if (ctx->blocks > S64_MAX) {
+ err = "Number of blocks too large";
+ goto out;
+ }
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
err = "Cannot retroactively limit size";
@@ -3888,7 +3880,7 @@ static struct file_system_type shmem_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-int __init shmem_init(void)
+void __init shmem_init(void)
{
int error;
@@ -3913,14 +3905,13 @@ int __init shmem_init(void)
else
shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
#endif
- return 0;
+ return;
out1:
unregister_filesystem(&shmem_fs_type);
out2:
shmem_destroy_inodecache();
shm_mnt = ERR_PTR(error);
- return error;
}
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
@@ -3998,14 +3989,12 @@ static struct file_system_type shmem_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-int __init shmem_init(void)
+void __init shmem_init(void)
{
BUG_ON(register_filesystem(&shmem_fs_type) != 0);
shm_mnt = kern_mount(&shmem_fs_type);
BUG_ON(IS_ERR(shm_mnt));
-
- return 0;
}
int shmem_unuse(unsigned int type)
@@ -4145,12 +4134,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
- ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
- (vma->vm_end & HPAGE_PMD_MASK)) {
- khugepaged_enter(vma, vma->vm_flags);
- }
-
return 0;
}
diff --git a/mm/slab.c b/mm/slab.c
index a301f266efd1..f8cd00f4ba13 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2994,10 +2994,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
- if (ARCH_SLAB_MINALIGN &&
- ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
- pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n",
- objp, (int)ARCH_SLAB_MINALIGN);
+ if ((unsigned long)objp & (arch_slab_minalign() - 1)) {
+ pr_err("0x%px: not aligned to arch_slab_minalign()=%u\n", objp,
+ arch_slab_minalign());
}
return objp;
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d1f3133847ad..77c3adf40e50 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -155,8 +155,7 @@ static unsigned int calculate_alignment(slab_flags_t flags,
align = max(align, ralign);
}
- if (align < ARCH_SLAB_MINALIGN)
- align = ARCH_SLAB_MINALIGN;
+ align = max(align, arch_slab_minalign());
return ALIGN(align, sizeof(void *));
}
diff --git a/mm/slob.c b/mm/slob.c
index 40ea6e2d4ccd..f47811f09aca 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -478,9 +478,11 @@ static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
unsigned int *m;
- int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ unsigned int minalign;
void *ret;
+ minalign = max_t(unsigned int, ARCH_KMALLOC_MINALIGN,
+ arch_slab_minalign());
gfp &= gfp_allowed_mask;
might_alloc(gfp);
@@ -493,7 +495,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
* kmalloc()'d objects.
*/
if (is_power_of_2(size))
- align = max(minalign, (int) size);
+ align = max_t(unsigned int, minalign, size);
if (!size)
return ZERO_SIZE_PTR;
@@ -555,8 +557,11 @@ void kfree(const void *block)
sp = virt_to_folio(block);
if (folio_test_slab(sp)) {
- int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ unsigned int align = max_t(unsigned int,
+ ARCH_KMALLOC_MINALIGN,
+ arch_slab_minalign());
unsigned int *m = (unsigned int *)(block - align);
+
slob_free(m, *m + align);
} else {
unsigned int order = folio_order(sp);
@@ -573,7 +578,7 @@ EXPORT_SYMBOL(kfree);
size_t __ksize(const void *block)
{
struct folio *folio;
- int align;
+ unsigned int align;
unsigned int *m;
BUG_ON(!block);
@@ -584,7 +589,8 @@ size_t __ksize(const void *block)
if (unlikely(!folio_test_slab(folio)))
return folio_size(folio);
- align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ align = max_t(unsigned int, ARCH_KMALLOC_MINALIGN,
+ arch_slab_minalign());
m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 8aecd6b3896c..f4fa61dbbee3 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -34,7 +34,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
-#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
/**
* struct vmemmap_remap_walk - walk vmemmap page table
*
@@ -420,7 +420,7 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
return 0;
}
-#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
/*
* Allocate a block of memory to be used to back the virtual memory map
@@ -533,16 +533,31 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
}
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
- struct vmem_altmap *altmap)
+ struct vmem_altmap *altmap,
+ struct page *reuse)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
void *p;
- p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
- if (!p)
- return NULL;
+ if (!reuse) {
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+ if (!p)
+ return NULL;
+ } else {
+ /*
+ * When a PTE/PMD entry is freed from the init_mm
+ * there's a a free_pages() call to this page allocated
+ * above. Thus this get_page() is paired with the
+ * put_page_testzero() on the freeing path.
+ * This can only called by certain ZONE_DEVICE path,
+ * and through vmemmap_populate_compound_pages() when
+ * slab is available.
+ */
+ get_page(reuse);
+ p = page_to_virt(reuse);
+ }
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
}
@@ -608,49 +623,166 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
return pgd;
}
-int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
- int node, struct vmem_altmap *altmap)
+static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
{
- unsigned long addr = start;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
+ pgd = vmemmap_pgd_populate(addr, node);
+ if (!pgd)
+ return NULL;
+ p4d = vmemmap_p4d_populate(pgd, addr, node);
+ if (!p4d)
+ return NULL;
+ pud = vmemmap_pud_populate(p4d, addr, node);
+ if (!pud)
+ return NULL;
+ pmd = vmemmap_pmd_populate(pud, addr, node);
+ if (!pmd)
+ return NULL;
+ pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
+ if (!pte)
+ return NULL;
+ vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ return pte;
+}
+
+static int __meminit vmemmap_populate_range(unsigned long start,
+ unsigned long end, int node,
+ struct vmem_altmap *altmap,
+ struct page *reuse)
+{
+ unsigned long addr = start;
+ pte_t *pte;
+
for (; addr < end; addr += PAGE_SIZE) {
- pgd = vmemmap_pgd_populate(addr, node);
- if (!pgd)
- return -ENOMEM;
- p4d = vmemmap_p4d_populate(pgd, addr, node);
- if (!p4d)
+ pte = vmemmap_populate_address(addr, node, altmap, reuse);
+ if (!pte)
return -ENOMEM;
- pud = vmemmap_pud_populate(p4d, addr, node);
- if (!pud)
+ }
+
+ return 0;
+}
+
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
+{
+ return vmemmap_populate_range(start, end, node, altmap, NULL);
+}
+
+/*
+ * For compound pages bigger than section size (e.g. x86 1G compound
+ * pages with 2M subsection size) fill the rest of sections as tail
+ * pages.
+ *
+ * Note that memremap_pages() resets @nr_range value and will increment
+ * it after each range successful onlining. Thus the value or @nr_range
+ * at section memmap populate corresponds to the in-progress range
+ * being onlined here.
+ */
+static bool __meminit reuse_compound_section(unsigned long start_pfn,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
+ unsigned long offset = start_pfn -
+ PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
+
+ return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
+}
+
+static pte_t * __meminit compound_section_tail_page(unsigned long addr)
+{
+ pte_t *pte;
+
+ addr -= PAGE_SIZE;
+
+ /*
+ * Assuming sections are populated sequentially, the previous section's
+ * page data can be reused.
+ */
+ pte = pte_offset_kernel(pmd_off_k(addr), addr);
+ if (!pte)
+ return NULL;
+
+ return pte;
+}
+
+static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
+ unsigned long start,
+ unsigned long end, int node,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long size, addr;
+ pte_t *pte;
+ int rc;
+
+ if (reuse_compound_section(start_pfn, pgmap)) {
+ pte = compound_section_tail_page(start);
+ if (!pte)
return -ENOMEM;
- pmd = vmemmap_pmd_populate(pud, addr, node);
- if (!pmd)
+
+ /*
+ * Reuse the page that was populated in the prior iteration
+ * with just tail struct pages.
+ */
+ return vmemmap_populate_range(start, end, node, NULL,
+ pte_page(*pte));
+ }
+
+ size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
+ for (addr = start; addr < end; addr += size) {
+ unsigned long next = addr, last = addr + size;
+
+ /* Populate the head page vmemmap page */
+ pte = vmemmap_populate_address(addr, node, NULL, NULL);
+ if (!pte)
return -ENOMEM;
- pte = vmemmap_pte_populate(pmd, addr, node, altmap);
+
+ /* Populate the tail pages vmemmap page */
+ next = addr + PAGE_SIZE;
+ pte = vmemmap_populate_address(next, node, NULL, NULL);
if (!pte)
return -ENOMEM;
- vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+
+ /*
+ * Reuse the previous page for the rest of tail pages
+ * See layout diagram in Documentation/vm/vmemmap_dedup.rst
+ */
+ next += PAGE_SIZE;
+ rc = vmemmap_populate_range(next, last, node, NULL,
+ pte_page(*pte));
+ if (rc)
+ return -ENOMEM;
}
return 0;
}
struct page * __meminit __populate_section_memmap(unsigned long pfn,
- unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
+ int r;
if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
return NULL;
- if (vmemmap_populate(start, end, nid, altmap))
+ if (is_power_of_2(sizeof(struct page)) &&
+ pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
+ r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
+ else
+ r = vmemmap_populate(start, end, nid, altmap);
+
+ if (r < 0)
return NULL;
return pfn_to_page(pfn);
diff --git a/mm/sparse.c b/mm/sparse.c
index 952f06d8f373..cb3bfae64036 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -427,7 +427,8 @@ static unsigned long __init section_map_size(void)
}
struct page __init *__populate_section_memmap(unsigned long pfn,
- unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long size = section_map_size();
struct page *map = sparse_buffer_alloc(size);
@@ -524,7 +525,7 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
break;
map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
- nid, NULL);
+ nid, NULL, NULL);
if (!map) {
pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
__func__, nid);
@@ -629,9 +630,10 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,
- unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
- return __populate_section_memmap(pfn, nr_pages, nid, altmap);
+ return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
}
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
@@ -700,7 +702,8 @@ static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
}
#else
struct page * __meminit populate_section_memmap(unsigned long pfn,
- unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
return kvmalloc_node(array_size(sizeof(struct page),
PAGES_PER_SECTION), GFP_KERNEL, nid);
@@ -823,7 +826,8 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
}
static struct page * __meminit section_activate(int nid, unsigned long pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
struct mem_section *ms = __pfn_to_section(pfn);
struct mem_section_usage *usage = NULL;
@@ -855,7 +859,7 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
if (nr_pages < PAGES_PER_SECTION && early_section(ms))
return pfn_to_page(pfn);
- memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+ memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
if (!memmap) {
section_deactivate(pfn, nr_pages, altmap);
return ERR_PTR(-ENOMEM);
@@ -869,7 +873,8 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
* @nid: The node to add section on
* @start_pfn: start pfn of the memory range
* @nr_pages: number of pfns to add in the section
- * @altmap: device page map
+ * @altmap: alternate pfns to allocate the memmap backing store
+ * @pgmap: alternate compound page geometry for devmap mappings
*
* This is only intended for hotplug.
*
@@ -883,7 +888,8 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn,
* * -ENOMEM - Out of memory.
*/
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages, struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct mem_section *ms;
@@ -894,7 +900,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
if (ret < 0)
return ret;
- memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+ memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap);
if (IS_ERR(memmap))
return PTR_ERR(memmap);
@@ -916,33 +922,6 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
return 0;
}
-#ifdef CONFIG_MEMORY_FAILURE
-static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
- int i;
-
- /*
- * A further optimization is to have per section refcounted
- * num_poisoned_pages. But that would need more space per memmap, so
- * for now just do a quick global check to speed up this routine in the
- * absence of bad pages.
- */
- if (atomic_long_read(&num_poisoned_pages) == 0)
- return;
-
- for (i = 0; i < nr_pages; i++) {
- if (PageHWPoison(&memmap[i])) {
- num_poisoned_pages_dec();
- ClearPageHWPoison(&memmap[i]);
- }
- }
-}
-#else
-static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
-{
-}
-#endif
-
void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
unsigned long nr_pages, unsigned long map_offset,
struct vmem_altmap *altmap)
diff --git a/mm/swap.c b/mm/swap.c
index 7e320ec08c6a..f3922a96b2e9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -166,10 +166,10 @@ EXPORT_SYMBOL(put_pages_list);
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_segs long.
*
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno. Each page returned must be released
- * with a put_page() call when it is finished with.
+ * Returns number of pages pinned. This may be fewer than the number requested.
+ * If nr_segs is 0 or negative, returns 0. If no pages were pinned, returns 0.
+ * Each page returned must be released with a put_page() call when it is
+ * finished with.
*/
int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
struct page **pages)
@@ -748,7 +748,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
* Calling this function with cpu hotplug locks held can actually lead
* to obscure indirect dependencies via WQ context.
*/
-inline void __lru_add_drain_all(bool force_all_cpus)
+static inline void __lru_add_drain_all(bool force_all_cpus)
{
/*
* lru_drain_gen - Global pages generation number
diff --git a/mm/swap.h b/mm/swap.h
new file mode 100644
index 000000000000..0193797b0c92
--- /dev/null
+++ b/mm/swap.h
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _MM_SWAP_H
+#define _MM_SWAP_H
+
+#ifdef CONFIG_SWAP
+#include <linux/blk_types.h> /* for bio_end_io_t */
+
+/* linux/mm/page_io.c */
+int sio_pool_init(void);
+struct swap_iocb;
+int swap_readpage(struct page *page, bool do_poll,
+ struct swap_iocb **plug);
+void __swap_read_unplug(struct swap_iocb *plug);
+static inline void swap_read_unplug(struct swap_iocb *plug)
+{
+ if (unlikely(plug))
+ __swap_read_unplug(plug);
+}
+void swap_write_unplug(struct swap_iocb *sio);
+int swap_writepage(struct page *page, struct writeback_control *wbc);
+void end_swap_bio_write(struct bio *bio);
+int __swap_writepage(struct page *page, struct writeback_control *wbc,
+ bio_end_io_t end_write_func);
+
+/* linux/mm/swap_state.c */
+/* One swap address space for each 64M swap space */
+#define SWAP_ADDRESS_SPACE_SHIFT 14
+#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
+extern struct address_space *swapper_spaces[];
+#define swap_address_space(entry) \
+ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
+ >> SWAP_ADDRESS_SPACE_SHIFT])
+
+void show_swap_cache_info(void);
+bool add_to_swap(struct folio *folio);
+void *get_shadow_from_swap_cache(swp_entry_t entry);
+int add_to_swap_cache(struct page *page, swp_entry_t entry,
+ gfp_t gfp, void **shadowp);
+void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry, void *shadow);
+void delete_from_swap_cache(struct page *page);
+void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end);
+void free_swap_cache(struct page *page);
+struct page *lookup_swap_cache(swp_entry_t entry,
+ struct vm_area_struct *vma,
+ unsigned long addr);
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
+
+struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ bool do_poll,
+ struct swap_iocb **plug);
+struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ bool *new_page_allocated);
+struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
+ struct vm_fault *vmf);
+struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
+ struct vm_fault *vmf);
+
+static inline unsigned int page_swap_flags(struct page *page)
+{
+ return page_swap_info(page)->flags;
+}
+#else /* CONFIG_SWAP */
+struct swap_iocb;
+static inline int swap_readpage(struct page *page, bool do_poll,
+ struct swap_iocb **plug)
+{
+ return 0;
+}
+static inline void swap_write_unplug(struct swap_iocb *sio)
+{
+}
+
+static inline struct address_space *swap_address_space(swp_entry_t entry)
+{
+ return NULL;
+}
+
+static inline void free_swap_cache(struct page *page)
+{
+}
+
+static inline void show_swap_cache_info(void)
+{
+}
+
+static inline struct page *swap_cluster_readahead(swp_entry_t entry,
+ gfp_t gfp_mask, struct vm_fault *vmf)
+{
+ return NULL;
+}
+
+static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
+ struct vm_fault *vmf)
+{
+ return NULL;
+}
+
+static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
+{
+ return 0;
+}
+
+static inline struct page *lookup_swap_cache(swp_entry_t swp,
+ struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ return NULL;
+}
+
+static inline
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+{
+ return find_get_page(mapping, index);
+}
+
+static inline bool add_to_swap(struct folio *folio)
+{
+ return false;
+}
+
+static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+ return NULL;
+}
+
+static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
+ gfp_t gfp_mask, void **shadowp)
+{
+ return -1;
+}
+
+static inline void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry, void *shadow)
+{
+}
+
+static inline void delete_from_swap_cache(struct page *page)
+{
+}
+
+static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end)
+{
+}
+
+static inline unsigned int page_swap_flags(struct page *page)
+{
+ return 0;
+}
+#endif /* CONFIG_SWAP */
+#endif /* _MM_SWAP_H */
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 2b5531840583..2a65a89b5b4d 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -117,7 +117,7 @@ static int alloc_swap_slot_cache(unsigned int cpu)
/*
* Do allocation outside swap_slots_cache_mutex
- * as kvzalloc could trigger reclaim and get_swap_page,
+ * as kvzalloc could trigger reclaim and folio_alloc_swap,
* which can lock swap_slots_cache_mutex.
*/
slots = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t),
@@ -213,7 +213,7 @@ static void __drain_swap_slots_cache(unsigned int type)
* this function can be invoked in the cpu
* hot plug path:
* cpu_up -> lock cpu_hotplug -> cpu hotplug state callback
- * -> memory allocation -> direct reclaim -> get_swap_page
+ * -> memory allocation -> direct reclaim -> folio_alloc_swap
* -> drain_swap_slots_cache
*
* Hence the loop over current online cpu below could miss cpu that
@@ -258,7 +258,7 @@ out_unlock:
/* called with swap slot cache's alloc lock held */
static int refill_swap_slots_cache(struct swap_slots_cache *cache)
{
- if (!use_swap_slot_cache || cache->nr)
+ if (!use_swap_slot_cache)
return 0;
cache->cur = 0;
@@ -269,7 +269,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
return cache->nr;
}
-int free_swap_slot(swp_entry_t entry)
+void free_swap_slot(swp_entry_t entry)
{
struct swap_slots_cache *cache;
@@ -297,20 +297,18 @@ int free_swap_slot(swp_entry_t entry)
direct_free:
swapcache_free_entries(&entry, 1);
}
-
- return 0;
}
-swp_entry_t get_swap_page(struct page *page)
+swp_entry_t folio_alloc_swap(struct folio *folio)
{
swp_entry_t entry;
struct swap_slots_cache *cache;
entry.val = 0;
- if (PageTransHuge(page)) {
+ if (folio_test_large(folio)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
- get_swap_pages(1, &entry, HPAGE_PMD_NR);
+ get_swap_pages(1, &entry, folio_nr_pages(folio));
goto out;
}
@@ -344,8 +342,8 @@ repeat:
get_swap_pages(1, &entry, 1);
out:
- if (mem_cgroup_try_charge_swap(page, entry)) {
- put_swap_page(page, entry);
+ if (mem_cgroup_try_charge_swap(folio, entry)) {
+ put_swap_page(&folio->page, entry);
entry.val = 0;
}
return entry;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 013856004825..b9e4ed2e90bf 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -23,6 +23,7 @@
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
+#include "swap.h"
/*
* swapper_space is a fiction, retained to simplify the path through
@@ -30,7 +31,7 @@
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .dirty_folio = swap_dirty_folio,
+ .dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
#endif
@@ -175,23 +176,26 @@ void __delete_from_swap_cache(struct page *page,
}
/**
- * add_to_swap - allocate swap space for a page
- * @page: page we want to move to swap
+ * add_to_swap - allocate swap space for a folio
+ * @folio: folio we want to move to swap
*
- * Allocate swap space for the page and add the page to the
- * swap cache. Caller needs to hold the page lock.
+ * Allocate swap space for the folio and add the folio to the
+ * swap cache.
+ *
+ * Context: Caller needs to hold the folio lock.
+ * Return: Whether the folio was added to the swap cache.
*/
-int add_to_swap(struct page *page)
+bool add_to_swap(struct folio *folio)
{
swp_entry_t entry;
int err;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageUptodate(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
- entry = get_swap_page(page);
+ entry = folio_alloc_swap(folio);
if (!entry.val)
- return 0;
+ return false;
/*
* XArray node allocations from PF_MEMALLOC contexts could
@@ -204,7 +208,7 @@ int add_to_swap(struct page *page)
/*
* Add it to the swap cache.
*/
- err = add_to_swap_cache(page, entry,
+ err = add_to_swap_cache(&folio->page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
if (err)
/*
@@ -213,22 +217,23 @@ int add_to_swap(struct page *page)
*/
goto fail;
/*
- * Normally the page will be dirtied in unmap because its pte should be
- * dirty. A special case is MADV_FREE page. The page's pte could have
- * dirty bit cleared but the page's SwapBacked bit is still set because
- * clearing the dirty bit and SwapBacked bit has no lock protected. For
- * such page, unmap will not set dirty bit for it, so page reclaim will
- * not write the page out. This can cause data corruption when the page
- * is swap in later. Always setting the dirty bit for the page solves
- * the problem.
+ * Normally the folio will be dirtied in unmap because its
+ * pte should be dirty. A special case is MADV_FREE page. The
+ * page's pte could have dirty bit cleared but the folio's
+ * SwapBacked flag is still set because clearing the dirty bit
+ * and SwapBacked flag has no lock protected. For such folio,
+ * unmap will not set dirty bit for it, so folio reclaim will
+ * not write the folio out. This can cause data corruption when
+ * the folio is swapped in later. Always setting the dirty flag
+ * for the folio solves the problem.
*/
- set_page_dirty(page);
+ folio_mark_dirty(folio);
- return 1;
+ return true;
fail:
- put_swap_page(page, entry);
- return 0;
+ put_swap_page(&folio->page, entry);
+ return false;
}
/*
@@ -519,14 +524,16 @@ fail_unlock:
* the swap entry is no longer in use.
*/
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
- struct vm_area_struct *vma, unsigned long addr, bool do_poll)
+ struct vm_area_struct *vma,
+ unsigned long addr, bool do_poll,
+ struct swap_iocb **plug)
{
bool page_was_allocated;
struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
vma, addr, &page_was_allocated);
if (page_was_allocated)
- swap_readpage(retpage, do_poll);
+ swap_readpage(retpage, do_poll, plug);
return retpage;
}
@@ -620,6 +627,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long mask;
struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
+ struct swap_iocb *splug = NULL;
bool do_poll = true, page_allocated;
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
@@ -646,7 +654,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!page)
continue;
if (page_allocated) {
- swap_readpage(page, false);
+ swap_readpage(page, false, &splug);
if (offset != entry_offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
@@ -655,10 +663,12 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
put_page(page);
}
blk_finish_plug(&plug);
+ swap_read_unplug(splug);
lru_add_drain(); /* Push any new pages onto the LRU now */
skip:
- return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+ /* The page was likely read above, so no need for plugging here */
+ return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL);
}
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -789,6 +799,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
struct blk_plug plug;
+ struct swap_iocb *splug = NULL;
struct vm_area_struct *vma = vmf->vma;
struct page *page;
pte_t *pte, pentry;
@@ -807,9 +818,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
i++, pte++) {
pentry = *pte;
- if (pte_none(pentry))
- continue;
- if (pte_present(pentry))
+ if (!is_swap_pte(pentry))
continue;
entry = pte_to_swp_entry(pentry);
if (unlikely(non_swap_entry(entry)))
@@ -819,7 +828,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
if (!page)
continue;
if (page_allocated) {
- swap_readpage(page, false);
+ swap_readpage(page, false, &splug);
if (i != ra_info.offset) {
SetPageReadahead(page);
count_vm_event(SWAP_RA);
@@ -828,10 +837,12 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
put_page(page);
}
blk_finish_plug(&plug);
+ swap_read_unplug(splug);
lru_add_drain();
skip:
+ /* The page was likely read above, so no need for plugging here */
return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
- ra_info.win == 1);
+ ra_info.win == 1, NULL);
}
/**
@@ -865,18 +876,15 @@ static ssize_t vma_ra_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
- enable_vma_readahead = true;
- else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
- enable_vma_readahead = false;
- else
- return -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &enable_vma_readahead);
+ if (ret)
+ return ret;
return count;
}
-static struct kobj_attribute vma_ra_enabled_attr =
- __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
- vma_ra_enabled_store);
+static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);
static struct attribute *swap_attrs[] = {
&vma_ra_enabled_attr.attr,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6aec1b24f440..94b4ff43ead0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,6 +45,7 @@
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
+#include "swap.h"
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
@@ -77,9 +78,9 @@ static PLIST_HEAD(swap_active_head);
/*
* all available (active, not full) swap_info_structs
* protected with swap_avail_lock, ordered by priority.
- * This is used by get_swap_page() instead of swap_active_head
+ * This is used by folio_alloc_swap() instead of swap_active_head
* because swap_active_head includes all swap_info_structs,
- * but get_swap_page() doesn't need to look at full ones.
+ * but folio_alloc_swap() doesn't need to look at full ones.
* This uses its own lock instead of swap_lock because when a
* swap_info_struct changes between not-full/full, it needs to
* add/remove itself to/from this list, but the swap_info_struct->lock
@@ -775,6 +776,22 @@ static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
this_cpu_write(*si->cluster_next_cpu, next);
}
+static bool swap_offset_available_and_locked(struct swap_info_struct *si,
+ unsigned long offset)
+{
+ if (data_race(!si->swap_map[offset])) {
+ spin_lock(&si->lock);
+ return true;
+ }
+
+ if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+ spin_lock(&si->lock);
+ return true;
+ }
+
+ return false;
+}
+
static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
swp_entry_t slots[])
@@ -952,15 +969,8 @@ done:
scan:
spin_unlock(&si->lock);
while (++offset <= READ_ONCE(si->highest_bit)) {
- if (data_race(!si->swap_map[offset])) {
- spin_lock(&si->lock);
- goto checks;
- }
- if (vm_swap_full() &&
- READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
+ if (swap_offset_available_and_locked(si, offset))
goto checks;
- }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
@@ -969,15 +979,8 @@ scan:
}
offset = si->lowest_bit;
while (offset < scan_base) {
- if (data_race(!si->swap_map[offset])) {
- spin_lock(&si->lock);
+ if (swap_offset_available_and_locked(si, offset))
goto checks;
- }
- if (vm_swap_full() &&
- READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
- spin_lock(&si->lock);
- goto checks;
- }
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
@@ -1122,7 +1125,7 @@ noswap:
return n_ret;
}
-static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
unsigned long offset;
@@ -1137,8 +1140,13 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
offset = swp_offset(entry);
if (offset >= p->max)
goto bad_offset;
+ if (data_race(!p->swap_map[swp_offset(entry)]))
+ goto bad_free;
return p;
+bad_free:
+ pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
+ goto out;
bad_offset:
pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
goto out;
@@ -1151,23 +1159,6 @@ out:
return NULL;
}
-static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
-{
- struct swap_info_struct *p;
-
- p = __swap_info_get(entry);
- if (!p)
- goto out;
- if (data_race(!p->swap_map[swp_offset(entry)]))
- goto bad_free;
- return p;
-
-bad_free:
- pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
-out:
- return NULL;
-}
-
static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
struct swap_info_struct *q)
{
@@ -1283,6 +1274,7 @@ bad_nofile:
out:
return NULL;
put_out:
+ pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
percpu_ref_put(&si->users);
return NULL;
}
@@ -1440,7 +1432,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
* This does not give an exact answer when swap count is continued,
* but does include the high COUNT_CONTINUED flag to allow for that.
*/
-int page_swapcount(struct page *page)
+static int page_swapcount(struct page *page)
{
int count = 0;
struct swap_info_struct *p;
@@ -1797,13 +1789,28 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
+ /* See do_swap_page() */
+ BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
+ BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
if (page == swapcache) {
- page_add_anon_rmap(page, vma, addr, false);
+ rmap_t rmap_flags = RMAP_NONE;
+
+ /*
+ * See do_swap_page(): PageWriteback() would be problematic.
+ * However, we do a wait_on_page_writeback() just before this
+ * call and have the page locked.
+ */
+ VM_BUG_ON_PAGE(PageWriteback(page), page);
+ if (pte_swp_exclusive(*pte))
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ page_add_anon_rmap(page, vma, addr, rmap_flags);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr, false);
+ page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
}
set_pte_at(vma->vm_mm, addr, pte,
@@ -1984,9 +1991,9 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
}
/*
- * Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use. Return 0
- * if there are no inuse entries after prev till end of the map.
+ * Scan swap_map from current position to next entry still in use.
+ * Return 0 if there are no inuse entries after prev till end of
+ * the map.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
unsigned int prev)
@@ -2094,11 +2101,12 @@ retry:
* Under global memory pressure, swap entries can be reinserted back
* into process space after the mmlist loop above passes over them.
*
- * Limit the number of retries? No: when mmget_not_zero() above fails,
- * that mm is likely to be freeing swap from exit_mmap(), which proceeds
- * at its own independent pace; and even shmem_writepage() could have
- * been preempted after get_swap_page(), temporarily hiding that swap.
- * It's easy and robust (though cpu-intensive) just to keep retrying.
+ * Limit the number of retries? No: when mmget_not_zero()
+ * above fails, that mm is likely to be freeing swap from
+ * exit_mmap(), which proceeds at its own independent pace;
+ * and even shmem_writepage() could have been preempted after
+ * folio_alloc_swap(), temporarily hiding that swap. It's easy
+ * and robust (though cpu-intensive) just to keep retrying.
*/
if (READ_ONCE(si->inuse_pages)) {
if (!signal_pending(current))
@@ -2201,8 +2209,8 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
/*
* A `swap extent' is a simple thing which maps a contiguous range of pages
- * onto a contiguous range of disk blocks. An ordered list of swap extents
- * is built at swapon time and is then used at swap_writepage/swap_readpage
+ * onto a contiguous range of disk blocks. A rbtree of swap extents is
+ * built at swapon time and is then used at swap_writepage/swap_readpage
* time for locating where on disk a page belongs.
*
* If the swapfile is an S_ISBLK block device, a single extent is installed.
@@ -2210,12 +2218,12 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
* swap files identically.
*
* Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
- * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
+ * extent rbtree operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
* swapfiles are handled *identically* after swapon time.
*
* For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
- * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
- * some stray blocks are found which do not fall within the PAGE_SIZE alignment
+ * and will parse them into a rbtree, in PAGE_SIZE chunks. If some stray
+ * blocks are found which do not fall within the PAGE_SIZE alignment
* requirements, they are simply tossed out - we will never use those blocks
* for swapping.
*
@@ -2224,10 +2232,7 @@ EXPORT_SYMBOL_GPL(add_swap_extent);
*
* The amount of disk space which a single swap extent represents varies.
* Typically it is in the 1-4 megabyte range. So we can have hundreds of
- * extents in the list. To avoid much list walking, we cache the previous
- * search location in `curr_swap_extent', and start new searches from there.
- * This is extremely effective. The average number of iterations in
- * map_swap_page() has been measured at about 0.3 per page. - akpm.
+ * extents in the rbtree. - akpm.
*/
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
@@ -2244,12 +2249,13 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (mapping->a_ops->swap_activate) {
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
- if (ret >= 0)
- sis->flags |= SWP_ACTIVATED;
- if (!ret) {
- sis->flags |= SWP_FS_OPS;
- ret = add_swap_extent(sis, 0, sis->max, 0);
- *span = sis->pages;
+ if (ret < 0)
+ return ret;
+ sis->flags |= SWP_ACTIVATED;
+ if ((sis->flags & SWP_FS_OPS) &&
+ sio_pool_init() != 0) {
+ destroy_swap_extents(sis);
+ return -ENOMEM;
}
return ret;
}
@@ -2311,7 +2317,7 @@ static void _enable_swap_info(struct swap_info_struct *p)
* which on removal of any swap_info_struct with an auto-assigned
* (i.e. negative) priority increments the auto-assigned priority
* of any lower-priority swap_info_structs.
- * swap_avail_head needs to be priority ordered for get_swap_page(),
+ * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
* which allocates swap pages from the highest available priority
* swap_info_struct.
*/
@@ -3314,8 +3320,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
- if (p)
- put_swap_device(p);
+ put_swap_device(p);
return err;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e9bb6db002aa..4f4892a5f767 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -16,6 +16,7 @@
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#include "internal.h"
static __always_inline
@@ -77,10 +78,19 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
* Always mark a PTE as write-protected when needed, regardless of
* VM_WRITE, which the user might change.
*/
- if (wp_copy)
+ if (wp_copy) {
_dst_pte = pte_mkuffd_wp(_dst_pte);
- else if (writable)
+ writable = false;
+ }
+
+ if (writable)
_dst_pte = pte_mkwrite(_dst_pte);
+ else
+ /*
+ * We need this to make sure write bit removed; as mk_pte()
+ * could return a pte with write bit set.
+ */
+ _dst_pte = pte_wrprotect(_dst_pte);
dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
@@ -95,7 +105,12 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
}
ret = -EEXIST;
- if (!pte_none(*dst_pte))
+ /*
+ * We allow to overwrite a pte marker: consider when both MISSING|WP
+ * registered, we firstly wr-protect a none pte which has no page cache
+ * page backing it, then access the page.
+ */
+ if (!pte_none_mostly(*dst_pte))
goto out_unlock;
if (page_in_cache) {
@@ -104,7 +119,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
lru_cache_add(page);
page_add_file_rmap(page, dst_vma, false);
} else {
- page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr);
lru_cache_add_inactive_or_unevictable(page, dst_vma);
}
@@ -290,7 +305,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- enum mcopy_atomic_mode mode)
+ enum mcopy_atomic_mode mode,
+ bool wp_copy)
{
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
@@ -378,7 +394,7 @@ retry:
}
if (mode != MCOPY_ATOMIC_CONTINUE &&
- !huge_pte_none(huge_ptep_get(dst_pte))) {
+ !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
err = -EEXIST;
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
@@ -386,7 +402,8 @@ retry:
}
err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
- dst_addr, src_addr, mode, &page);
+ dst_addr, src_addr, mode, &page,
+ wp_copy);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
@@ -441,7 +458,8 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- enum mcopy_atomic_mode mode);
+ enum mcopy_atomic_mode mode,
+ bool wp_copy);
#endif /* CONFIG_HUGETLB_PAGE */
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -479,11 +497,10 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
err = mfill_zeropage_pte(dst_mm, dst_pmd,
dst_vma, dst_addr);
} else {
- VM_WARN_ON_ONCE(wp_copy);
err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr,
mode != MCOPY_ATOMIC_NORMAL,
- page);
+ wp_copy, page);
}
return err;
@@ -562,7 +579,8 @@ retry:
*/
if (is_vm_hugetlb_page(dst_vma))
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, mcopy_mode);
+ src_start, len, mcopy_mode,
+ wp_copy);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -687,6 +705,8 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
atomic_t *mmap_changing)
{
struct vm_area_struct *dst_vma;
+ unsigned long page_mask;
+ struct mmu_gather tlb;
pgprot_t newprot;
int err;
@@ -712,24 +732,30 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = -ENOENT;
dst_vma = find_dst_vma(dst_mm, start, len);
- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
+
+ if (!dst_vma)
goto out_unlock;
if (!userfaultfd_wp(dst_vma))
goto out_unlock;
- if (!vma_is_anonymous(dst_vma))
+ if (!vma_can_userfault(dst_vma, dst_vma->vm_flags))
goto out_unlock;
+ if (is_vm_hugetlb_page(dst_vma)) {
+ err = -EINVAL;
+ page_mask = vma_kernel_pagesize(dst_vma) - 1;
+ if ((start & page_mask) || (len & page_mask))
+ goto out_unlock;
+ }
+
if (enable_wp)
newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
else
newprot = vm_get_page_prot(dst_vma->vm_flags);
- change_protection(dst_vma, start, start + len, newprot,
+ tlb_gather_mmu(&tlb, dst_mm);
+ change_protection(&tlb, dst_vma, start, start + len, newprot,
enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
+ tlb_finish_mmu(&tlb);
err = 0;
out_unlock:
diff --git a/mm/util.c b/mm/util.c
index ac63e5ca8b21..29f4f773dc7b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -27,6 +27,7 @@
#include <linux/uaccess.h>
#include "internal.h"
+#include "swap.h"
/**
* kfree_const - conditionally free memory
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cadfbb5155ea..07db42455dd4 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -389,18 +389,15 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
{
p4d_t *p4d;
unsigned long next;
- int cleared;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
- cleared = p4d_clear_huge(p4d);
- if (cleared || p4d_bad(*p4d))
+ p4d_clear_huge(p4d);
+ if (p4d_bad(*p4d))
*mask |= PGTBL_P4D_MODIFIED;
- if (cleared)
- continue;
if (p4d_none_or_clear_bad(p4d))
continue;
vunmap_pud_range(p4d, addr, next, mask);
@@ -478,6 +475,9 @@ static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
return -EBUSY;
if (WARN_ON(!page))
return -ENOMEM;
+ if (WARN_ON(!pfn_valid(page_to_pfn(page))))
+ return -EINVAL;
+
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1938,11 +1938,10 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_PTR(err);
}
- vbq = &get_cpu_var(vmap_block_queue);
+ vbq = raw_cpu_ptr(&vmap_block_queue);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
- put_cpu_var(vmap_block_queue);
return vaddr;
}
@@ -2021,7 +2020,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
order = get_order(size);
rcu_read_lock();
- vbq = &get_cpu_var(vmap_block_queue);
+ vbq = raw_cpu_ptr(&vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
unsigned long pages_off;
@@ -2044,7 +2043,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
break;
}
- put_cpu_var(vmap_block_queue);
rcu_read_unlock();
/* Allocate new block if nothing was found */
@@ -2895,7 +2893,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
/* memory allocation should consider mempolicy, we can't
* wrongly use nearest node when nid == NUMA_NO_NODE,
* otherwise memory may be allocated in only one node,
- * but mempolcy want to alloc memory by interleaving.
+ * but mempolicy wants to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index edc89f26b738..f7d9a683e3a7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -59,6 +59,7 @@
#include <linux/sched/sysctl.h>
#include "internal.h"
+#include "swap.h"
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
@@ -527,13 +528,8 @@ static bool can_demote(int nid, struct scan_control *sc)
{
if (!numa_demotion_enabled)
return false;
- if (sc) {
- if (sc->no_demotion)
- return false;
- /* It is pointless to do demotion in memcg reclaim */
- if (cgroup_reclaim(sc))
- return false;
- }
+ if (sc && sc->no_demotion)
+ return false;
if (next_demotion_node(nid) == NUMA_NO_NODE)
return false;
@@ -587,7 +583,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
* @lru: lru to use
- * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
+ * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
*/
static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
int zone_idx)
@@ -595,7 +591,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
unsigned long size = 0;
int zid;
- for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
+ for (zid = 0; zid <= zone_idx; zid++) {
struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
if (!managed_zone(zone))
@@ -1031,7 +1027,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
- if (!populated_zone(zone))
+ if (!managed_zone(zone))
continue;
reclaimable += zone_reclaimable_pages(zone);
@@ -1155,7 +1151,8 @@ typedef enum {
* pageout is called by shrink_page_list() for each dirty page.
* Calls ->writepage().
*/
-static pageout_t pageout(struct folio *folio, struct address_space *mapping)
+static pageout_t pageout(struct folio *folio, struct address_space *mapping,
+ struct swap_iocb **plug)
{
/*
* If the folio is dirty, only perform writeback if that write
@@ -1200,6 +1197,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping)
.range_start = 0,
.range_end = LLONG_MAX,
.for_reclaim = 1,
+ .swap_plug = plug,
};
folio_set_reclaim(folio);
@@ -1388,6 +1386,10 @@ static enum page_references folio_check_references(struct folio *folio,
if (vm_flags & VM_LOCKED)
return PAGEREF_ACTIVATE;
+ /* rmap lock contention: rotate */
+ if (referenced_ptes == -1)
+ return PAGEREF_KEEP;
+
if (referenced_ptes) {
/*
* All mapped folios start out with page table
@@ -1411,14 +1413,14 @@ static enum page_references folio_check_references(struct folio *folio,
/*
* Activate file-backed executable folios after first usage.
*/
- if ((vm_flags & VM_EXEC) && !folio_test_swapbacked(folio))
+ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
}
/* Reclaim if clean, defer dirty folios to writeback */
- if (referenced_folio && !folio_test_swapbacked(folio))
+ if (referenced_folio && folio_is_file_lru(folio))
return PAGEREF_RECLAIM_CLEAN;
return PAGEREF_RECLAIM;
@@ -1432,7 +1434,10 @@ static void folio_check_dirty_writeback(struct folio *folio,
/*
* Anonymous pages are not handled by flushers and must be written
- * from reclaim context. Do not stall reclaim based on them
+ * from reclaim context. Do not stall reclaim based on them.
+ * MADV_FREE anonymous pages are put into inactive file list too.
+ * They could be mistakenly treated as file lru. So further anon
+ * test is needed.
*/
if (!folio_is_file_lru(folio) ||
(folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
@@ -1501,6 +1506,22 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
return nr_succeeded;
}
+static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
+{
+ if (gfp_mask & __GFP_FS)
+ return true;
+ if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
+ return false;
+ /*
+ * We can "enter_fs" for swap-cache with only __GFP_IO
+ * providing this isn't SWP_FS_OPS.
+ * ->flags can be updated non-atomicially (scan_swap_map_slots),
+ * but that will never affect SWP_FS_OPS, so the data_race
+ * is safe.
+ */
+ return !data_race(page_swap_flags(&folio->page) & SWP_FS_OPS);
+}
+
/*
* shrink_page_list() returns the number of reclaimed pages
*/
@@ -1516,6 +1537,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
+ struct swap_iocb *plug = NULL;
memset(stat, 0, sizeof(*stat));
cond_resched();
@@ -1524,41 +1546,36 @@ static unsigned int shrink_page_list(struct list_head *page_list,
retry:
while (!list_empty(page_list)) {
struct address_space *mapping;
- struct page *page;
struct folio *folio;
enum page_references references = PAGEREF_RECLAIM;
- bool dirty, writeback, may_enter_fs;
+ bool dirty, writeback;
unsigned int nr_pages;
cond_resched();
folio = lru_to_folio(page_list);
list_del(&folio->lru);
- page = &folio->page;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto keep;
- VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
- /* Account the number of base pages even though THP */
+ /* Account the number of base pages */
sc->nr_scanned += nr_pages;
- if (unlikely(!page_evictable(page)))
+ if (unlikely(!folio_evictable(folio)))
goto activate_locked;
- if (!sc->may_unmap && page_mapped(page))
+ if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
- may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
- (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
-
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
- * pages if the tail of the LRU is all dirty unqueued pages.
+ * folios if the tail of the LRU is all dirty unqueued folios.
*/
folio_check_dirty_writeback(folio, &dirty, &writeback);
if (dirty || writeback)
@@ -1568,50 +1585,51 @@ retry:
stat->nr_unqueued_dirty += nr_pages;
/*
- * Treat this page as congested if the underlying BDI is or if
- * pages are cycling through the LRU so quickly that the
- * pages marked for immediate reclaim are making it to the
- * end of the LRU a second time.
+ * Treat this folio as congested if folios are cycling
+ * through the LRU so quickly that the folios marked
+ * for immediate reclaim are making it to the end of
+ * the LRU a second time.
*/
- mapping = page_mapping(page);
- if (writeback && PageReclaim(page))
+ if (writeback && folio_test_reclaim(folio))
stat->nr_congested += nr_pages;
/*
- * If a page at the tail of the LRU is under writeback, there
+ * If a folio at the tail of the LRU is under writeback, there
* are three cases to consider.
*
- * 1) If reclaim is encountering an excessive number of pages
- * under writeback and this page is both under writeback and
- * PageReclaim then it indicates that pages are being queued
- * for IO but are being recycled through the LRU before the
- * IO can complete. Waiting on the page itself risks an
- * indefinite stall if it is impossible to writeback the
- * page due to IO error or disconnected storage so instead
- * note that the LRU is being scanned too quickly and the
- * caller can stall after page list has been processed.
+ * 1) If reclaim is encountering an excessive number
+ * of folios under writeback and this folio has both
+ * the writeback and reclaim flags set, then it
+ * indicates that folios are being queued for I/O but
+ * are being recycled through the LRU before the I/O
+ * can complete. Waiting on the folio itself risks an
+ * indefinite stall if it is impossible to writeback
+ * the folio due to I/O error or disconnected storage
+ * so instead note that the LRU is being scanned too
+ * quickly and the caller can stall after the folio
+ * list has been processed.
*
- * 2) Global or new memcg reclaim encounters a page that is
+ * 2) Global or new memcg reclaim encounters a folio that is
* not marked for immediate reclaim, or the caller does not
* have __GFP_FS (or __GFP_IO if it's simply going to swap,
- * not to fs). In this case mark the page for immediate
+ * not to fs). In this case mark the folio for immediate
* reclaim and continue scanning.
*
- * Require may_enter_fs because we would wait on fs, which
- * may not have submitted IO yet. And the loop driver might
- * enter reclaim, and deadlock if it waits on a page for
+ * Require may_enter_fs() because we would wait on fs, which
+ * may not have submitted I/O yet. And the loop driver might
+ * enter reclaim, and deadlock if it waits on a folio for
* which it is needed to do the write (loop masks off
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * 3) Legacy memcg encounters a page that is already marked
- * PageReclaim. memcg does not have any dirty pages
+ * 3) Legacy memcg encounters a folio that already has the
+ * reclaim flag set. memcg does not have any dirty folio
* throttling so we could easily OOM just because too many
- * pages are in writeback and there is nothing else to
+ * folios are in writeback and there is nothing else to
* reclaim. Wait for the writeback to complete.
*
- * In cases 1) and 2) we activate the pages to get them out of
- * the way while we continue scanning for clean pages on the
+ * In cases 1) and 2) we activate the folios to get them out of
+ * the way while we continue scanning for clean folios on the
* inactive list and refilling from the active list. The
* observation here is that waiting for disk writes is more
* expensive than potentially causing reloads down the line.
@@ -1619,38 +1637,42 @@ retry:
* memory pressure on the cache working set any longer than it
* takes to write them to disk.
*/
- if (PageWriteback(page)) {
+ if (folio_test_writeback(folio)) {
/* Case 1 above */
if (current_is_kswapd() &&
- PageReclaim(page) &&
+ folio_test_reclaim(folio) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
stat->nr_immediate += nr_pages;
goto activate_locked;
/* Case 2 above */
} else if (writeback_throttling_sane(sc) ||
- !PageReclaim(page) || !may_enter_fs) {
+ !folio_test_reclaim(folio) ||
+ !may_enter_fs(folio, sc->gfp_mask)) {
/*
- * This is slightly racy - end_page_writeback()
- * might have just cleared PageReclaim, then
- * setting PageReclaim here end up interpreted
- * as PageReadahead - but that does not matter
- * enough to care. What we do want is for this
- * page to have PageReclaim set next time memcg
- * reclaim reaches the tests above, so it will
- * then wait_on_page_writeback() to avoid OOM;
- * and it's also appropriate in global reclaim.
+ * This is slightly racy -
+ * folio_end_writeback() might have
+ * just cleared the reclaim flag, then
+ * setting the reclaim flag here ends up
+ * interpreted as the readahead flag - but
+ * that does not matter enough to care.
+ * What we do want is for this folio to
+ * have the reclaim flag set next time
+ * memcg reclaim reaches the tests above,
+ * so it will then wait for writeback to
+ * avoid OOM; and it's also appropriate
+ * in global reclaim.
*/
- SetPageReclaim(page);
+ folio_set_reclaim(folio);
stat->nr_writeback += nr_pages;
goto activate_locked;
/* Case 3 above */
} else {
- unlock_page(page);
- wait_on_page_writeback(page);
- /* then go back and try same page again */
- list_add_tail(&page->lru, page_list);
+ folio_unlock(folio);
+ folio_wait_writeback(folio);
+ /* then go back and try same folio again */
+ list_add_tail(&folio->lru, page_list);
continue;
}
}
@@ -1666,37 +1688,37 @@ retry:
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
- ; /* try to reclaim the page below */
+ ; /* try to reclaim the folio below */
}
/*
- * Before reclaiming the page, try to relocate
+ * Before reclaiming the folio, try to relocate
* its contents to another node.
*/
if (do_demote_pass &&
- (thp_migration_supported() || !PageTransHuge(page))) {
- list_add(&page->lru, &demote_pages);
- unlock_page(page);
+ (thp_migration_supported() || !folio_test_large(folio))) {
+ list_add(&folio->lru, &demote_pages);
+ folio_unlock(folio);
continue;
}
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
- * Lazyfree page could be freed directly
+ * Lazyfree folio could be freed directly
*/
- if (PageAnon(page) && PageSwapBacked(page)) {
- if (!PageSwapCache(page)) {
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
+ if (!folio_test_swapcache(folio)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
if (folio_maybe_dma_pinned(folio))
goto keep_locked;
- if (PageTransHuge(page)) {
- /* cannot split THP, skip it */
+ if (folio_test_large(folio)) {
+ /* cannot split folio, skip it */
if (!can_split_folio(folio, NULL))
goto activate_locked;
/*
- * Split pages without a PMD map right
+ * Split folios without a PMD map right
* away. Chances are some or all of the
* tail pages can be freed without IO.
*/
@@ -1705,8 +1727,8 @@ retry:
page_list))
goto activate_locked;
}
- if (!add_to_swap(page)) {
- if (!PageTransHuge(page))
+ if (!add_to_swap(folio)) {
+ if (!folio_test_large(folio))
goto activate_locked_split;
/* Fallback to swap normal pages */
if (split_folio_to_list(folio,
@@ -1715,94 +1737,92 @@ retry:
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
- if (!add_to_swap(page))
+ if (!add_to_swap(folio))
goto activate_locked_split;
}
-
- may_enter_fs = true;
-
- /* Adding to swap updated mapping */
- mapping = page_mapping(page);
}
- } else if (PageSwapBacked(page) && PageTransHuge(page)) {
- /* Split shmem THP */
+ } else if (folio_test_swapbacked(folio) &&
+ folio_test_large(folio)) {
+ /* Split shmem folio */
if (split_folio_to_list(folio, page_list))
goto keep_locked;
}
/*
- * THP may get split above, need minus tail pages and update
- * nr_pages to avoid accounting tail pages twice.
- *
- * The tail pages that are added into swap cache successfully
- * reach here.
+ * If the folio was split above, the tail pages will make
+ * their own pass through this function and be accounted
+ * then.
*/
- if ((nr_pages > 1) && !PageTransHuge(page)) {
+ if ((nr_pages > 1) && !folio_test_large(folio)) {
sc->nr_scanned -= (nr_pages - 1);
nr_pages = 1;
}
/*
- * The page is mapped into the page tables of one or more
+ * The folio is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page)) {
+ if (folio_mapped(folio)) {
enum ttu_flags flags = TTU_BATCH_FLUSH;
- bool was_swapbacked = PageSwapBacked(page);
+ bool was_swapbacked = folio_test_swapbacked(folio);
- if (PageTransHuge(page) &&
- thp_order(page) >= HPAGE_PMD_ORDER)
+ if (folio_test_pmd_mappable(folio))
flags |= TTU_SPLIT_HUGE_PMD;
try_to_unmap(folio, flags);
- if (page_mapped(page)) {
+ if (folio_mapped(folio)) {
stat->nr_unmap_fail += nr_pages;
- if (!was_swapbacked && PageSwapBacked(page))
+ if (!was_swapbacked &&
+ folio_test_swapbacked(folio))
stat->nr_lazyfree_fail += nr_pages;
goto activate_locked;
}
}
- if (PageDirty(page)) {
+ mapping = folio_mapping(folio);
+ if (folio_test_dirty(folio)) {
/*
- * Only kswapd can writeback filesystem pages
+ * Only kswapd can writeback filesystem folios
* to avoid risk of stack overflow. But avoid
- * injecting inefficient single-page IO into
+ * injecting inefficient single-folio I/O into
* flusher writeback as much as possible: only
- * write pages when we've encountered many
- * dirty pages, and when we've already scanned
- * the rest of the LRU for clean pages and see
- * the same dirty pages again (PageReclaim).
+ * write folios when we've encountered many
+ * dirty folios, and when we've already scanned
+ * the rest of the LRU for clean folios and see
+ * the same dirty folios again (with the reclaim
+ * flag set).
*/
- if (page_is_file_lru(page) &&
- (!current_is_kswapd() || !PageReclaim(page) ||
+ if (folio_is_file_lru(folio) &&
+ (!current_is_kswapd() ||
+ !folio_test_reclaim(folio) ||
!test_bit(PGDAT_DIRTY, &pgdat->flags))) {
/*
* Immediately reclaim when written back.
- * Similar in principal to deactivate_page()
- * except we already have the page isolated
+ * Similar in principle to deactivate_page()
+ * except we already have the folio isolated
* and know it's dirty
*/
- inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
- SetPageReclaim(page);
+ node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
+ nr_pages);
+ folio_set_reclaim(folio);
goto activate_locked;
}
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
- if (!may_enter_fs)
+ if (!may_enter_fs(folio, sc->gfp_mask))
goto keep_locked;
if (!sc->may_writepage)
goto keep_locked;
/*
- * Page is dirty. Flush the TLB if a writable entry
- * potentially exists to avoid CPU writes after IO
+ * Folio is dirty. Flush the TLB if a writable entry
+ * potentially exists to avoid CPU writes after I/O
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(folio, mapping)) {
+ switch (pageout(folio, mapping, &plug)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
@@ -1810,91 +1830,94 @@ retry:
case PAGE_SUCCESS:
stat->nr_pageout += nr_pages;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
goto keep;
- if (PageDirty(page))
+ if (folio_test_dirty(folio))
goto keep;
/*
* A synchronous write - probably a ramdisk. Go
- * ahead and try to reclaim the page.
+ * ahead and try to reclaim the folio.
*/
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto keep;
- if (PageDirty(page) || PageWriteback(page))
+ if (folio_test_dirty(folio) ||
+ folio_test_writeback(folio))
goto keep_locked;
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
fallthrough;
case PAGE_CLEAN:
- ; /* try to free the page below */
+ ; /* try to free the folio below */
}
}
/*
- * If the page has buffers, try to free the buffer mappings
- * associated with this page. If we succeed we try to free
- * the page as well.
+ * If the folio has buffers, try to free the buffer
+ * mappings associated with this folio. If we succeed
+ * we try to free the folio as well.
*
- * We do this even if the page is PageDirty().
- * try_to_release_page() does not perform I/O, but it is
- * possible for a page to have PageDirty set, but it is actually
- * clean (all its buffers are clean). This happens if the
- * buffers were written out directly, with submit_bh(). ext3
- * will do this, as well as the blockdev mapping.
- * try_to_release_page() will discover that cleanness and will
- * drop the buffers and mark the page clean - it can be freed.
+ * We do this even if the folio is dirty.
+ * filemap_release_folio() does not perform I/O, but it
+ * is possible for a folio to have the dirty flag set,
+ * but it is actually clean (all its buffers are clean).
+ * This happens if the buffers were written out directly,
+ * with submit_bh(). ext3 will do this, as well as
+ * the blockdev mapping. filemap_release_folio() will
+ * discover that cleanness and will drop the buffers
+ * and mark the folio clean - it can be freed.
*
- * Rarely, pages can have buffers and no ->mapping. These are
- * the pages which were not successfully invalidated in
- * truncate_cleanup_page(). We try to drop those buffers here
- * and if that worked, and the page is no longer mapped into
- * process address space (page_count == 1) it can be freed.
- * Otherwise, leave the page on the LRU so it is swappable.
+ * Rarely, folios can have buffers and no ->mapping.
+ * These are the folios which were not successfully
+ * invalidated in truncate_cleanup_folio(). We try to
+ * drop those buffers here and if that worked, and the
+ * folio is no longer mapped into process address space
+ * (refcount == 1) it can be freed. Otherwise, leave
+ * the folio on the LRU so it is swappable.
*/
- if (page_has_private(page)) {
- if (!try_to_release_page(page, sc->gfp_mask))
+ if (folio_has_private(folio)) {
+ if (!filemap_release_folio(folio, sc->gfp_mask))
goto activate_locked;
- if (!mapping && page_count(page) == 1) {
- unlock_page(page);
- if (put_page_testzero(page))
+ if (!mapping && folio_ref_count(folio) == 1) {
+ folio_unlock(folio);
+ if (folio_put_testzero(folio))
goto free_it;
else {
/*
* rare race with speculative reference.
* the speculative reference will free
- * this page shortly, so we may
+ * this folio shortly, so we may
* increment nr_reclaimed here (and
* leave it off the LRU).
*/
- nr_reclaimed++;
+ nr_reclaimed += nr_pages;
continue;
}
}
}
- if (PageAnon(page) && !PageSwapBacked(page)) {
+ if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
/* follow __remove_mapping for reference */
- if (!page_ref_freeze(page, 1))
+ if (!folio_ref_freeze(folio, 1))
goto keep_locked;
/*
- * The page has only one reference left, which is
+ * The folio has only one reference left, which is
* from the isolation. After the caller puts the
- * page back on lru and drops the reference, the
- * page will be freed anyway. It doesn't matter
- * which lru it goes. So we don't bother checking
- * PageDirty here.
+ * folio back on the lru and drops the reference, the
+ * folio will be freed anyway. It doesn't matter
+ * which lru it goes on. So we don't bother checking
+ * the dirty flag here.
*/
- count_vm_event(PGLAZYFREED);
- count_memcg_page_event(page, PGLAZYFREED);
+ count_vm_events(PGLAZYFREED, nr_pages);
+ count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
} else if (!mapping || !__remove_mapping(mapping, folio, true,
sc->target_mem_cgroup))
goto keep_locked;
- unlock_page(page);
+ folio_unlock(folio);
free_it:
/*
- * THP may get swapped out in a whole, need account
- * all base pages.
+ * Folio may get swapped out as a whole, need to account
+ * all pages in it.
*/
nr_reclaimed += nr_pages;
@@ -1902,10 +1925,10 @@ free_it:
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
- if (unlikely(PageTransHuge(page)))
- destroy_compound_page(page);
+ if (unlikely(folio_test_large(folio)))
+ destroy_compound_page(&folio->page);
else
- list_add(&page->lru, &free_pages);
+ list_add(&folio->lru, &free_pages);
continue;
activate_locked_split:
@@ -1919,29 +1942,31 @@ activate_locked_split:
}
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
- PageMlocked(page)))
- try_to_free_swap(page);
- VM_BUG_ON_PAGE(PageActive(page), page);
- if (!PageMlocked(page)) {
- int type = page_is_file_lru(page);
- SetPageActive(page);
+ if (folio_test_swapcache(folio) &&
+ (mem_cgroup_swap_full(&folio->page) ||
+ folio_test_mlocked(folio)))
+ try_to_free_swap(&folio->page);
+ VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
+ if (!folio_test_mlocked(folio)) {
+ int type = folio_is_file_lru(folio);
+ folio_set_active(folio);
stat->nr_activate[type] += nr_pages;
- count_memcg_page_event(page, PGACTIVATE);
+ count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
}
keep_locked:
- unlock_page(page);
+ folio_unlock(folio);
keep:
- list_add(&page->lru, &ret_pages);
- VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
+ list_add(&folio->lru, &ret_pages);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
+ folio_test_unevictable(folio), folio);
}
/* 'page_list' is always empty here */
- /* Migrate pages selected for demotion */
+ /* Migrate folios selected for demotion */
nr_reclaimed += demote_page_list(&demote_pages, pgdat);
- /* Pages that could not be demoted are still in @demote_pages */
+ /* Folios that could not be demoted are still in @demote_pages */
if (!list_empty(&demote_pages)) {
- /* Pages which failed to demoted go back on @page_list for retry: */
+ /* Folios which weren't demoted go back on @page_list for retry: */
list_splice_init(&demote_pages, page_list);
do_demote_pass = false;
goto retry;
@@ -1956,6 +1981,8 @@ keep:
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
+ if (plug)
+ swap_write_unplug(plug);
return nr_reclaimed;
}
@@ -2117,8 +2144,8 @@ move:
* Splice any skipped pages to the start of the LRU list. Note that
* this disrupts the LRU order when reclaiming for lower zones but
* we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
- * scanning would soon rescan the same pages to skip and put the
- * system at risk of premature OOM.
+ * scanning would soon rescan the same pages to skip and waste lots
+ * of cpu cycles.
*/
if (!list_empty(&pages_skipped)) {
int zid;
@@ -2297,10 +2324,9 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
}
/*
- * If a kernel thread (such as nfsd for loop-back mounts) services
- * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
- * In that case we should only throttle if the backing device it is
- * writing to is congested. In other cases it is safe to throttle.
+ * If a kernel thread (such as nfsd for loop-back mounts) services a backing
+ * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
+ * we should not throttle. Otherwise it is safe to do so.
*/
static int current_may_throttle(void)
{
@@ -2472,8 +2498,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
}
+ /* Referenced or rmap lock contention: rotate */
if (folio_referenced(folio, 0, sc->target_mem_cgroup,
- &vm_flags)) {
+ &vm_flags) != 0) {
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -2517,14 +2544,12 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate, nr_rotated, sc->priority, file);
}
-unsigned long reclaim_pages(struct list_head *page_list)
+static unsigned int reclaim_page_list(struct list_head *page_list,
+ struct pglist_data *pgdat)
{
- int nid = NUMA_NO_NODE;
- unsigned int nr_reclaimed = 0;
- LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
- struct page *page;
- unsigned int noreclaim_flag;
+ unsigned int nr_reclaimed;
+ struct folio *folio;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_writepage = 1,
@@ -2533,14 +2558,32 @@ unsigned long reclaim_pages(struct list_head *page_list)
.no_demotion = 1,
};
+ nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false);
+ while (!list_empty(page_list)) {
+ folio = lru_to_folio(page_list);
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ }
+
+ return nr_reclaimed;
+}
+
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+ int nid;
+ unsigned int nr_reclaimed = 0;
+ LIST_HEAD(node_page_list);
+ struct page *page;
+ unsigned int noreclaim_flag;
+
+ if (list_empty(page_list))
+ return nr_reclaimed;
+
noreclaim_flag = memalloc_noreclaim_save();
- while (!list_empty(page_list)) {
+ nid = page_to_nid(lru_to_page(page_list));
+ do {
page = lru_to_page(page_list);
- if (nid == NUMA_NO_NODE) {
- nid = page_to_nid(page);
- INIT_LIST_HEAD(&node_page_list);
- }
if (nid == page_to_nid(page)) {
ClearPageActive(page);
@@ -2548,28 +2591,11 @@ unsigned long reclaim_pages(struct list_head *page_list)
continue;
}
- nr_reclaimed += shrink_page_list(&node_page_list,
- NODE_DATA(nid),
- &sc, &dummy_stat, false);
- while (!list_empty(&node_page_list)) {
- page = lru_to_page(&node_page_list);
- list_del(&page->lru);
- putback_lru_page(page);
- }
-
- nid = NUMA_NO_NODE;
- }
+ nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid));
+ nid = page_to_nid(lru_to_page(page_list));
+ } while (!list_empty(page_list));
- if (!list_empty(&node_page_list)) {
- nr_reclaimed += shrink_page_list(&node_page_list,
- NODE_DATA(nid),
- &sc, &dummy_stat, false);
- while (!list_empty(&node_page_list)) {
- page = lru_to_page(&node_page_list);
- list_del(&page->lru);
- putback_lru_page(page);
- }
- }
+ nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid));
memalloc_noreclaim_restore(noreclaim_flag);
@@ -2646,9 +2672,7 @@ enum scan_balance {
/*
* Determine how aggressively the anon and file LRU lists should be
- * scanned. The relative value of each set of LRU lists is determined
- * by looking at the fraction of the pages scanned we did rotate back
- * onto the active list instead of evict.
+ * scanned.
*
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
@@ -3912,7 +3936,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
}
/*
- * If a node has no populated zone within highest_zoneidx, it does not
+ * If a node has no managed zone within highest_zoneidx, it does not
* need balancing by definition. This can happen if a zone-restricted
* allocation tries to wake a remote kswapd.
*/
@@ -4552,7 +4576,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
/*
* This kswapd start function will be called by init and node-hot-add.
- * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
void kswapd_run(int nid)
{
@@ -4699,7 +4722,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
noreclaim_flag = memalloc_noreclaim_save();
set_task_reclaim_state(p, &sc.reclaim_state);
- if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
+ if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
+ node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
/*
* Free memory by calling shrink node with increasing
* priorities until we have enough memory freed.
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b75b1a64b54c..da525bfb6f4a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1393,6 +1393,13 @@ const char * const vmstat_text[] = {
"ksm_swpin_copy",
#endif
#endif
+#ifdef CONFIG_KSM
+ "cow_ksm",
+#endif
+#ifdef CONFIG_ZSWAP
+ "zswpin",
+ "zswpout",
+#endif
#ifdef CONFIG_X86
"direct_map_level2_splits",
"direct_map_level3_splits",
@@ -2111,9 +2118,7 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
-#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
migrate_on_reclaim_init();
-#endif
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index b3c0577b8095..83b5a3514427 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -297,9 +297,6 @@ static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
int i;
bool is_free;
- if (handle & (1 << PAGE_HEADLESS))
- return;
-
if (WARN_ON(*(unsigned long *)handle == 0))
return;
@@ -345,7 +342,7 @@ static struct file_system_type z3fold_fs = {
};
static struct vfsmount *z3fold_mnt;
-static int z3fold_mount(void)
+static int __init z3fold_mount(void)
{
int ret = 0;
@@ -420,7 +417,6 @@ static void free_z3fold_page(struct page *page, bool headless)
__ClearPageMovable(page);
unlock_page(page);
}
- ClearPagePrivate(page);
__free_page(page);
}
@@ -521,6 +517,8 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
list_add(&zhdr->buddy, &pool->stale);
queue_work(pool->release_wq, &pool->work);
spin_unlock(&pool->stale_lock);
+
+ atomic64_dec(&pool->pages_nr);
}
static void release_z3fold_page(struct kref *ref)
@@ -738,13 +736,9 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
return new_zhdr;
out_fail:
- if (new_zhdr) {
- if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
- atomic64_dec(&pool->pages_nr);
- else {
- add_to_unbuddied(pool, new_zhdr);
- z3fold_page_unlock(new_zhdr);
- }
+ if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) {
+ add_to_unbuddied(pool, new_zhdr);
+ z3fold_page_unlock(new_zhdr);
}
return NULL;
@@ -817,10 +811,8 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
- atomic64_dec(&pool->pages_nr);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
return;
- }
if (test_bit(PAGE_STALE, &page->private) ||
test_and_set_bit(PAGE_CLAIMED, &page->private)) {
@@ -830,9 +822,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
if (!zhdr->foreign_handles && buddy_single(zhdr) &&
zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
- atomic64_dec(&pool->pages_nr);
- else {
+ if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
@@ -877,7 +867,6 @@ lookup:
/* Re-check under lock. */
spin_lock(&pool->lock);
- l = &unbuddied[i];
if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
struct z3fold_header, buddy)) ||
!z3fold_page_trylock(zhdr)) {
@@ -1064,9 +1053,6 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
* performed first. If no suitable free region is found, then a new page is
* allocated and added to the pool to satisfy the request.
*
- * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
- * as z3fold pool pages.
- *
* Return: 0 if success and handle is set, otherwise -EINVAL if the size or
* gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
* a new page.
@@ -1094,10 +1080,8 @@ retry:
if (zhdr) {
bud = get_free_buddy(zhdr, chunks);
if (bud == HEADLESS) {
- if (kref_put(&zhdr->refcount,
+ if (!kref_put(&zhdr->refcount,
release_z3fold_page_locked))
- atomic64_dec(&pool->pages_nr);
- else
z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
@@ -1190,9 +1174,9 @@ headless:
* @handle: handle associated with the allocation returned by z3fold_alloc()
*
* In the case that the z3fold page in which the allocation resides is under
- * reclaim, as indicated by the PG_reclaim flag being set, this function
- * only sets the first|last_chunks to 0. The page is actually freed
- * once both buddies are evicted (see z3fold_reclaim_page() below).
+ * reclaim, as indicated by the PAGE_CLAIMED flag being set, this function
+ * only sets the first|middle|last_chunks to 0. The page is actually freed
+ * once all buddies are evicted (see z3fold_reclaim_page() below).
*/
static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
{
@@ -1244,13 +1228,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
if (!page_claimed)
free_handle(handle, zhdr);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
- atomic64_dec(&pool->pages_nr);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list))
return;
- }
if (page_claimed) {
/* the page has not been claimed by us */
- z3fold_page_unlock(zhdr);
+ put_z3fold_header(zhdr);
return;
}
if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
@@ -1259,9 +1241,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
- spin_lock(&pool->lock);
- list_del_init(&zhdr->buddy);
- spin_unlock(&pool->lock);
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
clear_bit(PAGE_CLAIMED, &page->private);
@@ -1358,9 +1337,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
break;
}
if (!z3fold_page_trylock(zhdr)) {
- if (kref_put(&zhdr->refcount,
- release_z3fold_page))
- atomic64_dec(&pool->pages_nr);
+ kref_put(&zhdr->refcount, release_z3fold_page);
zhdr = NULL;
continue; /* can't evict at this point */
}
@@ -1371,10 +1348,8 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
*/
if (zhdr->foreign_handles ||
test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- if (kref_put(&zhdr->refcount,
+ if (!kref_put(&zhdr->refcount,
release_z3fold_page_locked))
- atomic64_dec(&pool->pages_nr);
- else
z3fold_page_unlock(zhdr);
zhdr = NULL;
continue; /* can't evict such page */
@@ -1452,7 +1427,6 @@ next:
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked)) {
kmem_cache_free(pool->c_handle, slots);
- atomic64_dec(&pool->pages_nr);
return 0;
}
/*
@@ -1638,7 +1612,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
INIT_LIST_HEAD(&new_zhdr->buddy);
new_mapping = page_mapping(page);
__ClearPageMovable(page);
- ClearPagePrivate(page);
get_page(newpage);
z3fold_page_lock(new_zhdr);
@@ -1658,7 +1631,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
- page_mapcount_reset(page);
clear_bit(PAGE_CLAIMED, &page->private);
put_page(page);
return 0;
@@ -1676,10 +1648,8 @@ static void z3fold_page_putback(struct page *page)
if (!list_empty(&zhdr->buddy))
list_del_init(&zhdr->buddy);
INIT_LIST_HEAD(&page->lru);
- if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
- atomic64_dec(&pool->pages_nr);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
return;
- }
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
diff --git a/mm/zswap.c b/mm/zswap.c
index 3efd8cae315e..104835b379ec 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -36,13 +36,15 @@
#include <linux/pagemap.h>
#include <linux/workqueue.h>
+#include "swap.h"
+
/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
-static u64 zswap_pool_total_size;
+u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
-static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_t zswap_stored_pages = ATOMIC_INIT(0);
/* The number of same-value filled pages currently stored in zswap */
static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
@@ -186,6 +188,7 @@ struct zswap_entry {
unsigned long handle;
unsigned long value;
};
+ struct obj_cgroup *objcg;
};
struct zswap_header {
@@ -357,6 +360,10 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
*/
static void zswap_free_entry(struct zswap_entry *entry)
{
+ if (entry->objcg) {
+ obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
+ obj_cgroup_put(entry->objcg);
+ }
if (!entry->length)
atomic_dec(&zswap_same_filled_pages);
else {
@@ -1094,6 +1101,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
struct zswap_entry *entry, *dupentry;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
+ struct obj_cgroup *objcg = NULL;
+ struct zswap_pool *pool;
int ret;
unsigned int hlen, dlen = PAGE_SIZE;
unsigned long handle, value;
@@ -1113,17 +1122,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto reject;
}
+ objcg = get_obj_cgroup_from_page(page);
+ if (objcg && !obj_cgroup_may_zswap(objcg))
+ goto shrink;
+
/* reclaim space if needed */
if (zswap_is_full()) {
- struct zswap_pool *pool;
-
zswap_pool_limit_hit++;
zswap_pool_reached_full = true;
- pool = zswap_pool_last_get();
- if (pool)
- queue_work(shrink_wq, &pool->shrink_work);
- ret = -ENOMEM;
- goto reject;
+ goto shrink;
}
if (zswap_pool_reached_full) {
@@ -1225,6 +1232,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
entry->length = dlen;
insert_entry:
+ entry->objcg = objcg;
+ if (objcg) {
+ obj_cgroup_charge_zswap(objcg, entry->length);
+ /* Account before objcg ref is moved to tree */
+ count_objcg_event(objcg, ZSWPOUT);
+ }
+
/* map */
spin_lock(&tree->lock);
do {
@@ -1241,6 +1255,7 @@ insert_entry:
/* update stats */
atomic_inc(&zswap_stored_pages);
zswap_update_total_size();
+ count_vm_event(ZSWPOUT);
return 0;
@@ -1250,7 +1265,16 @@ put_dstmem:
freepage:
zswap_entry_cache_free(entry);
reject:
+ if (objcg)
+ obj_cgroup_put(objcg);
return ret;
+
+shrink:
+ pool = zswap_pool_last_get();
+ if (pool)
+ queue_work(shrink_wq, &pool->shrink_work);
+ ret = -ENOMEM;
+ goto reject;
}
/*
@@ -1283,11 +1307,10 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
zswap_fill_page(dst, entry->value);
kunmap_atomic(dst);
ret = 0;
- goto freeentry;
+ goto stats;
}
if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
-
tmp = kmalloc(entry->length, GFP_ATOMIC);
if (!tmp) {
ret = -ENOMEM;
@@ -1302,10 +1325,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
src += sizeof(struct zswap_header);
if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
-
memcpy(tmp, src, entry->length);
src = tmp;
-
zpool_unmap_handle(entry->pool->zpool, entry->handle);
}
@@ -1324,7 +1345,10 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
kfree(tmp);
BUG_ON(ret);
-
+stats:
+ count_vm_event(ZSWPIN);
+ if (entry->objcg)
+ count_objcg_event(entry->objcg, ZSWPIN);
freeentry:
spin_lock(&tree->lock);
zswap_entry_put(tree, entry);